Markit_v2 / src /core /converter.py
AnseMin's picture
Approach #2 -- converting latex output from GOT OCR to markdown
5b7f920
import tempfile
import logging
import time
import os
from pathlib import Path
# Use absolute imports rooted at the `src` package
from src.core.parser_factory import ParserFactory
# Import all parsers to ensure they're registered
from src import parsers
# Import the LaTeX to Markdown converter
try:
from src.core.latex_to_markdown_converter import convert_latex_to_markdown
HAS_GEMINI_CONVERTER = True
except ImportError:
HAS_GEMINI_CONVERTER = False
logging.warning("LaTeX to Markdown converter not available. Raw LaTeX will be returned for formatted text.")
# Shared cancellation flag owned by ui.py and installed via set_cancellation_flag().
# The UI sets it when the cancel button is clicked; it stays None until a flag
# has been installed, in which case cancellation is never reported.
conversion_cancelled = None # Will be a threading.Event object
# True while convert_file() is running; reported by is_conversion_in_progress().
_conversion_in_progress = False
def set_cancellation_flag(flag):
    """Install the cancellation flag that the UI shares with this module.

    Args:
        flag: Object exposing ``is_set()`` (a ``threading.Event`` created by
            ui.py). It is stored in the module-level ``conversion_cancelled``
            and polled by ``check_cancellation()``.
    """
    # Rebind the module-level name so every later check sees the UI's flag.
    global conversion_cancelled
    conversion_cancelled = flag
def is_conversion_in_progress():
    """Report whether a ``convert_file`` call is currently running.

    Returns:
        The current value of the module-level ``_conversion_in_progress`` flag.
    """
    # Reading a module-level name needs no ``global`` declaration.
    return _conversion_in_progress
def check_cancellation():
    """Tell whether the user has asked to cancel the running conversion.

    Returns:
        True when a cancellation flag has been installed and it is set;
        False when no flag is installed or the flag is not set.
    """
    flag = conversion_cancelled
    # No flag installed, or flag present but not raised: nothing to cancel.
    if not flag or not flag.is_set():
        return False
    logging.info("Cancellation detected in check_cancellation")
    return True
def safe_delete_file(file_path):
    """Delete *file_path* on a best-effort basis, never raising.

    The deletion is attempted directly instead of checking for existence
    first, so there is no window in which the file can vanish between the
    check and the delete, and a dangling symlink at *file_path* is removed
    too.

    Args:
        file_path: Path of the file to remove. A falsy value (``None`` or an
            empty string) is ignored.

    Returns:
        None. A missing file is silently ignored; any other failure is
        logged as an error rather than propagated.
    """
    if not file_path:
        return
    try:
        os.unlink(file_path)
    except (FileNotFoundError, NotADirectoryError):
        # Nothing at that path (never created, or already cleaned up).
        pass
    except Exception as e:
        logging.error(f"Error cleaning up temp file {file_path}: {e}")
# Message handed back to the caller (and reported by parsers) on cancellation.
_CANCELLED_MESSAGE = "Conversion cancelled."

# Unique marker used by the internal helpers to report cancellation without
# colliding with anything a parser may return (including None).
_CANCELLED = object()

# Download-file extension per output format; unknown formats fall back to ".txt".
_OUTPUT_EXTENSIONS = {
    "Markdown": ".md",
    "JSON": ".json",
    "Text": ".txt",
    "Document Tags": ".doctags",
}

_COPY_CHUNK_BYTES = 1024 * 1024  # input is copied in 1MB chunks
_WRITE_CHUNK_CHARS = 10000  # output is written in 10,000-character chunks


def _cancelled_at(stage):
    """Return True, logging *stage*, when cancellation has been requested."""
    if check_cancellation():
        logging.info(f"Cancellation detected {stage}")
        return True
    return False


def _copy_to_temp_file(source_path):
    """Copy *source_path* into a new temp file with the same suffix.

    Returns the temp file's path, or None when cancelled mid-copy. The temp
    file is removed before returning None or re-raising an error.
    """
    temp_path = None
    cancelled = False
    try:
        suffix = Path(source_path).suffix
        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as temp_file:
            temp_path = temp_file.name
            with open(source_path, "rb") as original:
                while True:
                    # Poll between chunks so large uploads can be aborted.
                    if _cancelled_at("during file copy"):
                        cancelled = True
                        break
                    chunk = original.read(_COPY_CHUNK_BYTES)
                    if not chunk:
                        break
                    temp_file.write(chunk)
    except Exception:
        safe_delete_file(temp_path)
        raise
    if cancelled:
        # Delete only after the handle is closed; an open file cannot be
        # unlinked on Windows.
        safe_delete_file(temp_path)
        return None
    return temp_path


def _parse_content(file_path, parser_name, ocr_method_name, output_format):
    """Parse *file_path* and, for GOT-OCR formatted text, convert LaTeX to Markdown.

    Returns the parsed content, or the ``_CANCELLED`` marker when the parser
    reported cancellation or the cancellation flag was found set.
    """
    start = time.time()
    content = ParserFactory.parse_document(
        file_path=file_path,
        parser_name=parser_name,
        ocr_method_name=ocr_method_name,
        output_format=output_format.lower(),
        cancellation_flag=conversion_cancelled,  # let parsers poll the flag too
    )
    if content == _CANCELLED_MESSAGE:
        logging.info("Parser reported cancellation")
        return _CANCELLED
    logging.info(f"Processed in {time.time() - start:.2f} seconds.")
    if _cancelled_at("after processing"):
        return _CANCELLED

    wants_latex_conversion = (
        parser_name == "GOT-OCR (jpg,png only)"
        and ocr_method_name == "Formatted Text"
        and HAS_GEMINI_CONVERTER
    )
    if not wants_latex_conversion:
        return content

    logging.info("Converting LaTeX output to Markdown using Gemini API")
    start_convert = time.time()
    if _cancelled_at("before LaTeX conversion"):
        return _CANCELLED
    try:
        markdown_content = convert_latex_to_markdown(content)
        if markdown_content:
            content = markdown_content
            logging.info(f"LaTeX conversion completed in {time.time() - start_convert:.2f} seconds")
        else:
            logging.warning("LaTeX to Markdown conversion failed, using raw LaTeX output")
    except Exception as e:
        # Best effort: keep the raw LaTeX when the converter fails.
        logging.error(f"Error converting LaTeX to Markdown: {str(e)}")
    if _cancelled_at("after LaTeX conversion"):
        return _CANCELLED
    return content


def _write_output_file(content, suffix):
    """Write *content* to a new UTF-8 temp file ending in *suffix*.

    Returns the file's path, or None when cancelled mid-write. The file is
    removed before returning None or re-raising an error.
    """
    out_path = None
    cancelled = False
    try:
        with tempfile.NamedTemporaryFile(
            mode="w", suffix=suffix, delete=False, encoding="utf-8"
        ) as tmp:
            out_path = tmp.name
            for offset in range(0, len(content), _WRITE_CHUNK_CHARS):
                if _cancelled_at("during output file writing"):
                    cancelled = True
                    break
                tmp.write(content[offset:offset + _WRITE_CHUNK_CHARS])
    except Exception:
        safe_delete_file(out_path)
        raise
    if cancelled:
        # Delete only after the handle is closed; an open file cannot be
        # unlinked on Windows.
        safe_delete_file(out_path)
        return None
    return out_path


def convert_file(file_path, parser_name, ocr_method_name, output_format):
    """
    Convert a file using the specified parser and OCR method.

    The input is first copied to a temporary file, parsed through
    ``ParserFactory``, optionally post-processed from LaTeX to Markdown, and
    finally written to a temporary download file. The cancellation flag is
    polled between every stage and between I/O chunks.

    Args:
        file_path: Path to the file
        parser_name: Name of the parser to use
        ocr_method_name: Name of the OCR method to use
        output_format: Output format (Markdown, JSON, Text, Document Tags)

    Returns:
        tuple: (content, download_file_path). On failure or cancellation the
        first element is a human-readable message ("Please upload a file.",
        "Conversion cancelled.", or "Error...: <details>") and the second is
        None. A returned download path always refers to an existing file.
    """
    global _conversion_in_progress

    _conversion_in_progress = True
    temp_input = None
    try:
        if not file_path:
            return "Please upload a file.", None
        if _cancelled_at("at start of convert_file"):
            return _CANCELLED_MESSAGE, None

        # Work on a temp copy so parsers see a temp-generated file name.
        try:
            temp_input = _copy_to_temp_file(file_path)
        except Exception as e:
            return f"Error creating temporary file: {e}", None
        if temp_input is None:
            return _CANCELLED_MESSAGE, None
        if _cancelled_at("after file preparation"):
            return _CANCELLED_MESSAGE, None

        try:
            content = _parse_content(
                temp_input, parser_name, ocr_method_name, output_format
            )
        except Exception as e:
            return f"Error: {e}", None
        if content is _CANCELLED:
            return _CANCELLED_MESSAGE, None

        ext = _OUTPUT_EXTENSIONS.get(output_format, ".txt")
        if _cancelled_at("before output file creation"):
            return _CANCELLED_MESSAGE, None

        try:
            tmp_path = _write_output_file(content, ext)
        except Exception as e:
            return f"Error: {e}", None
        if tmp_path is None:
            return _CANCELLED_MESSAGE, None

        # Last chance to honour a cancel request: drop the output and say so,
        # rather than returning a path to a file that is about to be deleted.
        if _cancelled_at("after output file creation"):
            safe_delete_file(tmp_path)
            return _CANCELLED_MESSAGE, None
        return content, tmp_path
    finally:
        # Single cleanup point for the temp input copy on every exit path,
        # then always clear the in-progress flag.
        safe_delete_file(temp_input)
        _conversion_in_progress = False