"""File-conversion entry point with cooperative cancellation support.

``convert_file`` copies the input to a temporary file, hands it to
``ParserFactory.parse_document`` and writes the parsed content to a temporary
download file. Between steps it polls a flag object (set through
``set_cancellation_flag``) so the caller can abort a running conversion.
"""
import tempfile
import logging
import time
import os
from pathlib import Path

# Project imports (absolute, rooted at the ``src`` package)
from src.core.parser_factory import ParserFactory

# Import all parsers to ensure they're registered
from src import parsers  # noqa: F401  (imported only for its side effects)

# Import the LaTeX to Markdown converter
try:
    from src.core.latex_to_markdown_converter import convert_latex_to_markdown
    HAS_GEMINI_CONVERTER = True
except ImportError:
    HAS_GEMINI_CONVERTER = False
    logging.warning("LaTeX to Markdown converter not available. Raw LaTeX will be returned for formatted text.")

# Reference to the cancellation flag from ui.py
# This will be set by the UI when the cancel button is clicked
conversion_cancelled = None  # Will be a threading.Event object

# Flag to track if conversion is currently in progress
_conversion_in_progress = False

# Message returned to the caller (and by parsers) when a conversion is aborted.
_CANCELLED_MESSAGE = "Conversion cancelled."

# Download-file suffix per output format; unknown formats fall back to ".txt".
_OUTPUT_EXTENSIONS = {
    "Markdown": ".md",
    "JSON": ".json",
    "Text": ".txt",
    "Document Tags": ".doctags",
}

# Chunk sizes used so cancellation can be polled between chunks.
_COPY_CHUNK_BYTES = 1024 * 1024  # 1MB chunks when copying the input file
_WRITE_CHUNK_CHARS = 10000  # characters per write of the output file


def set_cancellation_flag(flag):
    """Set the reference to the cancellation flag from ui.py.

    Args:
        flag: Object exposing ``is_set()`` (a ``threading.Event`` according
            to the module comment); polled by ``check_cancellation``.
    """
    global conversion_cancelled
    conversion_cancelled = flag


def is_conversion_in_progress():
    """Return True while ``convert_file`` is executing."""
    return _conversion_in_progress


def check_cancellation():
    """Return True if a cancellation flag is installed and currently set."""
    if conversion_cancelled and conversion_cancelled.is_set():
        logging.info("Cancellation detected in check_cancellation")
        return True
    return False


def safe_delete_file(file_path):
    """Delete ``file_path`` if it exists; never raises.

    A falsy path is ignored, a file that is already gone is ignored, and any
    other failure is logged instead of propagated so that cleanup code cannot
    mask the real result of a conversion.
    """
    if not file_path:
        return
    try:
        os.unlink(file_path)
    except FileNotFoundError:
        # Already removed (or never created) - nothing to clean up.
        pass
    except Exception as e:
        logging.error(f"Error cleaning up temp file {file_path}: {e}")


def _copy_to_temp_input(source_path):
    """Copy ``source_path`` into a new temporary file with the same suffix.

    The copy gets an auto-generated temp name, so the parser never sees the
    original file name.

    Returns:
        The temp file path, or None if cancellation was requested mid-copy.
        On cancellation or error the partial temp file is removed here, after
        its handle has been closed.
    """
    temp_path = None
    completed = False
    try:
        suffix = Path(source_path).suffix
        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as temp_file:
            temp_path = temp_file.name
            with open(source_path, 'rb') as original:
                while True:
                    # Poll before every chunk so large files stay cancellable.
                    if check_cancellation():
                        logging.info("Cancellation detected during file copy")
                        return None
                    chunk = original.read(_COPY_CHUNK_BYTES)
                    if not chunk:
                        break
                    temp_file.write(chunk)
        completed = True
        return temp_path
    finally:
        if not completed:
            safe_delete_file(temp_path)


def _write_download_file(content, ext):
    """Write ``content`` to a new UTF-8 temporary file with suffix ``ext``.

    Returns:
        The file path, or None if cancellation was requested mid-write.
        On cancellation or error the partial file is removed here, after its
        handle has been closed; a successfully written file is never deleted.
    """
    out_path = None
    completed = False
    try:
        with tempfile.NamedTemporaryFile(mode="w", suffix=ext, delete=False, encoding="utf-8") as tmp:
            out_path = tmp.name
            for i in range(0, len(content), _WRITE_CHUNK_CHARS):
                if check_cancellation():
                    logging.info("Cancellation detected during output file writing")
                    return None
                tmp.write(content[i:i + _WRITE_CHUNK_CHARS])
        completed = True
        return out_path
    finally:
        if not completed:
            safe_delete_file(out_path)


def convert_file(file_path, parser_name, ocr_method_name, output_format):
    """
    Convert a file using the specified parser and OCR method.

    Args:
        file_path: Path to the file
        parser_name: Name of the parser to use
        ocr_method_name: Name of the OCR method to use
        output_format: Output format (Markdown, JSON, Text, Document Tags)

    Returns:
        tuple: (content, download_file_path). On failure or cancellation the
        first element is a human-readable message ("Please upload a file.",
        "Conversion cancelled.", or "Error...: <details>") and the second
        element is None.
    """
    global _conversion_in_progress

    # Set the conversion in progress flag
    _conversion_in_progress = True

    # Temporary copy of the input; removed in the ``finally`` below on every path.
    temp_input = None

    try:
        if not file_path:
            return "Please upload a file.", None

        if check_cancellation():
            logging.info("Cancellation detected at start of convert_file")
            return _CANCELLED_MESSAGE, None

        # Work on a temporary copy of the input.
        try:
            temp_input = _copy_to_temp_input(file_path)
        except Exception as e:
            return f"Error creating temporary file: {e}", None
        if temp_input is None:
            return _CANCELLED_MESSAGE, None

        if check_cancellation():
            logging.info("Cancellation detected after file preparation")
            return _CANCELLED_MESSAGE, None

        try:
            # Use the parser factory to parse the document
            start = time.time()
            content = ParserFactory.parse_document(
                file_path=temp_input,
                parser_name=parser_name,
                ocr_method_name=ocr_method_name,
                output_format=output_format.lower(),
                cancellation_flag=conversion_cancelled,  # lets parsers poll the flag
            )

            # Parsers signal cancellation by returning the cancellation message.
            if content == _CANCELLED_MESSAGE:
                logging.info("Parser reported cancellation")
                return content, None

            duration = time.time() - start
            logging.info(f"Processed in {duration:.2f} seconds.")

            if check_cancellation():
                logging.info("Cancellation detected after processing")
                return _CANCELLED_MESSAGE, None

            # Process LaTeX content for GOT-OCR formatted text
            if (
                parser_name == "GOT-OCR (jpg,png only)"
                and ocr_method_name == "Formatted Text"
                and HAS_GEMINI_CONVERTER
            ):
                logging.info("Converting LaTeX output to Markdown using Gemini API")
                start_convert = time.time()

                if check_cancellation():
                    logging.info("Cancellation detected before LaTeX conversion")
                    return _CANCELLED_MESSAGE, None

                try:
                    markdown_content = convert_latex_to_markdown(content)
                    if markdown_content:
                        content = markdown_content
                        logging.info(f"LaTeX conversion completed in {time.time() - start_convert:.2f} seconds")
                    else:
                        logging.warning("LaTeX to Markdown conversion failed, using raw LaTeX output")
                except Exception as e:
                    # Best effort: continue with the original content on error.
                    logging.error(f"Error converting LaTeX to Markdown: {e}")

                if check_cancellation():
                    logging.info("Cancellation detected after LaTeX conversion")
                    return _CANCELLED_MESSAGE, None
        except Exception as e:
            return f"Error: {e}", None

        # Determine the file extension based on the output format
        ext = _OUTPUT_EXTENSIONS.get(output_format, ".txt")

        if check_cancellation():
            logging.info("Cancellation detected before output file creation")
            return _CANCELLED_MESSAGE, None

        # Create a temporary file for download
        try:
            tmp_path = _write_download_file(content, ext)
        except Exception as e:
            return f"Error: {e}", None
        if tmp_path is None:
            return _CANCELLED_MESSAGE, None

        # From here on the download file belongs to the caller: it is not
        # deleted even if cancellation is requested after this point, so the
        # returned path always refers to an existing file.
        return content, tmp_path
    finally:
        # Always remove the temporary input copy and clear the progress flag.
        safe_delete_file(temp_input)
        _conversion_in_progress = False