File size: 9,148 Bytes
dda982a
 
 
 
 
 
 
 
 
 
b3a5734
dda982a
5b7f920
 
 
 
 
 
 
 
dda982a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5b7f920
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dda982a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
import tempfile
import logging
import time
import os
from pathlib import Path

# Project-local imports (absolute, rooted at the `src` package)
from src.core.parser_factory import ParserFactory

# Import all parsers to ensure they're registered
from src import parsers

# Optional dependency: the LaTeX to Markdown converter. When the import fails,
# HAS_GEMINI_CONVERTER is False and convert_file() skips the conversion step,
# returning the parser output unchanged.
try:
    from src.core.latex_to_markdown_converter import convert_latex_to_markdown
    HAS_GEMINI_CONVERTER = True
except ImportError:
    HAS_GEMINI_CONVERTER = False
    logging.warning("LaTeX to Markdown converter not available. Raw LaTeX will be returned for formatted text.")

# Reference to the cancellation flag from ui.py.
# It stays None until set_cancellation_flag() installs a flag; the UI is
# expected to set that flag when the cancel button is clicked, and
# check_cancellation() polls it through .is_set().
conversion_cancelled = None  # Will be a threading.Event object
# True while convert_file() is executing; convert_file() resets it to False
# in its `finally` block.
_conversion_in_progress = False

def set_cancellation_flag(flag):
    """Install the cancellation flag that this module polls.

    Args:
        flag: Object stored in the module-level ``conversion_cancelled``
            variable. ``check_cancellation`` calls ``is_set()`` on it, so it
            is expected to behave like a ``threading.Event``.
    """
    global conversion_cancelled
    conversion_cancelled = flag

def is_conversion_in_progress():
    """Return the current value of the module's in-progress flag.

    The flag is set to True on entry to ``convert_file`` and cleared again
    when that call finishes.
    """
    # Reading a module-level name needs no ``global`` declaration.
    return _conversion_in_progress

def check_cancellation():
    """Report whether the installed cancellation flag is currently set.

    Returns:
        bool: True when a flag has been installed and its ``is_set()`` is
        truthy (an info message is logged in that case); False otherwise,
        including when no flag has been installed yet.
    """
    flag = conversion_cancelled
    if not flag or not flag.is_set():
        return False

    logging.info("Cancellation detected in check_cancellation")
    return True

def safe_delete_file(file_path):
    """Remove ``file_path`` if it exists, logging instead of raising.

    Args:
        file_path: Path of the file to remove. Falsy values (``None``, ``""``)
            and paths that do not exist are ignored.

    Any exception raised while removing the file is logged as an error and
    swallowed, so this is safe to call from cleanup code.
    """
    if not file_path or not os.path.exists(file_path):
        return

    try:
        os.remove(file_path)
    except Exception as e:
        logging.error(f"Error cleaning up temp file {file_path}: {e}")
# Message returned to the caller when a conversion is cancelled. A parser
# returning this exact string is also treated as having been cancelled.
_CANCELLED_MESSAGE = "Conversion cancelled."

# Download-file extension for each supported output format label.
# Unknown labels fall back to ".txt".
_OUTPUT_EXTENSIONS = {
    "Markdown": ".md",
    "JSON": ".json",
    "Text": ".txt",
    "Document Tags": ".doctags",
}


def _copy_to_temp_input(source_path, chunk_size=1024 * 1024):
    """Copy ``source_path`` into a new temp file that keeps its suffix.

    The copy is done in ``chunk_size``-byte chunks, polling for cancellation
    before each read.

    Returns:
        The temp file path, or None if cancellation was requested mid-copy.

    Raises:
        Any exception raised while creating, reading or writing the files.

    On cancellation or error the partial copy is removed, and only after its
    handle has been closed (an open file cannot be unlinked on Windows).
    """
    temp_path = None
    completed = False
    try:
        suffix = Path(source_path).suffix
        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as temp_file:
            temp_path = temp_file.name
            with open(source_path, 'rb') as original:
                while True:
                    if check_cancellation():
                        logging.info("Cancellation detected during file copy")
                        return None

                    chunk = original.read(chunk_size)
                    if not chunk:
                        break
                    temp_file.write(chunk)
        completed = True
        return temp_path
    finally:
        # Runs after the ``with`` block has closed the handle.
        if not completed:
            safe_delete_file(temp_path)


def _parse_content(input_path, parser_name, ocr_method_name, output_format):
    """Parse ``input_path`` and optionally convert LaTeX output to Markdown.

    Returns:
        The converted content, or ``_CANCELLED_MESSAGE`` when the parser
        reported cancellation or the cancellation flag was found set.

    Raises:
        Any exception raised by ``ParserFactory.parse_document`` (or by
        ``output_format.lower()``); LaTeX conversion errors are logged and
        the raw parser output is kept instead.
    """
    start = time.time()
    content = ParserFactory.parse_document(
        file_path=input_path,
        parser_name=parser_name,
        ocr_method_name=ocr_method_name,
        output_format=output_format.lower(),
        cancellation_flag=conversion_cancelled,  # Pass the flag to parsers
    )

    if content == _CANCELLED_MESSAGE:
        logging.info("Parser reported cancellation")
        return _CANCELLED_MESSAGE

    duration = time.time() - start
    logging.info(f"Processed in {duration:.2f} seconds.")

    if check_cancellation():
        logging.info("Cancellation detected after processing")
        return _CANCELLED_MESSAGE

    # Only GOT-OCR formatted-text output is post-processed, and only when the
    # optional converter was importable.
    needs_latex_conversion = (
        parser_name == "GOT-OCR (jpg,png only)"
        and ocr_method_name == "Formatted Text"
        and HAS_GEMINI_CONVERTER
    )
    if not needs_latex_conversion:
        return content

    logging.info("Converting LaTeX output to Markdown using Gemini API")
    start_convert = time.time()

    if check_cancellation():
        logging.info("Cancellation detected before LaTeX conversion")
        return _CANCELLED_MESSAGE

    try:
        markdown_content = convert_latex_to_markdown(content)
    except Exception as e:
        # Continue with the original content on error
        logging.error(f"Error converting LaTeX to Markdown: {str(e)}")
    else:
        if markdown_content:
            content = markdown_content
            logging.info(f"LaTeX conversion completed in {time.time() - start_convert:.2f} seconds")
        else:
            logging.warning("LaTeX to Markdown conversion failed, using raw LaTeX output")

    if check_cancellation():
        logging.info("Cancellation detected after LaTeX conversion")
        return _CANCELLED_MESSAGE

    return content


def _write_output_file(content, suffix, chunk_size=10000):
    """Write ``content`` to a new UTF-8 temp file for download.

    The text is written in ``chunk_size``-character slices, polling for
    cancellation before each slice.

    Returns:
        The output file path, or None if cancellation was requested mid-write.

    Raises:
        Any exception raised while creating or writing the file.

    On cancellation or error the partial file is removed after its handle has
    been closed, so a returned path always refers to a complete file.
    """
    out_path = None
    completed = False
    try:
        with tempfile.NamedTemporaryFile(mode="w", suffix=suffix, delete=False, encoding="utf-8") as tmp:
            out_path = tmp.name
            for offset in range(0, len(content), chunk_size):
                if check_cancellation():
                    logging.info("Cancellation detected during output file writing")
                    return None

                tmp.write(content[offset:offset + chunk_size])
        completed = True
        return out_path
    finally:
        # Runs after the ``with`` block has closed the handle.
        if not completed:
            safe_delete_file(out_path)


def convert_file(file_path, parser_name, ocr_method_name, output_format):
    """
    Convert a file using the specified parser and OCR method.

    The input is first copied to a temporary file, parsed through
    ``ParserFactory``, optionally post-processed from LaTeX to Markdown, and
    the result is written to a temporary download file. Cancellation is
    polled between every stage. The module-level in-progress flag is True for
    the duration of the call.

    Args:
        file_path: Path to the file
        parser_name: Name of the parser to use
        ocr_method_name: Name of the OCR method to use
        output_format: Output format (Markdown, JSON, Text, Document Tags)

    Returns:
        tuple: (content, download_file_path) on success. On failure or
        cancellation the first element is a human-readable message
        ("Please upload a file.", "Conversion cancelled.", or an
        "Error..." string) and the second element is None.
    """
    global _conversion_in_progress

    _conversion_in_progress = True

    # Temporary copy of the input; removed in ``finally`` on every exit path.
    temp_input = None

    try:
        if not file_path:
            return "Please upload a file.", None

        if check_cancellation():
            logging.info("Cancellation detected at start of convert_file")
            return _CANCELLED_MESSAGE, None

        # Work on a temporary copy so parsers see a generated (English)
        # filename rather than the original upload name.
        try:
            temp_input = _copy_to_temp_input(file_path)
        except Exception as e:
            return f"Error creating temporary file: {e}", None
        if temp_input is None:
            return _CANCELLED_MESSAGE, None

        if check_cancellation():
            logging.info("Cancellation detected after file preparation")
            return _CANCELLED_MESSAGE, None

        try:
            content = _parse_content(temp_input, parser_name, ocr_method_name, output_format)
        except Exception as e:
            return f"Error: {e}", None
        if content == _CANCELLED_MESSAGE:
            return _CANCELLED_MESSAGE, None

        # Determine the file extension based on the output format
        ext = _OUTPUT_EXTENSIONS.get(output_format, ".txt")

        if check_cancellation():
            logging.info("Cancellation detected before output file creation")
            return _CANCELLED_MESSAGE, None

        try:
            tmp_path = _write_output_file(content, ext)
        except Exception as e:
            return f"Error: {e}", None
        if tmp_path is None:
            return _CANCELLED_MESSAGE, None

        return content, tmp_path
    finally:
        # Single cleanup point: the input copy is never needed after this
        # call, and the output file is either returned intact or was already
        # removed by _write_output_file.
        safe_delete_file(temp_input)

        # Always clear the conversion in progress flag when done
        _conversion_in_progress = False