Spaces:
Running
on
Zero
Running
on
Zero
from typing import Optional, Dict, Any, Union | |
from pathlib import Path | |
import threading | |
import logging | |
import time | |
from src.parsers.parser_interface import DocumentParser | |
from src.parsers.parser_registry import ParserRegistry | |
class ParserFactory: | |
"""Factory for creating parser instances.""" | |
def create_parser(cls, parser_name: str) -> Optional[DocumentParser]: | |
""" | |
Create a parser instance. | |
Args: | |
parser_name: Name of the parser to create | |
Returns: | |
An instance of the requested parser or None if not found | |
""" | |
parser_class = ParserRegistry.get_parser_class(parser_name) | |
if not parser_class: | |
return None | |
return parser_class() | |
def parse_document(cls, | |
file_path: Union[str, Path], | |
parser_name: str, | |
ocr_method_name: str, | |
output_format: str = "markdown", | |
cancellation_flag: Optional[threading.Event] = None, | |
**kwargs) -> str: | |
""" | |
Parse a document using the specified parser and OCR method. | |
Args: | |
file_path: Path to the document | |
parser_name: Name of the parser to use | |
ocr_method_name: Display name of the OCR method to use | |
output_format: Output format (markdown, json, text, document_tags) | |
cancellation_flag: Optional flag to check for cancellation | |
**kwargs: Additional parser-specific options | |
Returns: | |
str: The parsed content | |
""" | |
# Helper function to check cancellation | |
def check_cancellation(): | |
if cancellation_flag and cancellation_flag.is_set(): | |
logging.info("Cancellation detected in parser_factory") | |
return True | |
return False | |
# Check for cancellation immediately | |
if check_cancellation(): | |
return "Conversion cancelled." | |
parser = cls.create_parser(parser_name) | |
if not parser: | |
raise ValueError(f"Unknown parser: {parser_name}") | |
# Get the internal OCR method ID | |
ocr_method_id = ParserRegistry.get_ocr_method_id(parser_name, ocr_method_name) | |
if not ocr_method_id: | |
raise ValueError(f"Unknown OCR method: {ocr_method_name} for parser {parser_name}") | |
# Check for cancellation again before starting the parsing | |
if check_cancellation(): | |
return "Conversion cancelled." | |
# Add a function to check cancellation that parsers can call | |
def should_check_cancellation(): | |
"""Function that parsers can call to check if they should check cancellation""" | |
# No need to sleep here - this just returns whether cancellation should be checked | |
return True | |
# Parse the document, passing the cancellation flag and helper functions | |
kwargs['cancellation_flag'] = cancellation_flag | |
kwargs['check_cancellation'] = check_cancellation | |
kwargs['should_check_cancellation'] = should_check_cancellation | |
kwargs['output_format'] = output_format | |
# Parse the document | |
result = parser.parse(file_path, ocr_method=ocr_method_id, **kwargs) | |
# Check one more time after parsing completes | |
if check_cancellation(): | |
return "Conversion cancelled." | |
return result |