Spaces:

Ansemin101
/

Markit_v2

Running on Zero

App Files Files Community

AnseMin committed on 29 days ago

Commit

dbdd7c8

1 Parent(s): 5b7f920

Initial Implementation of Markitdown. Implemented:

Browse files

- Basic python usage
- Image integration using OpenAI
- Plugin disabled by default

Files changed (9) hide show

.env.example +3 -0
README.md +65 -0
app.py +22 -2
requirements.txt +5 -1
setup.sh +6 -0
src/main.py +1 -2
src/parsers/__init__.py +7 -0
src/parsers/markitdown_parser.py +111 -0
src/ui/ui.py +49 -3

.env.example ADDED Viewed

	@@ -0,0 +1,3 @@

+# API keys for various services
+GOOGLE_API_KEY=your_google_api_key_here
+OPENAI_API_KEY=your_openai_api_key_here

README.md CHANGED Viewed

@@ -11,6 +11,71 @@ startup_script: setup.sh
 pinned: false
 ---
 # Markit: Document to Markdown Converter
 [![Hugging Face Space](https://img.shields.io/badge/🤗%20Hugging%20Face-Space-blue)](https://huggingface.co/spaces/Ansemin101/Markit)

 pinned: false
 ---
+# Document to Markdown Converter
+A Hugging Face Space that converts various document formats to Markdown, now with MarkItDown integration!
+## Features
+- Convert PDFs, Office documents, images, and more to Markdown
+- Multiple parser options:
+  - MarkItDown: For comprehensive document conversion
+  - GOT-OCR: For image-based OCR with LaTeX support
+  - Gemini Flash: For AI-powered text extraction
+- Download converted documents as Markdown files
+- Clean, responsive UI
+## Using MarkItDown
+This app integrates [Microsoft's MarkItDown](https://github.com/microsoft/markitdown) library, which supports a wide range of file formats:
+- PDF
+- PowerPoint (PPTX)
+- Word (DOCX)
+- Excel (XLSX)
+- Images (JPG, PNG)
+- Audio files (with transcription)
+- HTML
+- Text-based formats (CSV, JSON, XML)
+- ZIP files
+- YouTube URLs
+- EPubs
+- And more!
+## Environment Variables
+You can enhance the functionality by setting these environment variables:
+- `OPENAI_API_KEY`: Enables AI-based image descriptions in MarkItDown
+- `GOOGLE_API_KEY`: Used for Gemini Flash parser and LaTeX to Markdown conversion
+## Usage
+1. Select a file to upload
+2. Choose "MarkItDown" as the parser
+3. Select "Standard Conversion"
+4. Click "Convert"
+5. View the Markdown output and download the converted file
+## Local Development
+1. Clone the repository
+2. Create a `.env` file based on `.env.example`
+3. Install dependencies:
+   ```
+   pip install -r requirements.txt
+   ```
+4. Run the application:
+   ```
+   python app.py
+   ```
+## Credits
+- [MarkItDown](https://github.com/microsoft/markitdown) by Microsoft
+- [GOT-OCR](https://github.com/stepfun-ai/GOT-OCR-2.0) for image-based OCR
+- [Gradio](https://gradio.app/) for the UI framework
 # Markit: Document to Markdown Converter
 [![Hugging Face Space](https://img.shields.io/badge/🤗%20Hugging%20Face-Space-blue)](https://huggingface.co/spaces/Ansemin101/Markit)

app.py CHANGED Viewed

@@ -64,6 +64,19 @@ except ImportError:
     print("WARNING: NumPy not installed. Installing NumPy 1.26.3...")
     subprocess.run([sys.executable, "-m", "pip", "install", "-q", "numpy==1.26.3"], check=False)
 # Try to load environment variables from .env file
 try:
     from dotenv import load_dotenv
@@ -72,16 +85,23 @@ try:
 except ImportError:
     print("python-dotenv not installed, skipping .env file loading")
-# Load Gemini API key from environment variable
 gemini_api_key = os.getenv("GOOGLE_API_KEY")
-# Check if API key is available and print a message if not
 if not gemini_api_key:
     print("Warning: GOOGLE_API_KEY environment variable not found. Gemini Flash parser and LaTeX to Markdown conversion may not work.")
 else:
     print(f"Found Gemini API key: {gemini_api_key[:5]}...{gemini_api_key[-5:] if len(gemini_api_key) > 10 else ''}")
     print("Gemini API will be used for LaTeX to Markdown conversion when using GOT-OCR with Formatted Text mode")
 # Add the current directory to the Python path
 sys.path.append(current_dir)

     print("WARNING: NumPy not installed. Installing NumPy 1.26.3...")
     subprocess.run([sys.executable, "-m", "pip", "install", "-q", "numpy==1.26.3"], check=False)
+# Check if markitdown is installed; try a one-off install when it is missing
+try:
+    from markitdown import MarkItDown
+    print("MarkItDown is installed")
+except ImportError:
+    print("WARNING: MarkItDown not installed. Installing...")
+    # check=False: a failed install is reported by the retry import below rather than raising here
+    subprocess.run([sys.executable, "-m", "pip", "install", "-q", "markitdown[all]"], check=False)
+    # A package installed while the interpreter is running can be missed by the
+    # cached import finders, so refresh them before retrying the import.
+    import importlib
+    importlib.invalidate_caches()
+    try:
+        from markitdown import MarkItDown
+        print("MarkItDown installed successfully")
+    except ImportError:
+        print("ERROR: Failed to install MarkItDown")
 # Try to load environment variables from .env file
 try:
     from dotenv import load_dotenv
 except ImportError:
     print("python-dotenv not installed, skipping .env file loading")
+# Load API keys from environment variables
 gemini_api_key = os.getenv("GOOGLE_API_KEY")
+openai_api_key = os.getenv("OPENAI_API_KEY")
+# Check if API keys are available and print messages
 if not gemini_api_key:
     print("Warning: GOOGLE_API_KEY environment variable not found. Gemini Flash parser and LaTeX to Markdown conversion may not work.")
 else:
     print(f"Found Gemini API key: {gemini_api_key[:5]}...{gemini_api_key[-5:] if len(gemini_api_key) > 10 else ''}")
     print("Gemini API will be used for LaTeX to Markdown conversion when using GOT-OCR with Formatted Text mode")
+if not openai_api_key:
+    print("Warning: OPENAI_API_KEY environment variable not found. LLM-based image description in MarkItDown may not work.")
+else:
+    # Report presence only: echoing the first/last characters of the secret would copy
+    # part of the credential into the process output.
+    print("Found OpenAI API key")
+    print("OpenAI API will be available for LLM-based image descriptions in MarkItDown")
 # Add the current directory to the Python path
 sys.path.append(current_dir)

requirements.txt CHANGED Viewed

@@ -23,4 +23,8 @@ torchvision
 git+https://github.com/huggingface/transformers.git@main
 accelerate
 verovio  # Added missing dependency
-huggingface_hub[cli]>=0.19.0

 git+https://github.com/huggingface/transformers.git@main
 accelerate
 verovio  # Added missing dependency
+huggingface_hub[cli]>=0.19.0
+# MarkItDown and its dependencies
+markitdown[all]
+openai>=1.1.0  # For LLM image description support

setup.sh CHANGED Viewed

@@ -29,6 +29,7 @@ echo "NumPy installed successfully"
 echo "Installing Python dependencies..."
 pip install -q -U pillow opencv-python
 pip install -q -U google-genai
 # pip install -q -U latex2markdown - removed, now using Gemini API for LaTeX conversion
 echo "Python dependencies installed successfully"
@@ -45,6 +46,11 @@ echo "Installing spaces module for ZeroGPU support..."
 pip install -q -U spaces
 echo "Spaces module installed successfully"
 # Install the project in development mode only if setup.py or pyproject.toml exists
 if [ -f "setup.py" ] || [ -f "pyproject.toml" ]; then
     echo "Installing project in development mode..."

 echo "Installing Python dependencies..."
 pip install -q -U pillow opencv-python
 pip install -q -U google-genai
+pip install -q -U 'openai>=1.1.0'  # For LLM image description support (quoted so the shell does not treat '>' as a redirection)
 # pip install -q -U latex2markdown - removed, now using Gemini API for LaTeX conversion
 echo "Python dependencies installed successfully"
 pip install -q -U spaces
 echo "Spaces module installed successfully"
+# Install markitdown with all optional dependencies
+echo "Installing MarkItDown with all dependencies..."
+pip install -q -U 'markitdown[all]'
+echo "MarkItDown installed successfully"
 # Install the project in development mode only if setup.py or pyproject.toml exists
 if [ -f "setup.py" ] || [ -f "pyproject.toml" ]; then
     echo "Installing project in development mode..."

src/main.py CHANGED Viewed

@@ -1,9 +1,8 @@
 import parsers  # Import all parsers to ensure they're registered
 from src.ui.ui import launch_ui
 def main():
     launch_ui(
         server_name="0.0.0.0",
         server_port=7860,

 import parsers  # Import all parsers to ensure they're registered
 from src.ui.ui import launch_ui
 def main():
+    # Launch the UI
     launch_ui(
         server_name="0.0.0.0",
         server_port=7860,

src/parsers/__init__.py CHANGED Viewed

@@ -4,6 +4,13 @@
 from src.parsers.gemini_flash_parser import GeminiFlashParser
 from src.parsers.got_ocr_parser import GotOcrParser
 # You can add new parsers here in the future
 # This file makes the parsers directory a Python package

 from src.parsers.gemini_flash_parser import GeminiFlashParser
 from src.parsers.got_ocr_parser import GotOcrParser
+# Import MarkItDown parser if available - kept as the last parser import (original note: "so it's default"; TODO confirm that import order is what decides the default)
+try:
+    from src.parsers.markitdown_parser import MarkItDownParser
+except ImportError as import_error:
+    # A failed import is reported but does not stop the package from loading
+    print(f"Error importing MarkItDown parser: {import_error}")
+else:
+    print("MarkItDown parser imported successfully")
 # You can add new parsers here in the future
 # This file makes the parsers directory a Python package

src/parsers/markitdown_parser.py ADDED Viewed

	@@ -0,0 +1,111 @@

+import logging
+import os
+from pathlib import Path
+from typing import Dict, List, Optional, Any, Union
+import io
+# Import the parser interface and registry
+from src.parsers.parser_interface import DocumentParser
+from src.parsers.parser_registry import ParserRegistry
+# Configure logging first so the availability checks below can report through this module's logger
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
+# Check for MarkItDown availability
+try:
+    from markitdown import MarkItDown
+    HAS_MARKITDOWN = True
+except ImportError:
+    HAS_MARKITDOWN = False
+    logger.warning("MarkItDown package not installed. Please install with 'pip install markitdown[all]'")
+# The OpenAI client is optional (it is only used for LLM-based image descriptions), so a
+# missing 'openai' package must not mark MarkItDown itself as unavailable.
+try:
+    from openai import OpenAI
+except ImportError:
+    OpenAI = None
+    logger.warning("OpenAI package not installed. LLM-based image descriptions will not be available")
+class MarkItDownParser(DocumentParser):
+    """
+    Parser implementation using MarkItDown for converting various file formats to Markdown.
+    When the OPENAI_API_KEY environment variable is set, an OpenAI client is passed to
+    MarkItDown for LLM-based image descriptions; otherwise a plain instance is used.
+    """
+    def __init__(self):
+        # Stays None when MarkItDown is missing or could not be constructed;
+        # parse() then returns an error message instead of raising.
+        self.markdown_instance = None
+        if not HAS_MARKITDOWN:
+            return
+        # Check for OpenAI API key for LLM-based image descriptions
+        if os.getenv("OPENAI_API_KEY"):
+            try:
+                self.markdown_instance = MarkItDown(
+                    enable_plugins=False,
+                    llm_client=OpenAI(),
+                    llm_model="gpt-4o"
+                )
+                logger.info("MarkItDown initialized with OpenAI support for image descriptions")
+                return
+            except Exception as e:
+                # A broken OpenAI setup should only cost the image descriptions,
+                # not the whole parser: fall through to the plain instance below.
+                logger.warning(f"Could not enable OpenAI support for MarkItDown, continuing without it: {str(e)}")
+        try:
+            self.markdown_instance = MarkItDown(enable_plugins=False)
+            logger.info("MarkItDown initialized without OpenAI support")
+        except Exception as e:
+            logger.error(f"Error initializing MarkItDown: {str(e)}")
+            self.markdown_instance = None
+    def parse(self, file_path: Union[str, Path], ocr_method: Optional[str] = None, **kwargs) -> str:
+        """
+        Parse a document and return its content as Markdown.
+        Args:
+            file_path: Path to the document
+            ocr_method: OCR method to use (not used in this parser)
+            **kwargs: Additional options; ``check_cancellation`` may be a zero-argument
+                callable that returns True when the conversion should be abandoned
+        Returns:
+            str: Markdown representation of the document, "Conversion cancelled." when
+                cancellation was requested, or a message starting with "Error:" on failure
+        """
+        # Check if MarkItDown is available
+        if not HAS_MARKITDOWN or self.markdown_instance is None:
+            return "Error: MarkItDown is not available. Please install with 'pip install markitdown[all]'"
+        # Get cancellation check function from kwargs; an explicit None means "never cancelled"
+        check_cancellation = kwargs.get('check_cancellation') or (lambda: False)
+        # Check for cancellation before starting
+        if check_cancellation():
+            return "Conversion cancelled."
+        try:
+            # Convert the file using the standard instance
+            result = self.markdown_instance.convert(file_path)
+            # Check for cancellation after processing
+            if check_cancellation():
+                return "Conversion cancelled."
+            return result.text_content
+        except Exception as e:
+            # logger.exception records the traceback along with the message
+            logger.exception(f"Error converting file with MarkItDown: {str(e)}")
+            return f"Error: {str(e)}"
+    @classmethod
+    def get_name(cls) -> str:
+        """Return the parser name, "MarkItDown"."""
+        return "MarkItDown"
+    @classmethod
+    def get_supported_ocr_methods(cls) -> List[Dict[str, Any]]:
+        """Return the single supported conversion mode, "standard", with no default parameters."""
+        return [
+            {
+                "id": "standard",
+                "name": "Standard Conversion",
+                "default_params": {}
+            }
+        ]
+    @classmethod
+    def get_description(cls) -> str:
+        """Return a one-line description of this parser."""
+        return "MarkItDown parser for converting various file formats to Markdown"
+# Register the parser with the registry only when HAS_MARKITDOWN is set
+if not HAS_MARKITDOWN:
+    logger.warning("Could not register MarkItDown parser: Package not installed")
+else:
+    ParserRegistry.register(MarkItDownParser)
+    logger.info("MarkItDown parser registered successfully")

src/ui/ui.py CHANGED Viewed

@@ -7,6 +7,15 @@ from pathlib import Path
 from src.core.converter import convert_file, set_cancellation_flag, is_conversion_in_progress
 from src.parsers.parser_registry import ParserRegistry
 # Configure logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
@@ -158,8 +167,43 @@ def create_ui():
             margin-top: 15px;
             margin-bottom: 15px;
         }
     """) as demo:
-        # Remove the header
         # State to track if cancellation is requested
         cancel_requested = gr.State(False)
         # State to store the conversion thread
@@ -168,13 +212,15 @@ def create_ui():
         output_format_state = gr.State("Markdown")
         # File input first
-        file_input = gr.File(label="Upload PDF", type="filepath")
         # Provider and OCR options below the file input
         with gr.Row(elem_classes=["provider-options-row"]):
             with gr.Column(scale=1):
                 parser_names = ParserRegistry.get_parser_names()
-                default_parser = parser_names[0] if parser_names else "PyPdfium"
                 provider_dropdown = gr.Dropdown(
                     label="Provider",

 from src.core.converter import convert_file, set_cancellation_flag, is_conversion_in_progress
 from src.parsers.parser_registry import ParserRegistry
+# Import MarkItDown to check if it's available
+# NOTE: report through a named logger. The module-level logging.info()/logging.warning()
+# helpers call logging.basicConfig() implicitly when the root logger has no handlers,
+# which would turn the explicit logging.basicConfig(...) call below into a no-op.
+try:
+    from markitdown import MarkItDown
+    HAS_MARKITDOWN = True
+    logging.getLogger(__name__).info("MarkItDown is available for use")
+except ImportError:
+    HAS_MARKITDOWN = False
+    logging.getLogger(__name__).warning("MarkItDown is not available")
 # Configure logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
             margin-top: 15px;
             margin-bottom: 15px;
         }
+        /* Style the app title */
+        .app-title {
+            text-align: center;
+            margin-bottom: 20px;
+        }
+        /* Info section */
+        .info-section {
+            background-color: #f8f9fa;
+            padding: 10px;
+            border-radius: 5px;
+            margin-bottom: 15px;
+            font-size: 14px;
+        }
     """) as demo:
+        # Add title and description
+        gr.HTML(
+            """
+            <div class="app-title">
+                <h1>Document to Markdown Converter</h1>
+                <p>Convert documents to markdown format using various parsers including MarkItDown</p>
+            </div>
+            """
+        )
+        # Add MarkItDown info block if it's available
+        if HAS_MARKITDOWN:
+            gr.HTML(
+                """
+                <div class="info-section">
+                    <strong>MarkItDown is available!</strong> Use it to convert various file formats
+                    including PDF, Office documents, images, and more to Markdown format.
+                </div>
+                """
+            )
         # State to track if cancellation is requested
         cancel_requested = gr.State(False)
         # State to store the conversion thread
         output_format_state = gr.State("Markdown")
         # File input first
+        file_input = gr.File(label="Upload Document", type="filepath")
         # Provider and OCR options below the file input
         with gr.Row(elem_classes=["provider-options-row"]):
             with gr.Column(scale=1):
                 parser_names = ParserRegistry.get_parser_names()
+                # Make MarkItDown the default parser if available
+                default_parser = next((p for p in parser_names if p == "MarkItDown"), parser_names[0] if parser_names else "PyPdfium")
                 provider_dropdown = gr.Dropdown(
                     label="Provider",