AnseMin committed on
Commit
dbdd7c8
·
1 Parent(s): 5b7f920

Initial Implementation of Markitdown. Implemented:

Browse files

- Basic python usage
- Image integration using OpenAI
- Plugin disabled by default

.env.example ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # API keys for various services
2
+ GOOGLE_API_KEY=your_google_api_key_here
3
+ OPENAI_API_KEY=your_openai_api_key_here
README.md CHANGED
@@ -11,6 +11,71 @@ startup_script: setup.sh
11
  pinned: false
12
  ---
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  # Markit: Document to Markdown Converter
15
 
16
  [![Hugging Face Space](https://img.shields.io/badge/🤗%20Hugging%20Face-Space-blue)](https://huggingface.co/spaces/Ansemin101/Markit)
 
11
  pinned: false
12
  ---
13
 
14
+ # Document to Markdown Converter
15
+
16
+ A Hugging Face Space that converts various document formats to Markdown, now with MarkItDown integration!
17
+
18
+ ## Features
19
+
20
+ - Convert PDFs, Office documents, images, and more to Markdown
21
+ - Multiple parser options:
22
+ - MarkItDown: For comprehensive document conversion
23
+ - GOT-OCR: For image-based OCR with LaTeX support
24
+ - Gemini Flash: For AI-powered text extraction
25
+ - Download converted documents as Markdown files
26
+ - Clean, responsive UI
27
+
28
+ ## Using MarkItDown
29
+
30
+ This app integrates [Microsoft's MarkItDown](https://github.com/microsoft/markitdown) library, which supports a wide range of file formats:
31
+
32
+ - PDF
33
+ - PowerPoint (PPTX)
34
+ - Word (DOCX)
35
+ - Excel (XLSX)
36
+ - Images (JPG, PNG)
37
+ - Audio files (with transcription)
38
+ - HTML
39
+ - Text-based formats (CSV, JSON, XML)
40
+ - ZIP files
41
+ - YouTube URLs
42
+ - EPubs
43
+ - And more!
44
+
45
+ ## Environment Variables
46
+
47
+ You can enhance the functionality by setting these environment variables:
48
+
49
+ - `OPENAI_API_KEY`: Enables AI-based image descriptions in MarkItDown
50
+ - `GOOGLE_API_KEY`: Used for Gemini Flash parser and LaTeX to Markdown conversion
51
+
52
+ ## Usage
53
+
54
+ 1. Select a file to upload
55
+ 2. Choose "MarkItDown" as the parser
56
+ 3. Select "Standard Conversion"
57
+ 4. Click "Convert"
58
+ 5. View the Markdown output and download the converted file
59
+
60
+ ## Local Development
61
+
62
+ 1. Clone the repository
63
+ 2. Create a `.env` file based on `.env.example`
64
+ 3. Install dependencies:
65
+ ```
66
+ pip install -r requirements.txt
67
+ ```
68
+ 4. Run the application:
69
+ ```
70
+ python app.py
71
+ ```
72
+
73
+ ## Credits
74
+
75
+ - [MarkItDown](https://github.com/microsoft/markitdown) by Microsoft
76
+ - [GOT-OCR](https://github.com/stepfun-ai/GOT-OCR-2.0) for image-based OCR
77
+ - [Gradio](https://gradio.app/) for the UI framework
78
+
79
  # Markit: Document to Markdown Converter
80
 
81
  [![Hugging Face Space](https://img.shields.io/badge/🤗%20Hugging%20Face-Space-blue)](https://huggingface.co/spaces/Ansemin101/Markit)
app.py CHANGED
@@ -64,6 +64,19 @@ except ImportError:
64
  print("WARNING: NumPy not installed. Installing NumPy 1.26.3...")
65
  subprocess.run([sys.executable, "-m", "pip", "install", "-q", "numpy==1.26.3"], check=False)
66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  # Try to load environment variables from .env file
68
  try:
69
  from dotenv import load_dotenv
@@ -72,16 +85,23 @@ try:
72
  except ImportError:
73
  print("python-dotenv not installed, skipping .env file loading")
74
 
75
- # Load Gemini API key from environment variable
76
  gemini_api_key = os.getenv("GOOGLE_API_KEY")
 
77
 
78
- # Check if API key is available and print a message if not
79
  if not gemini_api_key:
80
  print("Warning: GOOGLE_API_KEY environment variable not found. Gemini Flash parser and LaTeX to Markdown conversion may not work.")
81
  else:
82
  print(f"Found Gemini API key: {gemini_api_key[:5]}...{gemini_api_key[-5:] if len(gemini_api_key) > 10 else ''}")
83
  print("Gemini API will be used for LaTeX to Markdown conversion when using GOT-OCR with Formatted Text mode")
84
 
 
 
 
 
 
 
85
  # Add the current directory to the Python path
86
  sys.path.append(current_dir)
87
 
 
64
  print("WARNING: NumPy not installed. Installing NumPy 1.26.3...")
65
  subprocess.run([sys.executable, "-m", "pip", "install", "-q", "numpy==1.26.3"], check=False)
66
 
67
# Make sure MarkItDown is importable; install it on the fly as a last resort.
try:
    from markitdown import MarkItDown
    print("MarkItDown is installed")
except ImportError:
    print("WARNING: MarkItDown not installed. Installing...")
    # check=False keeps startup going even if pip fails; the exit code is
    # reported below instead of being silently dropped.
    pip_result = subprocess.run(
        [sys.executable, "-m", "pip", "install", "-q", "markitdown[all]"],
        check=False,
    )

    # Packages installed while the interpreter is running may be invisible to
    # the import system until its finder caches are invalidated.
    import importlib
    importlib.invalidate_caches()

    try:
        from markitdown import MarkItDown
        print("MarkItDown installed successfully")
    except ImportError:
        print(f"ERROR: Failed to install MarkItDown (pip exit code {pip_result.returncode})")
79
+
80
  # Try to load environment variables from .env file
81
  try:
82
  from dotenv import load_dotenv
 
85
  except ImportError:
86
  print("python-dotenv not installed, skipping .env file loading")
87
 
88
def _redact_key(key: str) -> str:
    """Return a log-safe placeholder for *key* that exposes none of its characters."""
    return f"<redacted, {len(key)} chars>"


# Load API keys from environment variables
gemini_api_key = os.getenv("GOOGLE_API_KEY")
openai_api_key = os.getenv("OPENAI_API_KEY")

# Report which keys were found. Only a redacted placeholder is printed so that
# no part of a secret ends up in the process output.
if not gemini_api_key:
    print("Warning: GOOGLE_API_KEY environment variable not found. Gemini Flash parser and LaTeX to Markdown conversion may not work.")
else:
    print(f"Found Gemini API key: {_redact_key(gemini_api_key)}")
    print("Gemini API will be used for LaTeX to Markdown conversion when using GOT-OCR with Formatted Text mode")

if not openai_api_key:
    print("Warning: OPENAI_API_KEY environment variable not found. LLM-based image description in MarkItDown may not work.")
else:
    print(f"Found OpenAI API key: {_redact_key(openai_api_key)}")
    print("OpenAI API will be available for LLM-based image descriptions in MarkItDown")
104
+
105
  # Add the current directory to the Python path
106
  sys.path.append(current_dir)
107
 
requirements.txt CHANGED
@@ -23,4 +23,8 @@ torchvision
23
  git+https://github.com/huggingface/transformers.git@main
24
  accelerate
25
  verovio # Added missing dependency
26
- huggingface_hub[cli]>=0.19.0
 
 
 
 
 
23
  git+https://github.com/huggingface/transformers.git@main
24
  accelerate
25
  verovio # Added missing dependency
26
+ huggingface_hub[cli]>=0.19.0
27
+
28
+ # MarkItDown and its dependencies
29
+ markitdown[all]
30
+ openai>=1.1.0 # For LLM image description support
setup.sh CHANGED
@@ -29,6 +29,7 @@ echo "NumPy installed successfully"
29
  echo "Installing Python dependencies..."
30
  pip install -q -U pillow opencv-python
31
  pip install -q -U google-genai
 
32
  # pip install -q -U latex2markdown - removed, now using Gemini API for LaTeX conversion
33
  echo "Python dependencies installed successfully"
34
 
@@ -45,6 +46,11 @@ echo "Installing spaces module for ZeroGPU support..."
45
  pip install -q -U spaces
46
  echo "Spaces module installed successfully"
47
 
 
 
 
 
 
48
  # Install the project in development mode only if setup.py or pyproject.toml exists
49
  if [ -f "setup.py" ] || [ -f "pyproject.toml" ]; then
50
  echo "Installing project in development mode..."
 
29
  echo "Installing Python dependencies..."
30
  pip install -q -U pillow opencv-python
31
  pip install -q -U google-genai
32
+ pip install -q -U 'openai>=1.1.0' # For LLM image description support (quoted so the shell does not treat '>' as a redirect)
33
  # pip install -q -U latex2markdown - removed, now using Gemini API for LaTeX conversion
34
  echo "Python dependencies installed successfully"
35
 
 
46
  pip install -q -U spaces
47
  echo "Spaces module installed successfully"
48
 
49
+ # Install markitdown with all optional dependencies
50
+ echo "Installing MarkItDown with all dependencies..."
51
+ pip install -q -U 'markitdown[all]'
52
+ echo "MarkItDown installed successfully"
53
+
54
  # Install the project in development mode only if setup.py or pyproject.toml exists
55
  if [ -f "setup.py" ] || [ -f "pyproject.toml" ]; then
56
  echo "Installing project in development mode..."
src/main.py CHANGED
@@ -1,9 +1,8 @@
1
  import parsers # Import all parsers to ensure they're registered
2
-
3
  from src.ui.ui import launch_ui
4
 
5
-
6
  def main():
 
7
  launch_ui(
8
  server_name="0.0.0.0",
9
  server_port=7860,
 
1
  import parsers # Import all parsers to ensure they're registered
 
2
  from src.ui.ui import launch_ui
3
 
 
4
  def main():
5
+ # Launch the UI
6
  launch_ui(
7
  server_name="0.0.0.0",
8
  server_port=7860,
src/parsers/__init__.py CHANGED
@@ -4,6 +4,13 @@
4
  from src.parsers.gemini_flash_parser import GeminiFlashParser
5
  from src.parsers.got_ocr_parser import GotOcrParser
6
 
 
 
 
 
 
 
 
7
  # You can add new parsers here in the future
8
 
9
  # This file makes the parsers directory a Python package
 
4
  from src.parsers.gemini_flash_parser import GeminiFlashParser
5
  from src.parsers.got_ocr_parser import GotOcrParser
6
 
7
# Optional MarkItDown parser. It is imported after the other parsers; the
# original note says import order matters for which parser becomes the default
# (NOTE(review): confirm against ParserRegistry).
try:
    from src.parsers.markitdown_parser import MarkItDownParser
except ImportError as import_error:
    # Best effort: the package stays importable without this parser.
    print(f"Error importing MarkItDown parser: {import_error}")
else:
    print("MarkItDown parser imported successfully")
13
+
14
  # You can add new parsers here in the future
15
 
16
  # This file makes the parsers directory a Python package
src/parsers/markitdown_parser.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ from pathlib import Path
4
+ from typing import Dict, List, Optional, Any, Union
5
+ import io
6
+
7
+ # Import the parser interface and registry
8
+ from src.parsers.parser_interface import DocumentParser
9
+ from src.parsers.parser_registry import ParserRegistry
10
+
11
# Check for MarkItDown availability
try:
    from markitdown import MarkItDown
    HAS_MARKITDOWN = True
except ImportError:
    MarkItDown = None
    HAS_MARKITDOWN = False

# The OpenAI client is only needed for the optional LLM image descriptions,
# so a missing 'openai' package must not disable plain MarkItDown conversion.
try:
    from openai import OpenAI
except ImportError:
    OpenAI = None

# Configure logging
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

if not HAS_MARKITDOWN:
    # Use the module logger rather than logging.warning(): the module-level
    # logging functions implicitly configure the root logger on first use.
    logger.warning("MarkItDown package not installed. Please install with 'pip install markitdown[all]'")


class MarkItDownParser(DocumentParser):
    """
    Parser implementation using MarkItDown for converting various file formats to Markdown.

    The converter is created once in ``__init__``. When ``OPENAI_API_KEY`` is set
    and the ``openai`` package is importable, it is created with an OpenAI client
    and the ``gpt-4o`` model; otherwise it is created without an LLM client.
    If creation fails, ``markdown_instance`` stays ``None`` and ``parse`` returns
    an error string instead of raising.
    """

    def __init__(self):
        self.markdown_instance = None
        if not HAS_MARKITDOWN:
            return
        try:
            self.markdown_instance = self._build_converter()
        except Exception as e:
            # Best effort: a broken converter must not prevent the parser
            # object from being created; parse() reports the problem later.
            logger.exception("Error initializing MarkItDown: %s", e)
            self.markdown_instance = None

    @staticmethod
    def _build_converter():
        """Create the MarkItDown instance, with an OpenAI client when one is usable."""
        openai_api_key = os.getenv("OPENAI_API_KEY")
        if openai_api_key and OpenAI is not None:
            converter = MarkItDown(
                enable_plugins=False,
                llm_client=OpenAI(),
                llm_model="gpt-4o",
            )
            logger.info("MarkItDown initialized with OpenAI support for image descriptions")
            return converter

        if openai_api_key:
            logger.warning(
                "OPENAI_API_KEY is set but the 'openai' package is not installed; "
                "image descriptions are disabled"
            )
        converter = MarkItDown(enable_plugins=False)
        logger.info("MarkItDown initialized without OpenAI support")
        return converter

    def parse(self, file_path: Union[str, Path], ocr_method: Optional[str] = None, **kwargs) -> str:
        """
        Parse a document and return its content as Markdown.

        Args:
            file_path: Path to the document
            ocr_method: OCR method to use (not used in this parser)
            **kwargs: Additional options; ``check_cancellation`` may be a
                zero-argument callable returning True when the conversion
                should be abandoned.

        Returns:
            str: Markdown representation of the document, ``"Conversion cancelled."``
            when cancellation was requested, or a string starting with
            ``"Error:"`` when MarkItDown is unavailable or the conversion fails.
        """
        # Check if MarkItDown is available
        if not HAS_MARKITDOWN or self.markdown_instance is None:
            return "Error: MarkItDown is not available. Please install with 'pip install markitdown[all]'"

        # Fall back to "never cancelled" when the hook is missing or passed as None
        check_cancellation = kwargs.get('check_cancellation') or (lambda: False)

        # Check for cancellation before starting
        if check_cancellation():
            return "Conversion cancelled."

        try:
            # Pass a plain string so pathlib.Path inputs are handled uniformly
            result = self.markdown_instance.convert(str(file_path))

            # Check for cancellation after processing
            if check_cancellation():
                return "Conversion cancelled."

            return result.text_content
        except Exception as e:
            # Errors are returned as text rather than raised; the traceback
            # is kept in the log for debugging.
            logger.exception("Error converting file with MarkItDown: %s", e)
            return f"Error: {str(e)}"

    @classmethod
    def get_name(cls) -> str:
        """Return the display name of this parser."""
        return "MarkItDown"

    @classmethod
    def get_supported_ocr_methods(cls) -> List[Dict[str, Any]]:
        """Return the single conversion mode offered by this parser."""
        return [
            {
                "id": "standard",
                "name": "Standard Conversion",
                "default_params": {}
            }
        ]

    @classmethod
    def get_description(cls) -> str:
        """Return a one-line description of this parser."""
        return "MarkItDown parser for converting various file formats to Markdown"


# Register the parser with the registry if available
if HAS_MARKITDOWN:
    ParserRegistry.register(MarkItDownParser)
    logger.info("MarkItDown parser registered successfully")
else:
    logger.warning("Could not register MarkItDown parser: Package not installed")
src/ui/ui.py CHANGED
@@ -7,6 +7,15 @@ from pathlib import Path
7
  from src.core.converter import convert_file, set_cancellation_flag, is_conversion_in_progress
8
  from src.parsers.parser_registry import ParserRegistry
9
 
 
 
 
 
 
 
 
 
 
10
  # Configure logging
11
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
12
  logger = logging.getLogger(__name__)
@@ -158,8 +167,43 @@ def create_ui():
158
  margin-top: 15px;
159
  margin-bottom: 15px;
160
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
  """) as demo:
162
- # Remove the header
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
  # State to track if cancellation is requested
164
  cancel_requested = gr.State(False)
165
  # State to store the conversion thread
@@ -168,13 +212,15 @@ def create_ui():
168
  output_format_state = gr.State("Markdown")
169
 
170
  # File input first
171
- file_input = gr.File(label="Upload PDF", type="filepath")
172
 
173
  # Provider and OCR options below the file input
174
  with gr.Row(elem_classes=["provider-options-row"]):
175
  with gr.Column(scale=1):
176
  parser_names = ParserRegistry.get_parser_names()
177
- default_parser = parser_names[0] if parser_names else "PyPdfium"
 
 
178
 
179
  provider_dropdown = gr.Dropdown(
180
  label="Provider",
 
7
  from src.core.converter import convert_file, set_cancellation_flag, is_conversion_in_progress
8
  from src.parsers.parser_registry import ParserRegistry
9
 
10
# Configure logging before anything is logged. The module-level helpers
# (logging.info / logging.warning) configure the root logger implicitly when
# it has no handlers, which would turn this basicConfig() call into a no-op.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Import MarkItDown to check if it's available
try:
    from markitdown import MarkItDown
    HAS_MARKITDOWN = True
    logger.info("MarkItDown is available for use")
except ImportError:
    HAS_MARKITDOWN = False
    logger.warning("MarkItDown is not available")
 
167
  margin-top: 15px;
168
  margin-bottom: 15px;
169
  }
170
+
171
+ /* Style the app title */
172
+ .app-title {
173
+ text-align: center;
174
+ margin-bottom: 20px;
175
+ }
176
+
177
+ /* Info section */
178
+ .info-section {
179
+ background-color: #f8f9fa;
180
+ padding: 10px;
181
+ border-radius: 5px;
182
+ margin-bottom: 15px;
183
+ font-size: 14px;
184
+ }
185
  """) as demo:
186
+ # Add title and description
187
+ gr.HTML(
188
+ """
189
+ <div class="app-title">
190
+ <h1>Document to Markdown Converter</h1>
191
+ <p>Convert documents to markdown format using various parsers including MarkItDown</p>
192
+ </div>
193
+ """
194
+ )
195
+
196
+ # Add MarkItDown info block if it's available
197
+ if HAS_MARKITDOWN:
198
+ gr.HTML(
199
+ """
200
+ <div class="info-section">
201
+ <strong>MarkItDown is available!</strong> Use it to convert various file formats
202
+ including PDF, Office documents, images, and more to Markdown format.
203
+ </div>
204
+ """
205
+ )
206
+
207
  # State to track if cancellation is requested
208
  cancel_requested = gr.State(False)
209
  # State to store the conversion thread
 
212
  output_format_state = gr.State("Markdown")
213
 
214
  # File input first
215
+ file_input = gr.File(label="Upload Document", type="filepath")
216
 
217
  # Provider and OCR options below the file input
218
  with gr.Row(elem_classes=["provider-options-row"]):
219
  with gr.Column(scale=1):
220
  parser_names = ParserRegistry.get_parser_names()
221
+
222
+ # Make MarkItDown the default parser if available
223
+ default_parser = next((p for p in parser_names if p == "MarkItDown"), parser_names[0] if parser_names else "PyPdfium")
224
 
225
  provider_dropdown = gr.Dropdown(
226
  label="Provider",