# Hugging Face Space: feras-vbrl/pdf-to-markdown-converter
# Status: Running
# File size: 10,057 bytes

import streamlit as st
from docling.document_converter import DocumentConverter
import tempfile
import os
import logging
import time
from PIL import Image
import zipfile
import io

# Optional batch-processing stack: vLLM plus the docling_core document types.
# VLLM_AVAILABLE records whether every one of these imports succeeded, so the
# rest of the script can check the flag instead of re-trying the imports.
try:
    from vllm import LLM, SamplingParams
    from docling_core.types.doc import DoclingDocument
    from docling_core.types.doc.document import DocTagsDocument
    from pathlib import Path
except ImportError:
    VLLM_AVAILABLE = False
else:
    VLLM_AVAILABLE = True

# Make sure the working directories are present (no error if they already exist).
for _dirname in ("img", "out"):
    os.makedirs(_dirname, exist_ok=True)

# Root logging at DEBUG level, plus a logger named after this module.
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

# Page-wide CSS overrides: padding for the file uploader, colours for the
# uploader button and regular buttons (including their hover state), and a
# hover highlight for the upload drop zone.
_PAGE_STYLES = """
    <style>    
        .stFileUploader {
            padding: 1rem;
        }
        
        button[data-testid="stFileUploaderButtonPrimary"] {
            background-color: #000660 !important;
            border: none !important;
            color: white !important;
        }

        .stButton button {
            background-color: #006666;
            border: none !important;
            color: white;
            padding: 0.5rem 2rem !important;
        }
        .stButton button:hover {
            background-color: #008080 !important;
            color: white !important;
            border-color: #008080 !important;
        }
        .upload-text {
            font-size: 1.2rem;
            margin-bottom: 1rem;
        }
        div[data-testid="stFileUploadDropzone"]:hover {
            border-color: #006666 !important;
            background-color: rgba(0, 102, 102, 0.05) !important;
        }
    </style>
"""

# unsafe_allow_html is needed so the <style> tag is emitted as HTML, not text.
st.markdown(_PAGE_STYLES, unsafe_allow_html=True)

def _markdown_filename_from_url(source_url):
    """Build the download file name for a document converted from a URL.

    Takes the last segment of the URL's path (query string and fragment are
    not part of the path, so they are ignored), replaces its extension with
    ``.md`` and falls back to ``converted.md`` when the path has no usable
    segment (for example a bare host name).

    Args:
        source_url: The URL string entered by the user.

    Returns:
        A file name ending in ``.md``.
    """
    # Function-scope import so this block does not rely on the file's import header.
    from urllib.parse import urlsplit

    url_path = urlsplit(source_url).path.rstrip('/')
    stem = os.path.splitext(os.path.basename(url_path))[0]
    return (stem or "converted") + '.md'


# Create tabs for different functionalities
tab1, tab2 = st.tabs(["PDF to Markdown", "Batch Image Processing"])

# Every widget of the PDF converter is created inside the tab's `with` block;
# anything created outside of it is rendered below the tab bar and stays
# visible no matter which tab is selected.
with tab1:
    st.title("PDF to Markdown Converter")

    # Create the converter once per session and reuse it across reruns
    if 'converter' not in st.session_state:
        try:
            st.session_state.converter = DocumentConverter()
            logger.debug("Converter successfully created")
        except Exception as e:
            logger.exception(f"Error creating converter: {str(e)}")
            st.error(f"Error creating converter: {str(e)}")
            # Nothing below can work without a converter, so end this script run
            st.stop()

    # Main upload area
    uploaded_file = st.file_uploader(
        "Upload your PDF file",
        type=['pdf'],
        key='pdf_uploader',
        help="Drag and drop or click to select a PDF file (max 200MB)"
    )

    # URL input area with spacing
    st.markdown("<br>", unsafe_allow_html=True)
    # Strip stray whitespace from copy/paste so "  " does not count as a URL
    url = st.text_input("Or enter a PDF URL").strip()

    # Unified convert button
    convert_clicked = st.button("Convert to Markdown", type="primary")

    # Process either uploaded file or URL (an uploaded file takes precedence)
    if convert_clicked:
        if uploaded_file is not None:
            tmp_path = None
            try:
                with st.spinner('Converting file...'):
                    # Write the upload to disk and CLOSE the handle before
                    # converting: leaving the `with` block flushes the buffered
                    # bytes, and an open handle would block re-opening the file
                    # on Windows.
                    with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
                        tmp_path = tmp_file.name
                        tmp_file.write(uploaded_file.getvalue())
                    logger.debug(f"Temporary file created at: {tmp_path}")

                    try:
                        result = st.session_state.converter.convert(tmp_path)
                        markdown_text = result.document.export_to_markdown()

                        output_filename = os.path.splitext(uploaded_file.name)[0] + '.md'

                        st.success("Conversion completed!")
                        st.download_button(
                            label="Download Markdown file",
                            data=markdown_text,
                            file_name=output_filename,
                            mime="text/markdown"
                        )

                    except Exception as e:
                        # logger.exception keeps the traceback in the log
                        logger.exception(f"Error converting file: {str(e)}")
                        st.error(f"Error converting file: {str(e)}")

            except Exception as e:
                # Failures outside the conversion itself (e.g. writing the temp file)
                logger.exception(f"Error processing file: {str(e)}")
                st.error(f"Error processing file: {str(e)}")

            finally:
                # delete=False means the file is ours to remove, on every path
                if tmp_path is not None and os.path.exists(tmp_path):
                    os.unlink(tmp_path)
                    logger.debug("Temporary file deleted")

        elif url:
            try:
                with st.spinner('Converting from URL...'):
                    logger.debug(f"Converting from URL: {url}")
                    result = st.session_state.converter.convert(url)
                    markdown_text = result.document.export_to_markdown()

                    output_filename = _markdown_filename_from_url(url)

                    st.success("Conversion completed!")
                    st.download_button(
                        label="Download Markdown file",
                        data=markdown_text,
                        file_name=output_filename,
                        mime="text/markdown"
                    )

            except Exception as e:
                logger.exception(f"Error converting from URL: {str(e)}")
                st.error(f"Error converting from URL: {str(e)}")
        else:
            st.warning("Please upload a file or enter a URL first")

# Batch processing tab: runs each uploaded image through a vLLM-served model,
# converts the generated DocTags to Markdown and offers everything as one ZIP.
with tab2:
    st.title("Batch Image Processing with vLLM")

    if not VLLM_AVAILABLE:
        st.warning("vLLM and docling_core are required for batch processing. Please install them with: pip install vllm docling_core")
    else:
        st.write("This feature uses vLLM to process multiple images and convert them to Markdown.")

        # Ensure directories exist
        img_dir = "img"
        out_dir = "out"
        os.makedirs(img_dir, exist_ok=True)
        os.makedirs(out_dir, exist_ok=True)

        # NOTE(review): the code below reads images from the uploader and writes
        # results into an in-memory ZIP; nothing here reads from img_dir or
        # writes to out_dir. Confirm whether this message should be reworded.
        st.info(f"Images will be processed from the '{img_dir}' directory and results will be saved to the '{out_dir}' directory.")

        # Model configuration
        MODEL_PATH = st.text_input("Model Path", value="ds4sd/SmolDocling-256M-preview")
        PROMPT_TEXT = st.text_area("Prompt Text", value="Convert page to Docling.")

        # File uploader for multiple images
        uploaded_images = st.file_uploader(
            "Upload image files",
            type=['png', 'jpg', 'jpeg'],
            accept_multiple_files=True,
            key='image_uploader',
            help="Drag and drop or click to select image files"
        )

        # Process button
        process_clicked = st.button("Process Images", type="primary", key="process_button")

        if process_clicked and uploaded_images:
            try:
                with st.spinner('Processing images...'):
                    # Reuse the engine across button clicks instead of building a
                    # new one per click (same per-session caching as the PDF
                    # converter). The cache entry is (model path, engine) and is
                    # rebuilt only when the model path changes.
                    llm = None
                    if 'vllm_engine' in st.session_state:
                        cached_path, cached_llm = st.session_state['vllm_engine']
                        if cached_path == MODEL_PATH:
                            llm = cached_llm
                        else:
                            # Drop our references to the old engine before
                            # loading a different model.
                            del cached_llm
                            del st.session_state['vllm_engine']

                    if llm is None:
                        llm = LLM(model=MODEL_PATH, limit_mm_per_prompt={"image": 1})
                        st.session_state['vllm_engine'] = (MODEL_PATH, llm)

                    sampling_params = SamplingParams(
                        temperature=0.0,
                        max_tokens=8192
                    )

                    chat_template = f"<|im_start|>User:<image>{PROMPT_TEXT}<end_of_utterance>\nAssistant:"

                    start_time = time.time()
                    image_count = len(uploaded_images)

                    # Entry stems already written to the ZIP, so that e.g.
                    # "scan.png" and "scan.jpg" do not produce duplicate entries.
                    used_stems = set()

                    # Create a ZIP file in memory to store all outputs
                    zip_buffer = io.BytesIO()
                    with zipfile.ZipFile(zip_buffer, 'w') as zip_file:

                        progress_bar = st.progress(0)
                        status_text = st.empty()

                        for idx, img_file in enumerate(uploaded_images):
                            img_name = img_file.name
                            status_text.text(f"Processing {img_name} ({idx+1}/{image_count})")

                            # Open image
                            image = Image.open(img_file).convert("RGB")

                            # Process with vLLM (one prompt, so take the first result)
                            llm_input = {"prompt": chat_template, "multi_modal_data": {"image": image}}
                            output = llm.generate([llm_input], sampling_params=sampling_params)[0]

                            doctags = output.outputs[0].text

                            # Base name for this image's ZIP entries; a numeric
                            # suffix is appended if the stem was already used.
                            base_stem = os.path.splitext(img_name)[0]
                            img_fn = base_stem
                            suffix = 1
                            while img_fn in used_stems:
                                img_fn = f"{base_stem}_{suffix}"
                                suffix += 1
                            used_stems.add(img_fn)

                            # Add doctags to zip
                            zip_file.writestr(f"{img_fn}.dt", doctags)

                            # Convert to Docling Document
                            doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])
                            doc = DoclingDocument(name=img_fn)
                            doc.load_from_doctags(doctags_doc)

                            # Export as markdown and add to zip
                            md_content = doc.export_to_markdown()
                            zip_file.writestr(f"{img_fn}.md", md_content)

                            # Update progress
                            progress_bar.progress((idx + 1) / image_count)

                    total_time = time.time() - start_time

                    # Offer the ZIP file for download
                    st.success(f"Processing completed in {total_time:.2f} seconds!")

                    # The archive is complete once the ZipFile block has exited;
                    # hand the finished bytes to the download button.
                    st.download_button(
                        label="Download All Results",
                        data=zip_buffer.getvalue(),
                        file_name="processed_images.zip",
                        mime="application/zip"
                    )

            except Exception as e:
                # logger.exception keeps the traceback in the log
                logger.exception(f"Error in batch processing: {str(e)}")
                st.error(f"Error in batch processing: {str(e)}")

        elif process_clicked:
            st.warning("Please upload at least one image file")