Spaces:

feras-vbrl
/

pdf-to-markdown-converter

Running

App Files Files Community

pdf-to-markdown-converter / app.py

feras-vbrl

Upload 4 files

195dd9b verified 18 days ago

raw

history blame contribute delete

10.1 kB

	import streamlit as st
	from docling.document_converter import DocumentConverter
	import tempfile
	import os
	import logging
	import time
	from PIL import Image
	import zipfile
	import io

	# vLLM and docling_core imports for batch processing
	try:
	from vllm import LLM, SamplingParams
	from docling_core.types.doc import DoclingDocument
	from docling_core.types.doc.document import DocTagsDocument
	from pathlib import Path
	VLLM_AVAILABLE = True
	except ImportError:
	VLLM_AVAILABLE = False

	# Create necessary directories
	os.makedirs("img", exist_ok=True)
	os.makedirs("out", exist_ok=True)

	# Configure logging
	logging.basicConfig(level=logging.DEBUG)
	logger = logging.getLogger(__name__)

	# Custom CSS for better layout
	st.markdown("""
	<style>
	.stFileUploader {
	padding: 1rem;
	}

	button[data-testid="stFileUploaderButtonPrimary"] {
	background-color: #000660 !important;
	border: none !important;
	color: white !important;
	}

	.stButton button {
	background-color: #006666;
	border: none !important;
	color: white;
	padding: 0.5rem 2rem !important;
	}
	.stButton button:hover {
	background-color: #008080 !important;
	color: white !important;
	border-color: #008080 !important;
	}
	.upload-text {
	font-size: 1.2rem;
	margin-bottom: 1rem;
	}
	div[data-testid="stFileUploadDropzone"]:hover {
	border-color: #006666 !important;
	background-color: rgba(0, 102, 102, 0.05) !important;
	}
	</style>
	""", unsafe_allow_html=True)

	# Create tabs for different functionalities
	tab1, tab2 = st.tabs(["PDF to Markdown", "Batch Image Processing"])

	with tab1:
	st.title("PDF to Markdown Converter")

	# Initialize session state if it doesn't exist
	if 'converter' not in st.session_state:
	try:
	st.session_state.converter = DocumentConverter()
	logger.debug("Converter successfully created")
	except Exception as e:
	logger.error(f"Error creating converter: {str(e)}")
	st.error(f"Error creating converter: {str(e)}")
	st.stop()

	# Main upload area
	uploaded_file = st.file_uploader(
	"Upload your PDF file",
	type=['pdf'],
	key='pdf_uploader',
	help="Drag and drop or click to select a PDF file (max 200MB)"
	)

	# URL input area with spacing
	st.markdown("<br>", unsafe_allow_html=True)
	url = st.text_input("Or enter a PDF URL")

	# Unified convert button
	convert_clicked = st.button("Convert to Markdown", type="primary")

	# Process either uploaded file or URL
	if convert_clicked:
	if uploaded_file is not None:
	try:
	with st.spinner('Converting file...'):
	with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
	tmp_file.write(uploaded_file.getvalue())
	tmp_path = tmp_file.name
	logger.debug(f"Temporary file created at: {tmp_path}")

	try:
	result = st.session_state.converter.convert(tmp_path)
	markdown_text = result.document.export_to_markdown()

	output_filename = os.path.splitext(uploaded_file.name)[0] + '.md'

	st.success("Conversion completed!")
	st.download_button(
	label="Download Markdown file",
	data=markdown_text,
	file_name=output_filename,
	mime="text/markdown"
	)

	except Exception as e:
	logger.error(f"Error converting file: {str(e)}")
	st.error(f"Error converting file: {str(e)}")

	finally:
	if os.path.exists(tmp_path):
	os.unlink(tmp_path)
	logger.debug("Temporary file deleted")

	except Exception as e:
	logger.error(f"Error processing file: {str(e)}")
	st.error(f"Error processing file: {str(e)}")

	elif url:
	try:
	with st.spinner('Converting from URL...'):
	logger.debug(f"Converting from URL: {url}")
	result = st.session_state.converter.convert(url)
	markdown_text = result.document.export_to_markdown()

	output_filename = url.split('/')[-1].split('.')[0] + '.md'

	st.success("Conversion completed!")
	st.download_button(
	label="Download Markdown file",
	data=markdown_text,
	file_name=output_filename,
	mime="text/markdown"
	)

	except Exception as e:
	logger.error(f"Error converting from URL: {str(e)}")
	st.error(f"Error converting from URL: {str(e)}")
	else:
	st.warning("Please upload a file or enter a URL first")

	# Batch processing tab
	with tab2:
	st.title("Batch Image Processing with vLLM")

	if not VLLM_AVAILABLE:
	st.warning("vLLM and docling_core are required for batch processing. Please install them with: pip install vllm docling_core")
	else:
	st.write("This feature uses vLLM to process multiple images and convert them to Markdown.")

	# Ensure directories exist
	img_dir = "img"
	out_dir = "out"
	os.makedirs(img_dir, exist_ok=True)
	os.makedirs(out_dir, exist_ok=True)

	st.info(f"Images will be processed from the '{img_dir}' directory and results will be saved to the '{out_dir}' directory.")

	# Model configuration
	MODEL_PATH = st.text_input("Model Path", value="ds4sd/SmolDocling-256M-preview")
	PROMPT_TEXT = st.text_area("Prompt Text", value="Convert page to Docling.")

	# File uploader for multiple images
	uploaded_images = st.file_uploader(
	"Upload image files",
	type=['png', 'jpg', 'jpeg'],
	accept_multiple_files=True,
	key='image_uploader',
	help="Drag and drop or click to select image files"
	)

	# Process button
	process_clicked = st.button("Process Images", type="primary", key="process_button")

	if process_clicked and uploaded_images:
	try:
	with st.spinner('Processing images...'):
	# Initialize LLM
	llm = LLM(model=MODEL_PATH, limit_mm_per_prompt={"image": 1})

	sampling_params = SamplingParams(
	temperature=0.0,
	max_tokens=8192
	)

	chat_template = f"<\|im_start\|>User:<image>{PROMPT_TEXT}<end_of_utterance>\nAssistant:"

	start_time = time.time()

	# Create a ZIP file in memory to store all outputs
	zip_buffer = io.BytesIO()
	with zipfile.ZipFile(zip_buffer, 'w') as zip_file:

	progress_bar = st.progress(0)
	status_text = st.empty()

	for idx, img_file in enumerate(uploaded_images):
	img_name = img_file.name
	status_text.text(f"Processing {img_name} ({idx+1}/{len(uploaded_images)})")

	# Open image
	image = Image.open(img_file).convert("RGB")

	# Process with vLLM
	llm_input = {"prompt": chat_template, "multi_modal_data": {"image": image}}
	output = llm.generate([llm_input], sampling_params=sampling_params)[0]

	doctags = output.outputs[0].text
	img_fn = os.path.splitext(img_name)[0]

	# Add doctags to zip
	zip_file.writestr(f"{img_fn}.dt", doctags)

	# Convert to Docling Document
	doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])
	doc = DoclingDocument(name=img_fn)
	doc.load_from_doctags(doctags_doc)

	# Export as markdown and add to zip
	md_content = doc.export_to_markdown()
	zip_file.writestr(f"{img_fn}.md", md_content)

	# Update progress
	progress_bar.progress((idx + 1) / len(uploaded_images))

	total_time = time.time() - start_time

	# Offer the ZIP file for download
	st.success(f"Processing completed in {total_time:.2f} seconds!")

	zip_buffer.seek(0)
	st.download_button(
	label="Download All Results",
	data=zip_buffer,
	file_name="processed_images.zip",
	mime="application/zip"
	)

	except Exception as e:
	logger.error(f"Error in batch processing: {str(e)}")
	st.error(f"Error in batch processing: {str(e)}")

	elif process_clicked:
	st.warning("Please upload at least one image file")