"""Gradio app for comparing PDF text extractors side by side.

Each JSON file in the chosen directory is expected to map extractor names
(see ``EXTRACTORS``) to that extractor's output.  The PDF itself is read from
the ``pdf_plumber`` entry (base64 in ``media[0]["media_bytes"]``) and rendered
in the browser with PDF.js.  The reviewer's choice of best extractor is stored
next to the JSON file as ``<name>_best.txt``.
"""

import base64
import json
import os
import tempfile
from pathlib import Path

import gradio as gr

EXTRACTORS = ['pdf_plumber', 'py_pdf', 'docling', 'extractous', 'pypdfium2', 'pymupdf', 'pymupdf_llm']

PAGE_BREAK_MARKER = "\n<---page-break--->\n"

# PDF.js is loaded from the page head; the viewer script below also falls back
# to loading it dynamically if this tag is missing or slow.
_PDFJS_HEAD = """
<script src="https://unpkg.com/pdfjs-dist@3.11.174/build/pdf.min.js"></script>
"""

# NOTE(review): this markup is a minimal reconstruction. The two element ids
# are the ones the viewer script looks up (`pdf-container`, `pdf-fallback`);
# the inline styling is a placeholder and can be restyled freely.
_PDF_VIEWER_HTML = '''
<div id="pdf-viewer">
    <div id="pdf-container" style="max-height: 80vh; overflow-y: auto;"></div>
    <div id="pdf-fallback" style="display: flex; justify-content: center; align-items: center; min-height: 200px;">
        Click "Load PDFs" to start viewing documents.
    </div>
</div>
'''

_PDF_VIEWER_JS = """
function() {
    console.log("Setting up PDF.js viewer");

    const PDFJS_SRC = "https://unpkg.com/pdfjs-dist@3.11.174/build/pdf.min.js";
    const PDFJS_WORKER_SRC = "https://unpkg.com/pdfjs-dist@3.11.174/build/pdf.worker.min.js";

    // Point PDF.js at its worker script; returns false while PDF.js is absent.
    function configureWorker() {
        if (!window.pdfjsLib) {
            return false;
        }
        window.pdfjsLib.GlobalWorkerOptions.workerSrc = PDFJS_WORKER_SRC;
        return true;
    }

    if (configureWorker()) {
        console.log("PDF.js configured with worker");
    } else {
        console.warn("PDF.js not found in head, attempting to load dynamically");
        const pdfJsScript = document.createElement('script');
        pdfJsScript.src = PDFJS_SRC;
        pdfJsScript.onload = function() {
            configureWorker();
            console.log("PDF.js loaded dynamically");
        };
        document.head.appendChild(pdfJsScript);
    }

    // Hash of the data that was handled last; "" forces the next poll to
    // handle the data again (used by the prev/next buttons).
    const NO_PDF = "__no_pdf__";
    let currentPdfHash = "";
    // Incremented for every new render so that an older, still running
    // render can notice it has been superseded and stop touching the DOM.
    let renderToken = 0;
    // True once the viewer has shown (or tried to show) at least one PDF.
    let viewerUsed = false;

    // Render one page into a fresh canvas and append it to the container.
    async function renderPage(pdf, pageNumber, container, isCurrent) {
        try {
            const page = await pdf.getPage(pageNumber);

            const pageContainer = document.createElement('div');
            pageContainer.className = 'pdf-page';
            pageContainer.style.position = 'relative';
            pageContainer.style.margin = '10px auto';
            pageContainer.style.boxShadow = '0 2px 5px rgba(0,0,0,0.2)';

            const canvas = document.createElement('canvas');
            const context = canvas.getContext('2d');
            pageContainer.appendChild(canvas);

            // Scale the page to the container width (minus margins).
            const containerWidth = container.clientWidth - 30;
            const originalViewport = page.getViewport({ scale: 1 });
            const scale = containerWidth / originalViewport.width;
            const viewport = page.getViewport({ scale });

            canvas.width = viewport.width;
            canvas.height = viewport.height;

            await page.render({
                canvasContext: context,
                viewport: viewport
            }).promise;

            // A newer document may have taken over while this page rendered.
            if (!isCurrent()) {
                return false;
            }
            container.appendChild(pageContainer);
            return true;
        } catch (error) {
            console.error(`Error rendering page ${pageNumber}:`, error);
            return false;
        }
    }

    // Cheap change detector: hashes a prefix of the data plus its length.
    function hashData(str) {
        let hash = 0;
        if (str.length === 0) return hash;
        for (let i = 0; i < Math.min(str.length, 10000); i++) {
            const char = str.charCodeAt(i);
            hash = ((hash << 5) - hash) + char;
            hash = hash & hash;
        }
        // Include the length: PDFs with the same start can still differ.
        return `${hash}_${str.length}`;
    }

    // Display the PDF encoded in base64Data, or the fallback when empty.
    async function displayPdfFromBase64(base64Data) {
        const container = document.getElementById('pdf-container');
        const fallback = document.getElementById('pdf-fallback');
        if (!container || !fallback) {
            console.warn("PDF viewer elements not found");
            return;
        }

        if (!base64Data || base64Data.length < 100) {
            if (currentPdfHash === NO_PDF) {
                return;
            }
            console.log("No valid PDF data received");
            currentPdfHash = NO_PDF;
            renderToken++;  // stop any render that is still running
            container.innerHTML = '';
            if (viewerUsed) {
                // Replace a stale document or error from the previous file.
                fallback.textContent = 'No PDF available for this file.';
            }
            fallback.style.display = 'flex';
            return;
        }

        // Wait for PDF.js BEFORE recording the hash; otherwise the retry
        // would be skipped as "already displayed". Polling calls us again.
        if (!window.pdfjsLib) {
            console.warn("PDF.js not loaded yet, waiting...");
            fallback.textContent = 'Loading PDF viewer...';
            fallback.style.display = 'flex';
            return;
        }
        configureWorker();

        const dataHash = hashData(base64Data);
        if (dataHash === currentPdfHash) {
            return;
        }
        currentPdfHash = dataHash;
        viewerUsed = true;
        const token = ++renderToken;
        const isCurrent = () => token === renderToken;
        console.log("PDF changed, rendering new document");

        try {
            // Convert base64 to bytes.
            const binaryString = atob(base64Data);
            const bytes = new Uint8Array(binaryString.length);
            for (let i = 0; i < binaryString.length; i++) {
                bytes[i] = binaryString.charCodeAt(i);
            }

            container.innerHTML = '';
            fallback.style.display = 'none';

            const loadingIndicator = document.createElement('div');
            loadingIndicator.style.padding = '20px';
            loadingIndicator.style.textAlign = 'center';
            loadingIndicator.innerText = 'Loading PDF...';
            container.appendChild(loadingIndicator);

            const loadingTask = window.pdfjsLib.getDocument({ data: bytes });
            const pdf = await loadingTask.promise;
            if (!isCurrent()) {
                return;
            }

            container.innerHTML = '';
            console.log(`PDF loaded with ${pdf.numPages} pages`);

            // Render sequentially: pages are appended when they finish, so
            // rendering them concurrently could append them out of order.
            for (let i = 1; i <= pdf.numPages; i++) {
                if (!isCurrent()) {
                    console.log("Render superseded by a newer document");
                    return;
                }
                await renderPage(pdf, i, container, isCurrent);
            }

            if (isCurrent()) {
                console.log("All pages rendered");
                container.scrollTop = 0;
            }
        } catch (error) {
            if (!isCurrent()) {
                return;
            }
            console.error("Error loading PDF:", error);
            container.innerHTML = '';
            // textContent: the message is not trusted markup.
            fallback.textContent = `Error loading PDF: ${error.message || 'Unknown error'}`;
            fallback.style.display = 'flex';
            // The hash is kept on purpose: resetting it here would make the
            // poll retry the same broken PDF every second. Prev/next reset it.
        }
    }

    // Watch the hidden textbox that carries the base64 PDF data.
    function setupPdfListener() {
        const dataElement = document.getElementById('pdf_base64_data');
        if (!dataElement) {
            console.log("PDF data element not found, will retry");
            setTimeout(setupPdfListener, 1000);
            return;
        }

        const textarea = dataElement.querySelector('textarea, input');
        if (!textarea) {
            console.log("Textarea not found, will retry");
            setTimeout(setupPdfListener, 1000);
            return;
        }

        console.log("Found PDF data element, setting up listeners");

        const checkForPdf = () => displayPdfFromBase64(textarea.value);

        // Display initial data if available.
        if (textarea.value && textarea.value.length > 100) {
            checkForPdf();
        }

        // Use both an observer and polling for robustness.
        // 1. MutationObserver for DOM-level changes.
        const observer = new MutationObserver(() => checkForPdf());
        observer.observe(textarea, {
            attributes: true,
            characterData: true,
            subtree: true,
            childList: true
        });

        // 2. Polling, because programmatic value changes do not always
        //    produce DOM mutations. Empty values are passed through too so
        //    that a file without a PDF clears the previous document.
        setInterval(checkForPdf, 1000);

        // Prev/next force a refresh even if the hash would match.
        const prevButton = document.getElementById('prev_button');
        const nextButton = document.getElementById('next_button');

        if (prevButton) {
            prevButton.addEventListener('click', () => {
                console.log("Prev button clicked, forcing PDF refresh");
                currentPdfHash = "";
            });
        }

        if (nextButton) {
            nextButton.addEventListener('click', () => {
                console.log("Next button clicked, forcing PDF refresh");
                currentPdfHash = "";
            });
        }
    }

    // Start checking for PDF data once the page has had time to mount.
    setTimeout(setupPdfListener, 1000);

    // Keyboard shortcuts: left/right arrows trigger previous/next.
    document.addEventListener('keydown', function(event) {
        if (event.target.tagName === 'INPUT' || event.target.tagName === 'TEXTAREA') {
            return;
        }

        var buttonId = null;
        if (event.key === 'ArrowLeft') buttonId = 'prev_button';
        else if (event.key === 'ArrowRight') buttonId = 'next_button';

        if (buttonId) {
            var button = document.getElementById(buttonId);
            if (button) {
                event.preventDefault();
                button.click();
            }
        }
    });
}
"""


def add_page_breaks(text, page_offsets):
    """Insert a page-break marker into ``text`` at every offset.

    Args:
        text: The extracted document text.
        page_offsets: Character offsets at which each page ends. Falsy
            (``None`` or empty) means the text is returned unchanged.

    Returns:
        The text with ``PAGE_BREAK_MARKER`` inserted after each page, plus
        whatever text follows the last offset.
    """
    if not page_offsets:
        return text

    pieces = []
    last_offset = 0
    for offset in page_offsets:
        pieces.append(text[last_offset:offset])
        pieces.append(PAGE_BREAK_MARKER)
        last_offset = offset

    # Keep any text after the last recorded page boundary.
    if last_offset < len(text):
        pieces.append(text[last_offset:])

    return "".join(pieces)


class ExtractorComparer:
    """Holds the list of JSON result files and the currently loaded one."""

    def __init__(self):
        self.json_files = []           # absolute/relative paths, sorted
        self.current_index = 0         # index into json_files
        self.current_data = None       # parsed JSON of the current file
        self.temp_pdf_path = None      # temp copy of the current PDF, if any
        self.current_pdf_bytes = None  # raw bytes of the current PDF, if any

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _annotation_path(json_file):
        """Return the path of the ``_best.txt`` file belonging to json_file.

        NOTE(review): ``foo.json`` and ``foo.jsonl`` map to the same
        annotation file; kept as is for compatibility with existing files.
        """
        return os.path.splitext(json_file)[0] + "_best.txt"

    @staticmethod
    def _first_media_item(extractor_data):
        """Return ``extractor_data["media"][0]`` if it is a dict, else None."""
        if not isinstance(extractor_data, dict):
            return None
        media = extractor_data.get('media')
        if isinstance(media, list) and media and isinstance(media[0], dict):
            return media[0]
        return None

    def _discard_temp_pdf(self):
        """Delete the temp PDF of the previously shown file (best effort)."""
        if self.temp_pdf_path:
            try:
                os.remove(self.temp_pdf_path)
            except OSError:
                pass  # already gone or not removable; nothing useful to do
            self.temp_pdf_path = None

    def _decode_pdf_bytes(self):
        """Decode the PDF embedded in the ``pdf_plumber`` entry, or None."""
        if not isinstance(self.current_data, dict):
            return None
        media_item = self._first_media_item(self.current_data.get('pdf_plumber'))
        if not media_item:
            return None
        encoded = media_item.get('media_bytes')
        if not encoded:
            return None
        try:
            return base64.b64decode(encoded)
        except (ValueError, TypeError):
            # binascii.Error is a ValueError: malformed base64 means no PDF.
            return None

    # ------------------------------------------------------------------
    # Public API (used as Gradio callbacks)
    # ------------------------------------------------------------------
    def load_files(self, directory_path):
        """Collect all ``.json``/``.jsonl`` files from ``directory_path``.

        Returns:
            ``(file_progress, annotation_status)`` strings for the UI.
        """
        # Forget everything from a previously loaded directory first.
        self.json_files = []
        self.current_index = 0
        self.current_data = None
        try:
            # Sorted: os.listdir order is arbitrary, which would make the
            # prev/next order differ between runs.
            self.json_files = sorted(
                os.path.join(directory_path, filename)
                for filename in os.listdir(directory_path)
                if filename.endswith(('.json', '.jsonl'))
            )
        except (OSError, TypeError, ValueError) as e:
            return f"Error loading files: {str(e)}", "Error"

        if not self.json_files:
            return "No JSON files found", "No files loaded"
        return self.get_progress_info()

    def load_current_file(self):
        """Load the JSON file at ``current_index``.

        Returns:
            ``(base64_pdf_or_None, file_progress, annotation_status)``.
        """
        if not self.json_files:
            return None, "N/A", "N/A"

        # Drop the previous document up front so that a failed load can never
        # leave the old file's text or PDF attached to the new file name.
        self.current_data = None
        self.current_pdf_bytes = None
        self._discard_temp_pdf()

        try:
            with open(self.json_files[self.current_index], 'r', encoding='utf-8') as f:
                self.current_data = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers JSONDecodeError and UnicodeDecodeError.
            return None, f"Error loading file: {e}", "No annotation"

        file_progress, annotation_status = self.get_progress_info()

        pdf_bytes = self._decode_pdf_bytes()
        if not pdf_bytes:
            return None, file_progress, annotation_status
        self.current_pdf_bytes = pdf_bytes

        try:
            with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
                temp_file.write(pdf_bytes)
                self.temp_pdf_path = temp_file.name
        except OSError as e:
            return None, f"Error loading file: {e}", "No annotation"

        # Re-encode for the frontend (normalises whatever was in the JSON).
        base64_pdf = base64.b64encode(pdf_bytes).decode('utf-8')
        return base64_pdf, file_progress, annotation_status

    def get_progress_info(self):
        """Build the progress line and the annotation status of the file.

        Returns:
            ``(file_progress, annotation_status)`` strings for the UI.
        """
        if not self.json_files:
            return "No files loaded", "No annotation"

        current_file = self.json_files[self.current_index]
        filename = Path(current_file).name
        total = len(self.json_files)

        annotation_status = "Not annotated"
        try:
            with open(self._annotation_path(current_file), 'r', encoding='utf-8') as f:
                annotation_status = f"Best extractor: {f.read().strip()}"
        except (OSError, ValueError):
            pass  # missing or unreadable annotation counts as "Not annotated"

        annotated_count = sum(
            1 for json_file in self.json_files
            if os.path.exists(self._annotation_path(json_file))
        )

        file_progress = (
            f"File {self.current_index + 1} of {total}: {filename}"
            f" (Annotated: {annotated_count}/{total})"
        )
        return file_progress, annotation_status

    def get_extractor_text(self, extractor_name):
        """Return the text of ``extractor_name`` with page-break markers.

        Returns an empty string when no file is loaded or the extractor is
        absent, and a short message when the entry has no ``text`` field.
        """
        data = self.current_data
        if not isinstance(data, dict) or extractor_name not in data:
            return ""

        extractor_data = data[extractor_name]
        if not isinstance(extractor_data, dict) or 'text' not in extractor_data:
            return f"No text found for {extractor_name}"

        text = extractor_data['text']
        if text is None:
            text = ""
        elif not isinstance(text, str):
            text = str(text)

        # Walk media[0].metadata.pdf_metadata.page_offsets defensively: any
        # level may be missing or null in the extractor output.
        page_offsets = []
        media_item = self._first_media_item(extractor_data)
        if media_item:
            metadata = media_item.get('metadata')
            pdf_metadata = metadata.get('pdf_metadata') if isinstance(metadata, dict) else None
            offsets = pdf_metadata.get('page_offsets') if isinstance(pdf_metadata, dict) else None
            if isinstance(offsets, list):
                page_offsets = [o for o in offsets if isinstance(o, int)]

        return add_page_breaks(text, page_offsets)

    def next_pdf(self):
        """Advance to the next file (wrapping around) and load it."""
        if not self.json_files:
            return None, "N/A", "N/A"

        self.current_index = (self.current_index + 1) % len(self.json_files)
        return self.load_current_file()

    def prev_pdf(self):
        """Go back to the previous file (wrapping around) and load it."""
        if not self.json_files:
            return None, "N/A", "N/A"

        self.current_index = (self.current_index - 1) % len(self.json_files)
        return self.load_current_file()

    def set_best_extractor(self, extractor_name):
        """Record ``extractor_name`` as the best one for the current file.

        Returns:
            ``(file_progress, annotation_status)`` strings for the UI.
        """
        if not self.json_files or not self.current_data:
            return "N/A", "N/A"

        result_file = self._annotation_path(self.json_files[self.current_index])
        try:
            with open(result_file, 'w', encoding='utf-8') as f:
                f.write(extractor_name)
        except (OSError, TypeError) as e:
            return f"Error saving annotation: {e}", "No annotation"

        return self.get_progress_info()


def create_interface():
    """Build and return the Gradio Blocks app."""
    comparer = ExtractorComparer()

    # Custom CSS for basic font in text areas
    custom_css = """
    .extraction-text textarea {
        font-family: Arial, Helvetica, sans-serif !important;
        font-size: 14px !important;
        line-height: 1.5 !important;
    }
    """

    def load_directory(directory_path):
        """Scan the directory and, if it has files, load the first one.

        Done in a single handler so that "No JSON files found" and error
        messages from the scan are not overwritten by a follow-up step.
        """
        file_progress, annotation_status = comparer.load_files(directory_path)
        if not comparer.json_files:
            return None, file_progress, annotation_status
        return comparer.load_current_file()

    with gr.Blocks(
        title="PDF Extractor Comparer",
        theme="soft",
        css=custom_css,
        head=_PDFJS_HEAD,
    ) as demo:
        gr.Markdown("## PDF Extractor Comparer")

        with gr.Row():
            directory_input = gr.Textbox(
                label="Path to JSON Directory",
                placeholder="e.g., /path/to/your/json/files",
                value="extraction/truncated"
            )
            load_button = gr.Button("Load PDFs", variant="primary")

        # Main layout: PDF viewer on left, status and controls on right
        with gr.Row():
            # Left column: PDF viewer
            with gr.Column(scale=3):
                pdf_viewer_html = gr.HTML(
                    label="PDF Document",
                    value=_PDF_VIEWER_HTML
                )

                # Hidden component to store the Base64 PDF data
                pdf_data_hidden = gr.Textbox(visible=False, elem_id="pdf_base64_data")

            # Right column: Progress and controls
            with gr.Column(scale=1):
                file_progress_output = gr.Textbox(label="File Progress", interactive=False)
                annotation_status_output = gr.Textbox(label="Annotation Status", interactive=False)

                # Navigation
                with gr.Row():
                    prev_button = gr.Button("⬅️ Previous", elem_id="prev_button")
                    next_button = gr.Button("Next ➡️", elem_id="next_button")

                # Best extractor selection
                gr.Markdown("### Select Best Extractor")
                extractor_buttons = []
                for extractor in EXTRACTORS:
                    button = gr.Button(extractor, variant="secondary")
                    extractor_buttons.append(button)
                    # A hidden textbox carries the extractor name, which
                    # avoids late-binding problems with closures in a loop.
                    button.click(
                        comparer.set_best_extractor,
                        inputs=[gr.Textbox(value=extractor, visible=False)],
                        outputs=[file_progress_output, annotation_status_output]
                    )

        # Extractors section below the PDF
        gr.Markdown("### Extractor Comparison")

        # Extractor dropdowns
        with gr.Row():
            extractor1_dropdown = gr.Dropdown(
                choices=EXTRACTORS,
                label="Extractor 1",
                value=EXTRACTORS[0] if EXTRACTORS else None
            )
            extractor2_dropdown = gr.Dropdown(
                choices=EXTRACTORS,
                label="Extractor 2",
                value=EXTRACTORS[1] if len(EXTRACTORS) > 1 else EXTRACTORS[0] if EXTRACTORS else None
            )

        # Extractor text outputs with applied class for styling
        with gr.Row():
            extractor1_text = gr.Textbox(
                label="Extractor 1 Output",
                lines=15,
                elem_classes=["extraction-text"]
            )
            extractor2_text = gr.Textbox(
                label="Extractor 2 Output",
                lines=15,
                elem_classes=["extraction-text"]
            )

        def then_refresh_texts(event):
            """Chain a refresh of both extractor text boxes onto ``event``."""
            return event.then(
                comparer.get_extractor_text,
                inputs=[extractor1_dropdown],
                outputs=[extractor1_text]
            ).then(
                comparer.get_extractor_text,
                inputs=[extractor2_dropdown],
                outputs=[extractor2_text]
            )

        document_outputs = [pdf_data_hidden, file_progress_output, annotation_status_output]

        # Event handlers
        then_refresh_texts(
            load_button.click(load_directory, inputs=[directory_input], outputs=document_outputs)
        )
        then_refresh_texts(
            prev_button.click(comparer.prev_pdf, outputs=document_outputs)
        )
        then_refresh_texts(
            next_button.click(comparer.next_pdf, outputs=document_outputs)
        )

        extractor1_dropdown.change(
            comparer.get_extractor_text,
            inputs=[extractor1_dropdown],
            outputs=[extractor1_text]
        )

        extractor2_dropdown.change(
            comparer.get_extractor_text,
            inputs=[extractor2_dropdown],
            outputs=[extractor2_text]
        )

        # JavaScript for PDF handling
        demo.load(
            fn=None,
            js=_PDF_VIEWER_JS
        )

    return demo


if __name__ == "__main__":
    demo = create_interface()
    demo.launch()