Spaces:

HuggingFaceFW-Dev
/

PDF-Extraction-Comparisson

Running

File size: 27,553 Bytes

import gradio as gr
import os
import json
import base64
import tempfile
from pathlib import Path

# Extractor identifiers. Each name is used as a top-level key into the loaded
# JSON record, as the label of a "best extractor" button, and as a choice in
# the two comparison dropdowns.
EXTRACTORS = ['pdf_plumber', 'py_pdf', 'docling', 'extractous', 'pypdfium2', 'pymupdf', 'pymupdf_llm']

def add_page_breaks(text, page_offsets):
    """Return ``text`` with a page-break marker inserted at each offset.

    Every entry of ``page_offsets`` closes one segment: the slice running from
    the previous offset (0 for the first) up to that offset is emitted,
    followed by the marker. Whatever text remains after the last offset is
    appended without a marker. A falsy ``page_offsets`` returns ``text``
    unchanged.
    """
    if not page_offsets:
        return text

    marker = "\n<---page-break--->\n"
    ends = list(page_offsets)
    # Each segment starts where the previous one ended; the first starts at 0.
    starts = [0] + ends[:-1]
    pieces = [text[begin:end] + marker for begin, end in zip(starts, ends)]

    # Trailing text that lies beyond the final offset, if any.
    tail_start = ends[-1] if ends else 0
    if tail_start < len(text):
        pieces.append(text[tail_start:])

    return "".join(pieces)

class ExtractorComparer:
    """Step through extractor-output JSON files and record the preferred extractor.

    Each loaded JSON file is read as a mapping from extractor name to a record
    holding a ``text`` entry and an optional ``media`` list. The first media
    item may carry the base64-encoded PDF (``media_bytes``, taken from the
    ``pdf_plumber`` record) and ``metadata.pdf_metadata.page_offsets``.
    The extractor chosen for a file is stored next to it as ``<stem>_best.txt``.
    """

    def __init__(self):
        self.json_files = []           # paths of the .json/.jsonl files being browsed
        self.current_index = 0         # position of the displayed file in json_files
        self.current_data = None       # parsed JSON of the displayed file, or None
        self.temp_pdf_path = None      # temp copy of the displayed PDF, or None
        self.current_pdf_bytes = None  # decoded PDF bytes of the displayed file, or None

    @staticmethod
    def _annotation_path(json_path):
        """Return the path of the ``_best.txt`` annotation that belongs to ``json_path``."""
        return os.path.splitext(json_path)[0] + "_best.txt"

    @staticmethod
    def _first_media_item(extractor_data):
        """Return the first ``media`` entry of an extractor record, or ``{}`` if absent."""
        if not isinstance(extractor_data, dict):
            return {}
        media = extractor_data.get('media')
        if isinstance(media, list) and media and isinstance(media[0], dict):
            return media[0]
        return {}

    def _decode_pdf_bytes(self, data):
        """Return the PDF bytes embedded in the ``pdf_plumber`` record, or ``None``."""
        if not isinstance(data, dict):
            return None
        encoded = self._first_media_item(data.get('pdf_plumber')).get('media_bytes')
        if not encoded:
            return None
        try:
            return base64.b64decode(encoded)
        except (ValueError, TypeError):
            # Undecodable payload: treat the file as having no PDF.
            return None

    def _discard_temp_pdf(self):
        """Delete the temporary PDF of the previously shown file, if there is one."""
        if not self.temp_pdf_path:
            return
        try:
            os.remove(self.temp_pdf_path)
        except OSError:
            # Best effort: the file may already be gone.
            pass
        self.temp_pdf_path = None

    def load_files(self, directory_path):
        """Collect the ``.json``/``.jsonl`` files of ``directory_path``.

        Returns a ``(file_progress, annotation_status)`` pair of display
        strings. Files are kept in sorted name order so that positions are
        reproducible between runs (``os.listdir`` order is arbitrary).
        """
        self.json_files = []
        # Data of a previously loaded directory must not outlive the reload.
        self.current_data = None
        try:
            self.json_files = [
                os.path.join(directory_path, filename)
                for filename in sorted(os.listdir(directory_path))
                if filename.endswith(('.json', '.jsonl'))
            ]
        except (OSError, TypeError, ValueError) as e:
            self.json_files = []
            return f"Error loading files: {str(e)}", "Error"

        if not self.json_files:
            return "No JSON files found", "No files loaded"

        self.current_index = 0
        return self.get_progress_info()

    def load_current_file(self):
        """Parse the file at ``current_index`` and expose its embedded PDF.

        Returns ``(base64_pdf, file_progress, annotation_status)``;
        ``base64_pdf`` is ``None`` when the file has no decodable PDF or could
        not be loaded. With no files loaded, returns ``(None, "N/A", "N/A")``.
        """
        if not self.json_files:
            return None, "N/A", "N/A"

        # Forget the previous document first, so that a failed load can never
        # leave the old text/PDF attached to the new index.
        self.current_data = None
        self.current_pdf_bytes = None
        self._discard_temp_pdf()

        try:
            with open(self.json_files[self.current_index], 'r', encoding='utf-8') as f:
                self.current_data = json.load(f)

            pdf_bytes = self._decode_pdf_bytes(self.current_data)
            if not pdf_bytes:
                file_progress, annotation_status = self.get_progress_info()
                return None, file_progress, annotation_status

            self.current_pdf_bytes = pdf_bytes
            with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
                # Record the name before writing so a failed write is still cleaned up later.
                self.temp_pdf_path = temp_file.name
                temp_file.write(pdf_bytes)

            # The frontend receives the PDF as base64 text.
            base64_pdf = base64.b64encode(pdf_bytes).decode('utf-8')
            file_progress, annotation_status = self.get_progress_info()
            return base64_pdf, file_progress, annotation_status
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            return None, f"Error loading file: {e}", "No annotation"

    def get_progress_info(self):
        """Return ``(file_progress, annotation_status)`` for the current file.

        ``file_progress`` names the current file, its position and how many of
        the loaded files have a ``_best.txt`` annotation; ``annotation_status``
        reports the extractor stored for the current file, if any.
        """
        if not self.json_files:
            return "No files loaded", "No annotation"

        current_file = self.json_files[self.current_index]
        total = len(self.json_files)

        annotation_status = "Not annotated"
        best_extractor_file = self._annotation_path(current_file)
        if os.path.exists(best_extractor_file):
            try:
                with open(best_extractor_file, 'r', encoding='utf-8') as f:
                    annotation_status = f"Best extractor: {f.read().strip()}"
            except (OSError, ValueError):
                # Unreadable annotation: keep reporting "Not annotated".
                pass

        annotated_count = sum(
            1 for json_file in self.json_files
            if os.path.exists(self._annotation_path(json_file))
        )

        file_progress = (
            f"File {self.current_index + 1} of {total}: {Path(current_file).name}"
            f" (Annotated: {annotated_count}/{total})"
        )
        return file_progress, annotation_status

    def get_extractor_text(self, extractor_name):
        """Return the current file's text for ``extractor_name`` with page-break markers.

        Returns ``""`` when nothing is loaded or the extractor is absent, and a
        "No text found" message when its record has no ``text`` entry.
        """
        data = self.current_data
        if not data or not isinstance(data, dict) or extractor_name not in data:
            return ""

        extractor_data = data[extractor_name]
        if not isinstance(extractor_data, dict) or 'text' not in extractor_data:
            return f"No text found for {extractor_name}"

        # A JSON null text is shown as empty rather than breaking the slicing.
        text = extractor_data['text'] or ''

        # Walk metadata -> pdf_metadata -> page_offsets, tolerating missing or null levels.
        metadata = self._first_media_item(extractor_data).get('metadata')
        pdf_metadata = metadata.get('pdf_metadata') if isinstance(metadata, dict) else None
        page_offsets = pdf_metadata.get('page_offsets') if isinstance(pdf_metadata, dict) else None

        return add_page_breaks(text, page_offsets or [])

    def next_pdf(self):
        """Advance to the next file (wrapping around) and load it."""
        if not self.json_files:
            return None, "N/A", "N/A"

        self.current_index = (self.current_index + 1) % len(self.json_files)
        return self.load_current_file()

    def prev_pdf(self):
        """Step back to the previous file (wrapping around) and load it."""
        if not self.json_files:
            return None, "N/A", "N/A"

        self.current_index = (self.current_index - 1) % len(self.json_files)
        return self.load_current_file()

    def set_best_extractor(self, extractor_name):
        """Store ``extractor_name`` as the best extractor for the current file.

        Writes the name to the file's ``_best.txt`` companion and returns the
        refreshed ``(file_progress, annotation_status)`` pair. Returns
        ``("N/A", "N/A")`` when no file is currently loaded.
        """
        if not self.json_files or not self.current_data:
            return "N/A", "N/A"

        result_file = self._annotation_path(self.json_files[self.current_index])
        try:
            with open(result_file, 'w', encoding='utf-8') as f:
                f.write(extractor_name)
        except (OSError, TypeError) as e:
            return f"Error saving annotation: {e}", "No annotation"

        return self.get_progress_info()

def create_interface():
    """Build the Gradio Blocks app for comparing PDF extractor outputs.

    Layout: a directory box with a load button, a PDF viewer (a ``<div>``
    filled by PDF.js in the browser) next to progress/navigation/annotation
    controls, and two side-by-side extractor text panes.

    The PDF travels to the browser as base64 text in a hidden textbox; the
    JavaScript registered through ``demo.load`` polls that textbox and renders
    the document with PDF.js.

    Returns:
        The ``gr.Blocks`` instance (not yet launched).
    """
    comparer = ExtractorComparer()

    # NOTE(review): the version pin in the PDF.js URLs was unreadable in the
    # original source ("[email protected]"). 3.11.174 is used here because the
    # 3.x line still ships the classic build/pdf.min.js and
    # build/pdf.worker.min.js bundles that this code loads -- confirm the
    # intended version.
    pdfjs_base = "https://unpkg.com/pdfjs-dist@3.11.174/build"
    head_html = f'<script src="{pdfjs_base}/pdf.min.js"></script>'

    # Custom CSS for basic font in text areas
    custom_css = """
    .extraction-text textarea {
        font-family: Arial, Helvetica, sans-serif !important;
        font-size: 14px !important;
        line-height: 1.5 !important;
    }
    """

    # Browser-side viewer. Kept as a plain (non f-) string because the
    # JavaScript is full of braces; the CDN base is substituted afterwards.
    viewer_js = """
            function() {
                console.log("Setting up PDF.js viewer");
                
                const PDFJS_BASE = "__PDFJS_BASE__";
                
                // Configure PDF.js worker
                if (window.pdfjsLib) {
                    window.pdfjsLib.GlobalWorkerOptions.workerSrc = PDFJS_BASE + "/pdf.worker.min.js";
                    console.log("PDF.js configured with worker");
                } else {
                    console.warn("PDF.js not found in head, attempting to load dynamically");
                    // Fallback to load PDF.js dynamically if not in the head
                    const pdfJsScript = document.createElement('script');
                    pdfJsScript.src = PDFJS_BASE + "/pdf.min.js";
                    pdfJsScript.onload = function() {
                        window.pdfjsLib.GlobalWorkerOptions.workerSrc = PDFJS_BASE + "/pdf.worker.min.js";
                        console.log("PDF.js loaded dynamically");
                    };
                    document.head.appendChild(pdfJsScript);
                }
                
                // Marker stored in currentPdfHash while the "no PDF" state is shown
                const NO_PDF = "__no_pdf__";
                
                // To track when we should force a refresh
                let currentPdfHash = "";
                
                // Incremented for every new document (or cleared view) so that a
                // render still in flight can tell it has been superseded
                let renderGeneration = 0;
                
                // Function to render a PDF page
                async function renderPage(pdf, pageNumber, container, isStale) {
                    try {
                        const page = await pdf.getPage(pageNumber);
                        
                        // Create page container
                        const pageContainer = document.createElement('div');
                        pageContainer.className = 'pdf-page';
                        pageContainer.style.position = 'relative';
                        pageContainer.style.margin = '10px auto';
                        pageContainer.style.boxShadow = '0 2px 5px rgba(0,0,0,0.2)';
                        
                        // Create canvas for this page
                        const canvas = document.createElement('canvas');
                        const context = canvas.getContext('2d');
                        pageContainer.appendChild(canvas);
                        
                        // Set up viewport with scale based on container width
                        const containerWidth = container.clientWidth - 30; // Account for margins
                        const originalViewport = page.getViewport({ scale: 1 });
                        const scale = containerWidth / originalViewport.width;
                        const viewport = page.getViewport({ scale });
                        
                        // Set canvas dimensions
                        canvas.width = viewport.width;
                        canvas.height = viewport.height;
                        
                        // Render the PDF page into canvas context
                        await page.render({
                            canvasContext: context,
                            viewport: viewport
                        }).promise;
                        
                        // Another document replaced this one while the page was
                        // rendering: do not add it to the new document's view
                        if (isStale && isStale()) {
                            return false;
                        }
                        
                        // Add to the container
                        container.appendChild(pageContainer);
                        
                        return true;
                    } catch (error) {
                        console.error(`Error rendering page ${pageNumber}:`, error);
                        return false;
                    }
                }
                
                // Simple hash function for PDF data to detect changes
                function hashData(str) {
                    let hash = 0;
                    if (str.length === 0) return hash;
                    for (let i = 0; i < Math.min(str.length, 10000); i++) {
                        const char = str.charCodeAt(i);
                        hash = ((hash << 5) - hash) + char;
                        hash = hash & hash;
                    }
                    // Also include the length as PDFs with same start can be different
                    return `${hash}_${str.length}`;
                }
                
                // Function to display PDF from base64 data
                async function displayPdfFromBase64(base64Data) {
                    try {
                        if (!base64Data || base64Data.length < 100) {
                            // Only act on the transition into the "no PDF" state;
                            // the poll calls this every second
                            if (currentPdfHash !== NO_PDF) {
                                console.log("No valid PDF data received");
                                currentPdfHash = NO_PDF;
                                renderGeneration++; // cancel any render in flight
                                document.getElementById('pdf-fallback').style.display = 'flex';
                                document.getElementById('pdf-container').innerHTML = '';
                            }
                            return;
                        }
                        
                        // Check if this is the same PDF we already have displayed
                        const dataHash = hashData(base64Data);
                        if (dataHash === currentPdfHash) {
                            return;
                        }
                        
                        // Check if PDF.js is loaded. This must happen BEFORE the hash
                        // is recorded, otherwise the retry would be skipped as
                        // "same PDF". The 1s poll below performs the retry.
                        if (!window.pdfjsLib) {
                            console.warn("PDF.js not loaded yet, waiting...");
                            document.getElementById('pdf-fallback').innerHTML = 
                                '<div style="font-family: Arial, sans-serif;">Loading PDF viewer...</div>';
                            return;
                        }
                        
                        // Update the current PDF hash
                        currentPdfHash = dataHash;
                        const generation = ++renderGeneration;
                        const isStale = () => generation !== renderGeneration;
                        console.log("PDF changed, rendering new document");
                        
                        // Convert base64 to array buffer
                        const binaryString = atob(base64Data);
                        const bytes = new Uint8Array(binaryString.length);
                        for (let i = 0; i < binaryString.length; i++) {
                            bytes[i] = binaryString.charCodeAt(i);
                        }
                        
                        // Clear existing content
                        const container = document.getElementById('pdf-container');
                        container.innerHTML = '';
                        document.getElementById('pdf-fallback').style.display = 'none';
                        
                        // Load and render the PDF
                        try {
                            // Show loading indicator
                            const loadingIndicator = document.createElement('div');
                            loadingIndicator.style.padding = '20px';
                            loadingIndicator.style.textAlign = 'center';
                            loadingIndicator.innerText = 'Loading PDF...';
                            container.appendChild(loadingIndicator);
                            
                            // Load document
                            const loadingTask = window.pdfjsLib.getDocument({ data: bytes });
                            const pdf = await loadingTask.promise;
                            if (isStale()) {
                                return;
                            }
                            
                            // Clear the loading indicator
                            container.innerHTML = '';
                            
                            console.log(`PDF loaded with ${pdf.numPages} pages`);
                            
                            // Render pages one after another: each page is appended
                            // when its render finishes, so rendering them concurrently
                            // could append them out of order
                            for (let i = 1; i <= pdf.numPages; i++) {
                                if (isStale()) {
                                    return;
                                }
                                await renderPage(pdf, i, container, isStale);
                            }
                            if (isStale()) {
                                return;
                            }
                            console.log("All pages rendered");
                            
                            // Scroll to top
                            container.scrollTop = 0;
                            
                        } catch (error) {
                            console.error("Error loading PDF:", error);
                            if (isStale()) {
                                return; // a newer document owns the view now
                            }
                            document.getElementById('pdf-fallback').innerHTML = 
                                `<div style="color: red; font-family: Arial, sans-serif;">
                                    Error loading PDF: ${error.message || 'Unknown error'}
                                </div>`;
                            document.getElementById('pdf-fallback').style.display = 'flex';
                            currentPdfHash = ""; // Reset hash to allow retry
                        }
                    } catch (error) {
                        console.error("Error processing PDF data:", error);
                        document.getElementById('pdf-fallback').innerHTML = 
                            `<div style="color: red; font-family: Arial, sans-serif;">
                                Error processing PDF: ${error.message || 'Unknown error'}
                            </div>`;
                        document.getElementById('pdf-fallback').style.display = 'flex';
                        currentPdfHash = ""; // Reset hash to allow retry
                    }
                }
                
                // Check for PDF data
                function setupPdfListener() {
                    const dataElement = document.getElementById('pdf_base64_data');
                    if (!dataElement) {
                        console.log("PDF data element not found, will retry");
                        setTimeout(setupPdfListener, 1000);
                        return;
                    }
                    
                    const textarea = dataElement.querySelector('textarea');
                    if (!textarea) {
                        console.log("Textarea not found, will retry");
                        setTimeout(setupPdfListener, 1000);
                        return;
                    }
                    
                    console.log("Found PDF data element, setting up listeners");
                    
                    // Display initial data if available
                    if (textarea.value && textarea.value.length > 100) {
                        displayPdfFromBase64(textarea.value);
                    }
                    
                    // Use both an observer and polling for robustness.
                    // Empty values are forwarded too, so that moving to a file
                    // without a PDF clears the previously shown document.
                    // 1. Create MutationObserver to watch for value changes
                    const observer = new MutationObserver(() => {
                        displayPdfFromBase64(textarea.value);
                    });
                    
                    // Observe the textarea for changes
                    observer.observe(textarea, { 
                        attributes: true,
                        characterData: true,
                        subtree: true,
                        childList: true
                    });
                    
                    // 2. Also use polling as a fallback
                    setInterval(() => {
                        displayPdfFromBase64(textarea.value);
                    }, 1000);
                    
                    // Monitor the next/prev buttons to force PDF refresh
                    const prevButton = document.getElementById('prev_button');
                    const nextButton = document.getElementById('next_button');
                    
                    if (prevButton) {
                        prevButton.addEventListener('click', () => {
                            console.log("Prev button clicked, forcing PDF refresh");
                            currentPdfHash = ""; // Reset hash to force refresh
                        });
                    }
                    
                    if (nextButton) {
                        nextButton.addEventListener('click', () => {
                            console.log("Next button clicked, forcing PDF refresh");
                            currentPdfHash = ""; // Reset hash to force refresh
                        });
                    }
                }
                
                // Start checking for PDF data
                setTimeout(setupPdfListener, 1000);
                
                // Add keyboard shortcuts
                document.addEventListener('keydown', function(event) {
                    if (event.target.tagName === 'INPUT' || event.target.tagName === 'TEXTAREA') {
                        return;
                    }
                    
                    var buttonId = null;
                    if (event.key === 'ArrowLeft') buttonId = 'prev_button';
                    else if (event.key === 'ArrowRight') buttonId = 'next_button';
                    
                    if (buttonId) {
                        var button = document.getElementById(buttonId);
                        if (button) {
                            event.preventDefault();
                            button.click();
                        }
                    }
                });
            }
            """.replace("__PDFJS_BASE__", pdfjs_base)

    with gr.Blocks(title="PDF Extractor Comparer", theme="soft", css=custom_css, head=head_html) as demo:
        gr.Markdown("## PDF Extractor Comparer")

        with gr.Row():
            directory_input = gr.Textbox(
                label="Path to JSON Directory",
                placeholder="e.g., /path/to/your/json/files",
                value="extraction/truncated"
            )
            load_button = gr.Button("Load PDFs", variant="primary")

        # Main layout: PDF viewer on left, status and controls on right
        with gr.Row():
            # Left column: PDF viewer
            with gr.Column(scale=3):
                # PDF viewer: a container <div> that the JavaScript below fills
                # with one canvas per page, plus an overlay for status messages
                pdf_viewer_html = gr.HTML(
                    label="PDF Document",
                    value='''
                    <div style="width:100%; height:700px; position:relative; border:1px solid #ddd;">
                        <div id="pdf-container" style="width:100%; height:100%; overflow:auto;"></div>
                        <div id="pdf-fallback" style="position:absolute; top:0; left:0; width:100%; height:100%; 
                             display:flex; align-items:center; justify-content:center; padding:20px; text-align:center; font-family: Arial, sans-serif;">
                            Click "Load PDFs" to start viewing documents.
                        </div>
                    </div>
                    '''
                )
                # Hidden component to store the Base64 PDF data
                pdf_data_hidden = gr.Textbox(visible=False, elem_id="pdf_base64_data")

            # Right column: Progress and controls
            with gr.Column(scale=1):
                # Progress information
                file_progress_output = gr.Textbox(label="File Progress", interactive=False)
                annotation_status_output = gr.Textbox(label="Annotation Status", interactive=False)

                # Navigation
                with gr.Row():
                    prev_button = gr.Button("⬅️ Previous", elem_id="prev_button")
                    next_button = gr.Button("Next ➡️", elem_id="next_button")

                # Best extractor selection
                gr.Markdown("### Select Best Extractor")
                for extractor in EXTRACTORS:
                    button = gr.Button(extractor, variant="secondary")
                    # The extractor name reaches the handler through a hidden
                    # textbox whose value is fixed to that name.
                    button.click(
                        comparer.set_best_extractor,
                        inputs=[gr.Textbox(value=extractor, visible=False)],
                        outputs=[file_progress_output, annotation_status_output]
                    )

        # Extractors section below the PDF
        gr.Markdown("### Extractor Comparison")

        # Extractor dropdowns
        with gr.Row():
            extractor1_dropdown = gr.Dropdown(
                choices=EXTRACTORS,
                label="Extractor 1",
                value=EXTRACTORS[0] if EXTRACTORS else None
            )
            extractor2_dropdown = gr.Dropdown(
                choices=EXTRACTORS,
                label="Extractor 2",
                value=EXTRACTORS[1] if len(EXTRACTORS) > 1 else EXTRACTORS[0] if EXTRACTORS else None
            )

        # Extractor text outputs with applied class for styling
        with gr.Row():
            extractor1_text = gr.Textbox(
                label="Extractor 1 Output",
                lines=15,
                elem_classes=["extraction-text"]
            )
            extractor2_text = gr.Textbox(
                label="Extractor 2 Output",
                lines=15,
                elem_classes=["extraction-text"]
            )

        def refresh_texts(event):
            """Chain a refresh of both extractor panes after ``event`` completes."""
            return event.then(
                comparer.get_extractor_text,
                inputs=[extractor1_dropdown],
                outputs=[extractor1_text]
            ).then(
                comparer.get_extractor_text,
                inputs=[extractor2_dropdown],
                outputs=[extractor2_text]
            )

        # Event handlers
        document_outputs = [pdf_data_hidden, file_progress_output, annotation_status_output]

        refresh_texts(
            load_button.click(
                comparer.load_files,
                inputs=[directory_input],
                outputs=[file_progress_output, annotation_status_output]
            ).then(
                comparer.load_current_file,
                outputs=document_outputs
            )
        )

        refresh_texts(prev_button.click(comparer.prev_pdf, outputs=document_outputs))
        refresh_texts(next_button.click(comparer.next_pdf, outputs=document_outputs))

        extractor1_dropdown.change(
            comparer.get_extractor_text,
            inputs=[extractor1_dropdown],
            outputs=[extractor1_text]
        )

        extractor2_dropdown.change(
            comparer.get_extractor_text,
            inputs=[extractor2_dropdown],
            outputs=[extractor2_text]
        )

        # JavaScript for PDF handling, run once when the page loads
        demo.load(fn=None, js=viewer_js)

    return demo

# Build the Gradio app and start its server when this file is run as a script.
if __name__ == "__main__":
    demo = create_interface()
    demo.launch()