PDF-Extraction-Comparisson / extractor_compare.py
hynky's picture
hynky HF Staff
add default path
02219ce
import gradio as gr
import os
import json
import base64
import tempfile
from pathlib import Path
# Names of the extraction backends being compared. Each name is used as a
# top-level key when reading a loaded JSON document and as the label of the
# "best extractor" buttons and dropdown choices in the UI.
EXTRACTORS = [
    'pdf_plumber',
    'py_pdf',
    'docling',
    'extractous',
    'pypdfium2',
    'pymupdf',
    'pymupdf_llm',
]
def add_page_breaks(text, page_offsets):
    """Return ``text`` with a page-break marker inserted at every offset.

    Args:
        text: The extracted document text.
        page_offsets: Character offsets into ``text``; a marker line is
            emitted after the slice that ends at each offset. When this is
            empty or ``None`` the text is returned unchanged.

    Returns:
        The text with ``<---page-break--->`` marker lines spliced in,
        followed by whatever text remains after the last offset.
    """
    if not page_offsets:
        return text

    marker = "\n<---page-break--->\n"

    # Every slice starts where the previous one ended; the first starts at 0.
    starts = [0, *page_offsets]
    pieces = [text[begin:end] + marker for begin, end in zip(starts, page_offsets)]

    # Keep any text that follows the final offset.
    tail_start = starts[-1]
    if tail_start < len(text):
        pieces.append(text[tail_start:])

    return "".join(pieces)
class ExtractorComparer:
    """Browse extractor-output JSON files and record the preferred extractor.

    Each loaded JSON document is read as a mapping from extractor name to that
    extractor's result. A result is expected to hold a ``'text'`` entry and a
    ``'media'`` list whose first item may carry ``'media_bytes'`` (the
    base64-encoded PDF, read from the ``'pdf_plumber'`` entry) and
    ``'metadata' -> 'pdf_metadata' -> 'page_offsets'``.

    The chosen extractor for a file is stored next to it in a
    ``<name>_best.txt`` file.

    NOTE: every file is parsed with ``json.load``, so a ``.jsonl`` file is
    only readable when it contains a single JSON document.
    """

    def __init__(self):
        self.json_files = []           # paths of the loaded .json/.jsonl files
        self.current_index = 0         # index into json_files of the shown file
        self.current_data = None       # parsed content of the shown file
        self.temp_pdf_path = None      # on-disk copy of the shown file's PDF
        self.current_pdf_bytes = None  # decoded PDF bytes of the shown file

    @staticmethod
    def _best_file_path(json_path):
        """Return the path of the annotation file that belongs to ``json_path``."""
        return os.path.splitext(json_path)[0] + "_best.txt"

    @staticmethod
    def _first_media_item(extractor_data):
        """Return the first ``'media'`` entry of an extractor result, or ``None``."""
        if not isinstance(extractor_data, dict):
            return None
        media = extractor_data.get('media')
        if isinstance(media, list) and media and isinstance(media[0], dict):
            return media[0]
        return None

    def _discard_temp_pdf(self):
        """Delete the temporary PDF of the previously shown file, if any."""
        if self.temp_pdf_path:
            try:
                os.remove(self.temp_pdf_path)
            except OSError:
                # Best effort: the file may already be gone.
                pass
            self.temp_pdf_path = None

    def _decode_pdf_bytes(self):
        """Return the PDF bytes stored under ``'pdf_plumber'``, or ``None``."""
        if not isinstance(self.current_data, dict):
            return None
        media_item = self._first_media_item(self.current_data.get('pdf_plumber'))
        if not media_item:
            return None
        encoded = media_item.get('media_bytes')
        if not encoded:
            return None
        try:
            return base64.b64decode(encoded)
        except (ValueError, TypeError):
            # Malformed base64 (binascii.Error is a ValueError) or a
            # non-string payload: treat the file as having no PDF.
            return None

    def load_files(self, directory_path):
        """Load all JSON files from the specified directory.

        Files are kept in sorted name order so that numbering and navigation
        are the same on every run (``os.listdir`` order is arbitrary).

        Returns:
            A ``(file_progress, annotation_status)`` pair of display strings.
        """
        # Forget everything about the previously loaded directory.
        self.json_files = []
        self.current_index = 0
        self.current_data = None
        try:
            filenames = sorted(os.listdir(directory_path))
        except (OSError, TypeError, ValueError) as e:
            return f"Error loading files: {str(e)}", "Error"

        self.json_files = [
            os.path.join(directory_path, filename)
            for filename in filenames
            if filename.endswith(('.json', '.jsonl'))
        ]
        if not self.json_files:
            return "No JSON files found", "No files loaded"
        return self.get_progress_info()

    def load_current_file(self):
        """Load the current JSON file data.

        Returns:
            A ``(base64_pdf, file_progress, annotation_status)`` triple.
            ``base64_pdf`` is ``None`` when the file has no decodable PDF or
            could not be read.
        """
        if not self.json_files:
            return None, "N/A", "N/A"

        # Drop the previous file's state first so that a failure below can
        # never leave another document's text or PDF attached to this file.
        self.current_data = None
        self.current_pdf_bytes = None
        try:
            with open(self.json_files[self.current_index], 'r', encoding='utf-8') as f:
                self.current_data = json.load(f)
        except (OSError, ValueError):
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            self._discard_temp_pdf()
            return None, "Error loading file", "No annotation"

        pdf_bytes = self._decode_pdf_bytes()
        self._discard_temp_pdf()
        file_progress, annotation_status = self.get_progress_info()
        if not pdf_bytes:
            return None, file_progress, annotation_status

        self.current_pdf_bytes = pdf_bytes
        try:
            with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
                # Record the path before writing so a failed write is cleaned up.
                self.temp_pdf_path = temp_file.name
                temp_file.write(pdf_bytes)
        except OSError:
            # The on-disk copy is optional; the returned base64 string is
            # what gets handed to the frontend.
            self._discard_temp_pdf()

        # Convert to base64 for passing to the frontend
        base64_pdf = base64.b64encode(pdf_bytes).decode('utf-8')
        return base64_pdf, file_progress, annotation_status

    def get_progress_info(self):
        """Generate progress information and annotation status.

        Returns:
            A ``(file_progress, annotation_status)`` pair of display strings.
        """
        if not self.json_files:
            return "No files loaded", "No annotation"

        current_file = self.json_files[self.current_index]
        total = len(self.json_files)

        # Check if this file has been annotated with a best extractor
        annotation_status = "Not annotated"
        best_extractor_file = self._best_file_path(current_file)
        if os.path.exists(best_extractor_file):
            try:
                with open(best_extractor_file, 'r', encoding='utf-8') as f:
                    annotation_status = f"Best extractor: {f.read().strip()}"
            except (OSError, ValueError):
                # Unreadable annotation file: keep reporting "Not annotated".
                pass

        # Count total annotated files
        annotated_count = sum(
            os.path.exists(self._best_file_path(json_file))
            for json_file in self.json_files
        )
        file_progress = (
            f"File {self.current_index + 1} of {total}: {Path(current_file).name}"
            f" (Annotated: {annotated_count}/{total})"
        )
        return file_progress, annotation_status

    def get_extractor_text(self, extractor_name):
        """Get text with page breaks for the specified extractor.

        Returns an empty string when nothing is loaded or the extractor is not
        present in the current file, and a "No text found" message when the
        extractor entry carries no text.
        """
        if not isinstance(self.current_data, dict) or extractor_name not in self.current_data:
            return ""

        extractor_data = self.current_data[extractor_name]
        if not isinstance(extractor_data, dict) or extractor_data.get('text') is None:
            return f"No text found for {extractor_name}"
        text = extractor_data['text']

        # Get page offsets
        page_offsets = []
        media_item = self._first_media_item(extractor_data)
        metadata = media_item.get('metadata') if media_item else None
        pdf_metadata = metadata.get('pdf_metadata') if isinstance(metadata, dict) else None
        if isinstance(pdf_metadata, dict):
            page_offsets = pdf_metadata.get('page_offsets') or []

        return add_page_breaks(text, page_offsets)

    def next_pdf(self):
        """Load the next PDF in the list (wraps around at the end)."""
        if not self.json_files:
            return None, "N/A", "N/A"
        self.current_index = (self.current_index + 1) % len(self.json_files)
        return self.load_current_file()

    def prev_pdf(self):
        """Load the previous PDF in the list (wraps around at the start)."""
        if not self.json_files:
            return None, "N/A", "N/A"
        self.current_index = (self.current_index - 1) % len(self.json_files)
        return self.load_current_file()

    def set_best_extractor(self, extractor_name):
        """Record that this extractor is the best for the current file.

        Returns:
            The refreshed ``(file_progress, annotation_status)`` pair, or
            ``("N/A", "N/A")`` when no file is currently loaded.
        """
        if not self.json_files or not self.current_data:
            return "N/A", "N/A"
        try:
            # Create a record about the best extractor
            result_file = self._best_file_path(self.json_files[self.current_index])
            with open(result_file, 'w', encoding='utf-8') as f:
                f.write(extractor_name)
        except (OSError, TypeError):
            return "Error saving annotation", "No annotation"
        # Get updated progress info after annotation
        return self.get_progress_info()
def create_interface():
    """Build the Gradio app for comparing PDF extractor outputs.

    The page shows the current PDF (rendered in the browser with PDF.js from
    base64 data placed in a hidden textbox), navigation and "best extractor"
    buttons, and two side-by-side text panes with the output of the selected
    extractors.

    All event handlers share one ``ExtractorComparer`` created here, so the
    browsing position is shared by everyone connected to the same app.

    Returns:
        The assembled ``gr.Blocks`` application (not yet launched).
    """
    comparer = ExtractorComparer()

    # Base URL of the PDF.js build used by the viewer. The version must be one
    # that ships build/pdf.min.js and build/pdf.worker.min.js (the 3.x line
    # does); the library and its worker have to come from the same release.
    pdfjs_cdn = "https://unpkg.com/pdfjs-dist@3.11.174/build"

    # Custom CSS for basic font in text areas
    custom_css = """
    .extraction-text textarea {
        font-family: Arial, Helvetica, sans-serif !important;
        font-size: 14px !important;
        line-height: 1.5 !important;
    }
    """

    head_html = f"""
    <script src="{pdfjs_cdn}/pdf.min.js"></script>
    """

    # Client-side viewer. It watches the hidden textbox that carries the
    # base64 PDF and re-renders whenever its content changes.
    viewer_js = """
    function() {
        console.log("Setting up PDF.js viewer");
        const PDFJS_CDN = "__PDFJS_CDN__";

        // Configure PDF.js worker
        if (window.pdfjsLib) {
            window.pdfjsLib.GlobalWorkerOptions.workerSrc = PDFJS_CDN + "/pdf.worker.min.js";
            console.log("PDF.js configured with worker");
        } else {
            console.warn("PDF.js not found in head, attempting to load dynamically");
            // Fallback to load PDF.js dynamically if not in the head
            const pdfJsScript = document.createElement('script');
            pdfJsScript.src = PDFJS_CDN + "/pdf.min.js";
            pdfJsScript.onload = function() {
                window.pdfjsLib.GlobalWorkerOptions.workerSrc = PDFJS_CDN + "/pdf.worker.min.js";
                console.log("PDF.js loaded dynamically");
            };
            document.head.appendChild(pdfJsScript);
        }

        // Hash of the document currently shown ("" = nothing valid shown)
        let currentPdfHash = "";
        // Bumped whenever a render starts or the viewer is cleared, so that a
        // render still in flight can notice it has been superseded
        let renderGeneration = 0;

        // Show a message in the overlay that sits on top of the viewer
        function showFallback(html) {
            const fallback = document.getElementById('pdf-fallback');
            fallback.innerHTML = html;
            fallback.style.display = 'flex';
        }

        // Render one page into a detached element and return it (null on
        // failure); the caller decides whether it is still wanted
        async function renderPage(pdf, pageNumber, targetWidth) {
            try {
                const page = await pdf.getPage(pageNumber);

                // Create page container
                const pageContainer = document.createElement('div');
                pageContainer.className = 'pdf-page';
                pageContainer.style.position = 'relative';
                pageContainer.style.margin = '10px auto';
                pageContainer.style.boxShadow = '0 2px 5px rgba(0,0,0,0.2)';

                // Create canvas for this page
                const canvas = document.createElement('canvas');
                const context = canvas.getContext('2d');
                pageContainer.appendChild(canvas);

                // Scale the page so that it fills the available width
                const originalViewport = page.getViewport({ scale: 1 });
                const scale = targetWidth / originalViewport.width;
                const viewport = page.getViewport({ scale });

                // Set canvas dimensions
                canvas.width = viewport.width;
                canvas.height = viewport.height;

                // Render the PDF page into canvas context
                await page.render({
                    canvasContext: context,
                    viewport: viewport
                }).promise;

                return pageContainer;
            } catch (error) {
                console.error(`Error rendering page ${pageNumber}:`, error);
                return null;
            }
        }

        // Simple hash function for PDF data to detect changes
        function hashData(str) {
            let hash = 0;
            if (str.length === 0) return hash;
            for (let i = 0; i < Math.min(str.length, 10000); i++) {
                const char = str.charCodeAt(i);
                hash = ((hash << 5) - hash) + char;
                hash = hash & hash;
            }
            // Also include the length as PDFs with same start can be different
            return `${hash}_${str.length}`;
        }

        // Function to display PDF from base64 data
        async function displayPdfFromBase64(base64Data) {
            try {
                const container = document.getElementById('pdf-container');

                if (!base64Data || base64Data.length < 100) {
                    // The current file has no PDF: cancel any render in flight
                    // and remove a previous document so that it is not
                    // mistaken for the current file
                    renderGeneration++;
                    currentPdfHash = "";
                    if (container.childElementCount > 0) {
                        console.log("No valid PDF data received");
                        container.innerHTML = '';
                        showFallback('<div style="font-family: Arial, sans-serif;">No PDF available for this file.</div>');
                    }
                    return;
                }

                // Check if PDF.js is loaded. This must happen before the hash
                // is recorded: otherwise the retry would look like "same PDF"
                // and the document would never be rendered. The polling loop
                // calls again once per second.
                if (!window.pdfjsLib) {
                    console.warn("PDF.js not loaded yet, waiting...");
                    showFallback('<div style="font-family: Arial, sans-serif;">Loading PDF viewer...</div>');
                    return;
                }

                // Check if this is the same PDF we already have displayed
                const dataHash = hashData(base64Data);
                if (dataHash === currentPdfHash) {
                    return;
                }

                // Update the current PDF hash
                currentPdfHash = dataHash;
                const generation = ++renderGeneration;
                console.log("PDF changed, rendering new document");

                // Convert base64 to array buffer
                const binaryString = atob(base64Data);
                const bytes = new Uint8Array(binaryString.length);
                for (let i = 0; i < binaryString.length; i++) {
                    bytes[i] = binaryString.charCodeAt(i);
                }

                // Clear existing content
                container.innerHTML = '';
                document.getElementById('pdf-fallback').style.display = 'none';

                // Load and render the PDF
                try {
                    // Show loading indicator
                    const loadingIndicator = document.createElement('div');
                    loadingIndicator.style.padding = '20px';
                    loadingIndicator.style.textAlign = 'center';
                    loadingIndicator.innerText = 'Loading PDF...';
                    container.appendChild(loadingIndicator);

                    // Load document
                    const loadingTask = window.pdfjsLib.getDocument({ data: bytes });
                    const pdf = await loadingTask.promise;
                    if (generation !== renderGeneration) return;

                    // Clear the loading indicator
                    container.innerHTML = '';
                    console.log(`PDF loaded with ${pdf.numPages} pages`);

                    // Measure once so that every page gets the same width
                    const targetWidth = container.clientWidth - 30; // Account for margins

                    // Render pages one after another so that they are
                    // inserted in document order
                    for (let i = 1; i <= pdf.numPages; i++) {
                        const pageElement = await renderPage(pdf, i, targetWidth);
                        // Another document took over while this page rendered
                        if (generation !== renderGeneration) return;
                        if (pageElement) {
                            container.appendChild(pageElement);
                        }
                    }
                    console.log("All pages rendered");

                    // Scroll to top
                    container.scrollTop = 0;
                } catch (error) {
                    console.error("Error loading PDF:", error);
                    // Do not disturb the display of a newer document
                    if (generation !== renderGeneration) return;
                    showFallback(
                        `<div style="color: red; font-family: Arial, sans-serif;">
                        Error loading PDF: ${error.message || 'Unknown error'}
                        </div>`);
                    currentPdfHash = ""; // Reset hash to allow retry
                }
            } catch (error) {
                console.error("Error processing PDF data:", error);
                showFallback(
                    `<div style="color: red; font-family: Arial, sans-serif;">
                    Error processing PDF: ${error.message || 'Unknown error'}
                    </div>`);
                currentPdfHash = ""; // Reset hash to allow retry
            }
        }

        // Check for PDF data
        function setupPdfListener() {
            const dataElement = document.getElementById('pdf_base64_data');
            if (!dataElement) {
                console.log("PDF data element not found, will retry");
                setTimeout(setupPdfListener, 1000);
                return;
            }

            const textarea = dataElement.querySelector('textarea');
            if (!textarea) {
                console.log("Textarea not found, will retry");
                setTimeout(setupPdfListener, 1000);
                return;
            }

            console.log("Found PDF data element, setting up listeners");

            // Display initial data if available
            displayPdfFromBase64(textarea.value);

            // Use both an observer and polling for robustness
            // 1. Create MutationObserver to watch for changes
            const observer = new MutationObserver(() => {
                displayPdfFromBase64(textarea.value);
            });

            // Observe the textarea for changes
            observer.observe(textarea, {
                attributes: true,
                characterData: true,
                subtree: true,
                childList: true
            });

            // 2. Also use polling as a fallback
            setInterval(() => {
                displayPdfFromBase64(textarea.value);
            }, 1000);

            // Monitor the next/prev buttons to force PDF refresh
            const prevButton = document.getElementById('prev_button');
            const nextButton = document.getElementById('next_button');

            if (prevButton) {
                prevButton.addEventListener('click', () => {
                    console.log("Prev button clicked, forcing PDF refresh");
                    currentPdfHash = ""; // Reset hash to force refresh
                });
            }

            if (nextButton) {
                nextButton.addEventListener('click', () => {
                    console.log("Next button clicked, forcing PDF refresh");
                    currentPdfHash = ""; // Reset hash to force refresh
                });
            }
        }

        // Start checking for PDF data
        setTimeout(setupPdfListener, 1000);

        // Add keyboard shortcuts
        document.addEventListener('keydown', function(event) {
            // Leave the arrow keys alone while the user is typing
            if (event.target.tagName === 'INPUT' || event.target.tagName === 'TEXTAREA') {
                return;
            }

            var buttonId = null;
            if (event.key === 'ArrowLeft') buttonId = 'prev_button';
            else if (event.key === 'ArrowRight') buttonId = 'next_button';

            if (buttonId) {
                var button = document.getElementById(buttonId);
                if (button) {
                    event.preventDefault();
                    button.click();
                }
            }
        });
    }
    """.replace("__PDFJS_CDN__", pdfjs_cdn)

    def load_directory(directory_path):
        """Scan the directory and open its first file in a single step.

        Handling both in one handler keeps the message from ``load_files``
        ("Error loading files: ...", "No JSON files found") on screen instead
        of having a follow-up step overwrite it with "N/A".
        """
        file_progress, annotation_status = comparer.load_files(directory_path)
        if not comparer.json_files:
            return None, file_progress, annotation_status
        return comparer.load_current_file()

    with gr.Blocks(
        title="PDF Extractor Comparer",
        theme="soft",
        css=custom_css,
        head=head_html,
    ) as demo:
        gr.Markdown("## PDF Extractor Comparer")

        with gr.Row():
            directory_input = gr.Textbox(
                label="Path to JSON Directory",
                placeholder="e.g., /path/to/your/json/files",
                value="extraction/truncated"
            )
            load_button = gr.Button("Load PDFs", variant="primary")

        # Main layout: PDF viewer on left, status and controls on right
        with gr.Row():
            # Left column: PDF viewer
            with gr.Column(scale=3):
                # Container filled by the PDF.js code in viewer_js
                pdf_viewer_html = gr.HTML(
                    label="PDF Document",
                    value='''
                    <div style="width:100%; height:700px; position:relative; border:1px solid #ddd;">
                        <div id="pdf-container" style="width:100%; height:100%; overflow:auto;"></div>
                        <div id="pdf-fallback" style="position:absolute; top:0; left:0; width:100%; height:100%;
                            display:flex; align-items:center; justify-content:center; padding:20px; text-align:center; font-family: Arial, sans-serif;">
                            Click "Load PDFs" to start viewing documents.
                        </div>
                    </div>
                    '''
                )
                # Hidden component to store the Base64 PDF data
                pdf_data_hidden = gr.Textbox(visible=False, elem_id="pdf_base64_data")

            # Right column: Progress and controls
            with gr.Column(scale=1):
                # Progress information
                file_progress_output = gr.Textbox(label="File Progress", interactive=False)
                annotation_status_output = gr.Textbox(label="Annotation Status", interactive=False)

                # Navigation
                with gr.Row():
                    prev_button = gr.Button("⬅️ Previous", elem_id="prev_button")
                    next_button = gr.Button("Next ➡️", elem_id="next_button")

                # Best extractor selection
                gr.Markdown("### Select Best Extractor")
                for extractor in EXTRACTORS:
                    button = gr.Button(extractor, variant="secondary")
                    # The extractor name travels through a hidden textbox so
                    # that each button passes its own name to the handler.
                    button.click(
                        comparer.set_best_extractor,
                        inputs=[gr.Textbox(value=extractor, visible=False)],
                        outputs=[file_progress_output, annotation_status_output]
                    )

        # Extractors section below the PDF
        gr.Markdown("### Extractor Comparison")

        # Extractor dropdowns
        with gr.Row():
            extractor1_dropdown = gr.Dropdown(
                choices=EXTRACTORS,
                label="Extractor 1",
                value=EXTRACTORS[0] if EXTRACTORS else None
            )
            extractor2_dropdown = gr.Dropdown(
                choices=EXTRACTORS,
                label="Extractor 2",
                value=EXTRACTORS[1] if len(EXTRACTORS) > 1 else EXTRACTORS[0] if EXTRACTORS else None
            )

        # Extractor text outputs with applied class for styling
        with gr.Row():
            extractor1_text = gr.Textbox(
                label="Extractor 1 Output",
                lines=15,
                elem_classes=["extraction-text"]
            )
            extractor2_text = gr.Textbox(
                label="Extractor 2 Output",
                lines=15,
                elem_classes=["extraction-text"]
            )

        def refresh_extractor_texts(event):
            """Chain a refresh of both extractor text panes after ``event``."""
            return event.then(
                comparer.get_extractor_text,
                inputs=[extractor1_dropdown],
                outputs=[extractor1_text]
            ).then(
                comparer.get_extractor_text,
                inputs=[extractor2_dropdown],
                outputs=[extractor2_text]
            )

        # Event handlers
        viewer_outputs = [pdf_data_hidden, file_progress_output, annotation_status_output]

        refresh_extractor_texts(
            load_button.click(
                load_directory,
                inputs=[directory_input],
                outputs=viewer_outputs
            )
        )
        refresh_extractor_texts(
            prev_button.click(comparer.prev_pdf, outputs=viewer_outputs)
        )
        refresh_extractor_texts(
            next_button.click(comparer.next_pdf, outputs=viewer_outputs)
        )

        extractor1_dropdown.change(
            comparer.get_extractor_text,
            inputs=[extractor1_dropdown],
            outputs=[extractor1_text]
        )
        extractor2_dropdown.change(
            comparer.get_extractor_text,
            inputs=[extractor2_dropdown],
            outputs=[extractor2_text]
        )

        # JavaScript for PDF handling
        demo.load(
            fn=None,
            js=viewer_js
        )

    return demo
# Script entry point: build the interface and start the Gradio server when
# this file is run directly.
if __name__ == "__main__":
    demo = create_interface()
    demo.launch()