fuck sandbox on hf space
Browse files- extractor_compare.py +203 -55
extractor_compare.py
CHANGED
@@ -198,7 +198,11 @@ def create_interface():
|
|
198 |
}
|
199 |
"""
|
200 |
|
201 |
-
with gr.Blocks(title="PDF Extractor Comparer", theme="soft", css=custom_css
|
|
|
|
|
|
|
|
|
202 |
gr.Markdown("## PDF Extractor Comparer")
|
203 |
|
204 |
with gr.Row():
|
@@ -217,18 +221,7 @@ def create_interface():
|
|
217 |
label="PDF Document",
|
218 |
value='''
|
219 |
<div style="width:100%; height:700px; position:relative; border:1px solid #ddd;">
|
220 |
-
<style>
|
221 |
-
@font-face {
|
222 |
-
font-family: 'Local Arial';
|
223 |
-
src: local('Arial');
|
224 |
-
}
|
225 |
-
body {
|
226 |
-
font-family: 'Local Arial', sans-serif;
|
227 |
-
}
|
228 |
-
</style>
|
229 |
-
<object id="pdf-object" type="application/pdf" width="100%" height="100%" style="display:none;">
|
230 |
-
<p>PDF cannot be displayed</p>
|
231 |
-
</object>
|
232 |
<div id="pdf-fallback" style="position:absolute; top:0; left:0; width:100%; height:100%;
|
233 |
display:flex; align-items:center; justify-content:center; padding:20px; text-align:center; font-family: Arial, sans-serif;">
|
234 |
Click "Load PDFs" to start viewing documents.
|
@@ -352,92 +345,247 @@ def create_interface():
|
|
352 |
fn=None,
|
353 |
js="""
|
354 |
function() {
|
355 |
-
console.log("Setting up PDF viewer");
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
356 |
|
357 |
-
//
|
358 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
359 |
|
360 |
// Function to display PDF from base64 data
|
361 |
-
function displayPdfFromBase64(base64Data) {
|
362 |
try {
|
363 |
if (!base64Data || base64Data.length < 100) {
|
364 |
console.log("No valid PDF data received");
|
365 |
document.getElementById('pdf-fallback').style.display = 'flex';
|
366 |
-
document.getElementById('pdf-
|
367 |
return;
|
368 |
}
|
369 |
|
370 |
-
|
371 |
-
|
372 |
-
|
373 |
-
|
374 |
-
|
375 |
}
|
376 |
|
377 |
-
//
|
378 |
-
|
379 |
-
|
380 |
-
|
381 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
382 |
}
|
383 |
|
384 |
-
//
|
385 |
-
const
|
386 |
-
|
|
|
|
|
|
|
387 |
|
388 |
-
//
|
389 |
-
const
|
390 |
-
|
|
|
391 |
|
392 |
-
|
393 |
-
|
394 |
-
|
395 |
-
|
396 |
-
|
397 |
-
|
398 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
399 |
}
|
400 |
} catch (error) {
|
401 |
-
console.error("Error
|
402 |
-
|
403 |
-
|
404 |
-
|
405 |
-
|
406 |
-
|
|
|
407 |
}
|
408 |
}
|
409 |
|
410 |
-
// Check for PDF data
|
411 |
-
function
|
412 |
const dataElement = document.getElementById('pdf_base64_data');
|
413 |
if (!dataElement) {
|
414 |
console.log("PDF data element not found, will retry");
|
415 |
-
setTimeout(
|
416 |
return;
|
417 |
}
|
418 |
|
419 |
const textarea = dataElement.querySelector('textarea');
|
420 |
if (!textarea) {
|
421 |
console.log("Textarea not found, will retry");
|
422 |
-
setTimeout(
|
423 |
return;
|
424 |
}
|
425 |
|
|
|
|
|
426 |
// Display initial data if available
|
427 |
if (textarea.value && textarea.value.length > 100) {
|
428 |
displayPdfFromBase64(textarea.value);
|
429 |
}
|
430 |
|
431 |
-
//
|
432 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
433 |
if (textarea.value && textarea.value.length > 100) {
|
434 |
displayPdfFromBase64(textarea.value);
|
435 |
}
|
436 |
-
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
437 |
}
|
438 |
|
439 |
// Start checking for PDF data
|
440 |
-
setTimeout(
|
441 |
|
442 |
// Add keyboard shortcuts
|
443 |
document.addEventListener('keydown', function(event) {
|
|
|
198 |
}
|
199 |
"""
|
200 |
|
201 |
+
with gr.Blocks(title="PDF Extractor Comparer", theme="soft", css=custom_css, head=
|
202 |
+
"""
|
203 |
+
<script src="https://unpkg.com/[email protected]/build/pdf.min.js"></script>
|
204 |
+
"""
|
205 |
+
) as demo:
|
206 |
gr.Markdown("## PDF Extractor Comparer")
|
207 |
|
208 |
with gr.Row():
|
|
|
221 |
label="PDF Document",
|
222 |
value='''
|
223 |
<div style="width:100%; height:700px; position:relative; border:1px solid #ddd;">
|
224 |
+
<div id="pdf-container" style="width:100%; height:100%; overflow:auto;"></div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
225 |
<div id="pdf-fallback" style="position:absolute; top:0; left:0; width:100%; height:100%;
|
226 |
display:flex; align-items:center; justify-content:center; padding:20px; text-align:center; font-family: Arial, sans-serif;">
|
227 |
Click "Load PDFs" to start viewing documents.
|
|
|
345 |
fn=None,
|
346 |
js="""
|
347 |
function() {
|
348 |
+
console.log("Setting up PDF.js viewer");
|
349 |
+
|
350 |
+
// Configure PDF.js worker
|
351 |
+
// Point PDF.js at its web worker. When the library was not delivered through
// the page <head>, inject the script tag ourselves and configure the worker
// once that script has finished loading.
// NOTE(review): the "[email protected]" segment in the URLs below looks like
// e-mail-obfuscation damage to a "package@version" path in this copy of the
// file - confirm the real pdfjs-dist version before relying on these URLs.
if (!window.pdfjsLib) {
    console.warn("PDF.js not found in head, attempting to load dynamically");

    const loaderTag = document.createElement('script');
    // The handler only fires after the asynchronous fetch completes, so
    // registering it before insertion is equivalent to registering it after.
    loaderTag.onload = () => {
        window.pdfjsLib.GlobalWorkerOptions.workerSrc = "https://unpkg.com/[email protected]/build/pdf.worker.min.js";
        console.log("PDF.js loaded dynamically");
    };
    loaderTag.src = "https://unpkg.com/[email protected]/build/pdf.min.js";
    document.head.appendChild(loaderTag);
} else {
    window.pdfjsLib.GlobalWorkerOptions.workerSrc = "https://unpkg.com/[email protected]/build/pdf.worker.min.js";
    console.log("PDF.js configured with worker");
}
|
366 |
|
367 |
+
// To track when we should force a refresh
|
368 |
+
let currentPdfHash = "";
|
369 |
+
|
370 |
+
// Function to render a PDF page
|
371 |
+
/**
 * Render a single page of a loaded PDF.js document into `container`.
 *
 * The page wrapper is attached to `container` synchronously, before the
 * first `await`. Callers start every page render concurrently, so attaching
 * only after the render finished (as before) let fast pages overtake slow
 * ones and end up in the wrong order.
 *
 * @param {object} pdf         Document object exposing `getPage(n)`.
 * @param {number} pageNumber  Page number passed straight to `getPage`.
 * @param {HTMLElement} container  Element that receives the page wrapper.
 * @returns {Promise<boolean>} `true` when the page was drawn, `false` when
 *     anything failed (the error is logged and the wrapper is removed).
 */
async function renderPage(pdf, pageNumber, container) {
    let pageContainer = null;
    try {
        // Create page container
        pageContainer = document.createElement('div');
        pageContainer.className = 'pdf-page';
        pageContainer.style.position = 'relative';
        pageContainer.style.margin = '10px auto';
        pageContainer.style.boxShadow = '0 2px 5px rgba(0,0,0,0.2)';

        // Create canvas for this page
        const canvas = document.createElement('canvas');
        const context = canvas.getContext('2d');
        pageContainer.appendChild(canvas);

        // Reserve this page's slot now so DOM order matches call order.
        container.appendChild(pageContainer);

        const page = await pdf.getPage(pageNumber);

        // Fit the page to the container width (30px kept free for margins).
        // A hidden or collapsed container reports a width <= 30, which would
        // give a zero or negative scale; fall back to the natural size then.
        const containerWidth = container.clientWidth - 30;
        const originalViewport = page.getViewport({ scale: 1 });
        const scale = containerWidth > 0
            ? containerWidth / originalViewport.width
            : 1;
        const viewport = page.getViewport({ scale });

        // Set canvas dimensions
        canvas.width = viewport.width;
        canvas.height = viewport.height;

        // Render the PDF page into canvas context
        await page.render({
            canvasContext: context,
            viewport: viewport
        }).promise;

        return true;
    } catch (error) {
        console.error(`Error rendering page ${pageNumber}:`, error);
        // Do not leave an empty placeholder behind for a page that failed.
        if (pageContainer) {
            pageContainer.remove();
        }
        return false;
    }
}
|
412 |
+
|
413 |
+
// Simple hash function for PDF data to detect changes
|
414 |
+
/**
 * Cheap change-detection fingerprint for a (potentially very large) string.
 *
 * Folds at most the first 10000 UTF-16 code units into a 32-bit rolling
 * hash (hash * 31 + code unit, wrapped to a signed 32-bit integer) and
 * appends the full string length, so inputs that share a prefix but differ
 * in size still get different fingerprints.
 *
 * @param {string} str  Text to fingerprint.
 * @returns {number|string}  The number 0 for an empty string, otherwise the
 *     string "<hash>_<length>".
 */
function hashData(str) {
    if (str.length === 0) return 0;

    const limit = Math.min(str.length, 10000);
    let acc = 0;
    for (let idx = 0; idx < limit; idx += 1) {
        // "| 0" wraps to a signed 32-bit integer, same as "x & x".
        acc = (((acc << 5) - acc) + str.charCodeAt(idx)) | 0;
    }
    return `${acc}_${str.length}`;
}
|
425 |
|
426 |
// Function to display PDF from base64 data
|
427 |
+
/**
 * Decode a base64-encoded PDF and render every page into #pdf-container.
 *
 * Behaviour:
 *  - Input that is empty or shorter than 100 characters shows the
 *    #pdf-fallback element and clears the container.
 *  - Input whose fingerprint (hashData) equals `currentPdfHash` is skipped.
 *  - While `window.pdfjsLib` is missing, a retry is scheduled every 500 ms.
 *  - Any failure is logged, shown in #pdf-fallback, and `currentPdfHash` is
 *    reset so the same data can be attempted again.
 *
 * Fix: the fingerprint is committed before the PDF.js availability check,
 * so the scheduled retry used to be rejected by the "same PDF" guard and
 * the document was never rendered. The retry now clears the fingerprint
 * first, and only if no newer document has replaced it in the meantime.
 *
 * @param {string} base64Data  Base64 text of the PDF (no data-URL prefix is
 *     stripped here; the value is handed to atob as-is).
 * @returns {Promise<void>}
 */
async function displayPdfFromBase64(base64Data) {
    try {
        if (!base64Data || base64Data.length < 100) {
            console.log("No valid PDF data received");
            document.getElementById('pdf-fallback').style.display = 'flex';
            document.getElementById('pdf-container').innerHTML = '';
            return;
        }

        // Check if this is the same PDF we already have displayed
        const dataHash = hashData(base64Data);
        if (dataHash === currentPdfHash) {
            console.log("Same PDF already displayed, skipping render");
            return;
        }

        // Commit the fingerprint right away so that repeated calls with the
        // same data (polling, observers) do not start duplicate renders.
        currentPdfHash = dataHash;
        console.log("PDF changed, rendering new document");

        // Check if PDF.js is loaded
        if (!window.pdfjsLib) {
            console.warn("PDF.js not loaded yet, waiting...");
            document.getElementById('pdf-fallback').innerHTML =
                '<div style="font-family: Arial, sans-serif;">Loading PDF viewer...</div>';
            setTimeout(() => {
                // A different document took over while we waited: its own
                // call is responsible for rendering, so drop this retry.
                if (currentPdfHash !== dataHash) {
                    return;
                }
                // Clear the fingerprint, otherwise the retry would be
                // discarded by the "same PDF" guard above.
                currentPdfHash = "";
                displayPdfFromBase64(base64Data);
            }, 500);
            return;
        }

        // Convert base64 to a byte array for PDF.js
        const binaryString = atob(base64Data);
        const bytes = new Uint8Array(binaryString.length);
        for (let i = 0; i < binaryString.length; i++) {
            bytes[i] = binaryString.charCodeAt(i);
        }

        // Clear existing content
        const container = document.getElementById('pdf-container');
        container.innerHTML = '';
        document.getElementById('pdf-fallback').style.display = 'none';

        // Load and render the PDF
        try {
            // Show loading indicator
            const loadingIndicator = document.createElement('div');
            loadingIndicator.style.padding = '20px';
            loadingIndicator.style.textAlign = 'center';
            loadingIndicator.innerText = 'Loading PDF...';
            container.appendChild(loadingIndicator);

            // Load document
            const loadingTask = window.pdfjsLib.getDocument({ data: bytes });
            const pdf = await loadingTask.promise;

            // Clear the loading indicator
            container.innerHTML = '';

            console.log(`PDF loaded with ${pdf.numPages} pages`);

            // Start rendering every page; renderPage resolves to true/false
            // and is not expected to reject.
            const pagePromises = [];
            for (let i = 1; i <= pdf.numPages; i++) {
                pagePromises.push(renderPage(pdf, i, container));
            }

            // Wait for all pages to render
            await Promise.all(pagePromises);
            console.log("All pages rendered");

            // Scroll to top
            container.scrollTop = 0;

        } catch (error) {
            console.error("Error loading PDF:", error);
            document.getElementById('pdf-fallback').innerHTML =
                `<div style="color: red; font-family: Arial, sans-serif;">
Error loading PDF: ${error.message || 'Unknown error'}
</div>`;
            document.getElementById('pdf-fallback').style.display = 'flex';
            currentPdfHash = ""; // Reset hash to allow retry
        }
    } catch (error) {
        console.error("Error processing PDF data:", error);
        document.getElementById('pdf-fallback').innerHTML =
            `<div style="color: red; font-family: Arial, sans-serif;">
Error processing PDF: ${error.message || 'Unknown error'}
</div>`;
        document.getElementById('pdf-fallback').style.display = 'flex';
        currentPdfHash = ""; // Reset hash to allow retry
    }
}
|
518 |
|
519 |
+
// Check for PDF data
|
520 |
+
/**
 * Locate the textarea inside #pdf_base64_data and keep the viewer in sync
 * with its value.
 *
 * Until both the wrapper element and its textarea exist, the function
 * re-schedules itself every second. Once found it:
 *  1. renders the current value, if it is longer than 100 characters;
 *  2. attaches a MutationObserver to the textarea;
 *  3. polls the value once per second as a fallback;
 *  4. resets `currentPdfHash` whenever #prev_button or #next_button is
 *     clicked, so the next check re-renders even an unchanged fingerprint.
 *
 * NOTE(review): assigning `textarea.value` from script does not normally
 * produce mutation records, so the interval is presumably what picks up
 * most updates - confirm before removing either mechanism.
 */
function setupPdfListener() {
    // Log why we are not ready yet and try again in one second.
    const retryLater = (reason) => {
        console.log(reason);
        setTimeout(setupPdfListener, 1000);
    };

    const dataElement = document.getElementById('pdf_base64_data');
    if (!dataElement) {
        retryLater("PDF data element not found, will retry");
        return;
    }

    const textarea = dataElement.querySelector('textarea');
    if (!textarea) {
        retryLater("Textarea not found, will retry");
        return;
    }

    console.log("Found PDF data element, setting up listeners");

    // Hand the textarea content to the viewer when it looks like real data.
    const showIfReady = () => {
        const payload = textarea.value;
        if (payload && payload.length > 100) {
            displayPdfFromBase64(payload);
        }
    };

    // Display initial data if available
    showIfReady();

    // Use both an observer and polling for robustness
    // 1. One check per batch of mutation records is enough.
    const observer = new MutationObserver((mutations) => {
        if (mutations.length > 0) {
            showIfReady();
        }
    });
    observer.observe(textarea, {
        attributes: true,
        characterData: true,
        subtree: true,
        childList: true
    });

    // 2. Also use polling as a fallback
    setInterval(showIfReady, 1000);

    // Monitor the next/prev buttons to force PDF refresh
    const navButtons = [
        [document.getElementById('prev_button'), "Prev button clicked, forcing PDF refresh"],
        [document.getElementById('next_button'), "Next button clicked, forcing PDF refresh"]
    ];
    for (const [button, message] of navButtons) {
        if (!button) {
            continue;
        }
        button.addEventListener('click', () => {
            console.log(message);
            currentPdfHash = ""; // Reset hash to force refresh
        });
    }
}
|
586 |
|
587 |
// Start checking for PDF data
|
588 |
+
setTimeout(setupPdfListener, 1000);
|
589 |
|
590 |
// Add keyboard shortcuts
|
591 |
document.addEventListener('keydown', function(event) {
|