try again
Browse files- extractor_compare.py +101 -114
extractor_compare.py
CHANGED
@@ -217,7 +217,17 @@ def create_interface():
|
|
217 |
label="PDF Document",
|
218 |
value='''
|
219 |
<div style="width:100%; height:700px; position:relative; border:1px solid #ddd;">
|
220 |
-
<
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
221 |
<div id="pdf-fallback" style="position:absolute; top:0; left:0; width:100%; height:100%;
|
222 |
display:flex; align-items:center; justify-content:center; padding:20px; text-align:center;">
|
223 |
Click "Load PDFs" to start viewing documents.
|
@@ -336,80 +346,41 @@ def create_interface():
|
|
336 |
outputs=[extractor2_text]
|
337 |
)
|
338 |
|
339 |
-
#
|
340 |
demo.load(
|
341 |
-
None,
|
342 |
js="""
|
343 |
-
|
344 |
-
|
345 |
-
|
346 |
-
|
347 |
-
|
348 |
-
|
349 |
-
|
350 |
-
|
351 |
-
|
352 |
-
|
353 |
-
|
354 |
-
return;
|
355 |
-
}
|
356 |
-
|
357 |
-
// Revoke previous Blob URL to free memory
|
358 |
-
if (window.currentPdfBlobUrl) {
|
359 |
-
console.log('Revoking previous Blob URL:', window.currentPdfBlobUrl);
|
360 |
-
URL.revokeObjectURL(window.currentPdfBlobUrl);
|
361 |
-
window.currentPdfBlobUrl = null;
|
362 |
-
}
|
363 |
-
|
364 |
-
if (base64Data && base64Data.length > 100) { // Ensure there's actual content
|
365 |
-
try {
|
366 |
-
// Hide fallback message
|
367 |
-
if (fallbackDiv) fallbackDiv.style.display = 'none';
|
368 |
-
|
369 |
-
// Decode Base64
|
370 |
-
const byteCharacters = atob(base64Data);
|
371 |
-
console.log('Base64 decoded successfully, length:', byteCharacters.length);
|
372 |
-
const byteNumbers = new Array(byteCharacters.length);
|
373 |
-
for (let i = 0; i < byteCharacters.length; i++) {
|
374 |
-
byteNumbers[i] = byteCharacters.charCodeAt(i);
|
375 |
-
}
|
376 |
-
const byteArray = new Uint8Array(byteNumbers);
|
377 |
-
|
378 |
-
// Create Blob and URL
|
379 |
-
const blob = new Blob([byteArray], {type: 'application/pdf'});
|
380 |
-
window.currentPdfBlobUrl = URL.createObjectURL(blob);
|
381 |
-
console.log('Created new Blob URL:', window.currentPdfBlobUrl);
|
382 |
-
|
383 |
-
// Update iframe source
|
384 |
-
iframe.src = window.currentPdfBlobUrl;
|
385 |
-
console.log('Iframe src updated to Blob URL');
|
386 |
-
} catch (e) {
|
387 |
-
console.error('Error processing Base64 data or creating Blob URL:', e);
|
388 |
-
if (fallbackDiv) {
|
389 |
-
fallbackDiv.innerHTML = '<div style="color:red;">Error loading PDF: ' + e.message + '</div>';
|
390 |
-
fallbackDiv.style.display = 'flex';
|
391 |
-
}
|
392 |
-
iframe.src = 'about:blank'; // Clear iframe on error
|
393 |
}
|
394 |
-
|
395 |
-
|
396 |
-
|
397 |
-
|
398 |
-
|
|
|
399 |
}
|
400 |
-
iframe.src = 'about:blank'; // Clear iframe if no data
|
401 |
-
}
|
402 |
-
}
|
403 |
-
|
404 |
-
// MutationObserver to watch the hidden Textbox
|
405 |
-
const targetNode = document.getElementById('pdf_base64_data');
|
406 |
-
if (targetNode) {
|
407 |
-
// Find the actual textarea inside the Gradio component structure
|
408 |
-
const hiddenTextArea = targetNode.querySelector('textarea');
|
409 |
-
if(hiddenTextArea){
|
410 |
-
console.log('Found hidden textarea to observe.');
|
411 |
-
const observerConfig = { characterData: true, childList: true, subtree: true, attributes: true }; // Watch for all changes
|
412 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
413 |
const observer = new MutationObserver(function(mutationsList) {
|
414 |
console.log('Mutation detected, checking textarea value');
|
415 |
if (hiddenTextArea.value && hiddenTextArea.value.length > 100) {
|
@@ -418,55 +389,71 @@ def create_interface():
|
|
418 |
}
|
419 |
});
|
420 |
|
421 |
-
|
422 |
-
|
423 |
-
|
424 |
-
|
425 |
-
|
426 |
-
|
427 |
-
|
428 |
-
|
429 |
-
|
430 |
-
|
431 |
-
|
432 |
-
} else {
|
433 |
-
console.error('Could not find the textarea within #pdf_base64_data!');
|
434 |
}
|
435 |
-
}
|
436 |
-
|
437 |
-
|
438 |
-
|
439 |
-
|
440 |
-
|
441 |
-
|
442 |
-
|
|
|
|
|
|
|
|
|
|
|
443 |
}
|
444 |
-
let targetButtonId = null;
|
445 |
-
const key = event.key;
|
446 |
|
447 |
-
|
448 |
-
|
449 |
-
|
450 |
-
|
451 |
-
|
452 |
-
|
453 |
-
|
454 |
-
targetButton.click();
|
455 |
-
}
|
456 |
-
}
|
457 |
-
});
|
458 |
-
console.log('Keydown listener added.');
|
459 |
-
|
460 |
-
// Additional style for basic font
|
461 |
-
const additionalStyle = document.createElement('style');
|
462 |
-
additionalStyle.textContent = `
|
463 |
-
.extraction-text textarea {
|
464 |
-
font-family: Arial, Helvetica, sans-serif !important;
|
465 |
-
font-size: 14px !important;
|
466 |
}
|
467 |
-
|
468 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
469 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
470 |
"""
|
471 |
)
|
472 |
|
|
|
217 |
label="PDF Document",
|
218 |
value='''
|
219 |
<div style="width:100%; height:700px; position:relative; border:1px solid #ddd;">
|
220 |
+
<style>
|
221 |
+
@font-face {
|
222 |
+
font-family: 'Local Arial';
|
223 |
+
src: local('Arial');
|
224 |
+
}
|
225 |
+
body {
|
226 |
+
font-family: 'Local Arial', sans-serif;
|
227 |
+
}
|
228 |
+
</style>
|
229 |
+
<meta http-equiv="Content-Security-Policy" content="default-src * blob:; script-src 'self' 'unsafe-inline' 'unsafe-eval'; style-src 'self' 'unsafe-inline';">
|
230 |
+
<iframe id="pdf-iframe" width="100%" height="100%" style="border:none;" src="about:blank" sandbox="allow-same-origin allow-scripts allow-forms"></iframe>
|
231 |
<div id="pdf-fallback" style="position:absolute; top:0; left:0; width:100%; height:100%;
|
232 |
display:flex; align-items:center; justify-content:center; padding:20px; text-align:center;">
|
233 |
Click "Load PDFs" to start viewing documents.
|
|
|
346 |
outputs=[extractor2_text]
|
347 |
)
|
348 |
|
349 |
+
# JavaScript for PDF handling
|
350 |
demo.load(
|
351 |
+
fn=None,
|
352 |
js="""
|
353 |
+
// Function to safely setup the MutationObserver for the PDF data
|
354 |
+
function setupPdfDataObserver() {
|
355 |
+
console.log('Setting up PDF data observer...');
|
356 |
+
|
357 |
+
// Wait for Gradio components to fully render
|
358 |
+
setTimeout(() => {
|
359 |
+
try {
|
360 |
+
const targetNode = document.getElementById('pdf_base64_data');
|
361 |
+
if (!targetNode) {
|
362 |
+
console.error('PDF data container not found!');
|
363 |
+
return;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
364 |
}
|
365 |
+
|
366 |
+
// Find the textarea within the Gradio component
|
367 |
+
const hiddenTextArea = targetNode.querySelector('textarea');
|
368 |
+
if (!hiddenTextArea) {
|
369 |
+
console.error('Hidden textarea not found within the container!');
|
370 |
+
return;
|
371 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
372 |
|
373 |
+
console.log('Found hidden textarea to observe');
|
374 |
+
|
375 |
+
// Setup observer configuration
|
376 |
+
const observerConfig = {
|
377 |
+
characterData: true,
|
378 |
+
childList: true,
|
379 |
+
subtree: true,
|
380 |
+
attributes: true
|
381 |
+
};
|
382 |
+
|
383 |
+
// Create and attach the observer
|
384 |
const observer = new MutationObserver(function(mutationsList) {
|
385 |
console.log('Mutation detected, checking textarea value');
|
386 |
if (hiddenTextArea.value && hiddenTextArea.value.length > 100) {
|
|
|
389 |
}
|
390 |
});
|
391 |
|
392 |
+
// Observe the textarea itself, not its parent
|
393 |
+
observer.observe(hiddenTextArea, observerConfig);
|
394 |
+
console.log('MutationObserver attached to textarea');
|
395 |
+
|
396 |
+
// Also check initial value
|
397 |
+
if (hiddenTextArea.value && hiddenTextArea.value.length > 100) {
|
398 |
+
console.log('Initial valid value found, displaying PDF');
|
399 |
+
displayPdfBlob(hiddenTextArea.value);
|
400 |
+
}
|
401 |
+
} catch (error) {
|
402 |
+
console.error('Error setting up observer:', error);
|
|
|
|
|
403 |
}
|
404 |
+
}, 1000); // Wait 1 second for components to render
|
405 |
+
}
|
406 |
+
|
407 |
+
// Function to display PDF from base64 data
|
408 |
+
function displayPdfBlob(base64Data) {
|
409 |
+
try {
|
410 |
+
// Get iframe and fallback elements
|
411 |
+
const iframe = document.getElementById('pdf-iframe');
|
412 |
+
const fallback = document.getElementById('pdf-fallback');
|
413 |
+
|
414 |
+
if (!iframe || !fallback) {
|
415 |
+
console.error('PDF viewer elements not found');
|
416 |
+
return;
|
417 |
}
|
|
|
|
|
418 |
|
419 |
+
// Convert base64 to binary
|
420 |
+
const binaryString = atob(base64Data);
|
421 |
+
const len = binaryString.length;
|
422 |
+
const bytes = new Uint8Array(len);
|
423 |
+
|
424 |
+
for (let i = 0; i < len; i++) {
|
425 |
+
bytes[i] = binaryString.charCodeAt(i);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
426 |
}
|
427 |
+
|
428 |
+
// Create blob and URL
|
429 |
+
const blob = new Blob([bytes], { type: 'application/pdf' });
|
430 |
+
const objectUrl = URL.createObjectURL(blob);
|
431 |
+
|
432 |
+
// Update iframe
|
433 |
+
iframe.src = objectUrl;
|
434 |
+
|
435 |
+
// Hide fallback message
|
436 |
+
fallback.style.display = 'none';
|
437 |
+
|
438 |
+
// Log success
|
439 |
+
console.log('PDF displayed successfully');
|
440 |
+
} catch (error) {
|
441 |
+
console.error('Error displaying PDF:', error);
|
442 |
+
}
|
443 |
}
|
444 |
+
|
445 |
+
// Initialize the observer after everything is loaded
|
446 |
+
window.addEventListener('load', function() {
|
447 |
+
console.log('Window loaded, initializing PDF observer...');
|
448 |
+
setupPdfDataObserver();
|
449 |
+
});
|
450 |
+
|
451 |
+
// Also setup when Gradio mounts the component
|
452 |
+
document.addEventListener('DOMContentLoaded', function() {
|
453 |
+
console.log('DOM loaded, waiting for Gradio components...');
|
454 |
+
// Wait a bit longer for Gradio components to mount
|
455 |
+
setTimeout(setupPdfDataObserver, 2000);
|
456 |
+
});
|
457 |
"""
|
458 |
)
|
459 |
|