hynky HF Staff committed on
Commit
9686506
·
1 Parent(s): 218233f
Files changed (1) hide show
  1. extractor_compare.py +101 -114
extractor_compare.py CHANGED
@@ -217,7 +217,17 @@ def create_interface():
217
  label="PDF Document",
218
  value='''
219
  <div style="width:100%; height:700px; position:relative; border:1px solid #ddd;">
220
- <iframe id="pdf-iframe" width="100%" height="100%" style="border:none;" src="about:blank"></iframe>
 
 
 
 
 
 
 
 
 
 
221
  <div id="pdf-fallback" style="position:absolute; top:0; left:0; width:100%; height:100%;
222
  display:flex; align-items:center; justify-content:center; padding:20px; text-align:center;">
223
  Click "Load PDFs" to start viewing documents.
@@ -336,80 +346,41 @@ def create_interface():
336
  outputs=[extractor2_text]
337
  )
338
 
339
- # Add JavaScript to handle PDF display
340
  demo.load(
341
- None, None, None,
342
  js="""
343
- () => {
344
- console.log('Setting up PDF Blob handler...');
345
- window.currentPdfBlobUrl = null; // Store previous blob url globally
346
-
347
- function displayPdfBlob(base64Data) {
348
- console.log('displayPdfBlob called - data length:', base64Data ? base64Data.length : 0);
349
- const iframe = document.getElementById('pdf-iframe');
350
- const fallbackDiv = document.getElementById('pdf-fallback');
351
-
352
- if (!iframe) {
353
- console.error('PDF iframe not found!');
354
- return;
355
- }
356
-
357
- // Revoke previous Blob URL to free memory
358
- if (window.currentPdfBlobUrl) {
359
- console.log('Revoking previous Blob URL:', window.currentPdfBlobUrl);
360
- URL.revokeObjectURL(window.currentPdfBlobUrl);
361
- window.currentPdfBlobUrl = null;
362
- }
363
-
364
- if (base64Data && base64Data.length > 100) { // Ensure there's actual content
365
- try {
366
- // Hide fallback message
367
- if (fallbackDiv) fallbackDiv.style.display = 'none';
368
-
369
- // Decode Base64
370
- const byteCharacters = atob(base64Data);
371
- console.log('Base64 decoded successfully, length:', byteCharacters.length);
372
- const byteNumbers = new Array(byteCharacters.length);
373
- for (let i = 0; i < byteCharacters.length; i++) {
374
- byteNumbers[i] = byteCharacters.charCodeAt(i);
375
- }
376
- const byteArray = new Uint8Array(byteNumbers);
377
-
378
- // Create Blob and URL
379
- const blob = new Blob([byteArray], {type: 'application/pdf'});
380
- window.currentPdfBlobUrl = URL.createObjectURL(blob);
381
- console.log('Created new Blob URL:', window.currentPdfBlobUrl);
382
-
383
- // Update iframe source
384
- iframe.src = window.currentPdfBlobUrl;
385
- console.log('Iframe src updated to Blob URL');
386
- } catch (e) {
387
- console.error('Error processing Base64 data or creating Blob URL:', e);
388
- if (fallbackDiv) {
389
- fallbackDiv.innerHTML = '<div style="color:red;">Error loading PDF: ' + e.message + '</div>';
390
- fallbackDiv.style.display = 'flex';
391
- }
392
- iframe.src = 'about:blank'; // Clear iframe on error
393
  }
394
- } else {
395
- console.log('No valid Base64 data provided.');
396
- if (fallbackDiv) {
397
- fallbackDiv.innerHTML = '<div>No PDF loaded yet. Use the "Load PDFs" button.</div>';
398
- fallbackDiv.style.display = 'flex';
 
399
  }
400
- iframe.src = 'about:blank'; // Clear iframe if no data
401
- }
402
- }
403
-
404
- // MutationObserver to watch the hidden Textbox
405
- const targetNode = document.getElementById('pdf_base64_data');
406
- if (targetNode) {
407
- // Find the actual textarea inside the Gradio component structure
408
- const hiddenTextArea = targetNode.querySelector('textarea');
409
- if(hiddenTextArea){
410
- console.log('Found hidden textarea to observe.');
411
- const observerConfig = { characterData: true, childList: true, subtree: true, attributes: true }; // Watch for all changes
412
 
 
 
 
 
 
 
 
 
 
 
 
413
  const observer = new MutationObserver(function(mutationsList) {
414
  console.log('Mutation detected, checking textarea value');
415
  if (hiddenTextArea.value && hiddenTextArea.value.length > 100) {
@@ -418,55 +389,71 @@ def create_interface():
418
  }
419
  });
420
 
421
- observer.observe(targetNode, observerConfig);
422
- console.log('MutationObserver attached');
423
-
424
- // Try to display any initial value
425
- setTimeout(() => {
426
- if(hiddenTextArea.value && hiddenTextArea.value.length > 100) {
427
- console.log('Initial value found in textarea, displaying PDF');
428
- displayPdfBlob(hiddenTextArea.value);
429
- }
430
- }, 1000);
431
-
432
- } else {
433
- console.error('Could not find the textarea within #pdf_base64_data!');
434
  }
435
- } else {
436
- console.error('Hidden data element #pdf_base64_data not found!');
437
- }
438
-
439
- // Add keyboard shortcuts like in app.py
440
- document.addEventListener('keydown', function(event) {
441
- if (event.target.tagName === 'INPUT' || event.target.tagName === 'TEXTAREA') {
442
- return; // Ignore inputs
 
 
 
 
 
443
  }
444
- let targetButtonId = null;
445
- const key = event.key;
446
 
447
- if (key === 'ArrowLeft') targetButtonId = 'prev_button';
448
- else if (key === 'ArrowRight') targetButtonId = 'next_button';
449
-
450
- if (targetButtonId) {
451
- const targetButton = document.getElementById(targetButtonId);
452
- if (targetButton) {
453
- event.preventDefault();
454
- targetButton.click();
455
- }
456
- }
457
- });
458
- console.log('Keydown listener added.');
459
-
460
- // Additional style for basic font
461
- const additionalStyle = document.createElement('style');
462
- additionalStyle.textContent = `
463
- .extraction-text textarea {
464
- font-family: Arial, Helvetica, sans-serif !important;
465
- font-size: 14px !important;
466
  }
467
- `;
468
- document.head.appendChild(additionalStyle);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
469
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
470
  """
471
  )
472
 
 
217
  label="PDF Document",
218
  value='''
219
  <div style="width:100%; height:700px; position:relative; border:1px solid #ddd;">
220
+ <style>
221
+ @font-face {
222
+ font-family: 'Local Arial';
223
+ src: local('Arial');
224
+ }
225
+ body {
226
+ font-family: 'Local Arial', sans-serif;
227
+ }
228
+ </style>
229
+ <meta http-equiv="Content-Security-Policy" content="default-src * blob:; script-src 'self' 'unsafe-inline' 'unsafe-eval'; style-src 'self' 'unsafe-inline';">
230
+ <iframe id="pdf-iframe" width="100%" height="100%" style="border:none;" src="about:blank" sandbox="allow-same-origin allow-scripts allow-forms"></iframe>
231
  <div id="pdf-fallback" style="position:absolute; top:0; left:0; width:100%; height:100%;
232
  display:flex; align-items:center; justify-content:center; padding:20px; text-align:center;">
233
  Click "Load PDFs" to start viewing documents.
 
346
  outputs=[extractor2_text]
347
  )
348
 
349
+ # JavaScript for PDF handling
350
  demo.load(
351
+ fn=None,
352
  js="""
353
+ // Function to safely setup the MutationObserver for the PDF data
354
+ function setupPdfDataObserver() {
355
+ console.log('Setting up PDF data observer...');
356
+
357
+ // Wait for Gradio components to fully render
358
+ setTimeout(() => {
359
+ try {
360
+ const targetNode = document.getElementById('pdf_base64_data');
361
+ if (!targetNode) {
362
+ console.error('PDF data container not found!');
363
+ return;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
364
  }
365
+
366
+ // Find the textarea within the Gradio component
367
+ const hiddenTextArea = targetNode.querySelector('textarea');
368
+ if (!hiddenTextArea) {
369
+ console.error('Hidden textarea not found within the container!');
370
+ return;
371
  }
 
 
 
 
 
 
 
 
 
 
 
 
372
 
373
+ console.log('Found hidden textarea to observe');
374
+
375
+ // Setup observer configuration
376
+ const observerConfig = {
377
+ characterData: true,
378
+ childList: true,
379
+ subtree: true,
380
+ attributes: true
381
+ };
382
+
383
+ // Create and attach the observer
384
  const observer = new MutationObserver(function(mutationsList) {
385
  console.log('Mutation detected, checking textarea value');
386
  if (hiddenTextArea.value && hiddenTextArea.value.length > 100) {
 
389
  }
390
  });
391
 
392
+ // Observe the textarea itself, not its parent
393
+ observer.observe(hiddenTextArea, observerConfig);
394
+ console.log('MutationObserver attached to textarea');
395
+
396
+ // Also check initial value
397
+ if (hiddenTextArea.value && hiddenTextArea.value.length > 100) {
398
+ console.log('Initial valid value found, displaying PDF');
399
+ displayPdfBlob(hiddenTextArea.value);
400
+ }
401
+ } catch (error) {
402
+ console.error('Error setting up observer:', error);
 
 
403
  }
404
+ }, 1000); // Wait 1 second for components to render
405
+ }
406
+
407
+ // Function to display PDF from base64 data
408
+ function displayPdfBlob(base64Data) {
409
+ try {
410
+ // Get iframe and fallback elements
411
+ const iframe = document.getElementById('pdf-iframe');
412
+ const fallback = document.getElementById('pdf-fallback');
413
+
414
+ if (!iframe || !fallback) {
415
+ console.error('PDF viewer elements not found');
416
+ return;
417
  }
 
 
418
 
419
+ // Convert base64 to binary
420
+ const binaryString = atob(base64Data);
421
+ const len = binaryString.length;
422
+ const bytes = new Uint8Array(len);
423
+
424
+ for (let i = 0; i < len; i++) {
425
+ bytes[i] = binaryString.charCodeAt(i);
 
 
 
 
 
 
 
 
 
 
 
 
426
  }
427
+
428
+ // Create blob and URL
429
+ const blob = new Blob([bytes], { type: 'application/pdf' });
430
+ const objectUrl = URL.createObjectURL(blob);
431
+
432
+ // Update iframe
433
+ iframe.src = objectUrl;
434
+
435
+ // Hide fallback message
436
+ fallback.style.display = 'none';
437
+
438
+ // Log success
439
+ console.log('PDF displayed successfully');
440
+ } catch (error) {
441
+ console.error('Error displaying PDF:', error);
442
+ }
443
  }
444
+
445
+ // Initialize the observer after everything is loaded
446
+ window.addEventListener('load', function() {
447
+ console.log('Window loaded, initializing PDF observer...');
448
+ setupPdfDataObserver();
449
+ });
450
+
451
+ // Also setup when Gradio mounts the component
452
+ document.addEventListener('DOMContentLoaded', function() {
453
+ console.log('DOM loaded, waiting for Gradio components...');
454
+ // Wait a bit longer for Gradio components to mount
455
+ setTimeout(setupPdfDataObserver, 2000);
456
+ });
457
  """
458
  )
459