hynky HF Staff committed on
Commit
2bea527
·
1 Parent(s): c25fdf1

fuck sandbox on hf space

Browse files
Files changed (1) hide show
  1. extractor_compare.py +203 -55
extractor_compare.py CHANGED
@@ -198,7 +198,11 @@ def create_interface():
198
  }
199
  """
200
 
201
- with gr.Blocks(title="PDF Extractor Comparer", theme="soft", css=custom_css) as demo:
 
 
 
 
202
  gr.Markdown("## PDF Extractor Comparer")
203
 
204
  with gr.Row():
@@ -217,18 +221,7 @@ def create_interface():
217
  label="PDF Document",
218
  value='''
219
  <div style="width:100%; height:700px; position:relative; border:1px solid #ddd;">
220
- <style>
221
- @font-face {
222
- font-family: 'Local Arial';
223
- src: local('Arial');
224
- }
225
- body {
226
- font-family: 'Local Arial', sans-serif;
227
- }
228
- </style>
229
- <object id="pdf-object" type="application/pdf" width="100%" height="100%" style="display:none;">
230
- <p>PDF cannot be displayed</p>
231
- </object>
232
  <div id="pdf-fallback" style="position:absolute; top:0; left:0; width:100%; height:100%;
233
  display:flex; align-items:center; justify-content:center; padding:20px; text-align:center; font-family: Arial, sans-serif;">
234
  Click "Load PDFs" to start viewing documents.
@@ -352,92 +345,247 @@ def create_interface():
352
  fn=None,
353
  js="""
354
  function() {
355
- console.log("Setting up PDF viewer");
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
356
 
357
- // Store the current blob URL
358
- var pdfBlobUrl = null;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
359
 
360
  // Function to display PDF from base64 data
361
- function displayPdfFromBase64(base64Data) {
362
  try {
363
  if (!base64Data || base64Data.length < 100) {
364
  console.log("No valid PDF data received");
365
  document.getElementById('pdf-fallback').style.display = 'flex';
366
- document.getElementById('pdf-object').style.display = 'none';
367
  return;
368
  }
369
 
370
- console.log("Displaying PDF from base64 data");
371
-
372
- // Clean up previous blob URL
373
- if (pdfBlobUrl) {
374
- URL.revokeObjectURL(pdfBlobUrl);
375
  }
376
 
377
- // Convert base64 to binary
378
- const binary = atob(base64Data);
379
- const bytes = new Uint8Array(binary.length);
380
- for (let i = 0; i < binary.length; i++) {
381
- bytes[i] = binary.charCodeAt(i);
 
 
 
 
 
 
382
  }
383
 
384
- // Create blob and URL
385
- const blob = new Blob([bytes], {type: 'application/pdf'});
386
- pdfBlobUrl = URL.createObjectURL(blob);
 
 
 
387
 
388
- // Display PDF in the object element
389
- const pdfObject = document.getElementById('pdf-object');
390
- const fallback = document.getElementById('pdf-fallback');
 
391
 
392
- if (pdfObject && fallback) {
393
- pdfObject.data = pdfBlobUrl;
394
- pdfObject.style.display = 'block';
395
- fallback.style.display = 'none';
396
- console.log("PDF displayed successfully");
397
- } else {
398
- console.error("PDF viewer elements not found");
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
399
  }
400
  } catch (error) {
401
- console.error("Error displaying PDF:", error);
402
- const fallback = document.getElementById('pdf-fallback');
403
- if (fallback) {
404
- fallback.innerHTML = '<div style="color:red; font-family: Arial, sans-serif;">Error displaying PDF</div>';
405
- fallback.style.display = 'flex';
406
- }
 
407
  }
408
  }
409
 
410
- // Check for PDF data repeatedly
411
- function checkForPdfData() {
412
  const dataElement = document.getElementById('pdf_base64_data');
413
  if (!dataElement) {
414
  console.log("PDF data element not found, will retry");
415
- setTimeout(checkForPdfData, 1000);
416
  return;
417
  }
418
 
419
  const textarea = dataElement.querySelector('textarea');
420
  if (!textarea) {
421
  console.log("Textarea not found, will retry");
422
- setTimeout(checkForPdfData, 1000);
423
  return;
424
  }
425
 
 
 
426
  // Display initial data if available
427
  if (textarea.value && textarea.value.length > 100) {
428
  displayPdfFromBase64(textarea.value);
429
  }
430
 
431
- // Set up polling to check for changes
432
- setInterval(function() {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
433
  if (textarea.value && textarea.value.length > 100) {
434
  displayPdfFromBase64(textarea.value);
435
  }
436
- }, 2000);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
437
  }
438
 
439
  // Start checking for PDF data
440
- setTimeout(checkForPdfData, 1000);
441
 
442
  // Add keyboard shortcuts
443
  document.addEventListener('keydown', function(event) {
 
198
  }
199
  """
200
 
201
+ with gr.Blocks(title="PDF Extractor Comparer", theme="soft", css=custom_css, head=
202
+ """
203
+ <script src="https://unpkg.com/[email protected]/build/pdf.min.js"></script>
204
+ """
205
+ ) as demo:
206
  gr.Markdown("## PDF Extractor Comparer")
207
 
208
  with gr.Row():
 
221
  label="PDF Document",
222
  value='''
223
  <div style="width:100%; height:700px; position:relative; border:1px solid #ddd;">
224
+ <div id="pdf-container" style="width:100%; height:100%; overflow:auto;"></div>
 
 
 
 
 
 
 
 
 
 
 
225
  <div id="pdf-fallback" style="position:absolute; top:0; left:0; width:100%; height:100%;
226
  display:flex; align-items:center; justify-content:center; padding:20px; text-align:center; font-family: Arial, sans-serif;">
227
  Click "Load PDFs" to start viewing documents.
 
345
  fn=None,
346
  js="""
347
  function() {
348
+ console.log("Setting up PDF.js viewer");
349
+
350
+ // Configure PDF.js worker
351
+ if (window.pdfjsLib) {
352
+ window.pdfjsLib.GlobalWorkerOptions.workerSrc = "https://unpkg.com/[email protected]/build/pdf.worker.min.js";
353
+ console.log("PDF.js configured with worker");
354
+ } else {
355
+ console.warn("PDF.js not found in head, attempting to load dynamically");
356
+ // Fallback to load PDF.js dynamically if not in the head
357
+ const pdfJsScript = document.createElement('script');
358
+ pdfJsScript.src = "https://unpkg.com/[email protected]/build/pdf.min.js";
359
+ document.head.appendChild(pdfJsScript);
360
+
361
+ pdfJsScript.onload = function() {
362
+ window.pdfjsLib.GlobalWorkerOptions.workerSrc = "https://unpkg.com/[email protected]/build/pdf.worker.min.js";
363
+ console.log("PDF.js loaded dynamically");
364
+ };
365
+ }
366
 
367
+ // To track when we should force a refresh
368
+ let currentPdfHash = "";
369
+
370
+ // Function to render a PDF page
371
/**
 * Render one page of a loaded PDF.js document into a new canvas inside
 * `container`.
 *
 * The page wrapper is attached to `container` synchronously, before the first
 * `await`. Callers that start several renderPage() calls back-to-back (in page
 * order) therefore get their pages laid out in that same order, no matter
 * which page finishes rasterising first.
 *
 * @param {object} pdf - Loaded PDF.js document (must provide getPage()).
 * @param {number} pageNumber - 1-based page index passed to pdf.getPage().
 * @param {HTMLElement} container - Element that receives the page wrapper.
 * @returns {Promise<boolean>} true when the page rendered, false on failure
 *     (the failure is logged and the wrapper is removed again).
 */
async function renderPage(pdf, pageNumber, container) {
    // Build and attach the wrapper up front so the DOM position of this page
    // is fixed by call order rather than by render completion order.
    const pageContainer = document.createElement('div');
    pageContainer.className = 'pdf-page';
    pageContainer.style.position = 'relative';
    pageContainer.style.margin = '10px auto';
    pageContainer.style.boxShadow = '0 2px 5px rgba(0,0,0,0.2)';

    const canvas = document.createElement('canvas');
    pageContainer.appendChild(canvas);
    container.appendChild(pageContainer);

    try {
        const page = await pdf.getPage(pageNumber);
        const context = canvas.getContext('2d');

        // Fit the page to the container width (30px reserved for margins).
        // If the container currently has no usable width (e.g. it is hidden),
        // fall back to the page's natural size instead of a zero or negative
        // scale, which cannot be rendered.
        const containerWidth = container.clientWidth - 30;
        const originalViewport = page.getViewport({ scale: 1 });
        const scale = containerWidth > 0
            ? containerWidth / originalViewport.width
            : 1;
        const viewport = page.getViewport({ scale });

        canvas.width = viewport.width;
        canvas.height = viewport.height;

        await page.render({
            canvasContext: context,
            viewport: viewport
        }).promise;

        return true;
    } catch (error) {
        console.error(`Error rendering page ${pageNumber}:`, error);
        // Do not leave an empty placeholder behind for a page that failed.
        pageContainer.remove();
        return false;
    }
}
412
+
413
+ // Simple hash function for PDF data to detect changes
414
/**
 * Cheap fingerprint used to detect whether the PDF payload changed.
 *
 * Only the first 10000 UTF-16 code units are folded into a 32-bit rolling
 * hash; the full string length is appended so payloads that share a prefix
 * but differ in size still get different fingerprints.
 *
 * @param {string} str - Payload to fingerprint.
 * @returns {number|string} The number 0 for an empty string, otherwise a
 *     string of the form "<hash>_<length>".
 */
function hashData(str) {
    if (str.length === 0) {
        return 0;
    }

    const limit = Math.min(str.length, 10000);
    let acc = 0;
    let idx = 0;
    while (idx < limit) {
        const code = str.charCodeAt(idx);
        // (acc * 31 + code), truncated to a signed 32-bit integer.
        acc = ((acc << 5) - acc + code) | 0;
        idx += 1;
    }

    return `${acc}_${str.length}`;
}
425
 
426
  // Function to display PDF from base64 data
427
/**
 * Decode a base64-encoded PDF and render every page into #pdf-container
 * using PDF.js, showing #pdf-fallback for the empty, loading and error states.
 *
 * Relies on names from the enclosing scope: `currentPdfHash`, `hashData`
 * and `renderPage`.
 *
 * Behaviour notes:
 *  - Payloads shorter than 100 characters are treated as "no document".
 *  - A payload whose fingerprint equals `currentPdfHash` is not re-rendered.
 *  - The fingerprint is recorded only once PDF.js is available. Recording it
 *    earlier made the scheduled retry look like "same PDF already displayed",
 *    so the document was never rendered.
 *  - At most one retry timer is pending at a time and it is cancelled as soon
 *    as a different payload arrives, so an old payload cannot be rendered
 *    after a newer one.
 *  - Each real render takes a token; a load that finishes after a newer
 *    render has started leaves the container and fallback untouched.
 *  - Error text is written with textContent so it is never parsed as HTML.
 *
 * @param {string} base64Data - Base64 text of the PDF file.
 * @returns {Promise<void>}
 */
async function displayPdfFromBase64(base64Data) {
    const container = document.getElementById('pdf-container');
    const fallback = document.getElementById('pdf-fallback');
    if (!container || !fallback) {
        console.error("PDF viewer elements not found");
        return;
    }

    // Show an error in the fallback panel and allow the same payload to be
    // retried by clearing the stored fingerprint.
    function showError(prefix, error) {
        const box = document.createElement('div');
        box.style.color = 'red';
        box.style.fontFamily = 'Arial, sans-serif';
        box.textContent = `${prefix}: ${(error && error.message) || 'Unknown error'}`;
        fallback.innerHTML = '';
        fallback.appendChild(box);
        fallback.style.display = 'flex';
        currentPdfHash = ""; // Reset hash to allow retry
    }

    try {
        if (!base64Data || base64Data.length < 100) {
            console.log("No valid PDF data received");
            fallback.style.display = 'flex';
            container.innerHTML = '';
            return;
        }

        // Check if this is the same PDF we already have displayed
        const dataHash = hashData(base64Data);
        if (dataHash === currentPdfHash) {
            console.log("Same PDF already displayed, skipping render");
            return;
        }

        // A different payload supersedes any retry still waiting for PDF.js.
        clearTimeout(displayPdfFromBase64.retryTimer);

        // Check if PDF.js is loaded. The fingerprint is deliberately NOT
        // stored yet, otherwise the retry below would be skipped as a
        // duplicate.
        if (!window.pdfjsLib) {
            console.warn("PDF.js not loaded yet, waiting...");
            fallback.innerHTML =
                '<div style="font-family: Arial, sans-serif;">Loading PDF viewer...</div>';
            displayPdfFromBase64.retryTimer =
                setTimeout(() => displayPdfFromBase64(base64Data), 500);
            return;
        }

        currentPdfHash = dataHash;
        console.log("PDF changed, rendering new document");

        // Token identifying this render; a later render bumps it.
        const renderToken = (displayPdfFromBase64.renderToken || 0) + 1;
        displayPdfFromBase64.renderToken = renderToken;
        const isSuperseded = () => displayPdfFromBase64.renderToken !== renderToken;

        // Convert base64 to a byte array (atob throws on malformed input,
        // which is reported by the outer catch).
        const binaryString = atob(base64Data);
        const bytes = new Uint8Array(binaryString.length);
        for (let i = 0; i < binaryString.length; i++) {
            bytes[i] = binaryString.charCodeAt(i);
        }

        // Clear existing content
        container.innerHTML = '';
        fallback.style.display = 'none';

        try {
            // Show loading indicator
            const loadingIndicator = document.createElement('div');
            loadingIndicator.style.padding = '20px';
            loadingIndicator.style.textAlign = 'center';
            loadingIndicator.innerText = 'Loading PDF...';
            container.appendChild(loadingIndicator);

            // Load document
            const loadingTask = window.pdfjsLib.getDocument({ data: bytes });
            const pdf = await loadingTask.promise;

            // A newer document started rendering while this one was loading;
            // the container now belongs to that render.
            if (isSuperseded()) {
                console.log("PDF load superseded by a newer document, skipping render");
                return;
            }

            // Clear the loading indicator
            container.innerHTML = '';

            console.log(`PDF loaded with ${pdf.numPages} pages`);

            // Start all pages, then wait for every render to settle.
            const pagePromises = [];
            for (let i = 1; i <= pdf.numPages; i++) {
                pagePromises.push(renderPage(pdf, i, container));
            }
            await Promise.all(pagePromises);
            console.log("All pages rendered");

            // Scroll to top
            container.scrollTop = 0;
        } catch (error) {
            console.error("Error loading PDF:", error);
            if (isSuperseded()) {
                // Do not overwrite the state of the newer render.
                return;
            }
            // Remove the stale "Loading PDF..." indicator under the message.
            container.innerHTML = '';
            showError("Error loading PDF", error);
        }
    } catch (error) {
        console.error("Error processing PDF data:", error);
        showError("Error processing PDF", error);
    }
}
518
 
519
+ // Check for PDF data
520
/**
 * Locate the textarea inside #pdf_base64_data and keep the viewer in sync
 * with its value.
 *
 * If the wrapper element or its textarea is not in the DOM yet, the function
 * reschedules itself after one second. Once found, it renders the current
 * value, then watches for changes with both a MutationObserver and a
 * one-second poll. Clicking #prev_button or #next_button clears
 * `currentPdfHash` so the next check re-renders.
 *
 * Relies on `displayPdfFromBase64` and `currentPdfHash` from the enclosing
 * scope.
 */
function setupPdfListener() {
    const wrapper = document.getElementById('pdf_base64_data');
    if (!wrapper) {
        console.log("PDF data element not found, will retry");
        setTimeout(setupPdfListener, 1000);
        return;
    }

    const textarea = wrapper.querySelector('textarea');
    if (!textarea) {
        console.log("Textarea not found, will retry");
        setTimeout(setupPdfListener, 1000);
        return;
    }

    console.log("Found PDF data element, setting up listeners");

    // Shared check: hand the textarea content to the viewer when it looks
    // like real PDF data (more than 100 characters).
    const refreshFromTextarea = () => {
        const payload = textarea.value;
        if (payload && payload.length > 100) {
            displayPdfFromBase64(payload);
        }
    };

    // Display initial data if available
    refreshFromTextarea();

    // 1. Observer: one refresh attempt per batch of mutation records.
    const observer = new MutationObserver(() => {
        refreshFromTextarea();
    });
    observer.observe(textarea, {
        attributes: true,
        characterData: true,
        subtree: true,
        childList: true
    });

    // 2. Polling as a fallback
    setInterval(refreshFromTextarea, 1000);

    // Navigation buttons force a refresh by clearing the stored fingerprint.
    const navButtons = [
        ['prev_button', 'Prev'],
        ['next_button', 'Next']
    ];
    for (const [buttonId, label] of navButtons) {
        const button = document.getElementById(buttonId);
        if (!button) {
            continue;
        }
        button.addEventListener('click', () => {
            console.log(`${label} button clicked, forcing PDF refresh`);
            currentPdfHash = ""; // Reset hash to force refresh
        });
    }
}
586
 
587
  // Start checking for PDF data
588
+ setTimeout(setupPdfListener, 1000);
589
 
590
  // Add keyboard shortcuts
591
  document.addEventListener('keydown', function(event) {