# pdf_processor.py
"""
Handles the core logic of converting a PDF document into a single image.
"""

import fitz  # PyMuPDF
from PIL import Image
import io
import streamlit as st  # Imported for progress bar updates
from typing import Tuple, List, Union

# Constants
DEFAULT_PDF_DPI = 72  # Standard PDF DPI used for scaling calculations
JPEG_QUALITY = 95  # Quality setting for JPEG output


def calculate_image_dimensions(pdf_document: fitz.Document, dpi: int) -> Tuple[int, int, List[float]]:
    """
    Calculates the total dimensions required for the final image canvas.

    Iterates through the PDF pages to determine the maximum width and the
    total height needed when every page is rendered at the specified DPI.

    Parameters
    ----------
    pdf_document : fitz.Document
        The opened PyMuPDF document object.
    dpi : int
        The target resolution in dots per inch. Must be positive.

    Returns
    -------
    Tuple[int, int, List[float]]
        A tuple containing:
        - max_width (int): The maximum width required among all pages.
        - total_height (int): The sum of heights of all pages.
        - zooms (List[float]): A list of zoom factors, one per page.

    Raises
    ------
    ValueError
        If ``dpi`` is not a positive number.
    """
    if dpi <= 0:
        # A zero or negative zoom cannot produce a usable raster; fail loudly
        # instead of letting it look like an empty or corrupted PDF downstream.
        raise ValueError(f"dpi must be a positive number, got {dpi!r}")

    # The zoom depends only on the DPI, so compute it (and its matrix) once.
    zoom = dpi / DEFAULT_PDF_DPI
    scale = fitz.Matrix(zoom, zoom)

    max_width = 0
    total_height = 0
    zooms = []

    for page in pdf_document:
        # Size the page with PyMuPDF's own rectangle rounding rather than
        # int() truncation: truncating can come out smaller than the pixmap
        # that is later pasted, which crops content cumulatively.
        # NOTE(review): assumes Page.get_pixmap() sizes its pixmap as the
        # rounded, transformed page rectangle - confirm against PyMuPDF docs.
        pixel_box = (page.rect * scale).round()

        max_width = max(max_width, pixel_box.width)
        total_height += pixel_box.height
        zooms.append(zoom)

    return max_width, total_height, zooms


def _pixmap_to_rgb_image(pix: fitz.Pixmap, page_number: int) -> Image.Image:
    """
    Converts a rendered PyMuPDF pixmap into an RGB PIL image.

    The PIL mode is selected from ``pix.alpha`` before decoding, so a pixmap
    that carries an alpha channel is decoded as RGBA and then flattened to RGB
    instead of first failing as RGB.

    Parameters
    ----------
    pix : fitz.Pixmap
        The pixmap produced by ``Page.get_pixmap``.
    page_number : int
        1-based page number, used only in error messages.

    Returns
    -------
    Image.Image
        The page as an RGB PIL image.

    Raises
    ------
    ValueError
        If the sample buffer does not match the selected mode and size.
    """
    mode = "RGBA" if pix.alpha else "RGB"
    try:
        page_image = Image.frombytes(mode, (pix.width, pix.height), pix.samples)
    except ValueError as e:
        st.error(f"Error converting page {page_number} to Image: {e}")
        st.warning(f"Pixmap details: width={pix.width}, height={pix.height}, alpha={pix.alpha}, samples length={len(pix.samples)}")
        raise

    if mode == "RGBA":
        page_image = page_image.convert("RGB")
    return page_image


def render_pages_to_image(
    pdf_document: fitz.Document,
    zooms: List[float],
    canvas_width: int,
    canvas_height: int
) -> Image.Image:
    """
    Renders each page of the PDF onto a single PIL Image canvas.

    Pages are pasted top to bottom, left-aligned, onto a white RGB canvas.
    Progress is reported through a Streamlit progress bar and status text.

    Parameters
    ----------
    pdf_document : fitz.Document
        The opened PyMuPDF document object.
    zooms : List[float]
        A list of zoom factors, one for each page.
    canvas_width : int
        The width of the final image canvas.
    canvas_height : int
        The height of the final image canvas.

    Returns
    -------
    Image.Image
        A PIL Image object containing all rendered PDF pages.

    Raises
    ------
    ValueError
        If a page's pixmap cannot be converted to a PIL image.
    """
    num_pages = len(pdf_document)

    # Create a new blank image canvas (RGB white background)
    result_image = Image.new("RGB", (canvas_width, canvas_height), (255, 255, 255))
    current_height = 0

    # Initialize Streamlit progress reporting
    progress_bar = st.progress(0)
    status_text = st.empty()

    for page_num in range(num_pages):
        status_text.text(f"Processing page {page_num + 1}/{num_pages}...")

        # Rasterize the page at its zoom factor
        zoom = zooms[page_num]
        pix = pdf_document[page_num].get_pixmap(matrix=fitz.Matrix(zoom, zoom))
        page_image = _pixmap_to_rgb_image(pix, page_num + 1)

        # Paste at the current vertical offset, then move down by the height
        # that was actually rendered.
        result_image.paste(page_image, (0, current_height))
        current_height += pix.height

        progress_bar.progress((page_num + 1) / num_pages)

    status_text.text("Rendering complete!")
    return result_image


def _encode_image(image: Image.Image, output_format: str) -> io.BytesIO:
    """
    Encodes a PIL image into an in-memory buffer.

    Parameters
    ----------
    image : Image.Image
        The image to encode.
    output_format : str
        "PNG", "JPG" or "JPEG" (case-insensitive). Any other value triggers a
        Streamlit warning and falls back to PNG.

    Returns
    -------
    io.BytesIO
        Buffer holding the encoded image, positioned at the start.
    """
    img_buffer = io.BytesIO()
    normalized = output_format.upper()

    if normalized in ("JPG", "JPEG"):
        # JPEG has no alpha channel; flatten first if one is present.
        if image.mode == "RGBA":
            image = image.convert("RGB")
        image.save(img_buffer, format="JPEG", quality=JPEG_QUALITY)
    else:
        if normalized != "PNG":
            # Default to PNG if format is unknown
            st.warning(f"Unsupported format '{output_format}'. Defaulting to PNG.")
        image.save(img_buffer, format="PNG")

    # Reset buffer position to the beginning for reading
    img_buffer.seek(0)
    return img_buffer


def pdf_to_single_image(pdf_path: str, output_format: str = "PNG", dpi: int = 300) -> io.BytesIO:
    """
    Converts all pages of a PDF file into a single vertical image.

    Opens the PDF, calculates the required dimensions, renders each page at
    the specified DPI, stitches them together vertically, and returns the
    result as an image in a BytesIO buffer.

    Parameters
    ----------
    pdf_path : str
        The file path to the input PDF document.
    output_format : str, optional
        The desired output image format ("PNG" or "JPG"), by default "PNG".
        Unknown formats fall back to PNG with a warning.
    dpi : int, optional
        The resolution (dots per inch) for rendering the PDF pages, by default
        300. Higher DPI results in better quality but larger file size.

    Returns
    -------
    io.BytesIO
        A BytesIO buffer containing the generated image data in the specified
        format. The buffer is empty when the computed canvas has zero width or
        height (for example, a document without pages).

    Raises
    ------
    Exception
        Any error raised while opening, rendering or encoding is reported via
        ``st.error`` and then re-raised unchanged (including PyMuPDF errors
        and the ``ValueError`` raised for a non-positive ``dpi``).
    """
    try:
        # The context manager closes the document even if rendering fails.
        with fitz.open(pdf_path) as pdf_document:
            canvas_width, canvas_height, zooms = calculate_image_dimensions(pdf_document, dpi)

            if canvas_width == 0 or canvas_height == 0:
                st.warning("Could not determine valid dimensions for the PDF. It might be empty or corrupted.")
                return io.BytesIO()  # Return empty buffer

            result_image = render_pages_to_image(pdf_document, zooms, canvas_width, canvas_height)

        # The stitched image no longer depends on the (now closed) document.
        return _encode_image(result_image, output_format)
    except Exception as e:
        # NOTE(review): `fitz.FitzError` is looked up defensively. If the
        # installed PyMuPDF does not define it, a literal
        # `except fitz.FitzError` would raise AttributeError while handling
        # the real error and hide it. Confirm which exception classes PyMuPDF
        # exposes and narrow this accordingly.
        pdf_error_type = getattr(fitz, "FitzError", None)
        if pdf_error_type is not None and isinstance(e, pdf_error_type):
            st.error(f"Error processing PDF: {e}")
        else:
            st.error(f"An unexpected error occurred during conversion: {e}")
        raise  # Re-raise the original exception for the caller