Spaces:

tsphan
/

pdf_to_single_image

Sleeping

File size: 7,697 Bytes

576a588

# pdf_processor.py
"""
Handles the core logic of converting a PDF document into a single image.
"""

import fitz  # PyMuPDF
from PIL import Image
import io
import streamlit as st  # Imported for progress bar updates
from typing import Tuple, List, Union

# Constants
# The render zoom for a target resolution is computed as dpi / DEFAULT_PDF_DPI,
# i.e. 72 is the baseline at which a page is rendered with zoom factor 1.0.
DEFAULT_PDF_DPI = 72  # Baseline DPI the zoom factor is measured against
JPEG_QUALITY = 95     # Passed as `quality` when the result is saved as JPEG

def calculate_image_dimensions(pdf_document: fitz.Document, dpi: int) -> Tuple[int, int, List[float]]:
    """
    Calculates the total dimensions required for the final image canvas.

    Walks every page of the document and works out how large a canvas must
    be to hold all pages stacked vertically when rendered at the given DPI.

    Page sizes are taken from the integer rectangle obtained by scaling
    ``page.rect`` with the render matrix and rounding it outwards
    (``Rect.round()``), instead of truncating with ``int()``. Truncation
    can under-estimate each page by up to one pixel, which accumulates over
    many pages and lets the bottom of the last page fall off the canvas.
    NOTE(review): this is intended to match the pixmap size produced by
    ``page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))`` - confirm against
    the installed PyMuPDF version.

    Parameters
    ----------
    pdf_document : fitz.Document
        The opened PyMuPDF document object.
    dpi : int
        The target resolution in dots per inch.

    Returns
    -------
    Tuple[int, int, List[float]]
        A tuple containing:
        - max_width (int): The widest rendered page, in pixels.
        - total_height (int): The sum of all rendered page heights, in pixels.
        - zooms (List[float]): The zoom factor for each page (one entry per
          page; currently the same value for every page).
    """
    # The zoom only depends on the requested DPI, so compute it - and the
    # matching transformation matrix - once rather than once per page.
    zoom = dpi / DEFAULT_PDF_DPI
    scale = fitz.Matrix(zoom, zoom)

    max_width = 0
    total_height = 0
    zooms = []

    for page_num in range(len(pdf_document)):
        page = pdf_document[page_num]
        # Smallest integer rectangle containing the scaled page area.
        pixel_rect = (page.rect * scale).round()
        max_width = max(max_width, pixel_rect.width)
        total_height += pixel_rect.height
        zooms.append(zoom)

    return max_width, total_height, zooms

def _pixmap_to_rgb_image(pix, page_number: int) -> Image.Image:
    """
    Converts a rendered page pixmap into an RGB PIL image.

    The pixmap samples are first interpreted as RGB. If PIL rejects them
    with a ValueError, the problem is reported through Streamlit and, when
    the pixmap carries an alpha channel, the samples are re-read as RGBA and
    converted to RGB. Without an alpha channel the ValueError is re-raised.

    Parameters
    ----------
    pix
        The pixmap returned by ``page.get_pixmap``.
    page_number : int
        1-based page number, used only in the diagnostic messages.

    Returns
    -------
    Image.Image
        The page as an RGB PIL image.
    """
    dimensions = (pix.width, pix.height)
    try:
        return Image.frombytes("RGB", dimensions, pix.samples)
    except ValueError as e:
        st.error(f"Error converting page {page_number} to Image: {e}")
        st.warning(f"Pixmap details: width={pix.width}, height={pix.height}, alpha={pix.alpha}, samples length={len(pix.samples)}")
        if not pix.alpha:
            # Not an alpha-channel mismatch: surface the original error.
            raise
        rgb_image = Image.frombytes("RGBA", dimensions, pix.samples).convert("RGB")
        st.info("Retrying page conversion with RGBA mode.")
        return rgb_image


def render_pages_to_image(
    pdf_document: fitz.Document,
    zooms: List[float],
    canvas_width: int,
    canvas_height: int
) -> Image.Image:
    """
    Renders each page of the PDF onto a single PIL Image canvas.

    Pages are rasterised one after another and pasted at the left edge of a
    white RGB canvas, each directly below the previous one. Progress is
    reported through a Streamlit progress bar and status text.

    Parameters
    ----------
    pdf_document : fitz.Document
        The opened PyMuPDF document object.
    zooms : List[float]
        A list of zoom factors, one for each page.
    canvas_width : int
        The width of the final image canvas.
    canvas_height : int
        The height of the final image canvas.

    Returns
    -------
    Image.Image
        A PIL Image object containing all rendered PDF pages.
    """
    page_count = len(pdf_document)
    white = (255, 255, 255)
    canvas = Image.new("RGB", (canvas_width, canvas_height), white)

    # Streamlit widgets used to report progress while rendering
    progress_bar = st.progress(0)
    status_text = st.empty()

    y_offset = 0  # vertical paste position of the next page
    for index in range(page_count):
        page_number = index + 1
        status_text.text(f"Processing page {page_number}/{page_count}...")

        # Rasterise the page at its zoom factor
        factor = zooms[index]
        pix = pdf_document[index].get_pixmap(matrix=fitz.Matrix(factor, factor))

        # Stack the page below everything pasted so far
        canvas.paste(_pixmap_to_rgb_image(pix, page_number), (0, y_offset))
        y_offset += pix.height

        progress_bar.progress(page_number / page_count)

    status_text.text("Rendering complete!")
    return canvas

def pdf_to_single_image(pdf_path: str, output_format: str = "PNG", dpi: int = 300) -> io.BytesIO:
    """
    Converts all pages of a PDF file into a single vertical image.

    Opens the PDF, calculates the required dimensions, renders each page
    at the specified DPI, stitches them together vertically, and returns
    the result as an image in a BytesIO buffer.

    Parameters
    ----------
    pdf_path : str
        The file path to the input PDF document.
    output_format : str, optional
        The desired output image format ("PNG", "JPG" or "JPEG",
        case-insensitive), by default "PNG". Any other value triggers a
        Streamlit warning and falls back to PNG.
    dpi : int, optional
        The resolution (dots per inch) for rendering the PDF pages, by default 300.
        Higher DPI results in better quality but larger file size.

    Returns
    -------
    io.BytesIO
        A BytesIO buffer containing the generated image data, positioned at
        the start. An empty buffer is returned (after a Streamlit warning)
        when the computed canvas width or height is zero.

    Raises
    ------
    Exception
        Any error raised while opening, rendering or saving is reported via
        ``st.error`` and then re-raised unchanged. Errors whose type is one
        of PyMuPDF's own exception classes are reported as
        "Error processing PDF"; everything else as an unexpected error.
    """
    # Resolve PyMuPDF's exception classes by name instead of referencing
    # ``fitz.FitzError`` directly in the ``except`` clause. That expression is
    # only evaluated while another exception is already propagating, so a
    # missing attribute would replace the real error with an AttributeError.
    # NOTE(review): not every PyMuPDF release appears to export all of these
    # names - confirm against the installed version.
    pdf_error_types = tuple(
        exc_type
        for exc_type in (
            getattr(fitz, name, None)
            for name in ("FitzError", "FileDataError", "FileNotFoundError", "EmptyFileError")
        )
        if isinstance(exc_type, type) and issubclass(exc_type, BaseException)
    )

    pdf_document = None  # Initialize to ensure it's defined in finally block
    try:
        # Open the PDF document
        pdf_document = fitz.open(pdf_path)

        # Calculate the necessary dimensions for the final image
        canvas_width, canvas_height, zooms = calculate_image_dimensions(pdf_document, dpi)

        if canvas_width == 0 or canvas_height == 0:
            st.warning("Could not determine valid dimensions for the PDF. It might be empty or corrupted.")
            return io.BytesIO()  # Return empty buffer

        # Render pages onto the canvas
        result_image = render_pages_to_image(pdf_document, zooms, canvas_width, canvas_height)

        # Create an in-memory buffer to save the image
        img_buffer = io.BytesIO()

        # Normalise once so the format comparison is case-insensitive
        requested_format = output_format.upper()
        if requested_format in ("JPG", "JPEG"):
            # JPEG has no alpha channel; convert defensively before saving
            if result_image.mode == 'RGBA':
                result_image = result_image.convert('RGB')
            result_image.save(img_buffer, format="JPEG", quality=JPEG_QUALITY)
        else:
            if requested_format != "PNG":
                # Default to PNG if format is unknown
                st.warning(f"Unsupported format '{output_format}'. Defaulting to PNG.")
            result_image.save(img_buffer, format="PNG")

        # Reset buffer position to the beginning for reading
        img_buffer.seek(0)

        return img_buffer

    except pdf_error_types as e:
        st.error(f"Error processing PDF: {e}")
        raise  # Re-raise the specific exception
    except Exception as e:
        st.error(f"An unexpected error occurred during conversion: {e}")
        raise  # Re-raise general exceptions
    finally:
        # Compare against None rather than relying on truthiness: a document
        # object that defines __len__ is falsy when it has zero pages, which
        # would skip close() on exactly the empty-PDF path handled above.
        if pdf_document is not None:
            pdf_document.close()