# Spaces: Didier / Optical_character_recognition (status: Running)
# File size: 2,989 Bytes
"""
File: ocr2.py

Description: Optical Character Recognition (OCR) using software 2.0 models

Author: Didier Guillevic
Date: 2025-04-07
"""

import base64
import mimetypes
import os

from mistralai import Mistral

#
# MistralAI client
#
# Read the API key from the environment; the subscript lookup raises
# KeyError at import time when MISTRAL_API_KEY is not set.
api_key = os.environ["MISTRAL_API_KEY"]
# Module-level client, used by process_pdf() and process_image() below.
client = Mistral(api_key=api_key)


#
# Process PDF file
#
def process_pdf(pdf_path: str) -> str:
    """Run Mistral OCR on a local PDF file and return the extracted markdown.

    The file is uploaded to the Mistral API with purpose ``"ocr"``, a signed
    URL is requested for the upload, and that URL is passed to the
    ``mistral-ocr-latest`` model.

    Args:
        pdf_path: Path to a local PDF file.

    Returns:
        str: The markdown of every page in the OCR response, in response
            order, separated by blank lines. An empty string when the
            response contains no pages.

    Raises:
        OSError: If ``pdf_path`` cannot be opened for reading.

    Note:
        The upload / signed URL / OCR sequence follows the Mistral API
        documentation:
            https://docs.mistral.ai/capabilities/document/
    """
    # Use a context manager so the file handle is closed as soon as the
    # upload call returns, instead of being left open.
    with open(pdf_path, "rb") as pdf_file:
        uploaded_pdf = client.files.upload(
            file={"file_name": pdf_path, "content": pdf_file},
            purpose="ocr",
        )
    signed_url = client.files.get_signed_url(file_id=uploaded_pdf.id)

    ocr_response = client.ocr.process(
        model="mistral-ocr-latest",
        document={"type": "document_url", "document_url": signed_url.url},
    )

    # Concatenate all pages: returning only pages[0] silently dropped the
    # rest of a multi-page document and raised IndexError on an empty list.
    return "\n\n".join(page.markdown for page in ocr_response.pages)


#
# Process image file
#
def process_image(image_path: str):
    """Extract the information present in a local image file.

    The image is base64-encoded, embedded in a data URL and sent together
    with a short instruction to the ``mistral-small-latest`` chat model.

    Args:
        image_path: Path to a local image file.

    Returns:
        str: The OCR result as a string (the content of the first choice
            of the chat completion).

    Raises:
        ValueError: If the image file could not be read and encoded.

    Note:
        Although it should "work", when I process an image file with
        Mistral_OCR, I get an empty result. Everything appears fine, but no
        text is extracted. Hence, I send the image to a model such as
        Mistral_Small (or Mistral_Large) to extract the text present in the
        image.
    """
    # encode_image() reports failures by returning None; stop here rather
    # than sending the literal text "None" to the API as image data.
    image_base64 = encode_image(image_path)
    if image_base64 is None:
        raise ValueError(f"Could not read image file: {image_path}")

    # Label the data URL with the type implied by the file extension
    # (png, webp, ...); keep the previous "image/jpeg" when it is unknown
    # or not an image type.
    mime_type, _ = mimetypes.guess_type(image_path)
    if mime_type is None or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"

    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": (
                        "Could you extract the information present in the image. "
                        "No need to repeat the task description. Simply respond."
                    )
                },
                {
                    "type": "image_url",
                    "image_url": f"data:{mime_type};base64,{image_base64}"
                }
            ]
        }
    ]

    response = client.chat.complete(
        model='mistral-small-latest',
        messages=messages
    )
    return response.choices[0].message.content


#
# Encode images as base64
#
def encode_image(image_path):
    """Return the content of ``image_path`` as a base64 text string.

    Args:
        image_path: Path to the file to encode.

    Returns:
        The base64 encoding of the file's bytes, decoded as UTF-8 text, or
        ``None`` when the file is missing or any other error occurs (an
        error message is printed in that case).
    """
    encoded = None  # stays None unless reading and encoding both succeed
    try:
        with open(image_path, "rb") as handle:
            raw_bytes = handle.read()
        encoded = base64.b64encode(raw_bytes).decode('utf-8')
    except FileNotFoundError:
        print(f"Error: The file {image_path} was not found.")
    except Exception as err:  # best effort: report any other failure
        print(f"Error: {err}")
    return encoded