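"""Invoice Extraction System.

A Gradio app that extracts invoice fields (invoice number, date, place, amount,
category) from uploaded images using the Qwen2-VL vision-language model, falling
back to Tesseract OCR when the model call fails.
"""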
import gradio as gr
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from PIL import Image
import torch
import pandas as pd
import pytesseract
import cv2

# Set Tesseract command (only works if Tesseract is already installed on the hosting server)
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'

# Initialize the model and processor from Hugging Face Hub
model_name = "Qwen/Qwen2-VL-2B-Instruct-AWQ"

model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name,
    torch_dtype="auto"
)
model.to("cpu")

processor = AutoProcessor.from_pretrained(model_name)

# Preprocessing image for OCR
def preprocess_image(image_path):
    image = cv2.imread(image_path)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
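    # Binarize so Tesseract sees high-contrast dark text on a white background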
    _, binary = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY)
    return binary

# OCR-based text extraction
def ocr_extract_text(image_path):
    preprocessed_image = preprocess_image(image_path)
    return pytesseract.image_to_string(preprocessed_image)

# Model-based image processing
def process_image(image_path):
    try:
        messages = [{
            "role": "user",
            "content": [
                {"type": "image", "image": image_path},
                {"type": "text", "text": (
                    "Extract the following details from the invoice:\n"
                    "- 'invoice_number'\n"
                    "- 'date'\n"
                    "- 'place'\n"
                    "- 'amount' (monetary value in the relevant currency)\n"
                    "- 'category' (based on the invoice type)"
                )}
            ]
        }]

        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        
        # Load the image and pass it to the processor together with the chat prompt
        image = Image.open(image_path).convert("RGB")
        inputs = processor(text=[text], images=[image], padding=True, return_tensors="pt")
        inputs = inputs.to(model.device)

        generated_ids = model.generate(**inputs, max_new_tokens=128)
        # Decode only the newly generated tokens, not the echoed prompt
        generated_ids = generated_ids[:, inputs["input_ids"].shape[1]:]
        output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)

        return parse_details(output_text[0])

    except Exception as e:
        print(f"Model failed, falling back to OCR: {e}")
        ocr_text = ocr_extract_text(image_path)
        return parse_details(ocr_text)

# Parsing details from text
def parse_details(details):
    parsed_data = {
        "Invoice Number": None,
        "Date": None,
        "Place": None,
        "Amount": None,
        "Category": None
    }

    lines = details.split("\n")
    for line in lines:
        lower_line = line.lower()
        if "invoice" in lower_line:
            parsed_data["Invoice Number"] = line.split(":")[-1].strip()
        elif "date" in lower_line:
            parsed_data["Date"] = line.split(":")[-1].strip()
        elif "place" in lower_line:
            parsed_data["Place"] = line.split(":")[-1].strip()
        elif any(keyword in lower_line for keyword in ["total", "amount", "cost"]):
            parsed_data["Amount"] = line.split(":")[-1].strip()
        elif "category" in lower_line:
            parsed_data["Category"] = line.split(":")[-1].strip()

    # Fall back to a generic category when none was found in the text
    if parsed_data["Category"] is None:
        parsed_data["Category"] = "General"

    return parsed_data
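# Hypothetical example: a response such as
#   "Invoice Number: INV-0042\nDate: 2024-03-01\nPlace: Berlin\nTotal: 199.99 EUR"
# is parsed into {"Invoice Number": "INV-0042", "Date": "2024-03-01",
#                 "Place": "Berlin", "Amount": "199.99 EUR", "Category": "General"}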

# Gradio Interface
def gradio_interface(image_files):
    results = []
    for image_file in image_files:
        # Depending on the Gradio version, uploads arrive as plain file paths or tempfile wrappers
        path = image_file if isinstance(image_file, str) else image_file.name
        details = process_image(path)
        results.append(details)

    df = pd.DataFrame(results)
    return df

# Launch Gradio App
demo = gr.Interface(
    fn=gradio_interface,
    inputs=gr.File(label="Upload Invoice Images", file_types=["image"], file_count="multiple"),
    outputs=gr.Dataframe(interactive=True),
    title="Invoice Extraction System"
)

if __name__ == "__main__":
    demo.launch(share=True)