import os

# Cache Hugging Face downloads inside the project and hide every GPU so that
# both models run on CPU. These must be set before importing transformers.
os.environ["TRANSFORMERS_CACHE"] = "./.cache"
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

import gc

import cv2
import gradio as gr
import numpy as np
import torch
import torchvision.transforms as T
from PIL import Image, ImageEnhance
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
from ultralytics import YOLO

from prompts import front as front_prompt, back as back_prompt

path = "OpenGVLab/InternVL2_5-2B" |
|
cache_folder = "./.cache" |
|
|
|
|
|
model = AutoModel.from_pretrained( |
|
path, |
|
cache_dir=cache_folder, |
|
torch_dtype=torch.float32, |
|
trust_remote_code=True |
|
).eval().to("cpu") |
|
|
|
tokenizer = AutoTokenizer.from_pretrained( |
|
path, |
|
cache_dir=cache_folder, |
|
trust_remote_code=True, |
|
use_fast=False |
|
) |
# YOLO detector fine-tuned to locate the front and back faces of the card.
model_path = "best.pt"
modelY = YOLO(model_path)
modelY.to("cpu")
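
# Hypothetical smoke test (assumes a local "sample_front.jpg" exists);
# uncomment to verify the weights load and yield "front"/"back" detections:
#
#   results = modelY("sample_front.jpg", conf=0.85, device="cpu")
#   print({modelY.names[int(b.cls[0])] for r in results for b in r.boxes})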

def preprocessing(image):
    """Apply enhancement filters and resize to a fixed 448px width."""
    image = Image.fromarray(np.array(image))  # accept PIL or numpy input
    image = ImageEnhance.Sharpness(image).enhance(2.0)
    image = ImageEnhance.Contrast(image).enhance(1.5)
    image = ImageEnhance.Brightness(image).enhance(0.8)

    # Resize to a fixed width while preserving the aspect ratio.
    width = 448
    aspect_ratio = image.height / image.width
    height = int(width * aspect_ratio)
    image = image.resize((width, height))
    return image

def imageRotation(image):
    """Rotate portrait images into landscape orientation."""
    if image.height > image.width:
        return image.rotate(90, expand=True)  # expand=True keeps the full frame
    return image

def detect_document(image):
    """Detect the front/back of the document with YOLO and annotate it."""
    image_np = np.array(image)
    results = modelY(image_np, conf=0.85, device="cpu")

    detected_classes = set()
    labels = []
    bounding_boxes = []

    for result in results:
        for box in result.boxes:
            x1, y1, x2, y2 = map(int, box.xyxy[0])
            conf = float(box.conf[0])
            cls = int(box.cls[0])
            class_name = modelY.names[cls]

            detected_classes.add(class_name)
            label = f"{class_name} {conf:.2f}"
            labels.append(label)
            bounding_boxes.append((x1, y1, x2, y2, class_name, conf))

            # Draw the box and its label onto the annotated copy.
            cv2.rectangle(image_np, (x1, y1), (x2, y2), (0, 255, 0), 2)
            cv2.putText(image_np, label, (x1, y1 - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

    # Report any side (front/back) the detector failed to find.
    possible_classes = {"front", "back"}
    missing_classes = possible_classes - detected_classes
    if missing_classes:
        labels.append(f"Missing: {', '.join(missing_classes)}")

    return Image.fromarray(image_np), labels, bounding_boxes

def crop_image(image, bounding_boxes):
    """Crop each detected bounding box; the last box per class wins."""
    cropped_images = {}
    image_np = np.array(image)
    for (x1, y1, x2, y2, class_name, conf) in bounding_boxes:
        cropped = image_np[y1:y2, x1:x2]
        cropped_images[class_name] = Image.fromarray(cropped)
    return cropped_images

# ImageNet statistics used by InternVL's vision encoder.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)


def build_transform(input_size):
    """RGB-convert, resize, and normalise an image for the vision encoder."""
    transform = T.Compose([
        T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ])
    return transform

def load_image(image_file):
    """Turn a PIL image into a (1, 3, 448, 448) pixel tensor."""
    transform = build_transform(input_size=448)
    pixel_values = transform(image_file).unsqueeze(0)
    return pixel_values
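
# Note: InternVL's reference pipeline dynamically tiles large images into
# several 448x448 crops; a single resized tile is used here to keep CPU
# inference cheap, at some cost to fine-text legibility.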

def vision_ai_api(image, doc_type):
    """Query InternVL with a prompt chosen from the detected document side."""
    pixel_values = load_image(image).to(torch.float32).to("cpu")
    # do_sample=True makes the extraction non-deterministic across runs.
    generation_config = dict(max_new_tokens=512, do_sample=True)

    if doc_type == "front":
        question = front_prompt
    elif doc_type == "back":
        question = back_prompt
    else:
        question = "Please provide document details."

    print("Before requesting model...")
    response = model.chat(tokenizer, pixel_values, question, generation_config)
    print("After requesting model...", response)

    # Release the input tensor; the CUDA call is a no-op on this CPU-only
    # setup but is kept (guarded) in case a GPU is re-enabled later.
    del pixel_values
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return f"Assistant: {response}"

def predict(image):
    """Pipeline: preprocess → rotate → detect → crop → vision model call."""
    processed_image = preprocessing(image)
    rotated_image = imageRotation(processed_image)
    detected_image, labels, bounding_boxes = detect_document(rotated_image)
    cropped_images = crop_image(rotated_image, bounding_boxes)

    front_result, back_result = None, None
    if "front" in cropped_images:
        front_result = vision_ai_api(cropped_images["front"], "front")
    if "back" in cropped_images:
        back_result = vision_ai_api(cropped_images["back"], "back")

    api_results = {"front": front_result, "back": back_result}
    # Prefer a cropped face for display; fall back to the annotated image.
    single_image = cropped_images.get("front") or cropped_images.get("back") or detected_image
    return single_image, labels, api_results
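
# Direct-call sketch for quick testing without the UI (assumes a local file
# named "licence_scan.jpg" exists):
#
#   img, labels, results = predict(Image.open("licence_scan.jpg"))
#   print(labels, results)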

iface = gr.Interface(
    fn=predict,
    inputs="image",
    outputs=["image", "text", "json"],
    title="License Field Detection (Front & Back Card)",
)

iface.launch()