# ui-convert2 / app.py
import os

# Set cache directories to /tmp, which should be writable, *before* importing
# transformers so the libraries pick them up at import time.
os.environ["TRANSFORMERS_CACHE"] = "/tmp/transformers_cache"
os.environ["HF_HOME"] = "/tmp/hf_home"
os.environ["HF_DATASETS_CACHE"] = "/tmp/hf_datasets_cache"

import gradio as gr
import torch
import cv2
import numpy as np
import json
import pytesseract
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
# Load Object Detection Pipeline
obj_detect = pipeline("object-detection", model="facebook/detr-resnet-50", device=-1)
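# Note: device=-1 pins the DETR pipeline to CPU. If the Space has a GPU, it could
# be selected instead (untested assumption about the runtime hardware), e.g.:
#   obj_detect = pipeline("object-detection", model="facebook/detr-resnet-50",
#                         device=0 if torch.cuda.is_available() else -1)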
# Load Qwen for Code Generation
MODEL_NAME = "Qwen/Qwen2.5-Coder-3B"
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if torch.cuda.is_available() else torch.float32
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME, torch_dtype=dtype, device_map="auto"
)
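# device_map="auto" places the weights on the GPU when one is available and falls
# back to CPU otherwise; float16 is used only on GPU to avoid slow half-precision
# math on CPU.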
# Process an uploaded screenshot: run object detection + OCR, then generate React code
def process_image(img):
    # Convert the PIL image to an OpenCV BGR array for pytesseract
    opencv_image = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
    img_height, img_width, _ = opencv_image.shape

    # Run object detection on the original PIL image
    detections = obj_detect(img)

    # Run OCR over the whole screenshot
    text_data = pytesseract.image_to_string(opencv_image)

    ui_json = {
        "id": "generated-ui",
        "name": "Generated UI",
        "components": [],
        "ocr_text": text_data.strip(),
    }
    for det in detections:
        # Each detection also carries a "box" dict with pixel coordinates,
        # which is currently not included in the metadata.
        ui_json["components"].append({
            "id": f"{det['label']}-{len(ui_json['components']) + 1}",
            "name": det["label"].capitalize(),
            "confidence": round(det["score"], 2),
        })
    metadata_str = json.dumps(ui_json, indent=2)

    # Generate React code from the metadata with Qwen
    prompt = f"Generate a React component from this metadata:\n{metadata_str}"
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        # max_new_tokens bounds the generated code length independently of the prompt
        output = model.generate(**inputs, max_new_tokens=1024)
    # Decode only the newly generated tokens (skip the echoed prompt)
    code_response = tokenizer.decode(
        output[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
    )

    return metadata_str, code_response
# Gradio Interface
interface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="pil"),
    outputs=["text", "text"],
    title="Screenshot → Metadata & React Code",
    description="Upload a UI screenshot and get structured metadata + React code.",
)
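# The two "text" outputs correspond to (metadata_str, code_response) returned by
# process_image. gr.JSON and gr.Code output components could render them more
# readably (assumption: they are available in the Gradio version used by this Space).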
# Run in Docker with 0.0.0.0 to allow external access
interface.launch(server_name="0.0.0.0", server_port=7860)