import gradio as gr
from paddleocr import PaddleOCR
import numpy as np
import os
from langdetect import detect
from openai import OpenAI
from transformers import pipeline
# Initialize PaddleOCR
ocr_reader = PaddleOCR(use_angle_cls=True, lang='en')
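# Result shape (assuming the classic PaddleOCR 2.x API; newer releases differ):
# ocr() returns one list per input image, where each entry is
# [box_points, (text, confidence)], e.g. [box, ("JOHN DOE", 0.98)].
# process_document() below keeps only the recognized text strings.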
# Initialize Whisper Model via Hugging Face Transformers
whisper_model = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-small",
    device=0
)
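# device=0 targets the first CUDA GPU (this Space runs on an L4). On a
# CPU-only machine, device=-1 would be needed instead.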
# Initialize OpenAI Client
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
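# OPENAI_API_KEY is read from the environment; on Hugging Face Spaces it is
# typically stored as a repository secret rather than committed to the repo.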
def detect_language(text):
    try:
        lang = detect(text)
    except Exception:  # langdetect raises on empty or undecidable input
        lang = "unknown"
    return lang
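# langdetect is non-deterministic on short or mixed-script strings. If
# reproducible results matter, seeding the detector pins the outcome
# (optional, not done in this app):
#   from langdetect import DetectorFactory
#   DetectorFactory.seed = 0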
def gpt_clean_and_translate(text, target_language):
    if not text.strip():
        return "No text detected."
    prompt = f"""
You are an expert document reader and translator. You will receive noisy extracted text from a government ID. Your tasks:
1. Identify and extract these fields: Name, Address, Date of Birth, Expiry Date, Class, Sex.
2. Output the information in full English sentences.
3. Translate the full text into {target_language}.
If the target language is English, just output clean English sentences.
"""
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": text}
        ],
        temperature=0.2  # low temperature keeps field extraction consistent
    )
    cleaned_translation = response.choices[0].message.content.strip()
    return cleaned_translation
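# Illustrative call (values are made up, not from the app):
#   gpt_clean_and_translate("NAME JOHN DOE DOB 01 JAN 1990 CLASS C", "Arabic")
# would return the fields restated as English sentences, followed by an
# Arabic translation of the full text.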
def process_document(image, target_language, language_group):
    # language_group is collected by the UI but not used here: the OCR model
    # above is loaded for English recognition only.
    if not isinstance(image, np.ndarray):
        image = np.array(image)
    # OCR - Text Extraction using PaddleOCR
    ocr_result = ocr_reader.ocr(image)
    extracted_texts = []
    for line in ocr_result[0] or []:  # guard: PaddleOCR yields [None] when nothing is detected
        text = line[1][0]  # each line is [box, (text, confidence)]
        extracted_texts.append(text)
    extracted_text = " ".join(extracted_texts)
    # Language Detection
    source_language = detect_language(extracted_text)
    # GPT Cleaning and Translation
    translation = gpt_clean_and_translate(extracted_text, target_language)
    return extracted_text, source_language, translation
def process_audio(audio, target_language):
    # Speech Recognition
    result = whisper_model(audio)
    extracted_text = result["text"]
    # Language Detection
    source_language = detect_language(extracted_text)
    # GPT Cleaning and Translation
    translation = gpt_clean_and_translate(extracted_text, target_language)
    return extracted_text, source_language, translation
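# whisper-small processes roughly 30 s of audio per forward pass; for longer
# recordings the pipeline would need chunking, e.g.
# pipeline(..., chunk_length_s=30). The UI below asks users to keep
# recordings under 30 s, so the default call is enough here.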
# Gradio Interface
document_interface = gr.Interface(
    fn=process_document,
    inputs=[
        gr.Image(type="pil", label="Upload a Document Image (e.g., Passport, ID, Government Form)"),
        gr.Radio(choices=["English", "Arabic"], label="Translate To"),
        gr.Dropdown(choices=["Arabic", "Russian", "Other (French, English)"], label="Document Language Group")
    ],
    outputs=[
        gr.Textbox(label="Extracted Text"),
        gr.Textbox(label="Detected Source Language"),
        gr.Textbox(label="Translated and Structured Text")
    ],
    title="Police Vision & Translator - Document Scanner",
    description="Upload an image document. The system will auto-detect the source language and generate clean translated output."
)
audio_interface = gr.Interface(
    fn=process_audio,
    inputs=[
        gr.Audio(type="filepath", label="Record Audio (max 30 sec)"),
        gr.Radio(choices=["English", "Arabic"], label="Translate To")
    ],
    outputs=[
        gr.Textbox(label="Transcribed Text"),
        gr.Textbox(label="Detected Source Language"),
        gr.Textbox(label="Translated and Structured Text")
    ],
    title="Police Vision & Translator - Voice Translator",
    description="Record audio. The system will auto-detect the source language and generate clean translated output."
)
# Combine the Interfaces
app = gr.TabbedInterface(
    [document_interface, audio_interface],
    ["Document Translator", "Voice Translator"]
)

if __name__ == "__main__":
    app.launch(share=True)
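# Suggested local setup (an assumption; the Space's actual requirements.txt
# may differ, and the filename app.py is the Spaces convention):
#   pip install gradio paddleocr paddlepaddle langdetect openai transformers torch
#   export OPENAI_API_KEY=sk-...
#   python app.py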