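"""Police Vision & Translator: a two-tab Gradio app.

Document Translator: extracts text from a scanned ID or form with PaddleOCR,
detects the source language, then cleans and translates it with GPT-4o.
Voice Translator: transcribes speech with Whisper, then runs the same
detection and translation pipeline.

Expects the OPENAI_API_KEY environment variable (e.g., a Space secret).
"""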
import os

import gradio as gr
import numpy as np
import torch
from langdetect import detect
from openai import OpenAI
from paddleocr import PaddleOCR
from transformers import pipeline
# Initialize PaddleOCR (angle classification on; lang='en' reads English/Latin
# text, so Arabic or Cyrillic documents would need additional language models)
ocr_reader = PaddleOCR(use_angle_cls=True, lang='en')
# Initialize Whisper model via Hugging Face Transformers
whisper_model = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-small",
    device=0 if torch.cuda.is_available() else -1,  # GPU if available, else CPU
)
# Initialize OpenAI Client
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
def detect_language(text):
    try:
        lang = detect(text)
    except Exception:  # langdetect raises on empty or ambiguous input
        lang = "unknown"
    return lang
def gpt_clean_and_translate(text, target_language):
    if not text.strip():
        # Return a single string so callers can treat all results uniformly
        return "No text detected."

    prompt = f"""
You are an expert document reader and translator. You will receive noisy extracted text from a government ID. Your tasks:
1. Identify and extract these fields: Name, Address, Date of Birth, Expiry Date, Class, Sex.
2. Output the information in full English sentences.
3. Translate the full text into {target_language}.
If the target language is English, just output clean English sentences.
"""
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": text},
        ],
        temperature=0.2,
    )
    cleaned_translation = response.choices[0].message.content.strip()
    return cleaned_translation
def process_document(image, target_language, language_group):
    # language_group is collected in the UI but not used in processing yet
    if image is None:
        return "No image provided.", "unknown", ""
    if not isinstance(image, np.ndarray):
        image = np.array(image)

    # OCR - Text Extraction using PaddleOCR
    ocr_result = ocr_reader.ocr(image)
    extracted_texts = []
    # PaddleOCR returns [None] when it finds no text, so guard before iterating
    if ocr_result and ocr_result[0]:
        for line in ocr_result[0]:
            text = line[1][0]
            extracted_texts.append(text)
    extracted_text = " ".join(extracted_texts)

    # Language Detection
    source_language = detect_language(extracted_text)

    # GPT Cleaning and Translation
    translation = gpt_clean_and_translate(extracted_text, target_language)

    return extracted_text, source_language, translation
def process_audio(audio, target_language):
    if audio is None:
        return "No audio recorded.", "unknown", ""

    # Speech Recognition
    result = whisper_model(audio)
    extracted_text = result["text"]

    # Language Detection
    source_language = detect_language(extracted_text)

    # GPT Cleaning and Translation
    translation = gpt_clean_and_translate(extracted_text, target_language)

    return extracted_text, source_language, translation
# Gradio Interface
document_interface = gr.Interface(
    fn=process_document,
    inputs=[
        gr.Image(type="pil", label="Upload a Document Image (e.g., Passport, ID, Government Form)"),
        gr.Radio(choices=["English", "Arabic"], label="Translate To"),
        gr.Dropdown(choices=["Arabic", "Russian", "Other (French, English)"], label="Document Language Group"),
    ],
    outputs=[
        gr.Textbox(label="Extracted Text"),
        gr.Textbox(label="Detected Source Language"),
        gr.Textbox(label="Translated and Structured Text"),
    ],
    title="🚨 Police Vision & Translator - Document Scanner",
    description="Upload an image document. The system will auto-detect the source language and generate clean translated output.",
)
audio_interface = gr.Interface(
    fn=process_audio,
    inputs=[
        gr.Audio(type="filepath", label="Record Audio (max 30 sec)"),
        gr.Radio(choices=["English", "Arabic"], label="Translate To"),
    ],
    outputs=[
        gr.Textbox(label="Transcribed Text"),
        gr.Textbox(label="Detected Source Language"),
        gr.Textbox(label="Translated and Structured Text"),
    ],
    title="🚨 Police Vision & Translator - Voice Translator",
    description="Record audio. The system will auto-detect the source language and generate clean translated output.",
)
# Combine the Interfaces
app = gr.TabbedInterface(
    [document_interface, audio_interface],
    ["Document Translator", "Voice Translator"],
)
if __name__ == "__main__":
    app.launch(share=True)  # share=True only matters when running locally; Spaces ignores it