"""Police Vision & Translator.

Gradio app with two tabs:
  1. Document Translator — OCR a government-ID image with PaddleOCR.
  2. Voice Translator  — transcribe audio with Whisper.
Both extracted texts are language-detected (langdetect) and then cleaned,
structured, and translated through the OpenAI chat API.
"""

import os

import gradio as gr
import numpy as np
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
from openai import OpenAI
from paddleocr import PaddleOCR
from transformers import pipeline

# Initialize PaddleOCR; angle classification helps with rotated ID photos.
ocr_reader = PaddleOCR(use_angle_cls=True, lang='en')

# Initialize the Whisper model via Hugging Face Transformers.
# NOTE(review): device=0 assumes a CUDA GPU is available — confirm the
# deployment target, otherwise this raises at startup.
whisper_model = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-small",
    device=0,
)

# OpenAI client; the API key must be supplied via the environment.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


def detect_language(text):
    """Return the language code langdetect guesses for *text*, or "unknown".

    langdetect raises LangDetectException for empty or featureless input;
    the original bare ``except:`` also swallowed KeyboardInterrupt/SystemExit,
    so we narrow to the library's own exception here.
    """
    try:
        return detect(text)
    except LangDetectException:
        return "unknown"


def gpt_clean_and_translate(text, target_language):
    """Clean noisy OCR/ASR text and translate it via GPT-4o.

    Parameters
    ----------
    text : str
        Raw extracted text (OCR or speech transcript).
    target_language : str
        Language to translate the cleaned output into.

    Returns
    -------
    str
        The cleaned/translated text, or a placeholder when *text* is blank.
        (Bug fix: the original returned a 2-tuple ``("No text detected.", "")``
        on the empty branch but a plain string otherwise; callers display the
        result as a single string.)
    """
    if not text.strip():
        return "No text detected."

    prompt = f"""
You are an expert document reader and translator. You will receive noisy extracted text from a government ID. Your tasks:
1. Identify and extract these fields: Name, Address, Date of Birth, Expiry Date, Class, Sex.
2. Output the information in full English sentences.
3. Translate the full text into {target_language}. If the target language is English, just output clean English sentences.
"""
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": text},
        ],
        temperature=0.2,
    )
    return response.choices[0].message.content.strip()


def process_document(image, target_language, language_group):
    """OCR *image*, detect its source language, and produce a GPT translation.

    *language_group* is accepted for interface compatibility with the Gradio
    inputs but is not currently used by the pipeline.

    Returns (extracted_text, source_language, translation).
    """
    if not isinstance(image, np.ndarray):
        image = np.array(image)

    # Text extraction with PaddleOCR. PaddleOCR returns [None] when no text
    # is found — guard before iterating to avoid a TypeError.
    ocr_result = ocr_reader.ocr(image)
    ocr_lines = ocr_result[0] if ocr_result and ocr_result[0] else []
    extracted_text = " ".join(line[1][0] for line in ocr_lines)

    source_language = detect_language(extracted_text)
    translation = gpt_clean_and_translate(extracted_text, target_language)
    return extracted_text, source_language, translation


def process_audio(audio, target_language):
    """Transcribe *audio* with Whisper, detect its language, and translate.

    Returns (transcribed_text, source_language, translation).
    """
    result = whisper_model(audio)
    extracted_text = result['text']

    source_language = detect_language(extracted_text)
    translation = gpt_clean_and_translate(extracted_text, target_language)
    return extracted_text, source_language, translation


# --- Gradio UI ---------------------------------------------------------------

document_interface = gr.Interface(
    fn=process_document,
    inputs=[
        gr.Image(type="pil", label="Upload a Document Image (e.g., Passport, ID, Government Form)"),
        gr.Radio(choices=["English", "Arabic"], label="Translate To"),
        gr.Dropdown(choices=["Arabic", "Russian", "Other (French, English)"], label="Document Language Group"),
    ],
    outputs=[
        gr.Textbox(label="Extracted Text"),
        gr.Textbox(label="Detected Source Language"),
        gr.Textbox(label="Translated and Structured Text"),
    ],
    title="🚨 Police Vision & Translator - Document Scanner",
    description="Upload an image document. The system will auto-detect the source language and generate clean translated output.",
)

audio_interface = gr.Interface(
    fn=process_audio,
    inputs=[
        gr.Audio(type="filepath", label="Record Audio (max 30 sec)"),
        gr.Radio(choices=["English", "Arabic"], label="Translate To"),
    ],
    outputs=[
        gr.Textbox(label="Transcribed Text"),
        gr.Textbox(label="Detected Source Language"),
        gr.Textbox(label="Translated and Structured Text"),
    ],
    title="🚨 Police Vision & Translator - Voice Translator",
    description="Record audio. The system will auto-detect the source language and generate clean translated output.",
)

# Combine the two tools into one tabbed app.
app = gr.TabbedInterface(
    [document_interface, audio_interface],
    ["Document Translator", "Voice Translator"],
)

if __name__ == "__main__":
    app.launch(share=True)