Spaces:
Running
on
L4
Running
on
L4
File size: 4,045 Bytes
925a7bd 373f022 925a7bd 373f022 925a7bd 373f022 925a7bd 373f022 925a7bd 373f022 925a7bd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 |
import gradio as gr
from paddleocr import PaddleOCR
import numpy as np
import os
from langdetect import detect
from openai import OpenAI
from transformers import pipeline
# Initialize PaddleOCR for text extraction.
# use_angle_cls=True enables the text-angle classifier so rotated lines are
# still recognized; lang='en' selects the English recognition model.
ocr_reader = PaddleOCR(use_angle_cls=True, lang='en')

# Initialize Whisper Model via Hugging Face Transformers.
# device=0 pins the ASR pipeline to the first GPU — assumes a CUDA device is
# present (this Space advertises an L4); TODO confirm before any CPU deploy.
whisper_model = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-small",
    device=0
)

# Initialize OpenAI Client; the key is read from the OPENAI_API_KEY env var.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
def detect_language(text):
    """Best-effort language detection for *text*.

    Returns the language code reported by ``langdetect.detect`` (e.g.
    ``"en"``), or ``"unknown"`` when detection fails — langdetect raises
    on empty or non-linguistic input.
    """
    try:
        return detect(text)
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are no longer swallowed; detection failure is still best-effort.
        return "unknown"
def gpt_clean_and_translate(text, target_language):
    """Clean noisy extracted text and translate it using GPT-4o.

    Parameters
    ----------
    text : str
        Raw text from OCR or speech recognition (may be noisy).
    target_language : str
        Language to translate into (e.g. "English", "Arabic").

    Returns
    -------
    str
        The cleaned, structured, translated text, or ``"No text detected."``
        when *text* is empty or whitespace-only.
    """
    if not text.strip():
        # Bug fix: this branch previously returned a 2-tuple
        # ("No text detected.", "") while the normal path returned a single
        # string, so callers binding one value leaked a tuple into the UI.
        return "No text detected."
    prompt = f"""
You are an expert document reader and translator. You will receive noisy extracted text from a government ID. Your tasks:
1. Identify and extract these fields: Name, Address, Date of Birth, Expiry Date, Class, Sex.
2. Output the information in full English sentences.
3. Translate the full text into {target_language}.
If the target language is English, just output clean English sentences.
"""
    # Low temperature keeps the extraction/translation deterministic-ish.
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": text}
        ],
        temperature=0.2
    )
    return response.choices[0].message.content.strip()
def process_document(image, target_language, language_group):
    """OCR a document image, detect its language, and translate it.

    Parameters
    ----------
    image : PIL.Image.Image or numpy.ndarray
        Uploaded document image (Gradio passes a PIL image).
    target_language : str
        Language to translate into.
    language_group : str
        Selected document language group from the UI; currently unused by
        the pipeline but kept so the Gradio input wiring stays intact.

    Returns
    -------
    tuple[str, str, str]
        (extracted text, detected source language, translated text).
    """
    if not isinstance(image, np.ndarray):
        image = np.array(image)
    # OCR - Text Extraction using PaddleOCR
    ocr_result = ocr_reader.ocr(image)
    # Robustness fix: PaddleOCR yields an empty result (first element None)
    # when no text is found, which previously crashed the `for` loop.
    lines = ocr_result[0] if ocr_result and ocr_result[0] else []
    extracted_text = " ".join(line[1][0] for line in lines)
    # Language Detection
    source_language = detect_language(extracted_text)
    # GPT Cleaning and Translation
    translation = gpt_clean_and_translate(extracted_text, target_language)
    return extracted_text, source_language, translation
def process_audio(audio, target_language):
    """Transcribe recorded audio, detect its language, and translate it.

    Returns a (transcribed text, detected language, translation) triple
    for the three Gradio output textboxes.
    """
    # Speech Recognition via the Whisper pipeline
    spoken_text = whisper_model(audio)['text']
    # Best-effort source-language detection on the transcript
    detected_language = detect_language(spoken_text)
    # Clean up and translate the transcript with GPT
    structured_output = gpt_clean_and_translate(spoken_text, target_language)
    return spoken_text, detected_language, structured_output
# Gradio Interface: document-scanner tab.
document_interface = gr.Interface(
    fn=process_document,
    title="π¨ Police Vision & Translator - Document Scanner",
    description="Upload an image document. The system will auto-detect the source language and generate clean translated output.",
    inputs=[
        gr.Image(type="pil", label="Upload a Document Image (e.g., Passport, ID, Government Form)"),
        gr.Radio(choices=["English", "Arabic"], label="Translate To"),
        gr.Dropdown(choices=["Arabic", "Russian", "Other (French, English)"], label="Document Language Group"),
    ],
    outputs=[
        gr.Textbox(label="Extracted Text"),
        gr.Textbox(label="Detected Source Language"),
        gr.Textbox(label="Translated and Structured Text"),
    ],
)
# Voice-translator tab: record audio, transcribe, translate.
audio_interface = gr.Interface(
    fn=process_audio,
    title="π¨ Police Vision & Translator - Voice Translator",
    description="Record audio. The system will auto-detect the source language and generate clean translated output.",
    inputs=[
        gr.Audio(type="filepath", label="Record Audio (max 30 sec)"),
        gr.Radio(choices=["English", "Arabic"], label="Translate To"),
    ],
    outputs=[
        gr.Textbox(label="Transcribed Text"),
        gr.Textbox(label="Detected Source Language"),
        gr.Textbox(label="Translated and Structured Text"),
    ],
)
# Combine the two tabs into a single tabbed app.
app = gr.TabbedInterface(
    [document_interface, audio_interface],
    ["Document Translator", "Voice Translator"],
)

if __name__ == "__main__":
    # share=True exposes a public Gradio link in addition to the local server.
    app.launch(share=True)
|