File size: 4,045 Bytes
925a7bd
 
 
 
 
373f022
 
925a7bd
 
 
 
 
 
 
 
 
 
 
373f022
 
925a7bd
 
 
 
 
 
 
 
 
 
 
 
 
373f022
925a7bd
 
 
 
 
 
 
373f022
925a7bd
 
 
 
 
 
 
 
373f022
925a7bd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import gradio as gr
from paddleocr import PaddleOCR
import numpy as np
import os
from langdetect import detect
from openai import OpenAI
from transformers import pipeline

# Initialize PaddleOCR
# English recognition model with angle classification so rotated text on
# scanned IDs is still read correctly. Model weights are downloaded on
# first use.
ocr_reader = PaddleOCR(use_angle_cls=True, lang='en')

# Initialize Whisper Model via Hugging Face Transformers
# device=0 pins the pipeline to the first GPU — assumes a CUDA device is
# available; TODO confirm, otherwise this raises at startup.
whisper_model = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-small",
    device=0
)

# Initialize OpenAI Client
# API key is read from the environment; getenv returns None when unset,
# in which case the first chat completion call will fail with an auth error.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def detect_language(text):
    """Detect the language of *text* with langdetect.

    Parameters:
        text: The string to analyze.

    Returns:
        A language code string (e.g. "en"), or "unknown" when detection
        fails — langdetect raises on empty or undetectable input.
    """
    try:
        return detect(text)
    except Exception:
        # Fix: was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt. `Exception` still covers langdetect's
        # LangDetectException without masking interpreter-level signals.
        return "unknown"

def gpt_clean_and_translate(text, target_language):
    """Structure noisy extracted text and translate it via GPT-4o.

    Parameters:
        text: Raw text from OCR or speech recognition.
        target_language: Language name to translate the output into.

    Returns:
        A single string: the cleaned, structured, translated text, or
        "No text detected." when *text* is empty or whitespace-only.
    """
    if not text.strip():
        # Fix: this path previously returned a 2-tuple ("No text
        # detected.", "") while the success path returns one string.
        # Both callers bind a single value, so the tuple leaked into the
        # UI. Return one string consistently.
        return "No text detected."

    prompt = f"""
You are an expert document reader and translator. You will receive noisy extracted text from a government ID. Your tasks:

1. Identify and extract these fields: Name, Address, Date of Birth, Expiry Date, Class, Sex.
2. Output the information in full English sentences.
3. Translate the full text into {target_language}.
If the target language is English, just output clean English sentences.
"""

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": text}
        ],
        # Low temperature keeps field extraction near-deterministic.
        temperature=0.2
    )

    return response.choices[0].message.content.strip()

def process_document(image, target_language, language_group):
    """Run OCR on a document image, detect its language, and translate it.

    Parameters:
        image: PIL image or ndarray of the document.
        target_language: Language name to translate into.
        language_group: UI dropdown value; currently unused by the
            pipeline but kept for interface compatibility.

    Returns:
        Tuple of (extracted_text, source_language, translation).
    """
    if not isinstance(image, np.ndarray):
        image = np.array(image)

    # OCR - Text Extraction using PaddleOCR
    ocr_result = ocr_reader.ocr(image)

    # Fix: PaddleOCR yields [None] (or an empty list) when no text is
    # recognized; iterating ocr_result[0] unguarded raised TypeError on
    # blank images. Each recognized line is [bbox, (text, confidence)].
    lines = ocr_result[0] if ocr_result and ocr_result[0] else []
    extracted_text = " ".join(line[1][0] for line in lines)

    # Language Detection
    source_language = detect_language(extracted_text)

    # GPT Cleaning and Translation
    translation = gpt_clean_and_translate(extracted_text, target_language)

    return extracted_text, source_language, translation

def process_audio(audio, target_language):
    """Transcribe recorded audio, detect its language, and translate it.

    Parameters:
        audio: Filepath to the recorded clip (Gradio `filepath` mode).
        target_language: Language name to translate into.

    Returns:
        Tuple of (transcribed_text, source_language, translation).
    """
    # Whisper transcription; the pipeline returns a dict with a 'text' key.
    transcript = whisper_model(audio)['text']

    # Identify the spoken language from the transcript.
    detected_lang = detect_language(transcript)

    # Clean up and translate via GPT.
    translated_output = gpt_clean_and_translate(transcript, target_language)

    return transcript, detected_lang, translated_output

# Gradio Interface
# Document tab: image upload -> OCR -> detect -> translate.
# Note: the "Document Language Group" dropdown is passed through to
# process_document but is not currently used by the pipeline.
document_interface = gr.Interface(
    fn=process_document,
    inputs=[
        gr.Image(type="pil", label="Upload a Document Image (e.g., Passport, ID, Government Form)"),
        gr.Radio(choices=["English", "Arabic"], label="Translate To"),
        gr.Dropdown(choices=["Arabic", "Russian", "Other (French, English)"], label="Document Language Group")
    ],
    outputs=[
        gr.Textbox(label="Extracted Text"),
        gr.Textbox(label="Detected Source Language"),
        gr.Textbox(label="Translated and Structured Text")
    ],
    title="🚨 Police Vision & Translator - Document Scanner",
    description="Upload an image document. The system will auto-detect the source language and generate clean translated output."
)

# Voice tab: recorded audio (filepath) -> Whisper ASR -> detect -> translate.
audio_interface = gr.Interface(
    fn=process_audio,
    inputs=[
        # type="filepath" hands process_audio a path string, which the
        # transformers ASR pipeline accepts directly.
        gr.Audio(type="filepath", label="Record Audio (max 30 sec)"),
        gr.Radio(choices=["English", "Arabic"], label="Translate To")
    ],
    outputs=[
        gr.Textbox(label="Transcribed Text"),
        gr.Textbox(label="Detected Source Language"),
        gr.Textbox(label="Translated and Structured Text")
    ],
    title="🚨 Police Vision & Translator - Voice Translator",
    description="Record audio. The system will auto-detect the source language and generate clean translated output."
)

# Combine the Interfaces
# Two-tab layout: tab titles align positionally with the interface list.
app = gr.TabbedInterface(
    [document_interface, audio_interface],
    ["Document Translator", "Voice Translator"]
)

if __name__ == "__main__":
    # share=True exposes a public Gradio tunnel URL in addition to the
    # local server — the app (and the API key behind it) becomes
    # reachable from the internet while running.
    app.launch(share=True)