Spaces:

gizemsarsinlar
/

Art_Analysis_with_Phi-4

Running on Zero

File size: 3,955 Bytes

dea2ee7
 
 
2cf5592
dea2ee7
 
 
2cf5592
dea2ee7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2cf5592
dea2ee7
 
2cf5592
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dea2ee7
 
 
 
 
 
 
 
 
 
 
 
2cf5592
dea2ee7
 
2cf5592
dea2ee7
 
2cf5592
 
 
 
 
 
 
 
dea2ee7
 
2cf5592
dea2ee7
 
2cf5592
 
 
 
 
 
 
 
 
 
 
dea2ee7
 
2cf5592
dea2ee7
2cf5592
dea2ee7
 
 
 
2cf5592
 
dea2ee7

import gradio as gr
from PIL import Image
import torch
import soundfile as sf  # Ses işleme için
from transformers import AutoModelForCausalLM, AutoProcessor
import spaces

# Load the model and its processor once, at module import time, so every
# request handled by process_input reuses the same objects.
model_path = "microsoft/Phi-4-multimodal-instruct"
# trust_remote_code=True lets the loading code shipped in the model
# repository run locally; only acceptable because the repository is trusted.
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    torch_dtype="auto",
    trust_remote_code=True,
    # NOTE(review): presumably chosen to avoid a flash-attention dependency
    # on the hosting hardware -- confirm before changing.
    _attn_implementation="eager",
)

# Chat markers that process_input wraps around each instruction:
# <user marker> instruction <end marker><assistant marker>
user_prompt = "<|user|>"
assistant_prompt = "<|assistant|>"
prompt_suffix = "<|end|>"

@spaces.GPU
def process_input(input_type, file):
    """Run the multimodal model on an uploaded file and return its text reply.

    Args:
        input_type: ``"Image"`` to request an art-historical analysis of the
            upload, or ``"Audio"`` to request a transcription of it.
        file: The uploaded file as handed over by the Gradio file widget
            (anything ``PIL.Image.open`` / ``soundfile.read`` accept).
            Falsy when nothing was uploaded.

    Returns:
        The decoded model response. When the upload is missing, cannot be
        decoded as the selected type, or ``input_type`` is unknown, a short
        user-facing message is returned instead of raising.
    """
    if not file:
        # Tailor the hint to the selected mode; the image wording used to be
        # shown for audio requests as well.
        if input_type == "Audio":
            return "Please upload an audio file to transcribe."
        return "Please upload an image of an artwork."

    if input_type == "Image":
        prompt = (
            f"{user_prompt} You are an expert art historian and critic. Analyze the given artwork with these aspects:\n\n"
            "1. **General Description**: Describe the colors, composition, and subject.\n"
            "2. **Artistic Style**: Identify the artistic movement.\n"
            "3. **Historical Context**: Discuss the period and influences.\n"
            "4. **Symbolism & Meaning**: Interpret the messages conveyed.\n"
            "5. **Technical Analysis**: Examine brushwork, lighting, and composition.\n"
            "6. **Impact & Significance**: Explain its relevance in art history.\n\n"
            "Here is the artwork for analysis:\n"
            "<|image_1|>\n"
            f"{prompt_suffix}{assistant_prompt}"
        )
        try:
            # convert() fully decodes the pixels, so the file handle can be
            # released immediately; RGB also normalises palette, alpha and
            # greyscale uploads to three channels.
            with Image.open(file) as uploaded:
                image = uploaded.convert("RGB")
        except OSError:
            # Covers PIL.UnidentifiedImageError, e.g. an audio file uploaded
            # while "Image" is selected.
            return "The uploaded file could not be read as an image."
        inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)

    elif input_type == "Audio":
        prompt = (
            f"{user_prompt} Please transcribe the given audio into text accurately.\n\n"
            "<|audio_1|>\n"
            f"{prompt_suffix}{assistant_prompt}"
        )
        try:
            audio, samplerate = sf.read(file)
        except (OSError, RuntimeError):
            # soundfile reports unreadable or unsupported files through
            # RuntimeError subclasses.
            return "The uploaded file could not be read as audio."
        inputs = processor(text=prompt, audios=[(audio, samplerate)], return_tensors="pt").to(model.device)

    else:
        return "Geçersiz giriş türü seçildi."

    with torch.no_grad():
        generate_ids = model.generate(
            **inputs,
            max_new_tokens=1000,
            num_logits_to_keep=0,
            # temperature/top_k only take effect when sampling is enabled;
            # without do_sample=True they are ignored and decoding is greedy.
            do_sample=True,
            temperature=0.7,
            top_k=50,
        )

    # generate() returns prompt + continuation; keep only the new tokens.
    prompt_length = inputs["input_ids"].shape[1]
    generate_ids = generate_ids[:, prompt_length:]
    response = processor.batch_decode(
        generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]

    return response

# Build the web UI: an intro banner, then a two-column row with the inputs on
# the left and the model's reply on the right.
with gr.Blocks(title="Art & Audio Analysis with Phi-4") as demo:
    # Introductory banner (Markdown rendered at the top of the page).
    gr.Markdown(
        """
        # 🎨🗣️ Multimodal Art Analysis with Phi-4  
        - **Art Analysis**: Upload a piece of art, AI will perform a detailed analysis.
        - **Audio Transcription**: Upload your audio file, AI will convert it to text.
        
        🚀 Powered by Microsoft's `Phi-4-multimodal-instruct` model.

        With this project, you can both analyze the uploaded image and convert the guide's verbal explanation into text while visiting any exhibition or museum.
        
        """
    )

    with gr.Row():
        # Left column: mode selector, upload widget and submit button.
        with gr.Column(scale=1):
            # The selected choice is passed to process_input as `input_type`;
            # the strings must match the values it compares against.
            input_type = gr.Radio(
                choices=["Image", "Audio"],
                label="Select Input Type",
                value="Image",
            )
            # One upload widget serves both modes, so both file categories
            # are allowed here.
            file_input = gr.File(
                label="Upload File",
                file_types=["image", "audio"],
            )
            submit_btn = gr.Button("Analyze", variant="primary")

        # Right column (twice the width): read-only box for the reply.
        with gr.Column(scale=2):
            output_text = gr.Textbox(
                label="AI Response",
                placeholder="The AI's response will appear here...",
                lines=12,
                interactive=False,
            )

    # Clicking the button calls process_input(input_type, file_input) and
    # writes the returned string into the output textbox.
    submit_btn.click(
        fn=process_input,
        inputs=[input_type, file_input],
        outputs=output_text,
    )

# Start the Gradio server as soon as the module is executed.
demo.launch()