File size: 3,761 Bytes
85abd3d
32dd4d2
7cab805
d5d3aa6
32dd4d2
d5d3aa6
 
 
 
32dd4d2
d5d3aa6
 
32dd4d2
d5d3aa6
32dd4d2
 
d5d3aa6
 
 
7cab805
d5d3aa6
 
7cab805
d5d3aa6
32dd4d2
7cab805
d5d3aa6
 
7cab805
d5d3aa6
32dd4d2
d5d3aa6
32dd4d2
 
 
 
 
 
d5d3aa6
32dd4d2
 
d5d3aa6
7cab805
32dd4d2
85abd3d
f018781
 
85abd3d
3ba8f3d
f018781
6852c86
dc2fb2f
 
3ba8f3d
6852c86
 
 
 
 
 
 
85abd3d
3ba8f3d
 
 
f018781
3ba8f3d
 
 
 
 
 
 
 
8d67b19
f018781
8d67b19
f018781
 
 
 
 
 
 
 
3ba8f3d
 
dc2fb2f
f018781
3ba8f3d
dc2fb2f
f018781
 
 
 
dc2fb2f
 
 
 
 
 
 
 
 
 
 
 
f018781
3ba8f3d
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
"""from fastapi import FastAPI, UploadFile, File
from fastapi.responses import RedirectResponse, JSONResponse
from transformers import AutoProcessor, AutoModelForCausalLM
from PIL import Image
import tempfile
import torch

app = FastAPI()

# Load model
try:
    processor = AutoProcessor.from_pretrained("microsoft/git-large-coco")
    model = AutoModelForCausalLM.from_pretrained("microsoft/git-large-coco")
    USE_GIT = True
except Exception:
    from transformers import pipeline
    captioner = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
    USE_GIT = False

def generate_caption(image_path):
    try:
        if USE_GIT:
            image = Image.open(image_path)
            inputs = processor(images=image, return_tensors="pt")
            outputs = model.generate(**inputs, max_length=50)
            return processor.batch_decode(outputs, skip_special_tokens=True)[0]
        else:
            result = captioner(image_path)
            return result[0]['generated_text']
    except Exception as e:
        return f"Error generating caption: {str(e)}"

@app.post("/imagecaption/")
async def caption_from_frontend(file: UploadFile = File(...)):
    contents = await file.read()
    with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
        tmp.write(contents)
        image_path = tmp.name

    caption = generate_caption(image_path)
    return JSONResponse({"caption": caption})

@app.get("/")
def home():
    return RedirectResponse(url="/")"""
from fastapi import UploadFile
from transformers import AutoProcessor, AutoModelForCausalLM, pipeline
from PIL import Image
import tempfile
import os
import torch
from gtts import gTTS
import uuid
# Load model
try:
    # Preferred captioner: Microsoft GIT (downloads/loads weights at import
    # time, so this can fail without network or a local cache).
    processor = AutoProcessor.from_pretrained("microsoft/git-large-coco")
    model = AutoModelForCausalLM.from_pretrained("microsoft/git-large-coco")
    USE_GIT = True
except Exception:
    # Fallback: lighter ViT-GPT2 image-to-text pipeline.
    # generate_caption() branches on USE_GIT to pick which path to use.
    captioner = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
    USE_GIT = False

def generate_caption(image_path):
    """Generate an English caption for the image at *image_path*.

    Uses the GIT model when it loaded at import time (module-level
    ``USE_GIT``), otherwise the image-to-text pipeline fallback.

    Never raises: any failure is returned as a string starting with
    ``"Error generating caption:"`` — callers (see caption_image) rely
    on that prefix to detect failure.
    """
    try:
        if USE_GIT:
            # Context manager closes the underlying file handle (the
            # original leaked it); convert to RGB so palette/grayscale
            # images don't break the processor.
            with Image.open(image_path) as img:
                image = img.convert("RGB")
            inputs = processor(images=image, return_tensors="pt")
            # Inference only — skip autograd bookkeeping.
            with torch.no_grad():
                outputs = model.generate(**inputs, max_length=50)
            return processor.batch_decode(outputs, skip_special_tokens=True)[0]
        result = captioner(image_path)
        return result[0]['generated_text']
    except Exception as e:
        # Deliberate catch-all: the contract is "caption string or
        # error string", never an exception.
        return f"Error generating caption: {str(e)}"

async def caption_image(file: UploadFile):
    """Caption an uploaded image and synthesize speech for the caption.

    Returns ``{"caption": ..., "audio": "/files/<uuid>.mp3"}`` on success,
    or ``{"error": ...}`` on any failure. Never raises.
    """
    try:
        # Validate the extension before touching the filesystem.
        # ``file.filename`` can be None for a raw multipart part, so guard
        # it — the original would crash in splitext (caught only by the
        # broad handler below).
        _, ext = os.path.splitext(file.filename or "")
        if ext.lower() not in {".jpg", ".jpeg", ".png", ".bmp", ".gif"}:
            return {"error": "Unsupported file type"}

        # Persist the upload with its real extension so the captioning
        # backends can sniff the format from the path.
        with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as tmp:
            tmp.write(await file.read())
            tmp_path = tmp.name

        try:
            caption = generate_caption(tmp_path)
        finally:
            # Always reclaim the temp image, even if captioning raises.
            os.remove(tmp_path)

        # generate_caption() reports failures as "Error ..." strings.
        if caption.startswith("Error"):
            return {"error": caption}

        # Text-to-speech for the caption; the mp3 lands in the system temp
        # dir and is exposed to clients under the /files/ route.
        tts = gTTS(text=caption, lang="en")
        audio_filename = f"{uuid.uuid4()}.mp3"
        audio_path = os.path.join(tempfile.gettempdir(), audio_filename)
        tts.save(audio_path)

        return {
            "caption": caption,
            "audio": f"/files/{audio_filename}"
        }

    except Exception as e:
        # Top-level boundary for the endpoint: report, don't propagate.
        return {"error": f"Failed to generate caption: {str(e)}"}