import math

import numpy as np
import torch
from PIL import Image
from torch.nn.functional import softmax

def translate(img, model, max_seq_length=128, sos_token=1, eos_token=2):
    """Greedy-decode a batch of images; img has shape BxCxHxW."""
    model.eval()

    with torch.no_grad():
        src = model.cnn(img)
        memory = model.transformer.forward_encoder(src)

        # Every sequence in the batch starts with the <sos> token.
        translated_sentence = [[sos_token] * len(img)]

        max_length = 0

        # Decode step by step until every sequence has emitted <eos>
        # or the length cap is reached.
        while max_length <= max_seq_length and not all(
            np.any(np.asarray(translated_sentence).T == eos_token, axis=1)
        ):
            tgt_inp = torch.LongTensor(translated_sentence)

            output, memory = model.transformer.forward_decoder(tgt_inp, memory)
            output = softmax(output, dim=-1)

            # Top-5 candidates are computed, but only the best token at the
            # last time step is kept (greedy decoding).
            _, indices = torch.topk(output, 5)
            indices = indices[:, -1, 0]
            indices = indices.tolist()

            translated_sentence.append(indices)
            max_length += 1

        # Transpose from TxB to BxT so each row is one decoded sequence.
        translated_sentence = np.asarray(translated_sentence).T

    return translated_sentence

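# A minimal sketch of mapping decoded IDs back to text. The index-to-character
# list `itos` is an assumption, not part of this module; the real project may
# use a richer vocabulary object instead.
def ids_to_text(ids, itos, sos_token=1, eos_token=2):
    """Convert one row of translate()'s output into a string."""
    chars = []
    for i in ids:
        if i == eos_token:
            break
        if i != sos_token:
            chars.append(itos[i])
    return ''.join(chars)
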
def resize(w, h, expected_height, image_min_width, image_max_width):
    """Scale the width to preserve aspect ratio at the expected height."""
    new_w = int(expected_height * float(w) / float(h))

    # Round the width up to a multiple of 10, then clamp it to the allowed range.
    round_to = 10
    new_w = math.ceil(new_w / round_to) * round_to
    new_w = max(new_w, image_min_width)
    new_w = min(new_w, image_max_width)

    return new_w, expected_height

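# Worked example: a 300x50 crop at expected height 32 maps to width
# int(32 * 300 / 50) = 192, which rounds up to 200; with hypothetical
# bounds of 32 and 512 the clamp leaves it unchanged:
#
#     resize(300, 50, 32, 32, 512)  # -> (200, 32)
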
def process_image(image, image_height, image_min_width, image_max_width):
    """Resize a PIL image and convert it to a normalized CxHxW array."""
    img = image.convert('RGB')

    w, h = img.size
    new_w, image_height = resize(w, h, image_height, image_min_width, image_max_width)

    img = img.resize((new_w, image_height), Image.Resampling.LANCZOS)

    # HxWxC -> CxHxW, then scale pixel values to [0, 1].
    img = np.asarray(img).transpose(2, 0, 1)
    img = img / 255
    return img

def process_input(image, image_height, image_min_width, image_max_width):
    """Prepare a single PIL image as a 1xCxHxW float tensor for the model."""
    img = process_image(image, image_height, image_min_width, image_max_width)
    img = img[np.newaxis, ...]
    img = torch.FloatTensor(img)
    return img

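# A minimal end-to-end sketch, assuming `model` exposes the .cnn and
# .transformer attributes that translate() relies on; the default sizes
# below are placeholders, not values defined in this module.
def predict(image_path, model, image_height=32, image_min_width=32,
            image_max_width=512):
    """Read an image from disk and return the decoded token IDs."""
    image = Image.open(image_path)
    img = process_input(image, image_height, image_min_width, image_max_width)
    return translate(img, model)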