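// Real-time zero-shot image classification with MobileCLIP, running entirely
// in the browser via transformers.js. Each webcam frame is embedded with the
// vision encoder and compared against the embeddings of user-provided labels.
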
import {
  AutoTokenizer,
  CLIPTextModelWithProjection,
  AutoProcessor,
  CLIPVisionModelWithProjection,
  RawImage,
  dot,
  softmax,
} from "https://cdn.jsdelivr.net/npm/@huggingface/[email protected]";
// Reference the elements that we will need
const status = document.getElementById("status");
const container = document.getElementById("container");
const video = document.getElementById("video");
const labelsInput = document.getElementById("labels");
const templateInput = document.getElementById("template");
const overlay = document.getElementById("overlay");
status.textContent = "Loading model (88MB)...";
const model_id = "Xenova/mobileclip_s0";
let tokenizer, text_model, processor, vision_model;
try {
  // Load the tokenizer and text model (8-bit quantized, on the WASM backend)
  tokenizer = await AutoTokenizer.from_pretrained(model_id);
  text_model = await CLIPTextModelWithProjection.from_pretrained(model_id, {
    device: "wasm",
    dtype: "q8",
  });

  // Load the processor and vision model (fp32, on the WebNN backend)
  processor = await AutoProcessor.from_pretrained(model_id);
  vision_model = await CLIPVisionModelWithProjection.from_pretrained(model_id, {
    device: "webnn",
    dtype: "fp32",
  });
} catch (err) {
  console.error(err);
  status.textContent = err.message;
  alert(err.message);
  throw err;
}
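
// Note: the "webnn" device requires a browser with WebNN support. A minimal
// fallback sketch (hypothetical; not part of the original demo) could choose
// the device at runtime before calling `from_pretrained` above:
//   const visionDevice = "ml" in navigator ? "webnn" : "wasm";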
// Models loaded, so we can enable the text inputs
labelsInput.disabled = false;
templateInput.disabled = false;
status.textContent = "Ready";
// Temperature applied to the cosine similarities before the softmax.
// exp(4.6052) ≈ 100; see the `logit_scale` parameter of the original model.
const exp_logit_scale = Math.exp(4.6052);
const IMAGE_SIZE = 224;
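
// Offscreen canvas used to downscale each video frame to the model's input size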
const canvas = document.createElement("canvas");
canvas.width = canvas.height = IMAGE_SIZE;
const context = canvas.getContext("2d", { willReadFrequently: true });
// Re-entrancy guard so a new frame is not processed while one is in flight
let isProcessing = false;
let previousTime;

// Text embeddings are cached and only recomputed when the labels or template change
let textEmbeddings;
let prevTextInputs;
let prevTemplate;
let labels;
function onFrameUpdate() {
  if (!isProcessing) {
    isProcessing = true;
    (async function () {
      // If the text inputs have changed, recompute the text embeddings
      if (
        prevTextInputs !== labelsInput.value ||
        prevTemplate !== templateInput.value
      ) {
        textEmbeddings = null;
        prevTextInputs = labelsInput.value;
        prevTemplate = templateInput.value;

        // Parse the comma-separated list of labels, dropping empty entries
        labels = prevTextInputs.split(/\s*,\s*/).filter((x) => x);

        if (labels.length > 0) {
          // Substitute each label into the prompt template, e.g. "a photo of a {}"
          const texts = labels.map((x) =>
            templateInput.value.replaceAll("{}", x),
          );

          const text_inputs = tokenizer(texts, {
            padding: "max_length", // NB: the model requires max_length padding
            truncation: true,
          });

          // Compute the text embeddings and normalize them to unit length
          const { text_embeds } = await text_model(text_inputs);
          textEmbeddings = text_embeds.normalize().tolist();
        } else {
          overlay.innerHTML = "";
        }
      }

      if (textEmbeddings) {
        // Read the current frame from the video, downscaled to the model's input size
        context.drawImage(video, 0, 0, IMAGE_SIZE, IMAGE_SIZE);
        const pixelData = context.getImageData(
          0,
          0,
          IMAGE_SIZE,
          IMAGE_SIZE,
        ).data;
        const image = new RawImage(pixelData, IMAGE_SIZE, IMAGE_SIZE, 4); // 4 channels (RGBA)

        const image_inputs = await processor(image);

        // Compute the image embedding and normalize it to unit length
        const { image_embeds } = await vision_model(image_inputs);
        const imageEmbedding = image_embeds.normalize().tolist()[0];

        // Both embeddings are unit vectors, so each dot product is a cosine
        // similarity; scale by the logit scale before applying the softmax
        const similarities = textEmbeddings.map(
          (x) => dot(x, imageEmbedding) * exp_logit_scale,
        );

        // Convert the similarities to probabilities, sorted in descending order
        const sortedIndices = softmax(similarities)
          .map((x, i) => [x, i])
          .sort((a, b) => b[0] - a[0]);

        // Update the overlay with one "label: score" line per class
        overlay.innerHTML = "";
        for (const [score, index] of sortedIndices) {
          overlay.appendChild(
            document.createTextNode(`${labels[index]}: ${score.toFixed(2)}`),
          );
          overlay.appendChild(document.createElement("br"));
        }
      }

      // Report the effective frame rate
      if (previousTime !== undefined) {
        const fps = 1000 / (performance.now() - previousTime);
        status.textContent = `FPS: ${fps.toFixed(2)}`;
      }
      previousTime = performance.now();

      isProcessing = false;
    })();
  }

  window.requestAnimationFrame(onFrameUpdate);
}
// Start the video stream (getUserMedia requires a secure context, i.e. HTTPS or localhost)
navigator.mediaDevices
  .getUserMedia(
    { video: true }, // Ask for video only (no audio)
  )
  .then((stream) => {
    // Set up the video and canvas elements.
    video.srcObject = stream;
    video.play();

    const videoTrack = stream.getVideoTracks()[0];
    const { width, height } = videoTrack.getSettings();
    video.width = width;
    video.height = height;

    // Size the container to fit the video inside a 720x405 box while
    // preserving its aspect ratio
    const ar = width / height;
    const [cw, ch] = ar > 720 / 405 ? [720, 720 / ar] : [405 * ar, 405];
    container.style.width = `${cw}px`;
    container.style.height = `${ch}px`;

    // Start the animation loop
    window.requestAnimationFrame(onFrameUpdate);
  })
  .catch((error) => {
    alert(error);
  });