<!DOCTYPE html>
<html>
<head>
  <meta charset="UTF-8">
  <title>SmolVLM Benchmark Demo</title>
  <style>
    body { font-family: Arial, sans-serif; margin: 20px; }
    fieldset { margin-bottom: 20px; padding: 10px; }
    legend { font-weight: bold; }
    label { display: block; margin-top: 5px; }
    input, select { margin-bottom: 5px; width: 100%; max-width: 400px; }
    table { border-collapse: collapse; margin-top: 20px; width: 100%; max-width: 600px; }
    th, td { border: 1px solid #ccc; padding: 8px; text-align: left; }
    button { padding: 10px 20px; }
    .model-results { margin-bottom: 40px; }
  </style>
</head>

<body>
  <h1>SmolVLM Benchmark Demo</h1>
<fieldset id="model-options"> |
|
<legend>Model Options (Note: Benchmarking all three SmolVLM models by default)</legend> |
|
<label for="model-id">Select Model ID:</label> |
|
<select id="model-id" disabled> |
|
<option value="hf-internal-testing/tiny-random-Idefics3ForConditionalGeneration">hf-internal-testing/tiny-random-Idefics3ForConditionalGeneration</option> |
|
<option value="HuggingFaceTB/SmolVLM-256M-Instruct" selected>HuggingFaceTB/SmolVLM-256M-Instruct</option> |
|
<option value="HuggingFaceTB/SmolVLM-500M-Instruct">HuggingFaceTB/SmolVLM-500M-Instruct</option> |
|
<option value="HuggingFaceTB/SmolVLM-Instruct">HuggingFaceTB/SmolVLM-Instruct</option> |
|
</select> |
|
|
|
<label for="decoder-dtype">Decoder (decoder_model_merged) dtype:</label> |
|
<select id="decoder-dtype"> |
|
<option value="fp32">fp32</option> |
|
<option value="fp16">fp16</option> |
|
<option value="q8">q8</option> |
|
<option value="q4" selected>q4</option> |
|
<option value="q4f16">q4f16</option> |
|
</select> |
|
|
|
<label for="embed-dtype">Embed Tokens dtype:</label> |
|
<select id="embed-dtype"> |
|
<option value="fp32">fp32</option> |
|
<option value="fp16">fp16</option> |
|
<option value="q8">q8</option> |
|
<option value="q4" selected>q4</option> |
|
<option value="q4f16">q4f16</option> |
|
</select> |
|
|
|
<label for="vision-dtype">Vision Encoder dtype:</label> |
|
<select id="vision-dtype"> |
|
<option value="fp32">fp32</option> |
|
<option value="fp16">fp16</option> |
|
<option value="q8">q8</option> |
|
<option value="q4" selected>q4</option> |
|
<option value="q4f16">q4f16</option> |
|
</select> |
|
</fieldset> |
|
|
|
|
|
<fieldset id="hardware-options"> |
|
<legend>Hardware Options</legend> |
|
<label for="device">Select Device:</label> |
|
<select id="device"> |
|
<option value="wasm">wasm</option> |
|
<option value="webgpu" selected>webgpu</option> |
|
</select> |
|
</fieldset> |
|
|
|
|
|
<fieldset id="benchmark-options"> |
|
<legend>Benchmark Options</legend> |
|
<label for="image-url">Image URL:</label> |
|
<input type="text" id="image-url" value="https://huggingface.co./spaces/merve/chameleon-7b/resolve/main/bee.jpg"> |
|
|
|
<label for="do-split">Do Image Splitting (do_image_splitting)</label> |
|
<input type="checkbox" id="do-split" checked> |
|
|
|
<label for="max-tokens">Number of Tokens to Generate:</label> |
|
<input type="number" id="max-tokens" value="32"> |
|
|
|
<label for="num-runs">Number of Runs:</label> |
|
<input type="number" id="num-runs" value="3"> |
|
</fieldset> |
|
|
|
<button id="start-benchmark">Start Benchmark</button> |
|
|
|
<div id="results"></div> |
|
|
|
<script type="module"> |
|
import { |
|
AutoProcessor, |
|
AutoModelForVision2Seq, |
|
load_image, |
|
TextStreamer, |
|
} from "https://cdn.jsdelivr.net/npm/@huggingface/[email protected]"; |
|
|
|
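
    // Lazily creates and caches a single processor/model pair. When a different
    // model ID is requested, the previously loaded model is disposed and the
    // cache is reset, so only one model is resident at a time.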
    class SmolVLM {
      static model = null;
      static processor = null;
      static model_id = null;

      static async getInstance(modelId, dtypeSettings, device, revision) {
        if (this.model_id !== modelId) {
          await this.model?.dispose();
          this.model = null;
          this.processor = null;
          this.model_id = modelId;
        }
        if (!this.processor) {
          this.processor = await AutoProcessor.from_pretrained(modelId);
        }
        if (!this.model) {
          this.model = await AutoModelForVision2Seq.from_pretrained(modelId, {
            dtype: {
              embed_tokens: dtypeSettings.embed,
              vision_encoder: dtypeSettings.vision,
              decoder_model_merged: dtypeSettings.decoder,
            },
            device,
            revision,
          });
        }
        return [this.processor, this.model];
      }
    }
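
    // Runs the benchmark for each SmolVLM checkpoint in turn: read the UI
    // settings, load the model at its pinned revision, do one warm-up
    // generation, then time `numRuns` generations and report per-run and
    // average results.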
    async function runBenchmark() {
      document.getElementById("model-options").disabled = true;
      document.getElementById("hardware-options").disabled = true;
      const resultsDiv = document.getElementById("results");
      resultsDiv.innerHTML = "";

      const modelIds = {
        "HuggingFaceTB/SmolVLM-256M-Instruct": "refs/pr/11",
        "HuggingFaceTB/SmolVLM-500M-Instruct": "refs/pr/9",
        "HuggingFaceTB/SmolVLM-Instruct": "main",
      };

      const decoder_dtype = document.getElementById("decoder-dtype").value || "q4";
      const embed_dtype = document.getElementById("embed-dtype").value || "q4";
      const vision_dtype = document.getElementById("vision-dtype").value || "q4";
      const device = document.getElementById("device").value;
      const imageUrl = document.getElementById("image-url").value;
      const maxTokens = parseInt(document.getElementById("max-tokens").value, 10) || 32;
      const numRuns = parseInt(document.getElementById("num-runs").value, 10) || 3;
      const doImageSplitting = document.getElementById("do-split").checked;

      const dtypeSettings = { decoder: decoder_dtype, embed: embed_dtype, vision: vision_dtype };
      const image = await load_image(imageUrl);
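
      // Benchmark each model sequentially, rendering status, a progress bar,
      // and a results table into its own section.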
      for (const [modelId, revision] of Object.entries(modelIds)) {
        const modelShortName = modelId.split("/").pop();
        const modelSection = document.createElement("div");
        modelSection.className = "model-results";
        modelSection.innerHTML = `<h2>Benchmarking ${modelShortName}</h2><p id="status-${modelShortName}">Loading...</p><pre id="bar-${modelShortName}">${"▯".repeat(numRuns)}</pre>`;
        resultsDiv.appendChild(modelSection);

        const status = document.getElementById(`status-${modelShortName}`);
        const bar = document.getElementById(`bar-${modelShortName}`);

        try {
          status.innerText = "Loading processor and model...";
          const [processor, model] = await SmolVLM.getInstance(modelId, dtypeSettings, device, revision);

          status.innerText = "Warming up...";
          const messages = [{
            role: "user",
            content: [
              { type: "image" },
              { type: "text", text: "Can you describe this image?" },
            ],
          }];
          const text = processor.apply_chat_template(messages, { add_generation_prompt: true });
          const inputs = await processor(text, [image], { do_image_splitting: doImageSplitting });
          await model.generate({ ...inputs, max_new_tokens: 1 });

          let totalTime = 0;
          let totalTps = 0;
          let runsResults = [];
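
          // Timed runs: tokens/second is measured inside the streamer's token
          // callback, from the first generated token onward, so it tracks
          // decoding speed rather than end-to-end latency.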
          for (let i = 0; i < numRuns; ++i) {
            status.innerText = `Running benchmark... (${i + 1}/${numRuns})`;
            bar.innerText = createProgressBar(i + 1, numRuns);
            const start = performance.now();

            const text = processor.apply_chat_template(messages, { add_generation_prompt: true });
            const inputs = await processor(text, [image], { do_image_splitting: doImageSplitting });

            let numTokens = 0;
            let startTime;
            let tps = 0;
            const token_callback_function = () => {
              startTime = startTime || performance.now();
              tps = (numTokens++ / (performance.now() - startTime)) * 1000;
            };
            const streamer = new TextStreamer(processor.tokenizer, {
              skip_prompt: true,
              skip_special_tokens: true,
              token_callback_function,
            });
            await model.generate({
              ...inputs,
              max_new_tokens: maxTokens,
              min_new_tokens: maxTokens,
              streamer,
            });
            const elapsed = performance.now() - start;

            totalTime += elapsed;
            totalTps += tps;
            runsResults.push({
              run: i + 1,
              time: elapsed.toFixed(2),
              tps: tps.toFixed(2),
            });
          }
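
          // Aggregate the per-run numbers and render them as a table with an
          // average row.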
          const avgTime = (totalTime / numRuns).toFixed(2);
          const avgTps = (totalTps / numRuns).toFixed(2);
          status.innerText = "✅ Done!";
          bar.innerText = createProgressBar(numRuns, numRuns);

          let tableHtml = "<table>";
          tableHtml += "<tr><th>Run</th><th>Execution Time (ms)</th><th>Tokens per Second</th></tr>";
          runsResults.forEach(r => {
            tableHtml += `<tr><td>${r.run}</td><td>${r.time}</td><td>${r.tps}</td></tr>`;
          });
          tableHtml += `<tr><td><strong>Average</strong></td><td><strong>${avgTime}</strong></td><td><strong>${avgTps}</strong></td></tr>`;
          tableHtml += "</table>";
          modelSection.innerHTML += tableHtml;
        } catch (e) {
          status.innerText = "❌ Error: " + e.toString();
        }
      }
    }
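
    // Simple text progress bar, e.g. createProgressBar(2, 5) -> "▮▮▯▯▯".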
    function createProgressBar(current, total) {
      const filled = "▮".repeat(current);
      const empty = "▯".repeat(total - current);
      return filled + empty;
    }

    document.getElementById("start-benchmark").addEventListener("click", runBenchmark);
  </script>
</body>
</html>