Add support for PaliGemma (& PaliGemma2) in https://github.com/huggingface/transformers.js/pull/1074
Example: Image captioning with onnx-community/paligemma2-3b-ft-docci-448.
import { AutoProcessor, PaliGemmaForConditionalGeneration, load_image } from '@huggingface/transformers';

// Checkpoint used for both the processor and the model
const model_id = 'onnx-community/paligemma2-3b-ft-docci-448';

// Per-module dtype selection handed to `from_pretrained`
const dtype = {
    embed_tokens: 'fp16', // or 'q8'
    vision_encoder: 'fp16', // or 'q4', 'q8'
    decoder_model_merged: 'q4', // or 'q4f16'
};

// Load processor and model
const processor = await AutoProcessor.from_pretrained(model_id);
const model = await PaliGemmaForConditionalGeneration.from_pretrained(model_id, { dtype });

// Prepare inputs: load the image, then run it through the processor together with the prompt
const image_url = 'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg';
const image = await load_image(image_url);
const caption_prompt = '<image>caption en'; // Caption the image in English
const inputs = await processor(image, caption_prompt);

// Generate a response (at most 100 new tokens)
const generation_args = { ...inputs, max_new_tokens: 100 };
const output = await model.generate(generation_args);

// Along the second axis, keep only the positions from `input_ids.dims[1]` onwards,
// then decode those ids with special tokens skipped.
const prompt_length = inputs.input_ids.dims[1];
const new_token_ids = output.slice(null, [prompt_length, null]);
const [caption] = processor.batch_decode(new_token_ids, { skip_special_tokens: true });
console.log(caption);
// A side view of a light blue 1970s Volkswagen Beetle parked on a gray cement road. It is facing to the right. It has a reflection on the side of it. Behind it is a yellow building with a brown double door on the right. It has a white frame around it. Part of a gray cement wall is visible on the far left.
List of supported models: https://huggingface.co/models?library=transformers.js&other=paligemma
Add support for I-JEPA in https://github.com/huggingface/transformers.js/pull/1073
Example: Image feature extraction with onnx-community/ijepa_vith14_1k.
import { pipeline, cos_sim } from "@huggingface/transformers";

// Create an image feature extraction pipeline (dtype "q8")
const task = "image-feature-extraction";
const checkpoint = "onnx-community/ijepa_vith14_1k";
const extractor = await pipeline(task, checkpoint, { dtype: "q8" });

// Compute image embeddings for both images in a single call
const image_urls = [
    "http://images.cocodataset.org/val2017/000000039769.jpg",
    "http://images.cocodataset.org/val2017/000000219578.jpg",
];
const features = await extractor(image_urls);
const pooled = features.mean(1); // Apply mean pooling

// Compute cosine similarity between the two pooled embeddings
const embedding_a = pooled[0].data;
const embedding_b = pooled[1].data;
console.log(cos_sim(embedding_a, embedding_b)); // 0.5168613045518973
List of supported models: https://huggingface.co/models?library=transformers.js&other=ijepa
Add support for OLMo2 in https://github.com/huggingface/transformers.js/pull/1076. List of supported models: https://huggingface.co/models?library=transformers.js&other=olmo2
Full Changelog: https://github.com/huggingface/transformers.js/compare/3.1.1...3.1.2
Fetched April 7, 2026