Recently, I have been interested in reproducing the typesense huggingface models locally. I want to experiment with https://typesense.org nodes, but I also want to be able to use the same embedding models on my laptop for local development.

I noticed that the models in the typesense section of hugging face are in the model.onnx format, which I had not encountered before. I learned how to get them running locally and was able to confirm that the vectors on a typesense cluster I was running matched the vectors I generated locally.

However, yesterday I was extending the code from single-query embedding to batch embedding and stumbled upon a weird bug: one query was embedded differently depending on whether I embedded it alone or in a batch. Eventually I understood what my bug was and, after facepalming, wrote up and tested a fix!

Setting up the onnx model locally

So I learned that the typical huggingface transformers library alone was not sufficient here.

install a new library

cd ~/.python_venvs
uv venv --python 3.11 dish
source dish/bin/activate
which python  # /Users/michal/.python_venvs/dish/bin/python
python --version  # Python 3.11.7
uv pip install ipython
uv pip install optimum[onnxruntime]

And I downloaded the three files, config.json, model.onnx, and vocab.txt, to a new folder, onnx_models/all-MiniLM-L12-v2.
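For reference, a scripted version of that download might look something like the sketch below. The repo id and in-repo file paths here are assumptions, not something from this post, so adjust them to wherever the typesense copies of the model actually live, and keep the target folder in sync with the path you load from later.

# hypothetical download sketch using huggingface_hub; the repo_id and in-repo
# file paths are assumptions, adjust them to the actual typesense model repo
from pathlib import Path
from huggingface_hub import hf_hub_download

target_dir = Path.home() / "onnx_models" / "all-MiniLM-L12-v2"
target_dir.mkdir(parents=True, exist_ok=True)

for filename in ["config.json", "model.onnx", "vocab.txt"]:
    hf_hub_download(
        repo_id="typesense/models",                # assumed repo id
        filename=f"all-MiniLM-L12-v2/{filename}",  # assumed path within the repo
        local_dir=target_dir,
    )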

actually it took a few attempts to load the model

Initially I was getting a numpy v1 vs v2 error,

from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import AutoTokenizer

# Load the model
model = ORTModelForSequenceClassification.from_pretrained("all-MiniLM-L12-v2")

# Load tokenizer if available
tokenizer = AutoTokenizer.from_pretrained("all-MiniLM-L12-v2")

# Prepare input
inputs = tokenizer("Hello world!", return_tensors="pt")

# Perform inference
outputs = model(**inputs)
print(outputs.logits)
A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.0 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Retrying with python 3.9 and numpy 1.x

uv venv --python 3.9 dish  # Using CPython 3.9.18 interpreter at: /usr/local/opt/python@3.9/bin/python3.9
source dish/bin/activate
uv pip install ipython optimum[onnxruntime] "numpy<2"

# I saw numpy==1.26.4, nice!

and now loading was fine,

from pathlib import Path
from optimum.onnxruntime import ORTModelForFeatureExtraction
from transformers import AutoTokenizer

model_name = "all-MiniLM-L12-v2"
local_models = "local_models"
path_to_local_all_minilm_l12_v2 = (Path.home() / local_models / model_name).as_posix()

# Load the model
model = ORTModelForFeatureExtraction.from_pretrained(path_to_local_all_minilm_l12_v2)
tokenizer = AutoTokenizer.from_pretrained(path_to_local_all_minilm_l12_v2)

# Prepare input
inputs = tokenizer("Hello world!", return_tensors="pt")

# Perform inference
outputs = model(**inputs)

# Retrieve embeddings from last_hidden_state (for example)
embeddings = outputs.last_hidden_state
print(inputs)
print(embeddings.shape)  # e.g., [batch_size, seq_length, hidden_dim]
print(embeddings)
{'input_ids': tensor([[ 101, 7592, 2088,  999,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}
torch.Size([1, 5, 384])
tensor([[[-0.1849, -0.0977, -0.0351, -0.2134,  ..., -0.0405, -0.2599,  0.1040,  0.1008],
         [-0.8934, -0.0513,  0.4190, -0.3306,  ..., -0.0511,  0.1419,  0.0011,  0.5171],
         [-0.0798,  0.1150, -0.2807, -0.4178,  ..., -0.5528, -0.4580, -0.5165, -0.0583],
         [-0.3010,  0.9995, -0.0930, -0.1598,  ...,  0.0433, -0.3697,  0.5390,  0.1855],
         [-0.1849, -0.0977, -0.0351, -0.2134,  ..., -0.0405, -0.2599,  0.1040,  0.1008]]])

The only thing left to do here is to use mean pooling to get one embedding from the 5 token embeddings,

import torch

emb = torch.mean(embeddings[0, :, :], 0)

print(emb.shape)
print(emb)
torch.Size([384])
tensor([-0.3288,  0.1736, -0.0050, -0.2670,  ..., -0.1283, -0.2411,  0.0463,  0.1692])

reproduce that the local embedding model matches what the typesense cluster uses

So I had loaded some food-related data on my typesense cluster, ran a simple query to pull a few documents, and then re-embedded them locally to check the vectors.

First query my cluster

import torch
from pprint import pprint

torch.set_printoptions(threshold=12, edgeitems=4, linewidth=90)

# make_client, random_us_coords, query_raw and df_from_results are defined in the appendix
client = make_client(timeout=600)
location = random_us_coords()

results = query_raw(location, "buffalo wings", 100)
df = df_from_results(results)
df[:5]

pprint([[row["concat"][:37], torch.tensor(row["embedding"]), round(row["vector_distance"], 3)] for row in df[:5].to_dicts()])
[[' fried chicken wings  ',
  tensor([ 0.0037,  0.0396, -0.9224,  0.1760,  ..., -0.0739,  0.3501, -0.3490, -0.1918]),
  0.395],
 ['c bourbon chicken ',
  tensor([-0.1860, -0.1894, -0.3629, -0.0392,  ..., -0.0728,  0.0968,  0.3149, -0.1836]),
  0.478],
 ['thai crispy wings large chicken wings',
  tensor([-0.1742,  0.0064, -0.2367,  0.0564,  ..., -0.0186,  0.3364, -0.2523, -0.2709]),
  0.541],
 ['bourbon honey bourbons  ',
  tensor([-0.5718, -0.0672, -0.0274, -0.0360,  ...,  0.3141,  0.1054,  0.4766, -0.3480]),
  0.542],
 ['boar s head buffalo style chicken  ',
  tensor([-0.0436, -0.0633, -0.3899,  0.0730,  ..., -0.2415,  0.0070,  0.1767, -0.0291]),
  0.549]]

and compare with local,

for row in df[:5].to_dicts():
    text = row["concat"]
    inputs = tokenizer(text, return_tensors="pt")
    outputs = model(**inputs)
    embeddings_vec = outputs.last_hidden_state
    emb = torch.mean(embeddings_vec[0, :, :], 0)
    print(text[:37])
    print("typesense embedding", torch.tensor(row["embedding"]))
    print("local embedding", emb, "\n")
 fried chicken wings  
typesense embedding tensor([ 0.0037,  0.0396, -0.9224,  0.1760,  ..., -0.0739,  0.3501, -0.3490, -0.1918])
local embedding tensor([ 0.0037,  0.0396, -0.9224,  0.1760,  ..., -0.0739,  0.3501, -0.3490, -0.1918]) 

c bourbon chicken 
typesense embedding tensor([-0.1860, -0.1894, -0.3629, -0.0392,  ..., -0.0728,  0.0968,  0.3149, -0.1836])
local embedding tensor([-0.1860, -0.1894, -0.3629, -0.0392,  ..., -0.0728,  0.0968,  0.3149, -0.1836]) 

thai crispy wings large chicken wings
typesense embedding tensor([-0.1742,  0.0064, -0.2367,  0.0564,  ..., -0.0186,  0.3364, -0.2523, -0.2709])
local embedding tensor([-0.1742,  0.0064, -0.2367,  0.0564,  ..., -0.0186,  0.3364, -0.2523, -0.2709]) 

bourbon honey bourbons  
typesense embedding tensor([-0.5718, -0.0672, -0.0274, -0.0360,  ...,  0.3141,  0.1054,  0.4766, -0.3480])
local embedding tensor([-0.5718, -0.0672, -0.0274, -0.0360,  ...,  0.3141,  0.1054,  0.4766, -0.3480]) 

boar s head buffalo style chicken  
typesense embedding tensor([-0.0436, -0.0633, -0.3899,  0.0730,  ..., -0.2415,  0.0070,  0.1767, -0.0291])
local embedding tensor([-0.0436, -0.0633, -0.3899,  0.0730,  ..., -0.2415,  0.0070,  0.1767, -0.0291]) 

The bug

So yesterday I wanted to extend the single-text embedding to be batched, to vectorize it in other words. But when I did this, my first iteration of the code had a really weird bug. Here is what I noticed as I was testing it.

At this point I also have a minimal convenience class wrapping the model loading and embedding.
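The class itself is not pasted in this post, but embed_query presumably just reuses the single-text tokenize, run, mean-pool flow from above. A minimal sketch of what it might look like (an approximation, not the actual code):

# minimal sketch of the wrapper; the class name matches the usage below,
# but the body is an approximation of the single-text flow shown earlier
import torch
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForFeatureExtraction


class LocalOnnx:
    def __init__(self, model_path: str):
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = ORTModelForFeatureExtraction.from_pretrained(model_path)

    def embed_query(self, query: str, print_tokens=False) -> torch.Tensor:
        inputs = self.tokenizer(query, return_tensors="pt")
        if print_tokens:
            print(inputs)
        outputs = self.model(**inputs)
        # mean pool over the single, unpadded sequence
        return torch.mean(outputs.last_hidden_state[0, :, :], 0)

And using it on a couple of queries: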

import torch
torch.set_printoptions(threshold=10, edgeitems=2, linewidth=80)

from pathlib import Path
import embedder.onnx_utils as eou

model = eou.LocalOnnx(path_to_local_all_minilm_l12_v2)

queries = ["chicken wings", "chicken parmesan"]
embeddings_separate = torch.stack([model.embed_query(x) for x in queries])
embeddings_batch = model.embed_documents(queries)
print(embeddings_separate.shape, embeddings_batch.shape)
print(embeddings_separate)
print(embeddings_batch)
torch.Size([2, 384]) torch.Size([2, 384])
tensor([[-0.1094,  0.0502,  ..., -0.3084, -0.2158],
        [-0.1148, -0.0603,  ...,  0.1199,  0.0667]])
tensor([[-0.0658,  0.0258,  ..., -0.2852, -0.0799],
        [-0.1148, -0.0603,  ...,  0.1199,  0.0667]])

So, clearly one of the vectors was the same when batched, but the other was not.

Looking at it another way, I saw this:

So the embeddings from the new model.embed_documents func were different from those from the single model.embed_query, but only for one query. Weird, and flipping the order did not isolate the bug.

queries = ["chicken wings", "chicken parmesan"]
print(model.embed_documents(queries))
print(model.embed_documents([queries[1], queries[0]]))
print(model.embed_documents([queries[0]]))
print(model.embed_documents([queries[0], queries[0]]))
print(model.embed_documents([queries[1], queries[1]]))
print(torch.stack([model.embed_query(x) for x in queries]))

So I also printed the tokens and then I found the smoking gun. (The idea to also look at the intermediate token ids came from describing my problem to chatgpt!)

queries = ["moroccan peppermint chile", "saudi falafel with mint"]
print(model.embed_documents(queries, print_tokens=True))
print("...\n")
print(model.embed_documents([queries[1], queries[0]], print_tokens=True))
print("...\n")
print(torch.stack([model.embed_query(x, print_tokens=True) for x in queries]))

So I realized, looking at the above, that, ok duh, the padding is the issue. Looking at things pre-pooling makes it even clearer.
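A rough sketch of that kind of pre-pooling inspection, using the raw tokenizer and ORT model from the setup section rather than the wrapper (the exact code I ran is not shown here):

# sketch: look at the padded batch before pooling
# (tokenizer and model here are the raw objects loaded in the setup section)
queries = ["moroccan peppermint chile", "saudi falafel with mint"]
inputs = tokenizer.batch_encode_plus(queries, return_tensors="pt", padding=True)
outputs = model(**inputs)

print(inputs["attention_mask"])         # the shorter query's row ends in 0s (pad positions)
print(outputs.last_hidden_state.shape)  # torch.Size([2, 9, 384]) for these two queries
# the rows where attention_mask is 0 are pad-token embeddings, and a plain
# torch.mean over dim 1 averages them into the final vector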

The padding

The issue, per the above, was that the query that was getting messed up was the one requiring fewer tokens, and so it got padded with two additional 0s. When applying the mean pooling, those 0s got averaged in, messing with the final vector.
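A tiny made-up example of the arithmetic, with two-dimensional token vectors just to show the effect:

import torch

# two real token vectors plus two rows of pad zeros (made-up numbers)
tokens_with_pads = torch.tensor([[1.0, 2.0], [3.0, 4.0], [0.0, 0.0], [0.0, 0.0]])

print(torch.mean(tokens_with_pads, 0))      # tensor([1.0000, 1.5000]), the pads drag the mean down
print(torch.mean(tokens_with_pads[:2], 0))  # tensor([2., 3.]), the mean over just the real tokens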

Also, the whole reason for the padding is that when embedding tokens in a batch, padding must be used because all the transformations are matrix operations, and a batch of different-length sequences cannot be packed into a rectangular matrix. Without padding, we get the below error.

from pathlib import Path
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForFeatureExtraction

model_name = "all-MiniLM-L12-v2"
local_models = "local_models"
path_to_local_all_minilm_l12_v2 = (Path.home() / local_models / model_name).as_posix()

tokenizer = AutoTokenizer.from_pretrained(path_to_local_all_minilm_l12_v2)
queries = ["moroccan peppermint chile", "saudi falafel with mint"]

inputs = tokenizer.batch_encode_plus(queries, return_tensors="pt", padding=False)
model = ORTModelForFeatureExtraction.from_pretrained(path_to_local_all_minilm_l12_v2)

outputs = model(
  **inputs
)
ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`input_ids` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

the buggy mean pooling

But with padding on, the bug was that after embedding the individual tokens, the code applied mean pooling (https://sbert.net/docs/sentence_transformer/usage/custom_models.html) naively, ignoring the fact that some of the token rows were just 0 padding,

def embed_documents(self, queries: List[str], print_tokens=False):
    inputs = self.tokenizer.batch_encode_plus(
        queries, return_tensors="pt", padding=True)

    outputs = self.model(
        **inputs
    )
    embedding_vec = outputs.last_hidden_state
    # bug: the mean runs over every position, including the pad rows
    embedding = torch.mean(embedding_vec, 1)

    print(embedding_vec.shape, embedding.shape)
    # torch.Size([2, 9, 384]) torch.Size([2, 384])

    return embedding

and messing with the final embedding.

The resolution

The new approach: use the attention mask that is returned from the encoding to remove the pad vectors before taking the mean.

In [13]: inputs
Out[13]: 
{'input_ids': tensor([[  101, 17494, 11565, 10020,  ...,  7029,   102,     0,     0],
        [  101,  8174,  6904,  2721,  ...,  2140,  2007, 12927,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0,  ..., 0, 0, 0, 0],
        [0, 0, 0, 0,  ..., 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1,  ..., 1, 1, 0, 0],
        [1, 1, 1, 1,  ..., 1, 1, 1, 1]])}

Actually, this is only a first stab and this code can be way cleaner of course, but it solves the problem for now.

def embed_documents(self, queries: List[str], print_tokens=False):
    inputs = self.tokenizer.batch_encode_plus(
        queries, return_tensors="pt", padding=True)

    if print_tokens:
        print(inputs)

    outputs = self.model(
        **inputs
    )
    embedding_vec = outputs.last_hidden_state

    foo_vec = []

    # use the mask, after padding, to remove the pad rows.
    mask_size = inputs["attention_mask"].shape[1]  # padded sequence length, same for every row

    vector_length = embedding_vec.shape[2]

    for i, _ in enumerate(queries):
        mask = inputs["attention_mask"][i].bool().view(mask_size, 1).repeat(1, vector_length)

        mask_sum = inputs["attention_mask"][i].sum()

        emb = embedding_vec[i]

        foo_vec.append(
            torch.mean(
                torch.masked_select(emb, mask).view(mask_sum, vector_length), 0
            )
        )

    return torch.stack(foo_vec)
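For what it is worth, the same masked mean can also be done without the loop, by weighting with the attention mask directly. A sketch of that cleaner version (not the code I am actually running yet):

def embed_documents(self, queries: List[str], print_tokens=False):
    inputs = self.tokenizer.batch_encode_plus(
        queries, return_tensors="pt", padding=True)

    if print_tokens:
        print(inputs)

    outputs = self.model(**inputs)
    embedding_vec = outputs.last_hidden_state            # [batch, seq_len, hidden]

    # expand the attention mask to [batch, seq_len, 1] so it broadcasts over the hidden dim
    mask = inputs["attention_mask"].unsqueeze(-1).float()

    # zero out the pad rows, sum over the sequence, divide by the real-token count
    summed = (embedding_vec * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1e-9)
    return summed / counts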

new run with fix

In [58]: queries = ["moroccan peppermint chile", "saudi falafel with mint"]
    ...: print(model.embed_documents(queries, ))
    ...: print("...\n")
    ...: print(model.embed_documents([queries[1], queries[0]]))
    ...: print("...\n")
    ...: print(torch.stack([model.embed_query(x) for x in queries]))
tensor([[-0.4740, -0.0985, -0.3240, -0.1140,  ..., -0.0500, -0.2265,  0.0522,  0.1712],
        [-0.4011,  0.0649, -0.7596, -0.1395,  ...,  0.3353, -0.4370,  0.1567,  0.2334]])
...

tensor([[-0.4011,  0.0649, -0.7596, -0.1395,  ...,  0.3353, -0.4370,  0.1567,  0.2334],
        [-0.4740, -0.0985, -0.3240, -0.1140,  ..., -0.0500, -0.2265,  0.0522,  0.1712]])
...

tensor([[-0.4740, -0.0985, -0.3240, -0.1140,  ..., -0.0500, -0.2265,  0.0522,  0.1712],
        [-0.4011,  0.0649, -0.7596, -0.1395,  ...,  0.3353, -0.4370,  0.1567,  0.2334]])

Appendix

some helper functions

import os
import polars as pl
import typesense
import random
from glom import glom


def make_client(timeout=60):
    api_key = os.getenv("TYPESENSE_API_KEY")
    cluster_host = os.getenv("TYPESENSE_CLUSTER")  # https://cloud.typesense.org/clusters/xxxx

    client = typesense.Client({
      "nodes": [{
        "host": cluster_host,
        "port": "443",
        "protocol": "https"
      }],
      "api_key": api_key,
      "connection_timeout_seconds": timeout
    })
    return client


def random_us_coords():
    # Continental US approximate bounds:
    # Latitude: 24.5°N to 49.5°N
    # Longitude: -124.77°W to -66.95°W
    lat = random.uniform(24.5, 49.5)
    lng = random.uniform(-124.77, -66.95)
    return lat, lng


def query_raw(location, query, radius_km):
    client = make_client(timeout=600)
    lat, lng = location
    search_results = client.collections["items"].documents.search({
        "q": query,
        "query_by": "concat_embedding",
        "filter_by": f"location:({lat}, {lng}, {radius_km} km)",
        "sort_by": "_vector_distance:asc",
        # "exclude_fields": "concat_embedding",
        "page": 1,
        "per_page": 100,
    })
    return search_results


def df_from_results(results):
    spec = [
        {
            "published_name": "document.published_name",
            "concat": "document.concat",
            "embedding": "document.concat_embedding",
            "vector_distance": "vector_distance",
            "location": "document.location",
        }
    ]
    df = pl.from_dicts(glom(results["hits"], spec))
    return df