---
pipeline_tag: sentence-similarity
tags:
- sentence-transformers
- feature-extraction
- sentence-similarity
- mteb
- arctic
- snowflake-arctic-embed
- transformers.js
license: apache-2.0
language:
- af
- ar
- az
- be
- bg
- bn
- ca
- ceb
- cs
- cy
- da
- de
- el
- en
- es
- et
- eu
- fa
- fi
- fr
- gl
- gu
- he
- hi
- hr
- ht
- hu
- hy
- id
- is
- it
- ja
- jv
- ka
- kk
- km
- kn
- ko
- ky
- lo
- lt
- lv
- mk
- ml
- mn
- mr
- ms
- my
- ne
- nl
- pa
- pl
- pt
- qu
- ro
- ru
- si
- sk
- sl
- so
- sq
- sr
- sv
- sw
- ta
- te
- th
- tl
- tr
- uk
- ur
- vi
- yo
- zh
---

## snowflake-arctic-embed-m-v2.0-ONNX-uint8

This is a version of [this model](https://huggingface.co/electroglyph/snowflake-arctic-embed-m-v2.0-ONNX-quant) which outputs a Qdrant-compatible uint8 tensor.

Its accuracy is very close to that of the full-precision f32 ONNX model with f32 output.

This is an upgraded version of https://huggingface.co/electroglyph/snowflake2_m_uint8 with better accuracy.

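Because the model already emits uint8 vectors, they can be written straight into a Qdrant collection configured for uint8 storage. A minimal sketch, assuming a local Qdrant instance; the collection name, URL, and choice of dot-product distance are my own placeholders, not part of this model card:

```python
import numpy as np
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")  # assumed local Qdrant instance

# Store 768-dim uint8 vectors without any extra conversion.
client.create_collection(
    collection_name="arctic_m_v2_uint8",
    vectors_config=models.VectorParams(
        size=768,
        distance=models.Distance.DOT,  # distance choice is an assumption
        datatype=models.Datatype.UINT8,
    ),
)

# In practice this vector comes from the model's sentence_embedding output;
# a random placeholder keeps the sketch self-contained.
vector = np.random.randint(0, 256, size=768, dtype=np.uint8)

client.upsert(
    collection_name="arctic_m_v2_uint8",
    points=[models.PointStruct(id=1, vector=vector.tolist(), payload={"text": "example"})],
)
```
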
## Quantization method

For calibration data I used my own multilingual dataset of around 1.5M tokens: https://github.com/electroglyph/dataset_build

I ran all compatible tokens through the model and logged the highest and lowest values seen, which gave a range of -0.15288913249969482 to 0.1472320258617401.

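Measuring that range amounts to tracking the minimum and maximum of the f32 model's sentence_embedding output over the calibration data. A rough sketch of that step, assuming the f32 export exposes the same sentence_embedding output and that the calibration texts sit in a plain-text file (both file paths are placeholders):

```python
import onnxruntime as rt
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Snowflake/snowflake-arctic-embed-m-v2.0")
session = rt.InferenceSession("model_f32.onnx", providers=["CPUExecutionProvider"])

lo, hi = float("inf"), float("-inf")
with open("calibration.txt", encoding="utf-8") as f:
    for line in f:
        inputs = tokenizer(line.strip(), truncation=True, max_length=8192, return_tensors="np")
        emb = session.run(["sentence_embedding"], dict(inputs))[0]
        # Track the global output range across the whole calibration set.
        lo = min(lo, float(emb.min()))
        hi = max(hi, float(emb.max()))

print(lo, hi)  # the observed range drives the QuantizeLinear scale below
```
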
I hacked on the sentence_embedding output of the ONNX model and added a QuantizeLinear node based on the range -0.15288913249969482 to 0.15288913249969482 to keep it symmetric.

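The exact graph edit isn't included here, but appending a QuantizeLinear node to a sentence-transformers-style ONNX export looks roughly like the sketch below. The node and initializer names, the output shape, and the single-output assumption are mine; only the sentence_embedding name, the symmetric range, and the zero point of 128 come from this card:

```python
import numpy as np
import onnx
from onnx import TensorProto, helper, numpy_helper

model = onnx.load("model_f32.onnx")  # placeholder path to the f32 export
graph = model.graph

# Rename the float tensor so the quantized tensor can keep the original output name.
for node in graph.node:
    for i, name in enumerate(node.output):
        if name == "sentence_embedding":
            node.output[i] = "sentence_embedding_f32"

# Symmetric range [-0.15288913..., +0.15288913...] mapped to uint8 with zero point 128.
scale = numpy_helper.from_array(
    np.array(0.15288913249969482 / 127.0, dtype=np.float32), "emb_scale"
)
zero_point = numpy_helper.from_array(np.array(128, dtype=np.uint8), "emb_zero_point")
graph.initializer.extend([scale, zero_point])

graph.node.append(
    helper.make_node(
        "QuantizeLinear",
        inputs=["sentence_embedding_f32", "emb_scale", "emb_zero_point"],
        outputs=["sentence_embedding"],
        name="quantize_sentence_embedding",
    )
)

# Assumes sentence_embedding is the only graph output; re-declare it as uint8.
del graph.output[:]
graph.output.append(
    helper.make_tensor_value_info("sentence_embedding", TensorProto.UINT8, ["batch", 768])
)

onnx.save(model, "model_uint8.onnx")
```
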
## Benchmarks

For comparison, I also included results for one of my other models: https://huggingface.co/electroglyph/embeddinggemma-300m-ONNX-uint8

The results for this model are quite close to those of the full-precision ONNX model with f32 output.




## Example Benchmark Code

```python
import mteb
from mteb.encoder_interface import PromptType
import numpy as np
import onnxruntime as rt
from transformers import AutoTokenizer


class CustomModel:
    def __init__(self) -> None:
        # Tokenizer from a local copy of the base model; adjust the path as needed.
        self.tokenizer = AutoTokenizer.from_pretrained("C:/LLM/snowflake-arctic-embed-m-v2.0")
        self.session = rt.InferenceSession(
            "snowflake-arctic-embed-m-v2.0-uint8.onnx", providers=["CPUExecutionProvider"]
        )
        # Scale used by the QuantizeLinear node (symmetric range, zero point 128).
        self.scale = 0.15288913249969482 / 127.0

    def dequantize(self, quantized: list | np.ndarray, scale: float) -> np.ndarray:
        quantized = np.array(quantized)
        # Map uint8 values back to float: (q - 128) * scale.
        dequant = (quantized.astype(np.float32) - 128) * scale
        # session.run returns a list of outputs, so drop the leading singleton dimension.
        if dequant.ndim == 3 and dequant.shape[0] == 1:
            return np.squeeze(dequant, axis=0)
        return dequant

    def encode(
        self,
        sentences: list[str],
        task_name: str,
        prompt_type: PromptType | None = None,
        **kwargs,
    ) -> np.ndarray:
        inputs = self.tokenizer(
            sentences, padding=True, truncation=True, return_tensors="np", max_length=8192
        )
        q = self.session.run(["sentence_embedding"], dict(inputs))
        return self.dequantize(q, self.scale)


model = CustomModel()
benchmark = mteb.get_benchmark("NanoBEIR")
evaluation = mteb.MTEB(tasks=benchmark)
results = evaluation.run(model, corpus_chunk_size=4)
for r in results:
    print(r)
```

## Example FastEmbed Usage

```python
from fastembed import TextEmbedding
from fastembed.common.model_description import PoolingType, ModelSource

# Register this repo as a custom FastEmbed model; pooling and normalization are
# disabled because the ONNX graph already produces the final sentence_embedding output.
TextEmbedding.add_custom_model(
    model="snowflake-arctic-embed-m-v2.0-ONNX-uint8",
    pooling=PoolingType.DISABLED,
    normalization=False,
    sources=ModelSource(hf="electroglyph/snowflake-arctic-embed-m-v2.0-ONNX-uint8"),
    dim=768,
    model_file="onnx/model.onnx",
)

model = TextEmbedding(model_name="snowflake-arctic-embed-m-v2.0-ONNX-uint8")
embeddings = list(model.embed("test"))
print(embeddings)
```

## License

Arctic is licensed under the [Apache-2.0 license](https://www.apache.org/licenses/LICENSE-2.0). The released models can be used for commercial purposes free of charge.