Update README.md
README.md
CHANGED
@@ -11,7 +11,7 @@ language:
 
 # INT4 google/gemma-3-12b-it model
 
-- **Developed by:**
+- **Developed by:** pytorch
 - **License:** apache-2.0
 - **Quantized from Model :** google/gemma-3-12b-it
 - **Quantization Method :** INT4
@@ -28,14 +28,14 @@ pip install torchao
 Then we can serve with the following command:
 ```Shell
 # Server
-export MODEL=
+export MODEL=pytorch/gemma-3-12b-it-INT4
 VLLM_DISABLE_COMPILE_CACHE=1 vllm serve $MODEL --tokenizer $MODEL -O3
 ```
 
 ```Shell
 # Client
 curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{
-"model": "
+"model": "pytorch/gemma-3-12b-it-INT4",
 "messages": [
 {"role": "user", "content": "Give me a short introduction to large language models."}
 ],
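
The same endpoint can also be exercised from Python. The sketch below is not part of the diff; it assumes the `openai` client package is installed and the vLLM server started above is listening on `localhost:8000`.

```python
# Minimal sketch: query the OpenAI-compatible endpoint exposed by the vLLM
# server above (assumes `pip install openai`; the api_key is unused by vLLM).
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="pytorch/gemma-3-12b-it-INT4",
    messages=[
        {"role": "user", "content": "Give me a short introduction to large language models."}
    ],
    max_tokens=256,
)
print(response.choices[0].message.content)
```
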
@@ -64,7 +64,7 @@ Example:
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
-model_name = "
+model_name = "pytorch/gemma-3-12b-it-INT4"
 
 # load the tokenizer and the model
 tokenizer = AutoTokenizer.from_pretrained(model_name)
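
The hunk above only covers the model id and tokenizer lines; the rest of the README's transformers example falls outside the diff context. As a rough, self-contained sketch of how such an example typically continues (standard transformers generation APIs, not necessarily the exact code in the file):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "pytorch/gemma-3-12b-it-INT4"

# load the tokenizer and the quantized model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name, device_map="auto", torch_dtype=torch.bfloat16
)

# build a chat-formatted prompt and generate a short reply
messages = [
    {"role": "user", "content": "Give me a short introduction to large language models."}
]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)
output = model.generate(input_ids, max_new_tokens=128)
print(tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True))
```
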
@@ -187,7 +187,7 @@ We rely on [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-h
 
 | Benchmark | | |
 |----------------------------------|----------------|---------------------------|
-| | google/gemma-3-12b-it |
+| | google/gemma-3-12b-it | pytorch/gemma-3-12b-it-INT4 |
 | mmlu | 71.51 | 68.96 |
 
 
@@ -204,7 +204,7 @@ lm_eval --model hf --model_args pretrained=google/gemma-3-12b-it --tasks mmlu --
 
 ## INT4
 ```Shell
-export MODEL=
+export MODEL=pytorch/gemma-3-12b-it-INT4
 lm_eval --model hf --model_args pretrained=$MODEL --tasks mmlu --device cuda:0 --batch_size 8
 ```
 </details>
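
The harness can also be driven from Python instead of the CLI. A rough equivalent of the command above, assuming lm-evaluation-harness v0.4+ (which exposes `lm_eval.simple_evaluate`); the exact layout of the returned results dict varies by version.

```python
# Rough Python equivalent of the lm_eval CLI call above (assumption:
# lm-evaluation-harness >= 0.4; result keys may differ across versions).
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=pytorch/gemma-3-12b-it-INT4",
    tasks=["mmlu"],
    batch_size=8,
    device="cuda:0",
)
print(results["results"]["mmlu"])
```
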
@@ -218,7 +218,7 @@ lm_eval --model hf --model_args pretrained=$MODEL --tasks mmlu --device cuda:0 -
 
 | Benchmark | | |
 |------------------|----------------|--------------------------------|
-| | google/gemma-3-12b-it |
+| | google/gemma-3-12b-it | pytorch/gemma-3-12b-it-INT4 |
 | Peak Memory (GB) | 24.50 | 8.68 (65% reduction) |
 
 
@@ -232,8 +232,8 @@ We can use the following code to get a sense of peak memory usage during inferen
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig
 
-# use "google/gemma-3-12b-it" or "
-model_id = "
+# use "google/gemma-3-12b-it" or "pytorch/gemma-3-12b-it-INT4"
+model_id = "pytorch/gemma-3-12b-it-INT4"
 quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16)
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 
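
Only the setup lines fall inside this hunk; the measurement itself (ending in the `print(f"Peak Memory Usage: ...")` visible in the next hunk header) sits outside the context. A self-contained sketch of the general approach, using torch.cuda's peak-memory counters (not necessarily the exact code in the file):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "pytorch/gemma-3-12b-it-INT4"
quantized_model = AutoModelForCausalLM.from_pretrained(
    model_id, device_map="auto", torch_dtype=torch.bfloat16
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# reset the peak-memory counter, run one generation, then read the high-water mark
torch.cuda.reset_peak_memory_stats()
inputs = tokenizer(
    "Give me a short introduction to large language models.", return_tensors="pt"
).to(quantized_model.device)
quantized_model.generate(**inputs, max_new_tokens=128)
mem = torch.cuda.max_memory_allocated() / 1024**3  # bytes -> GB
print(f"Peak Memory Usage: {mem:.02f} GB")
```
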
@@ -278,7 +278,7 @@ print(f"Peak Memory Usage: {mem:.02f} GB")
 ## Results (A100 machine)
 | Benchmark (Latency) | | |
 |----------------------------------|----------------|--------------------------|
-| | google/gemma-3-12b-it |
+| | google/gemma-3-12b-it | pytorch/gemma-3-12b-it-INT4 |
 | latency (batch_size=1) | 3.73s | 2.16s (1.73x speedup) |
 
 <details>
@@ -308,7 +308,7 @@ python benchmarks/benchmark_latency.py --input-len 256 --output-len 256 --model
 
 ### INT4
 ```Shell
-export MODEL=
+export MODEL=pytorch/gemma-3-12b-it-INT4
 VLLM_DISABLE_COMPILE_CACHE=1 python benchmarks/benchmark_latency.py --input-len 256 --output-len 256 --model $MODEL --batch-size 1
 ```
 </details>
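
For a quick spot check without the benchmark script, vLLM's offline `LLM` API can be timed directly. This is only a rough sketch (assumes vLLM is installed); it does not reproduce `benchmark_latency.py`, which controls warmup and input/output token counts.

```python
# Rough latency spot check via vLLM's offline API; not a replacement for
# benchmarks/benchmark_latency.py (no warmup, no fixed input length).
import time
from vllm import LLM, SamplingParams

llm = LLM(model="pytorch/gemma-3-12b-it-INT4")
params = SamplingParams(max_tokens=256, ignore_eos=True)

start = time.perf_counter()
llm.generate(["Give me a short introduction to large language models."], params)
print(f"end-to-end latency: {time.perf_counter() - start:.2f}s")
```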