from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor import oneshot

# Select quantization algorithm. In this case, we:
# * apply SmoothQuant to make the activations easier to quantize
# * quantize the weights to int8 with GPTQ (static per channel)
# * quantize the activations to int8 (dynamic per token)
recipe = [
    SmoothQuantModifier(smoothing_strength=0.8),
    GPTQModifier(scheme="W8A8", targets="Linear", ignore=["lm_head"]),
]

# Apply quantization using the built-in open_platypus dataset.
# * See examples for demos showing how to pass a custom calibration set
oneshot(
    model="facebook/contriever",
    dataset="open_platypus",
    recipe=recipe,
    output_dir="contriever-INT8",
    max_seq_length=2048,
    num_calibration_samples=512,
)
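
# oneshot() writes the compressed checkpoint to output_dir in a standard
# Hugging Face layout. Below is a minimal sketch of loading it back for
# inference with vLLM; it assumes vLLM is installed, that it supports this
# model architecture, and that the directory name matches output_dir above.
from vllm import LLM

# Point vLLM at the directory oneshot() just wrote.
quantized = LLM("contriever-INT8")

# Quick smoke test against the quantized weights.
outputs = quantized.generate("My name is")
print(outputs[0].outputs[0].text)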