import argparse

from datasets import load_dataset
from transformers import AutoTokenizer
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot
# Only needed if the commented-out quant_scheme / apply() blocks below are re-enabled:
# from llmcompressor.transformers import apply
# from compressed_tensors.quantization import QuantizationScheme, QuantizationArgs, QuantizationType, QuantizationStrategy


parser = argparse.ArgumentParser()
parser.add_argument('--model_path', type=str, default="/network/eldar/llama31_8b_gsm8k/sparse_transfer/sp2of4_2ep_lr7e-6_bs32_GradClip2_warmup20ba_noKD")
parser.add_argument('--quant_path', type=str, default="output_dir/quant_test")

parser.add_argument('--calib_size', type=int, default=256)  # candidate values: 32, 64, 128, 256, 512, 1024, 2048
parser.add_argument('--dampening_frac', type=float, default=0.01)  # candidate values: 0.001, 0.003, 0.005, 0.008, 0.01, 0.03, 0.05, 0.08, 0.1, 0.3
parser.add_argument('--observer', type=str, default="minmax")  # "mse" or "minmax"

# TODO: add args for SmoothQuant if needed
args = parser.parse_args()

print(f"[DEBUGGING ARGS] {args}")
# TODO: also ablate whether to append the EOS token to the calibration data


# SparseAutoModelForCausalLM is llm-compressor's wrapper around the HF causal
# LM loader that also handles sparse / compressed checkpoints.
model = SparseAutoModelForCausalLM.from_pretrained(
    args.model_path,
    device_map="auto",
    torch_dtype="auto",
    use_cache=False,
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(args.model_path)


NUM_CALIBRATION_SAMPLES = args.calib_size
DATASET_ID = "garage-bAInd/Open-Platypus"
DATASET_SPLIT = "train"
ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))

def preprocess(example):
    concat_txt = example["instruction"] + "\n" + example["output"]
    return {"text": concat_txt}

ds = ds.map(preprocess)
print(f"================================================================================")
print(f"[For debugging] Calibration data sample is:\n{repr(ds[0]['text'])}")
print(f"================================================================================")

def tokenize(sample):
    return tokenizer(
        sample["text"],
        padding=False,
        truncation=False,
        add_special_tokens=True,
    )
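# Note: truncation is disabled above, so calibration samples keep their full
# length. Whether oneshot's max_seq_length (set below) re-truncates an
# already-tokenized dataset depends on the llm-compressor version, so
# unusually long samples may pass through untrimmed; worth verifying.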


ds = ds.map(tokenize, remove_columns=ds.column_names)  # keep only tokenizer outputs (input_ids, attention_mask)
print(f"================================================================================")
print(f"[For debugging] Tokenized data sample is:\n{tokenizer.decode(ds[0]['input_ids'])}")
print(f"================================================================================")

# TODO: clarify why the calibration data needs extra preprocessing here when it arrives fully prepared
# TODO: confirm whether recipe_args are meant to override recipe params

# quant_scheme = QuantizationScheme(
#     targets=["Linear"],
#     weights=QuantizationArgs(
#         num_bits=args.num_bits,
#         type=QuantizationType.INT,
#         symmetric=True,
#         group_size=128,
#         strategy=QuantizationStrategy.GROUP,
#         observer=args.observer,
#         actorder=args.actorder,
#     ),
#     input_activations=None,
#     output_activations=None,
# )
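# NOTE: the disabled block above references args.num_bits and args.actorder,
# which the argparse section does not define. Re-enabling it would need
# something like the following (hypothetical names mirroring the usage above):
# parser.add_argument('--num_bits', type=int, default=4)
# parser.add_argument('--actorder', type=str, default=None)  # e.g. "group" / "weight"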

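# "W8A8" is llm-compressor's preset scheme for 8-bit integer weights and
# 8-bit activations (to the best of my knowledge it expands to per-channel
# weight scales with dynamic per-token activation scales; check the preset
# definitions of the installed version). The disabled quant_scheme above
# could replace the preset via the config_groups argument.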
recipe = [
    GPTQModifier(
        targets=["Linear"],
        ignore=["lm_head"],
        scheme="W8A8",
        dampening_frac=args.dampening_frac,
        observer=args.observer,
        #config_groups={"group_0": quant_scheme},
    )
]
oneshot(
    model=model,
    dataset=ds,
    # recipe="w4a16_recipe.yaml",
    recipe=recipe,
    # recipe_args={
    #     "quant_stage.quant_modifiers.GPTQModifier.dampening_frac": 100,
    #     "quant_stage.quant_modifiers.GPTQModifier.sequential_update": args.sequential_update,
    #     "quant_stage.quant_modifiers.GPTQModifier.config_groups.group_0.weights.observer": args.observer,
    #     "quant_stage.quant_modifiers.GPTQModifier.config_groups.group_0.weights.actorder": args.actorder,
    # },
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    max_seq_length=8192,
)
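
# Optional sanity check (a minimal sketch added here, not part of the original
# recipe): one short greedy generation on the freshly quantized model. The
# prompt is an arbitrary placeholder.
sample = tokenizer("What is 12 * 7? Think step by step.", return_tensors="pt").to(model.device)
out = model.generate(**sample, max_new_tokens=64)
print(f"[Sanity check] {tokenizer.decode(out[0], skip_special_tokens=True)}")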


# Reference only: the lower-level apply() entrypoint, copied from a class
# method elsewhere (note the `self.` attributes); it will not run as-is here.
# apply(
#     recipe=self.recipe,
#     # recipe_stage=None,
#     recipe_args=self.recipe_args,
#     model=self.model,
#     calib_data=calibration_data,  # dataloader
#     start=-1,
#     copy_data=False,
#     accelerator=self.accelerator,  # some accelerator object
#     min_tokens_per_module=None,
# )

# Save the quantized model and tokenizer to disk in compressed format.
SAVE_DIR = args.quant_path
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
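
# Reload sketch (assumption: a save_compressed checkpoint loads back through
# the same SparseAutoModelForCausalLM entrypoint; verify against the installed
# llm-compressor version before relying on this):
# reloaded = SparseAutoModelForCausalLM.from_pretrained(
#     SAVE_DIR, device_map="auto", torch_dtype="auto"
# )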