olive-recipes/Qwen-Qwen2.5-1.5B-Instruct/QNN/config.json at main · microsoft/olive-recipes · GitHub

{
    "input_model": { "type": "HfModel", "model_path": "Qwen/Qwen2.5-1.5B-Instruct" },
    "systems": {
        "qnn_system": {
            "type": "PythonEnvironment",
            "python_environment_path": "/path/to/qnn/env/bin",
            "accelerators": [ { "execution_providers": [ "QNNExecutionProvider" ] } ]
        }
    },
    "data_configs": [
        {
            "name": "wikitext2_train_joined",
            "type": "HuggingfaceContainer",
            "load_dataset_config": { "data_name": "wikitext", "subset": "wikitext-2-raw-v1", "split": "train" },
            "pre_process_data_config": {
                "strategy": "join",
                "add_special_tokens": false,
                "max_seq_len": 4096,
                "max_samples": 128
            }
        },
        {
            "name": "wikitext2_train_act",
            "type": "HuggingfaceContainer",
            "load_dataset_config": { "data_name": "wikitext", "subset": "wikitext-2-raw-v1", "split": "train" },
            "pre_process_data_config": {
                "strategy": "line-by-line",
                "add_special_tokens": true,
                "max_samples": 256,
                "max_seq_len": 4096
            }
        }
    ],
    "passes": {
        "cs": { "type": "CaptureSplitInfo", "num_splits": 1, "unique_embeds_lm_head_splits": true },
        "g": {
            "type": "GptqModel",
            "bits": 4,
            "sym": true,
            "group_size": -1,
            "lm_head": true,
            "rotation": "hadamard",
            "device": "cuda",
            "data_config": "wikitext2_train_joined",
            "dynamic": {
                "+:.*lm_head*": { "bits": 8, "sym": true, "group_size": 32, "desc_act": false }
            }
        },
        "mb": {
            "type": "ModelBuilder",
            "precision": "int4",
            "int4_block_size": 32,
            "int4_accuracy_level": 4,
            "int4_op_types_to_quantize": [ "Gather" ]
        },
        "mq": {
            "type": "MatMulNBitsToQDQ",
            "use_int4": true,
            "add_zero_point": true,
            "nodes_to_exclude": [ "/lm_head/MatMulNBits" ],
            "save_as_external_data": true
        },
        "gs": {
            "type": "GraphSurgeries",
            "surgeries": [
                { "surgeon": "RemoveRopeMultiCache" },
                { "surgeon": "AttentionMaskToSequenceLengths" },
                { "surgeon": "RemoveGidxFromMatMulNBits" },
                { "surgeon": "SimplifiedLayerNormToL2Norm" }
            ],
            "save_as_external_data": true
        },
        "sq": {
            "type": "OnnxStaticQuantization",
            "data_config": "wikitext2_train_act",
            "activation_type": "uint16",
            "precision": "uint8",
            "calibration_providers": [ "CUDAExecutionProvider" ],
            "quant_preprocess": true,
            "op_types_to_exclude": [ "GatherBlockQuantized", "GroupQueryAttention", "MatMulNBits" ],
            "save_as_external_data": true
        },
        "sp": { "type": "SplitModel" },
        "st": { "type": "StaticLLM", "batch_size": 1, "context_length": 64 },
        "cb": {
            "type": "EPContextBinaryGenerator",
            "provider_options": {
                "htp_performance_mode": "burst",
                "htp_graph_finalization_optimization_mode": "3",
                "soc_model": "60"
            },
            "weight_sharing": true
        },
        "cp": { "type": "ComposeOnnxModels" }
    },
    "target": "qnn_system",
    "log_severity_level": 1,
    "output_dir": "models/qwen_2.5_1.5b_Instruct",
    "cache_dir": "cache",
    "no_artifacts": true
}