chankhavu committed
Commit b992180 · verified · 1 Parent(s): f01fc46

Fix hf_quant_config.json and config.json for SGLang/vLLM compatibility

Files changed (2)
  1. config.json +68 -128
  2. hf_quant_config.json +48 -20
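
As a smoke test for the compatibility this commit targets, something like the following should load the checkpoint in vLLM. This is a sketch, not part of the commit: `ORG/MODEL` is a placeholder for this repo's id, and it assumes a vLLM build recent enough to pick up the ModelOpt NVFP4 settings from hf_quant_config.json on its own.

# Sketch only: load-and-generate smoke test in vLLM.
from vllm import LLM, SamplingParams

llm = LLM(
    model="ORG/MODEL",       # placeholder; substitute this repo's actual id or a local path
    trust_remote_code=True,  # config.json's auto_map points at custom NemotronH code
)
out = llm.generate(["Hello"], SamplingParams(max_tokens=8))
print(out[0].outputs[0].text)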
config.json CHANGED
@@ -1,130 +1,70 @@
  {
- "architectures": [
- "NemotronHForCausalLM"
- ],
- "attention_bias": false,
- "attention_dropout": 0.0,
- "auto_map": {
- "AutoConfig": "configuration_nemotron_h.NemotronHConfig",
- "AutoModel": "modeling_nemotron_h.NemotronHForCausalLM",
- "AutoModelForCausalLM": "modeling_nemotron_h.NemotronHForCausalLM"
- },
- "bos_token_id": 1,
- "chunk_size": 128,
- "conv_kernel": 4,
- "dtype": "bfloat16",
- "eos_token_id": 11,
- "expand": 2,
- "head_dim": 128,
- "hidden_dropout": 0.0,
- "hidden_size": 2688,
- "hybrid_override_pattern": "MEMEM*EMEMEM*EMEMEM*EMEMEM*EMEMEM*EMEMEMEM*EMEMEMEME",
- "initializer_range": 0.02,
- "intermediate_size": 1856,
- "layer_norm_epsilon": 1e-05,
- "mamba_head_dim": 64,
- "mamba_hidden_act": "silu",
- "mamba_num_heads": 64,
- "mamba_proj_bias": false,
- "max_position_embeddings": 262144,
- "mlp_bias": false,
- "mlp_hidden_act": "relu2",
- "model_type": "nemotron_h",
- "moe_intermediate_size": 1856,
- "moe_shared_expert_intermediate_size": 3712,
- "n_group": 1,
- "n_groups": 8,
- "n_routed_experts": 128,
- "n_shared_experts": 1,
- "norm_eps": 1e-05,
- "norm_topk_prob": true,
- "num_attention_heads": 32,
- "num_experts_per_tok": 6,
- "num_hidden_layers": 52,
- "num_key_value_heads": 2,
- "num_logits_to_keep": 1,
- "pad_token_id": 0,
- "partial_rotary_factor": 1.0,
- "rescale_prenorm_residual": true,
- "residual_in_fp32": false,
- "rope_theta": 10000,
- "routed_scaling_factor": 2.5,
- "sliding_window": null,
- "ssm_state_size": 128,
- "tie_word_embeddings": false,
- "time_step_floor": 0.0001,
- "time_step_limit": [
- 0.0,
- Infinity
- ],
- "time_step_max": 0.1,
- "time_step_min": 0.001,
- "topk_group": 1,
- "transformers_version": "4.57.6",
- "use_bias": false,
- "use_cache": true,
- "use_conv_bias": true,
- "use_mamba_kernels": true,
- "vocab_size": 131072,
- "quantization_config": {
- "config_groups": {
- "group_0": {
- "input_activations": {
- "dynamic": false,
- "num_bits": 4,
- "type": "float",
- "group_size": 16
- },
- "weights": {
- "dynamic": false,
- "num_bits": 4,
- "type": "float",
- "group_size": 16
- },
- "targets": [
- "Linear"
- ]
- }
- },
- "ignore": [
- "backbone.layers.0.mixer.conv1d",
- "backbone.layers.11*",
- "backbone.layers.12*",
- "backbone.layers.14.mixer.conv1d",
- "backbone.layers.16.mixer.conv1d",
- "backbone.layers.18*",
- "backbone.layers.19*",
- "backbone.layers.2.mixer.conv1d",
- "backbone.layers.21.mixer.conv1d",
- "backbone.layers.23.mixer.conv1d",
- "backbone.layers.25*",
- "backbone.layers.26*",
- "backbone.layers.28.mixer.conv1d",
- "backbone.layers.30.mixer.conv1d",
- "backbone.layers.32*",
- "backbone.layers.33*",
- "backbone.layers.35.mixer.conv1d",
- "backbone.layers.37.mixer.conv1d",
- "backbone.layers.39.mixer.conv1d",
- "backbone.layers.4",
- "backbone.layers.4.*",
- "backbone.layers.41*",
- "backbone.layers.42*",
- "backbone.layers.44.mixer.conv1d",
- "backbone.layers.46.mixer.conv1d",
- "backbone.layers.48.mixer.conv1d",
- "backbone.layers.5",
- "backbone.layers.5.*",
- "backbone.layers.50.mixer.conv1d",
- "backbone.layers.7.mixer.conv1d",
- "backbone.layers.9.mixer.conv1d",
- "lm_head"
- ],
- "quant_algo": "NVFP4",
- "producer": {
- "name": "modelopt",
- "version": "0.42.0"
- },
- "quant_method": "modelopt"
- }
  }
 
  {
+ "architectures": [
+ "NemotronHForCausalLM"
+ ],
+ "attention_bias": false,
+ "attention_dropout": 0.0,
+ "auto_map": {
+ "AutoConfig": "configuration_nemotron_h.NemotronHConfig",
+ "AutoModel": "modeling_nemotron_h.NemotronHForCausalLM",
+ "AutoModelForCausalLM": "modeling_nemotron_h.NemotronHForCausalLM"
+ },
+ "bos_token_id": 1,
+ "chunk_size": 128,
+ "conv_kernel": 4,
+ "eos_token_id": 11,
+ "expand": 2,
+ "head_dim": 128,
+ "hidden_dropout": 0.0,
+ "hidden_size": 2688,
+ "hybrid_override_pattern": "MEMEM*EMEMEM*EMEMEM*EMEMEM*EMEMEM*EMEMEMEM*EMEMEMEME",
+ "initializer_range": 0.02,
+ "intermediate_size": 1856,
+ "layer_norm_epsilon": 1e-05,
+ "mamba_head_dim": 64,
+ "mamba_hidden_act": "silu",
+ "mamba_num_heads": 64,
+ "mamba_proj_bias": false,
+ "max_position_embeddings": 262144,
+ "mlp_bias": false,
+ "mlp_hidden_act": "relu2",
+ "model_type": "nemotron_h",
+ "moe_intermediate_size": 1856,
+ "moe_shared_expert_intermediate_size": 3712,
+ "n_group": 1,
+ "n_groups": 8,
+ "n_routed_experts": 128,
+ "n_shared_experts": 1,
+ "norm_eps": 1e-05,
+ "norm_topk_prob": true,
+ "num_attention_heads": 32,
+ "num_experts_per_tok": 6,
+ "num_hidden_layers": 52,
+ "num_key_value_heads": 2,
+ "num_logits_to_keep": 1,
+ "pad_token_id": 0,
+ "partial_rotary_factor": 1.0,
+ "rescale_prenorm_residual": true,
+ "residual_in_fp32": false,
+ "rope_theta": 10000,
+ "routed_scaling_factor": 2.5,
+ "sliding_window": null,
+ "ssm_state_size": 128,
+ "tie_word_embeddings": false,
+ "time_step_floor": 0.0001,
+ "time_step_limit": [
+ 0.0,
+ Infinity
+ ],
+ "time_step_max": 0.1,
+ "time_step_min": 0.001,
+ "topk_group": 1,
+ "transformers_version": "4.57.6",
+ "use_bias": false,
+ "use_cache": true,
+ "use_conv_bias": true,
+ "use_mamba_kernels": true,
+ "vocab_size": 131072,
+ "torch_dtype": "bfloat16",
+ "mamba_ssm_cache_dtype": "float32"
  }
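
Net effect on config.json: the embedded quantization_config block is removed (the quantization description now lives solely in hf_quant_config.json, which the SGLang/vLLM ModelOpt path reads), the dtype key is renamed to the torch_dtype key those engines expect, and mamba_ssm_cache_dtype is pinned to float32. A minimal consistency check over the two files, assuming they sit in the current directory:

# Sketch: verify the cleaned-up configs agree with this commit.
import json
from pathlib import Path

cfg = json.loads(Path("config.json").read_text())
quant = json.loads(Path("hf_quant_config.json").read_text())

assert "quantization_config" not in cfg           # moved out of config.json
assert cfg["torch_dtype"] == "bfloat16"           # renamed from "dtype"
assert cfg["mamba_ssm_cache_dtype"] == "float32"  # newly added
assert quant["quantization"]["quant_algo"] == "NVFP4"
assert quant["quantization"]["kv_cache_quant_algo"] == "FP8"
print("configs look consistent")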
hf_quant_config.json CHANGED
@@ -5,41 +5,69 @@
  },
  "quantization": {
  "quant_algo": "NVFP4",
- "kv_cache_quant_algo": null,
  "group_size": 16,
  "exclude_modules": [
  "backbone.layers.0.mixer.conv1d",
- "backbone.layers.11*",
- "backbone.layers.12*",
  "backbone.layers.14.mixer.conv1d",
  "backbone.layers.16.mixer.conv1d",
- "backbone.layers.18*",
- "backbone.layers.19*",
- "backbone.layers.2.mixer.conv1d",
  "backbone.layers.21.mixer.conv1d",
  "backbone.layers.23.mixer.conv1d",
- "backbone.layers.25*",
- "backbone.layers.26*",
  "backbone.layers.28.mixer.conv1d",
  "backbone.layers.30.mixer.conv1d",
- "backbone.layers.32*",
- "backbone.layers.33*",
  "backbone.layers.35.mixer.conv1d",
  "backbone.layers.37.mixer.conv1d",
  "backbone.layers.39.mixer.conv1d",
- "backbone.layers.4",
- "backbone.layers.4.*",
- "backbone.layers.41*",
- "backbone.layers.42*",
  "backbone.layers.44.mixer.conv1d",
  "backbone.layers.46.mixer.conv1d",
  "backbone.layers.48.mixer.conv1d",
- "backbone.layers.5",
- "backbone.layers.5.*",
- "backbone.layers.50.mixer.conv1d",
- "backbone.layers.7.mixer.conv1d",
- "backbone.layers.9.mixer.conv1d",
- "lm_head"
  ]
  }
  }
 
  },
  "quantization": {
  "quant_algo": "NVFP4",
+ "kv_cache_quant_algo": "FP8",
  "group_size": 16,
  "exclude_modules": [
+ "lm_head",
+ "backbone.layers.5.mixer.q_proj",
+ "backbone.layers.5.mixer.k_proj",
+ "backbone.layers.5.mixer.v_proj",
+ "backbone.layers.5.mixer.o_proj",
+ "backbone.layers.12.mixer.q_proj",
+ "backbone.layers.12.mixer.k_proj",
+ "backbone.layers.12.mixer.v_proj",
+ "backbone.layers.12.mixer.o_proj",
+ "backbone.layers.19.mixer.q_proj",
+ "backbone.layers.19.mixer.k_proj",
+ "backbone.layers.19.mixer.v_proj",
+ "backbone.layers.19.mixer.o_proj",
+ "backbone.layers.26.mixer.q_proj",
+ "backbone.layers.26.mixer.k_proj",
+ "backbone.layers.26.mixer.v_proj",
+ "backbone.layers.26.mixer.o_proj",
+ "backbone.layers.33.mixer.q_proj",
+ "backbone.layers.33.mixer.k_proj",
+ "backbone.layers.33.mixer.v_proj",
+ "backbone.layers.33.mixer.o_proj",
+ "backbone.layers.42.mixer.q_proj",
+ "backbone.layers.42.mixer.k_proj",
+ "backbone.layers.42.mixer.v_proj",
+ "backbone.layers.42.mixer.o_proj",
+ "backbone.layers.4.mixer.in_proj",
+ "backbone.layers.4.mixer.out_proj",
+ "backbone.layers.11.mixer.in_proj",
+ "backbone.layers.11.mixer.out_proj",
+ "backbone.layers.18.mixer.in_proj",
+ "backbone.layers.18.mixer.out_proj",
+ "backbone.layers.25.mixer.in_proj",
+ "backbone.layers.25.mixer.out_proj",
+ "backbone.layers.32.mixer.in_proj",
+ "backbone.layers.32.mixer.out_proj",
+ "backbone.layers.41.mixer.in_proj",
+ "backbone.layers.41.mixer.out_proj",
  "backbone.layers.0.mixer.conv1d",
+ "backbone.layers.2.mixer.conv1d",
+ "backbone.layers.4.mixer.conv1d",
+ "backbone.layers.7.mixer.conv1d",
+ "backbone.layers.9.mixer.conv1d",
+ "backbone.layers.11.mixer.conv1d",
  "backbone.layers.14.mixer.conv1d",
  "backbone.layers.16.mixer.conv1d",
+ "backbone.layers.18.mixer.conv1d",
  "backbone.layers.21.mixer.conv1d",
  "backbone.layers.23.mixer.conv1d",
+ "backbone.layers.25.mixer.conv1d",
  "backbone.layers.28.mixer.conv1d",
  "backbone.layers.30.mixer.conv1d",
+ "backbone.layers.32.mixer.conv1d",
  "backbone.layers.35.mixer.conv1d",
  "backbone.layers.37.mixer.conv1d",
  "backbone.layers.39.mixer.conv1d",
+ "backbone.layers.41.mixer.conv1d",
  "backbone.layers.44.mixer.conv1d",
  "backbone.layers.46.mixer.conv1d",
  "backbone.layers.48.mixer.conv1d",
+ "backbone.layers.50.mixer.conv1d"
  ]
  }
  }
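
The exclude_modules rewrite replaces glob-style patterns such as "backbone.layers.11*" with the exact submodule names they were meant to cover; engines that compare exclude entries by exact string match would otherwise quantize modules the patterns intended to skip. A sketch of that expansion, assuming the old patterns carried fnmatch-style glob semantics (the module list is a toy stand-in for iterating model.named_modules()):

# Sketch: expand glob-style exclude patterns into explicit module names.
import fnmatch

def expand_excludes(patterns, module_names):
    """Return the exact module names matched by any glob pattern."""
    return [name for name in module_names
            if any(fnmatch.fnmatch(name, p) for p in patterns)]

# Toy stand-in for the real module list.
modules = [
    "backbone.layers.11.mixer.in_proj",
    "backbone.layers.11.mixer.out_proj",
    "backbone.layers.12.mixer.q_proj",
    "backbone.layers.12.mixer.o_proj",
    "backbone.layers.14.mixer.conv1d",
]
print(expand_excludes(["backbone.layers.11*", "backbone.layers.12*"], modules))
# -> the four explicit layer-11/12 projection names, mirroring the new list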