tamarher committed on
Commit
68ba6d3
·
verified ·
1 Parent(s): 874570a

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. mlx-int8/config.json +279 -0
  2. mlx-int8/model.safetensors +3 -0
mlx-int8/config.json ADDED
@@ -0,0 +1,279 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "moss-audio-tokenizer",
3
+ "version": "4.26.1.a",
4
+ "sampling_rate": 24000,
5
+ "sample_rate": 24000,
6
+ "downsample_rate": 1920,
7
+ "causal_transformer_context_duration": 10.0,
8
+ "encoder_kwargs": [
9
+ {
10
+ "module_type": "PatchedPretransform",
11
+ "patch_size": 240
12
+ },
13
+ {
14
+ "module_type": "Transformer",
15
+ "causal": true,
16
+ "conv_layout": true,
17
+ "d_model": 768,
18
+ "dim_feedforward": 3072,
19
+ "gating": "none",
20
+ "input_dimension": 240,
21
+ "layer_scale": 0.01,
22
+ "max_period": 10000,
23
+ "norm": "layer_norm",
24
+ "num_heads": 12,
25
+ "num_layers": 12,
26
+ "output_dimension": 384,
27
+ "positional_embedding": "rope"
28
+ },
29
+ {
30
+ "module_type": "PatchedPretransform",
31
+ "patch_size": 2
32
+ },
33
+ {
34
+ "module_type": "Transformer",
35
+ "causal": true,
36
+ "conv_layout": true,
37
+ "d_model": 768,
38
+ "dim_feedforward": 3072,
39
+ "gating": "none",
40
+ "input_dimension": 768,
41
+ "layer_scale": 0.01,
42
+ "max_period": 10000,
43
+ "norm": "layer_norm",
44
+ "num_heads": 12,
45
+ "num_layers": 12,
46
+ "output_dimension": 384,
47
+ "positional_embedding": "rope"
48
+ },
49
+ {
50
+ "module_type": "PatchedPretransform",
51
+ "patch_size": 2
52
+ },
53
+ {
54
+ "module_type": "Transformer",
55
+ "causal": true,
56
+ "conv_layout": true,
57
+ "d_model": 768,
58
+ "dim_feedforward": 3072,
59
+ "gating": "none",
60
+ "input_dimension": 768,
61
+ "layer_scale": 0.01,
62
+ "max_period": 10000,
63
+ "norm": "layer_norm",
64
+ "num_heads": 12,
65
+ "num_layers": 12,
66
+ "output_dimension": 640,
67
+ "positional_embedding": "rope"
68
+ },
69
+ {
70
+ "module_type": "PatchedPretransform",
71
+ "patch_size": 2
72
+ },
73
+ {
74
+ "module_type": "Transformer",
75
+ "causal": true,
76
+ "conv_layout": true,
77
+ "d_model": 1280,
78
+ "dim_feedforward": 5120,
79
+ "gating": "none",
80
+ "input_dimension": 1280,
81
+ "layer_scale": 0.01,
82
+ "max_period": 10000,
83
+ "norm": "layer_norm",
84
+ "num_heads": 20,
85
+ "num_layers": 32,
86
+ "output_dimension": 768,
87
+ "positional_embedding": "rope"
88
+ }
89
+ ],
90
+ "decoder_kwargs": [
91
+ {
92
+ "module_type": "Transformer",
93
+ "causal": true,
94
+ "conv_layout": true,
95
+ "d_model": 1280,
96
+ "dim_feedforward": 5120,
97
+ "gating": "none",
98
+ "input_dimension": 768,
99
+ "layer_scale": 0.01,
100
+ "max_period": 10000,
101
+ "norm": "layer_norm",
102
+ "num_heads": 20,
103
+ "num_layers": 32,
104
+ "output_dimension": 1280,
105
+ "positional_embedding": "rope"
106
+ },
107
+ {
108
+ "module_type": "PatchedPretransform",
109
+ "patch_size": 2
110
+ },
111
+ {
112
+ "module_type": "Transformer",
113
+ "causal": true,
114
+ "conv_layout": true,
115
+ "d_model": 768,
116
+ "dim_feedforward": 3072,
117
+ "gating": "none",
118
+ "input_dimension": 640,
119
+ "layer_scale": 0.01,
120
+ "max_period": 10000,
121
+ "norm": "layer_norm",
122
+ "num_heads": 12,
123
+ "num_layers": 12,
124
+ "output_dimension": 768,
125
+ "positional_embedding": "rope"
126
+ },
127
+ {
128
+ "module_type": "PatchedPretransform",
129
+ "patch_size": 2
130
+ },
131
+ {
132
+ "module_type": "Transformer",
133
+ "causal": true,
134
+ "conv_layout": true,
135
+ "d_model": 768,
136
+ "dim_feedforward": 3072,
137
+ "gating": "none",
138
+ "input_dimension": 384,
139
+ "layer_scale": 0.01,
140
+ "max_period": 10000,
141
+ "norm": "layer_norm",
142
+ "num_heads": 12,
143
+ "num_layers": 12,
144
+ "output_dimension": 768,
145
+ "positional_embedding": "rope"
146
+ },
147
+ {
148
+ "module_type": "PatchedPretransform",
149
+ "patch_size": 2
150
+ },
151
+ {
152
+ "module_type": "Transformer",
153
+ "causal": true,
154
+ "conv_layout": true,
155
+ "d_model": 768,
156
+ "dim_feedforward": 3072,
157
+ "gating": "none",
158
+ "input_dimension": 384,
159
+ "layer_scale": 0.01,
160
+ "max_period": 10000,
161
+ "norm": "layer_norm",
162
+ "num_heads": 12,
163
+ "num_layers": 12,
164
+ "output_dimension": 240,
165
+ "positional_embedding": "rope"
166
+ },
167
+ {
168
+ "module_type": "PatchedPretransform",
169
+ "patch_size": 240
170
+ }
171
+ ],
172
+ "quantizer_type": "rlfq",
173
+ "quantizer_kwargs": {
174
+ "input_dim": 768,
175
+ "rvq_dim": 512,
176
+ "output_dim": 768,
177
+ "num_quantizers": 32,
178
+ "codebook_size": 1024,
179
+ "codebook_dim": 8,
180
+ "quantizer_type": "rlfq"
181
+ },
182
+ "architectures": [
183
+ "MossAudioTokenizerModel"
184
+ ],
185
+ "auto_map": {
186
+ "AutoConfig": "configuration_moss_audio_tokenizer.MossAudioTokenizerConfig",
187
+ "AutoModel": "modeling_moss_audio_tokenizer.MossAudioTokenizerModel"
188
+ },
189
+ "code_dim": 768,
190
+ "dtype": "float32",
191
+ "reversed_decoder_kwargs": [
192
+ {
193
+ "module_type": "PatchedPretransform",
194
+ "patch_size": 240
195
+ },
196
+ {
197
+ "causal": true,
198
+ "conv_layout": true,
199
+ "d_model": 768,
200
+ "dim_feedforward": 3072,
201
+ "gating": "none",
202
+ "input_dimension": 240,
203
+ "layer_scale": 0.01,
204
+ "max_period": 10000,
205
+ "module_type": "Transformer",
206
+ "norm": "layer_norm",
207
+ "num_heads": 12,
208
+ "num_layers": 12,
209
+ "output_dimension": 384,
210
+ "positional_embedding": "rope"
211
+ },
212
+ {
213
+ "module_type": "PatchedPretransform",
214
+ "patch_size": 2
215
+ },
216
+ {
217
+ "causal": true,
218
+ "conv_layout": true,
219
+ "d_model": 768,
220
+ "dim_feedforward": 3072,
221
+ "gating": "none",
222
+ "input_dimension": 768,
223
+ "layer_scale": 0.01,
224
+ "max_period": 10000,
225
+ "module_type": "Transformer",
226
+ "norm": "layer_norm",
227
+ "num_heads": 12,
228
+ "num_layers": 12,
229
+ "output_dimension": 384,
230
+ "positional_embedding": "rope"
231
+ },
232
+ {
233
+ "module_type": "PatchedPretransform",
234
+ "patch_size": 2
235
+ },
236
+ {
237
+ "causal": true,
238
+ "conv_layout": true,
239
+ "d_model": 768,
240
+ "dim_feedforward": 3072,
241
+ "gating": "none",
242
+ "input_dimension": 768,
243
+ "layer_scale": 0.01,
244
+ "max_period": 10000,
245
+ "module_type": "Transformer",
246
+ "norm": "layer_norm",
247
+ "num_heads": 12,
248
+ "num_layers": 12,
249
+ "output_dimension": 640,
250
+ "positional_embedding": "rope"
251
+ },
252
+ {
253
+ "module_type": "PatchedPretransform",
254
+ "patch_size": 2
255
+ },
256
+ {
257
+ "causal": true,
258
+ "conv_layout": true,
259
+ "d_model": 1280,
260
+ "dim_feedforward": 5120,
261
+ "gating": "none",
262
+ "input_dimension": 1280,
263
+ "layer_scale": 0.01,
264
+ "max_period": 10000,
265
+ "module_type": "Transformer",
266
+ "norm": "layer_norm",
267
+ "num_heads": 20,
268
+ "num_layers": 32,
269
+ "output_dimension": 768,
270
+ "positional_embedding": "rope"
271
+ }
272
+ ],
273
+ "transformers_version": "4.56.0.dev0",
274
+ "quantization": {
275
+ "bits": 8,
276
+ "group_size": 64,
277
+ "mode": "affine"
278
+ }
279
+ }
mlx-int8/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de45810416fbc67a444e9b1afdbb41569a13669f7942af2f011b26e918767995
3
+ size 2003407408