tamarher committed on
Commit
68ba6d3
·
verified ·
1 Parent(s): 874570a

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. mlx-int8/config.json +279 -0
  2. mlx-int8/model.safetensors +3 -0
mlx-int8/config.json ADDED
@@ -0,0 +1,279 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "moss-audio-tokenizer",
3
+ "version": "4.26.1.a",
4
+ "sampling_rate": 24000,
5
+ "sample_rate": 24000,
6
+ "downsample_rate": 1920,
7
+ "causal_transformer_context_duration": 10.0,
8
+ "encoder_kwargs": [
9
+ {
10
+ "module_type": "PatchedPretransform",
11
+ "patch_size": 240
12
+ },
13
+ {
14
+ "module_type": "Transformer",
15
+ "causal": true,
16
+ "conv_layout": true,
17
+ "d_model": 768,
18
+ "dim_feedforward": 3072,
19
+ "gating": "none",
20
+ "input_dimension": 240,
21
+ "layer_scale": 0.01,
22
+ "max_period": 10000,
23
+ "norm": "layer_norm",
24
+ "num_heads": 12,
25
+ "num_layers": 12,
26
+ "output_dimension": 384,
27
+ "positional_embedding": "rope"
28
+ },
29
+ {
30
+ "module_type": "PatchedPretransform",
31
+ "patch_size": 2
32
+ },
33
+ {
34
+ "module_type": "Transformer",
35
+ "causal": true,
36
+ "conv_layout": true,
37
+ "d_model": 768,
38
+ "dim_feedforward": 3072,
39
+ "gating": "none",
40
+ "input_dimension": 768,
41
+ "layer_scale": 0.01,
42
+ "max_period": 10000,
43
+ "norm": "layer_norm",
44
+ "num_heads": 12,
45
+ "num_layers": 12,
46
+ "output_dimension": 384,
47
+ "positional_embedding": "rope"
48
+ },
49
+ {
50
+ "module_type": "PatchedPretransform",
51
+ "patch_size": 2
52
+ },
53
+ {
54
+ "module_type": "Transformer",
55
+ "causal": true,
56
+ "conv_layout": true,
57
+ "d_model": 768,
58
+ "dim_feedforward": 3072,
59
+ "gating": "none",
60
+ "input_dimension": 768,
61
+ "layer_scale": 0.01,
62
+ "max_period": 10000,
63
+ "norm": "layer_norm",
64
+ "num_heads": 12,
65
+ "num_layers": 12,
66
+ "output_dimension": 640,
67
+ "positional_embedding": "rope"
68
+ },
69
+ {
70
+ "module_type": "PatchedPretransform",
71
+ "patch_size": 2
72
+ },
73
+ {
74
+ "module_type": "Transformer",
75
+ "causal": true,
76
+ "conv_layout": true,
77
+ "d_model": 1280,
78
+ "dim_feedforward": 5120,
79
+ "gating": "none",
80
+ "input_dimension": 1280,
81
+ "layer_scale": 0.01,
82
+ "max_period": 10000,
83
+ "norm": "layer_norm",
84
+ "num_heads": 20,
85
+ "num_layers": 32,
86
+ "output_dimension": 768,
87
+ "positional_embedding": "rope"
88
+ }
89
+ ],
90
+ "decoder_kwargs": [
91
+ {
92
+ "module_type": "Transformer",
93
+ "causal": true,
94
+ "conv_layout": true,
95
+ "d_model": 1280,
96
+ "dim_feedforward": 5120,
97
+ "gating": "none",
98
+ "input_dimension": 768,
99
+ "layer_scale": 0.01,
100
+ "max_period": 10000,
101
+ "norm": "layer_norm",
102
+ "num_heads": 20,
103
+ "num_layers": 32,
104
+ "output_dimension": 1280,
105
+ "positional_embedding": "rope"
106
+ },
107
+ {
108
+ "module_type": "PatchedPretransform",
109
+ "patch_size": 2
110
+ },
111
+ {
112
+ "module_type": "Transformer",
113
+ "causal": true,
114
+ "conv_layout": true,
115
+ "d_model": 768,
116
+ "dim_feedforward": 3072,
117
+ "gating": "none",
118
+ "input_dimension": 640,
119
+ "layer_scale": 0.01,
120
+ "max_period": 10000,
121
+ "norm": "layer_norm",
122
+ "num_heads": 12,
123
+ "num_layers": 12,
124
+ "output_dimension": 768,
125
+ "positional_embedding": "rope"
126
+ },
127
+ {
128
+ "module_type": "PatchedPretransform",
129
+ "patch_size": 2
130
+ },
131
+ {
132
+ "module_type": "Transformer",
133
+ "causal": true,
134
+ "conv_layout": true,
135
+ "d_model": 768,
136
+ "dim_feedforward": 3072,
137
+ "gating": "none",
138
+ "input_dimension": 384,
139
+ "layer_scale": 0.01,
140
+ "max_period": 10000,
141
+ "norm": "layer_norm",
142
+ "num_heads": 12,
143
+ "num_layers": 12,
144
+ "output_dimension": 768,
145
+ "positional_embedding": "rope"
146
+ },
147
+ {
148
+ "module_type": "PatchedPretransform",
149
+ "patch_size": 2
150
+ },
151
+ {
152
+ "module_type": "Transformer",
153
+ "causal": true,
154
+ "conv_layout": true,
155
+ "d_model": 768,
156
+ "dim_feedforward": 3072,
157
+ "gating": "none",
158
+ "input_dimension": 384,
159
+ "layer_scale": 0.01,
160
+ "max_period": 10000,
161
+ "norm": "layer_norm",
162
+ "num_heads": 12,
163
+ "num_layers": 12,
164
+ "output_dimension": 240,
165
+ "positional_embedding": "rope"
166
+ },
167
+ {
168
+ "module_type": "PatchedPretransform",
169
+ "patch_size": 240
170
+ }
171
+ ],
172
+ "quantizer_type": "rlfq",
173
+ "quantizer_kwargs": {
174
+ "input_dim": 768,
175
+ "rvq_dim": 512,
176
+ "output_dim": 768,
177
+ "num_quantizers": 32,
178
+ "codebook_size": 1024,
179
+ "codebook_dim": 8,
180
+ "quantizer_type": "rlfq"
181
+ },
182
+ "architectures": [
183
+ "MossAudioTokenizerModel"
184
+ ],
185
+ "auto_map": {
186
+ "AutoConfig": "configuration_moss_audio_tokenizer.MossAudioTokenizerConfig",
187
+ "AutoModel": "modeling_moss_audio_tokenizer.MossAudioTokenizerModel"
188
+ },
189
+ "code_dim": 768,
190
+ "dtype": "float32",
191
+ "reversed_decoder_kwargs": [
192
+ {
193
+ "module_type": "PatchedPretransform",
194
+ "patch_size": 240
195
+ },
196
+ {
197
+ "causal": true,
198
+ "conv_layout": true,
199
+ "d_model": 768,
200
+ "dim_feedforward": 3072,
201
+ "gating": "none",
202
+ "input_dimension": 240,
203
+ "layer_scale": 0.01,
204
+ "max_period": 10000,
205
+ "module_type": "Transformer",
206
+ "norm": "layer_norm",
207
+ "num_heads": 12,
208
+ "num_layers": 12,
209
+ "output_dimension": 384,
210
+ "positional_embedding": "rope"
211
+ },
212
+ {
213
+ "module_type": "PatchedPretransform",
214
+ "patch_size": 2
215
+ },
216
+ {
217
+ "causal": true,
218
+ "conv_layout": true,
219
+ "d_model": 768,
220
+ "dim_feedforward": 3072,
221
+ "gating": "none",
222
+ "input_dimension": 768,
223
+ "layer_scale": 0.01,
224
+ "max_period": 10000,
225
+ "module_type": "Transformer",
226
+ "norm": "layer_norm",
227
+ "num_heads": 12,
228
+ "num_layers": 12,
229
+ "output_dimension": 384,
230
+ "positional_embedding": "rope"
231
+ },
232
+ {
233
+ "module_type": "PatchedPretransform",
234
+ "patch_size": 2
235
+ },
236
+ {
237
+ "causal": true,
238
+ "conv_layout": true,
239
+ "d_model": 768,
240
+ "dim_feedforward": 3072,
241
+ "gating": "none",
242
+ "input_dimension": 768,
243
+ "layer_scale": 0.01,
244
+ "max_period": 10000,
245
+ "module_type": "Transformer",
246
+ "norm": "layer_norm",
247
+ "num_heads": 12,
248
+ "num_layers": 12,
249
+ "output_dimension": 640,
250
+ "positional_embedding": "rope"
251
+ },
252
+ {
253
+ "module_type": "PatchedPretransform",
254
+ "patch_size": 2
255
+ },
256
+ {
257
+ "causal": true,
258
+ "conv_layout": true,
259
+ "d_model": 1280,
260
+ "dim_feedforward": 5120,
261
+ "gating": "none",
262
+ "input_dimension": 1280,
263
+ "layer_scale": 0.01,
264
+ "max_period": 10000,
265
+ "module_type": "Transformer",
266
+ "norm": "layer_norm",
267
+ "num_heads": 20,
268
+ "num_layers": 32,
269
+ "output_dimension": 768,
270
+ "positional_embedding": "rope"
271
+ }
272
+ ],
273
+ "transformers_version": "4.56.0.dev0",
274
+ "quantization": {
275
+ "bits": 8,
276
+ "group_size": 64,
277
+ "mode": "affine"
278
+ }
279
+ }
mlx-int8/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de45810416fbc67a444e9b1afdbb41569a13669f7942af2f011b26e918767995
3
+ size 2003407408