Wenxuan Wang committed · Commit a2cb30f · 0 Parent(s) · Squash history
.gitattributes ADDED
@@ -0,0 +1,38 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ assets/MOSS-Music.png filter=lfs diff=lfs merge=lfs -text
+ assets/music_pipeline.png filter=lfs diff=lfs merge=lfs -text
+ assets/music_bench.png filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,424 @@
+ ---
+ license: apache-2.0
+ language:
+ - en
+ - zh
+ library_name: transformers
+ pipeline_tag: audio-text-to-text
+ tags:
+ - music
+ - music-understanding
+ - audio
+ - audio-language-model
+ - moss
+ - moss-music
+ - lyrics-asr
+ - music-captioning
+ - chord-recognition
+ ---
+
+ # MOSS-Music
+
+ <p align="center">
+   <img src="./assets/MOSS-Music.png" width="58%" alt="MOSS-Music logo" />
+ </p>
+
+ <div align="center">
+
+ <a href="https://huggingface.co/collections/OpenMOSS-Team/moss-music"><img src="https://img.shields.io/badge/Huggingface-Models-orange?logo=huggingface"></a>
+ <a href="https://modelscope.cn/collections/openmoss/MOSS-Music"><img src="https://img.shields.io/badge/ModelScope-Models-624AFF?logo=data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHZpZXdCb3g9IjAgMCAyNCAyNCI+PHBhdGggZmlsbD0id2hpdGUiIGQ9Ik0xMiAyQzYuNDggMiAyIDYuNDggMiAxMnM0LjQ4IDEwIDEwIDEwIDEwLTQuNDggMTAtMTBTMTcuNTIgMiAxMiAyeiIvPjwvc3ZnPg=="></a>
+ <img src="https://img.shields.io/badge/Blog-Coming_Soon-blue?logo=internet-explorer">
+ <img src="https://img.shields.io/badge/Arxiv-Coming_Soon-red?logo=Arxiv">
+
+ <a href="https://x.com/Open_MOSS"><img src="https://img.shields.io/badge/Twitter-Follow-black?logo=x"></a>
+ <a href="https://discord.gg/Xf3aXddCjc"><img src="https://img.shields.io/badge/Discord-Join-5865F2?logo=discord"></a>
+
+ </div>
+
+ **MOSS-Music** is an open-source **music understanding model** from
+ [MOSI.AI](https://mosi.cn/#hero), the [OpenMOSS team](https://www.open-moss.com/),
+ and [Shanghai Innovation Institute](https://www.sii.edu.cn/). Built on the same
+ audio backbone as [MOSS-Audio](https://github.com/OpenMOSS/MOSS-Audio),
+ MOSS-Music is further specialised on music via dedicated continual pre-training
+ and supervised fine-tuning — targeting **musical captioning, lyrics ASR,
+ structural analysis, chord / key / tempo reasoning, and long-form musical
+ question answering**. In this release, we provide **two 8B models**:
+ **MOSS-Music-8B-Instruct** and **MOSS-Music-8B-Thinking**. The Instruct variant
+ is optimised for direct instruction following on musical prompts, while the
+ Thinking variant provides stronger chain-of-thought reasoning for musical
+ analysis.
+
+ ## News
+
+ * 2026.04.20: 🎉🎉🎉 We have released [MOSS-Music](https://huggingface.co/collections/OpenMOSS-Team/moss-music).
+
+ ## Contents
+
+ - [Introduction](#introduction)
+ - [Model Architecture](#model-architecture)
+   - [DeepStack Cross-Layer Feature Injection](#deepstack-cross-layer-feature-injection)
+   - [Time-Aware Representation](#time-aware-representation)
+ - [Released Models](#released-models)
+ - [Music Data Pipeline](#music-data-pipeline)
+ - [Evaluation](#evaluation)
+ - [Quickstart](#quickstart)
+   - [Environment Setup](#environment-setup)
+   - [Basic Usage](#basic-usage)
+   - [Gradio App](#gradio-app)
+   - [SGLang Serving](#sglang-serving)
+ - [More Information](#more-information)
+ - [LICENSE](#license)
+ - [Citation](#citation)
+
+ ## Introduction
+
+ Music is not just audio plus lyrics: understanding it requires perceiving
+ harmonic structure, rhythm, timbre, instrumentation, performance nuance, and
+ the textual content of the lyrics, then reasoning about all of them jointly
+ across time. **MOSS-Music** is built to unify these capabilities within a
+ single model.
+
+ - **Lyrics ASR & time-aligned transcription**: Accurate singing ASR with
+ sentence- and word-level timestamps, robust to backing tracks.
+ - **Musical captioning & tagging**: Natural-language descriptions of mood,
+ genre, instrumentation, production style, and emotional trajectory.
+ - **Key / tempo / chord reasoning**: Identifies musical key, beats, downbeats,
+ and chord progressions, including timestamped chord transcription.
+ - **Structural analysis**: Segments a song into intro / verse / chorus /
+ bridge / outro and reasons about repetition and contrast.
+ - **Instrument & voice recognition**: Identifies prominent instruments and
+ singing voices (solo / chorus, gender, register).
+ - **Musical QA and long-form analysis**: Open-ended question answering
+ grounded in a full track, including chain-of-thought reasoning in the
+ *Thinking* variant.
+
+ ## Model Architecture
+
+ MOSS-Music inherits the MOSS-Audio modular design, comprising three
+ components: an audio encoder, a modality adapter, and a large language model.
+ Raw audio is first encoded by **MOSS-Audio-Encoder** into continuous temporal
+ representations at **12.5 Hz**, which are then projected into the language
+ model's embedding space through the adapter and finally consumed by the LLM
+ for auto-regressive text generation.
+
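+ For intuition, the adapter between the encoder and the LLM appears in the
+ released weight map as a gated MLP (`audio_adapter.gate_proj` / `up_proj` /
+ `down_proj`). Below is a minimal PyTorch sketch of that projection path,
+ assuming a SwiGLU-style gating and the dimensions from `config.json` (1280-d
+ encoder output, 8192-d adapter hidden, 4096-d LLM embeddings); it is an
+ illustration, not the released implementation:
+
+ ```python
+ import torch
+ import torch.nn.functional as F
+ from torch import nn
+
+ class AudioAdapterSketch(nn.Module):
+     """Illustrative gated-MLP adapter; the layer names mirror the weight
+     map, but the SwiGLU wiring itself is an assumption."""
+     def __init__(self, enc_dim=1280, hidden=8192, llm_dim=4096):
+         super().__init__()
+         self.gate_proj = nn.Linear(enc_dim, hidden, bias=False)
+         self.up_proj = nn.Linear(enc_dim, hidden, bias=False)
+         self.down_proj = nn.Linear(hidden, llm_dim, bias=False)
+
+     def forward(self, x):  # x: (batch, frames, enc_dim) at 12.5 Hz
+         return self.down_proj(F.silu(self.gate_proj(x)) * self.up_proj(x))
+
+ frames = torch.randn(1, 125, 1280)          # ~10 s of audio at 12.5 Hz
+ print(AudioAdapterSketch()(frames).shape)   # torch.Size([1, 125, 4096])
+ ```
+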
+ Rather than relying on off-the-shelf audio frontends, we train a dedicated
+ encoder from scratch to obtain more robust acoustic representations, tighter
+ temporal alignment, and better extensibility across musical styles, singing,
+ and non-speech acoustic content.
+
+ ### DeepStack Cross-Layer Feature Injection
+
+ Using only the encoder's top-layer features tends to lose low-level prosody,
+ transient events, and local time-frequency structure. To address this, we
+ adopt a **DeepStack**-inspired cross-layer injection module between the
+ encoder and the language model: in addition to the encoder's final-layer
+ output, features from earlier and intermediate layers are selected,
+ independently projected, and injected into the language model's early layers,
+ preserving multi-granularity information from low-level acoustic details to
+ high-level semantic abstractions.
+
+ This design is especially well-suited to music understanding, as it helps
+ retain rhythm, timbre, transients, and instrumental texture — information
+ that a single high-level representation cannot fully capture but that is
+ critical for chord recognition, structural analysis, and nuanced musical
+ description.
+
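+ Concretely, the shipped `config.json` taps encoder layers `[8, 16, 24]`
+ (`deepstack_encoder_layer_indexes`) and injects into the LLM's first three
+ layers (`deepstack_num_inject_layers`). A rough sketch of this injection
+ pattern, as an assumption-laden illustration rather than the actual module:
+
+ ```python
+ import torch
+ from torch import nn
+
+ class DeepStackInjectionSketch(nn.Module):
+     """Illustrative cross-layer injection following the shipped config:
+     taps at encoder layers 8/16/24, injected into LLM layers 0-2."""
+     def __init__(self, enc_dim=1280, llm_dim=4096, taps=(8, 16, 24)):
+         super().__init__()
+         self.taps = taps
+         # One independent projection per tapped encoder layer.
+         self.projs = nn.ModuleList(nn.Linear(enc_dim, llm_dim) for _ in taps)
+
+     def forward(self, enc_hidden_states, llm_hidden):
+         # enc_hidden_states: per-layer encoder outputs, each (B, T, enc_dim)
+         # llm_hidden: hidden states entering LLM layers 0..len(taps)-1
+         return [h + proj(enc_hidden_states[t])          # residual injection
+                 for h, proj, t in zip(llm_hidden, self.projs, self.taps)]
+
+ enc = [torch.randn(1, 125, 1280) for _ in range(33)]   # embeddings + 32 layers
+ llm = [torch.randn(1, 125, 4096) for _ in range(3)]
+ print(len(DeepStackInjectionSketch()(enc, llm)))        # 3
+ ```
+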
+ ### Time-Aware Representation
+
+ Time is a critical dimension in music understanding. To enhance explicit
+ temporal awareness, we adopt a **time-marker insertion** strategy during
+ pre-training: explicit time tokens are inserted between audio frame
+ representations at fixed time intervals to indicate temporal positions.
+ This design enables the model to learn "what happened when" within a unified
+ text generation framework, naturally supporting timestamped lyrics ASR,
+ beat / downbeat localisation, section boundary detection, and long-song
+ retrospective QA.
+
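+ A toy version of this interleaving, operating on embeddings for illustration
+ (the 12.5 Hz rate is from above; the 1 s marker interval and the single
+ marker vector are assumptions):
+
+ ```python
+ import torch
+
+ def insert_time_markers(frames, marker, frame_rate=12.5, interval_s=1.0):
+     """frames: (T, d) audio-frame embeddings; marker: (d,) time-marker
+     embedding. Prepends a marker to every `interval_s` span of frames."""
+     step = int(frame_rate * interval_s)   # frames per marker interval
+     chunks = []
+     for start in range(0, frames.size(0), step):
+         chunks.append(marker.unsqueeze(0))        # "t = start/frame_rate s"
+         chunks.append(frames[start:start + step])
+     return torch.cat(chunks, dim=0)
+
+ seq = insert_time_markers(torch.randn(125, 4096), torch.randn(4096))
+ print(seq.shape)   # 125 frames + 11 markers -> torch.Size([136, 4096])
+ ```
+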
+ On top of the MOSS-Audio backbone, MOSS-Music is:
+
+ - **continually pre-trained** on a large, diverse music corpus produced by
+ the standalone data pipeline repository
+ [`MOSS-Music-Data-Pipeline`](https://github.com/OpenMOSS/MOSS-Music-Data-Pipeline),
+ with an emphasis on singing, lyrics, and full-song coverage;
+ - **SFT-ed** on music-centric instruction data covering captioning, lyrics
+ ASR, chord / key / structural analysis, and long-form musical QA;
+ - further **reasoning-tuned** for the *Thinking* variant.
+
+ ## Released Models
+
+ | Model | Audio Encoder | LLM Backbone | Total Size | Hugging Face | ModelScope |
+ |---|---|---|---:|---|---|
+ | **MOSS‑Music‑8B‑Instruct** | MOSS-Audio-Encoder | Qwen3-8B | ~9.1B | [![Hugging Face](https://img.shields.io/badge/Huggingface-Model-orange?logo=huggingface)](https://huggingface.co/OpenMOSS-Team/MOSS-Music-8B-Instruct) | [![ModelScope](https://img.shields.io/badge/ModelScope-Model-624AFF)](https://modelscope.cn/models/openmoss/MOSS-Music-8B-Instruct) |
+ | **MOSS‑Music‑8B‑Thinking** | MOSS-Audio-Encoder | Qwen3-8B | ~9.1B | [![Hugging Face](https://img.shields.io/badge/Huggingface-Model-orange?logo=huggingface)](https://huggingface.co/OpenMOSS-Team/MOSS-Music-8B-Thinking) | [![ModelScope](https://img.shields.io/badge/ModelScope-Model-624AFF)](https://modelscope.cn/models/openmoss/MOSS-Music-8B-Thinking) |
+
+ > Smaller (4B) variants and additional sizes may follow. Stay tuned!
+
+ ## Music Data Pipeline
+
+ The training data used by MOSS-Music is produced by an end-to-end pipeline
+ that goes from raw audio to chat-formatted training samples. That pipeline is
+ maintained in the standalone repository
+ [`MOSS-Music-Data-Pipeline`](https://github.com/OpenMOSS/MOSS-Music-Data-Pipeline),
+ which hosts duration detection, MIR feature extraction, song-structure
+ segmentation, lyrics ASR, metadata cleanup, and ALM-driven caption / query
+ generation with models such as Qwen3-Omni, MusicFlamingo, and other
+ audio-language models.
+
+ <p align="center">
+   <img src="./assets/music_pipeline.png" width="94%" />
+ </p>
+
+ ## Evaluation
+
+ We evaluate MOSS-Music on a diverse suite of public music understanding
+ benchmarks. Key results:
+
+ - **Music QA and understanding**: **MOSS-Music-8B-Instruct** achieves **80.38**
+ average accuracy across **8 public music QA benchmarks** (excluding the
+ three NSynth note-recognition tracks), ranking first among all compared
+ models in our current evaluation set.
+ - **Music captioning**: In our preliminary **GPT-5.4-as-a-Judge** evaluation,
+ the MOSS-Music series leads both caption benchmarks, with
+ `MOSS-Music-8B-Thinking` reaching **4.53** on `MusicCaps` and
+ `MOSS-Music-8B-Instruct` reaching **4.58** on `SDD`.
+ - **Lyrics ASR for singing voice**: **MOSS-Music-8B-Thinking** achieves the
+ best average lyrics recognition error across `MUSDB18`, `MIR-1K` and
+ `Opencpop` (**15.88%** avg WER/CER), clearly ahead of all compared
+ audio-language baselines including `Gemini-3.1-Pro-Preview`,
+ `MusicFlamingo` and `Qwen3-Omni`. Detailed timestamped-ASR results will be
+ released in a later update.
+ - **Chord transcription**: MOSS-Music supports chord transcription, including
+ timestamped chord transcription for harmonic analysis, accompaniment
+ reference, and related downstream use cases. Detailed benchmark results will
+ be released in a later update.
+
+ <p align="center">
+   <img src="./assets/music_bench.png" width="98%" />
+ </p>
+
+ ### Music QA & Understanding (Accuracy↑)
+
+ | Model | MMAU-music | MMAU-mini-music | MMAU-Pro-music | MMAR-music | MuChoMusic | Music-AVQA | NSynth (instrument) | NSynth (source) | NSynth (pitch) | GTZAN | Medley-Solos-DB | Avg |
+ |-----|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
+ | **MOSS‑Music‑8B‑Instruct** | **79.33** | **80.78** | 71.02 | 59.70 | **89.39** | **76.78** | **86.55** | 61.07 | **86.94** | **93.59** | 92.42 | **80.38** |
+ | Gemini‑3.1‑Pro | 71.69 | 77.18 | **73.06** | **71.64** | 79.53 | 61.51 | 13.38 | 38.90 | 6.47 | 86.39 | 80.34 | 75.17 |
+ | **MOSS‑Music‑8B‑Thinking** | 74.09 | 77.78 | 67.98 | 50.25 | 82.90 | 68.90 | 56.17 | 57.48 | 77.83 | 84.78 | 87.42 | 74.26 |
+ | MusicFlamingo | 76.83 | 76.35 | 65.60 | 48.66 | 74.58 | 73.60 | 80.76 | **75.89** | 0.00 | 84.45 | 90.86 | 73.87 |
+ | Audio‑Flamingo‑Next | 72.39 | 72.07 | 61.64 | 45.27 | 75.62 | 62.94 | 86.40 | 66.73 | 0.05 | 77.68 | 91.47 | 69.89 |
+ | MiMo‑Audio‑7B‑Instruct | 66.36 | 72.97 | 66.50 | 45.77 | 75.40 | 57.05 | 25.01 | 1.49 | 4.86 | 65.67 | **93.81** | 67.94 |
+ | Step‑Audio‑R1 | 66.46 | 75.08 | 62.34 | 50.75 | 72.62 | 57.98 | 13.75 | 15.87 | 2.39 | 73.67 | 82.45 | 67.67 |
+ | Qwen3‑Omni | 65.76 | 68.77 | 66.27 | 48.54 | 78.77 | 56.05 | 30.92 | 44.30 | 28.08 | 80.15 | 69.65 | 66.75 |
+ | Kimi‑Audio‑7B‑Instruct | 47.95 | 52.25 | 59.10 | 45.27 | 70.18 | 68.90 | 6.01 | 0.81 | 3.88 | 39.54 | 71.98 | 56.90 |
+
+ > `Avg` is computed over 8 public music QA benchmarks:
+ > `MMAU-music`, `MMAU-mini-music`, `MMAU-Pro-music`, `MMAR-music`,
+ > `MuChoMusic`, `Music-AVQA`, `GTZAN`, and `Medley-Solos-DB`.
+ >
+ > We exclude the three `NSynth` tracks from the main average because they focus
+ > on fine-grained isolated-note recognition, including instrument-family,
+ > acoustic/electronic source, and exact pitch discrimination from short
+ > single-note clips. Some compared audio-language models are not explicitly
+ > designed for this note-level classification setting, so we report NSynth
+ > separately for reference rather than mixing it into the headline average.
+
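+ The headline `Avg` can be reproduced directly from a row of the table above;
+ for example, for MOSS-Music-8B-Instruct:
+
+ ```python
+ # Unweighted mean over the 8 headline benchmarks (NSynth tracks excluded).
+ scores = {
+     "MMAU-music": 79.33, "MMAU-mini-music": 80.78, "MMAU-Pro-music": 71.02,
+     "MMAR-music": 59.70, "MuChoMusic": 89.39, "Music-AVQA": 76.78,
+     "GTZAN": 93.59, "Medley-Solos-DB": 92.42,
+ }
+ print(round(sum(scores.values()) / len(scores), 2))   # 80.38
+ ```
+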
+ ### Music Captioning
+
+ We further report a preliminary **GPT-5.4-as-a-Judge** music captioning
+ comparison on `MusicCaps` and `Song Describer Dataset (SDD)`. Scores are on a
+ 1-5 scale across 9 dimensions: `genre/style`, `mood/affect`, `tempo/rhythm`,
+ `instrumentation/timbre`, `vocals`, `melody/harmony`, `structure/form`,
+ `production/audio quality`, and `scene/use case`.
+
+ - **Overall caption quality**: the MOSS-Music series remains strongest across
+ both caption benchmarks, with `MOSS-Music-8B-Thinking` reaching **4.53** on
+ `MusicCaps` and `MOSS-Music-8B-Instruct` reaching **4.58** on `SDD`.
+ - **Stronger structural descriptions**: MOSS-Music shows the clearest gains on
+ `structure / form / progression`, especially on `SDD`.
+ - **Competitive baselines on instrumentation and scene semantics**:
+ `MusicFlamingo` and `Gemini-3.1-Pro` remain competitive on
+ `instrumentation/timbre`, while `Gemini-3.1-Pro` is strongest on
+ `scene / use case`.
+
+ #### MusicCaps
+
+ | Model | Genre | Mood | Tempo | Instr. | Vocals | Melody/Harmony | Structure | Production | Scene | Avg |
+ |-----|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
+ | **MOSS‑Music‑8B‑Thinking** | 4.78 | **4.69** | **4.62** | 4.40 | **4.46** | **4.40** | **4.86** | 4.35 | 4.18 | **4.53** |
+ | Gemini‑3.1‑Pro | 4.70 | 4.60 | 4.48 | **4.68** | 4.18 | 4.18 | 3.86 | **4.40** | **4.72** | 4.42 |
+ | **MOSS‑Music‑8B‑Instruct** | 4.60 | 4.52 | 4.46 | 4.02 | 4.30 | 4.38 | 4.78 | 4.20 | 3.96 | 4.36 |
+ | MusicFlamingo | **4.80** | 4.36 | 4.50 | 4.64 | 3.94 | 4.08 | 3.58 | 4.30 | 3.72 | 4.21 |
+ | Audio‑Flamingo‑Next | 4.34 | 4.56 | 4.08 | 4.30 | 4.18 | 3.78 | 3.66 | 4.04 | 3.92 | 4.10 |
+ | MiMo‑Audio‑7B‑Instruct | 4.02 | 4.20 | 4.46 | 4.28 | 4.36 | 3.62 | 3.30 | 4.08 | 3.50 | 3.98 |
+ | Step‑Audio‑R1 | 4.22 | 4.02 | 4.20 | 3.96 | 3.84 | 4.02 | 3.24 | 4.10 | 3.54 | 3.90 |
+ | Qwen3‑Omni | 4.58 | 4.50 | 4.26 | 3.62 | 3.64 | 3.48 | 2.98 | 4.18 | 4.42 | 3.96 |
+ | Kimi‑Audio‑7B‑Instruct | 3.98 | 3.92 | 4.32 | 3.88 | 4.48 | 3.28 | 2.72 | 3.72 | 3.24 | 3.73 |
+
+ #### Song Describer Dataset (SDD)
+
+ | Model | Genre | Mood | Tempo | Instr. | Vocals | Melody/Harmony | Structure | Production | Scene | Avg |
+ |-----|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
+ | **MOSS‑Music‑8B‑Instruct** | **4.84** | **4.76** | **4.68** | 4.24 | **4.52** | **4.56** | **4.92** | 4.42 | 4.24 | **4.58** |
+ | Gemini‑3.1‑Pro | 4.72 | 4.64 | 4.52 | **4.72** | 4.22 | 4.24 | 3.94 | **4.46** | **4.82** | 4.48 |
+ | **MOSS‑Music‑8B‑Thinking** | 4.66 | 4.58 | 4.50 | 4.36 | 4.36 | 4.44 | 4.84 | 4.26 | 4.02 | 4.45 |
+ | MusicFlamingo | 4.82 | 4.40 | 4.52 | 4.70 | 3.98 | 4.14 | 3.66 | 4.36 | 3.80 | 4.26 |
+ | Audio‑Flamingo‑Next | 4.40 | 4.62 | 4.14 | 4.36 | 4.22 | 3.84 | 3.74 | 4.10 | 4.00 | 4.16 |
+ | MiMo‑Audio‑7B‑Instruct | 4.08 | 4.26 | 4.52 | 4.34 | 4.42 | 3.70 | 3.38 | 4.16 | 3.58 | 4.05 |
+ | Step‑Audio‑R1 | 4.30 | 4.10 | 4.26 | 4.02 | 3.92 | 4.10 | 3.32 | 4.18 | 3.62 | 3.98 |
+ | Qwen3‑Omni | 4.62 | 4.54 | 4.30 | 3.68 | 3.70 | 3.56 | 3.06 | 4.24 | 4.50 | 4.02 |
+ | Kimi‑Audio‑7B‑Instruct | 4.04 | 3.98 | 4.38 | 3.96 | 4.54 | 3.36 | 2.80 | 3.80 | 3.32 | 3.80 |
+
+ ### Lyrics ASR (WER / CER↓)
+
+ We further evaluate MOSS-Music on **singing-voice lyrics ASR** across three
+ representative benchmarks:
+
+ - `MUSDB18` — English pop songs **with backing tracks**, scored with **WER**;
+ - `MIR-1K` — **Chinese karaoke** clips with background music, scored with **CER**;
+ - `Opencpop` — **clean Mandarin studio singing**, scored with **CER**.
+
+ `Avg` is the unweighted mean of the three dataset-level error rates.
+
+ | Model | MUSDB18 WER | MIR-1K CER | Opencpop CER | Avg |
+ |-----|---:|---:|---:|---:|
+ | **MOSS‑Music‑8B‑Thinking** | 29.19% | **15.84%** | 2.60% | **15.88%** |
+ | **MOSS‑Music‑8B‑Instruct** | 32.99% | 23.96% | 4.62% | 20.52% |
+ | Gemini‑3.1‑Pro‑Preview | 26.25% | 36.37% | 6.00% | 22.87% |
+ | MusicFlamingo | **23.41%** | 38.98% | 18.73% | 27.04% |
+ | Qwen3‑Omni‑30B‑A3B‑Instruct | 62.67% | 20.48% | **2.26%** | 28.47% |
+ | MiMo‑Audio‑7B‑Instruct | 94.16% | 23.34% | 6.77% | 41.42% |
+ | Kimi‑Audio‑7B‑Instruct | 97.53% | 25.83% | 4.90% | 42.75% |
+ | Step‑Audio‑R1 | 81.67% | 48.03% | 4.15% | 44.62% |
+ | Audio‑Flamingo‑Next | 94.93% | 55.63% | 12.47% | 54.34% |
+
+ > **MOSS-Music-8B-Thinking** achieves the lowest average lyrics-ASR error
+ > (**15.88%**) across these three datasets, with particular gains on
+ > `MIR-1K` (Chinese karaoke with accompaniment) and `Opencpop` (clean Mandarin
+ > singing). MOSS-Music also inherits the strong timestamp-aware ASR ability
+ > from MOSS-Audio; detailed singing-timestamp ASR results will be added soon.
+
+ ### Chord Transcription
+
+ MOSS-Music supports chord transcription, including timestamped chord
+ transcription that tracks chord progression over time. This can be useful for
+ harmonic analysis, accompaniment reference, music education, and related use
+ cases. Detailed benchmark results will be added soon.
+
+ ## Quickstart
+
+ ### Environment Setup
+
+ We recommend Python 3.12 with a clean Conda environment. The commands below
+ are sufficient for local inference.
+
+ #### Recommended setup
+
+ ```bash
+ git clone https://github.com/OpenMOSS/MOSS-Music.git
+ cd MOSS-Music
+
+ conda create -n moss-music python=3.12 -y
+ conda activate moss-music
+
+ conda install -c conda-forge "ffmpeg=7" -y
+ pip install --extra-index-url https://download.pytorch.org/whl/cu128 -e ".[torch-runtime]"
+ ```
+
+ #### Optional: FlashAttention 2
+
+ If your GPU supports FlashAttention 2, you can replace the last install
+ command with:
+
+ ```bash
+ pip install --extra-index-url https://download.pytorch.org/whl/cu128 -e ".[torch-runtime,flash-attn]"
+ ```
+
+ ### Basic Usage
+
+ Download the model first:
+
+ ```bash
+ hf download OpenMOSS-Team/MOSS-Music-8B-Instruct --local-dir ./weights/MOSS-Music-8B-Instruct
+ hf download OpenMOSS-Team/MOSS-Music-8B-Thinking --local-dir ./weights/MOSS-Music-8B-Thinking
+ ```
+
+ Then edit `MODEL_PATH` / `AUDIO_PATH` in `infer.py` as needed, and run:
+
+ ```bash
+ python infer.py
+ ```
+
+ > [!IMPORTANT]
+ > To achieve the best generation quality and fully leverage the model’s capabilities, we
+ > **strongly recommend using SGLang Serving for inference**.
+
+ The default prompt in `infer.py` is
+ `Please give a detailed musical description of this clip.`. You can directly
+ edit that line if you want to try lyrics transcription, chord / key / tempo
+ analysis, structural segmentation, or open-ended musical QA. Typical prompts
+ (a minimal standalone sketch follows this list):
+
+ - `Describe this piece of music in terms of style and tempo, tonal quality and harmony, instrumentation and arrangement, structural organization, and overall emotional mood.`
+ - `Please give a detailed musical description of this clip.`
+ - `Transcribe the lyrics of this song (with timestamps).`
+ - `Transcribe the chord progression of this piece of music with timestamps, and output it in JSON format.`
+ - `What is the key, tempo and mood of this track?`
+ - `Segment the song into verse / chorus / bridge sections.`
+
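+ Since `infer.py` lives in the GitHub repo rather than this checkpoint, here
+ is a rough standalone sketch of the same flow. The processor and model entry
+ points are assumptions based on this repo's `auto_map` (which registers
+ `AutoConfig` and `AutoProcessor`); treat `infer.py` and
+ `moss_music_usage_guide.md` as authoritative:
+
+ ```python
+ # Hypothetical sketch only; the processor call signature and the
+ # AutoModel mapping are assumptions, not documented API.
+ import torch
+ from transformers import AutoModel, AutoProcessor
+
+ MODEL_PATH = "./weights/MOSS-Music-8B-Instruct"
+ AUDIO_PATH = "./example_clip.mp3"   # placeholder audio file
+
+ processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)
+ model = AutoModel.from_pretrained(
+     MODEL_PATH, trust_remote_code=True, dtype=torch.bfloat16, device_map="auto"
+ )
+
+ inputs = processor(
+     text="Please give a detailed musical description of this clip.",
+     audio=AUDIO_PATH,
+     return_tensors="pt",
+ ).to(model.device)
+ out = model.generate(**inputs, max_new_tokens=512)
+ print(processor.batch_decode(out, skip_special_tokens=True)[0])
+ ```
+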
+ ### Gradio App
+
+ Start the Gradio demo with:
+
+ ```bash
+ python app.py
+ ```
+
+ The server address and port can be overridden via the
+ `MOSS_MUSIC_SERVER_NAME` and `MOSS_MUSIC_SERVER_PORT` environment variables,
+ and the default model ID via `MOSS_MUSIC_MODEL_ID`.
+
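+ For example, a small launcher that overrides all three (the concrete values
+ are placeholders):
+
+ ```python
+ # Launch app.py with overridden host/port/model via the documented
+ # environment variables.
+ import os
+ import subprocess
+
+ env = dict(os.environ)
+ env["MOSS_MUSIC_SERVER_NAME"] = "0.0.0.0"
+ env["MOSS_MUSIC_SERVER_PORT"] = "7861"
+ env["MOSS_MUSIC_MODEL_ID"] = "./weights/MOSS-Music-8B-Thinking"
+ subprocess.run(["python", "app.py"], env=env, check=True)
+ ```
+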
+ ### SGLang Serving
+
+ If you want to serve MOSS-Music with SGLang, see the full guide in
+ `moss_music_usage_guide.md`.
+
+ The shortest setup is:
+
+ ```bash
+ cd sglang
+ pip install -e "python[all]"
+ pip install nvidia-cudnn-cu12==9.16.0.29
+ cd ..
+
+ sglang serve \
+   --model-path ./weights/MOSS-Music-8B-Instruct \
+   --trust-remote-code
+ ```
+
+ You can replace `./weights/MOSS-Music-8B-Instruct` with
+ `./weights/MOSS-Music-8B-Thinking` if needed.
+
+ If you use the default `torch==2.9.1+cu128` runtime, installing
+ `nvidia-cudnn-cu12==9.16.0.29` is recommended before starting `sglang serve`.
+
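+ Once the server is up, it should expose SGLang's OpenAI-compatible API
+ (port 30000 by default). A hedged client-side sketch; the exact audio content
+ schema for MOSS-Music is defined in `moss_music_usage_guide.md`, so the
+ `audio_url` part below mirrors SGLang's other audio models and is an
+ assumption:
+
+ ```python
+ # Assumes SGLang's default OpenAI-compatible endpoint on localhost:30000.
+ import requests
+
+ payload = {
+     "model": "MOSS-Music-8B-Instruct",
+     "messages": [{
+         "role": "user",
+         "content": [
+             {"type": "audio_url",
+              "audio_url": {"url": "https://example.com/clip.mp3"}},
+             {"type": "text",
+              "text": "What is the key, tempo and mood of this track?"},
+         ],
+     }],
+     "max_tokens": 512,
+ }
+ r = requests.post("http://localhost:30000/v1/chat/completions", json=payload)
+ print(r.json()["choices"][0]["message"]["content"])
+ ```
+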
+ ## More Information
+
+ - **MOSI.AI**: [https://mosi.cn](https://mosi.cn)
+ - **OpenMOSS**: [https://www.open-moss.com](https://www.open-moss.com)
+ - **MOSS-Audio (backbone)**: [https://github.com/OpenMOSS/MOSS-Audio](https://github.com/OpenMOSS/MOSS-Audio)
+ - **MOSS-Music Data Pipeline**: [https://github.com/OpenMOSS/MOSS-Music-Data-Pipeline](https://github.com/OpenMOSS/MOSS-Music-Data-Pipeline)
+
+ ## LICENSE
+
+ Models in MOSS-Music are licensed under the Apache License 2.0.
+
+ ## Citation
+
+ ```bibtex
+ @misc{mossmusic2026,
+   title={MOSS-Music Technical Report},
+   author={OpenMOSS Team},
+   year={2026},
+   howpublished={\url{https://github.com/OpenMOSS/MOSS-Music}},
+   note={GitHub repository}
+ }
+ ```
added_tokens.json ADDED
@@ -0,0 +1,32 @@
+ {
+   "</think>": 151668,
+   "</tool_call>": 151658,
+   "</tool_response>": 151666,
+   "<think>": 151667,
+   "<tool_call>": 151657,
+   "<tool_response>": 151665,
+   "<|assistant|>": 151671,
+   "<|box_end|>": 151649,
+   "<|box_start|>": 151648,
+   "<|endoftext|>": 151643,
+   "<|eot|>": 151672,
+   "<|file_sep|>": 151664,
+   "<|fim_middle|>": 151660,
+   "<|fim_pad|>": 151662,
+   "<|fim_prefix|>": 151659,
+   "<|fim_suffix|>": 151661,
+   "<|im_end|>": 151645,
+   "<|im_start|>": 151644,
+   "<|image_pad|>": 151655,
+   "<|object_ref_end|>": 151647,
+   "<|object_ref_start|>": 151646,
+   "<|quad_end|>": 151651,
+   "<|quad_start|>": 151650,
+   "<|repo_name|>": 151663,
+   "<|system|>": 151669,
+   "<|user|>": 151670,
+   "<|video_pad|>": 151656,
+   "<|vision_end|>": 151653,
+   "<|vision_pad|>": 151654,
+   "<|vision_start|>": 151652
+ }
assets/MOSS-Music.png ADDED

Git LFS Details

  • SHA256: af7e90d50967384f1a2b129d1af8d8dcd02ba19d00cdf2b53cab63711f9226ce
  • Pointer size: 132 Bytes
  • Size of remote file: 1.58 MB
assets/music_bench.png ADDED

Git LFS Details

  • SHA256: d13681917bd7d3b61360c22147ab76f2639218835cb717083a6b2a7fc6724bf3
  • Pointer size: 132 Bytes
  • Size of remote file: 3.35 MB
assets/music_pipeline.png ADDED

Git LFS Details

  • SHA256: 1e5358e1e0e16b8452e58a4aad6750bd7f6d1357957c7c2611549cb4d5f5d8a2
  • Pointer size: 132 Bytes
  • Size of remote file: 2.09 MB
chat_template.jinja ADDED
@@ -0,0 +1,85 @@
+ {%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+ {%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+ {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+ {%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+ {%- endfor %}
+ {%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '</think>' in message.content %}
+ {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '<tool_call>\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n</tool_call>' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n<tool_response>\n' }}
+ {{- message.content }}
+ {{- '\n</tool_response>' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+ {%- endfor %}
+ {%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '<think>\n\n</think>\n\n' }}
+ {%- endif %}
+ {%- endif %}
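For reference, this template is consumed through the standard `tokenizer.apply_chat_template` interface; a minimal sketch (extra keyword arguments such as `enable_thinking` are forwarded into the template context, matching the final branch above):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./weights/MOSS-Music-8B-Instruct")
messages = [{"role": "user", "content": "Describe this clip."}]

prompt = tok.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=False,   # emits the empty <think>\n\n</think> block
)
print(prompt)
```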
config.json ADDED
@@ -0,0 +1,111 @@
+ {
+   "architectures": [
+     "MossMusicModel"
+   ],
+   "model_type": "moss_music",
+   "auto_map": {
+     "AutoConfig": "configuration_moss_music.MossMusicConfig",
+     "AutoProcessor": "processing_moss_music.MossMusicProcessor"
+   },
+   "adapter_hidden_size": 8192,
+   "deepstack_num_inject_layers": 3,
+   "ignore_index": -100,
+   "dtype": "bfloat16",
+   "bos_token_id": 151643,
+   "eos_token_id": 151645,
+   "num_hidden_layers": 36,
+   "tie_word_embeddings": false,
+   "transformers_version": "4.57.1",
+   "vocab_size": 151936,
+   "hidden_size": 4096,
+   "audio_config": {
+     "_attn_implementation": "eager",
+     "activation_dropout": 0.0,
+     "activation_function": "gelu",
+     "attention_dropout": 0.1,
+     "d_model": 1280,
+     "deepstack_encoder_layer_indexes": [
+       8,
+       16,
+       24
+     ],
+     "downsample_hidden_size": 480,
+     "downsample_rate": 8,
+     "dropout": 0.1,
+     "encoder_attention_heads": 20,
+     "encoder_attention_window_size": 100,
+     "encoder_ffn_dim": 5120,
+     "encoder_layers": 32,
+     "layer_norm_eps": 1e-05,
+     "max_source_positions": 1500,
+     "num_mel_bins": 128,
+     "output_dim": 1280,
+     "pretrained_path": ""
+   },
+   "language_config": {
+     "architectures": [
+       "Qwen3ForCausalLM"
+     ],
+     "attention_bias": false,
+     "attention_dropout": 0.0,
+     "bos_token_id": 151643,
+     "dtype": "bfloat16",
+     "eos_token_id": 151645,
+     "head_dim": 128,
+     "hidden_act": "silu",
+     "hidden_size": 4096,
+     "initializer_range": 0.02,
+     "intermediate_size": 12288,
+     "layer_types": [
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention"
+     ],
+     "max_position_embeddings": 40960,
+     "max_window_layers": 36,
+     "model_type": "qwen3",
+     "num_attention_heads": 32,
+     "num_hidden_layers": 36,
+     "num_key_value_heads": 8,
+     "rms_norm_eps": 1e-06,
+     "rope_scaling": null,
+     "rope_theta": 1000000,
+     "sliding_window": null,
+     "use_cache": true,
+     "use_sliding_window": false,
+     "vocab_size": 151936
+   }
+ }
configuration_moss_music.py ADDED
@@ -0,0 +1,92 @@
+ """Minimal configuration shipped inside MOSS-Music HuggingFace weight folders.
+
+ This file is copied into each released checkpoint by ``convert_hf_checkpoint.py``
+ so that ``AutoConfig.from_pretrained(..., trust_remote_code=True)`` works out of
+ the box. The full developer-facing configuration lives in
+ ``src/configuration_moss_music.py``.
+ """
+ from transformers import PretrainedConfig, Qwen3Config
+
+
+ class MossMusicConfig(PretrainedConfig):
+     model_type = "moss_music"
+     is_composition = True
+
+     def __init__(
+         self,
+         audio_config=None,
+         language_config=None,
+         adapter_hidden_size=8192,
+         ignore_index=-100,
+         deepstack_num_inject_layers=None,
+         **kwargs,
+     ):
+         if isinstance(language_config, dict):
+             language_config = Qwen3Config(**language_config)
+         elif language_config is None:
+             language_config = Qwen3Config()
+
+         self.audio_config = audio_config
+         self.language_config = language_config
+         self.adapter_hidden_size = adapter_hidden_size
+         self.ignore_index = ignore_index
+         self.deepstack_num_inject_layers = deepstack_num_inject_layers
+
+         for key in ("num_hidden_layers", "eos_token_id", "bos_token_id", "vocab_size"):
+             kwargs.setdefault(key, getattr(language_config, key, None))
+
+         super().__init__(**kwargs)
+
+
+ def _register_moss_music_with_sglang() -> None:
+     """Register MossMusic* as aliases of the built-in MossAudio* classes inside
+     SGLang. This module is imported by ``AutoConfig.from_pretrained`` via
+     ``auto_map`` + ``trust_remote_code``, which fires before SGLang looks up
+     ``architectures[0]`` in its ``ModelRegistry`` / ``PROCESSOR_MAPPING``.
+     """
+     try:
+         from sglang.srt.configs import model_config as _sg_model_config
+         from sglang.srt.managers.multimodal_processor import PROCESSOR_MAPPING
+         from sglang.srt.models.moss_audio import MossAudioModel
+         from sglang.srt.models.registry import ModelRegistry
+         from sglang.srt.multimodal.processors.moss_audio import (
+             MossAudioMultimodalProcessor,
+         )
+     except Exception:
+         return
+
+     if "MossMusicModel" not in ModelRegistry.models:
+         alias_cls = type("MossMusicModel", (MossAudioModel,), {})
+         ModelRegistry.models["MossMusicModel"] = alias_cls
+         PROCESSOR_MAPPING[alias_cls] = MossAudioMultimodalProcessor
+
+     # SGLang also dispatches on the *string* architecture name in several
+     # helper functions (multimodal detection, always-process-mm-data, ...).
+     # The Python-class alias above does not cover those, so we also extend
+     # the name-based whitelists that mirror ``MossAudioModel``.
+     try:
+         if "MossMusicModel" not in _sg_model_config.multimodal_model_archs:
+             _sg_model_config.multimodal_model_archs.append("MossMusicModel")
+     except Exception:
+         pass
+
+     try:
+         if not getattr(
+             _sg_model_config.is_always_process_mm_data_model,
+             "_moss_music_patched",
+             False,
+         ):
+             _orig_always_mm = _sg_model_config.is_always_process_mm_data_model
+
+             def _patched_always_mm(model_architectures):
+                 if "MossMusicModel" in model_architectures:
+                     return True
+                 return _orig_always_mm(model_architectures)
+
+             _patched_always_mm._moss_music_patched = True  # type: ignore[attr-defined]
+             _sg_model_config.is_always_process_mm_data_model = _patched_always_mm
+     except Exception:
+         pass
+
+
+ _register_moss_music_with_sglang()
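A quick way to see this hook fire is simply to load the config with remote code enabled; a minimal sketch:

```python
# Loading via trust_remote_code routes through auto_map, imports the
# module above, and (if SGLang is installed) runs its registration hook.
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained(
    "./weights/MOSS-Music-8B-Instruct", trust_remote_code=True
)
print(cfg.model_type)                    # moss_music
print(cfg.deepstack_num_inject_layers)   # 3
```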
generation_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 151643,
+   "eos_token_id": 151645,
+   "transformers_version": "4.57.1"
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:63edf20789c6fa3bac2fb635a097a927a64237d488ffa29c2e1a946c84490f3f
+ size 4931219872
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b2473dec1d6a648372beaf906d26477f73e271de8f0c8eaa8d5b7147518c666c
+ size 4983069688
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5b00d237df0fc68b5d17504ce02dcdae1a724424042d47d6156c5cf60706f4e6
+ size 4999847576
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e598e7a6d470f4766639abafef01670dff46cda3c636aea64d0801cc26309584
+ size 3190899504
model.safetensors.index.json ADDED
@@ -0,0 +1,910 @@
1
+ {
2
+ "metadata": {
3
+ "total_parameters": 9052463488,
4
+ "total_size": 18104928256
5
+ },
6
+ "weight_map": {
7
+ "audio_adapter.down_proj.weight": "model-00004-of-00004.safetensors",
8
+ "audio_adapter.gate_proj.weight": "model-00004-of-00004.safetensors",
9
+ "audio_adapter.up_proj.weight": "model-00004-of-00004.safetensors",
10
+ "audio_encoder.conv1.bias": "model-00001-of-00004.safetensors",
11
+ "audio_encoder.conv1.weight": "model-00001-of-00004.safetensors",
12
+ "audio_encoder.conv2.bias": "model-00001-of-00004.safetensors",
13
+ "audio_encoder.conv2.weight": "model-00001-of-00004.safetensors",
14
+ "audio_encoder.conv3.bias": "model-00001-of-00004.safetensors",
15
+ "audio_encoder.conv3.weight": "model-00001-of-00004.safetensors",
16
+ "audio_encoder.embed_positions.inv_timescales": "model-00001-of-00004.safetensors",
17
+ "audio_encoder.layer_norm.bias": "model-00001-of-00004.safetensors",
18
+ "audio_encoder.layer_norm.weight": "model-00001-of-00004.safetensors",
19
+ "audio_encoder.layers.0.fc1.bias": "model-00001-of-00004.safetensors",
20
+ "audio_encoder.layers.0.fc1.weight": "model-00001-of-00004.safetensors",
21
+ "audio_encoder.layers.0.fc2.bias": "model-00001-of-00004.safetensors",
22
+ "audio_encoder.layers.0.fc2.weight": "model-00001-of-00004.safetensors",
23
+ "audio_encoder.layers.0.final_layer_norm.bias": "model-00001-of-00004.safetensors",
24
+ "audio_encoder.layers.0.final_layer_norm.weight": "model-00001-of-00004.safetensors",
25
+ "audio_encoder.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
26
+ "audio_encoder.layers.0.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
27
+ "audio_encoder.layers.0.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
28
+ "audio_encoder.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
29
+ "audio_encoder.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
30
+ "audio_encoder.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
31
+ "audio_encoder.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
32
+ "audio_encoder.layers.0.self_attn_layer_norm.bias": "model-00001-of-00004.safetensors",
33
+ "audio_encoder.layers.0.self_attn_layer_norm.weight": "model-00001-of-00004.safetensors",
34
+ "audio_encoder.layers.1.fc1.bias": "model-00001-of-00004.safetensors",
35
+ "audio_encoder.layers.1.fc1.weight": "model-00001-of-00004.safetensors",
36
+ "audio_encoder.layers.1.fc2.bias": "model-00001-of-00004.safetensors",
37
+ "audio_encoder.layers.1.fc2.weight": "model-00001-of-00004.safetensors",
38
+ "audio_encoder.layers.1.final_layer_norm.bias": "model-00001-of-00004.safetensors",
39
+ "audio_encoder.layers.1.final_layer_norm.weight": "model-00001-of-00004.safetensors",
40
+ "audio_encoder.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
41
+ "audio_encoder.layers.1.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
42
+ "audio_encoder.layers.1.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
43
+ "audio_encoder.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
44
+ "audio_encoder.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
45
+ "audio_encoder.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
46
+ "audio_encoder.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
47
+ "audio_encoder.layers.1.self_attn_layer_norm.bias": "model-00001-of-00004.safetensors",
48
+ "audio_encoder.layers.1.self_attn_layer_norm.weight": "model-00001-of-00004.safetensors",
49
+ "audio_encoder.layers.10.fc1.bias": "model-00001-of-00004.safetensors",
50
+ "audio_encoder.layers.10.fc1.weight": "model-00001-of-00004.safetensors",
51
+ "audio_encoder.layers.10.fc2.bias": "model-00001-of-00004.safetensors",
52
+ "audio_encoder.layers.10.fc2.weight": "model-00001-of-00004.safetensors",
53
+ "audio_encoder.layers.10.final_layer_norm.bias": "model-00001-of-00004.safetensors",
54
+ "audio_encoder.layers.10.final_layer_norm.weight": "model-00001-of-00004.safetensors",
55
+ "audio_encoder.layers.10.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
56
+ "audio_encoder.layers.10.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
57
+ "audio_encoder.layers.10.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
58
+ "audio_encoder.layers.10.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
59
+ "audio_encoder.layers.10.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
60
+ "audio_encoder.layers.10.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
61
+ "audio_encoder.layers.10.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
62
+ "audio_encoder.layers.10.self_attn_layer_norm.bias": "model-00001-of-00004.safetensors",
63
+ "audio_encoder.layers.10.self_attn_layer_norm.weight": "model-00001-of-00004.safetensors",
64
+ "audio_encoder.layers.11.fc1.bias": "model-00001-of-00004.safetensors",
65
+ "audio_encoder.layers.11.fc1.weight": "model-00001-of-00004.safetensors",
66
+ "audio_encoder.layers.11.fc2.bias": "model-00001-of-00004.safetensors",
67
+ "audio_encoder.layers.11.fc2.weight": "model-00001-of-00004.safetensors",
68
+ "audio_encoder.layers.11.final_layer_norm.bias": "model-00001-of-00004.safetensors",
69
+ "audio_encoder.layers.11.final_layer_norm.weight": "model-00001-of-00004.safetensors",
70
+ "audio_encoder.layers.11.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
71
+ "audio_encoder.layers.11.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
72
+ "audio_encoder.layers.11.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
73
+ "audio_encoder.layers.11.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
74
+ "audio_encoder.layers.11.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
75
+ "audio_encoder.layers.11.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
76
+ "audio_encoder.layers.11.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
77
+ "audio_encoder.layers.11.self_attn_layer_norm.bias": "model-00001-of-00004.safetensors",
78
+ "audio_encoder.layers.11.self_attn_layer_norm.weight": "model-00001-of-00004.safetensors",
79
+ "audio_encoder.layers.12.fc1.bias": "model-00001-of-00004.safetensors",
80
+ "audio_encoder.layers.12.fc1.weight": "model-00001-of-00004.safetensors",
81
+ "audio_encoder.layers.12.fc2.bias": "model-00001-of-00004.safetensors",
82
+ "audio_encoder.layers.12.fc2.weight": "model-00001-of-00004.safetensors",
83
+ "audio_encoder.layers.12.final_layer_norm.bias": "model-00001-of-00004.safetensors",
84
+ "audio_encoder.layers.12.final_layer_norm.weight": "model-00001-of-00004.safetensors",
85
+ "audio_encoder.layers.12.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
86
+ "audio_encoder.layers.12.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
87
+ "audio_encoder.layers.12.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
88
+ "audio_encoder.layers.12.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
89
+ "audio_encoder.layers.12.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
90
+ "audio_encoder.layers.12.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
91
+ "audio_encoder.layers.12.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
92
+ "audio_encoder.layers.12.self_attn_layer_norm.bias": "model-00001-of-00004.safetensors",
93
+ "audio_encoder.layers.12.self_attn_layer_norm.weight": "model-00001-of-00004.safetensors",
94
+ "audio_encoder.layers.13.fc1.bias": "model-00001-of-00004.safetensors",
95
+ "audio_encoder.layers.13.fc1.weight": "model-00001-of-00004.safetensors",
96
+ "audio_encoder.layers.13.fc2.bias": "model-00001-of-00004.safetensors",
97
+ "audio_encoder.layers.13.fc2.weight": "model-00001-of-00004.safetensors",
98
+ "audio_encoder.layers.13.final_layer_norm.bias": "model-00001-of-00004.safetensors",
99
+ "audio_encoder.layers.13.final_layer_norm.weight": "model-00001-of-00004.safetensors",
100
+ "audio_encoder.layers.13.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
101
+ "audio_encoder.layers.13.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
102
+ "audio_encoder.layers.13.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
103
+ "audio_encoder.layers.13.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
104
+ "audio_encoder.layers.13.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
105
+ "audio_encoder.layers.13.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
106
+ "audio_encoder.layers.13.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
107
+ "audio_encoder.layers.13.self_attn_layer_norm.bias": "model-00001-of-00004.safetensors",
108
+ "audio_encoder.layers.13.self_attn_layer_norm.weight": "model-00001-of-00004.safetensors",
109
+ "audio_encoder.layers.14.fc1.bias": "model-00001-of-00004.safetensors",
110
+ "audio_encoder.layers.14.fc1.weight": "model-00001-of-00004.safetensors",
111
+ "audio_encoder.layers.14.fc2.bias": "model-00001-of-00004.safetensors",
112
+ "audio_encoder.layers.14.fc2.weight": "model-00001-of-00004.safetensors",
113
+ "audio_encoder.layers.14.final_layer_norm.bias": "model-00001-of-00004.safetensors",
114
+ "audio_encoder.layers.14.final_layer_norm.weight": "model-00001-of-00004.safetensors",
115
+ "audio_encoder.layers.14.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
116
+ "audio_encoder.layers.14.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
117
+ "audio_encoder.layers.14.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
118
+ "audio_encoder.layers.14.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
119
+ "audio_encoder.layers.14.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
120
+ "audio_encoder.layers.14.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
121
+ "audio_encoder.layers.14.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
122
+ "audio_encoder.layers.14.self_attn_layer_norm.bias": "model-00001-of-00004.safetensors",
123
+ "audio_encoder.layers.14.self_attn_layer_norm.weight": "model-00001-of-00004.safetensors",
124
+ "audio_encoder.layers.15.fc1.bias": "model-00001-of-00004.safetensors",
125
+ "audio_encoder.layers.15.fc1.weight": "model-00001-of-00004.safetensors",
126
+ "audio_encoder.layers.15.fc2.bias": "model-00001-of-00004.safetensors",
127
+ "audio_encoder.layers.15.fc2.weight": "model-00001-of-00004.safetensors",
128
+ "audio_encoder.layers.15.final_layer_norm.bias": "model-00001-of-00004.safetensors",
129
+ "audio_encoder.layers.15.final_layer_norm.weight": "model-00001-of-00004.safetensors",
130
+ "audio_encoder.layers.15.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
131
+ "audio_encoder.layers.15.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
132
+ "audio_encoder.layers.15.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
133
+ "audio_encoder.layers.15.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
134
+ "audio_encoder.layers.15.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
135
+ "audio_encoder.layers.15.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
136
+ "audio_encoder.layers.15.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
137
+ "audio_encoder.layers.15.self_attn_layer_norm.bias": "model-00001-of-00004.safetensors",
138
+ "audio_encoder.layers.15.self_attn_layer_norm.weight": "model-00001-of-00004.safetensors",
139
+ "audio_encoder.layers.16.fc1.bias": "model-00001-of-00004.safetensors",
140
+ "audio_encoder.layers.16.fc1.weight": "model-00001-of-00004.safetensors",
141
+ "audio_encoder.layers.16.fc2.bias": "model-00001-of-00004.safetensors",
142
+ "audio_encoder.layers.16.fc2.weight": "model-00001-of-00004.safetensors",
143
+ "audio_encoder.layers.16.final_layer_norm.bias": "model-00001-of-00004.safetensors",
144
+ "audio_encoder.layers.16.final_layer_norm.weight": "model-00001-of-00004.safetensors",
145
+ "audio_encoder.layers.16.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
146
+ "audio_encoder.layers.16.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
147
+ "audio_encoder.layers.16.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
148
+ "audio_encoder.layers.16.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
149
+ "audio_encoder.layers.16.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
150
+ "audio_encoder.layers.16.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
151
+ "audio_encoder.layers.16.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.16.self_attn_layer_norm.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.16.self_attn_layer_norm.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.17.fc1.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.17.fc1.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.17.fc2.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.17.fc2.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.17.final_layer_norm.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.17.final_layer_norm.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.17.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.17.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.17.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.17.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.17.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.17.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.17.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.17.self_attn_layer_norm.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.17.self_attn_layer_norm.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.18.fc1.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.18.fc1.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.18.fc2.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.18.fc2.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.18.final_layer_norm.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.18.final_layer_norm.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.18.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.18.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.18.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.18.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.18.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.18.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.18.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.18.self_attn_layer_norm.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.18.self_attn_layer_norm.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.19.fc1.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.19.fc1.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.19.fc2.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.19.fc2.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.19.final_layer_norm.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.19.final_layer_norm.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.19.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.19.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.19.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.19.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.19.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.19.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.19.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.19.self_attn_layer_norm.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.19.self_attn_layer_norm.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.2.fc1.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.2.fc1.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.2.fc2.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.2.fc2.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.2.final_layer_norm.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.2.final_layer_norm.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.2.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.2.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.2.self_attn_layer_norm.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.2.self_attn_layer_norm.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.20.fc1.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.20.fc1.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.20.fc2.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.20.fc2.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.20.final_layer_norm.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.20.final_layer_norm.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.20.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.20.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.20.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.20.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.20.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.20.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.20.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.20.self_attn_layer_norm.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.20.self_attn_layer_norm.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.21.fc1.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.21.fc1.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.21.fc2.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.21.fc2.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.21.final_layer_norm.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.21.final_layer_norm.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.21.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.21.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.21.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.21.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.21.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.21.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.21.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.21.self_attn_layer_norm.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.21.self_attn_layer_norm.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.22.fc1.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.22.fc1.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.22.fc2.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.22.fc2.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.22.final_layer_norm.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.22.final_layer_norm.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.22.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.22.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.22.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.22.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.22.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.22.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.22.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.22.self_attn_layer_norm.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.22.self_attn_layer_norm.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.23.fc1.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.23.fc1.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.23.fc2.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.23.fc2.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.23.final_layer_norm.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.23.final_layer_norm.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.23.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.23.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.23.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.23.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.23.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.23.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.23.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.23.self_attn_layer_norm.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.23.self_attn_layer_norm.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.24.fc1.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.24.fc1.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.24.fc2.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.24.fc2.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.24.final_layer_norm.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.24.final_layer_norm.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.24.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.24.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.24.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.24.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.24.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.24.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.24.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.24.self_attn_layer_norm.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.24.self_attn_layer_norm.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.25.fc1.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.25.fc1.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.25.fc2.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.25.fc2.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.25.final_layer_norm.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.25.final_layer_norm.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.25.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.25.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.25.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.25.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.25.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.25.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.25.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.25.self_attn_layer_norm.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.25.self_attn_layer_norm.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.26.fc1.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.26.fc1.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.26.fc2.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.26.fc2.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.26.final_layer_norm.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.26.final_layer_norm.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.26.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.26.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.26.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.26.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.26.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.26.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.26.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.26.self_attn_layer_norm.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.26.self_attn_layer_norm.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.27.fc1.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.27.fc1.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.27.fc2.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.27.fc2.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.27.final_layer_norm.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.27.final_layer_norm.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.27.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.27.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.27.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.27.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.27.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.27.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.27.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.27.self_attn_layer_norm.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.27.self_attn_layer_norm.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.28.fc1.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.28.fc1.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.28.fc2.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.28.fc2.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.28.final_layer_norm.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.28.final_layer_norm.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.28.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.28.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.28.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.28.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.28.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.28.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.28.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.28.self_attn_layer_norm.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.28.self_attn_layer_norm.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.29.fc1.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.29.fc1.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.29.fc2.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.29.fc2.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.29.final_layer_norm.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.29.final_layer_norm.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.29.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.29.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.29.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.29.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.29.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.29.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.29.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.29.self_attn_layer_norm.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.29.self_attn_layer_norm.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.3.fc1.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.3.fc1.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.3.fc2.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.3.fc2.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.3.final_layer_norm.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.3.final_layer_norm.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.3.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.3.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.3.self_attn_layer_norm.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.3.self_attn_layer_norm.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.30.fc1.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.30.fc1.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.30.fc2.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.30.fc2.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.30.final_layer_norm.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.30.final_layer_norm.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.30.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.30.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.30.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.30.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.30.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.30.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.30.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.30.self_attn_layer_norm.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.30.self_attn_layer_norm.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.31.fc1.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.31.fc1.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.31.fc2.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.31.fc2.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.31.final_layer_norm.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.31.final_layer_norm.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.31.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.31.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.31.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.31.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.31.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.31.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.31.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.31.self_attn_layer_norm.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.31.self_attn_layer_norm.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.4.fc1.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.4.fc1.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.4.fc2.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.4.fc2.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.4.final_layer_norm.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.4.final_layer_norm.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.4.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.4.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.4.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.4.self_attn_layer_norm.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.4.self_attn_layer_norm.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.5.fc1.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.5.fc1.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.5.fc2.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.5.fc2.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.5.final_layer_norm.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.5.final_layer_norm.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.5.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.5.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.5.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.5.self_attn_layer_norm.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.5.self_attn_layer_norm.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.6.fc1.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.6.fc1.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.6.fc2.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.6.fc2.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.6.final_layer_norm.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.6.final_layer_norm.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.6.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.6.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.6.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.6.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.6.self_attn_layer_norm.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.6.self_attn_layer_norm.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.7.fc1.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.7.fc1.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.7.fc2.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.7.fc2.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.7.final_layer_norm.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.7.final_layer_norm.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.7.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.7.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.7.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.7.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.7.self_attn_layer_norm.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.7.self_attn_layer_norm.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.8.fc1.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.8.fc1.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.8.fc2.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.8.fc2.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.8.final_layer_norm.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.8.final_layer_norm.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.8.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.8.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.8.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.8.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.8.self_attn_layer_norm.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.8.self_attn_layer_norm.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.9.fc1.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.9.fc1.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.9.fc2.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.9.fc2.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.9.final_layer_norm.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.9.final_layer_norm.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.9.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.9.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.9.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.9.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.9.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.9.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.9.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.9.self_attn_layer_norm.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.layers.9.self_attn_layer_norm.weight": "model-00001-of-00004.safetensors",
+ "audio_encoder.stem_proj.bias": "model-00001-of-00004.safetensors",
+ "audio_encoder.stem_proj.weight": "model-00001-of-00004.safetensors",
+ "deepstack_audio_merger_list.0.down_proj.weight": "model-00004-of-00004.safetensors",
+ "deepstack_audio_merger_list.0.gate_proj.weight": "model-00004-of-00004.safetensors",
+ "deepstack_audio_merger_list.0.up_proj.weight": "model-00004-of-00004.safetensors",
+ "deepstack_audio_merger_list.1.down_proj.weight": "model-00004-of-00004.safetensors",
+ "deepstack_audio_merger_list.1.gate_proj.weight": "model-00004-of-00004.safetensors",
+ "deepstack_audio_merger_list.1.up_proj.weight": "model-00004-of-00004.safetensors",
+ "deepstack_audio_merger_list.2.down_proj.weight": "model-00004-of-00004.safetensors",
+ "deepstack_audio_merger_list.2.gate_proj.weight": "model-00004-of-00004.safetensors",
+ "deepstack_audio_merger_list.2.up_proj.weight": "model-00004-of-00004.safetensors",
+ "language_model.embed_tokens.weight": "model-00001-of-00004.safetensors",
+ "language_model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
+ "language_model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+ "language_model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+ "language_model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+ "language_model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+ "language_model.layers.0.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
+ "language_model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "language_model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+ "language_model.layers.0.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
+ "language_model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "language_model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "language_model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
+ "language_model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+ "language_model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+ "language_model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+ "language_model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+ "language_model.layers.1.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
+ "language_model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "language_model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+ "language_model.layers.1.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
+ "language_model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "language_model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "language_model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.10.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.10.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.11.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.11.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.12.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.12.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.13.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.13.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.14.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.14.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.15.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.15.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.16.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.16.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.17.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.17.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.18.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.18.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.19.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.19.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+ "language_model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
+ "language_model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+ "language_model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+ "language_model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+ "language_model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+ "language_model.layers.2.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
+ "language_model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "language_model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+ "language_model.layers.2.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
+ "language_model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "language_model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "language_model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.20.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.20.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.21.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.21.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.22.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.22.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.23.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.23.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.24.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.24.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.25.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.25.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.26.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.26.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.27.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.27.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.28.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.28.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.29.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.29.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
+ "language_model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+ "language_model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+ "language_model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+ "language_model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+ "language_model.layers.3.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
+ "language_model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "language_model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+ "language_model.layers.3.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
+ "language_model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "language_model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "language_model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.30.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.30.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.31.input_layernorm.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.31.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.31.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.31.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.31.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.32.input_layernorm.weight": "model-00004-of-00004.safetensors",
+ "language_model.layers.32.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
+ "language_model.layers.32.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
+ "language_model.layers.32.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
+ "language_model.layers.32.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
+ "language_model.layers.32.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
+ "language_model.layers.32.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
+ "language_model.layers.32.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
+ "language_model.layers.32.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
+ "language_model.layers.32.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+ "language_model.layers.32.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
+ "language_model.layers.33.input_layernorm.weight": "model-00004-of-00004.safetensors",
+ "language_model.layers.33.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
+ "language_model.layers.33.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
+ "language_model.layers.33.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
+ "language_model.layers.33.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
+ "language_model.layers.33.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
+ "language_model.layers.33.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
+ "language_model.layers.33.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
+ "language_model.layers.33.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
+ "language_model.layers.33.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
+ "language_model.layers.33.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
+ "language_model.layers.34.input_layernorm.weight": "model-00004-of-00004.safetensors",
+ "language_model.layers.34.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
+ "language_model.layers.34.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
+ "language_model.layers.34.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
+ "language_model.layers.34.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
+ "language_model.layers.34.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
+ "language_model.layers.34.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
+ "language_model.layers.34.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
+ "language_model.layers.34.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
+ "language_model.layers.34.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
+ "language_model.layers.34.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
+ "language_model.layers.35.input_layernorm.weight": "model-00004-of-00004.safetensors",
+ "language_model.layers.35.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
+ "language_model.layers.35.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
+ "language_model.layers.35.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
+ "language_model.layers.35.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
+ "language_model.layers.35.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
+ "language_model.layers.35.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
+ "language_model.layers.35.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
+ "language_model.layers.35.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
+ "language_model.layers.35.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
+ "language_model.layers.35.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
+ "language_model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
+ "language_model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+ "language_model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+ "language_model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+ "language_model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+ "language_model.layers.4.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
+ "language_model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "language_model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+ "language_model.layers.4.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
+ "language_model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "language_model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "language_model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
+ "language_model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
854
+ "language_model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
855
+ "language_model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
856
+ "language_model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
857
+ "language_model.layers.5.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
858
+ "language_model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
859
+ "language_model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
860
+ "language_model.layers.5.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
861
+ "language_model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
862
+ "language_model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
863
+ "language_model.layers.6.input_layernorm.weight": "model-00002-of-00004.safetensors",
864
+ "language_model.layers.6.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
865
+ "language_model.layers.6.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
866
+ "language_model.layers.6.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
867
+ "language_model.layers.6.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
868
+ "language_model.layers.6.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
869
+ "language_model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
870
+ "language_model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
871
+ "language_model.layers.6.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
872
+ "language_model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
873
+ "language_model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
874
+ "language_model.layers.7.input_layernorm.weight": "model-00002-of-00004.safetensors",
875
+ "language_model.layers.7.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
876
+ "language_model.layers.7.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
877
+ "language_model.layers.7.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
878
+ "language_model.layers.7.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
879
+ "language_model.layers.7.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
880
+ "language_model.layers.7.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
881
+ "language_model.layers.7.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
882
+ "language_model.layers.7.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
883
+ "language_model.layers.7.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
884
+ "language_model.layers.7.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
885
+ "language_model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
886
+ "language_model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
887
+ "language_model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
888
+ "language_model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
889
+ "language_model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
890
+ "language_model.layers.8.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
891
+ "language_model.layers.8.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
892
+ "language_model.layers.8.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
893
+ "language_model.layers.8.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
894
+ "language_model.layers.8.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
895
+ "language_model.layers.8.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
896
+ "language_model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
897
+ "language_model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
898
+ "language_model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
899
+ "language_model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
900
+ "language_model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
901
+ "language_model.layers.9.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
902
+ "language_model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
903
+ "language_model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
904
+ "language_model.layers.9.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
905
+ "language_model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
906
+ "language_model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
907
+ "language_model.norm.weight": "model-00004-of-00004.safetensors",
908
+ "lm_head.weight": "model-00004-of-00004.safetensors"
909
+ }
910
+ }
modeling_moss_music.py ADDED
@@ -0,0 +1,464 @@
1
+ from typing import Any, List, Optional, Tuple, Union
2
+ import math
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ from transformers.generation.utils import GenerationMixin
7
+ from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
8
+ from transformers.modeling_utils import PreTrainedModel
9
+ from transformers.models.qwen3.modeling_qwen3 import Qwen3DecoderLayer, Qwen3Model
10
+ from transformers.models.whisper.modeling_whisper import WhisperEncoderLayer
11
+ from transformers.utils.auto_docstring import auto_docstring
12
+
13
+ from .configuration_moss_music import MossMusicConfig
14
+
15
+
16
+ class SinusoidsPositionEmbedding(nn.Module):
17
+ def __init__(self, num_positions: int, embedding_dim: int):
18
+ super().__init__()
19
+ max_timescale = 10000.0
20
+ log_timescale_increment = math.log(max_timescale) / (embedding_dim // 2 - 1)
21
+ inv_timescales = torch.exp(
22
+ -log_timescale_increment * torch.arange(embedding_dim // 2).float()
23
+ )
24
+ self.register_buffer("inv_timescales", inv_timescales, persistent=False)
25
+
26
+ def forward(self, seq_len: int, device: torch.device):
27
+ scaled_time = torch.arange(
28
+ seq_len, device=device, dtype=self.inv_timescales.dtype
29
+ ).unsqueeze(1) * self.inv_timescales.unsqueeze(0)
30
+ sin_emb = torch.sin(scaled_time)
31
+ cos_emb = torch.cos(scaled_time)
32
+ pos_emb = torch.cat([sin_emb, cos_emb], dim=1)
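+ # [seq_len, dim] with sines in the first half and cosines in the second;
+ # the unsqueeze below adds a batch dim -> [1, seq_len, dim].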
33
+ return pos_emb.unsqueeze(0)
34
+
35
+
36
+ class MossMusicEncoder(nn.Module):
37
+ """Audio encoder with conv-stem downsampling and Whisper transformer layers."""
38
+
39
+ def __init__(self, config):
40
+ super().__init__()
41
+ self.config = config
42
+ self.gelu = nn.GELU()
43
+
44
+ self.conv1 = nn.Conv2d(
45
+ 1,
46
+ config.downsample_hidden_size,
47
+ kernel_size=(3, 3),
48
+ stride=(2, 2),
49
+ padding=(1, 1),
50
+ )
51
+ self.conv2 = nn.Conv2d(
52
+ config.downsample_hidden_size,
53
+ config.downsample_hidden_size,
54
+ kernel_size=(3, 3),
55
+ stride=(2, 2),
56
+ padding=(1, 1),
57
+ )
58
+ self.conv3 = nn.Conv2d(
59
+ config.downsample_hidden_size,
60
+ config.downsample_hidden_size,
61
+ kernel_size=(3, 3),
62
+ stride=(2, 2),
63
+ padding=(1, 1),
64
+ )
65
+
66
+ # 128 mel bins / 8 = 16 after 3 convs with stride=2
67
+ self.stem_proj = nn.Linear(config.downsample_hidden_size * 16, config.d_model)
68
+ self.embed_positions = SinusoidsPositionEmbedding(
69
+ config.max_source_positions, config.d_model
70
+ )
71
+ self.layers = nn.ModuleList(
72
+ [WhisperEncoderLayer(config) for _ in range(config.encoder_layers)]
73
+ )
74
+ self.layer_norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_eps)
75
+ self.out_proj = (
76
+ nn.Linear(config.d_model, config.output_dim, bias=False)
77
+ if config.output_dim != config.d_model
78
+ else nn.Identity()
79
+ )
80
+
81
+ self._deepstack_indexes_set = set(config.deepstack_encoder_layer_indexes or [])
82
+
83
+ def _compute_downsampled_length(self, lengths: torch.Tensor) -> torch.Tensor:
84
+ def conv_out_len(L):
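+ # kernel=3, stride=2, padding=1 -> out = floor((L + 2 - 3)/2) + 1 = (L - 1)//2 + 1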
85
+ return (L - 1) // 2 + 1
86
+
87
+ l1 = conv_out_len(lengths)
88
+ l2 = conv_out_len(l1)
89
+ l3 = conv_out_len(l2)
90
+ return l3
91
+
92
+ def forward(
93
+ self,
94
+ input_features: torch.Tensor,
95
+ feature_lens: Optional[torch.Tensor] = None,
96
+ output_deepstack_hidden_states: bool = True,
97
+ ):
98
+ if input_features.dim() == 2:
99
+ input_features = input_features.unsqueeze(0)
100
+
101
+ if feature_lens is None:
102
+ feature_lens = torch.full(
103
+ (input_features.size(0),),
104
+ input_features.size(-1),
105
+ device=input_features.device,
106
+ dtype=torch.long,
107
+ )
108
+
109
+ downsampled_lengths = self._compute_downsampled_length(feature_lens)
110
+
111
+ # [B, n_mels, T] -> [B, 1, n_mels, T]
112
+ x = input_features.unsqueeze(1)
113
+ x = self.gelu(self.conv1(x))
114
+ x = self.gelu(self.conv2(x))
115
+ x = self.gelu(self.conv3(x))
116
+
117
+ # [B, C, F, T] -> [B, T, C*F]
118
+ x = x.permute(0, 3, 1, 2).contiguous().flatten(2)
119
+ x = self.stem_proj(x)
120
+
121
+ max_len = int(downsampled_lengths.max().item())
122
+ if x.size(1) > max_len:
123
+ x = x[:, :max_len, :]
124
+
125
+ positions = self.embed_positions(x.shape[1], x.device)
126
+ x = x + positions.to(x.dtype)
127
+
128
+ padding_mask = (
129
+ torch.arange(x.size(1), device=x.device)[None, :]
130
+ >= downsampled_lengths[:, None]
131
+ )
132
+ attention_mask = (1.0 - (~padding_mask).to(dtype=x.dtype)) * torch.finfo(
133
+ x.dtype
134
+ ).min
135
+ attention_mask = attention_mask.unsqueeze(1).unsqueeze(1)
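+ # Additive mask of shape [B, 1, 1, T]: 0 at valid frames, dtype-min at padding.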
136
+
137
+ deepstack_states: List[torch.Tensor] = []
138
+ for layer_idx, layer in enumerate(self.layers):
139
+ layer_outputs = layer(
140
+ x,
141
+ attention_mask,
142
+ layer_head_mask=None,
143
+ output_attentions=False,
144
+ )
145
+ x = layer_outputs[0]
146
+ if output_deepstack_hidden_states and layer_idx in self._deepstack_indexes_set:
147
+ deepstack_states.append(x)
148
+
149
+ x = self.layer_norm(x)
150
+ x = self.out_proj(x)
151
+
152
+ return BaseModelOutputWithPast(
153
+ last_hidden_state=x,
154
+ hidden_states=tuple(deepstack_states) if output_deepstack_hidden_states else None,
155
+ )
156
+
157
+
158
+ class GatedMLP(nn.Module):
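+ # SwiGLU-style block: down_proj(SiLU(gate_proj(x)) * up_proj(x)), matching the
+ # naming of Qwen/LLaMA MLPs; used here as the audio-to-LLM adapter.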
159
+ def __init__(self, input_size, hidden_size, output_size):
160
+ super().__init__()
161
+ self.gate_proj = nn.Linear(input_size, hidden_size, bias=False)
162
+ self.up_proj = nn.Linear(input_size, hidden_size, bias=False)
163
+ self.down_proj = nn.Linear(hidden_size, output_size, bias=False)
164
+ self.act_fn = nn.SiLU()
165
+
166
+ def forward(self, x):
167
+ return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
168
+
169
+
170
+ @auto_docstring
171
+ class MossMusicPreTrainedModel(PreTrainedModel):
172
+ config_class = MossMusicConfig
173
+ config: MossMusicConfig
174
+ base_model_prefix = ""
175
+ supports_gradient_checkpointing = True
176
+ _no_split_modules = ["Qwen3DecoderLayer"]
177
+ _skip_keys_device_placement = ["past_key_values"]
178
+ _supports_flash_attn = True
179
+ _supports_sdpa = True
180
+ _supports_flex_attn = True
181
+
182
+ _can_compile_fullgraph = False
183
+ _supports_attention_backend = True
184
+ _can_record_outputs = {"hidden_states": Qwen3DecoderLayer}
185
+
186
+
187
+ class MossMusicModel(MossMusicPreTrainedModel, GenerationMixin):
188
+ config_class = MossMusicConfig
189
+ _tied_weights_keys: List[str] = []
190
+
191
+ def __init__(self, config: MossMusicConfig):
192
+ super().__init__(config)
193
+
194
+ self.audio_encoder = MossMusicEncoder(config.audio_config)
195
+ self.language_model = Qwen3Model(config.language_config)
196
+
197
+ self.audio_adapter = GatedMLP(
198
+ input_size=config.audio_config.output_dim,
199
+ hidden_size=config.adapter_hidden_size,
200
+ output_size=config.language_config.hidden_size,
201
+ )
202
+
203
+ deepstack_k = len(
204
+ getattr(config.audio_config, "deepstack_encoder_layer_indexes", []) or []
205
+ )
206
+ if config.deepstack_num_inject_layers is not None:
207
+ deepstack_k = min(deepstack_k, int(config.deepstack_num_inject_layers))
208
+ self.deepstack_audio_merger_list = nn.ModuleList(
209
+ [
210
+ GatedMLP(
211
+ input_size=config.audio_config.output_dim,
212
+ hidden_size=config.adapter_hidden_size,
213
+ output_size=config.language_config.hidden_size,
214
+ )
215
+ for _ in range(deepstack_k)
216
+ ]
217
+ )
218
+
219
+ self.vocab_size = config.language_config.vocab_size
220
+ self.lm_head = nn.Linear(
221
+ config.language_config.hidden_size, self.vocab_size, bias=False
222
+ )
223
+ self.post_init()
224
+
225
+ def get_input_embeddings(self):
226
+ return self.language_model.get_input_embeddings()
227
+
228
+ def set_input_embeddings(self, value):
229
+ self.language_model.set_input_embeddings(value)
230
+
231
+ def get_output_embeddings(self):
232
+ return self.lm_head
233
+
234
+ def set_output_embeddings(self, new_embeddings):
235
+ self.lm_head = new_embeddings
236
+
237
+ def get_audio_features(self, input_features, feature_lens):
238
+ audio_outputs = self.audio_encoder(
239
+ input_features=input_features,
240
+ feature_lens=feature_lens,
241
+ output_deepstack_hidden_states=True,
242
+ )
243
+ deepstack = (
244
+ list(audio_outputs.hidden_states)
245
+ if audio_outputs.hidden_states is not None
246
+ else None
247
+ )
248
+ return audio_outputs.last_hidden_state, deepstack
249
+
250
+ def _apply_deepstack_to_hidden_states(
251
+ self,
252
+ hidden_states: torch.Tensor,
253
+ audio_input_mask: torch.Tensor,
254
+ deepstack_embeds: torch.Tensor,
255
+ ) -> torch.Tensor:
256
+ audio_input_mask = audio_input_mask.to(hidden_states.device)
257
+ deepstack_embeds = deepstack_embeds.to(hidden_states.device, hidden_states.dtype)
258
+ flat = deepstack_embeds.reshape(-1, deepstack_embeds.shape[-1])
259
+ hs = hidden_states.clone()
260
+ hs[audio_input_mask] = hs[audio_input_mask] + flat
261
+ return hs
262
+
263
+ def _register_llm_deepstack_hooks(
264
+ self,
265
+ audio_input_mask: torch.Tensor,
266
+ deepstack_audio_embeds: List[torch.Tensor],
267
+ ):
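+ # Attach forward hooks to the first len(deepstack_audio_embeds) decoder
+ # layers so layer k's output receives deepstack_audio_embeds[k] added at
+ # the audio-token positions (DeepStack-style multi-level feature injection).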
268
+ if deepstack_audio_embeds is None or len(deepstack_audio_embeds) == 0:
269
+ return []
270
+
271
+ layers = getattr(self.language_model, "layers", None)
272
+ if layers is None:
273
+ raise RuntimeError(
274
+ "Qwen3Model does not expose `.layers`; cannot register DeepStack hooks."
275
+ )
276
+
277
+ num_inject = len(deepstack_audio_embeds)
278
+ handles = []
279
+
280
+ for layer_idx, layer in enumerate(layers):
281
+ if layer_idx >= num_inject:
282
+ break
283
+
284
+ def _make_llm_hook(k: int):
285
+ def _hook(_module, _inputs, _output):
286
+ if isinstance(_output, (tuple, list)):
287
+ hs = _output[0]
288
+ new_hs = self._apply_deepstack_to_hidden_states(
289
+ hs, audio_input_mask, deepstack_audio_embeds[k]
290
+ )
291
+ return (new_hs,) + tuple(_output[1:])
292
+ else:
293
+ return self._apply_deepstack_to_hidden_states(
294
+ _output, audio_input_mask, deepstack_audio_embeds[k]
295
+ )
296
+
297
+ return _hook
298
+
299
+ handles.append(layer.register_forward_hook(_make_llm_hook(layer_idx)))
300
+
301
+ return handles
302
+
303
+ def forward(
304
+ self,
305
+ input_ids: torch.LongTensor = None,
306
+ attention_mask: Optional[torch.Tensor] = None,
307
+ position_ids: Optional[torch.LongTensor] = None,
308
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
309
+ inputs_embeds: Optional[torch.FloatTensor] = None,
310
+ labels: Optional[torch.LongTensor] = None,
311
+ use_cache: Optional[bool] = None,
312
+ output_attentions: Optional[bool] = None,
313
+ output_hidden_states: Optional[bool] = None,
314
+ return_dict: Optional[bool] = None,
315
+ audio_data: Optional[torch.FloatTensor] = None,
316
+ audio_data_seqlens: Optional[torch.Tensor] = None,
317
+ audio_input_mask: Optional[torch.Tensor] = None,
318
+ cache_position: Optional[torch.LongTensor] = None,
319
+ **kwargs: Any,
320
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
321
+ output_attentions = (
322
+ output_attentions if output_attentions is not None else self.config.output_attentions
323
+ )
324
+ output_hidden_states = (
325
+ output_hidden_states
326
+ if output_hidden_states is not None
327
+ else self.config.output_hidden_states
328
+ )
329
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
330
+
331
+ if inputs_embeds is None:
332
+ inputs_embeds = self.get_input_embeddings()(input_ids)
333
+
334
+ hook_handles = []
335
+ if audio_data is not None:
336
+ if audio_input_mask is None:
337
+ raise ValueError(
338
+ "audio_input_mask is required when audio_data is provided."
339
+ )
340
+
341
+ audio_embeds, deepstack = self.get_audio_features(audio_data, audio_data_seqlens)
342
+ audio_embeds = self.audio_adapter(audio_embeds)
343
+
344
+ audio_token_count = int(audio_input_mask.to(torch.int32).sum().item())
345
+ if audio_token_count != int(audio_embeds.shape[1]):
346
+ raise ValueError(
347
+ f"Audio token count mismatch: audio_input_mask has {audio_token_count} audio tokens, "
348
+ f"but audio_embeds has length {int(audio_embeds.shape[1])}."
349
+ )
350
+
351
+ mask_expanded = audio_input_mask.unsqueeze(-1).expand_as(inputs_embeds)
352
+ inputs_embeds = inputs_embeds.clone()
353
+ inputs_embeds.masked_scatter_(mask_expanded, audio_embeds)
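+ # masked_scatter_ fills True positions in row-major order with consecutive
+ # elements of the flattened audio_embeds, so batch order must match the mask.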
354
+
355
+ if deepstack is not None and len(self.deepstack_audio_merger_list) > 0:
356
+ deepstack_audio_embeds = []
357
+ for i, x in enumerate(deepstack[: len(self.deepstack_audio_merger_list)]):
358
+ ds = self.deepstack_audio_merger_list[i](x)
359
+ if int(ds.shape[1]) != audio_token_count:
360
+ raise ValueError(
361
+ f"DeepStack audio seq_len mismatch at index {i}: "
362
+ f"expected {audio_token_count}, got {int(ds.shape[1])}."
363
+ )
364
+ deepstack_audio_embeds.append(ds)
365
+
366
+ try:
367
+ hook_handles = self._register_llm_deepstack_hooks(
368
+ audio_input_mask, deepstack_audio_embeds
369
+ )
370
+ except Exception:
371
+ for h in hook_handles:
372
+ h.remove()
373
+ raise
374
+
375
+ try:
376
+ outputs = self.language_model(
377
+ input_ids=None,
378
+ attention_mask=attention_mask,
379
+ position_ids=position_ids,
380
+ past_key_values=past_key_values,
381
+ inputs_embeds=inputs_embeds,
382
+ use_cache=use_cache,
383
+ output_attentions=output_attentions,
384
+ output_hidden_states=output_hidden_states,
385
+ return_dict=return_dict,
386
+ cache_position=cache_position,
387
+ **kwargs,
388
+ )
389
+ finally:
390
+ for h in hook_handles:
391
+ h.remove()
392
+
393
+ hidden_states = outputs[0]
394
+ logits = self.lm_head(hidden_states)
395
+
396
+ loss = None
397
+ if labels is not None:
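+ # Standard causal-LM shift: position t predicts token t+1.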
398
+ shift_logits = logits[..., :-1, :].contiguous()
399
+ shift_labels = labels[..., 1:].contiguous()
400
+ loss_fct = nn.CrossEntropyLoss(ignore_index=self.config.ignore_index)
401
+ shift_logits = shift_logits.view(-1, self.config.language_config.vocab_size)
402
+ shift_labels = shift_labels.view(-1)
403
+ shift_labels = shift_labels.to(shift_logits.device)
404
+ loss = loss_fct(shift_logits, shift_labels)
405
+
406
+ if not return_dict:
407
+ output = (logits,) + outputs[1:]
408
+ return ((loss,) + output) if loss is not None else output
409
+
410
+ return CausalLMOutputWithPast(
411
+ loss=loss,
412
+ logits=logits,
413
+ past_key_values=outputs.past_key_values,
414
+ hidden_states=outputs.hidden_states,
415
+ attentions=outputs.attentions,
416
+ )
417
+
418
+ def prepare_inputs_for_generation(
419
+ self,
420
+ input_ids,
421
+ past_key_values=None,
422
+ attention_mask=None,
423
+ inputs_embeds=None,
424
+ cache_position=None,
425
+ **kwargs,
426
+ ):
427
+ position_ids = kwargs.get("position_ids", None)
428
+ if cache_position is not None and cache_position[0] > 0:
429
+ input_ids = input_ids[:, -1:]
430
+ if position_ids is not None:
431
+ position_ids = position_ids[:, -1:]
432
+ audio_data = None
433
+ audio_input_mask = None
434
+ audio_data_seqlens = None
435
+ else:
436
+ audio_data = kwargs.get("audio_data", None)
437
+ audio_input_mask = kwargs.get("audio_input_mask", None)
438
+ audio_data_seqlens = kwargs.get("audio_data_seqlens", None)
439
+
440
+ if inputs_embeds is not None and past_key_values is None:
441
+ model_inputs = {"inputs_embeds": inputs_embeds}
442
+ else:
443
+ model_inputs = {"input_ids": input_ids}
444
+
445
+ model_inputs.update(
446
+ {
447
+ "past_key_values": past_key_values,
448
+ "use_cache": kwargs.get("use_cache"),
449
+ "attention_mask": attention_mask,
450
+ "position_ids": position_ids,
451
+ "audio_data": audio_data,
452
+ "audio_input_mask": audio_input_mask,
453
+ "audio_data_seqlens": audio_data_seqlens,
454
+ }
455
+ )
456
+
457
+ return model_inputs
458
+
459
+
460
+ __all__ = [
462
+ "MossMusicConfig",
463
+ "MossMusicModel",
464
+ ]
preprocessor_config.json ADDED
@@ -0,0 +1,6 @@
1
+ {
2
+ "processor_class": "MossMusicProcessor",
3
+ "auto_map": {
4
+ "AutoProcessor": "processing_moss_music.MossMusicProcessor"
5
+ }
6
+ }
processing_moss_music.py ADDED
@@ -0,0 +1,407 @@
1
+ import importlib.util
2
+ import os
3
+ import re
4
+ import sys
5
+ import types
6
+ from dataclasses import dataclass
7
+ from typing import List, Optional, Sequence, Union
8
+
9
+ import numpy as np
10
+ import torch
11
+ import torchaudio # noqa: F401
12
+ from transformers import AutoTokenizer, BatchEncoding
13
+
14
+
15
+ @dataclass
16
+ class MelConfig:
17
+ mel_sr: int = 16000
18
+ mel_dim: int = 128
19
+ mel_n_fft: int = 400
20
+ mel_hop_length: int = 160
21
+ mel_dtype: torch.dtype = torch.bfloat16
22
+ use_whisper_feature_extractor: bool = True
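+ # 16 kHz with hop_length=160 yields 100 mel frames per second (10 ms hop).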
23
+
24
+
25
+ def load_chat_template(template_path: str, mossflux_path: Optional[str] = None) -> List:
26
+ if mossflux_path is None:
27
+ template_dir = os.path.dirname(os.path.abspath(template_path))
28
+ current = template_dir
29
+ while current and os.path.basename(current) != "mossLite":
30
+ parent = os.path.dirname(current)
31
+ if parent == current:
32
+ break
33
+ current = parent
34
+ if os.path.basename(current) == "mossLite":
35
+ mossflux_path = os.path.join(current, "mossflux")
36
+
37
+ if mossflux_path and mossflux_path not in sys.path:
38
+ sys.path.insert(0, mossflux_path)
39
+
40
+ spec = importlib.util.spec_from_file_location("chat_template_module", template_path)
41
+ module = importlib.util.module_from_spec(spec)
42
+ sys.modules["chat_template_module"] = module
43
+ spec.loader.exec_module(module)
44
+ return module.chat_template
45
+
46
+
47
+ class MossMusicProcessor:
48
+ _AUDIO_SPAN_RE = re.compile(r"<\|audio_bos\|>(?:<\|AUDIO\|>)+<\|audio_eos\|>")
49
+ _auto_class = None
50
+
51
+ @classmethod
52
+ def register_for_auto_class(cls, auto_class="AutoProcessor"):
53
+ if not isinstance(auto_class, str):
54
+ auto_class = auto_class.__name__
55
+ cls._auto_class = auto_class
56
+
57
+ def __init__(
58
+ self,
59
+ tokenizer,
60
+ *,
61
+ mel_config: Optional[MelConfig] = None,
62
+ template_path: Optional[str] = None,
63
+ enable_time_marker: bool = True,
64
+ audio_token_id: int = 151654,
65
+ audio_start_id: int = 151669,
66
+ audio_end_id: int = 151670,
67
+ ):
68
+ self._base_tokenizer = tokenizer
69
+ self.tokenizer = tokenizer
70
+ self.audio_token_id = int(audio_token_id)
71
+ self.audio_start_id = int(audio_start_id)
72
+ self.audio_end_id = int(audio_end_id)
73
+ self.chat_template = (
74
+ None if template_path is None else load_chat_template(template_path)
75
+ )
76
+ self.custom_texts = {}
77
+ self.enable_time_marker = bool(enable_time_marker)
78
+ self.config = mel_config or MelConfig()
79
+ self._whisper_feature_extractor = None
80
+
81
+ alias_map = {
82
+ "<|AUDIO|>": self.audio_token_id,
83
+ "<|audio_bos|>": self.audio_start_id,
84
+ "<|audio_eos|>": self.audio_end_id,
85
+ }
86
+ orig_convert_tokens_to_ids = self.tokenizer.convert_tokens_to_ids
87
+
88
+ def _patched_convert_tokens_to_ids(tokenizer_self, tokens):
89
+ if isinstance(tokens, (list, tuple)):
90
+ converted = [
91
+ _patched_convert_tokens_to_ids(tokenizer_self, token)
92
+ for token in tokens
93
+ ]
94
+ return converted if isinstance(tokens, list) else tuple(converted)
95
+ if isinstance(tokens, str) and tokens in alias_map:
96
+ return alias_map[tokens]
97
+ return orig_convert_tokens_to_ids(tokens)
98
+
99
+ self.tokenizer.convert_tokens_to_ids = types.MethodType(
100
+ _patched_convert_tokens_to_ids, self.tokenizer
101
+ )
102
+
103
+ self._digit_token_ids = {
104
+ "0": 15,
105
+ "1": 16,
106
+ "2": 17,
107
+ "3": 18,
108
+ "4": 19,
109
+ "5": 20,
110
+ "6": 21,
111
+ "7": 22,
112
+ "8": 23,
113
+ "9": 24,
114
+ }
115
+ self.audio_tokens_per_second = 12.5
116
+ self.time_marker_every_seconds = 2
117
+ self.time_marker_every_audio_tokens = int(
118
+ self.audio_tokens_per_second * self.time_marker_every_seconds
119
+ )
120
+ self.model_input_names = [
121
+ "input_ids",
122
+ "attention_mask",
123
+ "audio_data",
124
+ "audio_data_seqlens",
125
+ ]
126
+
127
+ @classmethod
128
+ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
129
+ tokenizer_kwargs = {}
130
+ for key in ["cache_dir", "revision", "token", "local_files_only"]:
131
+ if key in kwargs:
132
+ tokenizer_kwargs[key] = kwargs[key]
133
+
134
+ tokenizer = AutoTokenizer.from_pretrained(
135
+ pretrained_model_name_or_path,
136
+ use_fast=False,
137
+ **tokenizer_kwargs,
138
+ )
139
+
140
+ mel_config = kwargs.pop("mel_config", None)
141
+ template_path = kwargs.pop("template_path", None)
142
+ enable_time_marker = kwargs.pop("enable_time_marker", False)
143
+ audio_token_id = kwargs.pop("audio_token_id", 151654)
144
+ audio_start_id = kwargs.pop("audio_start_id", 151669)
145
+ audio_end_id = kwargs.pop("audio_end_id", 151670)
146
+
147
+ return cls(
148
+ tokenizer,
149
+ mel_config=mel_config,
150
+ template_path=template_path,
151
+ enable_time_marker=enable_time_marker,
152
+ audio_token_id=audio_token_id,
153
+ audio_start_id=audio_start_id,
154
+ audio_end_id=audio_end_id,
155
+ )
156
+
157
+ def load_template(self, template_path: str):
158
+ self.chat_template = load_chat_template(template_path)
159
+ return self
160
+
161
+ def set_custom_text(self, key: str, text: str):
162
+ self.custom_texts[key] = text
163
+ return self
164
+
165
+ def clear_custom_text(self, key: Optional[str] = None):
166
+ if key is None:
167
+ self.custom_texts.clear()
168
+ else:
169
+ self.custom_texts.pop(key, None)
170
+ return self
171
+
172
+ def _template_requires_audio(self) -> bool:
173
+ if self.chat_template is None:
174
+ return False
175
+ for segment in self.chat_template:
176
+ if segment.type in {"audio_contiguous", "audio_token"}:
177
+ return True
178
+ return False
179
+
180
+ @staticmethod
181
+ def _conv3_downsample_len(raw_mel_len: int) -> int:
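+ # Mirrors MossMusicEncoder._compute_downsampled_length: three stride-2
+ # convs shrink the mel time axis ~8x, giving the audio placeholder count.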
182
+ def conv_out_len(length: int) -> int:
183
+ return (length - 1) // 2 + 1
184
+
185
+ length1 = conv_out_len(int(raw_mel_len))
186
+ length2 = conv_out_len(length1)
187
+ length3 = conv_out_len(length2)
188
+ return int(length3)
189
+
190
+ def _get_whisper_feature_extractor(self):
191
+ if self._whisper_feature_extractor is not None:
192
+ return self._whisper_feature_extractor
193
+
194
+ from transformers.models.whisper.feature_extraction_whisper import (
195
+ WhisperFeatureExtractor,
196
+ )
197
+
198
+ self._whisper_feature_extractor = WhisperFeatureExtractor(
199
+ feature_size=int(self.config.mel_dim),
200
+ sampling_rate=int(self.config.mel_sr),
201
+ hop_length=int(self.config.mel_hop_length),
202
+ n_fft=int(self.config.mel_n_fft),
203
+ )
204
+ return self._whisper_feature_extractor
205
+
206
+ def _extract_mel(self, audio: Union[np.ndarray, torch.Tensor]) -> torch.Tensor:
207
+ if isinstance(audio, np.ndarray):
208
+ wav = torch.from_numpy(audio)
209
+ else:
210
+ wav = audio
211
+ wav = wav.to(dtype=torch.float32)
212
+ if wav.dim() == 1:
213
+ wav = wav.unsqueeze(0)
214
+
215
+ if bool(getattr(self.config, "use_whisper_feature_extractor", False)):
216
+ fe = self._get_whisper_feature_extractor()
217
+ wav_np = wav.detach().to("cpu", torch.float32).contiguous().numpy()
218
+ if wav_np.ndim == 2:
219
+ wav_np = wav_np[0]
220
+ feats = fe._np_extract_fbank_features(wav_np[None, ...], device="cpu")
221
+ mel = torch.from_numpy(feats[0])
222
+ else:
+ # No fallback extractor is implemented; previously `mel` was left
+ # unbound on this path, so the return below raised UnboundLocalError.
+ raise NotImplementedError(
+ "Set use_whisper_feature_extractor=True in MelConfig."
+ )
+
223
+ return mel.to(dtype=self.config.mel_dtype)
224
+
225
+ def _get_time_marker_token_ids(self, second: int) -> List[int]:
226
+ return [self._digit_token_ids[digit] for digit in str(second)]
227
+
228
+ def _build_audio_tokens_with_time_markers(self, audio_seq_len: int) -> List[int]:
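+ # Interleaves elapsed-second markers into the placeholder stream: every
+ # 25 audio tokens (2 s at 12.5 tokens/s) the digit tokens of the current
+ # second are inserted, e.g. [AUDIO]*25, "2", [AUDIO]*25, "4", ...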
229
+ total_duration_seconds = audio_seq_len / self.audio_tokens_per_second
230
+ num_full_seconds = int(total_duration_seconds)
231
+
232
+ token_ids: List[int] = []
233
+ audio_tokens_consumed = 0
234
+ for second in range(
235
+ self.time_marker_every_seconds,
236
+ num_full_seconds + 1,
237
+ self.time_marker_every_seconds,
238
+ ):
239
+ marker_pos = (
240
+ second // self.time_marker_every_seconds
241
+ ) * self.time_marker_every_audio_tokens
242
+ audio_segment_len = marker_pos - audio_tokens_consumed
243
+ if audio_segment_len > 0:
244
+ token_ids.extend([self.audio_token_id] * audio_segment_len)
245
+ audio_tokens_consumed += audio_segment_len
246
+ token_ids.extend(self._get_time_marker_token_ids(second))
247
+
248
+ remaining = audio_seq_len - audio_tokens_consumed
249
+ if remaining > 0:
250
+ token_ids.extend([self.audio_token_id] * remaining)
251
+ return token_ids
252
+
253
+ def _build_audio_placeholder_ids(self, num_audio_tokens: int) -> List[int]:
254
+ if self.enable_time_marker:
255
+ return self._build_audio_tokens_with_time_markers(num_audio_tokens)
256
+ return [self.audio_token_id] * num_audio_tokens
257
+
258
+ def _build_input_from_template(
259
+ self, num_audio_tokens: int, include_answer: bool = False
260
+ ) -> List[int]:
261
+ if self.chat_template is None:
262
+ raise ValueError("Chat template not loaded.")
263
+
264
+ input_ids: List[int] = []
265
+ for segment in self.chat_template:
266
+ seg_type = segment.type
267
+ if seg_type == "constant_text_token":
268
+ input_ids.extend(segment.text_ids.tolist())
269
+ elif seg_type in {"audio_contiguous", "audio_token"}:
270
+ input_ids.extend(self._build_audio_placeholder_ids(num_audio_tokens))
271
+ elif seg_type == "text_token":
272
+ text_token_key = segment.text_token_key
273
+ if "answer" in text_token_key.lower() and not include_answer:
274
+ break
275
+ if text_token_key not in self.custom_texts:
276
+ break
277
+ text_ids = self._base_tokenizer.encode(
278
+ self.custom_texts[text_token_key], add_special_tokens=False
279
+ )
280
+ input_ids.extend(text_ids)
281
+
282
+ return input_ids
283
+
284
+ def _build_default_prompt(self, text: str, has_audio: bool) -> str:
285
+ if has_audio:
286
+ return (
287
+ "<|im_start|>system\n"
288
+ "You are a helpful assistant.<|im_end|>\n"
289
+ "<|im_start|>user\n"
290
+ "<|audio_bos|><|AUDIO|><|audio_eos|>\n"
291
+ f"{text}<|im_end|>\n"
292
+ "<|im_start|>assistant\n"
293
+ )
294
+ return (
295
+ "<|im_start|>system\n"
296
+ "You are a helpful assistant.<|im_end|>\n"
297
+ "<|im_start|>user\n"
298
+ f"{text}<|im_end|>\n"
299
+ "<|im_start|>assistant\n"
300
+ )
301
+
302
+ def _build_input_from_prompt(self, prompt: str, token_lens: List[int]) -> List[int]:
303
+ spans = list(self._AUDIO_SPAN_RE.finditer(prompt))
304
+ if len(spans) != len(token_lens):
305
+ raise ValueError(
306
+ f"Audio placeholder count mismatch: found {len(spans)} spans in text, "
307
+ f"but got {len(token_lens)} audio inputs."
308
+ )
309
+
310
+ input_ids: List[int] = []
311
+ cursor = 0
312
+ for index, match in enumerate(spans):
313
+ prefix = prompt[cursor : match.start()]
314
+ if prefix:
315
+ input_ids.extend(
316
+ self._base_tokenizer.encode(prefix, add_special_tokens=False)
317
+ )
318
+
319
+ input_ids.append(self.audio_start_id)
320
+ input_ids.extend(self._build_audio_placeholder_ids(int(token_lens[index])))
321
+ input_ids.append(self.audio_end_id)
322
+ cursor = match.end()
323
+
324
+ suffix = prompt[cursor:]
325
+ if suffix:
326
+ input_ids.extend(
327
+ self._base_tokenizer.encode(suffix, add_special_tokens=False)
328
+ )
329
+ return input_ids
330
+
331
+ def __call__(
332
+ self,
333
+ *,
334
+ text: Union[str, Sequence[str], None] = None,
335
+ audios: Optional[Sequence[Union[np.ndarray, torch.Tensor]]] = None,
336
+ audio: Optional[Sequence[Union[np.ndarray, torch.Tensor]]] = None,
337
+ return_tensors: str = "pt",
338
+ **kwargs,
339
+ ):
340
+ if isinstance(text, (list, tuple)):
341
+ if len(text) != 1:
342
+ raise ValueError(f"Expected text batch size 1, got {len(text)}")
343
+ prompt_text = text[0]
344
+ else:
345
+ prompt_text = text
346
+
347
+ audio_list = audios if audios is not None else audio
348
+ audio_list = [] if audio_list is None else list(audio_list)
349
+
350
+ mels: List[torch.Tensor] = []
351
+ raw_lengths: List[int] = []
352
+ token_lens: List[int] = []
353
+ for one_audio in audio_list:
354
+ mel = self._extract_mel(one_audio)
355
+ raw_len = int(mel.shape[-1])
356
+ mels.append(mel)
357
+ raw_lengths.append(raw_len)
358
+ token_lens.append(self._conv3_downsample_len(raw_len))
359
+
360
+ if mels:
361
+ max_length = max(raw_lengths)
362
+ audio_batch = torch.zeros(
363
+ (len(mels), self.config.mel_dim, max_length),
364
+ dtype=self.config.mel_dtype,
365
+ )
366
+ for index, mel in enumerate(mels):
367
+ audio_batch[index, :, : mel.shape[-1]] = mel
368
+ seqlens_tensor = torch.tensor(raw_lengths, dtype=torch.long)
369
+ else:
370
+ audio_batch = None
371
+ seqlens_tensor = None
372
+
373
+ if prompt_text is not None:
374
+ if self._AUDIO_SPAN_RE.search(prompt_text) is None and audio_list:
375
+ prompt_text = self._build_default_prompt(prompt_text, has_audio=True)
376
+ elif self._AUDIO_SPAN_RE.search(prompt_text) is None and not audio_list:
377
+ prompt_text = self._build_default_prompt(prompt_text, has_audio=False)
378
+ input_ids_list = self._build_input_from_prompt(prompt_text, token_lens)
379
+ elif self.chat_template is not None:
380
+ input_ids_list = self._build_input_from_template(
381
+ token_lens[0] if token_lens else 0
382
+ )
383
+ else:
384
+ raise ValueError(
385
+ "Either provide text or load a chat_template before calling the processor."
386
+ )
387
+
388
+ input_ids_tensor = torch.tensor([input_ids_list], dtype=torch.long)
389
+ attention_mask_tensor = torch.ones_like(input_ids_tensor)
390
+
391
+ data = {
392
+ "input_ids": input_ids_tensor,
393
+ "attention_mask": attention_mask_tensor,
394
+ }
395
+ if audio_batch is not None:
396
+ data["audio_data"] = audio_batch
397
+ data["audio_data_seqlens"] = seqlens_tensor
398
+ return BatchEncoding(data=data, tensor_type=return_tensors)
399
+
400
+ def batch_decode(self, *args, **kwargs):
401
+ return self._base_tokenizer.batch_decode(*args, **kwargs)
402
+
403
+ def decode(self, *args, **kwargs):
404
+ return self._base_tokenizer.decode(*args, **kwargs)
405
+
406
+
407
+ __all__ = ["MelConfig", "MossMusicProcessor"]
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,271 @@
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<tool_response>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "151666": {
190
+ "content": "</tool_response>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "151667": {
198
+ "content": "<think>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "151668": {
206
+ "content": "</think>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ },
213
+ "151669": {
214
+ "content": "<|system|>",
215
+ "lstrip": false,
216
+ "normalized": false,
217
+ "rstrip": false,
218
+ "single_word": false,
219
+ "special": false
220
+ },
221
+ "151670": {
222
+ "content": "<|user|>",
223
+ "lstrip": false,
224
+ "normalized": false,
225
+ "rstrip": false,
226
+ "single_word": false,
227
+ "special": false
228
+ },
229
+ "151671": {
230
+ "content": "<|assistant|>",
231
+ "lstrip": false,
232
+ "normalized": false,
233
+ "rstrip": false,
234
+ "single_word": false,
235
+ "special": false
236
+ },
237
+ "151672": {
238
+ "content": "<|eot|>",
239
+ "lstrip": false,
240
+ "normalized": false,
241
+ "rstrip": false,
242
+ "single_word": false,
243
+ "special": false
244
+ }
245
+ },
246
+ "additional_special_tokens": [
247
+ "<|im_start|>",
248
+ "<|im_end|>",
249
+ "<|object_ref_start|>",
250
+ "<|object_ref_end|>",
251
+ "<|box_start|>",
252
+ "<|box_end|>",
253
+ "<|quad_start|>",
254
+ "<|quad_end|>",
255
+ "<|vision_start|>",
256
+ "<|vision_end|>",
257
+ "<|vision_pad|>",
258
+ "<|image_pad|>",
259
+ "<|video_pad|>"
260
+ ],
261
+ "bos_token": null,
262
+ "clean_up_tokenization_spaces": false,
263
+ "eos_token": "<|im_end|>",
264
+ "errors": "replace",
265
+ "extra_special_tokens": {},
266
+ "model_max_length": 131072,
267
+ "pad_token": "<|endoftext|>",
268
+ "split_special_tokens": false,
269
+ "tokenizer_class": "Qwen2Tokenizer",
270
+ "unk_token": null
271
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff