rsvalerio committed
Commit e4e7a21 · verified · Parent: eba6f16

Upload folder using huggingface_hub

Files changed (5):
  1. added_tokens.json +24 -0
  2. config.yaml +9 -0
  3. convert.py +240 -0
  4. merges.txt +0 -0
  5. vocab.json +0 -0
added_tokens.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "</tool_call>": 151658,
+   "<tool_call>": 151657,
+   "<|box_end|>": 151649,
+   "<|box_start|>": 151648,
+   "<|endoftext|>": 151643,
+   "<|file_sep|>": 151664,
+   "<|fim_middle|>": 151660,
+   "<|fim_pad|>": 151662,
+   "<|fim_prefix|>": 151659,
+   "<|fim_suffix|>": 151661,
+   "<|im_end|>": 151645,
+   "<|im_start|>": 151644,
+   "<|image_pad|>": 151655,
+   "<|object_ref_end|>": 151647,
+   "<|object_ref_start|>": 151646,
+   "<|quad_end|>": 151651,
+   "<|quad_start|>": 151650,
+   "<|repo_name|>": 151663,
+   "<|video_pad|>": 151656,
+   "<|vision_end|>": 151653,
+   "<|vision_pad|>": 151654,
+   "<|vision_start|>": 151652
+ }
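
These are the stock Qwen2.5 special-token IDs (151643-151664) layered on top of the base BPE vocabulary in vocab.json/merges.txt. As a quick sanity check, a sketch like the following (assuming convert.py has already been run so the full tokenizer files are present in this folder, and that transformers is installed) could confirm the shipped tokenizer resolves the same IDs:

# Sketch only, not part of this commit: verify added_tokens.json against the tokenizer.
import json
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(".")  # loads the tokenizer files from this folder
with open("added_tokens.json") as f:
    added = json.load(f)

for token, expected_id in added.items():
    assert tok.convert_tokens_to_ids(token) == expected_id, f"mismatch for {token}"
print(f"all {len(added)} added tokens match")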
config.yaml ADDED
@@ -0,0 +1,9 @@
+ source_model: "codefuse-ai/C2LLM-0.5B"
+ target_format: "coreml"
+ hf_repo: "rsvalerio/c2llm-0.5b-coreml"
+ hf_revision: "main"
+ artifacts:
+   - "model.mlpackage/**"
+   - "tokenizer.json"
+   - "tokenizer_config.json"
+   - "special_tokens_map.json"
convert.py ADDED
@@ -0,0 +1,240 @@
+ """Convert codefuse-ai/C2LLM-0.5B to CoreML .mlpackage with ANE support.
+
+ C2LLM-0.5B is a code embedding model built on Qwen-2.5-Coder with
+ a Pooling-by-Multihead-Attention (PMA) head. The model outputs pooled
+ embeddings directly — no external mean pooling needed.
+
+ Uses ``torch.export`` with dynamic shapes so coremltools receives a graph
+ that already encodes symbolic dimensions. Both batch and sequence length
+ are dynamic, enabling true batched inference on CoreML.
+
+ Produces:
+ - model.mlpackage/ (FP16, variable-length shapes for ANE)
+ - tokenizer.json (HF fast tokenizer)
+ - tokenizer_config.json (tokenizer settings)
+
+ Usage:
+     uv run python convert.py
+ """
+
+ import logging
+ import math
+ import shutil
+ from pathlib import Path
+
+ import coremltools as ct
+ import numpy as np
+ import torch
+ from torch import Tensor, nn
+ from transformers import AutoModel, AutoTokenizer
+ from transformers.models.qwen2 import modeling_qwen2
+
+ log = logging.getLogger(__name__)
+
+ MODEL_ID = "codefuse-ai/C2LLM-0.5B"
+ OUTPUT_DIR = Path(".")
+ MAX_SEQ_LEN = 8192
+
+ EXPECTED_OUTPUTS = ["model.mlpackage", "tokenizer.json", "tokenizer_config.json"]
+
+
+ def _rotate_half_chunked(x: Tensor) -> Tensor:
+     """``torch.chunk`` avoids the dynamic ``x.shape[-1] // 2`` int op that
+     coremltools cannot convert."""
+     x1, x2 = torch.chunk(x, 2, dim=-1)
+     return torch.cat((-x2, x1), dim=-1)
+
+
+ def _patch_mab_forward(mab_cls) -> None:
+     """Monkey-patch MAB_POST / MAB_POST_v2 forward to use reshape+transpose
+     instead of split+cat for multi-head attention.
+
+     The original code does:
+         Q_ = torch.cat(Q.split(dim_split, 2), 0)  # [B, S, C] → [B*H, S, C//H]
+         O = torch.cat(result.split(B, 0), 2)  # [B*H, S, C//H] → [B, S, C]
+
+     The merge step ``split(B, 0)`` uses batch_size as the chunk count, which
+     torch.export cannot handle as a symbolic dimension. We replace both with
+     reshape+transpose which keeps everything symbolic-friendly.
+     """
+     original_init = mab_cls.__init__
+
+     def patched_init(self, *args, **kwargs):
+         original_init(self, *args, **kwargs)
+         # Store num_heads so we can use it in forward
+         self._num_heads = self.num_heads
+
+     def patched_forward(self, Q, K, pad_mask=None):
+         Q_proj = self.fc_q(Q)
+         K_, V_ = self.fc_k(K), self.fc_v(K)
+
+         B = Q.size(0)
+         H = self._num_heads
+         dim_split = self.dim_V // H
+
+         # Split heads via reshape: [B, S, C] → [B, S, H, C//H] → [B*H, S, C//H]
+         def split_heads(x):
+             s = x.size(1)
+             return x.reshape(B, s, H, dim_split).transpose(1, 2).reshape(B * H, s, dim_split)
+
+         Q_ = split_heads(Q_proj)
+         K_ = split_heads(K_)
+         V_ = split_heads(V_)
+
+         if pad_mask is not None:
+             # Expand mask for multi-head: [B, S] → [B*H, 1, S]
+             pad_mask = pad_mask.unsqueeze(1).unsqueeze(1)  # [B, 1, 1, S]
+             pad_mask = pad_mask.expand(-1, H, -1, -1).reshape(B * H, 1, -1)
+
+         A = Q_.bmm(K_.transpose(1, 2)) / math.sqrt(self.dim_V)
+
+         if pad_mask is not None:
+             A = A.masked_fill(pad_mask == 0, float("-inf"))
+
+         A = torch.softmax(A, dim=2)
+
+         result = A.bmm(V_)  # [B*H, seeds, C//H]
+
+         # Merge heads via reshape: [B*H, seeds, C//H] → [B, H, seeds, C//H] → [B, seeds, C]
+         seeds = result.size(1)
+         O = result.reshape(B, H, seeds, dim_split).transpose(1, 2).reshape(B, seeds, H * dim_split)
+
+         # Residual + layer norm (v2 uses Q_proj, v1 uses Q)
+         if hasattr(self, "ln1"):
+             # MAB_POST_v2 style: residual from projected Q
+             O = Q_proj + O
+             O = self.ln1(O)
+         else:
+             O = Q + O
+             if hasattr(self, "ln0"):
+                 O = self.ln0(O)
+
+         return O
+
+     mab_cls.__init__ = patched_init
+     mab_cls.forward = patched_forward
+
+
+ class PooledEmbeddingWrapper(nn.Module):
+     """Wraps the C2LLM model to return the pooled embedding tensor.
+
+     C2LLM uses PMA (Pooling by Multihead Attention) internally,
+     so the output is already [batch, dim] — no mean pooling needed.
+     We call the model's encode() or forward() to get the final embedding,
+     then L2-normalize.
+     """
+
+     def __init__(self, model: nn.Module) -> None:
+         super().__init__()
+         self.model = model
+
+     def forward(self, input_ids: Tensor, attention_mask: Tensor) -> Tensor:
+         # C2LLM.forward() returns {"sentence_embedding": tensor} with return_dict=True,
+         # or (tensor,) with return_dict=False. Use return_dict=False for cleaner export.
+         out = self.model(input_ids=input_ids, attention_mask=attention_mask, return_dict=False)
+         emb = out[0]
+
+         # L2 normalize
+         emb = torch.nn.functional.normalize(emb, p=2, dim=-1)
+         return emb
+
+
+ def convert() -> None:
+     log.info("Loading %s...", MODEL_ID)
+
+     # C2LLM's modeling code references is_torch_npu_available without importing it.
+     # Inject it into builtins so it's available when the module loads.
+     import builtins
+     builtins.is_torch_npu_available = lambda: False
+
+     model = AutoModel.from_pretrained(
+         MODEL_ID,
+         trust_remote_code=True,
+         attn_implementation="eager",
+         torch_dtype=torch.float32,
+     )
+     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
+     model.eval()
+
+     # Patch rotate_half for coremltools compatibility (Qwen2 architecture)
+     modeling_qwen2.rotate_half = _rotate_half_chunked
+
+     # Patch PMA's multi-head attention to use reshape+transpose instead of
+     # split(batch_size, 0)+cat which breaks torch.export with dynamic batch.
+     # We find the MAB classes from the loaded model's module hierarchy.
+     mab_classes_patched = set()
+     for module in model.modules():
+         cls = type(module)
+         cls_name = cls.__name__
+         if cls_name.startswith("MAB_POST"):
+             # Set _num_heads on already-constructed instances
+             module._num_heads = module.num_heads
+             if cls not in mab_classes_patched:
+                 log.info("Patching %s.forward for dynamic batch export", cls_name)
+                 _patch_mab_forward(cls)
+                 mab_classes_patched.add(cls)
+
+     wrapper = PooledEmbeddingWrapper(model)
+     wrapper.eval()
+
+     dummy = tokenizer(["hello world", "foo bar"], return_tensors="pt", padding=True)
+
+     # Verify output shape before export
+     with torch.no_grad():
+         test_out = wrapper(dummy["input_ids"], dummy["attention_mask"])
+     log.info("Test output shape: %s", test_out.shape)
+     assert test_out.dim() == 2, f"Expected 2D output [batch, dim], got shape {test_out.shape}"
+     log.info("Embedding dimension: %d", test_out.shape[-1])
+
+     batch_dim = torch.export.Dim("batch", min=1, max=512)
+     seq_dim = torch.export.Dim("seq", min=1, max=MAX_SEQ_LEN)
+
+     log.info("Exporting model (dynamic batch + dynamic seq)...")
+     exported = torch.export.export(
+         wrapper,
+         (dummy["input_ids"], dummy["attention_mask"]),
+         dynamic_shapes={
+             "input_ids": {0: batch_dim, 1: seq_dim},
+             "attention_mask": {0: batch_dim, 1: seq_dim},
+         },
+         strict=False,
+     ).run_decompositions()
+
+     # Strip _assert_tensor_metadata nodes added by PyTorch >= 2.7 that
+     # coremltools doesn't understand yet.
+     graph = exported.graph_module.graph
+     for node in list(graph.nodes):
+         if "_assert" in str(node.target):
+             graph.erase_node(node)
+     graph.lint()
+     exported.graph_module.recompile()
+
+     log.info("Converting to CoreML...")
+     mlmodel = ct.convert(
+         exported,
+         compute_units=ct.ComputeUnit.ALL,
+         compute_precision=ct.precision.FLOAT16,
+     )
+
+     output_path = OUTPUT_DIR / "model.mlpackage"
+     if output_path.exists():
+         shutil.rmtree(output_path)
+     log.info("Saving %s...", output_path)
+     mlmodel.save(str(output_path))
+
+     log.info("Saving tokenizer...")
+     tokenizer.save_pretrained(str(OUTPUT_DIR))
+
+     missing = [name for name in EXPECTED_OUTPUTS if not (OUTPUT_DIR / name).exists()]
+     if missing:
+         raise FileNotFoundError(f"Expected outputs not found: {', '.join(missing)}")
+
+     for name in EXPECTED_OUTPUTS:
+         log.info(" ok %s", name)
+
+     log.info("Done.")
+
+
+ if __name__ == "__main__":
+     logging.basicConfig(level=logging.INFO, format="%(message)s")
+     convert()
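
As a follow-up to the conversion script, a minimal inference sketch showing how the exported package might be exercised with coremltools on macOS. The input names below mirror the wrapper's argument names and the output is taken positionally; both are assumptions to confirm against the converted model's spec:

# Sketch only, not part of this commit: load model.mlpackage and embed one snippet.
import coremltools as ct
import numpy as np
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(".")
mlmodel = ct.models.MLModel("model.mlpackage")

enc = tok(["def add(a, b): return a + b"], return_tensors="np", padding=True)
out = mlmodel.predict({
    "input_ids": enc["input_ids"].astype(np.int32),
    "attention_mask": enc["attention_mask"].astype(np.int32),
})
emb = next(iter(out.values()))  # L2-normalized [batch, dim] embedding
print(emb.shape)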
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
vocab.json ADDED
The diff for this file is too large to render. See raw diff