rsvalerio committed
Commit e4e7a21 · verified · Parent: eba6f16

Upload folder using huggingface_hub

Files changed (5):
  1. added_tokens.json +24 -0
  2. config.yaml +9 -0
  3. convert.py +240 -0
  4. merges.txt +0 -0
  5. vocab.json +0 -0
added_tokens.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "</tool_call>": 151658,
+   "<tool_call>": 151657,
+   "<|box_end|>": 151649,
+   "<|box_start|>": 151648,
+   "<|endoftext|>": 151643,
+   "<|file_sep|>": 151664,
+   "<|fim_middle|>": 151660,
+   "<|fim_pad|>": 151662,
+   "<|fim_prefix|>": 151659,
+   "<|fim_suffix|>": 151661,
+   "<|im_end|>": 151645,
+   "<|im_start|>": 151644,
+   "<|image_pad|>": 151655,
+   "<|object_ref_end|>": 151647,
+   "<|object_ref_start|>": 151646,
+   "<|quad_end|>": 151651,
+   "<|quad_start|>": 151650,
+   "<|repo_name|>": 151663,
+   "<|video_pad|>": 151656,
+   "<|vision_end|>": 151653,
+   "<|vision_pad|>": 151654,
+   "<|vision_start|>": 151652
+ }
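
These are the stock Qwen2.5 special-token IDs (151643-151664) layered on top of the base BPE vocabulary in vocab.json/merges.txt. As a quick sanity check, a sketch like the following (assuming convert.py has already been run so the full tokenizer files are present in this folder, and that transformers is installed) could confirm the shipped tokenizer resolves the same IDs:

# Sketch only, not part of this commit: verify added_tokens.json against the tokenizer.
import json
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(".")  # loads the tokenizer files from this folder
with open("added_tokens.json") as f:
    added = json.load(f)

for token, expected_id in added.items():
    assert tok.convert_tokens_to_ids(token) == expected_id, f"mismatch for {token}"
print(f"all {len(added)} added tokens match")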
config.yaml ADDED
@@ -0,0 +1,9 @@
+ source_model: "codefuse-ai/C2LLM-0.5B"
+ target_format: "coreml"
+ hf_repo: "rsvalerio/c2llm-0.5b-coreml"
+ hf_revision: "main"
+ artifacts:
+   - "model.mlpackage/**"
+   - "tokenizer.json"
+   - "tokenizer_config.json"
+   - "special_tokens_map.json"
convert.py ADDED
@@ -0,0 +1,240 @@
+ """Convert codefuse-ai/C2LLM-0.5B to CoreML .mlpackage with ANE support.
+
+ C2LLM-0.5B is a code embedding model built on Qwen-2.5-Coder with
+ a Pooling-by-Multihead-Attention (PMA) head. The model outputs pooled
+ embeddings directly — no external mean pooling needed.
+
+ Uses ``torch.export`` with dynamic shapes so coremltools receives a graph
+ that already encodes symbolic dimensions. Both batch and sequence length
+ are dynamic, enabling true batched inference on CoreML.
+
+ Produces:
+ - model.mlpackage/ (FP16, variable-length shapes for ANE)
+ - tokenizer.json (HF fast tokenizer)
+ - tokenizer_config.json (tokenizer settings)
+
+ Usage:
+     uv run python convert.py
+ """
+
+ import logging
+ import math
+ import shutil
+ from pathlib import Path
+
+ import coremltools as ct
+ import numpy as np
+ import torch
+ from torch import Tensor, nn
+ from transformers import AutoModel, AutoTokenizer
+ from transformers.models.qwen2 import modeling_qwen2
+
+ log = logging.getLogger(__name__)
+
+ MODEL_ID = "codefuse-ai/C2LLM-0.5B"
+ OUTPUT_DIR = Path(".")
+ MAX_SEQ_LEN = 8192
+
+ EXPECTED_OUTPUTS = ["model.mlpackage", "tokenizer.json", "tokenizer_config.json"]
+
+
+ def _rotate_half_chunked(x: Tensor) -> Tensor:
+     """``torch.chunk`` avoids the dynamic ``x.shape[-1] // 2`` int op that
+     coremltools cannot convert."""
+     x1, x2 = torch.chunk(x, 2, dim=-1)
+     return torch.cat((-x2, x1), dim=-1)
+
+
+ def _patch_mab_forward(mab_cls) -> None:
+     """Monkey-patch MAB_POST / MAB_POST_v2 forward to use reshape+transpose
+     instead of split+cat for multi-head attention.
+
+     The original code does:
+         Q_ = torch.cat(Q.split(dim_split, 2), 0)  # [B, S, C] → [B*H, S, C//H]
+         O = torch.cat(result.split(B, 0), 2)  # [B*H, S, C//H] → [B, S, C]
+
+     The merge step ``split(B, 0)`` uses batch_size as the chunk count, which
+     torch.export cannot handle as a symbolic dimension. We replace both with
+     reshape+transpose which keeps everything symbolic-friendly.
+     """
+     original_init = mab_cls.__init__
+
+     def patched_init(self, *args, **kwargs):
+         original_init(self, *args, **kwargs)
+         # Store num_heads so we can use it in forward
+         self._num_heads = self.num_heads
+
+     def patched_forward(self, Q, K, pad_mask=None):
+         Q_proj = self.fc_q(Q)
+         K_, V_ = self.fc_k(K), self.fc_v(K)
+
+         B = Q.size(0)
+         H = self._num_heads
+         dim_split = self.dim_V // H
+
+         # Split heads via reshape: [B, S, C] → [B, S, H, C//H] → [B*H, S, C//H]
+         def split_heads(x):
+             s = x.size(1)
+             return x.reshape(B, s, H, dim_split).transpose(1, 2).reshape(B * H, s, dim_split)
+
+         Q_ = split_heads(Q_proj)
+         K_ = split_heads(K_)
+         V_ = split_heads(V_)
+
+         if pad_mask is not None:
+             # Expand mask for multi-head: [B, S] → [B*H, 1, S]
+             pad_mask = pad_mask.unsqueeze(1).unsqueeze(1)  # [B, 1, 1, S]
+             pad_mask = pad_mask.expand(-1, H, -1, -1).reshape(B * H, 1, -1)
+
+         A = Q_.bmm(K_.transpose(1, 2)) / math.sqrt(self.dim_V)
+
+         if pad_mask is not None:
+             A = A.masked_fill(pad_mask == 0, float("-inf"))
+
+         A = torch.softmax(A, dim=2)
+
+         result = A.bmm(V_)  # [B*H, seeds, C//H]
+
+         # Merge heads via reshape: [B*H, seeds, C//H] → [B, H, seeds, C//H] → [B, seeds, C]
+         seeds = result.size(1)
+         O = result.reshape(B, H, seeds, dim_split).transpose(1, 2).reshape(B, seeds, H * dim_split)
+
+         # Residual + layer norm (v2 uses Q_proj, v1 uses Q)
+         if hasattr(self, "ln1"):
+             # MAB_POST_v2 style: residual from projected Q
+             O = Q_proj + O
+             O = self.ln1(O)
+         else:
+             O = Q + O
+             if hasattr(self, "ln0"):
+                 O = self.ln0(O)
+
+         return O
+
+     mab_cls.__init__ = patched_init
+     mab_cls.forward = patched_forward
+
+
+ class PooledEmbeddingWrapper(nn.Module):
+     """Wraps the C2LLM model to return the pooled embedding tensor.
+
+     C2LLM uses PMA (Pooling by Multihead Attention) internally,
+     so the output is already [batch, dim] — no mean pooling needed.
+     We call the model's encode() or forward() to get the final embedding,
+     then L2-normalize.
+     """
+
+     def __init__(self, model: nn.Module) -> None:
+         super().__init__()
+         self.model = model
+
+     def forward(self, input_ids: Tensor, attention_mask: Tensor) -> Tensor:
+         # C2LLM.forward() returns {"sentence_embedding": tensor} with return_dict=True,
+         # or (tensor,) with return_dict=False. Use return_dict=False for cleaner export.
+         out = self.model(input_ids=input_ids, attention_mask=attention_mask, return_dict=False)
+         emb = out[0]
+
+         # L2 normalize
+         emb = torch.nn.functional.normalize(emb, p=2, dim=-1)
+         return emb
+
+
+ def convert() -> None:
+     log.info("Loading %s...", MODEL_ID)
+
+     # C2LLM's modeling code references is_torch_npu_available without importing it.
+     # Inject it into builtins so it's available when the module loads.
+     import builtins
+     builtins.is_torch_npu_available = lambda: False
+
+     model = AutoModel.from_pretrained(
+         MODEL_ID,
+         trust_remote_code=True,
+         attn_implementation="eager",
+         torch_dtype=torch.float32,
+     )
+     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
+     model.eval()
+
+     # Patch rotate_half for coremltools compatibility (Qwen2 architecture)
+     modeling_qwen2.rotate_half = _rotate_half_chunked
+
+     # Patch PMA's multi-head attention to use reshape+transpose instead of
+     # split(batch_size, 0)+cat which breaks torch.export with dynamic batch.
+     # We find the MAB classes from the loaded model's module hierarchy.
+     mab_classes_patched = set()
+     for module in model.modules():
+         cls = type(module)
+         cls_name = cls.__name__
+         if cls_name.startswith("MAB_POST"):
+             # Set _num_heads on already-constructed instances
+             module._num_heads = module.num_heads
+             if cls not in mab_classes_patched:
+                 log.info("Patching %s.forward for dynamic batch export", cls_name)
+                 _patch_mab_forward(cls)
+                 mab_classes_patched.add(cls)
+
+     wrapper = PooledEmbeddingWrapper(model)
+     wrapper.eval()
+
+     dummy = tokenizer(["hello world", "foo bar"], return_tensors="pt", padding=True)
+
+     # Verify output shape before export
+     with torch.no_grad():
+         test_out = wrapper(dummy["input_ids"], dummy["attention_mask"])
+     log.info("Test output shape: %s", test_out.shape)
+     assert test_out.dim() == 2, f"Expected 2D output [batch, dim], got shape {test_out.shape}"
+     log.info("Embedding dimension: %d", test_out.shape[-1])
+
+     batch_dim = torch.export.Dim("batch", min=1, max=512)
+     seq_dim = torch.export.Dim("seq", min=1, max=MAX_SEQ_LEN)
+
+     log.info("Exporting model (dynamic batch + dynamic seq)...")
+     exported = torch.export.export(
+         wrapper,
+         (dummy["input_ids"], dummy["attention_mask"]),
+         dynamic_shapes={
+             "input_ids": {0: batch_dim, 1: seq_dim},
+             "attention_mask": {0: batch_dim, 1: seq_dim},
+         },
+         strict=False,
+     ).run_decompositions()
+
+     # Strip _assert_tensor_metadata nodes added by PyTorch >= 2.7 that
+     # coremltools doesn't understand yet.
+     graph = exported.graph_module.graph
+     for node in list(graph.nodes):
+         if "_assert" in str(node.target):
+             graph.erase_node(node)
+     graph.lint()
+     exported.graph_module.recompile()
+
+     log.info("Converting to CoreML...")
+     mlmodel = ct.convert(
+         exported,
+         compute_units=ct.ComputeUnit.ALL,
+         compute_precision=ct.precision.FLOAT16,
+     )
+
+     output_path = OUTPUT_DIR / "model.mlpackage"
+     if output_path.exists():
+         shutil.rmtree(output_path)
+     log.info("Saving %s...", output_path)
+     mlmodel.save(str(output_path))
+
+     log.info("Saving tokenizer...")
+     tokenizer.save_pretrained(str(OUTPUT_DIR))
+
+     missing = [name for name in EXPECTED_OUTPUTS if not (OUTPUT_DIR / name).exists()]
+     if missing:
+         raise FileNotFoundError(f"Expected outputs not found: {', '.join(missing)}")
+
+     for name in EXPECTED_OUTPUTS:
+         log.info(" ok %s", name)
+
+     log.info("Done.")
+
+
+ if __name__ == "__main__":
+     logging.basicConfig(level=logging.INFO, format="%(message)s")
+     convert()
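
As a follow-up to the conversion script, a minimal inference sketch showing how the exported package might be exercised with coremltools on macOS. The input names below mirror the wrapper's argument names and the output is taken positionally; both are assumptions to confirm against the converted model's spec:

# Sketch only, not part of this commit: load model.mlpackage and embed one snippet.
import coremltools as ct
import numpy as np
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(".")
mlmodel = ct.models.MLModel("model.mlpackage")

enc = tok(["def add(a, b): return a + b"], return_tensors="np", padding=True)
out = mlmodel.predict({
    "input_ids": enc["input_ids"].astype(np.int32),
    "attention_mask": enc["attention_mask"].astype(np.int32),
})
emb = next(iter(out.values()))  # L2-normalized [batch, dim] embedding
print(emb.shape)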
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
vocab.json ADDED
The diff for this file is too large to render. See raw diff