Instructions to use BucketOfFish/simplified_phi2 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use BucketOfFish/simplified_phi2 with Transformers:

# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="BucketOfFish/simplified_phi2", trust_remote_code=True)

# Load model directly
from transformers import AutoModel
model = AutoModel.from_pretrained("BucketOfFish/simplified_phi2", trust_remote_code=True, dtype="auto")

Notebooks
Google Colab
Kaggle
Local Apps

vLLM

How to use BucketOfFish/simplified_phi2 with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "BucketOfFish/simplified_phi2"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "BucketOfFish/simplified_phi2",
		"prompt": "Once upon a time,",
		"max_tokens": 512,
		"temperature": 0.5
	}'

Use Docker

docker model run hf.co/BucketOfFish/simplified_phi2

SGLang

How to use BucketOfFish/simplified_phi2 with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "BucketOfFish/simplified_phi2" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "BucketOfFish/simplified_phi2",
		"prompt": "Once upon a time,",
		"max_tokens": 512,
		"temperature": 0.5
	}'

Use Docker images

docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "BucketOfFish/simplified_phi2" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "BucketOfFish/simplified_phi2",
		"prompt": "Once upon a time,",
		"max_tokens": 512,
		"temperature": 0.5
	}'

Docker Model Runner
How to use BucketOfFish/simplified_phi2 with Docker Model Runner:
```
docker model run hf.co/BucketOfFish/simplified_phi2
```

simplified_phi2 / phi2_model.py

BucketOfFish

Passing KV cache through iterations

c07c430 over 2 years ago

raw

history blame contribute delete

6.3 kB

	import torch
	import torch.nn as nn
	from transformers import PreTrainedModel
	from transformers.modeling_outputs import CausalLMOutputWithPast
	from typing import Any, cast

	from .attention import ParallelAttentionBlock, KVCache
	from .phi2_configuration import Phi2Config


	class Phi2PreTrainedModel(PreTrainedModel):
	    """Common base class for the Phi-2 models defined in this file.

	    Supplies the weight-initialization routine and the hook that builds the
	    arguments handed to ``forward()`` during generation.
	    """

	    config_class = Phi2Config  # not necessary unless you want to register model with auto classes
	    supports_gradient_checkpointing = False
	    # _no_split_modules = ["ParallelAttentionBlock"]

	    def __init__(self, config: Phi2Config):
	        super().__init__(config)
	        self.config = config

	    def _init_weights(self, module: nn.Module) -> None:
	        """Initialize the parameters of a single submodule in place.

	        Linear and embedding weights are drawn from a normal distribution with
	        standard deviation ``config.weight_initialization_range``; biases and
	        the embedding row at ``padding_idx`` are zeroed; LayerNorm weights are
	        set to one. Other module types are left untouched.
	        """
	        # initialize weights - will get overwritten by saved weights in from_pretrained() if they exist
	        if isinstance(module, (nn.Linear,)):
	            module.weight.data.normal_(mean=0.0, std=self.config.weight_initialization_range)
	            if module.bias is not None:
	                module.bias.data.zero_()
	        elif isinstance(module, nn.Embedding):
	            module.weight.data.normal_(mean=0.0, std=self.config.weight_initialization_range)
	            if module.padding_idx is not None:
	                module.weight.data[module.padding_idx].zero_()
	        elif isinstance(module, nn.LayerNorm):
	            if module.bias is not None:
	                module.bias.data.zero_()
	            module.weight.data.fill_(1.0)

	    def prepare_inputs_for_generation(
	        self,
	        input_ids: torch.LongTensor,  # dim: (batch_size, seq_len)
	        past_key_values: KVCache | None = None,  # has to be named this
	        key_padding_mask: torch.LongTensor | torch.BoolTensor | None = None,
	        **kwargs,  # has to be here
	    ) -> dict[str, Any]:
	        """Build the keyword arguments that are passed to ``forward()``.

	        Args:
	            input_ids: Token ids, dim (batch_size, seq_len).
	            past_key_values: Cache from a previous step, or ``None``.
	            key_padding_mask: Forwarded unchanged.
	            **kwargs: Accepted and ignored.

	        Returns:
	            A dict with the keys ``"input_ids"``, ``"kv_cache"`` and
	            ``"key_padding_mask"``.
	        """
	        kv_cache = past_key_values
	        # Truthiness (not `is None`) is kept deliberately: any falsy cache
	        # object is treated the same as a missing one.
	        if not kv_cache:
	            # No usable cache yet: start a fresh one and feed the full prompt.
	            kv_cache = KVCache(
	                max_seqlen=self.config.initial_cos_sin_cache_len,
	                max_batch_size=input_ids.shape[0],
	                seqlen_offset=0,
	                batch_size_offset=0,
	                kv_block_map={},
	                lengths_per_sample=None,
	            )
	        else:
	            # assume that `kv_cache` has cached all tokens up to the last token in `input_ids`
	            # (the cache object passed in is updated in place)
	            kv_cache.seqlen_offset = input_ids.shape[1] - 1
	            # Only the newest token still needs to go through the model.
	            input_ids = cast(torch.LongTensor, input_ids[:, -1].unsqueeze(-1))

	        return {  # to be passed to forward()
	            "input_ids": input_ids,
	            "kv_cache": kv_cache,
	            "key_padding_mask": key_padding_mask,
	        }


	class Embedding(nn.Module):
	    """Token embedding with dropout."""

	    def __init__(
	        self,
	        vocab_size: int,
	        d_embedding: int,
	        embd_pdrop: float,
	    ) -> None:
	        """Create the lookup table and the dropout layer.

	        Args:
	            vocab_size: Number of rows in the embedding table.
	            d_embedding: Size of each embedding vector.
	            embd_pdrop: Dropout probability applied to the embedded tokens.
	        """
	        super().__init__()
	        self.embeddings = nn.Embedding(vocab_size, d_embedding)
	        self.dropout = nn.Dropout(embd_pdrop)

	    def forward(
	        self,
	        input_ids: torch.LongTensor,  # dim: (batch_size, seq_len)
	    ) -> torch.FloatTensor:
	        """Embed ``input_ids`` and apply dropout.

	        Any leading dimensions are flattened into one, so the result has
	        dim (-1, seq_len, d_embedding).
	        """
	        # Collapse everything except the last (sequence) dimension.
	        flat_ids = input_ids.view(-1, input_ids.size()[-1])
	        x = self.embeddings(flat_ids)  # dim: (batch_size, seq_len, d_embedding)
	        x = self.dropout(x)
	        return x


	class Phi2Model(Phi2PreTrainedModel):
	    """Token embedding followed by a stack of ``ParallelAttentionBlock``s."""

	    def __init__(self, config: Phi2Config) -> None:
	        """Build the embedding and ``config.n_attn_blocks`` attention blocks."""
	        super().__init__(config)
	        self.embedding = Embedding(
	            vocab_size=config.vocab_size,
	            d_embedding=config.d_embedding,
	            embd_pdrop=config.embd_pdrop,
	        )
	        self.parallel_blocks = nn.ModuleList([
	            ParallelAttentionBlock(
	                resid_pdrop=config.resid_pdrop,
	                layer_norm_epsilon=config.layer_norm_epsilon,
	                d_embedding=config.d_embedding,
	                n_attn_heads=config.n_attn_heads,
	                block_n=i,  # position of this block in the stack
	                initial_cos_sin_cache_len=config.initial_cos_sin_cache_len,
	                attn_pdrop=config.attn_pdrop,
	                use_flash_rotary=config.use_flash_rotary,
	                use_flash_attn=config.use_flash_attn,
	                use_fused_dense=config.use_fused_dense,
	                checkpointing=config.checkpointing,
	            )
	            for i in range(config.n_attn_blocks)
	        ])
	        self.gradient_checkpointing_disable()  # https://github.com/cybertronai/gradient-checkpointing - I think this is turned off due to flash attention?
	        self.post_init()  # calls self._init_weights() for all modules

	    # Disabled accessors, kept for reference (they were inside a bare string
	    # literal in the class body, so they were never defined):
	    #
	    # def get_input_embeddings(self) -> nn.Embedding:
	    #     return self.embedding.embeddings
	    #
	    # def set_input_embeddings(self, new_embeddings: nn.Embedding) -> None:
	    #     self.embedding.embeddings = new_embeddings

	    def forward(
	        self,
	        input_ids: torch.LongTensor,
	        kv_cache: KVCache | None = None,
	        key_padding_mask: torch.BoolTensor | None = None,
	    ) -> torch.FloatTensor:
	        """Embed ``input_ids`` and run the result through every block in order.

	        The same ``kv_cache`` and ``key_padding_mask`` are handed to each block.
	        Returns the output of the last block.
	        """
	        x = self.embedding(input_ids)
	        for block in self.parallel_blocks:
	            x = block(
	                x,
	                kv_cache=kv_cache,
	                key_padding_mask=key_padding_mask,
	            )
	        return x


	class Phi2ModelForCausalLM(Phi2PreTrainedModel):
	    """``Phi2Model`` with a LayerNorm + Linear language-modeling head."""

	    def __init__(self, config: Phi2Config) -> None:
	        """Create the backbone, the LM head and the cross-entropy loss."""
	        super().__init__(config)
	        self.model = Phi2Model(config)
	        self.lm_head_layer_norm = nn.LayerNorm(config.d_embedding, eps=config.layer_norm_epsilon)
	        self.lm_head_linear = nn.Linear(config.d_embedding, config.vocab_size)
	        self.loss_fn = nn.CrossEntropyLoss()
	        self.post_init()  # calls self._init_weights() for all modules

	    def forward(
	        self,
	        input_ids: torch.LongTensor,
	        kv_cache: KVCache | None = None,
	        key_padding_mask: torch.BoolTensor | None = None,
	        labels: torch.LongTensor | None = None,
	        **kwargs,  # has to be here
	    ) -> CausalLMOutputWithPast:
	        """Compute logits and, when ``labels`` is given, the loss.

	        Args:
	            input_ids: Token ids fed to the backbone.
	            kv_cache: Optional cache, forwarded to the backbone and returned
	                as ``past_key_values`` in the output.
	            key_padding_mask: Forwarded to the backbone.
	            labels: Optional targets; when ``None`` the returned loss is ``None``.
	            **kwargs: Accepted and ignored.

	        Returns:
	            ``CausalLMOutputWithPast`` holding ``loss``, float32 ``logits`` and
	            ``past_key_values``.
	        """
	        x = self.model(input_ids, kv_cache=kv_cache, key_padding_mask=key_padding_mask)
	        x = self.lm_head_layer_norm(x)
	        logits = self.lm_head_linear(x).to(torch.float32)

	        # NOTE(review): logits and labels are flattened and compared position by
	        # position, with no shift applied here. Presumably callers pass labels
	        # that are already aligned to next-token targets - confirm against the
	        # training code.
	        loss = (
	            self.loss_fn(logits.view(-1, logits.size(-1)), labels.view(-1))
	            if labels is not None
	            else None
	        )
	        return CausalLMOutputWithPast(loss=loss, logits=logits, past_key_values=kv_cache)