Example transformer models (decoder-only LLMs)

Here we provide a list of popular decoder-only LLMs composed via the transformer building blocks from this library. The main purpose is to demonstrate how to construct a new PyTorch LLM model from scratch using the AI Edge Torch Generative API, and convert it to TFLite format for on-device inference.

Gemma

Gemma is Google's open-source LLM. The model comes in both 2B and 7B versions. See the model's HuggingFace page. The example we provide is Gemma 2B, and the checkpoint for the model can be found here.

TinyLlama

TinyLlama is a popular open-source, smaller version of Meta's Llama 2 model, with only 1.1B parameters. See the HuggingFace checkpoint.

Microsoft Phi-2

Microsoft Phi-2 is also a decoder-only LLM, with 2.7B parameters; see details on HuggingFace.

Overall workflow

To support a new LLM with the Edge Generative API, we go through the following process: model (re)authoring, checkpoint mapping/loading, model quantization (via PT2E), model conversion to the flatbuffer schema, model quality evaluation, benchmarking, and authoring the on-device inference pipeline.

Model (re)authoring

Model (re)authoring covers the following steps:

  1. Understanding the overall model architecture (encoder-decoder, decoder-only, etc.).
  2. Composing the model from the transformer building blocks provided by ai_edge_torch. For each example model, there is a model definition file (e.g. tiny_llama/tiny_llama.py) where an nn.Module is defined with its layers and a forward function. There is also a get_model_config function that returns a ModelConfig instance with hyper-parameters such as embedding size and layer count. Finally, there is a define_and_run function that builds the model instance and runs the forward pass with a few sample inputs.

Here we use TinyLlama as an example to walk you through the authoring steps.

Define the model's structure

class TinyLLamma(nn.Module):

  def __init__(self, config: cfg.ModelConfig):
    super().__init__()
    self.config = config
    # Construct model layers.
    self.lm_head = nn.Linear(
        config.embedding_dim, config.vocab_size, bias=config.lm_head_use_bias
    )
    self.tok_embedding = nn.Embedding(
        config.vocab_size, config.embedding_dim, padding_idx=0
    )
    self.transformer_blocks = nn.ModuleList(
        TransformerBlock(config) for _ in range(config.num_layers)
    )
    self.final_norm = builder.build_norm(
        config.embedding_dim,
        config.final_norm_config,
    )
    self.rope_cache = attn_utils.build_rope_cache(
        size=config.kv_cache_max,
        dim=int(config.attn_config.rotary_percentage * config.head_dim),
        base=10_000,
        condense_ratio=1,
        dtype=torch.float32,
        device=torch.device("cpu"),
    )
    self.mask_cache = attn_utils.build_causal_mask_cache(
        size=config.kv_cache_max, dtype=torch.float32, device=torch.device("cpu")
    )
    self.config = config

Define the model's forward function

# The model's forward function takes in additional k/v cache tensors
# and returns the updated k/v cache tensors to the caller.
# This can be eliminated if we handle k/v cache updates inside the model itself.
@torch.inference_mode
def forward(self, idx: torch.Tensor, input_pos: torch.Tensor) -> torch.Tensor:
  B, T = idx.size()
  assert (
      self.config.max_seq_len >= T
  ), f"Cannot forward sequence of length {T}, max seq length is only {self.config.max_seq_len}"
  cos, sin = self.rope_cache
  cos = cos.index_select(0, input_pos)
  sin = sin.index_select(0, input_pos)
  mask = self.mask_cache.index_select(2, input_pos)
  mask = mask[:, :, :, : self.config.kv_cache_max]
  # Forward the model itself.
  x = self.tok_embedding(idx)  # token embeddings of shape (b, t, n_embd)
  for i, block in enumerate(self.transformer_blocks):
    x = block(x, (cos, sin), mask, input_pos)
  x = self.final_norm(x)
  res = self.lm_head(x)  # (b, t, vocab_size)
  return res
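
As a quick sanity check (the define_and_run step mentioned earlier), you can build the module and run a forward pass on a few random token ids. The sketch below is illustrative: it assumes the get_model_config() helper described above and uses arbitrary sample inputs.

import torch

# Minimal sketch of the define_and_run step: build the model from its config
# and run a forward pass on random token ids. get_model_config() is the helper
# described earlier; the sample inputs are arbitrary.
def define_and_run() -> None:
  config = get_model_config()
  model = TinyLLamma(config)
  tokens = torch.randint(0, config.vocab_size, (1, 10), dtype=torch.long)
  input_pos = torch.arange(0, 10)
  logits = model(tokens, input_pos)  # shape: (1, 10, vocab_size)
  print(logits.shape)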

Now you have the TinyLlama nn.Module defined; the next step is to restore the weights from the original checkpoint into the new model.

Checkpoint mapping/loading

After the model is defined, we need to load the original trained weights into the new model. This is necessary because the state_dict of the new model differs from the original model's state_dict. There are helper functions in place to simplify the state_dict mapping process (utilities/loader.py). The user needs to provide a layer name template (TensorNames) for the source model. This template is then used to create an updated state_dict that works with the mapped model. The tensor map includes the following fields:

@dataclass
class TensorNames:
  attn_query_proj: str
  attn_key_proj: str
  attn_value_proj: str
  attn_output_proj: str
  ff_up_proj: str
  ff_down_proj: str
  ff_gate_proj: str = None
  pre_attn_norm: str = None
  pre_ff_norm: str = None
  embedding: str = None
  final_norm: str = None
  lm_head: str = None

The fields that have a default value of None are optional and should only be populated if they are relevant to the model architecture. For TinyLlama, we will define the following TENSOR_NAMES:

TENSOR_NAMES = loading_utils.ModelLoader.TensorNames(
    ff_up_proj="model.layers.{}.mlp.up_proj",
    ff_down_proj="model.layers.{}.mlp.down_proj",
    ff_gate_proj="model.layers.{}.mlp.gate_proj",
    attn_query_proj="model.layers.{}.self_attn.q_proj",
    attn_key_proj="model.layers.{}.self_attn.k_proj",
    attn_value_proj="model.layers.{}.self_attn.v_proj",
    attn_output_proj="model.layers.{}.self_attn.o_proj",
    pre_attn_norm="model.layers.{}.input_layernorm",
    pre_ff_norm="model.layers.{}.post_attention_layernorm",
    embedding="model.embed_tokens",
    final_norm="model.norm",
    lm_head="lm_head",
)

With the TensorNames defined, a user can simply use the loading utils to load an instance of the mapped model. For instance:

model = MappedModel(config)
loader = loading_utils.ModelLoader("path_to_checkpoint", TENSOR_NAMES)
loader.load(model)

Currently, ModelLoader supports PyTorch state dictionary and SafeTensors checkpoints. We recommend testing the mapped model against your reference implementation using a few input samples before proceeding to the conversion step.
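
For example, a rough way to spot-check the mapped TinyLlama model is to compare its logits against the original HuggingFace implementation on the same inputs. The sketch below is illustrative only: the checkpoint path, sequence length, and tolerance are placeholders, and it assumes the transformers package is available.

import torch
import transformers

# Illustrative spot check: compare the mapped model's logits against the
# original HuggingFace implementation on the same token ids.
# "path_to_checkpoint" and the tolerance below are placeholders.
hf_model = transformers.AutoModelForCausalLM.from_pretrained("path_to_checkpoint")
hf_model.eval()

tokens = torch.randint(0, hf_model.config.vocab_size, (1, 16), dtype=torch.long)
input_pos = torch.arange(0, 16)

with torch.no_grad():
  reference_logits = hf_model(tokens).logits
mapped_logits = model(tokens, input_pos)  # `model` is the mapped model loaded above

assert torch.allclose(reference_logits, mapped_logits, atol=1e-2), (
    "Mapped model output diverges from the reference implementation."
)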

Model conversion

In this step, we use ai_edge_torch's standard multi-signature conversion API to convert the PyTorch nn.Module into a single TFLite flatbuffer for on-device execution. For example, tiny_llama/convert_to_tflite.py uses the following Python code to convert the TinyLlama model to a multi-signature TFLite model:

def convert_tiny_llama_to_tflite(
    checkpoint_path: str,
    prefill_seq_len: int = 512,
    kv_cache_max_len: int = 1024,
    quantize: bool = True,
):
  """An example method for converting TinyLlama model to multi-signature
  tflite model.

  Args:
    checkpoint_path (str): The filepath to the model checkpoint, or directory
      holding the checkpoint.
    prefill_seq_len (int, optional): The maximum size of prefill input tensor.
      Defaults to 512.
    kv_cache_max_len (int, optional): The maximum size of KV cache buffer,
      including both prefill and decode. Defaults to 1024.
    quantize (bool, optional): Whether the model should be quantized.
      Defaults to True.
  """
  pytorch_model = tiny_llama.build_model(
      checkpoint_path, kv_cache_max_len=kv_cache_max_len
  )
  # Tensors used to trace the model graph during conversion.
  prefill_tokens = torch.full((1, prefill_seq_len), 0, dtype=torch.long)
  prefill_input_pos = torch.arange(0, prefill_seq_len)
  decode_token = torch.tensor([[0]], dtype=torch.long)
  decode_input_pos = torch.tensor([0], dtype=torch.int64)
  quant_config = quant_recipes.full_linear_int8_dynamic_recipe() if quantize else None
  edge_model = (
      ai_edge_torch.signature(
          'prefill', pytorch_model, (prefill_tokens, prefill_input_pos)
      )
      .signature('decode', pytorch_model, (decode_token, decode_input_pos))
      .convert(quant_config=quant_config)
  )
  edge_model.export(f'/tmp/tiny_llama_seq{prefill_seq_len}_kv{kv_cache_max_len}.tflite')

Once converted, you will get a .tflite model ready for on-device execution. Note that the generated .tflite model uses static shapes. It defines two signatures (two entry points into the model):

  1. prefill: takes two tensor inputs, prefill_tokens and prefill_input_pos, with shapes (BATCH_SIZE, PREFILL_SEQ_LEN) and (PREFILL_SEQ_LEN).
  2. decode: takes two tensor inputs, decode_token and decode_input_pos, with shapes (1, 1) and (1).

To learn more about TFLite signatures, please refer to this article.
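
To try the converted model from Python, you can invoke each signature through the TensorFlow Lite interpreter's signature runners. The sketch below is illustrative: the input tensor names and dtypes depend on the converted model, so inspect get_signature_list() and get_input_details() for the actual values.

import numpy as np
import tensorflow as tf

# Illustrative sketch: run the prefill and decode signatures with the TFLite
# interpreter. The keyword argument names below are assumptions; check
# interpreter.get_signature_list() for the actual input names in your model.
interpreter = tf.lite.Interpreter(
    model_path="/tmp/tiny_llama_seq512_kv1024.tflite"
)
print(interpreter.get_signature_list())

prefill = interpreter.get_signature_runner("prefill")
decode = interpreter.get_signature_runner("decode")

prefill_outputs = prefill(
    prefill_tokens=np.zeros((1, 512), dtype=np.int64),
    prefill_input_pos=np.arange(512, dtype=np.int64),
)
decode_outputs = decode(
    decode_token=np.array([[0]], dtype=np.int64),
    decode_input_pos=np.array([0], dtype=np.int64),
)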

Model quantization

To apply quantization, we need to create a quantization configuration that fully expresses how the model should be quantized. This configuration is then passed to the conversion API, which generates a quantized model.

quantize/quant_recipes.py contains a list of recipes that are known to be well-supported at runtime. For most users, these recipes are a good starting point for selecting the quantization scheme best suited to their deployment needs. After identifying the target recipe, the model can be quantized as follows. This example is extracted from generative/examples/quantize/example.py.

quant_config = quant_recipes.full_int8_dynamic_recipe()
edge_model = ai_edge_torch.convert(
    model, (tokens, input_pos), quant_config=quant_config
)

Once converted, you will get a quantized .tflite model ready for on-device execution.