feat: refactor modeling logic and support more models (#274)

* feat: refactor llama models and add new models

* add geglu kernels

* add geglu activation layer

* formatting
AlpinDale 1 year ago
parent
commit
e31c6f0b45
38 files changed, 5141 additions and 363 deletions
   1. .pylintrc (+1 -0)
   2. aphrodite/modeling/layers/activation.py (+21 -0)
   3. aphrodite/modeling/layers/layernorm.py (+25 -0)
   4. aphrodite/modeling/models/__init__.py (+36 -10)
   5. aphrodite/modeling/models/baichuan.py (+412 -0)
   6. aphrodite/modeling/models/bloom.py (+340 -0)
   7. aphrodite/modeling/models/chatglm.py (+378 -0)
   8. aphrodite/modeling/models/decilm.py (+6 -2)
   9. aphrodite/modeling/models/deepseek.py (+23 -72)
  10. aphrodite/modeling/models/falcon.py (+446 -0)
  11. aphrodite/modeling/models/gemma.py (+65 -62)
  12. aphrodite/modeling/models/gpt2.py (+286 -0)
  13. aphrodite/modeling/models/gpt_bigcode.py (+292 -0)
  14. aphrodite/modeling/models/gpt_j.py (+8 -13)
  15. aphrodite/modeling/models/gpt_neox.py (+6 -10)
  16. aphrodite/modeling/models/internlm2.py (+352 -0)
  17. aphrodite/modeling/models/llama.py (+45 -14)
  18. aphrodite/modeling/models/mixtral.py (+62 -18)
  19. aphrodite/modeling/models/mixtral_quant.py (+7 -6)
  20. aphrodite/modeling/models/mpt.py (+307 -0)
  21. aphrodite/modeling/models/olmo.py (+377 -0)
  22. aphrodite/modeling/models/phi.py (+0 -1)
  23. aphrodite/modeling/models/qwen.py (+315 -0)
  24. aphrodite/modeling/models/qwen2.py (+35 -52)
  25. aphrodite/modeling/models/stablelm.py (+352 -0)
  26. aphrodite/transformers_utils/config.py (+37 -24)
  27. aphrodite/transformers_utils/configs/__init__.py (+13 -2)
  28. aphrodite/transformers_utils/configs/baichuan.py (+62 -0)
  29. aphrodite/transformers_utils/configs/chatglm.py (+68 -0)
  30. aphrodite/transformers_utils/configs/falcon.py (+88 -0)
  31. aphrodite/transformers_utils/configs/mpt.py (+233 -0)
  32. aphrodite/transformers_utils/configs/olmo.py (+72 -0)
  33. aphrodite/transformers_utils/tokenizer.py (+49 -52)
  34. aphrodite/transformers_utils/tokenizers/__init__.py (+5 -0)
  35. aphrodite/transformers_utils/tokenizers/baichuan.py (+261 -0)
  36. kernels/activation_kernels.cu (+48 -25)
  37. kernels/ops.h (+4 -0)
  38. kernels/pybind.cpp (+4 -0)

+ 1 - 0
.pylintrc

@@ -61,6 +61,7 @@ disable=abstract-method,
         c-extension-no-member,
         consider-using-enumerate,
         cmp-builtin,
+        inconsistent-quotes,
         cmp-method,
         coerce-builtin,
         coerce-method,

+ 21 - 0
aphrodite/modeling/layers/activation.py

@@ -37,6 +37,27 @@ class SiluAndMul(nn.Module):
         return out
 
 
+class GeluAndMul(nn.Module):
+    """An activation function for GeGLU.
+    The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2.
+    Shapes:
+        x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d)
+        return: (batch_size, seq_len, d) or (num_tokens, d)
+    """
+
+    def _forward(self, x: torch.Tensor) -> torch.Tensor:
+        """PyTorch-native implementation equivalent to forward()."""
+        d = x.shape[-1] // 2
+        return F.gelu(x[..., :d]) * x[..., d:]
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        d = x.shape[-1] // 2
+        output_shape = (x.shape[:-1] + (d, ))
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        ops.gelu_and_mul(out, x)
+        return out
+
+
 class NewGELU(nn.Module):
 
     def _forward(self, x: torch.Tensor) -> torch.Tensor:

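The GeluAndMul docstring above describes the GeGLU split: the last dimension is cut in half, GELU is applied to the first half, and the result is multiplied elementwise by the second half. A minimal sanity-check sketch (illustrative, not part of the commit; it assumes a CUDA build of the aphrodite._C extension):

# Illustrative only: compare the fused GeGLU layer against a plain PyTorch reference.
import torch
import torch.nn.functional as F
from aphrodite.modeling.layers.activation import GeluAndMul

x = torch.randn(4, 16, 2 * 128, dtype=torch.float16, device="cuda")
d = x.shape[-1] // 2
reference = F.gelu(x[..., :d]) * x[..., d:]   # PyTorch-native GeGLU
fused = GeluAndMul()(x)                       # calls the new ops.gelu_and_mul kernel
torch.testing.assert_close(fused, reference, rtol=1e-2, atol=1e-2)
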
+ 25 - 0
aphrodite/modeling/layers/layernorm.py

@@ -7,6 +7,31 @@ import torch.nn as nn
 from aphrodite._C import ops
 
 
+class LayerNorm(nn.LayerNorm):
+
+    def __init__(
+        self,
+        hidden_size: int,
+        eps: float = 1e-6,
+    ) -> None:
+        super().__init__(hidden_size, eps=eps)
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        """normalization."""
+        if residual is not None:
+            x = x + residual
+            residual = x
+        x = super().forward(x)
+        if residual is None:
+            return x
+        else:
+            return x, residual
+
+
 class RMSNorm(nn.Module):
     """Root mean square normalization.
 

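The LayerNorm wrapper added above mirrors RMSNorm's fused-residual calling convention: when a residual is passed, it is added to the input first and handed back alongside the normalized output, so decoder layers can thread the residual stream through without extra bookkeeping. A sketch of the calling pattern (illustrative sizes, CPU tensors):

# Illustrative only: the (hidden, residual) convention used by the decoder layers.
import torch
from aphrodite.modeling.layers.layernorm import LayerNorm

hidden_size = 1024                      # hypothetical size
norm = LayerNorm(hidden_size, eps=1e-5)
hidden = torch.randn(8, hidden_size)
residual = None

if residual is None:                    # first layer: no residual yet
    residual = hidden
    hidden = norm(hidden)               # returns a single tensor
else:
    hidden, residual = norm(hidden, residual)

# ... attention / MLP output would replace `hidden` here ...
hidden, residual = norm(hidden, residual)   # residual added, then normalized
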
+ 36 - 10
aphrodite/modeling/models/__init__.py

@@ -8,30 +8,53 @@ from aphrodite.common.utils import is_hip
 
 logger = init_logger(__name__)
 
-# Architecture -> (module, class)
+# Architecture -> (module, class).
 _MODELS = {
+    "AquilaModel": ("llama", "LlamaForCausalLM"),
+    "AquilaForCausalLM": ("llama", "LlamaForCausalLM"),  # AquilaChat2
+    "BaiChuanForCausalLM": ("baichuan", "BaiChuanForCausalLM"),  # baichuan-7b
+    "BaichuanForCausalLM": ("baichuan", "BaichuanForCausalLM"),  # baichuan-13b
+    "BloomForCausalLM": ("bloom", "BloomForCausalLM"),
+    "ChatGLMModel": ("chatglm", "ChatGLMForCausalLM"),
+    "ChatGLMForConditionalGeneration": ("chatglm", "ChatGLMForCausalLM"),
     "DeciLMForCausalLM": ("decilm", "DeciLMForCausalLM"),
     "DeepseekForCausalLM": ("deepseek", "DeepseekForCausalLM"),
+    "FalconForCausalLM": ("falcon", "FalconForCausalLM"),
+    "GemmaForCausalLM": ("gemma", "GemmaForCausalLM"),
+    "GPT2LMHeadModel": ("gpt2", "GPT2LMHeadModel"),
+    "GPTBigCodeForCausalLM": ("gpt_bigcode", "GPTBigCodeForCausalLM"),
     "GPTJForCausalLM": ("gpt_j", "GPTJForCausalLM"),
     "GPTNeoXForCausalLM": ("gpt_neox", "GPTNeoXForCausalLM"),
+    "InternLMForCausalLM": ("llama", "LlamaForCausalLM"),
+    "InternLM2ForCausalLM": ("internlm2", "InternLM2ForCausalLM"),
     "LlamaForCausalLM": ("llama", "LlamaForCausalLM"),
+    # For decapoda-research/llama-*
     "LLaMAForCausalLM": ("llama", "LlamaForCausalLM"),
-    "MistralForCausalLM": ("mistral", "MistralForCausalLM"),
+    "MistralForCausalLM": ("llama", "LlamaForCausalLM"),
     "MixtralForCausalLM": ("mixtral", "MixtralForCausalLM"),
     "QuantMixtralForCausalLM": ("mixtral_quant", "MixtralForCausalLM"),
-    "PhiForCausalLM": ("phi", "PhiForCausalLM"),
-    "YiForCausalLM": ("yi", "YiForCausalLM"),
+    # transformers's mpt class has lower case
+    "MptForCausalLM": ("mpt", "MPTForCausalLM"),
+    "MPTForCausalLM": ("mpt", "MPTForCausalLM"),
+    "OLMoForCausalLM": ("olmo", "OLMoForCausalLM"),
     "OPTForCausalLM": ("opt", "OPTForCausalLM"),
+    "PhiForCausalLM": ("phi", "PhiForCausalLM"),
+    "QWenLMHeadModel": ("qwen", "QWenLMHeadModel"),
+    "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"),
+    "RWForCausalLM": ("falcon", "FalconForCausalLM"),
+    "StableLMEpochForCausalLM": ("stablelm", "StablelmForCausalLM"),
 }
 
-# Models not supported by ROCm
+# Models not supported by ROCm.
 _ROCM_UNSUPPORTED_MODELS = []
 
 # Models partially supported by ROCm.
-# Architecture -> Reason
+# Architecture -> Reason.
 _ROCM_PARTIALLY_SUPPORTED_MODELS = {
+    "Qwen2ForCausalLM":
+    "Sliding window attention is not yet supported in ROCm's flash attention",
     "MistralForCausalLM":
-    "Sliding window attention is not yet supported in ROCM's flash attention.",
+    "Sliding window attention is not yet supported in ROCm's flash attention",
     "MixtralForCausalLM":
     "Sliding window attention is not yet supported in ROCm's flash attention",
 }
@@ -45,8 +68,9 @@ class ModelRegistry:
             return None
         if is_hip():
             if model_arch in _ROCM_UNSUPPORTED_MODELS:
-                raise ValueError(f"Model architecture {model_arch} is not "
-                                 "supported in ROCm for now.")
+                raise ValueError(
+                    f"Model architecture {model_arch} is not supported by "
+                    "ROCm for now.")
             if model_arch in _ROCM_PARTIALLY_SUPPORTED_MODELS:
                 logger.warning(
                     f"Model architecture {model_arch} is partially supported "
@@ -62,4 +86,6 @@ class ModelRegistry:
         return list(_MODELS.keys())
 
 
-__all__ = ["ModelRegistry"]
+__all__ = [
+    "ModelRegistry",
+]

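The expanded _MODELS table maps the `architectures` entry of a HuggingFace config to a (module, class) pair under aphrodite/modeling/models. A hedged sketch of how a checkpoint resolves through it (only part of ModelRegistry is visible in this hunk, so the resolver name load_model_cls is assumed from context):

# Illustrative only: resolving a HF architecture name through the registry.
from transformers import AutoConfig
from aphrodite.modeling.models import ModelRegistry

config = AutoConfig.from_pretrained("mistralai/Mistral-7B-v0.1")
arch = config.architectures[0]                  # "MistralForCausalLM"
model_cls = ModelRegistry.load_model_cls(arch)  # now routed to llama.LlamaForCausalLM
print(arch, "->", model_cls.__name__)
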
+ 412 - 0
aphrodite/modeling/models/baichuan.py

@@ -0,0 +1,412 @@
+# coding=utf-8
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only BaiChuan model compatible with HuggingFace weights."""
+import math
+from typing import List, Optional, Tuple
+
+import torch
+from torch import nn
+
+from aphrodite.modeling.metadata import InputMetadata
+from aphrodite.modeling.layers.activation import SiluAndMul
+from aphrodite.modeling.layers.attention import PagedAttention
+from aphrodite.modeling.layers.layernorm import RMSNorm
+from aphrodite.modeling.layers.linear import (LinearMethodBase,
+                                              MergedColumnParallelLinear,
+                                              QKVParallelLinear,
+                                              RowParallelLinear,
+                                              ColumnParallelLinear)
+from aphrodite.modeling.layers.rotary_embedding import get_rope
+from aphrodite.modeling.layers.sampler import Sampler
+from aphrodite.modeling.layers.vocab_parallel_embedding import (
+    VocabParallelEmbedding, ParallelLMHead)
+from aphrodite.modeling.megatron.parallel_state import (
+    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
+from aphrodite.modeling.sampling_metadata import SamplingMetadata
+from aphrodite.modeling.hf_downloader import (default_weight_loader,
+                                              hf_model_weights_iterator)
+from aphrodite.common.sequence import SamplerOutput
+from aphrodite.transformers_utils.configs.baichuan import BaiChuanConfig
+
+KVCache = Tuple[torch.Tensor, torch.Tensor]
+
+
+def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor:
+    closest_power_of_2 = 2**math.floor(math.log2(total_num_heads))
+    base = torch.tensor(
+        2**(-(2**-(math.log2(closest_power_of_2) - 3))),
+        dtype=torch.float32,
+    )
+    powers = torch.arange(1, 1 + closest_power_of_2, dtype=torch.int32)
+    slopes = torch.pow(base, powers)
+
+    if closest_power_of_2 != total_num_heads:
+        extra_base = torch.tensor(
+            2**(-(2**-(math.log2(2 * closest_power_of_2) - 3))),
+            dtype=torch.float32,
+        )
+        num_remaining_heads = min(closest_power_of_2,
+                                  total_num_heads - closest_power_of_2)
+        extra_powers = torch.arange(start=1,
+                                    end=1 + 2 * num_remaining_heads,
+                                    step=2,
+                                    dtype=torch.int32)
+        slopes = torch.cat(
+            [slopes, torch.pow(extra_base, extra_powers)], dim=0)
+    return slopes
+
+
+class BaiChuanMLP(nn.Module):
+
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        linear_method: Optional[LinearMethodBase] = None,
+    ):
+        super().__init__()
+        if linear_method is not None and not linear_method.quant_config.merge_weight(
+        ):
+            self.merge_weight = False
+            self.gate_proj = ColumnParallelLinear(hidden_size,
+                                                  intermediate_size,
+                                                  bias=False,
+                                                  linear_method=linear_method)
+            self.up_proj = ColumnParallelLinear(hidden_size,
+                                                intermediate_size,
+                                                bias=False,
+                                                linear_method=linear_method)
+        else:
+            self.merge_weight = True
+            self.gate_up_proj = MergedColumnParallelLinear(
+                hidden_size, [intermediate_size] * 2,
+                bias=False,
+                linear_method=linear_method)
+        self.down_proj = RowParallelLinear(intermediate_size,
+                                           hidden_size,
+                                           bias=False,
+                                           linear_method=linear_method)
+        if hidden_act != "silu":
+            raise ValueError(f"Unsupported activation: {hidden_act}. "
+                             "Only silu is supported for now.")
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x):
+        if self.merge_weight:
+            gate_up, _ = self.gate_up_proj(x)
+        else:
+            up, _ = self.up_proj(x)
+            gate, _ = self.gate_proj(x)
+            gate_up = torch.cat([gate, up], dim=-1)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class BaiChuanAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        position_embedding: str,
+        rope_theta: float = 10000,
+        max_position_embeddings: int = 8192,
+        linear_method: Optional[LinearMethodBase] = None,
+    ):
+        super().__init__()
+        self.hidden_size = hidden_size
+        tensor_model_parallel_world_size = get_tensor_model_parallel_world_size(
+        )
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tensor_model_parallel_world_size == 0
+        self.num_heads = (self.total_num_heads //
+                          tensor_model_parallel_world_size)
+        self.head_dim = hidden_size // self.total_num_heads
+        self.postion_embedding = position_embedding
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+
+        # pylint: disable=invalid-name
+        self.W_pack = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_heads,
+            bias=False,
+            linear_method=linear_method,
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            linear_method=linear_method,
+        )
+        # Create the alibi slopes and slice them.
+        if self.postion_embedding == "ALIBI":
+            tp_rank = get_tensor_model_parallel_rank()
+            head_start = tp_rank * self.num_heads
+            head_end = (tp_rank + 1) * self.num_heads
+            alibi_slopes = _get_alibi_slopes(self.total_num_heads)
+            alibi_slopes = alibi_slopes[head_start:head_end].tolist()
+
+            scaling = self.head_dim**-0.5
+            self.attn = PagedAttention(self.num_heads,
+                                       self.head_dim,
+                                       scaling,
+                                       alibi_slopes=alibi_slopes)
+        else:
+            is_neox_style = True if linear_method is None or linear_method.quant_config.rope_style(
+            ) is None else linear_method.quant_config.rope_style()
+            self.rotary_emb = get_rope(
+                self.head_dim,
+                rotary_dim=self.head_dim,
+                max_position=self.max_position_embeddings,
+                base=self.rope_theta,
+                is_neox_style=is_neox_style,
+            )
+            self.scaling = self.head_dim**-0.5
+            self.attn = PagedAttention(self.num_heads, self.head_dim,
+                                       self.scaling)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        kv_cache: KVCache,
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        qkv, _ = self.W_pack(hidden_states)
+        q, k, v = qkv.chunk(chunks=3, dim=-1)
+        if self.postion_embedding != "ALIBI":
+            q, k = self.rotary_emb(positions, q, k)
+        k_cache, v_cache = kv_cache
+        attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class BaiChuanDecoderLayer(nn.Module):
+
+    def __init__(self,
+                 config: BaiChuanConfig,
+                 position_embedding: str,
+                 linear_method: Optional[LinearMethodBase] = None):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        rope_theta = getattr(config, "rope_theta", 10000)
+        max_position_embeddings = getattr(config, "max_position_embeddings",
+                                          8192)
+        self.self_attn = BaiChuanAttention(
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            position_embedding=position_embedding,
+            rope_theta=rope_theta,
+            max_position_embeddings=max_position_embeddings,
+            linear_method=linear_method,
+        )
+        self.mlp = BaiChuanMLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+            linear_method=linear_method,
+        )
+        self.input_layernorm = RMSNorm(config.hidden_size,
+                                       eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(config.hidden_size,
+                                                eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        kv_cache: KVCache,
+        input_metadata: InputMetadata,
+        residual: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(
+                hidden_states, residual)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            kv_cache=kv_cache,
+            input_metadata=input_metadata,
+        )
+
+        # Fully Connected
+        hidden_states, residual = self.post_attention_layernorm(
+            hidden_states, residual)
+        hidden_states = self.mlp(hidden_states)
+        return hidden_states, residual
+
+
+class BaiChuanModel(nn.Module):
+
+    def __init__(self,
+                 config: BaiChuanConfig,
+                 position_embedding: str,
+                 linear_method: Optional[LinearMethodBase] = None):
+        super().__init__()
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = VocabParallelEmbedding(config.vocab_size,
+                                                   config.hidden_size,
+                                                   linear_method=linear_method)
+        self.layers = nn.ModuleList([
+            BaiChuanDecoderLayer(config, position_embedding, linear_method)
+            for _ in range(config.num_hidden_layers)
+        ])
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[KVCache],
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        hidden_states = self.embed_tokens(input_ids)
+        residual = None
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                kv_caches[i],
+                input_metadata,
+                residual,
+            )
+        hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+
+class BaiChuanBaseForCausalLM(nn.Module):
+
+    def __init__(self,
+                 config,
+                 position_embedding: str,
+                 linear_method: Optional[LinearMethodBase] = None):
+        super().__init__()
+        self.config = config
+        self.linear_method = linear_method
+        self.model = BaiChuanModel(config, position_embedding, linear_method)
+        self.lm_head = ParallelLMHead(config.vocab_size,
+                                      config.hidden_size,
+                                      linear_method=linear_method)
+        self.sampler = Sampler(config.vocab_size)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[KVCache],
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, kv_caches,
+                                   input_metadata)
+        return hidden_states
+
+    def sample(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[SamplerOutput]:
+        next_tokens = self.sampler(self.lm_head(hidden_states),
+                                   sampling_metadata)
+        return next_tokens
+
+    def load_weights(self,
+                     model_name_or_path: str,
+                     cache_dir: Optional[str] = None,
+                     load_format: str = "auto",
+                     revision: Optional[str] = None):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+        if self.linear_method is not None and not self.linear_method.quant_config.merge_weight(
+        ):
+            stacked_params_mapping = []
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in hf_model_weights_iterator(
+                model_name_or_path, cache_dir, load_format, revision,
+                self.config):
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if name == "lm_head.weight":
+                # Unlike Baichuan, Baichuan2 normalizes the head weights. Refer to:
+                # https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat/blob/84603cde5ebffb6084e476cfaeceaf0b8b91fe54/modeling_baichuan.py#L508
+                # Distinguish between Baichuan and Baichuan2 by checking the
+                # vocab size.
+                is_baichuan2 = self.config.vocab_size == 125696
+                if is_baichuan2:
+                    loaded_weight = torch.nn.functional.normalize(
+                        loaded_weight)
+
+            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader",
+                                        default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+class BaichuanForCausalLM(BaiChuanBaseForCausalLM):
+    """Baichuan 13B and Baichuan2 7B/13B."""
+
+    def __init__(self,
+                 config,
+                 linear_method: Optional[LinearMethodBase] = None):
+        if config.hidden_size == 4096:  # baichuan2 7b
+            super().__init__(config, "ROPE", linear_method)
+        else:  # baichuan 13b, baichuan2 13b
+            super().__init__(config, "ALIBI", linear_method)
+
+
+class BaiChuanForCausalLM(BaiChuanBaseForCausalLM):
+    """Baichuan 7B."""
+
+    def __init__(self,
+                 config,
+                 linear_method: Optional[LinearMethodBase] = None):
+        super().__init__(config, "ROPE", linear_method)

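For the ALIBI position-embedding path above, _get_alibi_slopes builds the BLOOM-style geometric slopes for all heads, and each tensor-parallel rank slices out only the heads it owns before handing them to PagedAttention. A small illustration (the head count and TP size are made-up example values):

# Illustrative only: per-rank slicing of ALiBi slopes, as in BaiChuanAttention.__init__.
from aphrodite.modeling.models.baichuan import _get_alibi_slopes

total_num_heads = 40                          # e.g. a 13B-style config (example value)
tp_world_size = 4
num_heads = total_num_heads // tp_world_size  # 10 heads per rank

slopes = _get_alibi_slopes(total_num_heads)   # geometric sequence, length 40
for tp_rank in range(tp_world_size):
    head_start = tp_rank * num_heads
    head_end = (tp_rank + 1) * num_heads
    rank_slopes = slopes[head_start:head_end].tolist()  # passed to PagedAttention on this rank
    print(tp_rank, [round(s, 4) for s in rank_slopes[:3]], "...")
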
+ 340 - 0
aphrodite/modeling/models/bloom.py

@@ -0,0 +1,340 @@
+# coding=utf-8
+# Adapted from
+# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/bloom/modeling_bloom.py
+# Copyright 2023 The PygmalionAI team.
+# Copyright 2023 The CacheFlow team.
+# Copyright 2022 HuggingFace Inc. team and BigScience workshop.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only BLOOM model compatible with HuggingFace weights."""
+import math
+from typing import List, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import BloomConfig
+
+from aphrodite.modeling.metadata import InputMetadata
+from aphrodite.modeling.layers.activation import get_act_fn
+from aphrodite.modeling.layers.attention import PagedAttention
+from aphrodite.modeling.layers.linear import (ColumnParallelLinear,
+                                              LinearMethodBase,
+                                              QKVParallelLinear,
+                                              RowParallelLinear)
+from aphrodite.modeling.layers.sampler import Sampler
+from aphrodite.modeling.layers.vocab_parallel_embedding import (
+    VocabParallelEmbedding, ParallelLMHead)
+from aphrodite.modeling.megatron.parallel_state import (
+    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
+from aphrodite.modeling.sampling_metadata import SamplingMetadata
+from aphrodite.modeling.hf_downloader import (default_weight_loader,
+                                              hf_model_weights_iterator)
+from aphrodite.common.sequence import SamplerOutput
+
+KVCache = Tuple[torch.Tensor, torch.Tensor]
+
+
+def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor:
+    closest_power_of_2 = 2**math.floor(math.log2(total_num_heads))
+    base = torch.tensor(
+        2**(-(2**-(math.log2(closest_power_of_2) - 3))),
+        dtype=torch.float32,
+    )
+    powers = torch.arange(1, 1 + closest_power_of_2, dtype=torch.int32)
+    slopes = torch.pow(base, powers)
+
+    if closest_power_of_2 != total_num_heads:
+        extra_base = torch.tensor(
+            2**(-(2**-(math.log2(2 * closest_power_of_2) - 3))),
+            dtype=torch.float32,
+        )
+        num_remaining_heads = min(closest_power_of_2,
+                                  total_num_heads - closest_power_of_2)
+        extra_powers = torch.arange(start=1,
+                                    end=1 + 2 * num_remaining_heads,
+                                    step=2,
+                                    dtype=torch.int32)
+        slopes = torch.cat(
+            [slopes, torch.pow(extra_base, extra_powers)], dim=0)
+    return slopes
+
+
+class BloomAttention(nn.Module):
+
+    def __init__(
+        self,
+        config: BloomConfig,
+        linear_method: Optional[LinearMethodBase] = None,
+    ):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.total_num_heads = config.n_head
+        self.head_dim = self.hidden_size // self.total_num_heads
+        assert self.head_dim * self.total_num_heads == self.hidden_size
+
+        tp_world_size = get_tensor_model_parallel_world_size()
+        assert self.total_num_heads % tp_world_size == 0
+        self.num_heads = self.total_num_heads // tp_world_size
+
+        self.query_key_value = QKVParallelLinear(
+            self.hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            bias=True,
+            linear_method=linear_method,
+        )
+        self.dense = RowParallelLinear(
+            self.hidden_size,
+            self.hidden_size,
+            bias=True,
+            linear_method=linear_method,
+        )
+
+        # Create the alibi slopes and slice them.
+        tp_rank = get_tensor_model_parallel_rank()
+        head_start = tp_rank * self.num_heads
+        head_end = (tp_rank + 1) * self.num_heads
+        alibi_slopes = _get_alibi_slopes(self.total_num_heads)
+        alibi_slopes = alibi_slopes[head_start:head_end].tolist()
+
+        scaling = self.head_dim**-0.5
+        self.attn = PagedAttention(self.num_heads,
+                                   self.head_dim,
+                                   scaling,
+                                   alibi_slopes=alibi_slopes)
+
+    def forward(
+        self,
+        position_ids: torch.Tensor,
+        hidden_states: torch.Tensor,
+        kv_cache: KVCache,
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        del position_ids  # Unused.
+        qkv, _ = self.query_key_value(hidden_states)
+        q, k, v = qkv.chunk(chunks=3, dim=-1)
+        k_cache, v_cache = kv_cache
+        attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata)
+        output, _ = self.dense(attn_output)
+        return output
+
+
+class BloomMLP(nn.Module):
+
+    def __init__(
+        self,
+        config: BloomConfig,
+        linear_method: Optional[LinearMethodBase] = None,
+    ):
+        super().__init__()
+        hidden_size = config.hidden_size
+        self.dense_h_to_4h = ColumnParallelLinear(
+            hidden_size,
+            4 * hidden_size,
+            linear_method=linear_method,
+        )
+        quant_config = getattr(linear_method, "quant_config", None)
+        self.gelu_impl = get_act_fn("gelu", quant_config, 4 * hidden_size)
+        self.dense_4h_to_h = RowParallelLinear(
+            4 * hidden_size,
+            hidden_size,
+            linear_method=linear_method,
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x, _ = self.dense_h_to_4h(x)
+        x = self.gelu_impl(x)
+        x, _ = self.dense_4h_to_h(x)
+        return x
+
+
+class BloomBlock(nn.Module):
+
+    def __init__(
+        self,
+        config: BloomConfig,
+        linear_method: Optional[LinearMethodBase] = None,
+    ):
+        super().__init__()
+        hidden_size = config.hidden_size
+
+        self.input_layernorm = nn.LayerNorm(hidden_size,
+                                            eps=config.layer_norm_epsilon)
+        self.self_attention = BloomAttention(config, linear_method)
+        self.post_attention_layernorm = nn.LayerNorm(
+            hidden_size, eps=config.layer_norm_epsilon)
+        self.mlp = BloomMLP(config, linear_method)
+        self.apply_residual_connection_post_layernorm = (
+            config.apply_residual_connection_post_layernorm)
+
+    def forward(
+        self,
+        position_ids: torch.Tensor,
+        hidden_states: torch.Tensor,
+        kv_cache: KVCache,
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        # Layer norm at the beginning of the transformer layer.
+        layernorm_output = self.input_layernorm(hidden_states)
+
+        # Layer norm post the self attention.
+        if self.apply_residual_connection_post_layernorm:
+            residual = layernorm_output
+        else:
+            residual = hidden_states
+
+        # Self attention.
+        attention_output = self.self_attention(
+            position_ids=position_ids,
+            hidden_states=layernorm_output,
+            kv_cache=kv_cache,
+            input_metadata=input_metadata,
+        )
+        attention_output = attention_output + residual
+        layernorm_output = self.post_attention_layernorm(attention_output)
+
+        # Get residual
+        if self.apply_residual_connection_post_layernorm:
+            residual = layernorm_output
+        else:
+            residual = attention_output
+
+        # MLP.
+        output = self.mlp(layernorm_output) + residual
+        return output
+
+
+class BloomModel(nn.Module):
+
+    def __init__(
+        self,
+        config: BloomConfig,
+        linear_method: Optional[LinearMethodBase] = None,
+    ):
+        super().__init__()
+        self.embed_dim = config.hidden_size
+
+        # Embedding + LN Embedding
+        self.word_embeddings = VocabParallelEmbedding(
+            config.vocab_size, self.embed_dim, linear_method=linear_method)
+        self.word_embeddings_layernorm = nn.LayerNorm(
+            self.embed_dim, eps=config.layer_norm_epsilon)
+
+        # Transformer blocks
+        self.h = nn.ModuleList([
+            BloomBlock(config, linear_method)
+            for _ in range(config.num_hidden_layers)
+        ])
+
+        # Final Layer Norm
+        self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        position_ids: torch.Tensor,
+        kv_caches: List[KVCache],
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        hidden_states = self.word_embeddings(input_ids)
+        hidden_states = self.word_embeddings_layernorm(hidden_states)
+        for i in range(len(self.h)):
+            layer = self.h[i]
+            hidden_states = layer(
+                position_ids,
+                hidden_states,
+                kv_caches[i],
+                input_metadata,
+            )
+        hidden_states = self.ln_f(hidden_states)
+        return hidden_states
+
+
+class BloomForCausalLM(nn.Module):
+
+    def __init__(
+        self,
+        config: BloomConfig,
+        linear_method: Optional[LinearMethodBase] = None,
+    ):
+        super().__init__()
+        self.config = config
+        self.linear_method = linear_method
+        self.transformer = BloomModel(config, linear_method)
+        # self.lm_head_weight = self.transformer.word_embeddings.weight
+        self.lm_head = ParallelLMHead(config.vocab_size,
+                                      config.hidden_size,
+                                      linear_method=linear_method)
+        self.sampler = Sampler(config.vocab_size)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[KVCache],
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        hidden_states = self.transformer(input_ids, positions, kv_caches,
+                                         input_metadata)
+        return hidden_states
+
+    def sample(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[SamplerOutput]:
+        next_tokens = self.sampler(self.lm_head(hidden_states),
+                                   sampling_metadata)
+        return next_tokens
+
+    def load_weights(self,
+                     model_name_or_path: str,
+                     cache_dir: Optional[str] = None,
+                     load_format: str = "auto",
+                     revision: Optional[str] = None):
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+        for name, loaded_weight in hf_model_weights_iterator(
+                model_name_or_path, cache_dir, load_format, revision,
+                self.config):
+            if name == "lm_head.weight":
+                continue
+            if not name.startswith("transformer."):
+                name = "transformer." + name
+            param = params_dict[name]
+
+            if "word_embeddings.weight" in name:
+                # Copy word embedding to lm_head
+                lm_head_param = params_dict["lm_head.weight"]
+                weight_loader = getattr(lm_head_param, "weight_loader",
+                                        default_weight_loader)
+                weight_loader(lm_head_param, loaded_weight)
+
+            if "query_key_value" in name:
+                # NOTE: BLOOM's fused QKV's output_dim has the shape of
+                # (num_heads * 3 * head_size), while the
+                # required shape is (3 * num_heads * head_size).
+                # Thus, we need weight conversion.
+                output_dim = getattr(param, "output_dim", None)
+                num_heads = self.config.num_attention_heads
+                if output_dim is not None:
+                    loaded_weight_shape = loaded_weight.shape
+                    loaded_weight = loaded_weight.view(
+                        loaded_weight_shape[:output_dim] + (num_heads, 3, -1) +
+                        loaded_weight_shape[output_dim + 1:])
+                    loaded_weight = loaded_weight.transpose(
+                        output_dim, output_dim + 1)
+                    loaded_weight = loaded_weight.reshape(loaded_weight_shape)
+
+            weight_loader = getattr(param, "weight_loader",
+                                    default_weight_loader)
+            weight_loader(param, loaded_weight)

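The load_weights note above about BLOOM's fused QKV layout is easiest to see with concrete shapes: HuggingFace stores the projection head-major, flattened from (num_heads, 3, head_size), while the parallel loader expects it Q/K/V-major, flattened from (3, num_heads, head_size). A toy walk-through of the same view/transpose/reshape (illustrative sizes, not the commit's code):

# Illustrative only: the BLOOM fused-QKV layout conversion done in load_weights above.
import torch

num_heads, head_size, hidden = 4, 8, 32       # toy sizes, hidden == num_heads * head_size
w = torch.randn(num_heads * 3 * head_size, hidden)   # HF layout, flattened head-major

output_dim = 0                                # the stacked/sharded output dimension
shape = w.shape
w = w.view(shape[:output_dim] + (num_heads, 3, -1) + shape[output_dim + 1:])
w = w.transpose(output_dim, output_dim + 1)   # -> (3, num_heads, head_size, hidden)
w = w.reshape(shape)                          # back to 2-D, now Q/K/V-major
assert w.shape == (3 * num_heads * head_size, hidden)
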
+ 378 - 0
aphrodite/modeling/models/chatglm.py

@@ -0,0 +1,378 @@
+# coding=utf-8
+# Adapted from
+# https://github.com/THUDM/ChatGLM2-6B
+"""Inference-only ChatGLM model compatible with THUDM weights."""
+from typing import List, Optional, Tuple
+
+import torch
+from torch import nn
+from torch.nn import LayerNorm
+
+from aphrodite.modeling.metadata import InputMetadata
+from aphrodite.modeling.layers.activation import SiluAndMul
+from aphrodite.modeling.layers.attention import PagedAttention
+from aphrodite.modeling.layers.layernorm import RMSNorm
+from aphrodite.modeling.layers.linear import (LinearMethodBase,
+                                              MergedColumnParallelLinear,
+                                              QKVParallelLinear,
+                                              RowParallelLinear)
+from aphrodite.modeling.layers.rotary_embedding import get_rope
+from aphrodite.modeling.layers.sampler import Sampler
+from aphrodite.modeling.layers.vocab_parallel_embedding import (
+    VocabParallelEmbedding, ParallelLMHead)
+from aphrodite.modeling.megatron.parallel_state import (
+    get_tensor_model_parallel_world_size)
+from aphrodite.modeling.sampling_metadata import SamplingMetadata
+from aphrodite.modeling.hf_downloader import (default_weight_loader,
+                                              hf_model_weights_iterator)
+from aphrodite.common.sequence import SamplerOutput
+from aphrodite.transformers_utils.configs import ChatGLMConfig
+
+KVCache = Tuple[torch.Tensor, torch.Tensor]
+
+
+class GLMAttention(nn.Module):
+
+    def __init__(
+        self,
+        config,
+        linear_method: Optional[LinearMethodBase] = None,
+    ):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = config.num_attention_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.multi_query_attention = config.multi_query_attention
+        self.total_num_kv_heads = (config.multi_query_group_num
+                                   if config.multi_query_attention else
+                                   config.num_attention_heads)
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.head_dim = config.hidden_size // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+
+        self.query_key_value = QKVParallelLinear(
+            self.hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=config.add_bias_linear or config.add_qkv_bias,
+            linear_method=linear_method,
+        )
+        self.dense = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            config.hidden_size,
+            bias=config.add_bias_linear,
+            linear_method=linear_method,
+        )
+
+        # https://huggingface.co/THUDM/chatglm3-6b-32k/blob/e210410255278dd9d74463cf396ba559c0ef801c/modeling_chatglm.py#L141
+        rope_ratio = getattr(config, "rope_ratio", 1.0)
+        max_positions = getattr(config, "seq_length", 8192)
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim // 2,
+            max_position=max_positions,
+            base=10000 * rope_ratio,
+            is_neox_style=False,
+        )
+        self.attn = PagedAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_ids: torch.Tensor,
+        kv_cache: KVCache,
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        qkv, _ = self.query_key_value(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(position_ids, q, k)
+        key_cache, value_cache = kv_cache
+        context_layer = self.attn(
+            q,
+            k,
+            v,
+            key_cache,
+            value_cache,
+            input_metadata,
+        )
+        attn_output, _ = self.dense(context_layer)
+        return attn_output
+
+
+class GLMMLP(nn.Module):
+    """MLP.
+
+    MLP will take the input with h hidden state, project it to 4*h
+    hidden dimension, perform nonlinear transformation, and project the
+    state back into h hidden dimension.
+    """
+
+    def __init__(
+        self,
+        config,
+        linear_method: Optional[LinearMethodBase] = None,
+    ):
+        super().__init__()
+
+        self.add_bias = config.add_bias_linear
+
+        # Project to 4h.
+        self.dense_h_to_4h = MergedColumnParallelLinear(
+            config.hidden_size,
+            [config.ffn_hidden_size] * 2,
+            bias=config.add_bias_linear,
+            linear_method=linear_method,
+        )
+
+        self.activation_func = SiluAndMul()
+
+        # Project back to h.
+        self.dense_4h_to_h = RowParallelLinear(
+            config.ffn_hidden_size,
+            config.hidden_size,
+            bias=config.add_bias_linear,
+            linear_method=linear_method,
+        )
+
+    def forward(self, hidden_states):
+        # [s, b, 4hp]
+        intermediate_parallel, _ = self.dense_h_to_4h(hidden_states)
+        intermediate_parallel = self.activation_func(intermediate_parallel)
+        # [s, b, h]
+        output, _ = self.dense_4h_to_h(intermediate_parallel)
+        return output
+
+
+class GLMBlock(nn.Module):
+    """A single transformer layer.
+
+    Transformer layer takes input with size [s, b, h] and returns an
+    output of the same size.
+    """
+
+    def __init__(
+        self,
+        config,
+        linear_method: Optional[LinearMethodBase] = None,
+    ):
+        super().__init__()
+        self.apply_residual_connection_post_layernorm = (
+            config.apply_residual_connection_post_layernorm)
+
+        self.fp32_residual_connection = config.fp32_residual_connection
+
+        layer_norm_func = RMSNorm if config.rmsnorm else LayerNorm
+        # Layernorm on the input data.
+        self.input_layernorm = layer_norm_func(config.hidden_size,
+                                               eps=config.layernorm_epsilon)
+
+        # Self attention.
+        self.self_attention = GLMAttention(config, linear_method)
+        self.hidden_dropout = config.hidden_dropout
+
+        # Layernorm on the attention output
+        self.post_attention_layernorm = layer_norm_func(
+            config.hidden_size, eps=config.layernorm_epsilon)
+
+        # MLP
+        self.mlp = GLMMLP(config, linear_method)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_ids: torch.Tensor,
+        kv_cache: KVCache,
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        # hidden_states: [num_tokens, h]
+        # Layer norm at the beginning of the transformer layer.
+        layernorm_output = self.input_layernorm(hidden_states)
+        # Self attention.
+        attention_output = self.self_attention(
+            hidden_states=layernorm_output,
+            position_ids=position_ids,
+            kv_cache=kv_cache,
+            input_metadata=input_metadata,
+        )
+
+        # Residual connection.
+        if self.apply_residual_connection_post_layernorm:
+            residual = layernorm_output
+        else:
+            residual = hidden_states
+
+        layernorm_input = residual + attention_output
+
+        # Layer norm post the self attention.
+        layernorm_output = self.post_attention_layernorm(layernorm_input)
+
+        # Second residual connection.
+        if self.apply_residual_connection_post_layernorm:
+            residual = layernorm_output
+        else:
+            residual = layernorm_input
+
+        output = self.mlp(layernorm_output) + residual
+
+        return output
+
+
+class GLMTransformer(nn.Module):
+    """Transformer class."""
+
+    def __init__(
+        self,
+        config,
+        linear_method: Optional[LinearMethodBase] = None,
+    ):
+        super().__init__()
+        self.post_layer_norm = config.post_layer_norm
+
+        # Number of layers.
+        self.num_layers = config.num_layers
+
+        # Transformer layers.
+        self.layers = nn.ModuleList(
+            [GLMBlock(config, linear_method) for i in range(self.num_layers)])
+
+        if self.post_layer_norm:
+            layer_norm_func = RMSNorm if config.rmsnorm else LayerNorm
+            # Final layer norm before output.
+            self.final_layernorm = layer_norm_func(
+                config.hidden_size, eps=config.layernorm_epsilon)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_ids: torch.Tensor,
+        kv_caches: List[KVCache],
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        for i in range(self.num_layers):
+            layer = self.layers[i]
+            hidden_states = layer(
+                hidden_states=hidden_states,
+                position_ids=position_ids,
+                kv_cache=kv_caches[i],
+                input_metadata=input_metadata,
+            )
+        # Final layer norm.
+        if self.post_layer_norm:
+            hidden_states = self.final_layernorm(hidden_states)
+
+        return hidden_states
+
+
+class ChatGLMModel(nn.Module):
+
+    def __init__(
+        self,
+        config,
+        linear_method: Optional[LinearMethodBase] = None,
+    ):
+        super().__init__()
+
+        self.embedding = VocabParallelEmbedding(config.padded_vocab_size,
+                                                config.hidden_size,
+                                                linear_method=linear_method)
+
+        self.num_layers = config.num_layers
+        self.multi_query_group_num = config.multi_query_group_num
+        self.kv_channels = config.kv_channels
+        self.encoder = GLMTransformer(config, linear_method)
+
+        self.output_layer = ParallelLMHead(config.padded_vocab_size,
+                                           config.hidden_size,
+                                           linear_method=linear_method)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        position_ids: torch.Tensor,
+        kv_caches: List[KVCache],
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        inputs_embeds = self.embedding(input_ids)
+
+        # Run encoder.
+        hidden_states = self.encoder(
+            hidden_states=inputs_embeds,
+            position_ids=position_ids,
+            kv_caches=kv_caches,
+            input_metadata=input_metadata,
+        )
+        return hidden_states
+
+
+class ChatGLMForCausalLM(nn.Module):
+
+    def __init__(
+        self,
+        config: ChatGLMConfig,
+        linear_method: Optional[LinearMethodBase] = None,
+    ):
+        super().__init__()
+        self.config: ChatGLMConfig = config
+        self.linear_method = linear_method
+        self.transformer = ChatGLMModel(config, linear_method)
+        # self.lm_head_weight = self.transformer.output_layer.weight
+        self.sampler = Sampler(config.padded_vocab_size)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[KVCache],
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        hidden_states = self.transformer(input_ids, positions, kv_caches,
+                                         input_metadata)
+        return hidden_states
+
+    def sample(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[SamplerOutput]:
+        next_tokens = self.sampler(
+            self.transformer.output_layer(hidden_states), sampling_metadata)
+        return next_tokens
+
+    def load_weights(self,
+                     model_name_or_path: str,
+                     cache_dir: Optional[str] = None,
+                     load_format: str = "auto",
+                     revision: Optional[str] = None):
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+        for name, loaded_weight in hf_model_weights_iterator(
+                model_name_or_path, cache_dir, load_format, revision,
+                self.config):
+            if "rotary_pos_emb.inv_freq" in name:
+                continue
+            if "word_embeddings" in name:
+                name = name.replace(".word_embeddings", "")
+            # Skip loading extra bias for GPTQ models.
+            if name.endswith(".bias") and name not in params_dict:
+                continue
+            param = params_dict[name]
+            weight_loader = getattr(param, "weight_loader",
+                                    default_weight_loader)
+            weight_loader(param, loaded_weight)

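GLMAttention above either partitions or replicates the multi-query KV heads depending on the tensor-parallel size, while query heads are always split evenly. A quick illustration of that arithmetic (the head counts are typical ChatGLM2/3 values, used here only as an example):

# Illustrative only: query/KV head counts per rank, mirroring GLMAttention.__init__.
total_num_heads = 32
total_num_kv_heads = 2          # multi_query_group_num
for tp_size in (1, 2, 4):
    num_heads = total_num_heads // tp_size
    if total_num_kv_heads >= tp_size:
        assert total_num_kv_heads % tp_size == 0    # partition KV heads across ranks
    else:
        assert tp_size % total_num_kv_heads == 0    # replicate KV heads across ranks
    num_kv_heads = max(1, total_num_kv_heads // tp_size)
    print(f"tp={tp_size}: {num_heads} query heads, {num_kv_heads} KV head(s) per rank")
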
+ 6 - 2
aphrodite/modeling/models/decilm.py

@@ -29,6 +29,7 @@ from typing import Optional
 import torch
 from transformers import PretrainedConfig
 
+from aphrodite.common.config import LoRAConfig
 from aphrodite.modeling.layers.linear import LinearMethodBase
 from aphrodite.modeling.models.llama import LlamaForCausalLM
 from aphrodite.modeling.hf_downloader import (default_weight_loader,
@@ -41,7 +42,7 @@ class DeciLMForCausalLM(LlamaForCausalLM):
     Based on the llama executor.
 
     The main difference is that DeciLM uses Variable Grouped Query Attention.
-    The constant number of GQA heads in the decoder is overriden with a value
+    The constant number of GQA heads in the decoder is overridden with a value
     per layer.
 
     Usually, in the HuggingFace implementation, instead of
@@ -57,10 +58,13 @@ class DeciLMForCausalLM(LlamaForCausalLM):
         self,
         config: Optional[PretrainedConfig] = None,
         linear_method: Optional[LinearMethodBase] = None,
+        lora_config: Optional[LoRAConfig] = None,
     ) -> None:
         config.num_key_value_heads = max(config.num_key_value_heads_per_layer)
         delattr(config, "num_key_value_heads_per_layer")
-        super().__init__(config=config, linear_method=linear_method)
+        super().__init__(config=config,
+                         linear_method=linear_method,
+                         lora_config=lora_config)
 
     def load_weights(self,
                      model_name_or_path: str,

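DeciLM's variable grouped-query attention gives each decoder layer its own KV-head count; the constructor above collapses that list to its maximum so the Llama code path can use one uniform KV cache, and the loader then has to widen layers trained with fewer KV heads. A sketch of that idea (not the commit's exact loader code; shapes and the helper are hypothetical):

# Sketch only: repeating K/V weights of a narrower layer up to the uniform head count.
import torch

head_size = 128
num_key_value_heads_per_layer = [4, 2, 1, 4]          # hypothetical per-layer config
target_kv_heads = max(num_key_value_heads_per_layer)  # what the constructor keeps (4)

def degroup(kv_weight: torch.Tensor, layer_kv_heads: int) -> torch.Tensor:
    repeats = target_kv_heads // layer_kv_heads
    kv = kv_weight.view(layer_kv_heads, head_size, -1)
    kv = torch.repeat_interleave(kv, repeats, dim=0)
    return kv.reshape(target_kv_heads * head_size, -1)

k_weight = torch.randn(1 * head_size, 4096)           # a layer trained with a single KV head
padded = degroup(k_weight, layer_kv_heads=1)          # now sized like a 4-KV-head layer
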
+ 23 - 72
aphrodite/modeling/models/deepseek.py

@@ -28,14 +28,16 @@ import torch
 from torch import nn
 from transformers import PretrainedConfig
 
 from aphrodite.modeling.metadata import InputMetadata
 from aphrodite.modeling.layers.activation import SiluAndMul
 from aphrodite.modeling.layers.attention import PagedAttention
 from aphrodite.modeling.layers.triton_kernel.fused_moe import fused_moe
 from aphrodite.modeling.layers.layernorm import RMSNorm
-from aphrodite.modeling.layers.linear import (
-    LinearMethodBase, MergedColumnParallelLinear, ReplicatedLinear,
-    QKVParallelLinear, RowParallelLinear, ColumnParallelLinear)
+from aphrodite.modeling.layers.linear import (LinearMethodBase,
+                                              MergedColumnParallelLinear,
+                                              ReplicatedLinear,
+                                              QKVParallelLinear,
+                                              RowParallelLinear)
 from aphrodite.modeling.layers.rotary_embedding import get_rope
 from aphrodite.modeling.layers.sampler import Sampler
 from aphrodite.modeling.layers.vocab_parallel_embedding import (
@@ -63,23 +65,10 @@ class DeepseekMLP(nn.Module):
         reduce_results: bool = True,
     ) -> None:
         super().__init__()
-        if linear_method is not None and not linear_method.quant_config.merge_weight(
-        ):
-            self.merge_weight = False
-            self.gate_proj = ColumnParallelLinear(hidden_size,
-                                                  intermediate_size,
-                                                  bias=False,
-                                                  linear_method=linear_method)
-            self.up_proj = ColumnParallelLinear(hidden_size,
-                                                intermediate_size,
-                                                bias=False,
-                                                linear_method=linear_method)
-        else:
-            self.merge_weight = True
-            self.gate_up_proj = MergedColumnParallelLinear(
-                hidden_size, [intermediate_size] * 2,
-                bias=False,
-                linear_method=linear_method)
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size, [intermediate_size] * 2,
+            bias=False,
+            linear_method=linear_method)
         self.down_proj = RowParallelLinear(intermediate_size,
                                            hidden_size,
                                            bias=False,
@@ -91,12 +80,7 @@ class DeepseekMLP(nn.Module):
         self.act_fn = SiluAndMul()
 
     def forward(self, x):
-        if self.merge_weight:
-            gate_up, _ = self.gate_up_proj(x)
-        else:
-            up, _ = self.up_proj(x)
-            gate, _ = self.gate_proj(x)
-            gate_up = torch.cat([gate, up], dim=-1)
+        gate_up, _ = self.gate_up_proj(x)
         x = self.act_fn(gate_up)
         x, _ = self.down_proj(x)
         return x
@@ -171,7 +155,6 @@ class DeepseekMoE(nn.Module):
             shared_output = self.shared_experts(hidden_states)
         # router_logits: (batch * sequence_length, n_experts)
         router_logits, _ = self.gate(hidden_states)
-
         final_hidden_states = fused_moe(hidden_states,
                                         self.w1,
                                         self.w2,
@@ -224,31 +207,14 @@ class DeepseekAttention(nn.Module):
         self.rope_theta = rope_theta
         self.max_position_embeddings = max_position_embeddings
 
-        if linear_method is not None and not linear_method.quant_config.merge_weight(
-        ):
-            self.merge_weight = False
-            self.q_proj = ColumnParallelLinear(hidden_size,
-                                               self.q_size,
-                                               bias=False,
-                                               linear_method=linear_method)
-            self.k_proj = ColumnParallelLinear(hidden_size,
-                                               self.kv_size,
-                                               bias=False,
-                                               linear_method=linear_method)
-            self.v_proj = ColumnParallelLinear(hidden_size,
-                                               self.kv_size,
-                                               bias=False,
-                                               linear_method=linear_method)
-        else:
-            self.merge_weight = True
-            self.qkv_proj = QKVParallelLinear(
-                hidden_size,
-                self.head_dim,
-                self.total_num_heads,
-                self.total_num_kv_heads,
-                bias=False,
-                linear_method=linear_method,
-            )
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=False,
+            linear_method=linear_method,
+        )
 
         self.o_proj = RowParallelLinear(
             self.total_num_heads * self.head_dim,
@@ -257,15 +223,12 @@ class DeepseekAttention(nn.Module):
             linear_method=linear_method,
         )
 
-        is_neox_style = True if linear_method is None or linear_method.quant_config.rope_style(
-        ) is None else linear_method.quant_config.rope_style()
         self.rotary_emb = get_rope(
             self.head_dim,
             rotary_dim=self.head_dim,
             max_position=max_position_embeddings,
             base=rope_theta,
             rope_scaling=rope_scaling,
-            is_neox_style=is_neox_style,
         )
         self.attn = PagedAttention(self.num_heads,
                                    self.head_dim,
@@ -279,14 +242,8 @@ class DeepseekAttention(nn.Module):
         kv_cache: KVCache,
         input_metadata: InputMetadata,
     ) -> torch.Tensor:
-        if self.merge_weight:
-            qkv, _ = self.qkv_proj(hidden_states)
-            q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size],
-                                dim=-1)
-        else:
-            q, _ = self.q_proj(hidden_states)
-            k, _ = self.k_proj(hidden_states)
-            v, _ = self.v_proj(hidden_states)
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
         q, k = self.rotary_emb(positions, q, k)
         k_cache, v_cache = kv_cache
         attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata)
@@ -375,7 +332,6 @@ class DeepseekModel(nn.Module):
         self.embed_tokens = VocabParallelEmbedding(
             config.vocab_size,
             config.hidden_size,
-            linear_method=linear_method,
         )
         self.layers = nn.ModuleList([
             DeepseekDecoderLayer(config,
@@ -414,9 +370,7 @@ class DeepseekForCausalLM(nn.Module):
         self.config = config
         self.linear_method = linear_method
         self.model = DeepseekModel(config, linear_method)
-        self.lm_head = ParallelLMHead(config.vocab_size,
-                                      config.hidden_size,
-                                      linear_method=linear_method)
+        self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
         self.sampler = Sampler(config.vocab_size)
 
     def forward(
@@ -452,16 +406,13 @@ class DeepseekForCausalLM(nn.Module):
             ("gate_up_proj", "gate_proj", 0),
             ("gate_up_proj", "up_proj", 1),
         ]
-        if self.linear_method is not None and not self.linear_method.quant_config.merge_weight(
-        ):
-            stacked_params_mapping = []
+
         params_dict = dict(self.named_parameters())
         for name, loaded_weight in hf_model_weights_iterator(
                 model_name_or_path,
                 cache_dir,
                 load_format,
                 revision,
-                self.config,
                 fall_back_to_pt=False):
             if "rotary_emb.inv_freq" in name:
                 continue
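
The refactor above collapses the separate gate_proj/up_proj branches into a single MergedColumnParallelLinear followed by SiluAndMul. As a minimal single-GPU sketch of what that merged projection computes (ToyMergedMLP, the plain nn.Linear stand-in, and the toy sizes are illustrative and not part of the diff):

import torch
import torch.nn as nn
import torch.nn.functional as F

class ToyMergedMLP(nn.Module):
    """Hypothetical stand-in: gate_proj and up_proj fused into one matmul."""

    def __init__(self, hidden_size: int, intermediate_size: int) -> None:
        super().__init__()
        self.gate_up_proj = nn.Linear(hidden_size, 2 * intermediate_size, bias=False)
        self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        gate_up = self.gate_up_proj(x)        # one fused projection
        gate, up = gate_up.chunk(2, dim=-1)   # split at half width, as SiluAndMul does
        return self.down_proj(F.silu(gate) * up)

x = torch.randn(2, 16, 64)
print(ToyMergedMLP(64, 128)(x).shape)         # torch.Size([2, 16, 64])

Fusing the two projections trades a second matmul launch for one wider GEMM; the tensor-parallel MergedColumnParallelLinear in the diff does the same thing with sharded weights.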

+ 446 - 0
aphrodite/modeling/models/falcon.py

@@ -0,0 +1,446 @@
+# coding=utf-8
+# Adapted from
+# https://github.com/huggingface/transformers/blob/a5cc30d72ae2dc19af534e4b35c986cc28db1275/src/transformers/models/falcon/modeling_falcon.py
+# Copyright 2023 The PygmalionAI team.
+# Copyright 2023 The vLLM team.
+# Copyright 2023 the Falcon authors and HuggingFace Inc. team.  All rights
+# reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Falcon model."""
+
+import math
+from typing import List, Optional, Tuple, Union
+
+import torch
+from torch import nn
+from torch.nn import LayerNorm
+from transformers import FalconConfig as HF_FalconConfig
+
+from aphrodite.modeling.metadata import InputMetadata
+from aphrodite.modeling.layers.activation import get_act_fn
+from aphrodite.modeling.layers.attention import PagedAttention
+from aphrodite.modeling.layers.linear import (ColumnParallelLinear,
+                                              LinearMethodBase,
+                                              QKVParallelLinear,
+                                              RowParallelLinear)
+from aphrodite.modeling.layers.rotary_embedding import get_rope
+from aphrodite.modeling.layers.sampler import Sampler
+from aphrodite.modeling.layers.vocab_parallel_embedding import (
+    VocabParallelEmbedding, ParallelLMHead)
+from aphrodite.modeling.megatron.communication_op import (
+    tensor_model_parallel_all_reduce)
+from aphrodite.modeling.megatron.parallel_state import (
+    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
+from aphrodite.modeling.sampling_metadata import SamplingMetadata
+from aphrodite.modeling.hf_downloader import (default_weight_loader,
+                                              hf_model_weights_iterator)
+from aphrodite.common.sequence import SamplerOutput
+from aphrodite.transformers_utils.configs import RWConfig
+
+KVCache = Tuple[torch.Tensor, torch.Tensor]
+FalconConfig = Union[HF_FalconConfig, RWConfig]
+
+
+def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor:
+    closest_power_of_2 = 2**math.floor(math.log2(total_num_heads))
+    base = torch.tensor(2**(-(2**-(math.log2(closest_power_of_2) - 3))),
+                        dtype=torch.float32)
+    powers = torch.arange(1, 1 + closest_power_of_2, dtype=torch.int32)
+    slopes = torch.pow(base, powers)
+
+    if closest_power_of_2 != total_num_heads:
+        extra_base = torch.tensor(
+            2**(-(2**-(math.log2(2 * closest_power_of_2) - 3))),
+            dtype=torch.float32)
+        num_remaining_heads = min(closest_power_of_2,
+                                  total_num_heads - closest_power_of_2)
+        extra_powers = torch.arange(1,
+                                    1 + 2 * num_remaining_heads,
+                                    2,
+                                    dtype=torch.int32)
+        slopes = torch.cat(
+            [slopes, torch.pow(extra_base, extra_powers)], dim=0)
+
+    return slopes
+
+
+class FalconAttention(nn.Module):
+
+    def __init__(
+        self,
+        config: FalconConfig,
+        linear_method: Optional[LinearMethodBase] = None,
+    ):
+        super().__init__()
+
+        self.hidden_size = config.hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+
+        self.total_num_heads = config.num_attention_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.head_dim = self.hidden_size // self.total_num_heads
+        assert self.head_dim * self.total_num_heads == self.hidden_size
+
+        self.new_decoder_architecture = config.new_decoder_architecture
+        self.multi_query = config.multi_query
+
+        if self.new_decoder_architecture:
+            self.total_num_kv_heads = config.num_kv_heads
+        elif self.multi_query:
+            self.total_num_kv_heads = 1
+        else:
+            self.total_num_kv_heads = self.total_num_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+
+        self.query_key_value = QKVParallelLinear(
+            self.hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=config.bias,
+            skip_bias_add=True,
+            linear_method=linear_method,
+        )
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+
+        # Layer-wise attention scaling
+        self.inv_norm_factor = 1.0 / math.sqrt(self.head_dim)
+        self.reduce_row_parallel_results = not (config.new_decoder_architecture
+                                                or config.parallel_attn)
+        self.dense = RowParallelLinear(
+            self.hidden_size,
+            self.hidden_size,
+            bias=config.bias,
+            skip_bias_add=True,
+            linear_method=linear_method,
+            reduce_results=self.reduce_row_parallel_results)
+
+        self.use_rotary = config.rotary
+        self.use_alibi = config.alibi
+        assert not (self.use_rotary and self.use_alibi), (
+            "Rotary and alibi are mutually exclusive.")
+
+        if self.use_rotary:
+            rope_theta = getattr(config, "rope_theta", 10000)
+            max_position_embeddings = getattr(config,
+                                              "max_position_embeddings", 8192)
+            self.rotary_emb = get_rope(
+                self.head_dim,
+                rotary_dim=self.head_dim,
+                max_position=max_position_embeddings,
+                base=rope_theta,
+            )
+            self.attn = PagedAttention(self.num_heads,
+                                       self.head_dim,
+                                       self.inv_norm_factor,
+                                       num_kv_heads=self.num_kv_heads)
+        elif self.use_alibi:
+            tp_rank = get_tensor_model_parallel_rank()
+            head_start = tp_rank * self.num_heads
+            head_end = (tp_rank + 1) * self.num_heads
+            alibi_slopes = (_get_alibi_slopes(self.total_num_heads) *
+                            self.inv_norm_factor)
+            alibi_slopes = alibi_slopes[head_start:head_end].tolist()
+            self.attn = PagedAttention(self.num_heads,
+                                       self.head_dim,
+                                       self.inv_norm_factor,
+                                       num_kv_heads=self.num_kv_heads,
+                                       alibi_slopes=alibi_slopes)
+        else:
+            self.attn = PagedAttention(self.num_heads,
+                                       self.head_dim,
+                                       scale=self.inv_norm_factor,
+                                       num_kv_heads=self.num_kv_heads)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        kv_cache: KVCache,
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        qkv, bias = self.query_key_value(hidden_states)
+        if bias is not None:
+            qkv += bias
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        if self.use_rotary:
+            q, k = self.rotary_emb(positions, q, k)
+        k_cache, v_cache = kv_cache
+        attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata)
+        attn_output, bias = self.dense(attn_output)
+        return attn_output, bias
+
+
+class FalconMLP(nn.Module):
+
+    def __init__(
+        self,
+        config: FalconConfig,
+        linear_method: Optional[LinearMethodBase] = None,
+    ):
+        super().__init__()
+        hidden_size = config.hidden_size
+
+        self.dense_h_to_4h = ColumnParallelLinear(hidden_size,
+                                                  4 * hidden_size,
+                                                  bias=config.bias,
+                                                  skip_bias_add=True,
+                                                  linear_method=linear_method)
+        quant_config = getattr(linear_method, "quant_config", None)
+        self.act = get_act_fn("gelu", quant_config, 4 * hidden_size)
+        self.reduce_row_parallel_results = not (config.new_decoder_architecture
+                                                or config.parallel_attn)
+        self.dense_4h_to_h = RowParallelLinear(
+            4 * hidden_size,
+            hidden_size,
+            bias=config.bias,
+            skip_bias_add=True,
+            reduce_results=self.reduce_row_parallel_results,
+            linear_method=linear_method)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # NOTE(zhuohan): Following huggingface, we do not fuse bias add here.
+        x, bias = self.dense_h_to_4h(x)
+        if bias is not None:
+            x += bias
+        x = self.act(x)
+        x, bias = self.dense_4h_to_h(x)
+        return x, bias
+
+
+class FalconDecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: FalconConfig,
+        linear_method: Optional[LinearMethodBase] = None,
+    ):
+        super().__init__()
+        hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.self_attention = FalconAttention(config, linear_method)
+        self.mlp = FalconMLP(config, linear_method)
+        self.config = config
+
+        if config.new_decoder_architecture:
+            # The layer norm before self-attention
+            self.ln_attn = LayerNorm(hidden_size,
+                                     eps=config.layer_norm_epsilon)
+            # The layer norm before the MLP
+            self.ln_mlp = LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
+        else:
+            self.input_layernorm = LayerNorm(hidden_size,
+                                             eps=config.layer_norm_epsilon)
+            if not config.parallel_attn:
+                self.post_attention_layernorm = LayerNorm(
+                    hidden_size, eps=config.layer_norm_epsilon)
+
+        self.reduce_row_parallel_results = not (config.new_decoder_architecture
+                                                or config.parallel_attn)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        kv_cache: KVCache,
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        residual = hidden_states
+
+        if self.config.new_decoder_architecture:
+            attention_layernorm_out = self.ln_attn(hidden_states)
+            mlp_layernorm_out = self.ln_mlp(hidden_states)
+        else:
+            attention_layernorm_out = self.input_layernorm(hidden_states)
+
+        # Self attention.
+        attention_output, attention_bias = self.self_attention(
+            positions=positions,
+            hidden_states=attention_layernorm_out,
+            kv_cache=kv_cache,
+            input_metadata=input_metadata,
+        )
+        if self.reduce_row_parallel_results and attention_bias is not None:
+            attention_output += attention_bias
+
+        if not self.config.new_decoder_architecture:
+            if self.config.parallel_attn:
+                mlp_layernorm_out = attention_layernorm_out
+            else:
+                residual += attention_output
+                mlp_layernorm_out = self.post_attention_layernorm(residual)
+
+        # MLP.
+        mlp_output, mlp_bias = self.mlp(mlp_layernorm_out)
+        if self.reduce_row_parallel_results and mlp_bias is not None:
+            mlp_output += mlp_bias
+
+        if not self.reduce_row_parallel_results:
+            # When MLP and Attention layers are parallel, we can use
+            # only one all-reduce operator to reduce the results from
+            # both MLP and Attention layers.
+            mlp_output += attention_output
+            mlp_output = tensor_model_parallel_all_reduce(mlp_output)
+            if attention_bias is not None:
+                mlp_output += attention_bias
+            if mlp_bias is not None:
+                mlp_output += mlp_bias
+
+        output = mlp_output + residual
+        return output
+
+
+class FalconModel(nn.Module):
+
+    def __init__(
+        self,
+        config: FalconConfig,
+        linear_method: Optional[LinearMethodBase] = None,
+    ):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.use_alibi = config.alibi
+
+        # Embedding + LN Embedding
+        self.word_embeddings = VocabParallelEmbedding(
+            config.vocab_size, self.embed_dim, linear_method=linear_method)
+
+        # Transformer blocks
+        self.h = nn.ModuleList([
+            FalconDecoderLayer(config, linear_method)
+            for _ in range(config.num_hidden_layers)
+        ])
+
+        # Final Layer Norm
+        self.ln_f = LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        positions: torch.Tensor,
+        kv_caches: List[KVCache],
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        hidden_states = self.word_embeddings(input_ids)
+        for i in range(len(self.h)):
+            layer = self.h[i]
+            hidden_states = layer(
+                positions,
+                hidden_states,
+                kv_caches[i],
+                input_metadata,
+            )
+        hidden_states = self.ln_f(hidden_states)
+        return hidden_states
+
+
+class FalconForCausalLM(nn.Module):
+
+    def __init__(
+        self,
+        config: FalconConfig,
+        linear_method: Optional[LinearMethodBase] = None,
+    ):
+        super().__init__()
+        self.config = config
+        self.linear_method = linear_method
+        self.transformer = FalconModel(config, linear_method)
+        self.lm_head = ParallelLMHead(config.vocab_size,
+                                      config.hidden_size,
+                                      linear_method=linear_method)
+        self.sampler = Sampler(config.vocab_size)
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        positions: torch.Tensor,
+        kv_caches: List[KVCache],
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        hidden_states = self.transformer(
+            input_ids,
+            positions,
+            kv_caches,
+            input_metadata,
+        )
+        return hidden_states
+
+    def sample(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[SamplerOutput]:
+        next_tokens = self.sampler(self.lm_head(hidden_states),
+                                   sampling_metadata)
+        return next_tokens
+
+    def load_weights(self,
+                     model_name_or_path: str,
+                     cache_dir: Optional[str] = None,
+                     load_format: str = "auto",
+                     revision: Optional[str] = None):
+        total_num_heads = self.config.num_attention_heads
+        if self.config.new_decoder_architecture:
+            total_num_kv_heads = self.config.num_kv_heads
+        elif self.config.multi_query:
+            total_num_kv_heads = 1
+        else:
+            total_num_kv_heads = total_num_heads
+        num_query_heads_per_kv_head = total_num_heads // total_num_kv_heads
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in hf_model_weights_iterator(
+                model_name_or_path, cache_dir, load_format, revision,
+                self.config):
+            # Skip loading extra bias for GPTQ models.
+            if name.endswith(".bias") and name not in params_dict:
+                continue
+            param = params_dict[name]
+            if "query_key_value" in name:
+                output_dim = getattr(param, "output_dim", None)
+                loaded_weight_shape = loaded_weight.shape
+                if output_dim is not None:
+                    loaded_weight = loaded_weight.view(
+                        loaded_weight_shape[:output_dim] +
+                        (total_num_kv_heads, num_query_heads_per_kv_head + 2,
+                         -1) + loaded_weight_shape[output_dim + 1:])
+                    wq = loaded_weight.narrow(
+                        output_dim + 1, 0,
+                        num_query_heads_per_kv_head).reshape(
+                            *loaded_weight_shape[:output_dim], -1,
+                            *loaded_weight_shape[output_dim + 1:])
+                    wk = loaded_weight.narrow(
+                        output_dim + 1, num_query_heads_per_kv_head,
+                        1).reshape(*loaded_weight_shape[:output_dim], -1,
+                                   *loaded_weight_shape[output_dim + 1:])
+                    wv = loaded_weight.narrow(
+                        output_dim + 1, num_query_heads_per_kv_head + 1,
+                        1).reshape(*loaded_weight_shape[:output_dim], -1,
+                                   *loaded_weight_shape[output_dim + 1:])
+                    loaded_weight = torch.cat([wq, wk, wv], dim=output_dim)
+
+            weight_loader = getattr(param, "weight_loader",
+                                    default_weight_loader)
+            weight_loader(param, loaded_weight)
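
The _get_alibi_slopes helper above follows the standard ALiBi recipe: geometric slopes for the nearest power-of-two head count, plus interleaved extra slopes for any remainder. A standalone sanity check of that formula (purely illustrative, outside the model code) might look like:

import math
import torch

def alibi_slopes(total_num_heads: int) -> torch.Tensor:
    # Same recipe as _get_alibi_slopes above.
    closest = 2 ** math.floor(math.log2(total_num_heads))
    base = 2.0 ** (-(2.0 ** -(math.log2(closest) - 3)))
    slopes = base ** torch.arange(1, closest + 1, dtype=torch.float32)
    if closest != total_num_heads:
        extra_base = 2.0 ** (-(2.0 ** -(math.log2(2 * closest) - 3)))
        num_extra = min(closest, total_num_heads - closest)
        extra_powers = torch.arange(1, 1 + 2 * num_extra, 2, dtype=torch.float32)
        slopes = torch.cat([slopes, extra_base ** extra_powers])
    return slopes

print(alibi_slopes(8))   # tensor([0.5000, 0.2500, ..., 0.0039]): 1/2 down to 1/256
print(alibi_slopes(12))  # 8 power-of-two slopes plus 4 interleaved extras

In FalconAttention itself the slopes are additionally scaled by inv_norm_factor and sliced to the heads owned by the current tensor-parallel rank before being handed to PagedAttention.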

+ 65 - 62
aphrodite/modeling/models/yi.py → aphrodite/modeling/models/gemma.py

@@ -1,14 +1,7 @@
 # coding=utf-8
-# Adapted from
-# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # Copyright 2023 The PygmalionAI team.
 # Copyright 2023 The vLLM team.
-# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
-#
-# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
-# and OPT implementations in this library. It has been modified from its
-# original forms to accommodate minor architectural differences compared
-# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+# Copyright (c) Google Inc.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -21,15 +14,15 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Inference-only Yi model compatible with HuggingFace weights."""
-from typing import Any, Dict, List, Optional, Tuple
+"""Inference-only Gemma model compatible with HuggingFace weights."""
+from typing import List, Optional, Tuple
 
 import torch
 from torch import nn
-from aphrodite.transformers_utils.configs.yi import YiConfig
+from transformers import GemmaConfig
 
 from aphrodite.modeling.metadata import InputMetadata
-from aphrodite.modeling.layers.activation import SiluAndMul
+from aphrodite.modeling.layers.activation import GeluAndMul
 from aphrodite.modeling.layers.attention import PagedAttention
 from aphrodite.modeling.layers.layernorm import RMSNorm
 from aphrodite.modeling.layers.linear import (LinearMethodBase,
@@ -51,13 +44,12 @@ from aphrodite.common.sequence import SamplerOutput
 KVCache = Tuple[torch.Tensor, torch.Tensor]
 
 
-class YiMLP(nn.Module):
+class GemmaMLP(nn.Module):
 
     def __init__(
         self,
         hidden_size: int,
         intermediate_size: int,
-        hidden_act: str,
         linear_method: Optional[LinearMethodBase] = None,
     ) -> None:
         super().__init__()
@@ -82,10 +74,7 @@ class YiMLP(nn.Module):
                                            hidden_size,
                                            bias=False,
                                            linear_method=linear_method)
-        if hidden_act != "silu":
-            raise ValueError(f"Unsupported activation: {hidden_act}. "
-                             "Only silu is supported for now.")
-        self.act_fn = SiluAndMul()
+        self.act_fn = GeluAndMul()
 
     def forward(self, x):
         if self.merge_weight:
@@ -99,18 +88,16 @@ class YiMLP(nn.Module):
         return x
 
 
-class YiAttention(nn.Module):
+class GemmaAttention(nn.Module):
 
-    def __init__(
-        self,
-        hidden_size: int,
-        num_heads: int,
-        num_kv_heads: int,
-        rope_theta: float = 10000,
-        rope_scaling: Optional[Dict[str, Any]] = None,
-        max_position_embeddings: int = 8192,
-        linear_method: Optional[LinearMethodBase] = None,
-    ) -> None:
+    def __init__(self,
+                 hidden_size: int,
+                 num_heads: int,
+                 num_kv_heads: int,
+                 head_dim: int,
+                 max_position_embeddings: int = 8192,
+                 rope_theta: float = 10000,
+                 linear_method: Optional[LinearMethodBase] = None) -> None:
         super().__init__()
         self.hidden_size = hidden_size
         tp_size = get_tensor_model_parallel_world_size()
@@ -127,12 +114,11 @@ class YiAttention(nn.Module):
             # the KV heads across multiple tensor parallel GPUs.
             assert tp_size % self.total_num_kv_heads == 0
         self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
-        self.head_dim = hidden_size // self.total_num_heads
+        self.head_dim = head_dim
         self.q_size = self.num_heads * self.head_dim
         self.kv_size = self.num_kv_heads * self.head_dim
         self.scaling = self.head_dim**-0.5
         self.rope_theta = rope_theta
-        self.max_position_embeddings = max_position_embeddings
 
         if linear_method is not None and not linear_method.quant_config.merge_weight(
         ):
@@ -165,15 +151,12 @@ class YiAttention(nn.Module):
             bias=False,
             linear_method=linear_method,
         )
-        is_neox_style = True if linear_method is None or linear_method.quant_config.rope_style(
-        ) is None else linear_method.quant_config.rope_style()
         self.rotary_emb = get_rope(
             self.head_dim,
             rotary_dim=self.head_dim,
             max_position=max_position_embeddings,
             base=self.rope_theta,
-            rope_scaling=rope_scaling,
-            is_neox_style=is_neox_style,
+            is_neox_style=True,
         )
         self.attn = PagedAttention(self.num_heads,
                                    self.head_dim,
@@ -202,36 +185,33 @@ class YiAttention(nn.Module):
         return output
 
 
-class YiDecoderLayer(nn.Module):
+class GemmaDecoderLayer(nn.Module):
 
     def __init__(
         self,
-        config: YiConfig,
+        config: GemmaConfig,
         linear_method: Optional[LinearMethodBase] = None,
     ) -> None:
         super().__init__()
         self.hidden_size = config.hidden_size
-        rope_theta = getattr(config, "rope_theta", 10000)
-        rope_scaling = getattr(config, "rope_scaling", None)
-        max_position_embeddings = getattr(config, "max_position_embeddings",
-                                          8192)
-        self.self_attn = YiAttention(
+        self.self_attn = GemmaAttention(
             hidden_size=self.hidden_size,
             num_heads=config.num_attention_heads,
             num_kv_heads=config.num_key_value_heads,
-            rope_theta=rope_theta,
-            rope_scaling=rope_scaling,
-            max_position_embeddings=max_position_embeddings,
+            head_dim=config.head_dim,
+            max_position_embeddings=config.max_position_embeddings,
+            rope_theta=config.rope_theta,
             linear_method=linear_method,
         )
-        self.mlp = YiMLP(
+        self.mlp = GemmaMLP(
             hidden_size=self.hidden_size,
             intermediate_size=config.intermediate_size,
-            hidden_act=config.hidden_act,
             linear_method=linear_method,
         )
-        self.ln1 = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.ln2 = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.input_layernorm = RMSNorm(config.hidden_size,
+                                       eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(config.hidden_size,
+                                                eps=config.rms_norm_eps)
 
     def forward(
         self,
@@ -244,9 +224,10 @@ class YiDecoderLayer(nn.Module):
         # Self Attention
         if residual is None:
             residual = hidden_states
-            hidden_states = self.ln1(hidden_states)
+            hidden_states = self.input_layernorm(hidden_states)
         else:
-            hidden_states, residual = self.ln1(hidden_states, residual)
+            hidden_states, residual = self.input_layernorm(
+                hidden_states, residual)
         hidden_states = self.self_attn(
             positions=positions,
             hidden_states=hidden_states,
@@ -255,27 +236,27 @@ class YiDecoderLayer(nn.Module):
         )
 
         # Fully Connected
-        hidden_states, residual = self.ln2(hidden_states, residual)
+        hidden_states, residual = self.post_attention_layernorm(
+            hidden_states, residual)
         hidden_states = self.mlp(hidden_states)
         return hidden_states, residual
 
 
-class YiModel(nn.Module):
+class GemmaModel(nn.Module):
 
     def __init__(
         self,
-        config: YiConfig,
+        config: GemmaConfig,
         linear_method: Optional[LinearMethodBase] = None,
     ) -> None:
         super().__init__()
         self.config = config
-        self.padding_idx = config.pad_token_id
-        self.vocab_size = config.vocab_size
+
         self.embed_tokens = VocabParallelEmbedding(config.vocab_size,
                                                    config.hidden_size,
                                                    linear_method=linear_method)
         self.layers = nn.ModuleList([
-            YiDecoderLayer(config, linear_method)
+            GemmaDecoderLayer(config, linear_method)
             for _ in range(config.num_hidden_layers)
         ])
         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
@@ -288,6 +269,9 @@ class YiModel(nn.Module):
         input_metadata: InputMetadata,
     ) -> torch.Tensor:
         hidden_states = self.embed_tokens(input_ids)
+        # Normalize the embedding by sqrt(hidden_size)
+        hidden_states *= self.config.hidden_size**0.5
+
         residual = None
         for i in range(len(self.layers)):
             layer = self.layers[i]
@@ -302,22 +286,23 @@ class YiModel(nn.Module):
         return hidden_states
 
 
-class YiForCausalLM(nn.Module):
+class GemmaForCausalLM(nn.Module):
 
     def __init__(
         self,
-        config: YiConfig,
+        config: GemmaConfig,
         linear_method: Optional[LinearMethodBase] = None,
     ) -> None:
         super().__init__()
         self.config = config
         self.linear_method = linear_method
-        self.model = YiModel(config, linear_method)
+        self.model = GemmaModel(config, linear_method)
         self.lm_head = ParallelLMHead(config.vocab_size,
                                       config.hidden_size,
                                       linear_method=linear_method)
         self.sampler = Sampler(config.vocab_size)
 
+    @torch.no_grad()
     def forward(
         self,
         input_ids: torch.Tensor,
@@ -355,11 +340,19 @@ class YiForCausalLM(nn.Module):
         ):
             stacked_params_mapping = []
         params_dict = dict(self.named_parameters())
+        loaded_params = set()
         for name, loaded_weight in hf_model_weights_iterator(
                 model_name_or_path, cache_dir, load_format, revision,
                 self.config):
             if "rotary_emb.inv_freq" in name:
                 continue
+            if "embed_tokens.weight" in name:
+                # Copy word embedding to lm_head
+                loaded_params.add("lm_head.weight")
+                lm_head_param = params_dict["lm_head.weight"]
+                weight_loader = getattr(lm_head_param, "weight_loader",
+                                        default_weight_loader)
+                weight_loader(lm_head_param, loaded_weight)
             for (param_name, weight_name, shard_id) in stacked_params_mapping:
                 if weight_name not in name:
                     continue
@@ -372,10 +365,20 @@ class YiForCausalLM(nn.Module):
                 weight_loader(param, loaded_weight, shard_id)
                 break
             else:
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
+                # Skip loading extra layer for lora models.
+                if "lm_head" in name:
                     continue
+                # GemmaRMSNorm is different from Llama's in that it multiplies
+                # (1 + weight) to the output, instead of just weight.
+                if "norm.weight" in name:
+                    loaded_weight += 1.0
                 param = params_dict[name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
                 weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        unloaded_params = params_dict.keys() - loaded_params
+        if unloaded_params:
+            raise RuntimeError(
+                "Some weights are not initialized from checkpoints: "
+                f"{unloaded_params}")

+ 286 - 0
aphrodite/modeling/models/gpt2.py

@@ -0,0 +1,286 @@
+# coding=utf-8
+# Adapted from
+# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt2/modeling_gpt2.py
+# Copyright 2023 The PygmalionAI team.
+# Copyright 2023 The vLLM team.
+# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only GPT-2 model compatible with HuggingFace weights."""
+from typing import List, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import GPT2Config
+
+from aphrodite.modeling.metadata import InputMetadata
+from aphrodite.modeling.layers.activation import get_act_fn
+from aphrodite.modeling.layers.attention import PagedAttention
+from aphrodite.modeling.layers.linear import (ColumnParallelLinear,
+                                              LinearMethodBase,
+                                              QKVParallelLinear,
+                                              RowParallelLinear)
+from aphrodite.modeling.layers.sampler import Sampler
+from aphrodite.modeling.layers.vocab_parallel_embedding import (
+    VocabParallelEmbedding, ParallelLMHead)
+from aphrodite.modeling.megatron.parallel_state import (
+    get_tensor_model_parallel_world_size)
+from aphrodite.modeling.sampling_metadata import SamplingMetadata
+from aphrodite.modeling.hf_downloader import (default_weight_loader,
+                                              hf_model_weights_iterator)
+from aphrodite.common.sequence import SamplerOutput
+
+KVCache = Tuple[torch.Tensor, torch.Tensor]
+
+
+class GPT2Attention(nn.Module):
+
+    def __init__(
+        self,
+        config: GPT2Config,
+        linear_method: Optional[LinearMethodBase] = None,
+    ):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        total_num_heads = config.num_attention_heads
+        tensor_model_parallel_world_size = (
+            get_tensor_model_parallel_world_size())
+        assert total_num_heads % tensor_model_parallel_world_size == 0
+        self.num_heads = total_num_heads // tensor_model_parallel_world_size
+        self.head_dim = self.hidden_size // total_num_heads
+        self.scale = self.head_dim**-0.5
+
+        self.c_attn = QKVParallelLinear(
+            self.hidden_size,
+            self.head_dim,
+            total_num_heads,
+            bias=True,
+            linear_method=linear_method,
+        )
+        self.c_proj = RowParallelLinear(
+            self.hidden_size,
+            self.hidden_size,
+            bias=True,
+            linear_method=linear_method,
+        )
+        self.attn = PagedAttention(self.num_heads,
+                                   self.head_dim,
+                                   scale=self.scale)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        kv_cache: KVCache,
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        qkv, _ = self.c_attn(hidden_states)
+        q, k, v = qkv.chunk(chunks=3, dim=-1)
+        key_cache, value_cache = kv_cache
+        attn_output = self.attn(q, k, v, key_cache, value_cache,
+                                input_metadata)
+        attn_output, _ = self.c_proj(attn_output)
+        return attn_output
+
+
+class GPT2MLP(nn.Module):
+
+    def __init__(
+        self,
+        intermediate_size: int,
+        config: GPT2Config,
+        linear_method: Optional[LinearMethodBase] = None,
+    ):
+        super().__init__()
+        hidden_size = config.hidden_size
+        self.c_fc = ColumnParallelLinear(
+            hidden_size,
+            intermediate_size,
+            bias=True,
+            linear_method=linear_method,
+        )
+        self.c_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=True,
+            linear_method=linear_method,
+        )
+        quant_config = getattr(linear_method, "quant_config", None)
+        self.act = get_act_fn(config.activation_function, quant_config,
+                              intermediate_size)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states, _ = self.c_fc(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states, _ = self.c_proj(hidden_states)
+        return hidden_states
+
+
+class GPT2Block(nn.Module):
+
+    def __init__(
+        self,
+        config: GPT2Config,
+        linear_method: Optional[LinearMethodBase] = None,
+    ):
+        super().__init__()
+        hidden_size = config.hidden_size
+        inner_dim = (config.n_inner if config.n_inner is not None else 4 *
+                     hidden_size)
+
+        self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
+        self.attn = GPT2Attention(config, linear_method)
+        self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
+        self.mlp = GPT2MLP(inner_dim, config, linear_method)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        kv_cache: KVCache,
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        residual = hidden_states
+        hidden_states = self.ln_1(hidden_states)
+        attn_output = self.attn(
+            hidden_states=hidden_states,
+            kv_cache=kv_cache,
+            input_metadata=input_metadata,
+        )
+        # residual connection
+        hidden_states = attn_output + residual
+
+        residual = hidden_states
+        hidden_states = self.ln_2(hidden_states)
+        feed_forward_hidden_states = self.mlp(hidden_states)
+        # residual connection
+        hidden_states = residual + feed_forward_hidden_states
+        return hidden_states
+
+
+class GPT2Model(nn.Module):
+
+    def __init__(
+        self,
+        config: GPT2Config,
+        linear_method: Optional[LinearMethodBase] = None,
+    ):
+        super().__init__()
+        self.config = config
+        assert not config.add_cross_attention
+        assert not config.scale_attn_by_inverse_layer_idx
+        assert not config.reorder_and_upcast_attn
+        self.embed_dim = config.hidden_size
+        self.wte = VocabParallelEmbedding(config.vocab_size,
+                                          self.embed_dim,
+                                          linear_method=linear_method)
+        self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)
+        self.h = nn.ModuleList([
+            GPT2Block(config, linear_method)
+            for _ in range(config.num_hidden_layers)
+        ])
+        self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        position_ids: torch.Tensor,
+        kv_caches: List[KVCache],
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        inputs_embeds = self.wte(input_ids)
+        position_embeds = self.wpe(position_ids)
+        hidden_states = inputs_embeds + position_embeds
+
+        for i in range(len(self.h)):
+            layer = self.h[i]
+            hidden_states = layer(hidden_states, kv_caches[i], input_metadata)
+
+        hidden_states = self.ln_f(hidden_states)
+        return hidden_states
+
+
+class GPT2LMHeadModel(nn.Module):
+
+    def __init__(
+        self,
+        config: GPT2Config,
+        linear_method: Optional[LinearMethodBase] = None,
+    ):
+        super().__init__()
+        self.config = config
+        self.linear_method = linear_method
+        self.transformer = GPT2Model(config, linear_method)
+        # self.lm_head_weight = self.transformer.wte.weight
+        self.lm_head = ParallelLMHead(config.vocab_size,
+                                      config.hidden_size,
+                                      linear_method=linear_method)
+        self.sampler = Sampler(config.vocab_size)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[KVCache],
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        hidden_states = self.transformer(input_ids, positions, kv_caches,
+                                         input_metadata)
+        return hidden_states
+
+    def sample(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[SamplerOutput]:
+        next_tokens = self.sampler(self.lm_head(hidden_states),
+                                   sampling_metadata)
+        return next_tokens
+
+    def load_weights(self,
+                     model_name_or_path: str,
+                     cache_dir: Optional[str] = None,
+                     load_format: str = "auto",
+                     revision: Optional[str] = None):
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+        for name, loaded_weight in hf_model_weights_iterator(
+                model_name_or_path, cache_dir, load_format, revision,
+                self.config):
+            if "lm_head.weight" in name:
+                # GPT-2 ties the weights of the embedding layer and the final
+                # linear layer.
+                continue
+            if "wte.weight" in name:
+                # Copy word embedding to lm_head
+                lm_head_param = params_dict["lm_head.weight"]
+                weight_loader = getattr(lm_head_param, "weight_loader",
+                                        default_weight_loader)
+                weight_loader(lm_head_param, loaded_weight)
+            if ".attn.bias" in name or ".attn.masked_bias" in name:
+                # Skip attention mask.
+                # NOTE: "c_attn.bias" should not be skipped.
+                continue
+            if not name.startswith("transformer."):
+                name = "transformer." + name
+            param = params_dict[name]
+            # The HF's GPT-2 implementation uses Conv1D instead of Linear.
+            # Because of this, we need to transpose the weights.
+            # Note(zhuohan): the logic below might break quantized models.
+            for conv1d_weight_name in ["c_attn", "c_proj", "c_fc"]:
+                if conv1d_weight_name not in name:
+                    continue
+                if not name.endswith(".weight"):
+                    continue
+                loaded_weight = loaded_weight.t()
+            weight_loader = getattr(param, "weight_loader",
+                                    default_weight_loader)
+            weight_loader(param, loaded_weight)
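
The transpose in load_weights above exists because HF's GPT-2 uses Conv1D, which stores its weight as (in_features, out_features) and computes x @ W + b, whereas nn.Linear stores (out_features, in_features). A quick standalone illustration (toy sizes, not part of the diff):

import torch
import torch.nn as nn

in_features, out_features = 8, 16
conv1d_weight = torch.randn(in_features, out_features)   # HF Conv1D layout
bias = torch.randn(out_features)

linear = nn.Linear(in_features, out_features)
with torch.no_grad():
    linear.weight.copy_(conv1d_weight.t())                # transpose on load
    linear.bias.copy_(bias)

x = torch.randn(2, in_features)
print(torch.allclose(x @ conv1d_weight + bias, linear(x), atol=1e-6))  # True

This is also why only the ".weight" tensors of c_attn, c_proj, and c_fc are transposed; their biases are shared between the two layouts unchanged.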

+ 292 - 0
aphrodite/modeling/models/gpt_bigcode.py

@@ -0,0 +1,292 @@
+# coding=utf-8
+# Adapted from
+# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt2/modeling_gpt2.py
+# Copyright 2023 The PygmalionAI team.
+# Copyright 2023 The vLLM team.
+# Copyright 2023 CTranslate2, and Michael Feil
+# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only GPTBigCode model compatible with HuggingFace weights."""
+from typing import List, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import GPTBigCodeConfig
+
+from aphrodite.modeling.metadata import InputMetadata
+from aphrodite.modeling.layers.activation import get_act_fn
+from aphrodite.modeling.layers.attention import PagedAttention
+from aphrodite.modeling.layers.linear import (ColumnParallelLinear,
+                                              LinearMethodBase,
+                                              QKVParallelLinear,
+                                              RowParallelLinear)
+from aphrodite.modeling.layers.sampler import Sampler
+from aphrodite.modeling.layers.vocab_parallel_embedding import (
+    VocabParallelEmbedding, ParallelLMHead)
+from aphrodite.modeling.megatron.parallel_state import (
+    get_tensor_model_parallel_world_size)
+from aphrodite.modeling.sampling_metadata import SamplingMetadata
+from aphrodite.modeling.hf_downloader import (default_weight_loader,
+                                              hf_model_weights_iterator)
+from aphrodite.common.sequence import SamplerOutput
+
+KVCache = Tuple[torch.Tensor, torch.Tensor]
+
+
+class GPTBigCodeAttention(nn.Module):
+
+    def __init__(
+        self,
+        config: GPTBigCodeConfig,
+        linear_method: Optional[LinearMethodBase] = None,
+    ):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        total_num_heads = config.num_attention_heads
+        self.tensor_model_parallel_world_size = (
+            get_tensor_model_parallel_world_size())
+        assert total_num_heads % self.tensor_model_parallel_world_size == 0
+        self.num_heads = (total_num_heads //
+                          self.tensor_model_parallel_world_size)
+        self.head_dim = self.hidden_size // total_num_heads
+        self.scale = self.head_dim**-0.5
+
+        self.multi_query = config.multi_query
+        if self.multi_query:
+            total_num_kv_heads = 1
+            self.num_kv_heads = 1
+        else:
+            total_num_kv_heads = total_num_heads
+            self.num_kv_heads = self.num_heads
+        self.kv_dim = self.head_dim * self.num_kv_heads
+        self.c_attn = QKVParallelLinear(
+            self.hidden_size,
+            self.head_dim,
+            total_num_heads,
+            total_num_kv_heads,
+            bias=True,
+            linear_method=linear_method,
+        )
+
+        self.c_proj = RowParallelLinear(
+            self.hidden_size,
+            self.hidden_size,
+            bias=True,
+            linear_method=linear_method,
+        )
+        self.attn = PagedAttention(self.num_heads,
+                                   self.head_dim,
+                                   scale=self.scale,
+                                   num_kv_heads=self.num_kv_heads)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        kv_cache: KVCache,
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        qkv, _ = self.c_attn(hidden_states)
+        q, k, v = qkv.split(
+            [
+                self.hidden_size // self.tensor_model_parallel_world_size,
+                self.kv_dim, self.kv_dim
+            ],
+            dim=-1,
+        )
+        key_cache, value_cache = kv_cache
+        attn_output = self.attn(q, k, v, key_cache, value_cache,
+                                input_metadata)
+        attn_output, _ = self.c_proj(attn_output)
+        return attn_output
+
+
+class GPTBigMLP(nn.Module):
+
+    def __init__(
+        self,
+        intermediate_size: int,
+        config: GPTBigCodeConfig,
+        linear_method: Optional[LinearMethodBase] = None,
+    ):
+        super().__init__()
+        hidden_size = config.hidden_size
+        self.c_fc = ColumnParallelLinear(
+            hidden_size,
+            intermediate_size,
+            bias=True,
+            linear_method=linear_method,
+        )
+        self.c_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=True,
+            linear_method=linear_method,
+        )
+        quant_config = getattr(linear_method, "quant_config", None)
+        self.act = get_act_fn(config.activation_function, quant_config,
+                              intermediate_size)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states, _ = self.c_fc(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states, _ = self.c_proj(hidden_states)
+        return hidden_states
+
+
+class GPTBigCodeBlock(nn.Module):
+
+    def __init__(
+        self,
+        config: GPTBigCodeConfig,
+        linear_method: Optional[LinearMethodBase] = None,
+    ):
+        super().__init__()
+        hidden_size = config.hidden_size
+        inner_dim = (config.n_inner if config.n_inner is not None else 4 *
+                     hidden_size)
+
+        self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
+        self.attn = GPTBigCodeAttention(config, linear_method)
+        self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
+        self.mlp = GPTBigMLP(inner_dim, config, linear_method)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        kv_cache: KVCache,
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        residual = hidden_states
+        hidden_states = self.ln_1(hidden_states)
+        attn_output = self.attn(
+            hidden_states=hidden_states,
+            kv_cache=kv_cache,
+            input_metadata=input_metadata,
+        )
+        # residual connection
+        hidden_states = attn_output + residual
+
+        residual = hidden_states
+        hidden_states = self.ln_2(hidden_states)
+        feed_forward_hidden_states = self.mlp(hidden_states)
+        # residual connection
+        hidden_states = residual + feed_forward_hidden_states
+        return hidden_states
+
+
+class GPTBigCodeModel(nn.Module):
+
+    def __init__(
+        self,
+        config: GPTBigCodeConfig,
+        linear_method: Optional[LinearMethodBase] = None,
+    ):
+        super().__init__()
+        self.config = config
+        assert not config.add_cross_attention
+
+        self.embed_dim = config.hidden_size
+
+        self.wte = VocabParallelEmbedding(config.vocab_size,
+                                          self.embed_dim,
+                                          linear_method=linear_method)
+        self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)
+        self.h = nn.ModuleList([
+            GPTBigCodeBlock(config, linear_method)
+            for _ in range(config.num_hidden_layers)
+        ])
+        self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        position_ids: torch.Tensor,
+        kv_caches: List[KVCache],
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        inputs_embeds = self.wte(input_ids)
+        position_embeds = self.wpe(position_ids)
+        hidden_states = inputs_embeds + position_embeds
+
+        for i in range(len(self.h)):
+            layer = self.h[i]
+            hidden_states = layer(hidden_states, kv_caches[i], input_metadata)
+
+        hidden_states = self.ln_f(hidden_states)
+        return hidden_states
+
+
+class GPTBigCodeForCausalLM(nn.Module):
+
+    def __init__(
+        self,
+        config: GPTBigCodeConfig,
+        linear_method: Optional[LinearMethodBase] = None,
+    ):
+        super().__init__()
+        self.config = config
+        self.linear_method = linear_method
+        self.transformer = GPTBigCodeModel(config, linear_method)
+        # self.lm_head_weight = self.transformer.wte.weight
+        self.lm_head = ParallelLMHead(config.vocab_size,
+                                      config.hidden_size,
+                                      linear_method=linear_method)
+        self.sampler = Sampler(config.vocab_size)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[KVCache],
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        hidden_states = self.transformer(input_ids, positions, kv_caches,
+                                         input_metadata)
+        return hidden_states
+
+    def sample(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[SamplerOutput]:
+        next_tokens = self.sampler(self.lm_head(hidden_states),
+                                   sampling_metadata)
+        return next_tokens
+
+    def load_weights(self,
+                     model_name_or_path: str,
+                     cache_dir: Optional[str] = None,
+                     load_format: str = "auto",
+                     revision: Optional[str] = None):
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+        for name, loaded_weight in hf_model_weights_iterator(
+                model_name_or_path, cache_dir, load_format, revision,
+                self.config):
+            if "lm_head.weight" in name:
+                continue
+            if "wte.weight" in name:
+                # Copy word embedding to lm_head
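+                # (any standalone lm_head.weight in the checkpoint is skipped
+                # above; the tied word embedding also serves as the output
+                # projection)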
+                lm_head_param = params_dict["lm_head.weight"]
+                weight_loader = getattr(lm_head_param, "weight_loader",
+                                        default_weight_loader)
+                weight_loader(lm_head_param, loaded_weight)
+            if ".attn.bias" in name:
+                # Skip attention mask.
+                # NOTE: "c_attn.bias" should not be skipped.
+                continue
+            param = params_dict[name]
+            weight_loader = getattr(param, "weight_loader",
+                                    default_weight_loader)
+            weight_loader(param, loaded_weight)

+ 8 - 13
aphrodite/modeling/models/gpt_j.py

@@ -166,8 +166,7 @@ class GPTJBlock(nn.Module):
         linear_method: Optional[LinearMethodBase] = None,
     ):
         super().__init__()
-        inner_dim = (4 * config.n_embd
-                     if config.n_inner is None else config.n_inner)
+        inner_dim = 4 * config.n_embd if config.n_inner is None else config.n_inner
         self.ln_1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
         self.attn = GPTJAttention(config, linear_method)
         self.mlp = GPTJMLP(inner_dim, config, linear_method)
@@ -202,11 +201,9 @@ class GPTJModel(nn.Module):
         super().__init__()
         self.config = config
         self.embed_dim = config.n_embd
-        self.wte = VocabParallelEmbedding(
-            config.vocab_size,
-            self.embed_dim,
-            linear_method=linear_method,
-        )
+        self.wte = VocabParallelEmbedding(config.vocab_size,
+                                          self.embed_dim,
+                                          linear_method=linear_method)
         self.h = nn.ModuleList(
             [GPTJBlock(config, linear_method) for _ in range(config.n_layer)])
         self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
@@ -243,12 +240,10 @@ class GPTJForCausalLM(nn.Module):
         self.linear_method = linear_method
         assert not config.tie_word_embeddings
         self.transformer = GPTJModel(config, linear_method)
-        self.lm_head = ParallelLMHead(
-            config.vocab_size,
-            config.n_embd,
-            bias=True,
-            linear_method=linear_method,
-        )
+        self.lm_head = ParallelLMHead(config.vocab_size,
+                                      config.n_embd,
+                                      bias=True,
+                                      linear_method=linear_method)
         self.sampler = Sampler(config.vocab_size)
 
     def forward(

+ 6 - 10
aphrodite/modeling/models/gpt_neox.py

@@ -196,11 +196,9 @@ class GPTNeoXModel(nn.Module):
         super().__init__()
         self.config = config
 
-        self.embed_in = VocabParallelEmbedding(
-            config.vocab_size,
-            config.hidden_size,
-            linear_method=linear_method,
-        )
+        self.embed_in = VocabParallelEmbedding(config.vocab_size,
+                                               config.hidden_size,
+                                               linear_method=linear_method)
         self.layers = nn.ModuleList([
             GPTNeoXLayer(config, linear_method)
             for _ in range(config.num_hidden_layers)
@@ -239,11 +237,9 @@ class GPTNeoXForCausalLM(nn.Module):
         self.config = config
         self.linear_method = linear_method
         self.gpt_neox = GPTNeoXModel(config, linear_method)
-        self.embed_out = ParallelLMHead(
-            config.vocab_size,
-            config.hidden_size,
-            linear_method=linear_method,
-        )
+        self.embed_out = ParallelLMHead(config.vocab_size,
+                                        config.hidden_size,
+                                        linear_method=linear_method)
         self.sampler = Sampler(config.vocab_size)
 
     def forward(

+ 352 - 0
aphrodite/modeling/models/internlm2.py

@@ -0,0 +1,352 @@
+# -*- coding: utf-8 -*-
+from typing import Any, Dict, List, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from aphrodite.modeling.metadata import InputMetadata
+from aphrodite.modeling.layers.activation import SiluAndMul
+from aphrodite.modeling.layers.attention import PagedAttention
+from aphrodite.modeling.layers.layernorm import RMSNorm
+from aphrodite.modeling.layers.linear import (LinearMethodBase,
+                                              ColumnParallelLinear,
+                                              MergedColumnParallelLinear,
+                                              QKVParallelLinear,
+                                              RowParallelLinear)
+from aphrodite.modeling.layers.rotary_embedding import get_rope
+from aphrodite.modeling.layers.sampler import Sampler
+from aphrodite.modeling.layers.vocab_parallel_embedding import (
+    VocabParallelEmbedding, ParallelLMHead)
+from aphrodite.modeling.megatron.parallel_state import (
+    get_tensor_model_parallel_world_size)
+from aphrodite.modeling.sampling_metadata import SamplingMetadata
+from aphrodite.modeling.hf_downloader import (default_weight_loader,
+                                              hf_model_weights_iterator)
+from aphrodite.common.sequence import SamplerOutput
+
+KVCache = Tuple[torch.Tensor, torch.Tensor]
+
+
+class InternLM2MLP(nn.Module):
+
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        linear_method: Optional[LinearMethodBase] = None,
+    ) -> None:
+        super().__init__()
+        if linear_method is not None and not linear_method.quant_config.merge_weight(
+        ):
+            self.merge_weight = False
+            self.w1 = ColumnParallelLinear(hidden_size,
+                                           intermediate_size,
+                                           bias=False,
+                                           linear_method=linear_method)
+            self.w3 = ColumnParallelLinear(hidden_size,
+                                           intermediate_size,
+                                           bias=False,
+                                           linear_method=linear_method)
+        else:
+            self.merge_weight = True
+            self.gate_up_proj = MergedColumnParallelLinear(
+                hidden_size, [intermediate_size] * 2,
+                bias=False,
+                linear_method=linear_method)
+        self.w2 = RowParallelLinear(intermediate_size,
+                                    hidden_size,
+                                    bias=False,
+                                    linear_method=linear_method)
+        if hidden_act != "silu":
+            raise ValueError(f"Unsupported activation: {hidden_act}. "
+                             "Only silu is supported for now.")
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x):
+        if self.merge_weight:
+            gate_up, _ = self.gate_up_proj(x)
+        else:
+            gate, _ = self.w1(x)
+            up, _ = self.w3(x)
+            gate_up = torch.cat([gate, up], dim=-1)
+        x = self.act_fn(gate_up)
+        x, _ = self.w2(x)
+        return x
+
+
+class InternLM2Attention(nn.Module):
+
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        max_position_embeddings: int = 8192,
+        linear_method: Optional[LinearMethodBase] = None,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is at least the TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.head_dim = hidden_size // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+
+        self.wqkv = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=False,
+            linear_method=linear_method,
+        )
+        self.wo = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            linear_method=linear_method,
+        )
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+        )
+        self.attn = PagedAttention(self.num_heads,
+                                   self.head_dim,
+                                   self.scaling,
+                                   num_kv_heads=self.num_kv_heads)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        kv_cache: KVCache,
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        qkv, _ = self.wqkv(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        k_cache, v_cache = kv_cache
+        attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata)
+        output, _ = self.wo(attn_output)
+        return output
+
+
+class InternLMDecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        linear_method: Optional[LinearMethodBase] = None,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        rope_theta = getattr(config, "rope_theta", 10000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        max_position_embeddings = getattr(config, "max_position_embeddings",
+                                          8192)
+        self.attention = InternLM2Attention(
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            max_position_embeddings=max_position_embeddings,
+            linear_method=linear_method,
+        )
+        self.feed_forward = InternLM2MLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+            linear_method=linear_method,
+        )
+        self.attention_norm = RMSNorm(config.hidden_size,
+                                      eps=config.rms_norm_eps)
+        self.ffn_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        kv_cache: KVCache,
+        input_metadata: InputMetadata,
+        residual: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.attention_norm(hidden_states)
+        else:
+            hidden_states, residual = self.attention_norm(
+                hidden_states, residual)
+        hidden_states = self.attention(
+            positions=positions,
+            hidden_states=hidden_states,
+            kv_cache=kv_cache,
+            input_metadata=input_metadata,
+        )
+
+        # Fully Connected
+        hidden_states, residual = self.ffn_norm(hidden_states, residual)
+        hidden_states = self.feed_forward(hidden_states)
+        return hidden_states, residual
+
+
+class InternLM2Model(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        linear_method: Optional[LinearMethodBase] = None,
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.tok_embeddings = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            linear_method=linear_method,
+        )
+        self.layers = nn.ModuleList([
+            InternLMDecoderLayer(config, linear_method)
+            for _ in range(config.num_hidden_layers)
+        ])
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[KVCache],
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        hidden_states = self.tok_embeddings(input_ids)
+        residual = None
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                kv_caches[i],
+                input_metadata,
+                residual,
+            )
+        hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+
+class InternLM2ForCausalLM(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        linear_method: Optional[LinearMethodBase] = None,
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.linear_method = linear_method
+        self.model = InternLM2Model(config, linear_method)
+        self.output = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            linear_method=linear_method,
+        )
+        self.sampler = Sampler(config.vocab_size)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[KVCache],
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, kv_caches,
+                                   input_metadata)
+        return hidden_states
+
+    def sample(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[SamplerOutput]:
+        next_tokens = self.sampler(self.output(hidden_states),
+                                   sampling_metadata)
+        return next_tokens
+
+    def load_weights(self,
+                     model_name_or_path: str,
+                     cache_dir: Optional[str] = None,
+                     load_format: str = "auto",
+                     revision: Optional[str] = None):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("gate_up_proj", "w1", 0),
+            ("gate_up_proj", "w3", 1),
+        ]
+        if self.linear_method is not None and not self.linear_method.quant_config.merge_weight(
+        ):
+            stacked_params_mapping = []
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in hf_model_weights_iterator(
+                model_name_or_path, cache_dir, load_format, revision):
+            if "rotary_emb.inv_freq" in name:
+                continue
+            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                if "wqkv" in name:
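+                    # InternLM2 checkpoints pack wqkv per KV-head group as
+                    # [q_1 .. q_{kv_groups}, k, v]; split it back into the
+                    # q/k/v shards expected by QKVParallelLinear.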
+                    config = self.config
+                    kv_groups = config.num_attention_heads // config.num_key_value_heads
+                    head_dim = config.hidden_size // config.num_attention_heads
+                    loaded_weight = loaded_weight.view(-1, 2 + kv_groups,
+                                                       head_dim,
+                                                       loaded_weight.shape[-1])
+                    wq, wk, wv = torch.split(loaded_weight, [kv_groups, 1, 1],
+                                             dim=1)
+                    wq = wq.reshape(-1, wq.shape[-1])
+                    wk = wk.reshape(-1, wk.shape[-1])
+                    wv = wv.reshape(-1, wv.shape[-1])
+                    weight_loader = param.weight_loader
+                    weight_loader(param, wq, 'q')
+                    weight_loader(param, wk, 'k')
+                    weight_loader(param, wv, 'v')
+                else:
+                    weight_loader = getattr(param, "weight_loader",
+                                            default_weight_loader)
+                    weight_loader(param, loaded_weight)
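
For reference, a minimal self-contained sketch of the wqkv unpacking in load_weights above, using hypothetical toy dimensions (8 query heads, 2 KV heads, head_dim 4) rather than any real checkpoint shape:

    import torch

    num_heads, num_kv_heads, head_dim, hidden = 8, 2, 4, 16
    kv_groups = num_heads // num_kv_heads  # 4 query heads share each KV head
    # InternLM2 stores wqkv as repeated per-KV-head blocks: [q * kv_groups, k, v].
    wqkv = torch.randn(num_kv_heads * (kv_groups + 2) * head_dim, hidden)

    w = wqkv.view(-1, 2 + kv_groups, head_dim, hidden)
    wq, wk, wv = torch.split(w, [kv_groups, 1, 1], dim=1)
    wq = wq.reshape(-1, hidden)  # (num_heads * head_dim, hidden)
    wk = wk.reshape(-1, hidden)  # (num_kv_heads * head_dim, hidden)
    wv = wv.reshape(-1, hidden)  # (num_kv_heads * head_dim, hidden)
    assert wq.shape[0] == num_heads * head_dim
    assert wk.shape[0] == wv.shape[0] == num_kv_heads * head_dim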

+ 45 - 14
aphrodite/modeling/models/llama.py

@@ -1,7 +1,6 @@
 # coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
-# Copyright 2023 The PygmalionAI team.
 # Copyright 2023 The vLLM team.
 # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
 #
@@ -111,6 +110,8 @@ class LlamaAttention(nn.Module):
         rope_scaling: Optional[Dict[str, Any]] = None,
         max_position_embeddings: int = 8192,
         linear_method: Optional[LinearMethodBase] = None,
+        bias: bool = False,
+        sliding_window: Optional[int] = None,
     ) -> None:
         super().__init__()
         self.hidden_size = hidden_size
@@ -140,15 +141,15 @@ class LlamaAttention(nn.Module):
             self.merge_weight = False
             self.q_proj = ColumnParallelLinear(hidden_size,
                                                self.q_size,
-                                               bias=False,
+                                               bias=bias,
                                                linear_method=linear_method)
             self.k_proj = ColumnParallelLinear(hidden_size,
                                                self.kv_size,
-                                               bias=False,
+                                               bias=bias,
                                                linear_method=linear_method)
             self.v_proj = ColumnParallelLinear(hidden_size,
                                                self.kv_size,
-                                               bias=False,
+                                               bias=bias,
                                                linear_method=linear_method)
         else:
             self.merge_weight = True
@@ -157,13 +158,13 @@ class LlamaAttention(nn.Module):
                 self.head_dim,
                 self.total_num_heads,
                 self.total_num_kv_heads,
-                bias=False,
+                bias=bias,
                 linear_method=linear_method,
             )
         self.o_proj = RowParallelLinear(
             self.total_num_heads * self.head_dim,
             hidden_size,
-            bias=False,
+            bias=bias,
             linear_method=linear_method,
         )
 
@@ -180,7 +181,8 @@ class LlamaAttention(nn.Module):
         self.attn = PagedAttention(self.num_heads,
                                    self.head_dim,
                                    self.scaling,
-                                   num_kv_heads=self.num_kv_heads)
+                                   num_kv_heads=self.num_kv_heads,
+                                   sliding_window=sliding_window)
 
     def forward(
         self,
@@ -217,14 +219,18 @@ class LlamaDecoderLayer(nn.Module):
         rope_scaling = getattr(config, "rope_scaling", None)
         max_position_embeddings = getattr(config, "max_position_embeddings",
                                           8192)
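+        # Configs that define a sliding window (e.g. Mistral-style checkpoints
+        # served through the Llama codepath) get windowed paged attention.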
+        sliding_window = getattr(config, "sliding_window", None)
         self.self_attn = LlamaAttention(
             hidden_size=self.hidden_size,
             num_heads=config.num_attention_heads,
-            num_kv_heads=config.num_key_value_heads,
+            num_kv_heads=getattr(config, "num_key_value_heads",
+                                 config.num_attention_heads),
             rope_theta=rope_theta,
             rope_scaling=rope_scaling,
             max_position_embeddings=max_position_embeddings,
             linear_method=linear_method,
+            bias=getattr(config, "bias", False),
+            sliding_window=sliding_window,
         )
         self.mlp = LlamaMLP(
             hidden_size=self.hidden_size,
@@ -316,7 +322,32 @@ class LlamaModel(nn.Module):
 
 
 class LlamaForCausalLM(nn.Module):
-    supports_lora = True
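+    # Fused modules and the original per-projection names they pack; LoRA
+    # adapters trained against q/k/v and gate/up projections are remapped
+    # onto the merged layers through this table.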
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
+
+    # LoRA specific attributes
+    supported_lora_modules = [
+        "qkv_proj",
+        "o_proj",
+        "gate_up_proj",
+        "down_proj",
+        "embed_tokens",
+        "lm_head",
+    ]
+    embedding_modules = {
+        "embed_tokens": "input_embeddings",
+        "lm_head": "output_embeddings",
+    }
+    embedding_padding_modules = ["lm_head"]
 
     def __init__(
         self,
@@ -328,20 +359,20 @@ class LlamaForCausalLM(nn.Module):
         self.config = config
         self.linear_method = linear_method
         self.model = LlamaModel(config, linear_method, lora_config=lora_config)
-        unpadded_vocab_size = config.vocab_size
+        self.unpadded_vocab_size = config.vocab_size
         if lora_config:
-            unpadded_vocab_size += lora_config.lora_extra_vocab_size
+            self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
         self.lm_head = ParallelLMHead(
-            unpadded_vocab_size,
+            self.unpadded_vocab_size,
             config.hidden_size,
-            linear_method=linear_method,
             org_num_embeddings=config.vocab_size,
+            linear_method=linear_method,
             padding_size=DEFAULT_VOCAB_PADDING_SIZE
             # We need bigger padding if using lora for kernel
             # compatibility
             if not lora_config else lora_config.lora_vocab_padding_size,
         )
-        self.sampler = Sampler(unpadded_vocab_size, config.vocab_size)
+        self.sampler = Sampler(self.unpadded_vocab_size, config.vocab_size)
 
     def forward(
         self,

+ 62 - 18
aphrodite/modeling/models/mixtral.py

@@ -25,23 +25,23 @@
 from typing import List, Optional, Tuple
 
 import torch
-
 from torch import nn
 from transformers import MixtralConfig
 
+from aphrodite.common.config import LoRAConfig
 from aphrodite.modeling.metadata import InputMetadata
 from aphrodite.modeling.layers.attention import PagedAttention
 from aphrodite.modeling.layers.triton_kernel.fused_moe import fused_moe
 from aphrodite.modeling.layers.layernorm import RMSNorm
 from aphrodite.modeling.layers.linear import (LinearMethodBase,
-                                              ReplicatedLinear,
                                               QKVParallelLinear,
+                                              ReplicatedLinear,
                                               RowParallelLinear,
                                               ColumnParallelLinear)
 from aphrodite.modeling.layers.rotary_embedding import get_rope
 from aphrodite.modeling.layers.sampler import Sampler
 from aphrodite.modeling.layers.vocab_parallel_embedding import (
-    VocabParallelEmbedding, ParallelLMHead)
+    VocabParallelEmbedding, ParallelLMHead, DEFAULT_VOCAB_PADDING_SIZE)
 from aphrodite.modeling.megatron.communication_op import (
     tensor_model_parallel_all_reduce)
 from aphrodite.modeling.megatron.parallel_state import (
@@ -58,6 +58,7 @@ KVCache = Tuple[torch.Tensor, torch.Tensor]
 class MixtralMoE(nn.Module):
     """A tensor-parallel MoE implementation for Mixtral that shards each expert
     across all ranks.
+
     Each expert's weights are sharded across all ranks and a fused MoE
     kernel is used for the forward pass, and finally we reduce the outputs
     across ranks.
@@ -70,13 +71,14 @@ class MixtralMoE(nn.Module):
         hidden_size: int,
         intermediate_size: int,
         params_dtype: Optional[torch.dtype] = None,
+        tp_size: Optional[int] = None,
     ):
         super().__init__()
-        tp_size = get_tensor_model_parallel_world_size()
+        self.tp_size = tp_size or get_tensor_model_parallel_world_size()
         self.num_total_experts = num_experts
         self.top_k = top_k
         self.hidden_size = hidden_size
-        self.intermediate_size = intermediate_size // tp_size
+        self.intermediate_size = intermediate_size // self.tp_size
 
         if params_dtype is None:
             params_dtype = torch.get_default_dtype()
@@ -127,7 +129,6 @@ class MixtralMoE(nn.Module):
         hidden_states = hidden_states.view(-1, self.hidden_size)
         # router_logits: (batch * sequence_length, n_experts)
         router_logits, _ = self.gate(hidden_states)
-
         final_hidden_states = fused_moe(hidden_states,
                                         self.ws,
                                         self.w2s,
@@ -136,8 +137,9 @@ class MixtralMoE(nn.Module):
                                         renormalize=True,
                                         inplace=True)
 
-        final_hidden_states = tensor_model_parallel_all_reduce(
-            final_hidden_states)
+        if self.tp_size > 1:
+            final_hidden_states = tensor_model_parallel_all_reduce(
+                final_hidden_states)
 
         return final_hidden_states.view(batch_size, sequence_length,
                                         hidden_size)
@@ -310,15 +312,20 @@ class MixtralModel(nn.Module):
         self,
         config: MixtralConfig,
         linear_method: Optional[LinearMethodBase] = None,
+        lora_config: Optional[LoRAConfig] = None,
     ) -> None:
         super().__init__()
         self.padding_idx = config.pad_token_id
-        self.vocab_size = config.vocab_size
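+        # Reserve extra embedding rows for tokens added by LoRA adapters.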
+        lora_vocab = (lora_config.lora_extra_vocab_size *
+                      (lora_config.max_loras or 1)) if lora_config else 0
+        self.vocab_size = config.vocab_size + lora_vocab
+        self.org_vocab_size = config.vocab_size
 
         self.embed_tokens = VocabParallelEmbedding(
-            config.vocab_size,
+            self.vocab_size,
             config.hidden_size,
             linear_method=linear_method,
+            org_num_embeddings=config.vocab_size,
         )
         self.layers = nn.ModuleList([
             MixtralDecoderLayer(config, linear_method=linear_method)
@@ -345,20 +352,53 @@ class MixtralModel(nn.Module):
 
 
 class MixtralForCausalLM(nn.Module):
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+    }
+
+    # LoRA specific attributes
+    supported_lora_modules = [
+        "qkv_proj",
+        "o_proj",
+        "embed_tokens",
+        "lm_head",
+    ]
+    embedding_modules = {
+        "embed_tokens": "input_embeddings",
+        "lm_head": "output_embeddings",
+    }
+    embedding_padding_modules = ["lm_head"]
 
     def __init__(
         self,
         config: MixtralConfig,
         linear_method: Optional[LinearMethodBase] = None,
+        lora_config: Optional[LoRAConfig] = None,
     ) -> None:
         super().__init__()
         self.config = config
         self.linear_method = linear_method
-        self.model = MixtralModel(config, linear_method)
-        self.lm_head = ParallelLMHead(config.vocab_size,
-                                      config.hidden_size,
-                                      linear_method=linear_method)
-        self.sampler = Sampler(config.vocab_size)
+        self.model = MixtralModel(config,
+                                  linear_method,
+                                  lora_config=lora_config)
+        self.unpadded_vocab_size = config.vocab_size
+        if lora_config:
+            self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
+        self.lm_head = ParallelLMHead(
+            self.unpadded_vocab_size,
+            config.hidden_size,
+            linear_method=linear_method,
+            org_num_embeddings=config.vocab_size,
+            padding_size=DEFAULT_VOCAB_PADDING_SIZE
+            # We need bigger padding if using lora for kernel
+            # compatibility
+            if not lora_config else lora_config.lora_vocab_padding_size,
+        )
+        self.sampler = Sampler(self.unpadded_vocab_size, config.vocab_size)
 
     def forward(
         self,
@@ -391,6 +431,10 @@ class MixtralForCausalLM(nn.Module):
             ("qkv_proj", "k_proj", "k"),
             ("qkv_proj", "v_proj", "v"),
         ]
+        if self.linear_method is not None and not self.linear_method.quant_config.merge_weight(
+        ):
+            stacked_params_mapping = []
+
         expert_params_mapping = [
             # (param_name, weight_name, expert_id)
             ("ws" if weight_name in ["w1", "w3"] else "w2s",
@@ -398,18 +442,18 @@ class MixtralForCausalLM(nn.Module):
             for expert_id in range(self.config.num_local_experts)
             for weight_name in ["w1", "w2", "w3"]
         ]
-        if self.linear_method is not None and not self.linear_method.quant_config.merge_weight(
-        ):
-            stacked_params_mapping = []
+
         params_dict = dict(self.named_parameters())
         for name, loaded_weight in hf_model_weights_iterator(
                 model_name_or_path,
                 cache_dir,
                 load_format,
                 revision,
+                self.config,
                 fall_back_to_pt=False):
             if "rotary_emb.inv_freq" in name:
                 continue
+
             for (param_name, weight_name, shard_id) in stacked_params_mapping:
                 if weight_name not in name:
                     continue

+ 7 - 6
aphrodite/modeling/models/mixtral_quant.py

@@ -226,14 +226,12 @@ class MixtralAttention(nn.Module):
             bias=False,
             linear_method=linear_method,
         )
-        is_neox_style = True if linear_method is None or linear_method.quant_config.rope_style(
-        ) is None else linear_method.quant_config.rope_style()
         self.rotary_emb = get_rope(
             self.head_dim,
             rotary_dim=self.head_dim,
             max_position=max_position,
             base=int(self.rope_theta),
-            is_neox_style=is_neox_style,
+            is_neox_style=True,
         )
         self.attn = PagedAttention(
             self.num_heads,
@@ -371,9 +369,11 @@ class MixtralForCausalLM(nn.Module):
         self.config = config
         self.linear_method = linear_method
         self.model = MixtralModel(config, linear_method)
-        self.lm_head = ParallelLMHead(config.vocab_size,
-                                      config.hidden_size,
-                                      linear_method=linear_method)
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            linear_method=linear_method,
+        )
         self.sampler = Sampler(config.vocab_size)
 
     def forward(
@@ -410,6 +410,7 @@ class MixtralForCausalLM(nn.Module):
         if self.linear_method is not None and not self.linear_method.quant_config.merge_weight(
         ):
             stacked_params_mapping = []
+
         params_dict = dict(self.named_parameters())
         for name, loaded_weight in hf_model_weights_iterator(
                 model_name_or_path,

+ 307 - 0
aphrodite/modeling/models/mpt.py

@@ -0,0 +1,307 @@
+# coding=utf-8
+# Adapted from https://huggingface.co/mosaicml/mpt-7b/tree/main
+import math
+from typing import List, Optional, Tuple
+
+import torch
+import torch.nn as nn
+
+from aphrodite.modeling.metadata import InputMetadata
+from aphrodite.modeling.layers.activation import get_act_fn
+from aphrodite.modeling.layers.attention import PagedAttention
+from aphrodite.modeling.layers.linear import (ColumnParallelLinear,
+                                              LinearMethodBase,
+                                              QKVParallelLinear,
+                                              RowParallelLinear)
+from aphrodite.modeling.layers.sampler import Sampler
+from aphrodite.modeling.layers.vocab_parallel_embedding import (
+    VocabParallelEmbedding, ParallelLMHead)
+from aphrodite.modeling.megatron.parallel_state import (
+    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
+from aphrodite.modeling.sampling_metadata import SamplingMetadata
+from aphrodite.modeling.hf_downloader import (default_weight_loader,
+                                              hf_model_weights_iterator)
+from aphrodite.common.sequence import SamplerOutput
+from aphrodite.transformers_utils.configs.mpt import MPTConfig
+
+KVCache = Tuple[torch.Tensor, torch.Tensor]
+
+
+def _get_alibi_slopes(
+    total_num_heads: int,
+    alibi_bias_max: int,
+) -> torch.Tensor:
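+    # Slopes are 1 / 2**(i * alibi_bias_max / n) for i = 1..n, where n is the
+    # next power of two >= total_num_heads. When the head count is not a power
+    # of two, the odd-indexed slopes are taken first, then the even-indexed
+    # ones, truncated to total_num_heads (following the reference MPT code).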
+    next_power_of_2 = 2**math.ceil(math.log2(total_num_heads))
+    m = torch.arange(1, next_power_of_2 + 1, dtype=torch.float32)
+    m = m.mul(alibi_bias_max / next_power_of_2)
+    slopes = 1.0 / torch.pow(2, m)
+    if next_power_of_2 != total_num_heads:
+        slopes = torch.concat([slopes[1::2], slopes[::2]])[:total_num_heads]
+    return slopes
+
+
+class MPTAttention(nn.Module):
+
+    def __init__(
+        self,
+        config: MPTConfig,
+        linear_method: Optional[LinearMethodBase] = None,
+    ):
+        super().__init__()
+        self.d_model = config.d_model
+        self.total_num_heads = config.n_heads
+        self.head_dim = self.d_model // self.total_num_heads
+        self.clip_qkv = config.attn_config["clip_qkv"]
+        self.qk_ln = config.attn_config["qk_ln"]
+        self.alibi_bias_max = config.attn_config["alibi_bias_max"]
+        if "kv_n_heads" in config.attn_config:
+            self.total_num_kv_heads = config.attn_config['kv_n_heads']
+        else:
+            self.total_num_kv_heads = self.total_num_heads
+        assert not config.attn_config["prefix_lm"]
+        assert config.attn_config["alibi"]
+
+        # pylint: disable=invalid-name
+        self.Wqkv = QKVParallelLinear(
+            self.d_model,
+            self.d_model // self.total_num_heads,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=not config.no_bias,
+            linear_method=linear_method,
+        )
+        if self.qk_ln:
+            self.q_ln = nn.LayerNorm(self.d_model)
+            self.k_ln = nn.LayerNorm(self.d_model)
+        self.out_proj = RowParallelLinear(
+            self.d_model,
+            self.d_model,
+            bias=not config.no_bias,
+            linear_method=linear_method,
+        )
+
+        tp_world_size = get_tensor_model_parallel_world_size()
+        assert self.total_num_heads % tp_world_size == 0
+        self.num_heads = self.total_num_heads // tp_world_size
+
+        if self.total_num_kv_heads >= tp_world_size:
+            # Number of KV heads is at least the TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_world_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_world_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_world_size)
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        # Create the alibi slopes and slice them.
+        tp_rank = get_tensor_model_parallel_rank()
+        head_start = tp_rank * self.num_heads
+        head_end = (tp_rank + 1) * self.num_heads
+        alibi_slopes = _get_alibi_slopes(self.total_num_heads,
+                                         self.alibi_bias_max)
+        alibi_slopes = alibi_slopes[head_start:head_end].tolist()
+
+        self.head_dim = self.d_model // self.total_num_heads
+        scaling = self.head_dim**-0.5
+        self.attn = PagedAttention(self.num_heads,
+                                   self.head_dim,
+                                   scaling,
+                                   alibi_slopes=alibi_slopes,
+                                   num_kv_heads=self.num_kv_heads)
+
+    def forward(
+        self,
+        position_ids: torch.Tensor,
+        hidden_states: torch.Tensor,
+        kv_cache: KVCache,
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        del position_ids  # unused.
+        qkv, _ = self.Wqkv(hidden_states)
+        if self.clip_qkv is not None:
+            qkv.clamp_(min=-self.clip_qkv, max=self.clip_qkv)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        if self.qk_ln:
+            q = self.q_ln(q)
+            k = self.k_ln(k)
+        k_cache, v_cache = kv_cache
+        attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata)
+        output, _ = self.out_proj(attn_output)
+        return output
+
+
+class MPTMLP(nn.Module):
+
+    def __init__(
+        self,
+        config: MPTConfig,
+        linear_method: Optional[LinearMethodBase] = None,
+    ):
+        super().__init__()
+        hidden_size = config.d_model
+        expansion_ratio = config.expansion_ratio
+        intermediate_size = expansion_ratio * hidden_size
+        self.up_proj = ColumnParallelLinear(
+            hidden_size,
+            intermediate_size,
+            bias=not config.no_bias,
+            linear_method=linear_method,
+        )
+        quant_config = getattr(linear_method, "quant_config", None)
+        self.act = get_act_fn("gelu", quant_config, intermediate_size)
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=not config.no_bias,
+            linear_method=linear_method,
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x, _ = self.up_proj(x)
+        x = self.act(x)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class MPTBlock(nn.Module):
+
+    def __init__(
+        self,
+        config: MPTConfig,
+        linear_method: Optional[LinearMethodBase] = None,
+    ):
+        super().__init__()
+        hidden_size = config.d_model
+        self.norm_1 = nn.LayerNorm(hidden_size)
+        self.attn = MPTAttention(config, linear_method)
+        self.norm_2 = nn.LayerNorm(hidden_size)
+        self.ffn = MPTMLP(config, linear_method)
+
+    def forward(
+        self,
+        position_ids: torch.Tensor,
+        hidden_states: torch.Tensor,
+        kv_cache: KVCache,
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        x = self.norm_1(hidden_states)
+        x = self.attn(
+            position_ids=position_ids,
+            hidden_states=x,
+            kv_cache=kv_cache,
+            input_metadata=input_metadata,
+        )
+        hidden_states = hidden_states + x
+        x = self.norm_2(hidden_states)
+        x = self.ffn(x)
+        hidden_states = hidden_states + x
+        return hidden_states
+
+
+class MPTModel(nn.Module):
+
+    def __init__(
+        self,
+        config: MPTConfig,
+        linear_method: Optional[LinearMethodBase] = None,
+    ):
+        super().__init__()
+        assert config.embedding_fraction == 1.0
+        assert config.norm_type == "low_precision_layernorm"
+
+        self.wte = VocabParallelEmbedding(config.vocab_size,
+                                          config.d_model,
+                                          linear_method=linear_method)
+        self.blocks = nn.ModuleList(
+            [MPTBlock(config, linear_method) for _ in range(config.n_layers)])
+        self.norm_f = nn.LayerNorm(config.d_model)
+        if config.no_bias:
+            for module in self.modules():
+                if hasattr(module, "bias") and isinstance(
+                        module.bias, nn.Parameter):
+                    # Remove the bias term in Linear and LayerNorm.
+                    module.register_parameter("bias", None)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        position_ids: torch.Tensor,
+        kv_caches: List[KVCache],
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        hidden_states = self.wte(input_ids)
+        for i in range(len(self.blocks)):
+            block = self.blocks[i]
+            hidden_states = block(
+                position_ids,
+                hidden_states,
+                kv_caches[i],
+                input_metadata,
+            )
+        hidden_states = self.norm_f(hidden_states)
+        return hidden_states
+
+
+class MPTForCausalLM(nn.Module):
+
+    def __init__(
+        self,
+        config: MPTConfig,
+        linear_method: Optional[LinearMethodBase] = None,
+    ):
+        super().__init__()
+        self.config = config
+        assert config.tie_word_embeddings
+        self.linear_method = linear_method
+
+        self.transformer = MPTModel(config, linear_method)
+        # self.lm_head_weight = self.transformer.wte.weight
+        self.lm_head = ParallelLMHead(config.vocab_size,
+                                      config.hidden_size,
+                                      linear_method=linear_method)
+        self.sampler = Sampler(config.vocab_size)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[KVCache],
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        hidden_states = self.transformer(input_ids, positions, kv_caches,
+                                         input_metadata)
+        return hidden_states
+
+    def sample(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[SamplerOutput]:
+        next_tokens = self.sampler(self.lm_head(hidden_states),
+                                   sampling_metadata)
+        return next_tokens
+
+    def load_weights(self,
+                     model_name_or_path: str,
+                     cache_dir: Optional[str] = None,
+                     load_format: str = "auto",
+                     revision: Optional[str] = None):
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+        for name, loaded_weight in hf_model_weights_iterator(
+                model_name_or_path, cache_dir, load_format, revision,
+                self.config):
+            # Skip loading extra bias for GPTQ models.
+            if name.endswith(".bias") and name not in params_dict:
+                continue
+            if "wte.weight" in name:
+                # Copy word embedding to lm_head
+                lm_head_param = params_dict["lm_head.weight"]
+                weight_loader = getattr(lm_head_param, "weight_loader",
+                                        default_weight_loader)
+                weight_loader(lm_head_param, loaded_weight)
+            param = params_dict[name]
+            weight_loader = getattr(param, "weight_loader",
+                                    default_weight_loader)
+            weight_loader(param, loaded_weight)
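
A small worked example of the slope construction in _get_alibi_slopes above, with hypothetical values (6 heads, alibi_bias_max=8) rather than any real MPT configuration:

    import math
    import torch

    total_num_heads, alibi_bias_max = 6, 8
    n = 2 ** math.ceil(math.log2(total_num_heads))              # 8
    m = torch.arange(1, n + 1, dtype=torch.float32) * (alibi_bias_max / n)
    slopes = 1.0 / torch.pow(2, m)                              # 1/2, 1/4, ..., 1/256
    # 6 is not a power of two: take odd-indexed slopes, then even-indexed ones.
    slopes = torch.concat([slopes[1::2], slopes[::2]])[:total_num_heads]
    # -> [1/4, 1/16, 1/64, 1/256, 1/2, 1/8]; each TP rank then slices the
    #    entries for the heads it owns (head_start:head_end).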

+ 377 - 0
aphrodite/modeling/models/olmo.py

@@ -0,0 +1,377 @@
+# coding=utf-8
+# Adapted from
+# https://github.com/allenai/OLMo/blob/v0.2.4/olmo/model.py and
+# https://github.com/allenai/OLMo/blob/v0.2.4/hf_olmo/modeling_olmo.py
+# Copyright 2023 The PygmalionAI team.
+# Copyright 2023 The vLLM team.
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2022, Tri Dao, trid@cs.stanford.edu.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+"""Inference-only OLMo model compatible with HuggingFace weights."""
+from typing import List, Optional, Tuple
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from aphrodite.modeling.metadata import InputMetadata
+from aphrodite.modeling.layers.attention import PagedAttention
+from aphrodite.modeling.layers.linear import (
+    ColumnParallelLinear,
+    LinearMethodBase,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from aphrodite.modeling.layers.rotary_embedding import get_rope
+from aphrodite.modeling.layers.sampler import Sampler
+from aphrodite.modeling.layers.vocab_parallel_embedding import (
+    VocabParallelEmbedding, ParallelLMHead)
+from aphrodite.modeling.megatron.parallel_state import (
+    get_tensor_model_parallel_world_size, )
+from aphrodite.modeling.sampling_metadata import SamplingMetadata
+from aphrodite.modeling.hf_downloader import (
+    default_weight_loader,
+    hf_model_weights_iterator,
+)
+from aphrodite.common.sequence import SamplerOutput
+from aphrodite.transformers_utils.configs.olmo import OLMoConfig
+
+KVCache = Tuple[torch.Tensor, torch.Tensor]
+
+
+class SwiGLU(nn.Module):
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
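+        # OLMo packs the MLP projection as (x, gate), with the gate in the
+        # second half, so the gate-first SiluAndMul layer cannot be reused
+        # as-is (see the commented-out alternative in OlmoMLP below).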
+        x, gate = x.chunk(2, dim=-1)
+        return F.silu(gate) * x
+
+    @property
+    def output_multiplier(self) -> float:
+        return 0.5
+
+
+class OlmoAttention(nn.Module):
+    """
+    This is the attention block where the output is computed as
+    ``Attention(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))``
+    (plus another skip connection).
+    """
+
+    def __init__(
+        self,
+        config: OLMoConfig,
+        linear_method: Optional[LinearMethodBase] = None,
+    ):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.d_model
+        assert config.d_model % config.n_heads == 0
+        tensor_model_parallel_world_size = get_tensor_model_parallel_world_size(
+        )
+        self.total_num_heads = self.config.n_heads
+        assert self.total_num_heads % tensor_model_parallel_world_size == 0
+        self.num_heads = self.total_num_heads // tensor_model_parallel_world_size
+        self.head_dim = self.hidden_size // self.total_num_heads
+
+        # Layer norms.
+        self.attn_norm = nn.LayerNorm(config.d_model,
+                                      elementwise_affine=False,
+                                      bias=False)
+        # Attention input projection. Projects x -> (q, k, v)
+        self.att_proj = QKVParallelLinear(
+            config.d_model,
+            self.head_dim,
+            self.total_num_heads,
+            bias=config.include_bias,
+            linear_method=linear_method,
+        )
+
+        # Rotary embeddings.
+        if self.config.rope:
+            rope_theta = getattr(config, "rope_theta", 10000)
+            max_position_embeddings = getattr(config,
+                                              "max_position_embeddings", 8192)
+            self.rotary_emb = get_rope(
+                self.head_dim,
+                rotary_dim=self.head_dim,
+                max_position=max_position_embeddings,
+                base=rope_theta,
+            )
+        self.scaling = self.head_dim**-0.5
+        self.attn = PagedAttention(self.num_heads,
+                                   self.head_dim,
+                                   scale=self.scaling)
+
+        # Attention output projection.
+        self.attn_out = RowParallelLinear(
+            config.d_model,
+            config.d_model,
+            bias=config.include_bias,
+            linear_method=linear_method,
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        kv_cache: KVCache,
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        hidden_states = self.attn_norm(hidden_states)
+        qkv, _ = self.att_proj(hidden_states)
+        q, k, v = qkv.chunk(chunks=3, dim=-1)
+        if self.config.rope:
+            q, k = self.rotary_emb(positions, q, k)
+        k_cache, v_cache = kv_cache
+        attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata)
+        output, _ = self.attn_out(attn_output)
+        return output
+
+
+class OlmoMLP(nn.Module):
+    """
+    This is the MLP block where the output is computed as ``MLP(LN(x))``
+    in ``MLP(LN(x + Attention(LN(x))))`` (plus another skip connection).
+    """
+
+    def __init__(
+        self,
+        config: OLMoConfig,
+        linear_method: Optional[LinearMethodBase] = None,
+    ):
+        super().__init__()
+        self.config = config
+        self.hidden_size = (config.mlp_hidden_size if config.mlp_hidden_size
+                            is not None else config.mlp_ratio * config.d_model)
+
+        # Layer norms.
+        self.ff_norm = nn.LayerNorm(config.d_model,
+                                    elementwise_affine=False,
+                                    bias=False)
+
+        # Feed-forward input projection.
+        self.ff_proj = ColumnParallelLinear(
+            config.d_model,
+            self.hidden_size,
+            bias=config.include_bias,
+            linear_method=linear_method,
+        )
+
+        # Activation function.
+        # self.act = SiluAndMul()
+        # self.act.output_multiplier = 0.5
+        self.act = SwiGLU()
+        assert (self.act.output_multiplier * self.hidden_size) % 1 == 0
+
+        # Feed-forward output projection.
+        self.ff_out = RowParallelLinear(
+            int(self.act.output_multiplier * self.hidden_size),
+            config.d_model,
+            bias=config.include_bias,
+            linear_method=linear_method,
+        )
+
+    def forward(
+        self,
+        x: torch.Tensor,
+    ) -> torch.Tensor:
+        # Add feed-forward projection.
+        # shape: (batch_size, seq_len, d_model)
+        og_x = x
+        x = self.ff_norm(x)
+        x, _ = self.ff_proj(x)
+        x = self.act(x)
+        x, _ = self.ff_out(x)
+        x = og_x + x
+
+        return x
+
+
+class OlmoBlock(nn.Module):
+    """
+    This is a typical transformer block where the output is computed as
+    ``MLP(LN(x + Attention(LN(x))))`` (plus another skip connection).
+    """
+
+    def __init__(self,
+                 config: OLMoConfig,
+                 linear_method: Optional[LinearMethodBase] = None):
+        super().__init__()
+        # Attention block.
+        self.attn = OlmoAttention(config, linear_method)
+
+        # MLP block.
+        self.mlp = OlmoMLP(config, linear_method)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        kv_cache: KVCache,
+        input_metadata: InputMetadata,
+    ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
+        # Attention block.
+        og_x = hidden_states
+        x = self.attn(positions, hidden_states, kv_cache, input_metadata)
+        x = x + og_x
+
+        # MLP block.
+        hidden_states = self.mlp(x)
+        return hidden_states
+
+
+class OlmoModel(nn.Module):
+
+    def __init__(self,
+                 config: OLMoConfig,
+                 linear_method: Optional[LinearMethodBase] = None):
+        super().__init__()
+        self.config = config
+
+        self.transformer = nn.ModuleDict(
+            dict(wte=VocabParallelEmbedding(
+                config.embedding_size or config.vocab_size,
+                config.d_model,
+                linear_method=linear_method,
+            ),
+                 ln_f=nn.LayerNorm(config.d_model,
+                                   elementwise_affine=False,
+                                   bias=False),
+                 ff_out=ParallelLMHead(
+                     config.embedding_size or config.vocab_size,
+                     config.d_model,
+                     bias=config.include_bias,
+                     linear_method=linear_method,
+                 )))
+
+        blocks = [
+            OlmoBlock(config, linear_method) for i in range(config.n_layers)
+        ]
+        if self.config.block_group_size > 1:
+            raise NotImplementedError("Block group size > 1 not supported yet")
+        else:
+            self.transformer.update({"blocks": nn.ModuleList(blocks)})
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[KVCache],
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        """
+        :param input_ids: A tensor of shape `(batch_size, seq_len)`.
+        """
+        # Get embeddings of input.
+        # shape: (batch_size, seq_len, d_model)
+        x = self.transformer.wte(input_ids)  # type: ignore
+
+        # Apply blocks one-by-one.
+        for block_idx, block in enumerate(self.transformer.blocks):
+            # shape: (batch_size, seq_len, d_model)
+            x = block(
+                positions,
+                x,
+                kv_caches[block_idx],
+                input_metadata,
+            )
+
+        # Apply final layer norm.
+        # shape: (batch_size, seq_len or 1, d_model)
+        x = self.transformer.ln_f(x)  # type: ignore
+        return x
+
+
+class OLMoForCausalLM(nn.Module):
+    """
+    Extremely barebones HF model wrapper.
+    """
+
+    def __init__(self,
+                 config: OLMoConfig,
+                 linear_method: Optional[LinearMethodBase] = None):
+        super().__init__()
+        self.config = config
+        self.linear_method = linear_method
+        self.model = OlmoModel(config, linear_method)
+        self.sampler = Sampler(config.vocab_size)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[KVCache],
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        hidden_states = self.model(
+            input_ids=input_ids,
+            positions=positions,
+            kv_caches=kv_caches,
+            input_metadata=input_metadata,
+        )
+        return hidden_states
+
+    def sample(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[SamplerOutput]:
+        next_tokens = self.sampler(
+            self.model.transformer.ff_out(hidden_states), sampling_metadata)
+        return next_tokens
+
+    def load_weights(
+        self,
+        model_name_or_path: str,
+        cache_dir: Optional[str] = None,
+        load_format: str = "auto",
+        revision: Optional[str] = None,
+    ):
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+        for name, loaded_weight in hf_model_weights_iterator(
+                model_name_or_path, cache_dir, load_format, revision):
+            if "wte.weight" in name and self.config.weight_tying:
+                # Copy word embedding to lm_head
+                lm_head_param = params_dict["model.transformer.ff_out.weight"]
+                weight_loader = getattr(lm_head_param, "weight_loader",
+                                        default_weight_loader)
+                weight_loader(lm_head_param, loaded_weight)
+            # attention
+            if ".att" in name:
+                name = name.replace(".att", ".attn.att")
+            # mlp
+            if ".ff" in name and "transformer.ff_out" not in name:
+                name = name.replace(".ff", ".mlp.ff")
+            # there is no bias in olmo
+            param = params_dict[name]
+            weight_loader = getattr(param, "weight_loader",
+                                    default_weight_loader)
+            weight_loader(param, loaded_weight)
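
The rename rules in load_weights above can be exercised in isolation; a minimal, dependency-free sketch (the checkpoint tensor names below are hypothetical examples chosen to illustrate the rewrites, not guaranteed OLMo names):

# Sketch of the OLMo checkpoint-name remapping performed in load_weights.
def remap_olmo_name(name: str) -> str:
    # attention: ".att*" parameters live under the wrapped attn module
    if ".att" in name:
        name = name.replace(".att", ".attn.att")
    # mlp: ".ff*" parameters live under the mlp module, except the lm head
    if ".ff" in name and "transformer.ff_out" not in name:
        name = name.replace(".ff", ".mlp.ff")
    return name

assert remap_olmo_name("model.transformer.blocks.0.att_proj.weight") == \
    "model.transformer.blocks.0.attn.att_proj.weight"
assert remap_olmo_name("model.transformer.blocks.0.ff_proj.weight") == \
    "model.transformer.blocks.0.mlp.ff_proj.weight"
assert remap_olmo_name("model.transformer.ff_out.weight") == \
    "model.transformer.ff_out.weight"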

+ 0 - 1
aphrodite/modeling/models/phi.py

@@ -287,7 +287,6 @@ class PhiForCausalLM(nn.Module):
         hidden_states: torch.Tensor,
         sampling_metadata: SamplingMetadata,
     ) -> Optional[SamplerOutput]:
-        head = self.lm_head  # pylint: disable=unused-variable
         next_tokens = self.sampler(self.lm_head(hidden_states),
                                    sampling_metadata)
         return next_tokens

+ 315 - 0
aphrodite/modeling/models/qwen.py

@@ -0,0 +1,315 @@
+# coding=utf-8
+# Adapted from
+# https://huggingface.co/Qwen/Qwen-7B/blob/main/modeling_qwen.py
+# Copyright (c) Alibaba Cloud.
+# LICENSE: https://huggingface.co/Qwen/Qwen-7B/blob/main/LICENSE
+"""Inference-only QWen model compatible with HuggingFace weights."""
+from typing import Any, Dict, List, Optional, Tuple
+
+import torch
+from torch import nn
+
+from aphrodite.modeling.metadata import InputMetadata
+from aphrodite.modeling.layers.activation import SiluAndMul
+from aphrodite.modeling.layers.attention import PagedAttention
+from aphrodite.modeling.layers.layernorm import RMSNorm
+from aphrodite.modeling.layers.linear import (LinearMethodBase,
+                                              MergedColumnParallelLinear,
+                                              QKVParallelLinear,
+                                              RowParallelLinear,
+                                              ColumnParallelLinear)
+from aphrodite.modeling.layers.rotary_embedding import get_rope
+from aphrodite.modeling.layers.sampler import Sampler
+from aphrodite.modeling.layers.vocab_parallel_embedding import (
+    VocabParallelEmbedding, ParallelLMHead)
+from aphrodite.modeling.megatron.parallel_state import (
+    get_tensor_model_parallel_world_size)
+from aphrodite.modeling.sampling_metadata import SamplingMetadata
+from aphrodite.modeling.hf_downloader import (default_weight_loader,
+                                              hf_model_weights_iterator)
+from aphrodite.common.sequence import SamplerOutput
+from aphrodite.transformers_utils.configs.qwen import QWenConfig
+
+KVCache = Tuple[torch.Tensor, torch.Tensor]
+
+
+class QWenMLP(nn.Module):
+
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str = "silu",
+        linear_method: Optional[LinearMethodBase] = None,
+    ):
+        super().__init__()
+        if linear_method is not None and not linear_method.quant_config.merge_weight(
+        ):
+            self.merge_weight = False
+            self.w2 = ColumnParallelLinear(hidden_size,
+                                           intermediate_size,
+                                           bias=False,
+                                           linear_method=linear_method)
+            self.w1 = ColumnParallelLinear(hidden_size,
+                                           intermediate_size,
+                                           bias=False,
+                                           linear_method=linear_method)
+        else:
+            self.merge_weight = True
+            self.gate_up_proj = MergedColumnParallelLinear(
+                hidden_size, [intermediate_size] * 2,
+                bias=False,
+                linear_method=linear_method)
+        self.c_proj = RowParallelLinear(intermediate_size,
+                                        hidden_size,
+                                        bias=False,
+                                        linear_method=linear_method)
+        if hidden_act != "silu":
+            raise ValueError(f"Unsupported activation: {hidden_act}. "
+                             "Only silu is supported for now.")
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x):
+        if self.merge_weight:
+            gate_up, _ = self.gate_up_proj(x)
+        else:
+            up, _ = self.w1(x)
+            gate, _ = self.w2(x)
+            gate_up = torch.cat([gate, up], dim=-1)
+        x = self.act_fn(gate_up)
+        x, _ = self.c_proj(x)
+        return x
+
+
+class QWenAttention(nn.Module):
+
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        max_position_embeddings: int,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        linear_method: Optional[LinearMethodBase] = None,
+    ):
+        super().__init__()
+        self.hidden_size = hidden_size
+        tensor_model_parallel_world_size = get_tensor_model_parallel_world_size(
+        )
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tensor_model_parallel_world_size == 0
+        self.num_heads = (self.total_num_heads //
+                          tensor_model_parallel_world_size)
+        self.head_dim = hidden_size // self.total_num_heads
+        self.c_attn = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            bias=True,
+            linear_method=linear_method,
+        )
+        self.c_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            linear_method=linear_method,
+        )
+        self.scaling = self.head_dim**-0.5
+
+        is_neox_style = True if linear_method is None or linear_method.quant_config.rope_style(
+        ) is None else linear_method.quant_config.rope_style()
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+            is_neox_style=is_neox_style,
+        )
+        self.attn = PagedAttention(self.num_heads, self.head_dim, self.scaling)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        kv_cache: KVCache,
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        qkv, _ = self.c_attn(hidden_states)
+        q, k, v = qkv.chunk(chunks=3, dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        k_cache, v_cache = kv_cache
+        attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata)
+
+        output, _ = self.c_proj(attn_output)
+        return output
+
+
+class QWenBlock(nn.Module):
+
+    def __init__(
+        self,
+        config: QWenConfig,
+        linear_method: Optional[LinearMethodBase] = None,
+    ):
+        super().__init__()
+        self.ln_1 = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
+
+        rope_theta = getattr(config, "rope_theta", 10000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        self.attn = QWenAttention(config.hidden_size,
+                                  config.num_attention_heads,
+                                  config.max_position_embeddings,
+                                  rope_theta=rope_theta,
+                                  rope_scaling=rope_scaling,
+                                  linear_method=linear_method)
+
+        self.ln_2 = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
+
+        self.mlp = QWenMLP(config.hidden_size,
+                           config.intermediate_size // 2,
+                           linear_method=linear_method)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        kv_cache: KVCache,
+        input_metadata: InputMetadata,
+        residual: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.ln_1(hidden_states)
+        else:
+            hidden_states, residual = self.ln_1(hidden_states, residual)
+        hidden_states = self.attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            kv_cache=kv_cache,
+            input_metadata=input_metadata,
+        )
+
+        # Fully Connected
+        hidden_states, residual = self.ln_2(hidden_states, residual)
+        hidden_states = self.mlp(hidden_states)
+        return hidden_states, residual
+
+
+class QWenModel(nn.Module):
+
+    def __init__(
+        self,
+        config: QWenConfig,
+        linear_method: Optional[LinearMethodBase] = None,
+    ):
+        super().__init__()
+        self.config = config
+        self.vocab_size = config.vocab_size
+
+        self.wte = VocabParallelEmbedding(config.vocab_size,
+                                          config.hidden_size,
+                                          linear_method=linear_method)
+        self.h = nn.ModuleList([
+            QWenBlock(config, linear_method)
+            for _ in range(config.num_hidden_layers)
+        ])
+        self.ln_f = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[KVCache],
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        hidden_states = self.wte(input_ids)
+        residual = None
+        for i in range(len(self.h)):
+            layer = self.h[i]
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                kv_caches[i],
+                input_metadata,
+                residual,
+            )
+        hidden_states, _ = self.ln_f(hidden_states, residual)
+        return hidden_states
+
+
+class QWenLMHeadModel(nn.Module):
+
+    def __init__(
+        self,
+        config: QWenConfig,
+        linear_method: Optional[LinearMethodBase] = None,
+    ):
+        super().__init__()
+        self.config = config
+        self.linear_method = linear_method
+        self.transformer = QWenModel(config, linear_method)
+        self.lm_head = ParallelLMHead(config.vocab_size,
+                                      config.hidden_size,
+                                      linear_method=linear_method)
+        self.sampler = Sampler(config.vocab_size)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[KVCache],
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        hidden_states = self.transformer(input_ids, positions, kv_caches,
+                                         input_metadata)
+        return hidden_states
+
+    def sample(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[SamplerOutput]:
+        next_tokens = self.sampler(self.lm_head(hidden_states),
+                                   sampling_metadata)
+        return next_tokens
+
+    def load_weights(self,
+                     model_name_or_path: str,
+                     cache_dir: Optional[str] = None,
+                     load_format: str = "auto",
+                     revision: Optional[str] = None):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("gate_up_proj", "w2", 0),
+            ("gate_up_proj", "w1", 1),
+        ]
+        if self.linear_method is not None and not self.linear_method.quant_config.merge_weight(
+        ):
+            stacked_params_mapping = []
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in hf_model_weights_iterator(
+                model_name_or_path, cache_dir, load_format, revision,
+                self.config):
+            if "rotary_emb.inv_freq" in name:
+                continue
+            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader",
+                                        default_weight_loader)
+                weight_loader(param, loaded_weight)
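
For the merged-weight path, load_weights rewrites the QWen checkpoint's w1/w2 projection names onto the stacked gate_up_proj parameter. A small standalone sketch of that mapping (the tensor names are hypothetical examples):

stacked_params_mapping = [
    # (param_name, shard_name, shard_id)
    ("gate_up_proj", "w2", 0),
    ("gate_up_proj", "w1", 1),
]

def resolve(name: str):
    """Return (target parameter name, shard id) for a checkpoint tensor."""
    for param_name, weight_name, shard_id in stacked_params_mapping:
        if weight_name in name:
            return name.replace(weight_name, param_name), shard_id
    return name, None

assert resolve("transformer.h.0.mlp.w2.weight") == (
    "transformer.h.0.mlp.gate_up_proj.weight", 0)
assert resolve("transformer.h.0.mlp.w1.weight") == (
    "transformer.h.0.mlp.gate_up_proj.weight", 1)
assert resolve("transformer.h.0.mlp.c_proj.weight") == (
    "transformer.h.0.mlp.c_proj.weight", None)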

+ 35 - 52
aphrodite/modeling/models/mistral.py → aphrodite/modeling/models/qwen2.py

@@ -1,7 +1,8 @@
 # coding=utf-8
 # Adapted from
-# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
+# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/qwen2/modeling_qwen2.py
 # Copyright 2023 The PygmalionAI team.
+# Copyright 2024 The Qwen team.
 # Copyright 2023 The vLLM team.
 # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
 #
@@ -21,38 +22,37 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Inference-only Mistral model compatible with HuggingFace weights."""
+"""Inference-only Qwen2 model compatible with HuggingFace weights."""
 from typing import List, Optional, Tuple
 
 import torch
 from torch import nn
-from transformers import MistralConfig
+from transformers import Qwen2Config
 
 from aphrodite.modeling.metadata import InputMetadata
 from aphrodite.modeling.layers.activation import SiluAndMul
 from aphrodite.modeling.layers.attention import PagedAttention
 from aphrodite.modeling.layers.layernorm import RMSNorm
 from aphrodite.modeling.layers.linear import (LinearMethodBase,
+                                              ColumnParallelLinear,
                                               MergedColumnParallelLinear,
                                               QKVParallelLinear,
-                                              RowParallelLinear,
-                                              ColumnParallelLinear)
+                                              RowParallelLinear)
 from aphrodite.modeling.layers.rotary_embedding import get_rope
 from aphrodite.modeling.layers.sampler import Sampler
 from aphrodite.modeling.layers.vocab_parallel_embedding import (
-    VocabParallelEmbedding, ParallelLMHead, DEFAULT_VOCAB_PADDING_SIZE)
+    VocabParallelEmbedding, ParallelLMHead)
 from aphrodite.modeling.megatron.parallel_state import (
     get_tensor_model_parallel_world_size)
 from aphrodite.modeling.sampling_metadata import SamplingMetadata
 from aphrodite.modeling.hf_downloader import (default_weight_loader,
                                               hf_model_weights_iterator)
 from aphrodite.common.sequence import SamplerOutput
-from aphrodite.common.config import LoRAConfig
 
 KVCache = Tuple[torch.Tensor, torch.Tensor]
 
 
-class MistralMLP(nn.Module):
+class Qwen2MLP(nn.Module):
 
     def __init__(
         self,
@@ -100,7 +100,7 @@ class MistralMLP(nn.Module):
         return x
 
 
-class MistralAttention(nn.Module):
+class Qwen2Attention(nn.Module):
 
     def __init__(self,
                  hidden_size: int,
@@ -108,6 +108,7 @@ class MistralAttention(nn.Module):
                  num_kv_heads: int,
                  max_position: int = 4096 * 32,
                  rope_theta: float = 10000,
+                 use_sliding_window: bool = False,
                  linear_method: Optional[LinearMethodBase] = None,
                  sliding_window: Optional[int] = None) -> None:
         super().__init__()
@@ -131,22 +132,22 @@ class MistralAttention(nn.Module):
         self.kv_size = self.num_kv_heads * self.head_dim
         self.scaling = self.head_dim**-0.5
         self.rope_theta = rope_theta
-        self.sliding_window = sliding_window
+        self.sliding_window = sliding_window if use_sliding_window else None
 
         if linear_method is not None and not linear_method.quant_config.merge_weight(
         ):
             self.merge_weight = False
             self.q_proj = ColumnParallelLinear(hidden_size,
                                                self.q_size,
-                                               bias=False,
+                                               bias=True,
                                                linear_method=linear_method)
             self.k_proj = ColumnParallelLinear(hidden_size,
                                                self.kv_size,
-                                               bias=False,
+                                               bias=True,
                                                linear_method=linear_method)
             self.v_proj = ColumnParallelLinear(hidden_size,
                                                self.kv_size,
-                                               bias=False,
+                                               bias=True,
                                                linear_method=linear_method)
         else:
             self.merge_weight = True
@@ -155,7 +156,7 @@ class MistralAttention(nn.Module):
                 self.head_dim,
                 self.total_num_heads,
                 self.total_num_kv_heads,
-                bias=False,
+                bias=True,
                 linear_method=linear_method,
             )
         self.o_proj = RowParallelLinear(
@@ -165,14 +166,11 @@ class MistralAttention(nn.Module):
             linear_method=linear_method,
         )
 
-        is_neox_style = True if linear_method is None or linear_method.quant_config.rope_style(
-        ) is None else linear_method.quant_config.rope_style()
         self.rotary_emb = get_rope(
             self.head_dim,
             rotary_dim=self.head_dim,
             max_position=max_position,
             base=self.rope_theta,
-            is_neox_style=is_neox_style,
         )
         self.attn = PagedAttention(self.num_heads,
                                    self.head_dim,
@@ -202,26 +200,29 @@ class MistralAttention(nn.Module):
         return output
 
 
-class MistralDecoderLayer(nn.Module):
+class Qwen2DecoderLayer(nn.Module):
 
     def __init__(
         self,
-        config: MistralConfig,
+        config: Qwen2Config,
+        layer_idx: int,
         linear_method: Optional[LinearMethodBase] = None,
     ) -> None:
         super().__init__()
         self.hidden_size = config.hidden_size
         # Requires transformers > 4.32.0
-        rope_theta = getattr(config, "rope_theta", 10000)
-        self.self_attn = MistralAttention(
+        rope_theta = getattr(config, "rope_theta", 1000000)
+        use_sliding_window = config.use_sliding_window and layer_idx < config.max_window_layers
+        self.self_attn = Qwen2Attention(
             hidden_size=self.hidden_size,
             num_heads=config.num_attention_heads,
             max_position=config.max_position_embeddings,
             num_kv_heads=config.num_key_value_heads,
             rope_theta=rope_theta,
+            use_sliding_window=use_sliding_window,
             linear_method=linear_method,
             sliding_window=config.sliding_window)
-        self.mlp = MistralMLP(
+        self.mlp = Qwen2MLP(
             hidden_size=self.hidden_size,
             intermediate_size=config.intermediate_size,
             hidden_act=config.hidden_act,
@@ -261,31 +262,26 @@ class MistralDecoderLayer(nn.Module):
         return hidden_states, residual
 
 
-class MistralModel(nn.Module):
+class Qwen2Model(nn.Module):
 
     def __init__(
         self,
-        config: MistralConfig,
+        config: Qwen2Config,
         linear_method: Optional[LinearMethodBase] = None,
-        lora_config: Optional[LoRAConfig] = None,
     ) -> None:
         super().__init__()
         self.config = config
         self.padding_idx = config.pad_token_id
-        lora_vocab = (lora_config.lora_extra_vocab_size *
-                      (lora_config.max_loras or 1)) if lora_config else 0
-        self.vocab_size = config.vocab_size + lora_vocab
-        self.org_vocab_size = config.vocab_size
+        self.vocab_size = config.vocab_size
 
         self.embed_tokens = VocabParallelEmbedding(
-            self.vocab_size,
+            config.vocab_size,
             config.hidden_size,
             linear_method=linear_method,
-            org_num_embeddings=config.vocab_size,
         )
         self.layers = nn.ModuleList([
-            MistralDecoderLayer(config, linear_method)
-            for _ in range(config.num_hidden_layers)
+            Qwen2DecoderLayer(config, layer_idx, linear_method)
+            for layer_idx in range(config.num_hidden_layers)
         ])
         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
@@ -311,35 +307,23 @@ class MistralModel(nn.Module):
         return hidden_states
 
 
-class MistralForCausalLM(nn.Module):
-    supports_lora = True
+class Qwen2ForCausalLM(nn.Module):
 
     def __init__(
         self,
-        config: MistralConfig,
+        config: Qwen2Config,
         linear_method: Optional[LinearMethodBase] = None,
-        lora_config: Optional[LoRAConfig] = None,
     ) -> None:
         super().__init__()
         self.config = config
         self.linear_method = linear_method
-        self.model = MistralModel(config,
-                                  linear_method,
-                                  lora_config=lora_config)
-        unpadded_vocab_size = config.vocab_size
-        if lora_config:
-            unpadded_vocab_size += lora_config.lora_extra_vocab_size
+        self.model = Qwen2Model(config, linear_method)
         self.lm_head = ParallelLMHead(
-            unpadded_vocab_size,
+            config.vocab_size,
             config.hidden_size,
             linear_method=linear_method,
-            org_num_embeddings=config.vocab_size,
-            padding_size=DEFAULT_VOCAB_PADDING_SIZE
-            # We need bigger padding if using lora for kernel
-            # compatibility
-            if not lora_config else lora_config.lora_vocab_padding_size,
         )
-        self.sampler = Sampler(unpadded_vocab_size, config.vocab_size)
+        self.sampler = Sampler(config.vocab_size)
 
     def forward(
         self,
@@ -379,8 +363,7 @@ class MistralForCausalLM(nn.Module):
             stacked_params_mapping = []
         params_dict = dict(self.named_parameters())
         for name, loaded_weight in hf_model_weights_iterator(
-                model_name_or_path, cache_dir, load_format, revision,
-                self.config):
+                model_name_or_path, cache_dir, load_format, revision):
             if "rotary_emb.inv_freq" in name:
                 continue
             for (param_name, weight_name, shard_id) in stacked_params_mapping:
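
The new use_sliding_window flag gates the sliding window per layer: only layers below max_window_layers keep a finite window. A tiny sanity check with made-up config values:

def uses_sliding_window(use_sliding_window: bool, layer_idx: int,
                        max_window_layers: int) -> bool:
    # Mirrors: config.use_sliding_window and layer_idx < config.max_window_layers
    return use_sliding_window and layer_idx < max_window_layers

# e.g. 32 decoder layers, sliding window enabled for the first 28 of them
windowed = [uses_sliding_window(True, i, 28) for i in range(32)]
assert windowed[:28] == [True] * 28
assert windowed[28:] == [False] * 4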

+ 352 - 0
aphrodite/modeling/models/stablelm.py

@@ -0,0 +1,352 @@
+# coding=utf-8
+# Copyright 2023 Stability AI, EleutherAI, and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# This code is based off the following work:
+# https://huggingface.co/stabilityai/stablelm-3b-4e1t/blob/main/modeling_stablelm_epoch.py
+# https://huggingface.co/stabilityai/stablelm-3b-4e1t/blob/main/config.json
+"""Inference-only StabeLM (https://github.com/Stability-AI/StableLM) model compatible with HuggingFace weights."""
+from typing import List, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from aphrodite.modeling.metadata import InputMetadata
+from aphrodite.modeling.layers.activation import SiluAndMul
+from aphrodite.modeling.layers.attention import PagedAttention
+from aphrodite.modeling.layers.linear import (LinearMethodBase,
+                                              MergedColumnParallelLinear,
+                                              QKVParallelLinear,
+                                              RowParallelLinear,
+                                              ColumnParallelLinear)
+from aphrodite.modeling.layers.rotary_embedding import get_rope
+from aphrodite.modeling.layers.sampler import Sampler
+from aphrodite.modeling.layers.vocab_parallel_embedding import (
+    VocabParallelEmbedding, ParallelLMHead)
+from aphrodite.modeling.megatron.parallel_state import (
+    get_tensor_model_parallel_world_size)
+from aphrodite.modeling.sampling_metadata import SamplingMetadata
+from aphrodite.modeling.hf_downloader import (default_weight_loader,
+                                              hf_model_weights_iterator)
+from aphrodite.common.sequence import SamplerOutput
+
+KVCache = Tuple[torch.Tensor, torch.Tensor]
+
+
+class StablelmMLP(nn.Module):
+
+    def __init__(self,
+                 config: PretrainedConfig,
+                 linear_method: Optional[LinearMethodBase] = None) -> None:
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        if linear_method is not None and not linear_method.quant_config.merge_weight(
+        ):
+            self.merge_weight = False
+            self.gate_proj = ColumnParallelLinear(config.hidden_size,
+                                                  config.intermediate_size,
+                                                  bias=False,
+                                                  linear_method=linear_method)
+            self.up_proj = ColumnParallelLinear(config.hidden_size,
+                                                config.intermediate_size,
+                                                bias=False,
+                                                linear_method=linear_method)
+        else:
+            self.merge_weight = True
+            self.gate_up_proj = MergedColumnParallelLinear(
+                config.hidden_size, [config.intermediate_size] * 2,
+                bias=False,
+                linear_method=linear_method)
+        self.down_proj = RowParallelLinear(config.intermediate_size,
+                                           config.hidden_size,
+                                           bias=False)
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        if self.merge_weight:
+            gate_up, _ = self.gate_up_proj(x)
+        else:
+            up, _ = self.up_proj(x)
+            gate, _ = self.gate_proj(x)
+            gate_up = torch.cat([gate, up], dim=-1)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class StablelmAttention(nn.Module):
+
+    def __init__(self,
+                 config: PretrainedConfig,
+                 linear_method: Optional[LinearMethodBase] = None) -> None:
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = config.num_attention_heads
+        self.num_heads = self.total_num_heads // tp_size
+
+        self.total_num_key_value_heads = config.num_key_value_heads
+        if self.total_num_key_value_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_key_value_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_key_value_heads == 0
+        self.num_key_value_heads = max(
+            1, self.total_num_key_value_heads // tp_size)
+        self.head_dim = self.hidden_size // self.total_num_heads
+        self.max_position_embeddings = config.max_position_embeddings
+        self.rotary_ndims = int(self.head_dim * self.config.rope_pct)
+        self.scaling = self.head_dim**-0.5
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_key_value_heads * self.head_dim
+        self.qkv_bias = getattr(config, "use_qkv_bias", False)
+        if (self.head_dim * self.num_heads * tp_size) != self.hidden_size:
+            raise ValueError(
+                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
+                f" and `num_heads`: {self.num_heads}).")
+
+        if linear_method is not None and not linear_method.quant_config.merge_weight(
+        ):
+            self.merge_weight = False
+            self.q_proj = ColumnParallelLinear(self.hidden_size,
+                                               self.q_size,
+                                               bias=self.qkv_bias,
+                                               linear_method=linear_method)
+            self.k_proj = ColumnParallelLinear(self.hidden_size,
+                                               self.kv_size,
+                                               bias=self.qkv_bias,
+                                               linear_method=linear_method)
+            self.v_proj = ColumnParallelLinear(self.hidden_size,
+                                               self.kv_size,
+                                               bias=self.qkv_bias,
+                                               linear_method=linear_method)
+        else:
+            self.merge_weight = True
+            self.qkv_proj = QKVParallelLinear(
+                self.hidden_size,
+                self.head_dim,
+                self.total_num_heads,
+                self.total_num_key_value_heads,
+                self.qkv_bias,
+                linear_method=linear_method,
+            )
+        self.o_proj = RowParallelLinear(self.total_num_heads * self.head_dim,
+                                        self.hidden_size,
+                                        bias=False,
+                                        linear_method=linear_method)
+        self.rotary_ndims = int(self.head_dim * self.config.rope_pct)
+        is_neox_style = True if linear_method is None or linear_method.quant_config.rope_style(
+        ) is None else linear_method.quant_config.rope_style()
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.rotary_ndims,
+            max_position=self.config.max_position_embeddings,
+            base=self.config.rope_theta,
+            is_neox_style=is_neox_style,
+        )
+        self.attn = PagedAttention(self.num_heads,
+                                   self.head_dim,
+                                   self.scaling,
+                                   num_kv_heads=self.num_key_value_heads)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        kv_cache: KVCache,
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        if self.merge_weight:
+            qkv, _ = self.qkv_proj(hidden_states)
+            q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size],
+                                dim=-1)
+        else:
+            q, _ = self.q_proj(hidden_states)
+            k, _ = self.k_proj(hidden_states)
+            v, _ = self.v_proj(hidden_states)
+        q, k = self.rotary_emb(positions, q, k)
+        k_cache, v_cache = kv_cache
+        attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class StablelmDecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        linear_method: Optional[LinearMethodBase] = None,
+    ) -> None:
+        super().__init__()
+        self.self_attn = StablelmAttention(config, linear_method)
+        self.mlp = StablelmMLP(config, linear_method)
+        self.input_layernorm = nn.LayerNorm(config.hidden_size,
+                                            eps=config.norm_eps)
+        self.post_attention_layernorm = nn.LayerNorm(config.hidden_size,
+                                                     eps=config.norm_eps)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        kv_cache: KVCache,
+        input_metadata: InputMetadata,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            kv_cache=kv_cache,
+            input_metadata=input_metadata,
+        )
+        hidden_states = residual + hidden_states
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+
+        return hidden_states, residual
+
+
+class StableLMEpochModel(nn.Module):
+
+    def __init__(self,
+                 config: PretrainedConfig,
+                 linear_method: Optional[LinearMethodBase] = None) -> None:
+        super().__init__()
+        # self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)
+        self.embed_tokens = VocabParallelEmbedding(config.vocab_size,
+                                                   config.hidden_size,
+                                                   linear_method=linear_method)
+        self.layers = nn.ModuleList([
+            StablelmDecoderLayer(config, linear_method)
+            for _ in range(config.num_hidden_layers)
+        ])
+        self.norm = nn.LayerNorm(config.hidden_size, eps=config.norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[KVCache],
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        hidden_states = self.embed_tokens(input_ids)
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            # pylint: disable=unused-variable
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                kv_caches[i],
+                input_metadata,
+            )
+        hidden_states = self.norm(hidden_states)
+        return hidden_states
+
+
+class StablelmForCausalLM(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        linear_method: Optional[LinearMethodBase] = None,
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.linear_method = linear_method
+        self.model = StableLMEpochModel(config, linear_method)
+        self.lm_head = ParallelLMHead(config.vocab_size,
+                                      config.hidden_size,
+                                      linear_method=linear_method)
+        self.sampler = Sampler(config.vocab_size)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[KVCache],
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, kv_caches,
+                                   input_metadata)
+        return hidden_states
+
+    def sample(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[SamplerOutput]:
+        next_tokens = self.sampler(self.lm_head(hidden_states),
+                                   sampling_metadata)
+        return next_tokens
+
+    def load_weights(self,
+                     model_name_or_path: str,
+                     cache_dir: Optional[str] = None,
+                     load_format: str = "auto",
+                     revision: Optional[str] = None):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+        if self.linear_method is not None and not self.linear_method.quant_config.merge_weight(
+        ):
+            stacked_params_mapping = []
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in hf_model_weights_iterator(
+                model_name_or_path, cache_dir, load_format, revision,
+                self.config):
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if ("rotary_emb.cos_cached" in name
+                    or "rotary_emb.sin_cached" in name):
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader",
+                                        default_weight_loader)
+                weight_loader(param, loaded_weight)
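
StableLM rotates only a fraction of each attention head (rope_pct), which is why rotary_ndims is derived from head_dim. A quick arithmetic sketch with illustrative, stablelm-3b-4e1t-like numbers:

hidden_size = 2560
num_attention_heads = 32
rope_pct = 0.25

head_dim = hidden_size // num_attention_heads   # 80
rotary_ndims = int(head_dim * rope_pct)         # 20: only these dims are rotated

assert (head_dim, rotary_ndims) == (80, 20)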

+ 37 - 24
aphrodite/transformers_utils/config.py

@@ -3,37 +3,44 @@ from typing import Optional
 import gguf
 from transformers import AutoConfig, PretrainedConfig
 from transformers.models.auto.configuration_auto import CONFIG_MAPPING
-from aphrodite.transformers_utils.configs import YiConfig, QWenConfig
+
+from aphrodite.transformers_utils.configs import (BaiChuanConfig,
+                                                  ChatGLMConfig, MPTConfig,
+                                                  QWenConfig, RWConfig)
 
 _CONFIG_REGISTRY = {
+    "baichuan": BaiChuanConfig,
+    "chatglm": ChatGLMConfig,
+    "mpt": MPTConfig,
     "qwen": QWenConfig,
-    "yi": YiConfig,
+    "RefinedWeb": RWConfig,  # For tiiuae/falcon-40b(-instruct)
+    "RefinedWebModel": RWConfig,  # For tiiuae/falcon-7b(-instruct)
 }
 
 
 def extract_gguf_config(checkpoint):
     result = gguf.GGUFReader(checkpoint)
-    architecture = result.fields["general.architecture"]
+    architecture = result.fields['general.architecture']
     architecture = str(bytes(architecture.parts[architecture.data[0]]),
-                       encoding="utf-8")
+                       encoding='utf-8')
     # Only support llama so far
     if architecture != "llama":
         raise RuntimeError(f"Unsupported architecture {architecture}")
 
     # write config
-    vocab_size = len(result.fields["tokenizer.ggml.token_type"].data)
-    context_length = int(result.fields["llama.context_length"].parts[-1])
-    n_layer = int(result.fields["llama.block_count"].parts[-1])
-    n_head = int(result.fields["llama.attention.head_count"].parts[-1])
+    vocab_size = len(result.fields['tokenizer.ggml.token_type'].data)
+    context_length = int(result.fields['llama.context_length'].parts[-1])
+    n_layer = int(result.fields['llama.block_count'].parts[-1])
+    n_head = int(result.fields['llama.attention.head_count'].parts[-1])
     n_local_heads = int(
-        result.fields["llama.attention.head_count_kv"].parts[-1])
+        result.fields['llama.attention.head_count_kv'].parts[-1])
     intermediate_size = int(
-        result.fields["llama.feed_forward_length"].parts[-1])
+        result.fields['llama.feed_forward_length'].parts[-1])
     norm_eps = float(
-        result.fields["llama.attention.layer_norm_rms_epsilon"].parts[-1])
-    dim = int(result.fields["llama.embedding_length"].parts[-1])
+        result.fields['llama.attention.layer_norm_rms_epsilon'].parts[-1])
+    dim = int(result.fields['llama.embedding_length'].parts[-1])
     arch = "MixtralForCausalLM"
-    if "llama.expert_count" in result.fields:
+    if 'llama.expert_count' in result.fields:
         arch = "MixtralForCausalLM"
         name = "mixtral"
     else:
@@ -55,14 +62,14 @@ def extract_gguf_config(checkpoint):
         "torch_dtype": "float16",
         "vocab_size": vocab_size
     }
-    if "llama.rope.freq_base" in result.fields:
-        model_config["rope_theta"] = float(
-            result.fields["llama.rope.freq_base"].parts[-1])
-    if "llama.expert_count" in result.fields:
-        model_config["num_local_experts"] = int(
-            result.fields["llama.expert_count"].parts[-1])
-        model_config["num_experts_per_tok"] = int(
-            result.fields["llama.expert_used_count"].parts[-1])
+    if 'llama.rope.freq_base' in result.fields:
+        model_config['rope_theta'] = float(
+            result.fields['llama.rope.freq_base'].parts[-1])
+    if 'llama.expert_count' in result.fields:
+        model_config['num_local_experts'] = int(
+            result.fields['llama.expert_count'].parts[-1])
+        model_config['num_experts_per_tok'] = int(
+            result.fields['llama.expert_used_count'].parts[-1])
     if name in _CONFIG_REGISTRY:
         config_class = _CONFIG_REGISTRY[name]
     else:
@@ -73,12 +80,16 @@ def extract_gguf_config(checkpoint):
 
 def get_config(model: str,
                trust_remote_code: bool,
-               revision: Optional[str] = None) -> PretrainedConfig:
+               revision: Optional[str] = None,
+               code_revision: Optional[str] = None) -> PretrainedConfig:
     if model.endswith("gguf"):
         return extract_gguf_config(model)
     try:
         config = AutoConfig.from_pretrained(
-            model, trust_remote_code=trust_remote_code, revision=revision)
+            model,
+            trust_remote_code=trust_remote_code,
+            revision=revision,
+            code_revision=code_revision)
     except ValueError as e:
         if (not trust_remote_code and
                 "requires you to execute the configuration file" in str(e)):
@@ -92,5 +103,7 @@ def get_config(model: str,
             raise e
     if config.model_type in _CONFIG_REGISTRY:
         config_class = _CONFIG_REGISTRY[config.model_type]
-        config = config_class.from_pretrained(model, revision=revision)
+        config = config_class.from_pretrained(model,
+                                              revision=revision,
+                                              code_revision=code_revision)
     return config
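
Illustrative use of the extended get_config signature; the model id and revisions below are placeholders, and the call fetches the config from the Hub:

from aphrodite.transformers_utils.config import get_config

config = get_config(
    "mosaicml/mpt-7b",        # placeholder model id with model_type "mpt"
    trust_remote_code=True,
    revision="main",          # revision of the weights/config
    code_revision="main",     # new: separate revision for remote code
)
# model_type "mpt" is in _CONFIG_REGISTRY, so aphrodite's MPTConfig class is used.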

+ 13 - 2
aphrodite/transformers_utils/configs/__init__.py

@@ -1,7 +1,18 @@
+from aphrodite.transformers_utils.configs.baichuan import BaiChuanConfig
+from aphrodite.transformers_utils.configs.chatglm import ChatGLMConfig
+from aphrodite.transformers_utils.configs.mpt import MPTConfig
+from aphrodite.transformers_utils.configs.olmo import OLMoConfig
 from aphrodite.transformers_utils.configs.qwen import QWenConfig
-from aphrodite.transformers_utils.configs.yi import YiConfig
+# RWConfig is for the original tiiuae/falcon-40b(-instruct) and
+# tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
+# `FalconConfig` class from the official HuggingFace transformers library.
+from aphrodite.transformers_utils.configs.falcon import RWConfig
 
 __all__ = [
+    "BaiChuanConfig",
+    "ChatGLMConfig",
+    "MPTConfig",
+    "OLMoConfig",
     "QWenConfig",
-    "YiConfig",
+    "RWConfig",
 ]

+ 62 - 0
aphrodite/transformers_utils/configs/baichuan.py

@@ -0,0 +1,62 @@
+# coding=utf-8
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from transformers.configuration_utils import PretrainedConfig
+
+
+class BaiChuanConfig(PretrainedConfig):
+    model_type = "baichuan"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=64000,
+        hidden_size=4096,
+        intermediate_size=11008,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        hidden_act="silu",
+        max_position_embeddings=4096,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        tie_word_embeddings=False,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )

+ 68 - 0
aphrodite/transformers_utils/configs/chatglm.py

@@ -0,0 +1,68 @@
+# coding=utf-8
+# Adapted from
+# https://github.com/THUDM/ChatGLM2-6B
+from transformers import PretrainedConfig
+
+
+class ChatGLMConfig(PretrainedConfig):
+    model_type = "chatglm"
+    attribute_map = {
+        "num_hidden_layers": "num_layers",
+        "n_head_kv": "multi_query_group_num",
+    }
+
+    def __init__(self,
+                 num_layers=28,
+                 padded_vocab_size=65024,
+                 hidden_size=4096,
+                 ffn_hidden_size=13696,
+                 kv_channels=128,
+                 num_attention_heads=32,
+                 seq_length=2048,
+                 hidden_dropout=0.0,
+                 attention_dropout=0.0,
+                 layernorm_epsilon=1e-5,
+                 rmsnorm=True,
+                 apply_residual_connection_post_layernorm=False,
+                 post_layer_norm=True,
+                 add_bias_linear=False,
+                 add_qkv_bias=False,
+                 interleaved_qkv=False,
+                 bias_dropout_fusion=True,
+                 multi_query_attention=False,
+                 multi_query_group_num=1,
+                 apply_query_key_layer_scaling=True,
+                 attention_softmax_in_fp32=True,
+                 fp32_residual_connection=False,
+                 quantization_bit=0,
+                 pre_seq_len=None,
+                 prefix_projection=False,
+                 **kwargs):
+        self.num_layers = num_layers
+        self.vocab_size = padded_vocab_size
+        self.padded_vocab_size = padded_vocab_size
+        self.hidden_size = hidden_size
+        self.ffn_hidden_size = ffn_hidden_size
+        self.kv_channels = kv_channels
+        self.num_attention_heads = num_attention_heads
+        self.seq_length = seq_length
+        self.hidden_dropout = hidden_dropout
+        self.attention_dropout = attention_dropout
+        self.layernorm_epsilon = layernorm_epsilon
+        self.rmsnorm = rmsnorm
+        self.apply_residual_connection_post_layernorm = (
+            apply_residual_connection_post_layernorm)
+        self.post_layer_norm = post_layer_norm
+        self.add_bias_linear = add_bias_linear
+        self.add_qkv_bias = add_qkv_bias
+        self.bias_dropout_fusion = bias_dropout_fusion
+        self.multi_query_attention = multi_query_attention
+        self.multi_query_group_num = multi_query_group_num
+        self.apply_query_key_layer_scaling = apply_query_key_layer_scaling
+        self.attention_softmax_in_fp32 = attention_softmax_in_fp32
+        self.fp32_residual_connection = fp32_residual_connection
+        self.quantization_bit = quantization_bit
+        self.pre_seq_len = pre_seq_len
+        self.prefix_projection = prefix_projection
+        self.interleaved_qkv = interleaved_qkv
+        super().__init__(**kwargs)
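
Because of attribute_map, generic code can read HF-style attribute names from this config. A short check using the defaults defined above:

from aphrodite.transformers_utils.configs import ChatGLMConfig

cfg = ChatGLMConfig()
# attribute_map aliases HF-style names onto the ChatGLM-specific fields.
assert cfg.num_hidden_layers == cfg.num_layers == 28
assert cfg.n_head_kv == cfg.multi_query_group_num == 1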

+ 88 - 0
aphrodite/transformers_utils/configs/falcon.py

@@ -0,0 +1,88 @@
+# Adapted from
+# https://huggingface.co/tiiuae/falcon-7b/blob/main/configuration_RW.py
+# Copyright 2023 The PygmalionAI team.
+# Copyright 2023 The vLLM team.
+# Copyright 2022 the Big Science Workshop and HuggingFace Inc. team.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Falcon configuration"""
+from transformers.configuration_utils import PretrainedConfig
+
+
+class RWConfig(PretrainedConfig):
+    model_type = "falcon"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    attribute_map = {
+        "num_hidden_layers": "n_layer",
+        "num_attention_heads": "n_head",
+        "num_kv_heads": "n_head_kv",
+    }
+
+    def __init__(
+        self,
+        vocab_size=250880,
+        hidden_size=64,
+        n_layer=2,
+        n_head=8,
+        layer_norm_epsilon=1e-5,
+        initializer_range=0.02,
+        use_cache=True,
+        bos_token_id=1,
+        eos_token_id=2,
+        hidden_dropout=0.0,
+        attention_dropout=0.0,
+        multi_query=True,
+        n_head_kv=None,
+        alibi=False,
+        bias=False,
+        parallel_attn=False,
+        new_decoder_architecture=False,
+        **kwargs,
+    ) -> None:
+        self.vocab_size = vocab_size
+        # Backward compatibility with n_embed kwarg
+        n_embed = kwargs.pop("n_embed", None)
+        self.hidden_size = hidden_size if n_embed is None else n_embed
+        self.n_layer = n_layer
+        self.n_head = n_head
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.initializer_range = initializer_range
+        self.use_cache = use_cache
+        self.hidden_dropout = hidden_dropout
+        self.attention_dropout = attention_dropout
+
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        self.multi_query = multi_query
+        self.n_head_kv = 1 if n_head_kv is None else n_head_kv
+        self.alibi = alibi
+        self.bias = bias
+        self.parallel_attn = parallel_attn
+        self.new_decoder_architecture = new_decoder_architecture
+
+        if self.hidden_size == 8192:
+            # Hack for falcon-40b
+            self.new_decoder_architecture = True
+
+        super().__init__(bos_token_id=bos_token_id,
+                         eos_token_id=eos_token_id,
+                         **kwargs)
+
+    @property
+    def head_dim(self):
+        return self.hidden_size // self.n_head
+
+    @property
+    def rotary(self):
+        return not self.alibi
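
The derived properties on RWConfig can be sanity-checked directly; the constructor values below are illustrative (roughly falcon-7b shaped):

from aphrodite.transformers_utils.configs import RWConfig

cfg = RWConfig(hidden_size=4544, n_head=71, n_layer=32, alibi=False)
assert cfg.head_dim == 64        # hidden_size // n_head
assert cfg.rotary                # rotary embeddings are used when alibi is off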

+ 233 - 0
aphrodite/transformers_utils/configs/mpt.py

@@ -0,0 +1,233 @@
+# coding=utf-8
+# Copied from
+# https://huggingface.co/mosaicml/mpt-7b/blob/main/configuration_mpt.py
+"""A HuggingFace-style model configuration."""
+import warnings
+from typing import Any, Dict, Optional, Union
+
+from transformers import PretrainedConfig
+
+attn_config_defaults: Dict = {
+    'attn_type': 'multihead_attention',
+    'attn_pdrop': 0.0,
+    'attn_impl': 'triton',
+    'qk_ln': False,
+    'clip_qkv': None,
+    'softmax_scale': None,
+    'prefix_lm': False,
+    'attn_uses_sequence_id': False,
+    'alibi': False,
+    'alibi_bias_max': 8
+}
+ffn_config_defaults: Dict = {'ffn_type': 'mptmlp'}
+init_config_defaults: Dict = {
+    'name': 'kaiming_normal_',
+    'fan_mode': 'fan_in',
+    'init_nonlinearity': 'relu',
+    'init_div_is_residual': True,
+    'emb_init_std': None,
+    'emb_init_uniform_lim': None,
+    'init_std': None,
+    'init_gain': 0.0
+}
+
+
+class MPTConfig(PretrainedConfig):
+    model_type = 'mpt'
+    attribute_map = {
+        'num_attention_heads': 'n_heads',
+        'hidden_size': 'd_model',
+        'num_hidden_layers': 'n_layers',
+    }
+
+    # pylint: disable=dangerous-default-value
+    def __init__(self,
+                 d_model: int = 2048,
+                 n_heads: int = 16,
+                 n_layers: int = 24,
+                 expansion_ratio: int = 4,
+                 max_seq_len: int = 2048,
+                 vocab_size: int = 50368,
+                 resid_pdrop: float = 0.0,
+                 emb_pdrop: float = 0.0,
+                 learned_pos_emb: bool = True,
+                 attn_config: Dict = attn_config_defaults,
+                 ffn_config: Dict = ffn_config_defaults,
+                 init_device: str = 'cpu',
+                 logit_scale: Optional[Union[float, str]] = None,
+                 no_bias: bool = False,
+                 embedding_fraction: float = 1.0,
+                 norm_type: str = 'low_precision_layernorm',
+                 use_cache: bool = False,
+                 init_config: Dict = init_config_defaults,
+                 fc_type: str = 'torch',
+                 verbose: Optional[int] = None,
+                 **kwargs: Any):
+        """The MPT configuration class.
+        Args:
+            d_model (int): The size of the embedding dimension of the model.
+            n_heads (int): The number of attention heads.
+            n_layers (int): The number of layers in the model.
+            expansion_ratio (int): The ratio of the up/down scale in the ffn.
+            max_seq_len (int): The maximum sequence length of the model.
+            vocab_size (int): The size of the vocabulary.
+            resid_pdrop (float): The dropout probability applied to the attention output before combining with residual.
+            emb_pdrop (float): The dropout probability for the embedding layer.
+            learned_pos_emb (bool): Whether to use learned positional embeddings.
+            attn_config (Dict): A dictionary used to configure the model's attention module:
+                attn_type (str): type of attention to use. Options: multihead_attention, multiquery_attention, grouped_query_attention
+                attn_pdrop (float): The dropout probability for the attention layers.
+                attn_impl (str): The attention implementation to use. One of 'torch', 'flash', or 'triton'.
+                qk_ln (bool): Whether to apply layer normalization to the queries and keys in the attention layer.
+                clip_qkv (Optional[float]): If not None, clip the queries, keys, and values in the attention layer to
+                    this value.
+                softmax_scale (Optional[float]): If not None, scale the softmax in the attention layer by this value. If None,
+                    use the default scale of ``1/sqrt(d_keys)``.
+                prefix_lm (Optional[bool]): Whether the model should operate as a Prefix LM. This requires passing an
+                    extra `prefix_mask` argument which indicates which tokens belong to the prefix. Tokens in the prefix
+                    can attend to one another bi-directionally. Tokens outside the prefix use causal attention.
+                attn_uses_sequence_id (Optional[bool]): Whether to restrict attention to tokens that have the same sequence_id.
+                    When the model is in `train` mode, this requires passing an extra `sequence_id` argument which indicates
+                    which sub-sequence each token belongs to.
+                    Defaults to ``False`` meaning any provided `sequence_id` will be ignored.
+                alibi (bool): Whether to use the alibi bias instead of position embeddings.
+                alibi_bias_max (int): The maximum value of the alibi bias.
+                kv_n_heads (Optional[int]): For grouped_query_attention only, allow user to specify number of kv heads.
+            ffn_config (Dict): A dictionary used to configure the model's ffn module:
+                ffn_type (str): type of ffn to use. Options: mptmlp, te_ln_mlp
+            init_device (str): The device to use for parameter initialization.
+            logit_scale (Optional[Union[float, str]]): If not None, scale the logits by this value.
+            no_bias (bool): Whether to disable bias in all layers.
+            verbose (int): The verbosity level. 0 is silent.
+            embedding_fraction (float): The fraction to scale the gradients of the embedding layer by.
+            norm_type (str): choose type of norm to use
+            use_cache (bool): Whether or not the model should return the last key/values attentions
+            init_config (Dict): A dictionary used to configure the model initialization:
+                init_config.name: The parameter initialization scheme to use. Options: 'default_', 'baseline_',
+                    'kaiming_uniform_', 'kaiming_normal_', 'neox_init_', 'small_init_', 'xavier_uniform_', or
+                    'xavier_normal_'. These mimic the parameter initialization methods in PyTorch.
+                init_div_is_residual (Union[int, float, str, bool]): Value to divide initial weights by if ``module._is_residual`` is True.
+                emb_init_std (Optional[float]): The standard deviation of the normal distribution used to initialize the embedding layer.
+                emb_init_uniform_lim (Optional[Union[Tuple[float, float], float]]): The lower and upper limits of the uniform distribution
+                    used to initialize the embedding layer. Mutually exclusive with ``emb_init_std``.
+                init_std (float): The standard deviation of the normal distribution used to initialize the model,
+                    if using the baseline_ parameter initialization scheme.
+                init_gain (float): The gain to use for parameter initialization with kaiming or xavier initialization schemes.
+                fan_mode (str): The fan mode to use for parameter initialization with kaiming initialization schemes.
+                init_nonlinearity (str): The nonlinearity to use for parameter initialization with kaiming initialization schemes.
+                ---
+                See llmfoundry.models.utils.param_init_fns.py for info on other param init config options
+            fc_type (str): choose fc layer implementation. Options: torch and te. te layers support fp8 when using H100 GPUs.
+        """
+        self.d_model = d_model
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.expansion_ratio = expansion_ratio
+        self.max_seq_len = max_seq_len
+        self.vocab_size = vocab_size
+        self.resid_pdrop = resid_pdrop
+        self.emb_pdrop = emb_pdrop
+        self.learned_pos_emb = learned_pos_emb
+        self.attn_config = attn_config
+        self.ffn_config = ffn_config
+        self.init_device = init_device
+        self.logit_scale = logit_scale
+        self.no_bias = no_bias
+        self.embedding_fraction = embedding_fraction
+        self.norm_type = norm_type
+        self.use_cache = use_cache
+        self.init_config = init_config
+        self.fc_type = fc_type
+        if verbose is not None:
+            warnings.warn(DeprecationWarning(
+                'verbose argument for MPTConfig is now ignored and will be removed. Use python_log_level instead.'
+            ),
+                          stacklevel=2)
+        if 'name' in kwargs:
+            del kwargs['name']
+        if 'loss_fn' in kwargs:
+            del kwargs['loss_fn']
+        if self.attn_config.get('alibi', False):
+            self.learned_pos_emb = False
+            warnings.warn(
+                f'alibi is turned on, setting `learned_pos_emb` to {self.learned_pos_emb}',
+                stacklevel=2)
+        super().__init__(**kwargs)
+        self._validate_config()
+
+    def _set_config_defaults(
+            self, config: Dict[str, Any],
+            config_defaults: Dict[str, Any]) -> Dict[str, Any]:
+        for (k, v) in config_defaults.items():
+            if k not in config:
+                config[k] = v
+        return config
+
+    def _validate_config(self) -> None:
+        self.attn_config = self._set_config_defaults(self.attn_config,
+                                                     attn_config_defaults)
+        self.ffn_config = self._set_config_defaults(self.ffn_config,
+                                                    ffn_config_defaults)
+        self.init_config = self._set_config_defaults(self.init_config,
+                                                     init_config_defaults)
+        if self.d_model % self.n_heads != 0:
+            raise ValueError('d_model must be divisible by n_heads')
+        if any((
+                prob < 0 or prob > 1 for prob in
+            [self.attn_config['attn_pdrop'], self.resid_pdrop, self.emb_pdrop]
+        )):
+            raise ValueError(
+                "self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are probabilities and must be between 0 and 1"  # pylint: disable=line-too-long
+            )
+        if self.attn_config['attn_impl'] not in ['torch', 'flash', 'triton']:
+            raise ValueError(
+                f"Unknown attn_impl={self.attn_config['attn_impl']}")
+        if self.attn_config['prefix_lm'] and self.attn_config[
+                'attn_impl'] not in ['torch', 'triton']:
+            raise NotImplementedError(
+                'prefix_lm only implemented with torch and triton attention.')
+        if self.attn_config['alibi'] and self.attn_config['attn_impl'] not in [
+                'torch', 'triton'
+        ]:
+            raise NotImplementedError(
+                'alibi only implemented with torch and triton attention.')
+        if self.attn_config['attn_uses_sequence_id'] and self.attn_config[
+                'attn_impl'] not in ['torch', 'triton']:
+            raise NotImplementedError(
+                'attn_uses_sequence_id only implemented with torch and triton attention.'  # pylint: disable=line-too-long
+            )
+        if self.embedding_fraction > 1 or self.embedding_fraction <= 0:
+            raise ValueError(
+                'model.embedding_fraction must be between 0 (exclusive) and 1 (inclusive)!'  # pylint: disable=line-too-long
+            )
+        if isinstance(self.logit_scale,
+                      str) and self.logit_scale != 'inv_sqrt_d_model':
+            raise ValueError(
+                f"self.logit_scale={self.logit_scale!r} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'."  # pylint: disable=line-too-long
+            )
+        if self.init_config.get('name', None) is None:
+            raise ValueError(
+                f"self.init_config={self.init_config!r} 'name' needs to be set."
+            )
+        if not self.learned_pos_emb and (not self.attn_config['alibi']):
+            warnings.warn(
+                'Positional information not being provided to the model.',
+                stacklevel=2)
+        if self.fc_type == 'te' or self.ffn_config['ffn_type'] == 'te_ln_mlp':
+            try:
+                # pylint: disable=import-outside-toplevel
+                import transformer_engine.pytorch as te
+                del te
+            except Exception as exc:
+                raise ImportError(
+                    # pylint: disable=line-too-long
+                    'TransformerEngine import failed. `fc_type: te` requires TransformerEngine to be installed. '
+                    +
+                    'The required version of transformer_engine also requires FlashAttention v1.0.6 to be installed:\n'
+                    + 'pip install flash-attn==1.0.6 --no-build-isolation \n' +
+                    'pip install git+https://github.com/NVIDIA/TransformerEngine.git@144e4888b2cdd60bd52e706d5b7a79cb9c1a7156'
+                ) from exc
+        if self.ffn_config['ffn_type'] == 'mptmlp':
+            self.ffn_config['fc_type'] = self.fc_type
+        elif self.ffn_config['ffn_type'] == 'te_ln_mlp':
+            self.ffn_config['bias'] = not self.no_bias
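
One detail of _validate_config above is easy to miss: user-supplied attn_config, ffn_config, and init_config dicts are merged with the module-level defaults rather than replaced, so a partial override keeps every unspecified key. A small standalone sketch of that merge rule (it re-implements only the _set_config_defaults behaviour, with a trimmed defaults dict, so it runs without importing Aphrodite):

from typing import Any, Dict

# Trimmed subset of attn_config_defaults, for illustration only.
ATTN_DEFAULTS: Dict[str, Any] = {
    'attn_type': 'multihead_attention',
    'attn_pdrop': 0.0,
    'attn_impl': 'triton',
    'qk_ln': False,
    'alibi': False,
}

def set_config_defaults(config: Dict[str, Any],
                        defaults: Dict[str, Any]) -> Dict[str, Any]:
    # Same rule as MPTConfig._set_config_defaults: fill only the missing keys.
    for key, value in defaults.items():
        config.setdefault(key, value)
    return config

# The user overrides two fields; everything else is backfilled from the defaults.
user_attn_config = {'attn_impl': 'torch', 'alibi': True}
merged = set_config_defaults(user_attn_config, ATTN_DEFAULTS)
assert merged['attn_impl'] == 'torch'
assert merged['alibi'] is True
assert merged['qk_ln'] is False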

+ 72 - 0
aphrodite/transformers_utils/configs/olmo.py

@@ -0,0 +1,72 @@
+# coding=utf-8
+# adapted from https://github.com/allenai/OLMo/blob/v0.2.4/hf_olmo/configuration_olmo.py
+"""OLMo configuration"""
+from transformers import PretrainedConfig
+
+
+class OLMoConfig(PretrainedConfig):
+    model_type = 'olmo'
+    attribute_map = {
+        'num_attention_heads': 'n_heads',
+        'hidden_size': 'd_model',
+        'num_hidden_layers': 'n_layers',
+    }
+
+    # Note that the defaults for these attributes are equivalent to the base GPT2 model.
+    def __init__(
+        self,
+        d_model=768,
+        n_heads=12,
+        n_layers=12,
+        mlp_ratio=4,
+        mlp_hidden_size=None,
+        activation_type="swiglu",
+        block_type="sequential",
+        block_group_size=1,
+        alibi=False,
+        alibi_bias_max=8.0,
+        rope=False,
+        rope_full_precision=True,
+        multi_query_attention=False,
+        attention_layer_norm=False,
+        layer_norm_type="default",
+        layer_norm_with_affine=True,
+        attention_layer_norm_with_affine=True,
+        max_sequence_length=1024,
+        include_bias=True,
+        bias_for_layer_norm=None,
+        scale_logits=False,
+        vocab_size=50257,
+        embedding_size=50304,
+        weight_tying=True,
+        eos_token_id=50256,
+        pad_token_id=50256,
+        **kwargs,
+    ):
+        self.d_model = d_model
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.mlp_ratio = mlp_ratio
+        self.mlp_hidden_size = mlp_hidden_size
+        self.activation_type = activation_type
+        self.block_type = block_type
+        self.block_group_size = block_group_size
+        self.alibi = alibi
+        self.alibi_bias_max = alibi_bias_max
+        self.rope = rope
+        self.rope_full_precision = rope_full_precision
+        self.multi_query_attention = multi_query_attention
+        self.attention_layer_norm = attention_layer_norm
+        self.layer_norm_type = layer_norm_type
+        self.layer_norm_with_affine = layer_norm_with_affine
+        self.attention_layer_norm_with_affine = attention_layer_norm_with_affine
+        self.max_sequence_length = max_sequence_length
+        self.include_bias = include_bias
+        self.bias_for_layer_norm = bias_for_layer_norm
+        self.scale_logits = scale_logits
+        self.vocab_size = vocab_size
+        self.embedding_size = embedding_size
+        self.weight_tying = weight_tying
+        self.eos_token_id = eos_token_id
+        self.pad_token_id = pad_token_id
+        super().__init__(**kwargs)
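
The attribute_map above is what lets the rest of the engine keep using the HuggingFace-standard names (hidden_size, num_attention_heads, num_hidden_layers) while OLMo stores d_model, n_heads, and n_layers. A minimal sketch of that aliasing, assuming transformers is installed; the tiny demo class is hypothetical and only reproduces the mapping mechanism:

from transformers import PretrainedConfig

class TinyOlmoLikeConfig(PretrainedConfig):
    # Hypothetical demo class: same aliasing mechanism as OLMoConfig above.
    model_type = 'olmo-like-demo'
    attribute_map = {
        'num_attention_heads': 'n_heads',
        'hidden_size': 'd_model',
        'num_hidden_layers': 'n_layers',
    }

    def __init__(self, d_model=768, n_heads=12, n_layers=12, **kwargs):
        self.d_model = d_model
        self.n_heads = n_heads
        self.n_layers = n_layers
        super().__init__(**kwargs)

cfg = TinyOlmoLikeConfig(d_model=2048, n_heads=16, n_layers=24)
# The HF-standard names resolve through attribute_map to the OLMo-style fields.
assert cfg.hidden_size == 2048
assert cfg.num_attention_heads == 16
assert cfg.num_hidden_layers == 24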

+ 49 - 52
aphrodite/transformers_utils/tokenizer.py

@@ -1,6 +1,6 @@
 import os
 import tempfile
-from typing import List, Tuple, Union, Optional
+from typing import List, Optional, Tuple, Union
 
 import gguf
 from transformers import (AutoTokenizer, PreTrainedTokenizer,
@@ -10,6 +10,7 @@ from transformers.convert_slow_tokenizer import import_protobuf
 from aphrodite.common.logger import init_logger
 from aphrodite.lora.request import LoRARequest
 from aphrodite.common.utils import make_async, LRUCache
+from aphrodite.transformers_utils.tokenizers import BaichuanTokenizer
 
 logger = init_logger(__name__)
 
@@ -19,90 +20,75 @@ def convert_gguf_to_tokenizer(checkpoint):
     # write vocab
     sentencepiece_model_pb2 = import_protobuf()
     vocab = sentencepiece_model_pb2.ModelProto()
-    vocab_size = len(result.fields["tokenizer.ggml.token_type"].data)
+    vocab_size = len(result.fields['tokenizer.ggml.token_type'].data)
     vocab.trainer_spec.model_type = 2  # BPE
     vocab.trainer_spec.vocab_size = vocab_size
     vocab.trainer_spec.byte_fallback = True
     vocab.normalizer_spec.remove_extra_whitespaces = False
-    tokens = result.fields["tokenizer.ggml.tokens"]
-    scores = result.fields["tokenizer.ggml.scores"]
-    types = result.fields["tokenizer.ggml.token_type"]
+    tokens = result.fields['tokenizer.ggml.tokens']
+    scores = result.fields['tokenizer.ggml.scores']
+    types = result.fields['tokenizer.ggml.token_type']
     for i in range(vocab_size):
         new_token = vocab.SentencePiece()
         new_token.piece = str(bytes(tokens.parts[tokens.data[i]]),
-                              encoding="utf-8")
+                              encoding='utf-8')
         new_token.score = scores.parts[scores.data[i]]
         # llama.cpp tokentype is the same with sentencepiece token type
         new_token.type = int(types.parts[types.data[i]])
         vocab.pieces.append(new_token)
-    with tempfile.NamedTemporaryFile(mode="wb", delete=False) as temp_file:
+    with tempfile.NamedTemporaryFile(mode='wb', delete=False) as temp_file:
         temp_file.write(vocab.SerializeToString())
         temp_file_filename = temp_file.name
     tokenizer_args = {"vocab_file": temp_file_filename}
 
-    if "tokenizer.ggml.bos_token_id" in result.fields:
+    if 'tokenizer.ggml.bos_token_id' in result.fields:
         tokenizer_args["bos_token"] = vocab.pieces[int(
-            result.fields["tokenizer.ggml.bos_token_id"].parts[-1])].piece
-    if "tokenizer.ggml.eos_token_id" in result.fields:
+            result.fields['tokenizer.ggml.bos_token_id'].parts[-1])].piece
+    if 'tokenizer.ggml.eos_token_id' in result.fields:
         tokenizer_args["eos_token"] = vocab.pieces[int(
-            result.fields["tokenizer.ggml.eos_token_id"].parts[-1])].piece
-    if "tokenizer.ggml.padding_token_id" in result.fields:
+            result.fields['tokenizer.ggml.eos_token_id'].parts[-1])].piece
+    if 'tokenizer.ggml.padding_token_id' in result.fields:
         tokenizer_args["pad_token"] = vocab.pieces[int(
-            result.fields["tokenizer.ggml.padding_token_id"].parts[-1])].piece
-    if "tokenizer.ggml.unknown_token_id" in result.fields:
+            result.fields['tokenizer.ggml.padding_token_id'].parts[-1])].piece
+    if 'tokenizer.ggml.unknown_token_id' in result.fields:
         tokenizer_args["unk_token"] = vocab.pieces[int(
-            result.fields["tokenizer.ggml.unknown_token_id"].parts[-1])].piece
-    if "tokenizer.ggml.add_bos_token" in result.fields:
+            result.fields['tokenizer.ggml.unknown_token_id'].parts[-1])].piece
+    if 'tokenizer.ggml.add_bos_token' in result.fields:
         tokenizer_args["add_bos_token"] = bool(
-            result.fields["tokenizer.ggml.add_bos_token"].parts[-1])
-    if "tokenizer.ggml.add_eos_token" in result.fields:
+            result.fields['tokenizer.ggml.add_bos_token'].parts[-1])
+    if 'tokenizer.ggml.add_eos_token' in result.fields:
         tokenizer_args["add_eos_token"] = bool(
-            result.fields["tokenizer.ggml.add_eos_token"].parts[-1])
-    tokenizer = LlamaTokenizer(**tokenizer_args, legacy=False)
+            result.fields['tokenizer.ggml.add_eos_token'].parts[-1])
+    tokenizer = LlamaTokenizer(**tokenizer_args)
     os.unlink(temp_file_filename)
     return tokenizer
 
 
-# A fast LLaMA tokenizer with the pre-processed `tokenizer.json` file.
-_FAST_LLAMA_TOKENIZER = "hf-internal-testing/llama-tokenizer"
-
-
 def get_tokenizer(
     tokenizer_name: str,
     *args,
     tokenizer_mode: str = "auto",
     trust_remote_code: bool = False,
+    tokenizer_revision: Optional[str] = None,
     **kwargs,
 ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
     """Gets a tokenizer for the given model name via Huggingface."""
     if tokenizer_name.endswith("gguf"):
         return convert_gguf_to_tokenizer(tokenizer_name)
+
     if tokenizer_mode == "slow":
         if kwargs.get("use_fast", False):
             raise ValueError(
                 "Cannot use the fast tokenizer in slow tokenizer mode.")
         kwargs["use_fast"] = False
 
-    if ("llama" in tokenizer_name.lower() and kwargs.get("use_fast", True)
-            and tokenizer_name != _FAST_LLAMA_TOKENIZER):
-        logger.info(
-            "For some LLaMA V1 models, initializing the fast tokenizer may "
-            "take a long time. To reduce the initialization time, consider "
-            f"using '{_FAST_LLAMA_TOKENIZER}' instead of the original "
-            "tokenizer.")
     try:
         tokenizer = AutoTokenizer.from_pretrained(
             tokenizer_name,
             *args,
             trust_remote_code=trust_remote_code,
+            tokenizer_revision=tokenizer_revision,
             **kwargs)
-    except TypeError as e:
-        # The LLaMA tokenizer causes a protobuf error in some environments.
-        err_msg = (
-            "Failed to load the tokenizer. If you are using a LLaMA V1 model "
-            f"consider using '{_FAST_LLAMA_TOKENIZER}' instead of the "
-            "original tokenizer.")
-        raise RuntimeError(err_msg) from e
     except ValueError as e:
         # If the error pertains to the tokenizer class not existing or not
         # currently being imported, suggest using the --trust-remote-code flag.
@@ -117,6 +103,18 @@ def get_tokenizer(
             raise RuntimeError(err_msg) from e
         else:
             raise e
+    except AttributeError as e:
+        if "BaichuanTokenizer" in str(e):
+            # This is for the error "'BaichuanTokenizer' object has no
+            # attribute 'sp_model'".
+            tokenizer = BaichuanTokenizer.from_pretrained(
+                tokenizer_name,
+                *args,
+                trust_remote_code=trust_remote_code,
+                tokenizer_revision=tokenizer_revision,
+                **kwargs)
+        else:
+            raise e
 
     if not isinstance(tokenizer, PreTrainedTokenizerFast):
         logger.warning(
@@ -161,21 +159,18 @@ class TokenizerGroup:
         else:
             self.lora_tokenizers = None
 
-    def encode(
-        self,
-        prompt: str,
-        request_id: Optional[str] = None,  # pylint: disable=unused-argument
-        lora_request: Optional[LoRARequest] = None
-    ) -> List[int]:
+    def encode(self,
+               prompt: str,
+               request_id: Optional[str] = None,
+               lora_request: Optional[LoRARequest] = None) -> List[int]:
         tokenizer = self.get_lora_tokenizer(lora_request)
         return tokenizer.encode(prompt)
 
     async def encode_async(
-        self,
-        prompt: str,
-        request_id: Optional[str] = None,  # pylint: disable=unused-argument
-        lora_request: Optional[LoRARequest] = None
-    ) -> List[int]:
+            self,
+            prompt: str,
+            request_id: Optional[str] = None,
+            lora_request: Optional[LoRARequest] = None) -> List[int]:
         tokenizer = await self.get_lora_tokenizer_async(lora_request)
         return tokenizer.encode(prompt)
 
@@ -262,7 +257,7 @@ def detokenize_incrementally(
         # tokenizers (bigger = more conservative).
         # Subtract 1 extra to account for the generated token.
         prefix_offset = max(len(output_tokens) - 6, 0)
-        # If the first new token is a special token we can't skip 1 extra token
+        # If the first new token is a special token, we can't skip 1 extra token
         if skip_special_tokens and new_token_id in tokenizer.all_special_ids:
             read_offset = max(len(output_tokens), 0)
         else:
@@ -286,12 +281,14 @@ def detokenize_incrementally(
             tokenizer,
             output_tokens[prefix_offset:read_offset],
             skip_special_tokens=skip_special_tokens,
-            spaces_between_special_tokens=spaces_between_special_tokens)
+            spaces_between_special_tokens=spaces_between_special_tokens,
+        )
         new_text = _convert_tokens_to_string_with_added_encoders(
             tokenizer,
             output_tokens[prefix_offset:],
             skip_special_tokens=skip_special_tokens,
-            spaces_between_special_tokens=spaces_between_special_tokens)
+            spaces_between_special_tokens=spaces_between_special_tokens,
+        )
 
     if len(new_text) > len(prefix_text) and not new_text.endswith("�"):
         # utf-8 char at the end means it's a potential unfinished byte sequence
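
Two behavioural changes in get_tokenizer are worth calling out: it now accepts and forwards a tokenizer_revision, and an AttributeError mentioning BaichuanTokenizer (the missing sp_model attribute) falls back to the bundled BaichuanTokenizer class instead of failing. A minimal usage sketch; the model name and revision are placeholders for illustration, not recommendations:

from aphrodite.transformers_utils.tokenizer import get_tokenizer

tokenizer = get_tokenizer(
    "baichuan-inc/Baichuan2-13B-Chat",  # placeholder HF repo id
    tokenizer_mode="auto",
    trust_remote_code=True,             # needed for repos shipping custom tokenizer code
    tokenizer_revision="main",          # new argument, forwarded to the underlying loader
)
token_ids = tokenizer.encode("Hello, world!")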

+ 5 - 0
aphrodite/transformers_utils/tokenizers/__init__.py

@@ -0,0 +1,5 @@
+from aphrodite.transformers_utils.tokenizers.baichuan import BaichuanTokenizer
+
+__all__ = [
+    "BaichuanTokenizer",
+]

+ 261 - 0
aphrodite/transformers_utils/tokenizers/baichuan.py

@@ -0,0 +1,261 @@
+# yapf: disable
+# Adapted from
+# https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/blob/8f6e343d545c503b91429582231d1d354dac2740/tokenization_baichuan.py
+# Copyright (c) 2023, Baichuan Intelligent Technology. All rights reserved.
+
+import os
+from shutil import copyfile
+from typing import Any, Dict, List, Optional, Tuple
+
+import sentencepiece as spm
+from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
+from transformers.utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    "vocab_file": {},
+    "tokenizer_file": {},
+}
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {}
+
+
+class BaichuanTokenizer(PreTrainedTokenizer):
+    """
+    Construct a Baichuan tokenizer. Based on byte-level Byte-Pair-Encoding.
+
+    Args:
+        vocab_file (`str`):
+            Path to the vocabulary file.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    model_input_names = ["input_ids", "attention_mask"]
+
+    def __init__(
+        self,
+        vocab_file,
+        unk_token="<unk>",
+        bos_token="<s>",
+        eos_token="</s>",
+        pad_token=None,
+        sp_model_kwargs: Optional[Dict[str, Any]] = None,
+        add_bos_token=True,
+        add_eos_token=False,
+        clean_up_tokenization_spaces=False,
+        **kwargs,
+    ):
+        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+        bos_token = (
+            AddedToken(bos_token, lstrip=False, rstrip=False)
+            if isinstance(bos_token, str)
+            else bos_token
+        )
+        eos_token = (
+            AddedToken(eos_token, lstrip=False, rstrip=False)
+            if isinstance(eos_token, str)
+            else eos_token
+        )
+        unk_token = (
+            AddedToken(unk_token, lstrip=False, rstrip=False)
+            if isinstance(unk_token, str)
+            else unk_token
+        )
+        pad_token = (
+            AddedToken(pad_token, lstrip=False, rstrip=False)
+            if isinstance(pad_token, str)
+            else pad_token
+        )
+        self.vocab_file = vocab_file
+        self.add_bos_token = add_bos_token
+        self.add_eos_token = add_eos_token
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+        self.sp_model.Load(vocab_file)
+        super().__init__(
+            bos_token=bos_token,
+            eos_token=eos_token,
+            unk_token=unk_token,
+            pad_token=pad_token,
+            add_bos_token=add_bos_token,
+            add_eos_token=add_eos_token,
+            sp_model_kwargs=self.sp_model_kwargs,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            **kwargs,
+        )
+
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        state["sp_model"] = None
+        return state
+
+    def __setstate__(self, d):
+        self.__dict__ = d
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+        self.sp_model.Load(self.vocab_file)
+
+    @property
+    def vocab_size(self):
+        """Returns vocab size"""
+        return self.sp_model.get_piece_size()
+
+    def get_vocab(self):
+        """Returns vocab as a dict"""
+        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+
+    def _tokenize(self, text):
+        """Returns a tokenized string."""
+        return self.sp_model.encode(text, out_type=str)
+
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) in an id using the vocab."""
+        return self.sp_model.piece_to_id(token)
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        token = self.sp_model.IdToPiece(index)
+        return token
+
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (string) in a single string."""
+        current_sub_tokens = []
+        out_string = ""
+        prev_is_special = False
+        for i, token in enumerate(tokens):
+            # make sure that special tokens are not decoded using sentencepiece model
+            if token in self.all_special_tokens:
+                if not prev_is_special and i != 0:
+                    out_string += " "
+                out_string += self.sp_model.decode(current_sub_tokens) + token
+                prev_is_special = True
+                current_sub_tokens = []
+            else:
+                current_sub_tokens.append(token)
+                prev_is_special = False
+        out_string += self.sp_model.decode(current_sub_tokens)
+        return out_string
+
+    def save_vocabulary(
+        self, save_directory, filename_prefix: Optional[str] = None
+    ) -> Tuple[str]:
+        """
+        Save the vocabulary and special tokens file to a directory.
+
+        Args:
+            save_directory (`str`):
+                The directory in which to save the vocabulary.
+
+        Returns:
+            `Tuple(str)`: Paths to the files saved.
+        """
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+        out_vocab_file = os.path.join(
+            save_directory,
+            (filename_prefix + "-" if filename_prefix else "")
+            + VOCAB_FILES_NAMES["vocab_file"],
+        )
+
+        if os.path.abspath(self.vocab_file) != os.path.abspath(
+            out_vocab_file
+        ) and os.path.isfile(self.vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+        elif not os.path.isfile(self.vocab_file):
+            with open(out_vocab_file, "wb") as fi:
+                content_spiece_model = self.sp_model.serialized_model_proto()
+                fi.write(content_spiece_model)
+
+        return (out_vocab_file,)
+
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
+        eos_token_id = [self.eos_token_id] if self.add_eos_token else []
+
+        output = bos_token_id + token_ids_0 + eos_token_id
+
+        if token_ids_1 is not None:
+            output = output + bos_token_id + token_ids_1 + eos_token_id
+
+        return output
+
+    def get_special_tokens_mask(
+        self,
+        token_ids_0: List[int],
+        token_ids_1: Optional[List[int]] = None,
+        already_has_special_tokens: bool = False,
+    ) -> List[int]:
+        """
+        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer `prepare_for_model` method.
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+
+        Returns:
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+        if already_has_special_tokens:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0,
+                token_ids_1=token_ids_1,
+                already_has_special_tokens=True,
+            )
+
+        bos_token_id = [1] if self.add_bos_token else []
+        eos_token_id = [1] if self.add_eos_token else []
+
+        if token_ids_1 is None:
+            return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
+        return (
+            bos_token_id
+            + ([0] * len(token_ids_0))
+            + eos_token_id
+            + bos_token_id
+            + ([0] * len(token_ids_1))
+            + eos_token_id
+        )
+
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """
+        Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
+        sequence pair mask has the following format:
+
+        ```
+        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+        | first sequence    | second sequence |
+        ```
+
+        if token_ids_1 is None, only returns the first portion of the mask (0s).
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of ids.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
+        """
+        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
+        eos_token_id = [self.eos_token_id] if self.add_eos_token else []
+
+        output = [0] * len(bos_token_id + token_ids_0 + eos_token_id)
+
+        if token_ids_1 is not None:
+            output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)
+
+        return output
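
With the defaults above (add_bos_token=True, add_eos_token=False), a single sequence is framed with only a leading <s>; get_special_tokens_mask and create_token_type_ids_from_sequences follow the same two switches. A standalone sketch of that framing logic with made-up token ids, so it runs without a SentencePiece model file:

from typing import List, Optional

BOS_ID, EOS_ID = 1, 2            # made-up ids standing in for <s> and </s>
ADD_BOS, ADD_EOS = True, False   # the tokenizer's defaults

def build_inputs(token_ids_0: List[int],
                 token_ids_1: Optional[List[int]] = None) -> List[int]:
    # Mirrors build_inputs_with_special_tokens above.
    bos = [BOS_ID] if ADD_BOS else []
    eos = [EOS_ID] if ADD_EOS else []
    output = bos + token_ids_0 + eos
    if token_ids_1 is not None:
        output = output + bos + token_ids_1 + eos
    return output

def special_tokens_mask(token_ids_0: List[int],
                        token_ids_1: Optional[List[int]] = None) -> List[int]:
    # Mirrors get_special_tokens_mask above for the non-pre-formatted case.
    bos = [1] if ADD_BOS else []
    eos = [1] if ADD_EOS else []
    if token_ids_1 is None:
        return bos + [0] * len(token_ids_0) + eos
    return (bos + [0] * len(token_ids_0) + eos +
            bos + [0] * len(token_ids_1) + eos)

assert build_inputs([101, 102, 103]) == [1, 101, 102, 103]
assert special_tokens_mask([101, 102, 103]) == [1, 0, 0, 0]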

+ 48 - 25
kernels/activation_kernels.cu

@@ -2,19 +2,16 @@
 #include <torch/extension.h>
 #include <c10/cuda/CUDAGuard.h>
 
+#include <cmath>
+
 #include "cuda_compat.h"
 #include "dispatch_utils.h"
 
 namespace aphrodite {
 
-template<typename T>
-__device__ __forceinline__ T silu(const T& x) {
-  // x * sigmoid(x)
-  return (T) (((float) x) / (1.0f + expf((float) -x)));
-}
-
-template<typename scalar_t>
-__global__ void silu_and_mul_kernel(
+// Activation and gating kernel template.
+template<typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&)>
+__global__ void act_and_mul_kernel(
   scalar_t* __restrict__ out,               // [..., d]
   const scalar_t* __restrict__ input,       // [..., 2, d]
   const int d) {
@@ -22,32 +19,58 @@ __global__ void silu_and_mul_kernel(
   for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
     const scalar_t x = APHRODITE_LDG(&input[token_idx * 2 * d + idx]);
     const scalar_t y = APHRODITE_LDG(&input[token_idx * 2 * d + d + idx]);
-    out[token_idx * d + idx] = silu(x) * y;
+    out[token_idx * d + idx] = ACT_FN(x) * y;
   }
 }
 
+template<typename T>
+__device__ __forceinline__ T silu_kernel(const T& x) {
+  // x * sigmoid(x)
+  return (T) (((float) x) / (1.0f + expf((float) -x)));
+}
+
+template<typename T>
+__device__ __forceinline__ T gelu_kernel(const T& x) {
+  // Equivalent to PyTorch GELU with 'none' approximation.
+  // Refer to:
+  // https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L38
+  const float f = (float) x;
+  constexpr float ALPHA = M_SQRT1_2;
+  return (T) (f * 0.5f * (1.0f + ::erf(f * ALPHA)));
+}
+
 } // namespace aphrodite
 
+// Launch activation and gating kernel.
+#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL)                                                 \
+  int d = input.size(-1) / 2;                                                                 \
+  int64_t num_tokens = input.numel() / input.size(-1);                                        \
+  dim3 grid(num_tokens);                                                                      \
+  dim3 block(std::min(d, 1024));                                                              \
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));                           \
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();                               \
+  APHRODITE_DISPATCH_FLOATING_TYPES(                                                          \
+    input.scalar_type(),                                                                      \
+    "act_and_mul_kernel",                                                                     \
+    [&] {                                                                                     \
+      aphrodite::act_and_mul_kernel<scalar_t, KERNEL<scalar_t>><<<grid, block, 0, stream>>>(  \
+        out.data_ptr<scalar_t>(),                                                             \
+        input.data_ptr<scalar_t>(),                                                           \
+        d);                                                                                   \
+    });
+
 void silu_and_mul(
   torch::Tensor& out,      // [..., d]
   torch::Tensor& input)    // [..., 2 * d]
 {
-  int64_t num_tokens = input.numel() / input.size(-1);
-  int d = input.size(-1) / 2;
-
-  dim3 grid(num_tokens);
-  dim3 block(std::min(d, 1024));
-  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
-  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  APHRODITE_DISPATCH_FLOATING_TYPES(
-    input.scalar_type(),
-    "silu_and_mul_kernel",
-    [&] {
-      aphrodite::silu_and_mul_kernel<scalar_t><<<grid, block, 0, stream>>>(
-        out.data_ptr<scalar_t>(),
-        input.data_ptr<scalar_t>(),
-        d);
-    });
+  LAUNCH_ACTIVATION_GATE_KERNEL(aphrodite::silu_kernel);
+}
+
+void gelu_and_mul(
+  torch::Tensor& out,      // [..., d]
+  torch::Tensor& input)    // [..., 2 * d]
+{
+  LAUNCH_ACTIVATION_GATE_KERNEL(aphrodite::gelu_kernel);
 }
 
 namespace aphrodite {
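
The new gelu_kernel uses the erf form of GELU (M_SQRT1_2 is 1/sqrt(2)), which corresponds to PyTorch's exact GELU with approximate='none', not the tanh approximation. A quick CPU-side sanity sketch of that identity in PyTorch; it only exercises the math, not the CUDA extension itself:

import torch

# GELU(x) = 0.5 * x * (1 + erf(x / sqrt(2))) -- the formula used by gelu_kernel.
x = torch.randn(1024, dtype=torch.float32)
erf_gelu = 0.5 * x * (1.0 + torch.erf(x * (2.0 ** -0.5)))
torch.testing.assert_close(erf_gelu, torch.nn.functional.gelu(x))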

+ 4 - 0
kernels/ops.h

@@ -57,6 +57,10 @@ void silu_and_mul(
   torch::Tensor& out,
   torch::Tensor& input);
 
+void gelu_and_mul(
+  torch::Tensor& out,
+  torch::Tensor& input);
+
 void gelu_new(
   torch::Tensor& out,
   torch::Tensor& input);

+ 4 - 0
kernels/pybind.cpp

@@ -22,6 +22,10 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
     "silu_and_mul",
     &silu_and_mul,
     "Activation function used in SwiGLU.");
+  ops.def(
+    "gelu_and_mul",
+    &gelu_and_mul,
+    "Activation function used in GeGLU.");
   ops.def(
     "gelu_new",
     &gelu_new,