@@ -28,6 +28,7 @@ namespace gptq {
 #define DIVIDE(x, size) (((x) + (size) - 1) / (size))

 #if defined(USE_ROCM)
+#include <hipblas/hipblas.h>
 __host__ __forceinline__ hipblasStatus_t __compat_hipblasHgemm(hipblasHandle_t handle,
                                                                hipblasOperation_t transA,
                                                                hipblasOperation_t transB,
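The added `#include <hipblas/hipblas.h>` presumably lets the ROCm branch name the hipBLAS types (`hipblasStatus_t`, `hipblasHandle_t`, `hipblasOperation_t`) without relying on a transitive include. The hunk truncates the `__compat_hipblasHgemm` signature after `transB`, so the sketch below fills in the remaining parameters by assumption, mirroring the standard Hgemm argument order; the shim's job is simply to bridge CUDA-style `half` pointers to the `hipblasHalf` that `hipblasHgemm` expects.

```cpp
// Sketch of a compat shim, assuming the truncated parameters follow the
// standard Hgemm order. half and hipblasHalf are bit-identical fp16 storage.
__host__ __forceinline__ hipblasStatus_t __compat_hipblasHgemm(
    hipblasHandle_t handle, hipblasOperation_t transA, hipblasOperation_t transB,
    int m, int n, int k,
    const half* alpha, const half* AP, int lda,
    const half* BP, int ldb,
    const half* beta, half* CP, int ldc) {
  return hipblasHgemm(handle, transA, transB, m, n, k,
                      reinterpret_cast<const hipblasHalf*>(alpha),
                      reinterpret_cast<const hipblasHalf*>(AP), lda,
                      reinterpret_cast<const hipblasHalf*>(BP), ldb,
                      reinterpret_cast<const hipblasHalf*>(beta),
                      reinterpret_cast<hipblasHalf*>(CP), ldc);
}
```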
@@ -286,7 +287,8 @@ void gemm_half_q_half_cuda_part

     fp_gemm_half_q_half_gptq_kernel kernel = pick_gemm_half_q_half_gptq_kernel(true, m_count);

-    kernel<<<gridDim, blockDim>>>
+    const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+    kernel<<<gridDim, blockDim, 0, stream>>>
     (
         a,
         b_q_weight,
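Every launch-site change in this patch is the same one-line fix: the kernel moves off the legacy default stream and onto PyTorch's current stream. PyTorch creates its streams as non-blocking, so work enqueued on the default stream is not implicitly ordered with ATen ops running under a `torch.cuda.stream(...)` context; launching on `at::cuda::getCurrentCUDAStream()` restores that ordering. A minimal sketch of the pattern, assuming `<ATen/cuda/CUDAContext.h>` is included in this translation unit (the `at::cuda::` call implies it):

```cpp
#include <ATen/cuda/CUDAContext.h>  // at::cuda::getCurrentCUDAStream()

// Launch-helper sketch: the third launch parameter is dynamic shared memory
// (0 here), the fourth is the stream. CUDAStream converts implicitly to
// cudaStream_t (hipStream_t after hipify on ROCm builds).
template <typename Kernel, typename... Args>
void launch_on_torch_stream(Kernel kernel, dim3 grid, dim3 block, Args... args) {
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  kernel<<<grid, block, 0, stream>>>(args...);
}
```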
@@ -433,7 +435,8 @@ void reconstruct_exllama
     gridDim.y = DIVIDE(height, BLOCK_KN_SIZE);
     gridDim.x = DIVIDE(width, BLOCK_KN_SIZE);

-    reconstruct_exllama_kernel<<<gridDim, blockDim>>>
+    const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+    reconstruct_exllama_kernel<<<gridDim, blockDim, 0, stream>>>
     (
         b_q_weight,
         b_q_perm,
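The grid math in the context lines above relies on the `DIVIDE` macro from the first hunk, which is ceiling division: any remainder yields one extra, partially filled block, so a `height` or `width` that is not a multiple of `BLOCK_KN_SIZE` is still fully covered. A compile-time check of that behavior:

```cpp
// Ceiling division as defined in the first hunk.
#define DIVIDE(x, size) (((x) + (size) - 1) / (size))

static_assert(DIVIDE(256, 128) == 2, "exact multiple: no extra block");
static_assert(DIVIDE(257, 128) == 3, "a single leftover element adds a block");
```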
@@ -520,16 +523,25 @@ __global__ void gemm_half_q_half_alt_kernel(
             zeros_tmp[tmp_k] = zero;
         }
         for (int m = 0; m < b_end; m++) {
+#ifndef USE_ROCM
             res2 = {};
+#else
+            res2.x = __half_as_ushort(__float2half(0));
+            res2.y = __half_as_ushort(__float2half(0));
+#endif
             res2 = __hfma2(__hfma2(deq2[(tmp >> 0) & 0xff][off], scales_tmp[0], zeros_tmp[0]), blockvec[m][k + 0], res2);
             res2 = __hfma2(__hfma2(deq2[(tmp >> 8) & 0xff][off], scales_tmp[1], zeros_tmp[1]), blockvec[m][k + 1], res2);
             res2 = __hfma2(__hfma2(deq2[(tmp >> 16) & 0xff][off], scales_tmp[2], zeros_tmp[2]), blockvec[m][k + 2], res2);
             res2 = __hfma2(__hfma2(deq2[(tmp >> 24) & 0xff][off], scales_tmp[3], zeros_tmp[3]), blockvec[m][k + 3], res2);
+#ifndef USE_ROCM
             res[m] = __hadd(res[m], __hadd(res2.x, res2.y));
+#else
+            res[m] = __hadd(res[m], __hadd(__ushort_as_half(res2.x), __ushort_as_half(res2.y)));
+#endif
         }
         i += width;
         k += 4;
-    }
+}
     for (int m = 0; m < b_end; m++) {
         atomicAdd(&mul[(b + m) * width + w], res[m]);
     }
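The `USE_ROCM` branches in this hunk bridge a header difference, not a math difference: with CUDA's `cuda_fp16.h` the `x`/`y` members of `__half2` are `__half`, so `res2 = {};` zero-initializes it and `__hadd(res2.x, res2.y)` compiles directly, while HIP exposes those members as raw `unsigned short` bit patterns, hence the `__half_as_ushort`/`__ushort_as_half` round trips. A sketch of how the same difference could be hidden behind helpers (names here are illustrative, not from the patch):

```cpp
// Hypothetical portability helpers isolating the CUDA/ROCm difference in how
// __half2 components are exposed; mirrors the #ifdef logic in the hunk above.
__device__ __forceinline__ half2 half2_zero() {
#ifndef USE_ROCM
  return half2{};                                // members are __half
#else
  half2 h;
  h.x = __half_as_ushort(__float2half(0.0f));    // members are ushort bits
  h.y = __half_as_ushort(__float2half(0.0f));
  return h;
#endif
}

__device__ __forceinline__ half half2_sum(half2 v) {
#ifndef USE_ROCM
  return __hadd(v.x, v.y);
#else
  return __hadd(__ushort_as_half(v.x), __ushort_as_half(v.y));
#endif
}
```

Both toolchains also provide `__low2half`/`__high2half`, which would sidestep the member-type question entirely; the patch keeps the explicit bit-cast form.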
@@ -557,7 +569,8 @@ void gemm_half_q_half_alt
     gridDim.y = DIVIDE(size_m, BLOCK_M_SIZE_MAX);
     gridDim.z = DIVIDE(size_k, BLOCK_KN_SIZE);

-    gemm_half_q_half_alt_kernel<<<gridDim, blockDim>>>
+    const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+    gemm_half_q_half_alt_kernel<<<gridDim, blockDim, 0, stream>>>
     (
         (const half2*) a,
         b_q_weight,
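Note the `(const half2*) a` cast: this kernel reads activations as packed `half2` pairs, which is what lets each `__hfma2` line in the previous hunk retire two multiply-adds at once (each byte of a 32-bit weight word indexes the `deq2` table for a dequantized pair, so the four lines together consume one packed word). For reference, a scalar restatement of what one `__hfma2` computes per lane:

```cpp
#include <cuda_fp16.h>

// Reference semantics of __hfma2(a, b, c): a fused multiply-add applied
// independently to the low and high halves of the operands.
__device__ half2 hfma2_ref(half2 a, half2 b, half2 c) {
  return __halves2half2(
      __hfma(__low2half(a),  __low2half(b),  __low2half(c)),
      __hfma(__high2half(a), __high2half(b), __high2half(c)));
}
```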
@@ -629,7 +642,8 @@ void reconstruct_gptq
     blockDim.y = 1;
     gridDim.y = DIVIDE(height, 8);
     gridDim.x = DIVIDE(width, BLOCK_KN_SIZE);
-    reconstruct_gptq_kernel<<<gridDim, blockDim>>>
+    const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+    reconstruct_gptq_kernel<<<gridDim, blockDim, 0, stream>>>
     (
         b_q_weight,
         b_gptq_scales,
@@ -784,7 +798,8 @@ void shuffle_exllama_weight
     gridDim.x = DIVIDE(width, THREADS_X);
     gridDim.y = height / 8;

-    make_sequential_kernel<<<gridDim, blockDim>>>
+    const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+    make_sequential_kernel<<<gridDim, blockDim, 0, stream>>>
     (
         q_weight,
         new_qweight,
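One detail worth flagging in this hunk's context lines: `gridDim.y` uses exact division (`height / 8`) where every other launch uses `DIVIDE`. That appears deliberate rather than an oversight: with 4-bit GPTQ packing, eight quantized values share one 32-bit word of `q_weight`, so `height` should be a multiple of 8 by construction and no remainder block can arise. The packing arithmetic, under that assumption:

```cpp
// Assumption: 4-bit quantization, values packed into 32-bit words.
constexpr int kBitsPerValue = 4;
constexpr int kValuesPerWord = 32 / kBitsPerValue;  // 8
static_assert(kValuesPerWord == 8, "eight 4-bit values per packed word");
```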
@@ -803,7 +818,8 @@ void shuffle_exllama_weight
     blockDim.y = 1;
     gridDim.x = DIVIDE(width, THREADS_X);
     gridDim.y = 1;
-    shuffle_kernel<<<gridDim, blockDim>>>(q_weight, height, width);
+    const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+    shuffle_kernel<<<gridDim, blockDim, 0, stream>>>(q_weight, height, width);
 }

 } // namespace gptq
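None of the touched launch sites check whether the enqueue succeeded. Since the patch already depends on ATen for the stream lookup, a natural follow-up (not part of this patch) would be PyTorch's standard post-launch assertion after each `<<<...>>>`, shown here against the last hunk's launch:

```cpp
#include <c10/cuda/CUDAException.h>  // C10_CUDA_KERNEL_LAUNCH_CHECK

shuffle_kernel<<<gridDim, blockDim, 0, stream>>>(q_weight, height, width);
// Throws c10::Error right away if the launch failed, rather than letting the
// error surface at the next synchronizing CUDA call.
C10_CUDA_KERNEL_LAUNCH_CHECK();
```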