8 ماه پیش · 9ce319b03c
--- a/aphrodite/common/sampling_params.py
+++ b/aphrodite/common/sampling_params.py
@@ -5,7 +5,8 @@ from functools import cached_property
 
				 from typing import Any, Callable, Dict, List, Optional, Union
			
 
				 
			
 
				 import torch
			
 
				-from pydantic import conint
			
 
				+from pydantic import Field
			
 
				+from typing_extensions import Annotated
			
 
				 
			
 
				 _SAMPLING_EPS = 1e-5
			
 
				 
			
@@ -170,7 +171,7 @@ class SamplingParams:
 
				         skip_special_tokens: bool = True,
			
 
				         spaces_between_special_tokens: bool = True,
			
 
				         logits_processors: Optional[List[LogitsProcessorFunc]] = None,
			
 
				-        truncate_prompt_tokens: Optional[conint(ge=1)] = None,
			
 
				+        truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None,
			
 
				     ) -> None:
			
 
				         self.n = n
			
 
				         self.best_of = best_of if best_of is not None else n
			
@@ -220,6 +221,12 @@ class SamplingParams:
 
				         self.logits_processors = logits_processors or []
			
 
				         self.include_stop_str_in_output = include_stop_str_in_output
			
 
				         self.truncate_prompt_tokens = truncate_prompt_tokens
			
 
				+        # Number of characters to hold back for stop string evaluation
			
 
				+        # until sequence is finished.
			
 
				+        if self.stop and not include_stop_str_in_output:
			
 
				+            self.output_text_buffer_length = max(len(s) for s in self.stop) - 1
			
 
				+        else:
			
 
				+            self.output_text_buffer_length = 0
			
 
				 
			
 
				         self.default_values = {
			
 
				             "n": 1,
			
--- a/aphrodite/modeling/layers/sampler.py
+++ b/aphrodite/modeling/layers/sampler.py
@@ -17,14 +17,18 @@ from aphrodite.modeling.sampling_metadata import (SamplingMetadata,
 
				 
			
 
				 class Sampler(nn.Module):
			
 
				     """Samples the next tokens from the model's outputs.
			
 
				+
			
 
				     This layer does the following:
			
 
				     1. Discard the hidden states that are not used for sampling (i.e., all
			
 
				         tokens except the final one in each prompt).
			
 
				     2. Compute the logits for the next tokens.
			
 
				-    3. Apply all the different sampler functions in the specified order.
			
 
				-    4. Sample the next tokens.
			
 
				+    3. Apply presence, frequency and repetition penalties.
			
 
				+    4. Apply temperature scaling.
			
 
				+    5. Apply top-p and top-k truncation.
			
 
				+    6. Sample the next tokens.
			
 
				     Here, each sequence group within the batch can have different sampling
			
 
				     parameters (e.g., sampling method, temperature, top-p, top-k, etc.).
			
 
				+
			
 
				     The structure of the logits tensor is coupled with the seq_groups in
			
 
				     sampling_metadata. Typically, each sequence in each seq_group has one row in
			
 
				     logits for the next token to be sampled; however, for a seq_group with a
			
@@ -52,17 +56,16 @@ class Sampler(nn.Module):
 
				         """
			
 
				         assert logits is not None
			
 
				         _, vocab_size = logits.shape
			
 
				-        # Apply min_tokens penalty which sets stop tokens to -inf if min_tokens
			
 
				-        # have not been generated yet
			
 
				+
			
 
				         logits = _apply_min_tokens_penalty(logits, sampling_metadata)
			
 
				 
			
 
				         # Prepare sampling tensors with pinned memory to avoid blocking.
			
 
				-        (sampling_tensors, do_temperatures, do_penalties, do_topks, do_topps,
			
 
				-         do_topas, do_minps, do_tfss, do_eta_cutoffs, do_epsilon_cutoffs,
			
 
				-         do_typical_ps,
			
 
				-         do_quadratic) = (SamplingTensors.from_sampling_metadata(
			
 
				-             sampling_metadata, vocab_size, logits.device, logits.dtype))
			
 
				+        (sampling_tensors, do_penalties, do_top_p_top_k, do_top_as, do_min_p,
			
 
				+         do_tfss, do_eta_cutoffs, do_epsilon_cutoffs, do_typical_ps,
			
 
				+         do_quadratic) = SamplingTensors.from_sampling_metadata(
			
 
				+             sampling_metadata, vocab_size, logits.device, logits.dtype)
			
 
				 
			
 
				+        # Apply presence, frequency and repetition penalties.
			
 
				         if do_penalties:
			
 
				             logits = _apply_penalties(logits, sampling_tensors.prompt_tokens,
			
 
				                                       sampling_tensors.output_tokens,
			
@@ -70,18 +73,30 @@ class Sampler(nn.Module):
 
				                                       sampling_tensors.frequency_penalties,
			
 
				                                       sampling_tensors.repetition_penalties)
			
 
				 
			
 
				-        if (do_topks or do_topps or do_topas or do_minps):
			
 
				-            logits = _apply_alphabet_soup(logits, sampling_tensors.top_ps,
			
 
				-                                          sampling_tensors.top_ks,
			
 
				-                                          sampling_tensors.top_as,
			
 
				-                                          sampling_tensors.min_ps)
			
 
				+        # Apply temperature scaling.
			
 
				+        # Use in-place division to avoid creating a new tensor.
			
 
				+        logits.div_(sampling_tensors.temperatures.unsqueeze_(dim=1))
			
 
				+
			
 
				+        if do_top_p_top_k:
			
 
				+            logits = _apply_top_k_top_p(logits, sampling_tensors.top_ps,
			
 
				+                                        sampling_tensors.top_ks)
			
 
				+
			
 
				+        if do_top_as:
			
 
				+            logits = _apply_top_a(logits, sampling_tensors.top_as)
			
 
				+
			
 
				+        if do_min_p:
			
 
				+            logits = _apply_min_p(logits, sampling_tensors.min_ps)
			
 
				+
			
 
				         if do_tfss:
			
 
				             logits = _apply_tfs(logits, sampling_tensors.tfss)
			
 
				+
			
 
				         if do_eta_cutoffs:
			
 
				             logits = _apply_eta_cutoff(logits, sampling_tensors.eta_cutoffs)
			
 
				+
			
 
				         if do_epsilon_cutoffs:
			
 
				             logits = _apply_epsilon_cutoff(logits,
			
 
				                                            sampling_tensors.epsilon_cutoffs)
			
 
				+
			
 
				         if do_typical_ps:
			
 
				             logits = _apply_typical_sampling(logits,
			
 
				                                              sampling_tensors.typical_ps)
			
@@ -91,15 +106,7 @@ class Sampler(nn.Module):
 
				                 logits, sampling_tensors.smoothing_factors,
			
 
				                 sampling_tensors.smoothing_curves)
			
 
				 
			
 
				-        if do_temperatures:
			
 
				-            logits = _apply_temperature(logits, sampling_tensors.temperatures,
			
 
				-                                        # sampling_tensors.dynatemp_mins,
			
 
				-                                        # sampling_tensors.dynatemp_maxs,
			
 
				-                                        # sampling_tensors.dynatemp_exps
			
 
				-                                        )
			
 
				-
			
 
				         banned_tokens = _get_custom_token_bans(sampling_metadata)
			
 
				-        # assert len(banned_tokens) == logits.shape[0]
			
 
				         logits = _apply_token_bans(logits, banned_tokens)
			
 
				 
			
 
				         # We use float32 for probabilities and log probabilities.
			
@@ -117,12 +124,14 @@ class Sampler(nn.Module):
 
				             include_gpu_probs_tensor=self.include_gpu_probs_tensor,
			
 
				             modify_greedy_probs=self._should_modify_greedy_probs_inplace,
			
 
				         )
			
 
				+
			
 
				         if self.include_gpu_probs_tensor:
			
 
				             assert maybe_sampled_tokens_tensor is not None
			
 
				             sampled_tokens_tensor = maybe_sampled_tokens_tensor
			
 
				             on_device_tensors = (probs, sampled_tokens_tensor)
			
 
				         else:
			
 
				             on_device_tensors = None
			
 
				+
			
 
				         # Get the logprobs query results.
			
 
				         prompt_logprobs, sample_logprobs = _get_logprobs(
			
 
				             logprobs, sampling_metadata, sample_results)
			
@@ -137,8 +146,10 @@ class Sampler(nn.Module):
 
				         """Whether or not the sampler should modify the probability distribution
			
 
				         of greedily-sampled tokens such that multinomial sampling would sample
			
 
				         the greedily-sampled token.
			
 
				+
			
 
				         In other words, if True then we set the probability of the greedily-
			
 
				         sampled token to 1.
			
 
				+
			
 
				         This is used by speculative decoding, which requires that the sampling
			
 
				         method be encoded into the probability distribution.
			
 
				         """
			
@@ -258,38 +269,27 @@ def _apply_min_tokens_penalty(
 
				     return logits
			
 
				 
			
 
				 
			
 
				-def _apply_alphabet_soup(
			
 
				+def _apply_top_k_top_p(
			
 
				     logits: torch.Tensor,
			
 
				     p: torch.Tensor,
			
 
				     k: torch.Tensor,
			
 
				-    a: torch.Tensor,
			
 
				-    m: torch.Tensor,
			
 
				 ) -> torch.Tensor:
			
 
				-    logits_sort, logits_idx = logits.sort(dim=-1, descending=True)
			
 
				-
			
 
				-    # Apply top-p, min-p and top-a.
			
 
				-    probs_sort = logits_sort.softmax(dim=-1)
			
 
				-    probs_sum = probs_sort.cumsum(dim=-1).sub_(probs_sort)
			
 
				-    min_p_thresholds = probs_sort[:, 0] * m
			
 
				-    top_a_thresholds = torch.pow(probs_sort[:, 0], 2) * a
			
 
				-    threshold = torch.maximum(min_p_thresholds, top_a_thresholds)
			
 
				-    mask = (probs_sort < threshold.unsqueeze(1)
			
 
				-            )  # Cull logits below the top-a threshold
			
 
				-    mask.logical_or_(
			
 
				-        probs_sum >
			
 
				-        p.unsqueeze(dim=1))  # Cull logits above the top-p summation threshold
			
 
				-    mask[:, 0] = False  # Guarantee at least one token is pickable
			
 
				-    logits_sort[mask] = -float("inf")
			
 
				+    logits_sort, logits_idx = logits.sort(dim=-1, descending=False)
			
 
				 
			
 
				     # Apply top-k.
			
 
				-    # Create a mask for the top-k elements.
			
 
				-    top_k_mask = torch.arange(logits_idx.shape[-1], device=logits_idx.device)
			
 
				-    top_k_mask = top_k_mask.expand(logits_idx.shape[0], -1)
			
 
				-    top_k_mask = top_k_mask >= k.unsqueeze_(dim=1)
			
 
				+    top_k_mask = logits_sort.size(1) - k.to(torch.long)
			
 
				+    # Get all the top_k values.
			
 
				+    top_k_mask = logits_sort.gather(1, top_k_mask.unsqueeze(dim=1))
			
 
				+    top_k_mask = logits_sort < top_k_mask
			
 
				+    logits_sort.masked_fill_(top_k_mask, -float("inf"))
			
 
				 
			
 
				-    # Final mask.
			
 
				-    mask = (mask | top_k_mask)
			
 
				-    logits_sort.masked_fill_(mask, -float("inf"))
			
 
				+    # Apply top-p.
			
 
				+    probs_sort = logits_sort.softmax(dim=-1)
			
 
				+    probs_sum = probs_sort.cumsum(dim=-1)
			
 
				+    top_p_mask = probs_sum <= 1 - p.unsqueeze(dim=1)
			
 
				+    # Keep at least one token: never mask the last (largest) sorted logit.
			
 
				+    top_p_mask[:, -1] = False
			
 
				+    logits_sort.masked_fill_(top_p_mask, -float("inf"))
			
 
				 
			
 
				     # Re-sort the probabilities.
			
 
				     src = torch.arange(logits_idx.shape[-1],
			
@@ -301,6 +301,36 @@ def _apply_alphabet_soup(
 
				     return logits
			
 
				 
			
 
				 
			
 
				+def _apply_min_p(
			
 
				+    logits: torch.Tensor,
			
 
				+    min_p: torch.Tensor,
			
 
				+) -> torch.Tensor:
			
 
				+    """
			
 
				+    Adapted from
			
 
				+    https://github.com/oobabooga/text-generation-webui/blob/3146124ec01f02c8fb1650a6517cf1b60b537aaf/modules/sampler_hijack.py#L16C17-L16C17
			
 
				+    """
			
 
				+    probs = torch.softmax(logits, dim=-1)
			
 
				+    top_probs, _ = probs.max(dim=-1, keepdim=True)
			
 
				+    scaled_min_p = min_p.unsqueeze_(dim=1) * top_probs
			
 
				+    tokens_to_remove = probs < scaled_min_p
			
 
				+    logits = logits.masked_fill_(tokens_to_remove, -float("inf"))
			
 
				+
			
 
				+    return logits
			
 
				+
			
 
				+
			
 
				+def _apply_top_a(
			
 
				+    logits: torch.Tensor,
			
 
				+    top_a: torch.Tensor,
			
 
				+) -> torch.Tensor:
			
 
				+    probs = torch.softmax(logits, dim=-1)
			
 
				+    top_probs, _ = probs.max(dim=-1, keepdim=True)
			
 
				+    threshold = torch.pow(top_probs, 2) * top_a.unsqueeze_(dim=1)
			
 
				+    tokens_to_remove = probs < threshold
			
 
				+    logits = logits.masked_fill_(tokens_to_remove, -float("inf"))
			
 
				+
			
 
				+    return logits
			
 
				+
			
 
				+
			
 
				 def _apply_tfs(
			
 
				     logits: torch.Tensor,
			
 
				     tfs: torch.Tensor,
			
@@ -393,37 +423,6 @@ def _apply_typical_sampling(
 
				     return logits
			
 
				 
			
 
				 
			
 
				-# pulls double duty for temperature and dynatemp
			
 
				-def _apply_temperature(
			
 
				-    logits: torch.Tensor,
			
 
				-    temperatures: torch.Tensor,
			
 
				-    # dynatemp_mins: torch.Tensor,
			
 
				-    # dynatemp_maxs: torch.Tensor,
			
 
				-    # dynatemp_exps: torch.Tensor,
			
 
				-) -> torch.Tensor:
			
 
				-    # dynatemp_mask = torch.logical_or(dynatemp_mins > 0, dynatemp_maxs > 0)
			
 
				-    # dynatemp_mins = dynatemp_mins[dynatemp_mask]
			
 
				-    # dynatemp_maxs = dynatemp_maxs[dynatemp_mask]
			
 
				-    # dynatemp_exps = dynatemp_exps[dynatemp_mask]
			
 
				-    # dynatemp_mins = dynatemp_mins.clamp_(min=0)
			
 
				-
			
 
				-    # dynatemp_logits = logits[dynatemp_mask]
			
 
				-    # dynatemp_shifted_logits = torch.log_softmax(dynatemp_logits, dim=-1)
			
 
				-    # dynatemp_probs = dynatemp_shifted_logits.exp()
			
 
				-    # dynatemp_entropies = -(dynatemp_probs *
			
 
				-    #                        dynatemp_shifted_logits).nansum(dim=-1)
			
 
				-    # dynatemp_max_entropies = torch.log_(
			
 
				-    #     (dynatemp_logits > float("-inf")).sum(dim=-1).float())
			
 
				-    # normalized_entropies = dynatemp_entropies.div_(dynatemp_max_entropies)
			
 
				-    # dyn_temp = (dynatemp_mins + (dynatemp_maxs - dynatemp_mins) *
			
 
				-    #             normalized_entropies.pow_(dynatemp_exps))
			
 
				-
			
 
				-    # temperatures[dynatemp_mask] = dyn_temp
			
 
				-    # temperatures[temperatures == 0.0] = 1.0
			
 
				-    logits.div_(temperatures.unsqueeze_(dim=1))
			
 
				-    return logits
			
 
				-
			
 
				-
			
 
				 def _apply_quadratic_sampling(
			
 
				     logits: torch.Tensor,
			
 
				     smoothing_factor: torch.Tensor,
			
--- a/aphrodite/modeling/sampling_metadata.py
+++ b/aphrodite/modeling/sampling_metadata.py
@@ -58,9 +58,11 @@ class SamplingMetadata:
 
				     hidden_states = execute_model(...)
			
 
				     logits = hidden_states[sampling_metadata.selected_token_indices]
			
 
				     sample(logits)
			
 
				+
			
 
				     def sample(logits):
			
 
				         # Use categorized_sample_indices for sampling....
			
 
				     ```
			
 
				+
			
 
				     Args:
			
 
				         seq_groups: List of batched sequence groups.
			
 
				         selected_token_indices: (num_query_tokens_to_logprob). Indices to find
			
@@ -141,6 +143,7 @@ def _prepare_seq_groups(
 
				 ) -> Tuple[List[SequenceGroupToSample], List[int], Dict[
			
 
				         SamplingType, List[Tuple[int, int]]], int]:
			
 
				     """Prepare sequence groups and indices for sampling.
			
 
				+
			
 
				     Args:
			
 
				         seq_group_metadata_list: A list of sequence groups to batch.
			
 
				         prompt_lens: A list of prompt lens per sequence group.
			
@@ -149,6 +152,7 @@ def _prepare_seq_groups(
 
				             of entire prompt tokens, and it could be shorter.
			
 
				         device: A device to use for random number generator,
			
 
				             `SequenceGroupToSample.generator`.
			
 
				+
			
 
				     Returns:
			
 
				         seq_groups: A list of sequence groups to sample.
			
 
				         selected_token_indices: See the definition from `SamplingMetadata`.
			
@@ -215,6 +219,7 @@ def _prepare_seq_groups(
 
				         """
			
 
				         This block computes selected_token_indices which is used in the
			
 
				         following way.
			
 
				+
			
 
				         hidden_states = model(...)
			
 
				         logits = hidden_states[selected_token_indices]
			
 
				         """
			
@@ -232,6 +237,7 @@ def _prepare_seq_groups(
 
				         """
			
 
				         This block computes categorized_sample_indices which is used in the
			
 
				         following way.
			
 
				+
			
 
				         hidden_states = model(...)
			
 
				         logits = hidden_states[selected_token_indices]
			
 
				         def sample(logits):
			
@@ -274,6 +280,7 @@ def _prepare_seq_groups(
 
				 @dataclass
			
 
				 class SamplingTensors:
			
 
				     """Tensors for sampling."""
			
 
				+
			
 
				     temperatures: torch.Tensor
			
 
				     top_ps: torch.Tensor
			
 
				     top_ks: torch.Tensor
			
@@ -286,9 +293,6 @@ class SamplingTensors:
 
				     eta_cutoffs: torch.Tensor
			
 
				     epsilon_cutoffs: torch.Tensor
			
 
				     typical_ps: torch.Tensor
			
 
				-    dynatemp_mins: torch.Tensor
			
 
				-    dynatemp_maxs: torch.Tensor
			
 
				-    dynatemp_exps: torch.Tensor
			
 
				     smoothing_factors: torch.Tensor
			
 
				     smoothing_curves: torch.Tensor
			
 
				     sampling_seeds: torch.Tensor
			
@@ -308,7 +312,12 @@ class SamplingTensors:
 
				         extra_seeds_to_generate: int = 0,
			
 
				         extra_entropy: Optional[Tuple[int, ...]] = None
			
 
				     ) -> Tuple["SamplingTensors", bool, bool, bool, bool, bool, bool, bool,
			
 
				-               bool, bool, bool, bool]:
			
 
				+               bool, bool]:
			
 
				+        """
			
 
				+        extra_seeds_to_generate: extra seeds to generate using the
			
 
				+            user-defined seed for each sequence.
			
 
				+        extra_entropy: extra entropy to use when generating seeds.
			
 
				+        """
			
 
				         prompt_tokens: List[List[int]] = []
			
 
				         output_tokens: List[List[int]] = []
			
 
				         top_ks: List[int] = []
			
@@ -323,20 +332,15 @@ class SamplingTensors:
 
				         eta_cutoffs: List[float] = []
			
 
				         epsilon_cutoffs: List[float] = []
			
 
				         typical_ps: List[float] = []
			
 
				-        dynatemp_mins: List[float] = []
			
 
				-        dynatemp_maxs: List[float] = []
			
 
				-        dynatemp_exps: List[float] = []
			
 
				         smoothing_factors: List[float] = []
			
 
				         smoothing_curves: List[float] = []
			
 
				         sampling_seeds: List[int] = []
			
 
				         sample_indices: List[int] = []
			
 
				         prompt_best_of: List[int] = []
			
 
				-        do_temperatures = False
			
 
				         do_penalties = False
			
 
				-        do_topks = False
			
 
				-        do_topps = False
			
 
				-        do_topas = False
			
 
				-        do_minps = False
			
 
				+        do_top_p_top_k = False
			
 
				+        do_top_as = False
			
 
				+        do_min_p = False
			
 
				         do_tfss = False
			
 
				         do_eta_cutoffs = False
			
 
				         do_epsilon_cutoffs = False
			
@@ -356,38 +360,37 @@ class SamplingTensors:
 
				             f = sampling_params.frequency_penalty
			
 
				             r = sampling_params.repetition_penalty
			
 
				             top_p = sampling_params.top_p
			
 
				-            # k should not be greater than the vocab size
			
 
				-            top_k = min(sampling_params.top_k, vocab_size)
			
 
				-            top_k = vocab_size if top_k == -1 else top_k
			
 
				             top_a = sampling_params.top_a
			
 
				             min_p = sampling_params.min_p
			
 
				             tfs = sampling_params.tfs
			
 
				             eta_cutoff = sampling_params.eta_cutoff
			
 
				             epsilon_cutoff = sampling_params.epsilon_cutoff
			
 
				             typical_p = sampling_params.typical_p
			
 
				-            dynatemp_min = sampling_params.dynatemp_min
			
 
				-            dynatemp_max = sampling_params.dynatemp_max
			
 
				-            dynatemp_exp = sampling_params.dynatemp_exponent
			
 
				             smoothing_factor = sampling_params.smoothing_factor
			
 
				             smoothing_curve = sampling_params.smoothing_curve
			
 
				             seed = sampling_params.seed
			
 
				 
			
 
				             is_greedy = sampling_params.sampling_type == SamplingType.GREEDY
			
 
				 
			
 
				-            if do_temperatures is False and temperature > _SAMPLING_EPS:
			
 
				-                do_temperatures = True
			
 
				+            # k should not be greater than the vocab size.
			
 
				+            top_k = min(sampling_params.top_k, vocab_size)
			
 
				+            top_k = vocab_size if top_k == -1 else top_k
			
 
				+            if temperature < _SAMPLING_EPS:
			
 
				+                # NOTE: Zero temperature means deterministic sampling
			
 
				+                # (i.e., greedy sampling or beam search).
			
 
				+                # Set the temperature to 1 to avoid division by zero.
			
 
				+                temperature = 1.0
			
 
				+            if not do_top_p_top_k and (top_p < 1.0 - _SAMPLING_EPS
			
 
				+                                       or top_k != vocab_size):
			
 
				+                do_top_p_top_k = True
			
 
				+            if do_top_as is False and top_a > 0.0:
			
 
				+                do_top_as = True
			
 
				+            if not do_min_p and min_p > _SAMPLING_EPS:
			
 
				+                do_min_p = True
			
 
				             if not do_penalties and (abs(p) >= _SAMPLING_EPS
			
 
				                                      or abs(f) >= _SAMPLING_EPS
			
 
				                                      or abs(r - 1.0) >= _SAMPLING_EPS):
			
 
				                 do_penalties = True
			
 
				-            if do_topks is False and top_k != vocab_size:
			
 
				-                do_topks = True
			
 
				-            if do_topps is False and top_p < 1.0 - _SAMPLING_EPS:
			
 
				-                do_topps = True
			
 
				-            if do_topas is False and top_a > 0.0:
			
 
				-                do_topas = True
			
 
				-            if do_minps is False and min_p > _SAMPLING_EPS:
			
 
				-                do_minps = True
			
 
				             if do_tfss is False and tfs < 1.0 - _SAMPLING_EPS:
			
 
				                 do_tfss = True
			
 
				             if do_eta_cutoffs is False and eta_cutoff > _SAMPLING_EPS:
			
@@ -403,8 +406,8 @@ class SamplingTensors:
 
				             is_prompt = seq_group.is_prompt
			
 
				             if (seq_group.is_prompt
			
 
				                     and sampling_params.prompt_logprobs is not None):
			
 
				-                # For tokens in the prompt that we only need to get their
			
 
				-                # logprobs
			
 
				+                # For tokens in the prompt that we only need to get
			
 
				+                # their logprobs
			
 
				                 subquery_len = seq_group.subquery_len
			
 
				                 assert subquery_len is not None
			
 
				                 prefill_len = len(seq_group.prompt_logprob_indices)
			
@@ -420,9 +423,6 @@ class SamplingTensors:
 
				                 eta_cutoffs += [0] * prefill_len
			
 
				                 epsilon_cutoffs += [0] * prefill_len
			
 
				                 typical_ps += [1] * prefill_len
			
 
				-                dynatemp_mins += [dynatemp_min] * prefill_len
			
 
				-                dynatemp_maxs += [dynatemp_max] * prefill_len
			
 
				-                dynatemp_exps += [dynatemp_exp] * prefill_len
			
 
				                 smoothing_factors += [smoothing_factor] * prefill_len
			
 
				                 smoothing_curves += [smoothing_curve] * prefill_len
			
 
				                 prompt_tokens.extend([] for _ in range(prefill_len))
			
@@ -435,23 +435,20 @@ class SamplingTensors:
 
				                     seq_data = seq_group.seq_data[seq_id]
			
 
				                     prompt_tokens.append(seq_data.prompt_token_ids)
			
 
				                     output_tokens.append(seq_data.output_token_ids)
			
 
				-            temperatures += [temperature] * len(seq_ids)
			
 
				-            top_ps += [top_p] * len(seq_ids)
			
 
				-            top_ks += [top_k] * len(seq_ids)
			
 
				-            top_as += [top_a] * len(seq_ids)
			
 
				-            min_ps += [min_p] * len(seq_ids)
			
 
				-            presence_penalties += [p] * len(seq_ids)
			
 
				-            frequency_penalties += [f] * len(seq_ids)
			
 
				-            repetition_penalties += [r] * len(seq_ids)
			
 
				-            tfss += [tfs] * len(seq_ids)
			
 
				-            eta_cutoffs += [eta_cutoff] * len(seq_ids)
			
 
				-            epsilon_cutoffs += [epsilon_cutoff] * len(seq_ids)
			
 
				-            typical_ps += [typical_p] * len(seq_ids)
			
 
				-            dynatemp_mins += [dynatemp_min] * len(seq_ids)
			
 
				-            dynatemp_maxs += [dynatemp_max] * len(seq_ids)
			
 
				-            dynatemp_exps += [dynatemp_exp] * len(seq_ids)
			
 
				-            smoothing_factors += [smoothing_factor] * len(seq_ids)
			
 
				-            smoothing_curves += [smoothing_curve] * len(seq_ids)
			
 
				+                temperatures += [temperature] * len(seq_ids)
			
 
				+                top_ps += [top_p] * len(seq_ids)
			
 
				+                top_ks += [top_k] * len(seq_ids)
			
 
				+                top_as += [top_a] * len(seq_ids)
			
 
				+                min_ps += [min_p] * len(seq_ids)
			
 
				+                presence_penalties += [p] * len(seq_ids)
			
 
				+                frequency_penalties += [f] * len(seq_ids)
			
 
				+                repetition_penalties += [r] * len(seq_ids)
			
 
				+                tfss += [tfs] * len(seq_ids)
			
 
				+                eta_cutoffs += [eta_cutoff] * len(seq_ids)
			
 
				+                epsilon_cutoffs += [epsilon_cutoff] * len(seq_ids)
			
 
				+                typical_ps += [typical_p] * len(seq_ids)
			
 
				+                smoothing_factors += [smoothing_factor] * len(seq_ids)
			
 
				+                smoothing_curves += [smoothing_curve] * len(seq_ids)
			
 
				 
			
 
				             if is_prompt:
			
 
				                 prompt_best_of.append(sampling_params.best_of)
			
@@ -474,13 +471,12 @@ class SamplingTensors:
 
				         sampling_tensors = SamplingTensors.from_lists(
			
 
				             temperatures, top_ps, top_ks, top_as, min_ps, presence_penalties,
			
 
				             frequency_penalties, repetition_penalties, tfss, eta_cutoffs,
			
 
				-            epsilon_cutoffs, typical_ps, dynatemp_mins, dynatemp_maxs,
			
 
				-            dynatemp_exps, smoothing_factors, smoothing_curves, sampling_seeds,
			
 
				-            sample_indices, prompt_tokens, output_tokens, vocab_size,
			
 
				-            extra_seeds_to_generate, device, dtype)
			
 
				-        return (sampling_tensors, do_temperatures, do_penalties, do_topks,
			
 
				-                do_topps, do_topas, do_minps, do_tfss, do_eta_cutoffs,
			
 
				-                do_epsilon_cutoffs, do_typical_ps, do_quadratic)
			
 
				+            epsilon_cutoffs, typical_ps, smoothing_factors, smoothing_curves,
			
 
				+            sampling_seeds, sample_indices, prompt_tokens, output_tokens,
			
 
				+            vocab_size, extra_seeds_to_generate, device, dtype)
			
 
				+        return (sampling_tensors, do_penalties, do_top_p_top_k, do_top_as,
			
 
				+                do_min_p, do_tfss, do_eta_cutoffs, do_epsilon_cutoffs,
			
 
				+                do_typical_ps, do_quadratic)
			
 
				 
			
 
				     @classmethod
			
 
				     def from_lists(cls, temperatures: List[float], top_ps: List[float],
			
@@ -489,9 +485,7 @@ class SamplingTensors:
 
				                    frequency_penalties: List[float],
			
 
				                    repetition_penalties: List[float], tfss: List[float],
			
 
				                    eta_cutoffs: List[float], epsilon_cutoffs: List[float],
			
 
				-                   typical_ps: List[float], dynatemp_mins: List[float],
			
 
				-                   dynatemp_maxs: List[float], dynatemp_exps: List[float],
			
 
				-                   smoothing_factors: List[float],
			
 
				+                   typical_ps: List[float], smoothing_factors: List[float],
			
 
				                    smoothing_curves: List[float], sampling_seeds: List[int],
			
 
				                    sample_indices: List[int], prompt_tokens: List[List[int]],
			
 
				                    output_tokens: List[List[int]], vocab_size: int,
			
@@ -513,38 +507,52 @@ class SamplingTensors:
 
				             for tokens in output_tokens
			
 
				         ]
			
 
				 
			
 
				-        temperatures_t = torch.tensor(temperatures,
			
 
				-                                      device="cpu",
			
 
				-                                      dtype=dtype,
			
 
				-                                      pin_memory=pin_memory)
			
 
				-        top_ps_t = torch.tensor(top_ps,
			
 
				-                                device="cpu",
			
 
				-                                dtype=dtype,
			
 
				-                                pin_memory=pin_memory)
			
 
				-        top_ks_t = torch.tensor(top_ks,
			
 
				-                                device="cpu",
			
 
				-                                dtype=torch.int,
			
 
				-                                pin_memory=pin_memory)
			
 
				+        temperatures_t = torch.tensor(
			
 
				+            temperatures,
			
 
				+            device="cpu",
			
 
				+            dtype=dtype,
			
 
				+            pin_memory=pin_memory,
			
 
				+        )
			
 
				+        top_ps_t = torch.tensor(
			
 
				+            top_ps,
			
 
				+            device="cpu",
			
 
				+            dtype=dtype,
			
 
				+            pin_memory=pin_memory,
			
 
				+        )
			
 
				         top_as_t = torch.tensor(top_as,
			
 
				                                 device="cpu",
			
 
				                                 dtype=dtype,
			
 
				                                 pin_memory=pin_memory)
			
 
				-        min_ps_t = torch.tensor(min_ps,
			
 
				-                                device="cpu",
			
 
				-                                dtype=dtype,
			
 
				-                                pin_memory=pin_memory)
			
 
				-        presence_penalties_t = torch.tensor(presence_penalties,
			
 
				-                                            device="cpu",
			
 
				-                                            dtype=dtype,
			
 
				-                                            pin_memory=pin_memory)
			
 
				-        frequency_penalties_t = torch.tensor(frequency_penalties,
			
 
				-                                             device="cpu",
			
 
				-                                             dtype=dtype,
			
 
				-                                             pin_memory=pin_memory)
			
 
				-        repetition_penalties_t = torch.tensor(repetition_penalties,
			
 
				-                                              device="cpu",
			
 
				-                                              dtype=dtype,
			
 
				-                                              pin_memory=pin_memory)
			
 
				+        min_ps_t = torch.tensor(
			
 
				+            min_ps,
			
 
				+            device="cpu",
			
 
				+            dtype=dtype,
			
 
				+            pin_memory=pin_memory,
			
 
				+        )
			
 
				+        presence_penalties_t = torch.tensor(
			
 
				+            presence_penalties,
			
 
				+            device="cpu",
			
 
				+            dtype=dtype,
			
 
				+            pin_memory=pin_memory,
			
 
				+        )
			
 
				+        frequency_penalties_t = torch.tensor(
			
 
				+            frequency_penalties,
			
 
				+            device="cpu",
			
 
				+            dtype=dtype,
			
 
				+            pin_memory=pin_memory,
			
 
				+        )
			
 
				+        repetition_penalties_t = torch.tensor(
			
 
				+            repetition_penalties,
			
 
				+            device="cpu",
			
 
				+            dtype=dtype,
			
 
				+            pin_memory=pin_memory,
			
 
				+        )
			
 
				+        top_ks_t = torch.tensor(
			
 
				+            top_ks,
			
 
				+            device="cpu",
			
 
				+            dtype=torch.int,
			
 
				+            pin_memory=pin_memory,
			
 
				+        )
			
 
				         tfss_t = torch.tensor(tfss,
			
 
				                               device="cpu",
			
 
				                               dtype=dtype,
			
@@ -561,18 +569,6 @@ class SamplingTensors:
 
				                                     device="cpu",
			
 
				                                     dtype=dtype,
			
 
				                                     pin_memory=pin_memory)
			
 
				-        dynatemp_mins_t = torch.tensor(dynatemp_mins,
			
 
				-                                       device="cpu",
			
 
				-                                       dtype=dtype,
			
 
				-                                       pin_memory=pin_memory)
			
 
				-        dynatemp_maxs_t = torch.tensor(dynatemp_maxs,
			
 
				-                                       device="cpu",
			
 
				-                                       dtype=dtype,
			
 
				-                                       pin_memory=pin_memory)
			
 
				-        dynatemp_exps_t = torch.tensor(dynatemp_exps,
			
 
				-                                       device="cpu",
			
 
				-                                       dtype=dtype,
			
 
				-                                       pin_memory=pin_memory)
			
 
				         smoothing_factors_t = torch.tensor(smoothing_factors,
			
 
				                                            device="cpu",
			
 
				                                            dtype=dtype,
			
@@ -581,18 +577,24 @@ class SamplingTensors:
 
				                                           device="cpu",
			
 
				                                           dtype=dtype,
			
 
				                                           pin_memory=pin_memory)
			
 
				-        sample_indices_t = torch.tensor(sample_indices,
			
 
				-                                        device="cpu",
			
 
				-                                        dtype=torch.int,
			
 
				-                                        pin_memory=pin_memory)
			
 
				-        prompt_tensor = torch.tensor(prompt_padded_tokens,
			
 
				-                                     device=device,
			
 
				-                                     dtype=torch.long,
			
 
				-                                     pin_memory=pin_memory)
			
 
				-        output_tensor = torch.tensor(output_padded_tokens,
			
 
				-                                     device=device,
			
 
				-                                     dtype=torch.long,
			
 
				-                                     pin_memory=pin_memory)
			
 
				+        sample_indices_t = torch.tensor(
			
 
				+            sample_indices,
			
 
				+            device="cpu",
			
 
				+            dtype=torch.long,
			
 
				+            pin_memory=pin_memory,
			
 
				+        )
			
 
				+        prompt_tensor = torch.tensor(
			
 
				+            prompt_padded_tokens,
			
 
				+            device="cpu",
			
 
				+            dtype=torch.long,
			
 
				+            pin_memory=pin_memory,
			
 
				+        )
			
 
				+        output_tensor = torch.tensor(
			
 
				+            output_padded_tokens,
			
 
				+            device="cpu",
			
 
				+            dtype=torch.long,
			
 
				+            pin_memory=pin_memory,
			
 
				+        )
			
 
				         # need to transpose and make contiguous to
			
 
				         # copy the tensor correctly.
			
 
				         # [batch_size, n_seeds] -> [n_seeds, batch_size]
			
@@ -602,6 +604,7 @@ class SamplingTensors:
 
				             dtype=torch.long,
			
 
				             pin_memory=pin_memory,
			
 
				         ).T.contiguous()
			
 
				+
			
 
				         # Because the memory is pinned, we can do non-blocking
			
 
				         # transfer to device.
			
 
				 
			
@@ -613,6 +616,7 @@ class SamplingTensors:
 
				         if not extra_seeds_gpu.numel():
			
 
				             extra_seeds_gpu = None
			
 
				         sampling_seeds_gpu = sampling_seeds_gpu[:num_base_seeds]
			
 
				+
			
 
				         return cls(
			
 
				             temperatures=temperatures_t.to(device=device, non_blocking=True),
			
 
				             top_ps=top_ps_t.to(device=device, non_blocking=True),
			
@@ -629,9 +633,6 @@ class SamplingTensors:
 
				             eta_cutoffs=eta_cutoffs_t.to(device=device, non_blocking=True),
			
 
				             epsilon_cutoffs=epsilon_cutoffs_t.to(device=device,
			
 
				                                                  non_blocking=True),
			
 
				-            dynatemp_mins=dynatemp_mins_t.to(device=device, non_blocking=True),
			
 
				-            dynatemp_maxs=dynatemp_maxs_t.to(device=device, non_blocking=True),
			
 
				-            dynatemp_exps=dynatemp_exps_t.to(device=device, non_blocking=True),
			
 
				             smoothing_factors=smoothing_factors_t.to(device=device,
			
 
				                                                      non_blocking=True),
			
 
				             smoothing_curves=smoothing_curves_t.to(device=device,