AlpinDale 11 months ago
parent commit f01c668259

+0 −158
aphrodite/modeling/layers/sampler.py

@@ -7,14 +7,11 @@ import torch.nn as nn
 from aphrodite.modeling.sampling_metadata import (SamplingMetadata,
                                                   OutputMetadata,
                                                   SamplingTensors)
-from aphrodite.modeling.megatron.communication_op import (
-    tensor_model_parallel_gather)
 from aphrodite.common.sampling_params import SamplingParams, SamplingType
 from aphrodite.common.sequence import (Logprob, PromptLogprobs, SampleLogprobs,
                                        SamplerOutput, SequenceData,
                                        SequenceGroupOutput, SequenceOutput)
 from aphrodite.modeling.layers.ops.sample import sample as sample_triton
-from aphrodite.common.utils import is_neuron
 
 
 class Sampler(nn.Module):
@@ -32,118 +29,22 @@ class Sampler(nn.Module):
     parameters (e.g., sampling method, temperature, top-p, top-k, etc.).
     """
 
-    def __init__(self,
-                 vocab_size: int,
-                 org_vocab_size: Optional[int] = None) -> None:
-        super().__init__()
-        self.vocab_size = vocab_size
-        # Transformers-neuronx generates outputs as logits directly
-        self.logits_as_hidden_states = is_neuron()
-        # original vocabulary size (without LoRA).
-        self.org_vocab_size = org_vocab_size or vocab_size
-
-    def _get_logits(self, hidden_states: torch.Tensor, embedding: torch.Tensor,
-                    embedding_bias: Optional[torch.Tensor]) -> torch.Tensor:
-        # Get the logits for the next tokens.
-        logits = torch.matmul(hidden_states, embedding.t())
-        if embedding_bias is not None:
-            logits += embedding_bias
-        logits = tensor_model_parallel_gather(logits)
-        # Remove paddings in vocab (if any).
-        if logits is not None:
-            logits = logits[:, :self.org_vocab_size]
-        return logits
-
-    def forward(
-        self,
-        embedding: torch.Tensor,
-        hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-        embedding_bias: Optional[torch.Tensor] = None,
-    ) -> Optional[SamplerOutput]:
-        # Get the hidden states that we use for sampling.
-        if self.logits_as_hidden_states:
-            logits = hidden_states
-        else:
-            hidden_states = _prune_hidden_states(hidden_states,
-                                                 sampling_metadata)
-
-            # Get the logits for the next tokens.
-            logits = self._get_logits(hidden_states, embedding, embedding_bias)
-
-        return _perform_sampling(logits, sampling_metadata)
-
-
-# FIXME: This is a hack for the missing GPU blocks. This should be removed
-# once a proper fix is implemented.
-class QuantSampler(nn.Module):
-    """Samples the next tokens from the model's outputs.
-
-    This layer does the following:
-    1. Discard the hidden states that are not used for sampling (i.e., all
-        tokens except the final one in each prompt).
-    2. Compute the logits for the next tokens.
-    3. Apply presence and frequency penalties.
-    4. Apply temperature scaling.
-    5. Apply top-p and top-k truncation.
-    6. Sample the next tokens.
-    Here, each sequence group within the batch can have different sampling
-    parameters (e.g., sampling method, temperature, top-p, top-k, etc.).
-    """
-
-    def __init__(self,
-                 vocab_size: int,
-                 org_vocab_size: Optional[int] = None) -> None:
-        super().__init__()
-        self.vocab_size = vocab_size
-        # original vocabulary size (without LoRA).
-        self.org_vocab_size = org_vocab_size or vocab_size
-
-    def _get_logits(self, hidden_states: torch.Tensor, embedding: torch.Tensor,
-                    embedding_bias: Optional[torch.Tensor]) -> torch.Tensor:
-        # Get the logits for the next tokens.
-        logits = torch.matmul(hidden_states, embedding.t())
-        if embedding_bias is not None:
-            logits += embedding_bias
-        logits = tensor_model_parallel_gather(logits)
-        # Remove paddings in vocab (if any).
-        if logits is not None:
-            logits = logits[:, :self.org_vocab_size]
-        return logits
-
     def forward(
         self,
         logits: torch.Tensor,
         sampling_metadata: SamplingMetadata,
     ) -> Optional[SamplerOutput]:
-        # Get the hidden states that we use for sampling.
-        logits = _prune_hidden_states(logits, sampling_metadata)
-        logits = tensor_model_parallel_gather(logits)
-        # Remove paddings in vocab (if any).
-        if logits is not None:
-            logits = logits[:, :self.vocab_size]
 
         return _perform_sampling(logits, sampling_metadata)
 
-
 def _perform_sampling(
         logits: torch.Tensor,
         sampling_metadata: SamplingMetadata) -> Optional[SamplerOutput]:
-    # Only perform sampling in the driver worker.
-    # Note: `_get_logits` is still distributed across TP workers because
-    # the `embedding` weight is distributed across TP workers.
-    # TODO: Change the get_logits part to a separate stage.
-    if not sampling_metadata.perform_sampling:
-        return None
-
     assert logits is not None
     _, vocab_size = logits.shape
 
     output_metadata = OutputMetadata()
 
-    # Apply logits processors (if any)
-    logits = _apply_logits_processors(logits, sampling_metadata)
-
     # Prepare sampling tensors with pinned memory to avoid blocking.
     (sampling_tensors, do_temperatures, do_penalties, do_topks, do_topps,
      do_topas, do_minps, do_tfss, do_eta_cutoffs, do_epsilon_cutoffs,
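
The removed `_get_logits` above was a plain vocabulary projection followed by a tensor-parallel gather and a trim of vocab padding. Here is a minimal single-GPU sketch of the same computation with toy shapes; the distributed gather is only noted in a comment, and `compute_logits` is a name invented for this sketch:

```python
import torch

def compute_logits(hidden_states: torch.Tensor,
                   embedding: torch.Tensor,
                   org_vocab_size: int) -> torch.Tensor:
    # Project hidden states onto the (possibly padded) vocabulary:
    # [num_tokens, hidden] @ [hidden, padded_vocab] -> [num_tokens, padded_vocab]
    logits = torch.matmul(hidden_states, embedding.t())
    # In the removed code, each tensor-parallel rank held only a vocab shard,
    # and the shards were combined here with tensor_model_parallel_gather.
    # Trim the padding columns back to the original vocabulary size.
    return logits[:, :org_vocab_size]

# Toy shapes: 4 tokens, hidden size 8, vocab padded from 50 to 64.
hidden = torch.randn(4, 8)
emb = torch.randn(64, 8)
print(compute_logits(hidden, emb, org_vocab_size=50).shape)  # torch.Size([4, 50])
```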
@@ -207,15 +108,6 @@ def _perform_sampling(
                                  prompt_logprobs, sample_logprobs,
                                  output_metadata)
 
-
-def _prune_hidden_states(
-    hidden_states: torch.Tensor,
-    sampling_metadata: SamplingMetadata,
-) -> torch.Tensor:
-    return hidden_states.index_select(0,
-                                      sampling_metadata.selected_token_indices)
-
-
 def _get_bin_counts_and_mask(
     tokens: torch.Tensor,
     vocab_size: int,
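
The `_prune_hidden_states` helper removed in the hunk above was a single `index_select` over the rows named by `sampling_metadata.selected_token_indices`. A toy illustration with made-up indices:

```python
import torch

hidden_states = torch.randn(6, 8)              # 6 token positions, hidden size 8
selected_token_indices = torch.tensor([2, 5])  # e.g. last prompt token + a decode token
pruned = hidden_states.index_select(0, selected_token_indices)
print(pruned.shape)  # torch.Size([2, 8])
```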
@@ -247,56 +139,6 @@ def _get_custom_token_bans(
     return banned_tokens
 
 
-# def _apply_logits_processors(
-#     logits: torch.Tensor,
-#     metadata: SamplingMetadata,
-# ) -> torch.Tensor:
-#     seq_offset = 0
-#     for i, (seq_ids, sampling_params) in enumerate(metadata.seq_groups):
-#         seq_size = len(seq_ids)
-#         output_tokens = []
-#         if (i < metadata.num_prompts
-#                 and sampling_params.prompt_logprobs is not None):
-#             prompt_seqs = metadata.prompt_lens[i] - 1
-#             seq_size += prompt_seqs
-#             output_tokens.extend([[]] * prompt_seqs)
-#         seq_end = seq_offset + seq_size
-
-#         if sampling_params.logits_processors:
-#             output_tokens.extend(metadata.seq_data[sid].output_token_ids
-#                                  for sid in seq_ids)
-#             for proc in sampling_params.logits_processors:
-#                 proc(logits[seq_offset:seq_end], output_tokens)
-
-#         seq_offset = seq_end
-
-#     return logits
-
-
-def _apply_logits_processors(
-    logits: torch.Tensor,
-    sampling_metadata: SamplingMetadata,
-) -> torch.Tensor:
-    logits_row_idx = 0
-    found_logits_processors = False
-    for seq_ids, sampling_params in sampling_metadata.seq_groups:
-        logits_processors = sampling_params.logits_processors
-        if logits_processors:
-            found_logits_processors = True
-            for seq_id in seq_ids:
-                logits_row = logits[logits_row_idx]
-                token_ids = sampling_metadata.seq_data[seq_id].output_token_ids
-                for logits_processor in logits_processors:
-                    logits_row = logits_processor(token_ids, logits_row)
-                logits[logits_row_idx] = logits_row
-                logits_row_idx += 1
-        else:
-            logits_row_idx += len(seq_ids)
-    if found_logits_processors:
-        assert logits_row_idx == logits.shape[0]
-    return logits
-
-
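
The removed `_apply_logits_processors` walked the batch row by row and called each processor in `sampling_params.logits_processors` as `processor(output_token_ids, logits_row) -> logits_row`. A hypothetical processor matching that convention (`ban_repeats` and its behavior are invented for illustration):

```python
import torch

def ban_repeats(token_ids: list, logits: torch.Tensor) -> torch.Tensor:
    # Forbid re-generating any token that already appeared in the output
    # by forcing its logit to -inf.
    for t in set(token_ids):
        logits[t] = float("-inf")
    return logits

row = torch.zeros(16)                # one row of next-token logits
row = ban_repeats([3, 7], row)
print(row[3].item(), row[7].item())  # -inf -inf
```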
 def _apply_penalties(logits: torch.Tensor, prompt_tokens_tensor: torch.Tensor,
                      output_tokens_tensor: torch.Tensor,
                      presence_penalties: torch.Tensor,