
chore: reformats (#90)

AlpinDale, 1 year ago
parent commit efc6f7fbec
44 changed files with 408 additions and 289 deletions
  1. aphrodite/common/block.py (+1, -0)
  2. aphrodite/common/config.py (+10, -8)
  3. aphrodite/common/logits_processor.py (+5, -3)
  4. aphrodite/common/outputs.py (+3, -4)
  5. aphrodite/common/sampling_params.py (+10, -5)
  6. aphrodite/common/sequence.py (+3, -1)
  7. aphrodite/common/utils.py (+2, -1)
  8. aphrodite/endpoints/api_server_kobold.py (+59, -26)
  9. aphrodite/endpoints/api_server_ooba.py (+14, -8)
  10. aphrodite/endpoints/protocol.py (+11, -5)
  11. aphrodite/engine/aphrodite_engine.py (+7, -6)
  12. aphrodite/engine/args_tools.py (+6, -6)
  13. aphrodite/engine/async_aphrodite.py (+2, -1)
  14. aphrodite/modeling/hf_downloader.py (+5, -5)
  15. aphrodite/modeling/layers/activation.py (+7, -3)
  16. aphrodite/modeling/layers/layernorm.py (+5, -4)
  17. aphrodite/modeling/layers/quantized_linear/__init__.py (+4, -4)
  18. aphrodite/modeling/layers/quantized_linear/awq.py (+3, -2)
  19. aphrodite/modeling/layers/quantized_linear/gptq.py (+2, -2)
  20. aphrodite/modeling/layers/quantized_linear/utils.py (+7, -6)
  21. aphrodite/modeling/layers/rotary_embedding.py (+28, -21)
  22. aphrodite/modeling/layers/sampler.py (+71, -56)
  23. aphrodite/modeling/loader.py (+6, -6)
  24. aphrodite/modeling/megatron/parallel_state.py (+1, -2)
  25. aphrodite/modeling/metadata.py (+3, -2)
  26. aphrodite/modeling/models/__init__.py (+1, -2)
  27. aphrodite/modeling/models/gpt_j.py (+2, -3)
  28. aphrodite/modeling/models/gpt_neox.py (+3, -3)
  29. aphrodite/modeling/models/llama.py (+5, -6)
  30. aphrodite/modeling/models/mistral.py (+3, -3)
  31. aphrodite/modeling/quantization_utils/awq.py (+2, -2)
  32. aphrodite/modeling/quantization_utils/base.py (+3, -3)
  33. aphrodite/modeling/quantization_utils/gptq.py (+14, -13)
  34. aphrodite/modeling/utils.py (+1, -0)
  35. aphrodite/processing/policy.py (+4, -1)
  36. aphrodite/processing/scheduler.py (+2, -2)
  37. aphrodite/task_handler/worker.py (+7, -4)
  38. aphrodite/transformers_utils/config.py (+3, -1)
  39. tests/kernels/conftest.py (+9, -8)
  40. tests/kernels/test_activation.py (+4, -2)
  41. tests/models/test_models.py (+7, -3)
  42. tests/samplers/test_samplers.py (+17, -11)
  43. tests/serving.py (+40, -31)
  44. tests/throughput.py (+6, -4)

+ 1 - 0
aphrodite/common/block.py

@@ -49,6 +49,7 @@ class LogicalTokenBlock:
 
 class PhysicalTokenBlock:
     """Represents the state of a block in the KV cache."""
+
     def __init__(
         self,
         device: Device,

+ 10 - 8
aphrodite/common/config.py

@@ -261,11 +261,11 @@ class SchedulerConfig:
     """
 
     def __init__(
-            self,
-            max_num_batched_tokens: Optional[int],
-            max_num_seqs: int,
-            max_model_len: int,
-            max_paddings: int,
+        self,
+        max_num_batched_tokens: Optional[int],
+        max_num_seqs: int,
+        max_model_len: int,
+        max_paddings: int,
     ) -> None:
         if max_num_batched_tokens is not None:
             self.max_num_batched_tokens = max_num_batched_tokens
@@ -288,7 +288,8 @@ class SchedulerConfig:
         if self.max_num_batched_tokens < self.max_num_seqs:
             raise ValueError(
                 f"max_num_batched_tokens ({self.max_num_batched_tokens}) must "
-                f"be greater than or equal to max_num_seqs ({self.max_num_seqs}).")
+                f"be greater than or equal to max_num_seqs ({self.max_num_seqs})."
+            )
 
 
 _STR_DTYPE_TO_TORCH_DTYPE = {
@@ -358,7 +359,8 @@ def _get_and_verify_max_len(
     if derived_max_model_len == float('inf'):
         raise ValueError(
             "The model's config.json must contain one of the following keys "
-            f"to determine the original maximum length of the model: {possible_keys}")
+            f"to determine the original maximum length of the model: {possible_keys}"
+        )
 
     rope_scaling = getattr(hf_config, "rope_scaling", None)
     if rope_scaling is not None:
@@ -375,4 +377,4 @@ def _get_and_verify_max_len(
             " in model's config.json). This may lead to incorrect model "
             "outputs or CUDA errors. Make sure the value is correct and "
             "within the model context size.")
-    return int(max_model_len)
+    return int(max_model_len)

+ 5 - 3
aphrodite/common/logits_processor.py

@@ -6,7 +6,8 @@ from typing import Dict
 class LogitsProcessor(ABC):
 
     @abstractmethod
-    def __call__(self, logits: torch.Tensor, output_tokens: list[list[int]]) -> None:
+    def __call__(self, logits: torch.Tensor,
+                 output_tokens: list[list[int]]) -> None:
         """Logits are edited in-place"""
         pass
 
@@ -42,14 +43,15 @@ class BiasLogitsProcessor(LogitsProcessor):
                                      1 / (1 - (values / 100)))
         logits[0, keys] *= update_factors
 
-        
+
 class BanEOSUntil(LogitsProcessor):
     """Bans the EOS token until a certain condition is met.
     In this case, 'number of output tokens'.
 
     With this condition, both 'min_tokens' and 'ignore_eos'
     parameters can be handled gracefully."""
-    def __init__(self, min_tokens:int, eos_token_id:int):
+
+    def __init__(self, min_tokens: int, eos_token_id: int):
         self._min_tokens = min_tokens
         self._eos_token_id = eos_token_id
 

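Note: the hunk above only shows BanEOSUntil's docstring and constructor. As a rough sketch of how a `__call__` consistent with that docstring and with the `LogitsProcessor` interface at the top of the file could look (illustrative only, not code from this commit; the actual implementation is not shown in the diff):

```python
import torch


class BanEOSUntil:
    """Sketch: suppress the EOS logit until each sequence has min_tokens outputs."""

    def __init__(self, min_tokens: int, eos_token_id: int):
        self._min_tokens = min_tokens
        self._eos_token_id = eos_token_id

    def __call__(self, logits: torch.Tensor,
                 output_tokens: list[list[int]]) -> None:
        # Logits are edited in-place; assume one row of logits per sequence.
        for i, seq_tokens in enumerate(output_tokens):
            if len(seq_tokens) < self._min_tokens:
                logits[i][self._eos_token_id] = -float("inf")
```
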
+ 3 - 4
aphrodite/common/outputs.py

@@ -1,8 +1,7 @@
 from typing import List, Optional
 
-from aphrodite.common.sequence import (
-    PromptLogprobs, SampleLogprobs, SequenceGroup,
-    SequenceStatus)
+from aphrodite.common.sequence import (PromptLogprobs, SampleLogprobs,
+                                       SequenceGroup, SequenceStatus)
 
 
 class CompletionOutput:
@@ -117,4 +116,4 @@ class RequestOutput:
                 f"prompt_token_ids={self.prompt_token_ids}, "
                 f"prompt_logprobs={self.prompt_logprobs}, "
                 f"outputs={self.outputs}, "
-                f"finished={self.finished})")
+                f"finished={self.finished})")

+ 10 - 5
aphrodite/common/sampling_params.py

@@ -194,11 +194,15 @@ class SamplingParams:
         if not 0.0 < self.tfs <= 1.0:
             raise ValueError(f"tfs must be in (0, 1], got {self.tfs}.")
         if not 0.0 <= self.epsilon_cutoff <= 1000.0:
-            raise ValueError(f"epsilon_cutoff must be in [0, 1000], got {self.epsilon_cutoff}.")
+            raise ValueError(
+                f"epsilon_cutoff must be in [0, 1000], got {self.epsilon_cutoff}."
+            )
         if not self.eta_cutoff >= 0:
-            raise ValueError(f"eta_cutoff must be non negative, got {self.eta_cutoff}.")
+            raise ValueError(
+                f"eta_cutoff must be non negative, got {self.eta_cutoff}.")
         if not 0.0 <= self.typical_p <= 1.0:
-            raise ValueError(f"typical_p must be in (0, 1], got {self.typical_p}.")
+            raise ValueError(
+                f"typical_p must be in (0, 1], got {self.typical_p}.")
         if self.max_tokens < 1:
             raise ValueError(
                 f"max_tokens must be at least 1, got {self.max_tokens}.")
@@ -207,7 +211,8 @@ class SamplingParams:
                 f"logprobs must be non-negative, got {self.logprobs}.")
         if self.prompt_logprobs is not None and self.prompt_logprobs < 0:
             raise ValueError(
-                f"prompt_logprobs must be non-negative, got {self.prompt_logprobs}.")
+                f"prompt_logprobs must be non-negative, got {self.prompt_logprobs}."
+            )
 
     def _verify_beam_search(self) -> None:
         if self.best_of == 1:
@@ -274,4 +279,4 @@ class SamplingParams:
                 f"custom_token_bans={self.custom_token_bans}, "
                 f"logprobs={self.logprobs}, "
                 f"prompt_logprobs={self.prompt_logprobs}, "
-                f"skip_special_tokens={self.skip_special_tokens})")
+                f"skip_special_tokens={self.skip_special_tokens})")

+ 3 - 1
aphrodite/common/sequence.py

@@ -382,6 +382,7 @@ class SequenceOutputs:
                 and self.output_token == other.output_token
                 and self.logprobs == other.logprobs)
 
+
 class SequenceGroupOutputs:
     """The model outputs associated with a sequence group."""
 
@@ -403,6 +404,7 @@ class SequenceGroupOutputs:
         return (self.samples == other.samples
                 and self.prompt_logprobs == other.prompt_logprobs)
 
+
 # For each sequence group, we generate a list of SequenceOutputs object,
 # each of which contains one possible candidate for the next token.
-SamplerOutput = List[SequenceGroupOutputs]
+SamplerOutput = List[SequenceGroupOutputs]

+ 2 - 1
aphrodite/common/utils.py

@@ -8,6 +8,7 @@ import torch
 
 from aphrodite import cuda_utils
 
+
 class Device(enum.Enum):
     GPU = enum.auto()
     CPU = enum.auto()
@@ -52,4 +53,4 @@ def random_uuid() -> str:
 
 def in_wsl() -> bool:
     # Reference: https://github.com/microsoft/WSL/issues/4071
-    return "microsoft" in " ".join(uname()).lower()
+    return "microsoft" in " ".join(uname()).lower()

+ 59 - 26
aphrodite/endpoints/api_server_kobold.py

@@ -30,15 +30,17 @@ engine: AsyncAphrodite = None
 
 badwordsids: List[int] = []
 
+
 def _set_badwords(tokenizer, hf_config):
     global badwordsids
     if hf_config.bad_words_ids is not None:
         badwordsids = hf_config.bad_words_ids
         return
-    
-    badwordsids = [ v for k, v in tokenizer.get_vocab().items()
-                    if any(c in str(k) for c in "[]")
-                  ]
+
+    badwordsids = [
+        v for k, v in tokenizer.get_vocab().items()
+        if any(c in str(k) for c in "[]")
+    ]
     if tokenizer.pad_token_id in badwordsids:
         badwordsids.remove(tokenizer.pad_token_id)
     badwordsids.append(tokenizer.eos_token_id)
@@ -57,22 +59,30 @@ app.add_middleware(
     allow_headers=["*"],
 )
 
-def create_error_response(status_code: HTTPStatus, message: str) -> JSONResponse:
-    return JSONResponse({"msg": message, "type": "invalid_request_error"},
+
+def create_error_response(status_code: HTTPStatus,
+                          message: str) -> JSONResponse:
+    return JSONResponse({
+        "msg": message,
+        "type": "invalid_request_error"
+    },
                         status_code=status_code.value)
 
+
 @app.exception_handler(ValueError)
 def validation_exception_handler(request, exc):  # pylint: disable=unused-argument
     return create_error_response(HTTPStatus.UNPROCESSABLE_ENTITY, str(exc))
 
-def prepare_engine_payload(kai_payload: KAIGenerationInputSchema) -> Tuple[SamplingParams, List[int]]:
+
+def prepare_engine_payload(
+        kai_payload: KAIGenerationInputSchema
+) -> Tuple[SamplingParams, List[int]]:
     """Create SamplingParams and truncated input tokens for AsyncEngine"""
 
     if kai_payload.max_context_length > max_model_len:
         raise ValueError(
             f"max_context_length ({kai_payload.max_context_length}) must be less than or equal to "
-            f"max_model_len ({max_model_len})"
-        )
+            f"max_model_len ({max_model_len})")
 
     sampling_params = SamplingParams(max_tokens=kai_payload.max_length)
 
@@ -86,7 +96,6 @@ def prepare_engine_payload(kai_payload: KAIGenerationInputSchema) -> Tuple[Sampl
         kai_payload.top_p = 1.0
         kai_payload.top_k = -1
 
-
     sampling_params = SamplingParams(
         n=kai_payload.n,
         best_of=kai_payload.n,
@@ -100,38 +109,47 @@ def prepare_engine_payload(kai_payload: KAIGenerationInputSchema) -> Tuple[Sampl
         eta_cutoff=kai_payload.eta_cutoff,
         epsilon_cutoff=kai_payload.eps_cutoff,
         stop=kai_payload.stop_sequence,
-        custom_token_bans=badwordsids if kai_payload.use_default_badwordsids else [],
+        custom_token_bans=badwordsids
+        if kai_payload.use_default_badwordsids else [],
         max_tokens=kai_payload.max_length,
     )
 
-    max_input_tokens = max(1, kai_payload.max_context_length - kai_payload.max_length)
+    max_input_tokens = max(
+        1, kai_payload.max_context_length - kai_payload.max_length)
     input_tokens = tokenizer(kai_payload.prompt).input_ids[-max_input_tokens:]
 
     return sampling_params, input_tokens
 
+
 @kai_api.post("/generate")
 async def generate(kai_payload: KAIGenerationInputSchema) -> JSONResponse:
     """ Generate text """
 
     req_id = f"kai-{random_uuid()}"
     sampling_params, input_tokens = prepare_engine_payload(kai_payload)
-    result_generator = engine.generate(None, sampling_params, req_id, input_tokens)
+    result_generator = engine.generate(None, sampling_params, req_id,
+                                       input_tokens)
 
     final_res: RequestOutput = None
     async for res in result_generator:
         final_res = res
     assert final_res is not None
 
-    return JSONResponse({"results": [{"text": output.text} for output in final_res.outputs]})
+    return JSONResponse(
+        {"results": [{
+            "text": output.text
+        } for output in final_res.outputs]})
 
 
 @extra_api.post("/generate/stream")
-async def generate_stream(kai_payload: KAIGenerationInputSchema) -> StreamingResponse:
+async def generate_stream(
+        kai_payload: KAIGenerationInputSchema) -> StreamingResponse:
     """ Generate text SSE streaming """
 
     req_id = f"kai-{random_uuid()}"
     sampling_params, input_tokens = prepare_engine_payload(kai_payload)
-    results_generator = engine.generate(None, sampling_params, req_id, input_tokens)
+    results_generator = engine.generate(None, sampling_params, req_id,
+                                        input_tokens)
 
     async def stream_kobold() -> AsyncGenerator[bytes, None]:
         previous_output = ""
@@ -142,44 +160,55 @@ async def generate_stream(kai_payload: KAIGenerationInputSchema) -> StreamingRes
             yield f"data: {json.dumps({'token': new_chunk})}\n\n".encode()
 
     return StreamingResponse(stream_kobold(),
-                             headers={"Cache-Control": "no-cache", "Connection": "keep-alive"},
+                             headers={
+                                 "Cache-Control": "no-cache",
+                                 "Connection": "keep-alive"
+                             },
                              media_type='text/event-stream')
 
+
 @extra_api.post("/generate/check")
 async def check_generation():
     """ stub for compatibility """
     return JSONResponse({"results": [{"text": ""}]})
 
+
 @kai_api.get("/info/version")
 async def get_version():
     """ Impersonate KAI """
     return JSONResponse({"result": "1.2.4"})
 
+
 @kai_api.get("/model")
 async def get_model():
     """ Get current model """
     return JSONResponse({"result": f"aphrodite/{served_model}"})
 
+
 @kai_api.get("/config/soft_prompts_list")
 async def get_available_softprompts():
     """ stub for compatibility """
-    return JSONResponse({"values":[]})
+    return JSONResponse({"values": []})
+
 
 @kai_api.get("/config/soft_prompt")
 async def get_current_softprompt():
     """ stub for compatibility """
     return JSONResponse({"value": ""})
 
+
 @kai_api.put("/config/soft_prompt")
 async def set_current_softprompt():
     """ stub for compatibility """
     return JSONResponse({})
 
+
 @app.get("/api/latest/config/max_context_length")
 async def get_max_context_length() -> JSONResponse:
     """Return the max context length based on the EngineArgs configuration."""
     max_context_length = engine_model_config.max_model_len
-    return JSONResponse({"value": max_context_length })
+    return JSONResponse({"value": max_context_length})
+
 
 @app.get("/api/latest/config/max_length")
 async def get_max_length() -> JSONResponse:
@@ -187,22 +216,25 @@ async def get_max_length() -> JSONResponse:
     max_length = args.max_length
     return JSONResponse({"value": max_length})
 
+
 @extra_api.post("/abort")
 async def abort_generation():
     """ stub for compatibility """
     return JSONResponse({})
 
+
 @extra_api.get("/version")
 async def get_extra_version():
     """ Impersonate KoboldCpp with streaming support """
     return JSONResponse({"result": "KoboldCpp", "version": "1.30"})
 
+
 @app.get("/")
 async def get_kobold_lite_ui():
     """Serves a cached copy of the Kobold Lite UI, loading it from disk on demand if needed."""
     #read and return embedded kobold lite
     global kobold_lite_ui
-    if kobold_lite_ui=="":
+    if kobold_lite_ui == "":
         scriptpath = os.path.dirname(os.path.abspath(__file__))
         klitepath = os.path.join(scriptpath, "klite.embd")
         if os.path.exists(klitepath):
@@ -212,6 +244,7 @@ async def get_kobold_lite_ui():
             print("Embedded Kobold Lite not found")
     return HTMLResponse(content=kobold_lite_ui)
 
+
 app.include_router(kai_api, prefix="/api/v1")
 app.include_router(kai_api, prefix="/api/latest", include_in_schema=False)
 app.include_router(extra_api, prefix="/api/extra")
@@ -231,10 +264,10 @@ if __name__ == "__main__":
                         "specified, the model name will be the same as "
                         "the huggingface name.")
     parser.add_argument("--max-length",
-                    type=int,
-                    default=256,
-                    help="The maximum length of the generated text. "
-                    "For use with Kobold Horde.")
+                        type=int,
+                        default=256,
+                        help="The maximum length of the generated text. "
+                        "For use with Kobold Horde.")
 
     parser = AsyncEngineArgs.add_cli_args(parser)
     global args
@@ -256,11 +289,11 @@ if __name__ == "__main__":
     tokenizer = get_tokenizer(engine_args.tokenizer,
                               tokenizer_mode=engine_args.tokenizer_mode,
                               trust_remote_code=engine_args.trust_remote_code)
-    
+
     _set_badwords(tokenizer, engine_model_config.hf_config)
 
     uvicorn.run(app,
                 host=args.host,
                 port=args.port,
                 log_level="info",
-                timeout_keep_alive=TIMEOUT_KEEP_ALIVE)
+                timeout_keep_alive=TIMEOUT_KEEP_ALIVE)

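For context on the rewrapped lines in prepare_engine_payload above: the prompt is truncated from the left so that the prompt plus the generation budget still fits in max_context_length. A tiny numeric illustration (values invented for the example, not taken from the commit):

```python
# Illustrative values only; mirrors the truncation logic in
# prepare_engine_payload above.
max_context_length = 2048        # kai_payload.max_context_length
max_length = 512                 # kai_payload.max_length (tokens to generate)
prompt_ids = list(range(3000))   # stand-in for tokenizer(prompt).input_ids

max_input_tokens = max(1, max_context_length - max_length)  # 1536
input_tokens = prompt_ids[-max_input_tokens:]                # keep only the tail

assert len(input_tokens) == 1536
assert input_tokens[0] == 1464   # the first 1464 prompt tokens were dropped
```
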
+ 14 - 8
aphrodite/endpoints/api_server_ooba.py

@@ -31,8 +31,10 @@ app.add_middleware(
     allow_headers=["*"],
 )
 
+
 @app.post("/api/v1/generate")
-async def generate(request: Request, x_api_key: str = Header(None)) -> Response:
+async def generate(
+    request: Request, x_api_key: str = Header(None)) -> Response:
     """Generate completion for the request.
 
     The request should be a JSON object with the following fields:
@@ -41,12 +43,13 @@ async def generate(request: Request, x_api_key: str = Header(None)) -> Response:
     - other fields: the sampling parameters (See `SamplingParams` for details).
     """
     if x_api_key is None or x_api_key != valid_api_key:
-        raise HTTPException(status_code=401, detail="Unauthorized. Please acquire an API key.")
+        raise HTTPException(status_code=401,
+                            detail="Unauthorized. Please acquire an API key.")
 
     request_dict = await request.json()
     prompt = request_dict.pop("prompt")
     stream = request_dict.pop("stream", False)
-    
+
     if 'stopping_strings' in request_dict:
         request_dict['stop'] = request_dict.pop('stopping_strings')
     if 'max_new_tokens' in request_dict:
@@ -61,11 +64,14 @@ async def generate(request: Request, x_api_key: str = Header(None)) -> Response:
     request_dict['logits_processors'] = []
 
     min_length = request_dict.pop('min_tokens', 0)
-    if request_dict.get('ignore_eos', False):  # ignore_eos/ban_eos_token is functionally equivalent to `min_tokens = max_tokens`
+    if request_dict.get(
+            'ignore_eos', False
+    ):  # ignore_eos/ban_eos_token is functionally equivalent to `min_tokens = max_tokens`
         min_length = request_dict.get('max_tokens', 16)
 
     if min_length:
-        request_dict['logits_processors'].append(BanEOSUntil(min_length, engine.engine.tokenizer.eos_token_id))
+        request_dict['logits_processors'].append(
+            BanEOSUntil(min_length, engine.engine.tokenizer.eos_token_id))
 
     sampling_params = SamplingParams()
     for key, value in request_dict.items():
@@ -80,9 +86,9 @@ async def generate(request: Request, x_api_key: str = Header(None)) -> Response:
     async def stream_results() -> AsyncGenerator[bytes, None]:
         async for request_output in results_generator:
             prompt = request_output.prompt
-            text_outputs = [
-                {"text": output.text} for output in request_output.outputs
-            ]
+            text_outputs = [{
+                "text": output.text
+            } for output in request_output.outputs]
             ret = {"results": text_outputs}
             yield (json.dumps(ret) + "\n\n").encode("utf-8")
 

+ 11 - 5
aphrodite/endpoints/protocol.py

@@ -1,6 +1,7 @@
 from typing import List, Optional, Union
 from pydantic import BaseModel, Field, root_validator, conint, confloat, conlist, NonNegativeFloat, NonNegativeInt, PositiveInt
 
+
 class SamplingParams(BaseModel):
     n: int = Field(1, alias="n")
     best_of: Optional[int] = Field(None, alias="best_of")
@@ -20,16 +21,19 @@ class SamplingParams(BaseModel):
     ignore_eos: bool = Field(False, alias="ignore_eos")
     max_tokens: int = Field(16, alias="max_length")
     logprobs: Optional[int] = Field(None, alias="logprobs")
-    custom_token_bans: Optional[List[int]] = Field(None, alias="custom_token_bans")
+    custom_token_bans: Optional[List[int]] = Field(None,
+                                                   alias="custom_token_bans")
 
     @root_validator
     def validate_best_of(cls, values):
         best_of = values.get("best_of")
         n = values.get("n")
         if best_of is not None and (best_of <= 0 or best_of > n):
-            raise ValueError("best_of must be a positive integer less than or equal to n")
+            raise ValueError(
+                "best_of must be a positive integer less than or equal to n")
         return values
 
+
 class KAIGenerationInputSchema(BaseModel):
     prompt: str
     n: Optional[conint(ge=1, le=5)] = 1
@@ -42,7 +46,7 @@ class KAIGenerationInputSchema(BaseModel):
     top_a: Optional[NonNegativeFloat] = 0.0
     top_p: Optional[confloat(ge=0, le=1)] = 1.0
     tfs: Optional[confloat(ge=0, le=1)] = 1.0
-    eps_cutoff: Optional[confloat(ge=0,le=1000)] = 0.0
+    eps_cutoff: Optional[confloat(ge=0, le=1000)] = 0.0
     eta_cutoff: Optional[NonNegativeFloat] = 0.0
     typical: Optional[confloat(ge=0, le=1)] = 1.0
     temperature: Optional[NonNegativeFloat] = 1.0
@@ -67,5 +71,7 @@ class KAIGenerationInputSchema(BaseModel):
 
     @root_validator
     def check_context(cls, values):
-        assert values.get("max_length") <= values.get("max_context_length"), f"max_length must not be larger than max_context_length"
-        return values
+        assert values.get("max_length") <= values.get(
+            "max_context_length"
+        ), f"max_length must not be larger than max_context_length"
+        return values

+ 7 - 6
aphrodite/engine/aphrodite_engine.py

@@ -4,18 +4,19 @@ from functools import partial
 from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Tuple, Union
 
 from aphrodite.common.config import (CacheConfig, ModelConfig, ParallelConfig,
-                         SchedulerConfig)
+                                     SchedulerConfig)
 from aphrodite.processing.scheduler import Scheduler, SchedulerOutputs
 from aphrodite.engine.args_tools import EngineArgs
 from aphrodite.engine.ray_tools import RayWorker, initialize_cluster, ray
 from aphrodite.common.logger import init_logger
 from aphrodite.common.outputs import RequestOutput
 from aphrodite.common.sampling_params import SamplingParams
-from aphrodite.common.sequence import (
-    SamplerOutput, Sequence, SequenceGroup, SequenceGroupMetadata,
-    SequenceGroupOutputs, SequenceOutputs, SequenceStatus)
+from aphrodite.common.sequence import (SamplerOutput, Sequence, SequenceGroup,
+                                       SequenceGroupMetadata,
+                                       SequenceGroupOutputs, SequenceOutputs,
+                                       SequenceStatus)
 from aphrodite.transformers_utils.tokenizer import (detokenize_incrementally,
-                                               get_tokenizer)
+                                                    get_tokenizer)
 from aphrodite.common.utils import Counter
 
 if ray:
@@ -707,4 +708,4 @@ class AphroditeEngine:
         output = all_outputs[0]
         for other_output in all_outputs[1:]:
             assert output == other_output
-        return output
+        return output

+ 6 - 6
aphrodite/engine/args_tools.py

@@ -3,7 +3,8 @@ import dataclasses
 from dataclasses import dataclass
 from typing import Optional, Tuple
 
-from aphrodite.common.config import (CacheConfig, ModelConfig, ParallelConfig, SchedulerConfig)
+from aphrodite.common.config import (CacheConfig, ModelConfig, ParallelConfig,
+                                     SchedulerConfig)
 
 
 @dataclass
@@ -180,10 +181,9 @@ class EngineArgs:
                                    self.download_dir, self.load_format,
                                    self.dtype, self.seed, self.revision,
                                    self.max_model_len, self.quantization)
-        cache_config = CacheConfig(self.block_size,
-                                   self.gpu_memory_utilization,
-                                   self.swap_space, getattr(model_config.hf_config,
-                                                            'sliding_window', None))
+        cache_config = CacheConfig(
+            self.block_size, self.gpu_memory_utilization, self.swap_space,
+            getattr(model_config.hf_config, 'sliding_window', None))
         parallel_config = ParallelConfig(self.pipeline_parallel_size,
                                          self.tensor_parallel_size,
                                          self.worker_use_ray)
@@ -218,4 +218,4 @@ class AsyncEngineArgs(EngineArgs):
                             help='max number of prompt characters or prompt '
                             'ID numbers being printed in log. '
                             'Default: unlimited.')
-        return parser
+        return parser

+ 2 - 1
aphrodite/engine/async_aphrodite.py

@@ -168,6 +168,7 @@ class RequestTracker:
     async def wait_for_new_requests(self):
         await self.new_requests_event.wait()
 
+
 class _AsyncAphrodite(AphroditeEngine):
     """Extension of AphroditeEngine to add async methods."""
 
@@ -490,4 +491,4 @@ class AsyncAphrodite:
                      log_stats=not engine_args.disable_log_stats,
                      max_log_len=engine_args.max_log_len,
                      start_engine_loop=start_engine_loop)
-        return engine
+        return engine

+ 5 - 5
aphrodite/modeling/hf_downloader.py

@@ -17,7 +17,6 @@ from aphrodite.common.logger import init_logger
 from aphrodite.modeling.quantization_utils import get_quant_class
 from aphrodite.modeling.quantization_utils.base import QuantizationConfig
 
-
 logger = init_logger(__name__)
 
 
@@ -92,7 +91,7 @@ def get_quant_config(
     if quantization == "gptq" and hasattr(hf_config, "quantization_config"):
         config = hf_config.quantization_config
         return get_quant_class(quantization).from_config(config)
-    
+
     is_local = os.path.isdir(model_name_or_path)
     if not is_local:
         # Download the config files.
@@ -307,7 +306,7 @@ def load_tensor_parallel_weights(
             else:
                 index = [slice(None)] * (len(loaded_weight.get_shape()) -
                                          1) + [slice(start_idx, end_idx)]
-                loaded_weight = loaded_weight[index] 
+                loaded_weight = loaded_weight[index]
             break
 
     loaded_weight = convert_pyslice_to_tensor(loaded_weight)
@@ -340,7 +339,8 @@ def get_parallel_weight(model: torch.nn.Module):
         row_weight_suffixes = ["weight"]
         ignore_weight_suffixes = []
     else:
-        column_weight_suffixes = model.quant_config.get_column_tp_tensor_names()
+        column_weight_suffixes = model.quant_config.get_column_tp_tensor_names(
+        )
         row_weight_suffixes = model.quant_config.get_row_tp_tensor_names()
         ignore_weight_suffixes = model.quant_config.get_ignore_tensor_names()
 
@@ -357,4 +357,4 @@ def get_parallel_weight(model: torch.nn.Module):
         for layer in model.parallel_vocab_layers:
             for suffix in ["weight", "bias"]:
                 column_parallel_weights.append(f"{layer}.{suffix}")
-    return column_parallel_weights, row_parallel_weights, ignore_weight_suffixes
+    return column_parallel_weights, row_parallel_weights, ignore_weight_suffixes

+ 7 - 3
aphrodite/modeling/layers/activation.py

@@ -4,7 +4,6 @@ import torch.nn as nn
 from aphrodite import activation_ops
 
 
-
 class SiluAndMul(nn.Module):
     """An activation function for SwiGLU.
 
@@ -27,13 +26,15 @@ class SiluAndMul(nn.Module):
         activation_ops.silu_and_mul(out, x)
         return out
 
+
 class NewGELU(nn.Module):
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         out = torch.empty_like(x)
         activation_ops.gelu_new(out, x)
         return out
-    
+
+
 class FastGELU(nn.Module):
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -41,6 +42,7 @@ class FastGELU(nn.Module):
         activation_ops.gelu_fast(out, x)
         return out
 
+
 _ACTIVATION_REGISTRY = {
     "gelu": nn.GELU(),
     "gelu_new": NewGELU(),
@@ -49,9 +51,11 @@ _ACTIVATION_REGISTRY = {
     "relu": nn.ReLU(),
 }
 
+
 def get_act_fn(act_fn: str) -> nn.Module:
     """Get an activation function by name."""
     act_fn = act_fn.lower()
     if act_fn in _ACTIVATION_REGISTRY:
         return _ACTIVATION_REGISTRY[act_fn]
-    raise ValueError(f"Activation function {act_fn!r} is currently not supported.")
+    raise ValueError(
+        f"Activation function {act_fn!r} is currently not supported.")

+ 5 - 4
aphrodite/modeling/layers/layernorm.py

@@ -4,6 +4,7 @@ import torch.nn as nn
 
 from aphrodite import layernorm_ops
 
+
 class RMSNorm(nn.Module):
     """Root mean square normalization.
 
@@ -12,9 +13,9 @@ class RMSNorm(nn.Module):
     """
 
     def __init__(
-        self,
-        hidden_size: int,
-        eps: float = 1e-6, # the epsilon value used by llama models
+            self,
+            hidden_size: int,
+            eps: float = 1e-6,  # the epsilon value used by llama models
     ) -> None:
         super().__init__()
         self.weight = nn.Parameter(torch.ones(hidden_size))
@@ -28,4 +29,4 @@ class RMSNorm(nn.Module):
             self.weight.data,
             self.variance_epsilon,
         )
-        return out
+        return out

+ 4 - 4
aphrodite/modeling/layers/quantized_linear/__init__.py

@@ -2,10 +2,10 @@ from torch import nn
 
 from aphrodite.modeling.layers.quantized_linear.awq import (
     AWQColumnParallelLinear, AWQRowParallelLinear)
-from aphrodite.modeling.layers.quantized_linear.gptq import(
+from aphrodite.modeling.layers.quantized_linear.gptq import (
     GPTQColumnParallelLinear, GPTQRowParallelLinear, GPTQLinear)
-from aphrodite.modeling.megatron.layers import (
-    ColumnParallelLinear, RowParallelLinear)
+from aphrodite.modeling.megatron.layers import (ColumnParallelLinear,
+                                                RowParallelLinear)
 
 _QUANTIZED_LINEAR_REGISTRY = {
     "awq": (AWQColumnParallelLinear, AWQRowParallelLinear, None),
@@ -57,4 +57,4 @@ class ParallelLinear:
             raise ValueError(f"No quantized linear is found for {name}")
 
         quant_linear_cls = _QUANTIZED_LINEAR_REGISTRY[name][1]
-        return quant_linear_cls(*args, **kwargs)
+        return quant_linear_cls(*args, **kwargs)

+ 3 - 2
aphrodite/modeling/layers/quantized_linear/awq.py

@@ -5,8 +5,9 @@ import torch
 from torch.nn.parameter import Parameter
 
 from aphrodite import quantization_ops
-from aphrodite.modeling.megatron.layers import (
-    ColumnParallelLinear, RowParallelLinear)
+from aphrodite.modeling.megatron.layers import (ColumnParallelLinear,
+                                                RowParallelLinear)
+
 
 class AWQColumnParallelLinear(ColumnParallelLinear):
 

+ 2 - 2
aphrodite/modeling/layers/quantized_linear/gptq.py

@@ -5,7 +5,7 @@ from torch.nn.parameter import Parameter
 
 from aphrodite import quantization_ops
 from aphrodite.modeling.megatron.layers import (ColumnParallelLinear,
-                                                       RowParallelLinear)
+                                                RowParallelLinear)
 
 
 class GPTQLinear(torch.nn.Module):
@@ -262,4 +262,4 @@ class GPTQRowParallelLinear(RowParallelLinear):
                                                  self.scales.float(),
                                                  self.qzeros, self.g_idx)
             output = output.half()
-        return output.reshape(out_shape)
+        return output.reshape(out_shape)

+ 7 - 6
aphrodite/modeling/layers/quantized_linear/utils.py

@@ -6,6 +6,7 @@ from aphrodite import quantization_ops
 from aphrodite.modeling.layers.quantized_linear.gptq import (
     GPTQColumnParallelLinear, GPTQRowParallelLinear, GPTQLinear)
 
+
 def quant_post_init(model, max_input_length: Optional[int] = None):
     device_to_buffers_size = {}
 
@@ -26,7 +27,7 @@ def quant_post_init(model, max_input_length: Optional[int] = None):
             device_to_buffers_size[device]["max_dq_buffer_size"] = max(
                 device_to_buffers_size[device]["max_dq_buffer_size"],
                 submodule.qweight.numel() * 8)
-            
+
             in_features = submodule.input_size_per_partition if isinstance(
                 submodule, GPTQRowParallelLinear) else submodule.input_size
             out_features = submodule.output_size_per_partition if isinstance(
@@ -36,7 +37,7 @@ def quant_post_init(model, max_input_length: Optional[int] = None):
                 device_to_buffers_size[device]["max_inner_outer_dim"] = max(
                     device_to_buffers_size[device]["max_inner_outer_dim"],
                     in_features, out_features)
-    
+
     if model_uses_exllama:
         device_to_buffers = {}
         max_input_len = max_input_length if use_act_order else 1
@@ -64,20 +65,20 @@ def quant_post_init(model, max_input_length: Optional[int] = None):
             quantization_ops.gptq_prepare_buffers(device,
                                                   buffers["temp_state"],
                                                   buffers["temp_dq"])
-            
+
         matmul_recons_thd = 8
         matmul_fused_remap = False
         matmul_no_half2 = False
         quantization_ops.gptq_set_tuning_params(matmul_recons_thd,
                                                 matmul_fused_remap,
                                                 matmul_no_half2)
-        
+
         # the buffers need to have been initialized first before calling make_q4
         for _, submodule in model.named_modules():
             if isinstance(
-                submodule,
+                    submodule,
                 (GPTQColumnParallelLinear, GPTQRowParallelLinear, GPTQLinear)):
                 submodule.post_init()
 
         torch.cuda.empty_cache()
-    return model
+    return model

+ 28 - 21
aphrodite/modeling/layers/rotary_embedding.py

@@ -169,7 +169,8 @@ class DynamicNTKScalingRotaryEmbedding(RotaryEmbedding):
         sin = freqs.sin()
         cache = torch.cat((cos, sin), dim=-1)
         return cache
-    
+
+
 def _yarn_find_correction_dim(num_rotations: int,
                               dim: int,
                               base: float = 10000,
@@ -178,6 +179,7 @@ def _yarn_find_correction_dim(num_rotations: int,
                            (num_rotations * 2 * math.pi))) / (2 *
                                                               math.log(base))
 
+
 def _yarn_find_correction_range(low_rot: int,
                                 high_rot: int,
                                 dim: int,
@@ -186,8 +188,10 @@ def _yarn_find_correction_range(low_rot: int,
     low = math.floor(
         _yarn_find_correction_dim(low_rot, dim, base, max_position_embeddings))
     high = math.ceil(
-        _yarn_find_correction_dim(high_rot, dim, base, max_position_embeddings))
-    return max(low, 0), min(high, dim - 1) # clamp values just in case
+        _yarn_find_correction_dim(high_rot, dim, base,
+                                  max_position_embeddings))
+    return max(low, 0), min(high, dim - 1)  # clamp values just in case
+
 
 def _yarn_linear_ramp_mask(low: float, high: float, dim: int,
                            dtype: torch.dtype,
@@ -200,6 +204,7 @@ def _yarn_linear_ramp_mask(low: float, high: float, dim: int,
     ramp_func = torch.clamp(linear_func, 0, 1)
     return ramp_func
 
+
 def _yarn_get_mscale(scale: float = 1) -> float:
     if scale <= 1:
         return 1.0
@@ -210,18 +215,18 @@ class YaRNScalingRotaryEmbedding(RotaryEmbedding):
     """Rotary embedding extended with YaRN method (Peng et al.)"""
 
     def __init__(
-            self,
-            head_size: int,
-            rotary_dim: int,
-            max_position_embeddings: int,
-            base: int,
-            is_neox_style: bool,
-            scaling_factor: float,
-            *,
-            extrapolation_factor: float = 1,
-            attn_factor: float = 1,
-            beta_fast: float = 32,
-            beta_slow: float = 1,
+        self,
+        head_size: int,
+        rotary_dim: int,
+        max_position_embeddings: int,
+        base: int,
+        is_neox_style: bool,
+        scaling_factor: float,
+        *,
+        extrapolation_factor: float = 1,
+        attn_factor: float = 1,
+        beta_fast: float = 32,
+        beta_slow: float = 1,
     ) -> None:
         self.scaling_factor = scaling_factor
         self.extrapolation_factor = extrapolation_factor
@@ -229,9 +234,11 @@ class YaRNScalingRotaryEmbedding(RotaryEmbedding):
         self.beta_fast = beta_fast
         self.beta_slow = beta_slow
         self.mscale = float(
-            _yarn_get_mscale(self.scaling_factor) * attn_factor) # get n-d magnitude scaling corrected for interpolation
-        super().__init__(head_size, rotary_dim, max_position_embeddings, base, is_neox_style)
-    
+            _yarn_get_mscale(self.scaling_factor) * attn_factor
+        )  # get n-d magnitude scaling corrected for interpolation
+        super().__init__(head_size, rotary_dim, max_position_embeddings, base,
+                         is_neox_style)
+
     def _compute_inv_freq(self, scaling_factor: float) -> torch.Tensor:
         pos_freqs = self.base**(torch.arange(
             0, self.rotary_dim, 2, dtype=torch.float, device="cuda") /
@@ -242,14 +249,14 @@ class YaRNScalingRotaryEmbedding(RotaryEmbedding):
         low, high = _yarn_find_correction_range(self.beta_fast, self.beta_slow,
                                                 self.rotary_dim, self.base,
                                                 self.max_position_embeddings)
-        
+
         inv_freq_mask = (1 - _yarn_linear_ramp_mask(
             low, high, self.rotary_dim // 2, dtype=torch.float,
             device="cuda")) * self.extrapolation_factor
         inv_freq = inv_freq_interpolation * (
             1 - inv_freq_mask) + inv_freq_extrapolation * inv_freq_mask
         return inv_freq
-    
+
     def _compute_cos_sin_cache(self) -> torch.Tensor:
         inv_freq = self._compute_inv_freq(self.scaling_factor)
         t = torch.arange(self.max_position_embeddings * self.scaling_factor,
@@ -259,4 +266,4 @@ class YaRNScalingRotaryEmbedding(RotaryEmbedding):
         cos = (freqs.cos() * self.mscale)
         sin = (freqs.sin() * self.mscale)
         cache = torch.cat((cos, sin), dim=-1)
-        return cache
+        return cache

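A worked reading of the rewrapped YaRN helpers above (my derivation from the code shown, not text from the commit): with the parameterization in _compute_inv_freq, the k-th rotary pair has inverse frequency b^(-2k/D) and so completes L / (2π b^(2k/D)) rotations over a context of length L. Solving for the pair index at which exactly r rotations occur gives the expression _yarn_find_correction_dim computes:

$$k(r) \;=\; \frac{D\,\ln\!\bigl(L/(2\pi r)\bigr)}{2\ln b}$$

_yarn_find_correction_range then takes the floor of k(beta_fast) and the ceiling of k(beta_slow), clamped to valid indices, to decide which frequencies are interpolated and which are extrapolated.
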
+ 71 - 56
aphrodite/modeling/layers/sampler.py

@@ -7,12 +7,11 @@ import torch.nn as nn
 
 from aphrodite.modeling.metadata import InputMetadata
 from aphrodite.modeling.megatron.communication_op import (
-    tensor_model_parallel_all_gather
-)
+    tensor_model_parallel_all_gather)
 from aphrodite.common.sampling_params import SamplingParams, SamplingType
-from aphrodite.common.sequence import (
-    PromptLogprobs, SampleLogprobs, SamplerOutput, SequenceData,
-    SequenceGroupOutputs, SequenceOutputs)
+from aphrodite.common.sequence import (PromptLogprobs, SampleLogprobs,
+                                       SamplerOutput, SequenceData,
+                                       SequenceGroupOutputs, SequenceOutputs)
 from aphrodite.common.sequence import SamplerOutput, SequenceOutputs, SequenceData
 
 _SAMPLING_EPS = 1e-5
@@ -54,18 +53,20 @@ class Sampler(nn.Module):
         # Apply presence and frequency penalties.
         output_tokens = _get_output_tokens(input_metadata)
         assert len(output_tokens) == logits.shape[0]
-        presence_penalties, frequency_penalties, repetition_penalties = _get_penalties(input_metadata)
+        presence_penalties, frequency_penalties, repetition_penalties = _get_penalties(
+            input_metadata)
         assert len(presence_penalties) == logits.shape[0]
         assert len(frequency_penalties) == logits.shape[0]
-        logits = _apply_penalties(logits, output_tokens,
-                                  presence_penalties, frequency_penalties, repetition_penalties,
+        logits = _apply_penalties(logits, output_tokens, presence_penalties,
+                                  frequency_penalties, repetition_penalties,
                                   self.vocab_size)
-        
+
         banned_tokens = _get_custom_token_bans(input_metadata)
         assert len(banned_tokens) == logits.shape[0]
         logits = _apply_token_bans(logits, banned_tokens)
-        
-        logits = _apply_logits_processors(input_metadata, logits, output_tokens)
+
+        logits = _apply_logits_processors(input_metadata, logits,
+                                          output_tokens)
 
         # Apply Eta sampling, as described in https://arxiv.org/abs/2210.15191
         eta_cutoffs = _get_eta_cutoffs(input_metadata)
@@ -101,7 +102,8 @@ class Sampler(nn.Module):
             logits.div_(t.unsqueeze(dim=1))
 
         # Apply top-p, top-k, and top-a truncation.
-        top_ps, top_ks, top_as = _get_top_a_top_p_top_k(input_metadata, self.vocab_size)
+        top_ps, top_ks, top_as = _get_top_a_top_p_top_k(
+            input_metadata, self.vocab_size)
         assert len(top_ps) == len(top_ks) == logits.shape[0]
         do_top_p = any(p < 1.0 - _SAMPLING_EPS for p in top_ps)
         do_top_k = any(k != self.vocab_size for k in top_ks)
@@ -141,7 +143,7 @@ def _get_logits(hidden_states: torch.Tensor, embedding: torch.Tensor,
 def _prune_hidden_states(
     hidden_states: torch.Tensor,
     input_metadata: InputMetadata,
-) -> torch.Tensor:   
+) -> torch.Tensor:
     selected_token_indices: List[int] = []
     start_idx = 0
     for i, seq_group in enumerate(input_metadata.seq_groups):
@@ -166,6 +168,7 @@ def _prune_hidden_states(
     hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
     return hidden_states.index_select(0, selected_token_indices)
 
+
 def _get_penalties(
         input_metadata: InputMetadata) -> Tuple[List[float], List[float]]:
     # Collect the presence and frequency penalties.
@@ -181,8 +184,10 @@ def _get_penalties(
             frequency_penalties += [0] * (prompt_len - 1)
             repetition_penalties += [0] * (prompt_len - 1)
         presence_penalties += [sampling_params.presence_penalty] * len(seq_ids)
-        frequency_penalties += [sampling_params.frequency_penalty] * len(seq_ids)
-        repetition_penalties += [sampling_params.repetition_penalty] * len(seq_ids)
+        frequency_penalties += [sampling_params.frequency_penalty
+                                ] * len(seq_ids)
+        repetition_penalties += [sampling_params.repetition_penalty
+                                 ] * len(seq_ids)
     return presence_penalties, frequency_penalties, repetition_penalties
 
 
@@ -215,14 +220,12 @@ def _get_custom_token_bans(input_metadata: InputMetadata) -> List[List[int]]:
     return banned_tokens
 
 
-def _apply_logits_processors(
-    input_metadata: InputMetadata,
-    logits: torch.Tensor,
-    output_tokens: List[List[int]]
-) -> torch.Tensor:
+def _apply_logits_processors(input_metadata: InputMetadata,
+                             logits: torch.Tensor,
+                             output_tokens: List[List[int]]) -> torch.Tensor:
     seq_offset = 0
 
-    for seq_ids,sampling_params in input_metadata.seq_groups:
+    for seq_ids, sampling_params in input_metadata.seq_groups:
         seq_end = seq_offset + len(seq_ids)
 
         for proc in sampling_params.logits_processors:
@@ -232,6 +235,7 @@ def _apply_logits_processors(
 
     return logits
 
+
 def _apply_penalties(
     logits: torch.Tensor,
     output_tokens: List[List[int]],
@@ -244,9 +248,9 @@ def _apply_penalties(
     for i in range(num_seqs):
         if not output_tokens[i]:
             continue
-        if (abs(presence_penalties[i]) < _SAMPLING_EPS and
-            abs(frequency_penalties[i]) < _SAMPLING_EPS and
-            repetition_penalties[i] < 1.0 + _SAMPLING_EPS):
+        if (abs(presence_penalties[i]) < _SAMPLING_EPS
+                and abs(frequency_penalties[i]) < _SAMPLING_EPS
+                and repetition_penalties[i] < 1.0 + _SAMPLING_EPS):
             continue
         break
     else:
@@ -278,8 +282,8 @@ def _apply_penalties(
                                       dtype=logits.dtype,
                                       device=logits.device)
     repetition_penalties = torch.tensor(repetition_penalties,
-                                      dtype=logits.dtype,
-                                      device=logits.device)
+                                        dtype=logits.dtype,
+                                        device=logits.device)
 
     # We follow the definition in OpenAI API.
     # Refer to https://platform.openai.com/docs/api-reference/parameter-details
@@ -289,13 +293,16 @@ def _apply_penalties(
 
     # Effectively: If token is present and logit is positive, divide logit by rep_pen.
     #              If token is present and logit is negative, multiply logit by rep_pen.
-    logits += logits * (1 / repetition_penalties.unsqueeze(dim=1) - 1) * presence_mask * (logits > 0)
-    logits += logits * (repetition_penalties.unsqueeze(dim=1) - 1) * presence_mask * (logits < 0)
+    logits += logits * (1 / repetition_penalties.unsqueeze(dim=1) -
+                        1) * presence_mask * (logits > 0)
+    logits += logits * (repetition_penalties.unsqueeze(dim=1) -
+                        1) * presence_mask * (logits < 0)
 
     return logits
 
 
-def _apply_token_bans(logits: torch.Tensor, banned_tokens: List[List[int]]) -> torch.Tensor:
+def _apply_token_bans(logits: torch.Tensor,
+                      banned_tokens: List[List[int]]) -> torch.Tensor:
     for i, banned_token_ids in enumerate(banned_tokens):
         if not banned_token_ids:
             continue
@@ -340,7 +347,7 @@ def _get_top_a_top_p_top_k(
             prompt_len = input_metadata.prompt_lens[i]
             top_ps += [sampling_params.top_p] * (prompt_len - 1)
             top_ks += [top_k] * (prompt_len - 1)
-            top_as += [sampling_params.top_a] * (prompt_len - 1) 
+            top_as += [sampling_params.top_a] * (prompt_len - 1)
         top_ps += [sampling_params.top_p] * len(seq_ids)
         top_ks += [top_k] * len(seq_ids)
         top_as += [sampling_params.top_a] * len(seq_ids)
@@ -415,11 +422,13 @@ def _apply_top_a_top_p_top_k(
     probs_sort = logits_sort.softmax(dim=-1)
     probs_sum = probs_sort.cumsum(dim=-1)
     top_a_thresholds = torch.pow(probs_sort[:, 0], 2) * ts_a
-    top_ap_mask = (probs_sort < top_a_thresholds.unsqueeze(1)) # Cull logits below the top-a threshold
-    top_ap_mask.logical_or_(probs_sum > ts_p.unsqueeze(dim=1)) # Cull logits above the top-p summation threshold
-    top_ap_mask[:, 0] = False # Guarantee at least one token is pickable
+    top_ap_mask = (probs_sort < top_a_thresholds.unsqueeze(1)
+                   )  # Cull logits below the top-a threshold
+    top_ap_mask.logical_or_(probs_sum > ts_p.unsqueeze(
+        dim=1))  # Cull logits above the top-p summation threshold
+    top_ap_mask[:, 0] = False  # Guarantee at least one token is pickable
     logits_sort[top_ap_mask] = -float("inf")
-    
+
     # Apply top-k.
     # Create a mask for the top-k elements.
     top_k_mask = torch.arange(logits_idx.shape[-1], device=logits_idx.device)
@@ -433,6 +442,7 @@ def _apply_top_a_top_p_top_k(
                           index=torch.argsort(logits_idx, dim=-1))
     return logits
 
+
 def _apply_tfs(
     logits: torch.Tensor,
     tfss: List[float],
@@ -446,14 +456,16 @@ def _apply_tfs(
     tfs_mask = curvature_cdf > z.unsqueeze(dim=-1)
 
     tfs_mask = torch.cat(
-            (
-                torch.zeros(logits.shape[0], 1, dtype=torch.bool, device=logits.device),
-                tfs_mask,
-                torch.ones(logits.shape[0], 1, dtype=torch.bool, device=logits.device),
-            ),
-            dim=-1,
-        )
-    
+        (
+            torch.zeros(
+                logits.shape[0], 1, dtype=torch.bool, device=logits.device),
+            tfs_mask,
+            torch.ones(
+                logits.shape[0], 1, dtype=torch.bool, device=logits.device),
+        ),
+        dim=-1,
+    )
+
     logits_sort[tfs_mask] = -float("inf")
     logits = torch.gather(logits_sort,
                           dim=-1,
@@ -462,21 +474,22 @@ def _apply_tfs(
     return logits
 
 
-
 def _apply_eta_cutoff(
     logits: torch.Tensor,
     eta_cutoffs: List[float],
 ) -> torch.Tensor:
-    eta = torch.tensor(eta_cutoffs, dtype=logits.dtype, device=logits.device) * 1e-4
+    eta = torch.tensor(eta_cutoffs, dtype=logits.dtype,
+                       device=logits.device) * 1e-4
     shifted_logits = torch.log_softmax(logits, dim=-1)
     probs = shifted_logits.exp()
 
     neg_entropy = (probs * shifted_logits).nansum(dim=-1)
-    eps = torch.min(eta, torch.sqrt(eta)*torch.exp(neg_entropy)).unsqueeze(dim=1)
+    eps = torch.min(eta,
+                    torch.sqrt(eta) * torch.exp(neg_entropy)).unsqueeze(dim=1)
 
     eta_mask = probs < eps
 
-    if(torch.all(eta_mask)): # guard against nulling out all the logits
+    if (torch.all(eta_mask)):  # guard against nulling out all the logits
         topk_prob, _ = torch.max(probs, dim=-1)
         eta_mask = probs < topk_prob
 
@@ -488,12 +501,14 @@ def _apply_epsilon_cutoff(
     logits: torch.Tensor,
     epsilon_cutoffs: List[float],
 ) -> torch.Tensor:
-    eps = torch.tensor(epsilon_cutoffs, dtype=logits.dtype, device=logits.device).unsqueeze(dim=1)
+    eps = torch.tensor(epsilon_cutoffs,
+                       dtype=logits.dtype,
+                       device=logits.device).unsqueeze(dim=1)
     probs = logits.softmax(dim=-1)
 
     eps_mask = probs < (eps * 1e-4)
 
-    if(torch.all(eps_mask)): # guard against nulling out all the logits
+    if (torch.all(eps_mask)):  # guard against nulling out all the logits
         topk_prob, _ = torch.max(probs, dim=-1)
         eps_mask = probs < topk_prob
 
@@ -515,17 +530,16 @@ def _apply_typical_sampling(
     _, indices = torch.sort(surprisal_deviations)
     reordered_probs = probs.gather(-1, indices)
     typ_mask_sorted = reordered_probs.cumsum(dim=-1) >= typ_p.unsqueeze(dim=1)
-    
+
     min_tokens_to_keep = 1
     # Keep at least min_tokens_to_keep
     typ_mask_sorted[..., :min_tokens_to_keep] = 0
 
-    typ_mask = typ_mask_sorted.scatter(
-        1, indices, typ_mask_sorted
-    )
+    typ_mask = typ_mask_sorted.scatter(1, indices, typ_mask_sorted)
     logits[typ_mask] = -float("inf")
     return logits
 
+
 def _greedy_sample(
     selected_seq_groups: List[Tuple[List[int], SamplingParams]],
     logprobs: torch.Tensor,
@@ -680,12 +694,13 @@ def _sample(
                                                  category_logprobs)
         else:
             raise ValueError(f"Unsupported sampling type: {sampling_type}")
-        
+
         sample_results_dict.update(zip(seq_group_ids, sample_results))
 
         sample_results = [
-        sample_results_dict[i] for i in range(len(input_metadata.seq_groups))
-    ]
+            sample_results_dict[i]
+            for i in range(len(input_metadata.seq_groups))
+        ]
     return sample_results
 
 
@@ -822,4 +837,4 @@ def _build_sampler_output(
                 SequenceOutputs(seq_ids[parent_id], next_token_id, logprobs))
         sampler_output.append(
             SequenceGroupOutputs(seq_outputs, group_prompt_logprobs))
-    return sampler_output
+    return sampler_output

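The largest hunks in this file only rewrap the penalty and truncation code; behaviour is unchanged. For the repetition-penalty lines in _apply_penalties, the inline comment's claim (divide a positive logit by the penalty, multiply a negative one by it) follows from the algebra: x + x(1/r - 1) = x/r and x + x(r - 1) = x·r. A small self-contained check with made-up numbers (illustrative only, not code from the commit):

```python
import torch

rep_pen = torch.tensor([2.0])                    # repetition penalty, one sequence
logits = torch.tensor([[3.0, -3.0, 1.0]])        # three vocab entries
presence_mask = torch.tensor([[1.0, 1.0, 0.0]])  # first two tokens already generated

# Same update as the rewrapped lines in _apply_penalties.
logits += logits * (1 / rep_pen.unsqueeze(dim=1) - 1) * presence_mask * (logits > 0)
logits += logits * (rep_pen.unsqueeze(dim=1) - 1) * presence_mask * (logits < 0)

# 3.0 -> 1.5 (divided by 2), -3.0 -> -6.0 (multiplied by 2), 1.0 unchanged.
print(logits)  # tensor([[ 1.5000, -6.0000,  1.0000]])
```
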
+ 6 - 6
aphrodite/modeling/loader.py

@@ -19,11 +19,10 @@ _MODEL_REGISTRY = {
 }
 
 _MODEL_CLASSES_SUPPORT_QUANTIZATION = {
-    "awq": [
-        LlamaForCausalLM, MistralForCausalLM
-    ],
+    "awq": [LlamaForCausalLM, MistralForCausalLM],
     "gptq": [
-        LlamaForCausalLM, GPTJForCausalLM, GPTNeoXForCausalLM, MistralForCausalLM
+        LlamaForCausalLM, GPTJForCausalLM, GPTNeoXForCausalLM,
+        MistralForCausalLM
     ],
 }
 
@@ -53,7 +52,8 @@ def get_model(model_config: ModelConfig, max_tokens: int) -> nn.Module:
     # Get the quantization config.
     quant_config = None
     if model_config.quantization is not None:
-        if model_class not in _MODEL_CLASSES_SUPPORT_QUANTIZATION[model_config.quantization]:
+        if model_class not in _MODEL_CLASSES_SUPPORT_QUANTIZATION[
+                model_config.quantization]:
             raise ValueError(
                 f"Quantization is not supported for {model_class}.")
         quant_config = get_quant_config(model_config.quantization,
@@ -96,4 +96,4 @@ def get_model(model_config: ModelConfig, max_tokens: int) -> nn.Module:
             model = model.cuda()
         if model_config.quantization is not None:
             quant_post_init(model, max_tokens)
-    return model.eval()
+    return model.eval()

+ 1 - 2
aphrodite/modeling/megatron/parallel_state.py

@@ -2,7 +2,6 @@
 # Copyright 2023 The vLLM team.
 # Adapted from https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/parallel_state.py
 # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
 """Model and data parallel groups."""
 
 import torch
@@ -177,4 +176,4 @@ def destroy_model_parallel():
     global _PIPELINE_MODEL_PARALLEL_GROUP
     _PIPELINE_MODEL_PARALLEL_GROUP = None
     global _PIPELINE_GLOBAL_RANKS
-    _PIPELINE_GLOBAL_RANKS = None
+    _PIPELINE_GLOBAL_RANKS = None

+ 3 - 2
aphrodite/modeling/metadata.py

@@ -5,6 +5,7 @@ from xformers.ops import AttentionBias
 from aphrodite.common.sampling_params import SamplingParams
 from aphrodite.common.sequence import SequenceData
 
+
 class InputMetadata:
     """Metadata for input sequences. Used for PagedAttention.
 
@@ -41,7 +42,7 @@ class InputMetadata:
         self.to_cache = None
         if sliding_window is not None:
             # We need to keep the positions of sliding windows within
-            # the key/value tables, this is helpful to know which 
+            # the key/value tables, this is helpful to know which
             # elements we need to cache and where.
             to_cache, start_idx = [], 0
             for prompt_len in self.prompt_lens:
@@ -80,4 +81,4 @@ class InputMetadata:
                 f'max_context_len={self.max_context_len}), '
                 f'max_num_blocks_per_seq={self.max_num_blocks_per_seq}, '
                 f'block_tables={self.block_tables}), '
-                f'slot_mapping={self.slot_mapping}')
+                f'slot_mapping={self.slot_mapping}')

+ 1 - 2
aphrodite/modeling/models/__init__.py

@@ -3,10 +3,9 @@ from aphrodite.modeling.models.mistral import MistralForCausalLM
 from aphrodite.modeling.models.gpt_j import GPTJForCausalLM
 from aphrodite.modeling.models.gpt_neox import GPTNeoXForCausalLM
 
-
 __all__ = [
     "LlamaForCausalLM",
     "GPTJForCausalLM",
     "GPTNeoXForCausalLM",
     "MistralForCausalLM",
-]
+]

+ 2 - 3
aphrodite/modeling/models/gpt_j.py

@@ -35,8 +35,7 @@ from aphrodite.modeling.hf_downloader import (hf_model_weights_iterator,
                                               load_tensor_parallel_weights)
 from aphrodite.modeling.megatron.parallel_state import (
     get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
-from aphrodite.modeling.megatron.layers import (
-    VocabParallelEmbedding)
+from aphrodite.modeling.megatron.layers import (VocabParallelEmbedding)
 from aphrodite.common.sequence import SamplerOutput
 
 KVCache = Tuple[torch.Tensor, torch.Tensor]
@@ -265,4 +264,4 @@ class GPTJForCausalLM(nn.Module):
             param = state_dict[name]
             load_tensor_parallel_weights(param, loaded_weight, name,
                                          self._column_parallel_weights,
-                                         self._row_parallel_weights, tp_rank)
+                                         self._row_parallel_weights, tp_rank)

+ 3 - 3
aphrodite/modeling/models/gpt_neox.py

@@ -36,8 +36,8 @@ from aphrodite.modeling.hf_downloader import (hf_model_weights_iterator,
 from aphrodite.modeling.megatron.parallel_state import (
     get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
 from aphrodite.modeling.megatron.layers import (VocabParallelEmbedding,
-                                                       ColumnParallelLinear,
-                                                       RowParallelLinear)
+                                                ColumnParallelLinear,
+                                                RowParallelLinear)
 from aphrodite.common.sequence import SamplerOutput
 
 KVCache = Tuple[torch.Tensor, torch.Tensor]
@@ -283,4 +283,4 @@ class GPTNeoXForCausalLM(nn.Module):
             load_tensor_parallel_weights(param, loaded_weight, name,
                                          self._column_parallel_weights,
                                          self._row_parallel_weights,
-                                         tensor_model_parallel_rank)
+                                         tensor_model_parallel_rank)

+ 5 - 6
aphrodite/modeling/models/llama.py

@@ -39,8 +39,7 @@ from aphrodite.modeling.layers.sampler import Sampler
 from aphrodite.modeling.layers.quantized_linear import ParallelLinear
 from aphrodite.modeling.megatron.parallel_state import (
     get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
-from aphrodite.modeling.megatron.layers import (
-    VocabParallelEmbedding)
+from aphrodite.modeling.megatron.layers import (VocabParallelEmbedding)
 from aphrodite.modeling.quantization_utils import QuantizationConfig
 from aphrodite.modeling.hf_downloader import (
     convert_pyslice_to_tensor, hf_model_weights_iterator,
@@ -132,7 +131,7 @@ class LlamaAttention(nn.Module):
             self.head_dim,
             self.scaling,
             base=self.rope_theta,
-            rope_scaling = self.rope_scaling,
+            rope_scaling=self.rope_scaling,
             max_position=self.max_position_embeddings,
             rotary_dim=self.head_dim,
             num_kv_heads=self.num_kv_heads)
@@ -229,8 +228,8 @@ class LlamaModel(nn.Module):
         self.vocab_size = config.vocab_size
 
         vocab_size = ((config.vocab_size + 63) // 64) * 64
-        self.embed_tokens = VocabParallelEmbedding(
-            vocab_size, config.hidden_size)
+        self.embed_tokens = VocabParallelEmbedding(vocab_size,
+                                                   config.hidden_size)
         self.layers = nn.ModuleList([
             LlamaDecoderLayer(config, quant_config)
             for _ in range(config.num_hidden_layers)
@@ -403,4 +402,4 @@ class LlamaForCausalLM(nn.Module):
             load_tensor_parallel_weights(param, loaded_weight, name,
                                          column_parallel_weights,
                                          row_parallel_weights,
-                                         tensor_model_parallel_rank)
+                                         tensor_model_parallel_rank)
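
For orientation, the padded vocabulary size computed in LlamaModel above simply rounds config.vocab_size up to the next multiple of 64. A quick arithmetic sketch (the vocab sizes here are illustrative, not taken from a real config):

vocab_size = 32000
assert ((vocab_size + 63) // 64) * 64 == 32000   # already a multiple of 64, unchanged
assert ((32003 + 63) // 64) * 64 == 32064        # otherwise padded up to the next multiple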

+ 3 - 3
aphrodite/modeling/models/mistral.py

@@ -294,7 +294,7 @@ class MistralForCausalLM(nn.Module):
             if "rotary_emb.inv_freq" in name:
                 continue
             if any(name.endswith(suffix) for suffix in ignore_weight_suffixes):
-                continue            
+                continue
 
             is_packed = False
             is_transposed = False
@@ -355,7 +355,7 @@ class MistralForCausalLM(nn.Module):
                 break
             if is_gate_up_weight:
                 continue
-            
+
             if name not in state_dict:
                 continue
             param = state_dict[name]
@@ -370,4 +370,4 @@ class MistralForCausalLM(nn.Module):
             load_tensor_parallel_weights(param, loaded_weight, name,
                                          column_parallel_weights,
                                          row_parallel_weights,
-                                         tensor_model_parallel_rank)
+                                         tensor_model_parallel_rank)

+ 2 - 2
aphrodite/modeling/quantization_utils/awq.py

@@ -69,6 +69,6 @@ class AWQConfig(QuantizationConfig):
 
     def get_row_tp_tensor_names(self) -> List[str]:
         return ["qweight", "qzeros", "scales"]
-    
+
     def get_column_tp_tensor_names(self) -> List[str]:
-        return ["qweight", "qzeros", "scales", "bias"]
+        return ["qweight", "qzeros", "scales", "bias"]

+ 3 - 3
aphrodite/modeling/quantization_utils/base.py

@@ -73,9 +73,9 @@ class QuantizationConfig:
     @classmethod
     def get_row_tp_tensor_names(self) -> List[str]:
         raise NotImplementedError
-    
+
     def get_column_tp_tensor_names(self) -> List[str]:
         raise NotImplementedError
-    
+
     def get_ignore_tensor_names(self) -> List[str]:
-        return []
+        return []

+ 14 - 13
aphrodite/modeling/quantization_utils/gptq.py

@@ -6,6 +6,7 @@ from aphrodite.modeling.quantization_utils.base import QuantizationConfig
 
 
 class GPTQConfig(QuantizationConfig):
+
     def __init__(
         self,
         weight_bits: int,
@@ -18,58 +19,58 @@ class GPTQConfig(QuantizationConfig):
         self.pack_factor = 32 // self.weight_bits
         if self.weight_bits != 4:
             raise ValueError(
-                f"Currently only 4-bit quant is supported for GPTQ, you passed {self.weight_bits} bits.")
-    
+                f"Currently only 4-bit quant is supported for GPTQ, you passed {self.weight_bits} bits."
+            )
+
     def __repr__(self) -> str:
         return (f"GPTQConfig(weight_bits={self.weight_bits}), "
                 f"group_size={self.group_size}, "
                 f"desc_act={self.desc_act}")
-    
+
     @classmethod
     def get_name(cls) -> str:
         return "gptq"
-    
+
     @classmethod
     def get_supported_act_dtypes(cls) -> List[torch.dtype]:
         return [torch.half]
-    
+
     @classmethod
     def get_min_capability(cls) -> int:
         return 60
-    
+
     @classmethod
     def get_config_filenames(cls) -> List[str]:
         return [
             "quantize_config.json",
         ]
-    
+
     @classmethod
     def from_config(cls, config: Dict[str, Any]) -> "GPTQConfig":
         weight_bits = cls.get_from_keys(config, ["bits"])
         group_size = cls.get_from_keys(config, ["group_size"])
         desc_act = cls.get_from_keys(config, ["desc_act"])
         return cls(weight_bits, group_size, desc_act)
-    
+
     @classmethod
     def get_packed_tensor_names(cls) -> List[str]:
         return ["qzeros"]
-    
+
     @classmethod
     def get_transposed_tensor_names(cls) -> List[str]:
         return ["qweight", "qzeros", "scales"]
-    
+
     def get_row_tp_tensor_names(self) -> List[str]:
         if self.desc_act and self.group_size != -1:
             return ["qweight", "g_idx"]
         if self.group_size == -1:
             return ["qweight"]
         return ["qweight", "qzeros", "scales"]
-    
+
     def get_column_tp_tensor_names(self) -> List[str]:
         return ["qweight", "qzeros", "scales", "bias"]
-    
+
     def get_ignore_tensor_names(self) -> List[str]:
         if self.desc_act and self.group_size != -1:
             return []
         return ["g_idx"]
-        
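
A minimal sketch of what the GPTQConfig hunks above encode, assuming an illustrative quantize_config.json dict (the values are invented for the example; a real file ships alongside the checkpoint):

quantize_config = {"bits": 4, "group_size": 128, "desc_act": False}

weight_bits = quantize_config["bits"]   # only 4-bit is accepted, per the ValueError above
pack_factor = 32 // weight_bits         # 8 int4 weights packed into each int32
assert pack_factor == 8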

+ 1 - 0
aphrodite/modeling/utils.py

@@ -3,6 +3,7 @@ import random
 import numpy as np
 import torch
 
+
 def set_random_seed(seed: int) -> None:
     random.seed(seed)
     np.random.seed(seed)

+ 4 - 1
aphrodite/processing/policy.py

@@ -2,6 +2,7 @@ from typing import List
 
 from aphrodite.common.sequence import SequenceGroup
 
+
 class Policy:
 
     def get_priority(
@@ -22,6 +23,7 @@ class Policy:
             reverse=True,
         )
 
+
 class FCFS(Policy):
 
     def get_priority(
@@ -31,6 +33,7 @@ class FCFS(Policy):
     ) -> float:
         return now - seq_group.arrival_time
 
+
 class PolicyFactory:
 
     _POLICY_REGISTRY = {
@@ -39,4 +42,4 @@ class PolicyFactory:
 
     @classmethod
     def get_policy(cls, policy_name: str, **kwargs) -> Policy:
-        return cls._POLICY_REGISTRY[policy_name](**kwargs)
+        return cls._POLICY_REGISTRY[policy_name](**kwargs)
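
A self-contained sketch of the FCFS ordering implied above: priority is now - arrival_time, so with reverse=True the oldest request sorts first (the request names and timestamps are made up):

arrival_times = {"req-a": 100.0, "req-b": 103.5, "req-c": 101.2}
now = 110.0
ordered = sorted(arrival_times, key=lambda r: now - arrival_times[r], reverse=True)
assert ordered == ["req-a", "req-c", "req-b"]   # earliest arrival gets the highest priority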

+ 2 - 2
aphrodite/processing/scheduler.py

@@ -7,7 +7,7 @@ from aphrodite.processing.block_manager import BlockSpaceManager
 from aphrodite.processing.policy import PolicyFactory
 from aphrodite.common.logger import init_logger
 from aphrodite.common.sequence import (Sequence, SequenceData, SequenceGroup,
-                           SequenceGroupMetadata, SequenceStatus)
+                                       SequenceGroupMetadata, SequenceStatus)
 
 logger = init_logger(__name__)
 
@@ -397,4 +397,4 @@ class Scheduler:
         mapping = self.block_manager.swap_out(seq_group)
         blocks_to_swap_out.update(mapping)
         for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
-            seq.status = SequenceStatus.SWAPPED
+            seq.status = SequenceStatus.SWAPPED

+ 7 - 4
aphrodite/task_handler/worker.py

@@ -6,7 +6,7 @@ import torch
 import torch.distributed
 
 from aphrodite.common.config import (CacheConfig, ModelConfig, ParallelConfig,
-                         SchedulerConfig)
+                                     SchedulerConfig)
 from aphrodite.modeling import get_model, InputMetadata, set_random_seed
 from aphrodite.modeling.megatron.parallel_state import (
     initialize_model_parallel)
@@ -67,7 +67,8 @@ class Worker:
 
         # Initialize the model.
         set_random_seed(self.model_config.seed)
-        self.model = get_model(self.model_config, self.scheduler_config.max_num_batched_tokens)
+        self.model = get_model(self.model_config,
+                               self.scheduler_config.max_num_batched_tokens)
 
     @torch.inference_mode()
     def profile_num_available_blocks(
@@ -146,7 +147,7 @@ class Worker:
         else:
             max_seq_len = min(self.scheduler_config.max_model_len,
                               self.sliding_window)
-        
+
         _check_if_can_support_max_seq_len(max_seq_len, self.block_size)
 
         self.cache_engine = CacheEngine(self.cache_config, self.model_config,
@@ -401,6 +402,7 @@ def _check_if_can_support_max_seq_len(max_seq_len: int,
             f"available shared memory {max_shared_mem}). "
             "This will be fixed in a future release.")
 
+
 def _check_if_gpu_supports_dtype(torch_dtype: torch.dtype):
     if torch_dtype == torch.bfloat16:
         compute_capability = torch.cuda.get_device_capability()
@@ -410,4 +412,5 @@ def _check_if_gpu_supports_dtype(torch_dtype: torch.dtype):
                 "Bfloat16 is only supported on GPUs with compute capability "
                 f"of at least 8.0. You {gpu_name} GPU has compute capability "
                 f"{compute_capability[0]}.{compute_capability[1]}. Please "
-                "use the `--dtype float16` argument when launching the engine.")
+                "use the `--dtype float16` argument when launching the engine."
+            )
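
A small sketch of the bfloat16 gate in _check_if_can_support / _check_if_gpu_supports_dtype above, assuming torch is importable and a GPU is visible (the float16 fallback mirrors the error message in the hunk):

import torch

if torch.cuda.is_available():
    major, minor = torch.cuda.get_device_capability()
    bf16_ok = major >= 8   # Ampere (compute capability 8.0) or newer
    if not bf16_ok:
        print(f"compute capability {major}.{minor}: launch with --dtype float16 instead")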

+ 3 - 1
aphrodite/transformers_utils/config.py

@@ -1,7 +1,9 @@
 from typing import Optional
 from transformers import AutoConfig, PretrainedConfig
 
-def get_config(model: str, trust_remote_code: bool,
+
+def get_config(model: str,
+               trust_remote_code: bool,
                revision: Optional[str] = None) -> PretrainedConfig:
     try:
         config = AutoConfig.from_pretrained(

+ 9 - 8
tests/kernels/conftest.py

@@ -5,13 +5,13 @@ import torch
 
 
 def create_kv_caches(
-        num_blocks: int,
-        block_size: int,
-        num_layers: int,
-        num_heads: int,
-        head_size: int,
-        dtype: torch.dtype,
-        seed: int,
+    num_blocks: int,
+    block_size: int,
+    num_layers: int,
+    num_heads: int,
+    head_size: int,
+    dtype: torch.dtype,
+    seed: int,
 ) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
     torch.random.manual_seed(seed)
     torch.cuda.manual_seed(seed)
@@ -37,6 +37,7 @@ def create_kv_caches(
         values_caches.append(values_cache)
     return key_caches, values_caches
 
+
 @pytest.fixture()
 def kv_cache_factory():
-    return create_kv_caches
+    return create_kv_caches

+ 4 - 2
tests/kernels/test_activation.py

@@ -8,13 +8,15 @@ from aphrodite import activation_ops
 
 DTYPES = [torch.half, torch.bfloat16, torch.float]
 NUM_TOKENS = [7, 38, 2048]
-D = [512, 4096, 5120, 13824] # arbitrary values for testing
+D = [512, 4096, 5120, 13824]  # arbitrary values for testing
 SEEDS = [0]
 
+
 def ref_silu_and_mul(x: torch.Tensor) -> torch.Tensor:
     x1, x2 = x.chunk(chunks=2, dim=1)
     return F.silu(x1) * x2
 
+
 @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
 @pytest.mark.parametrize("d", D)
 @pytest.mark.parametrize("dtype", DTYPES)
@@ -71,4 +73,4 @@ def test_gelu_fast(
     out = torch.empty(num_tokens, d, dtype=dtype, device='cuda')
     activation_ops.gelu_fast(out, x)
     ref_out = get_activation("gelu_fast")(x)
-    assert torch.allclose(out, ref_out, atol=1e-5, rtol=1e-5)
+    assert torch.allclose(out, ref_out, atol=1e-5, rtol=1e-5)
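
For context, the ref_silu_and_mul helper above splits an input of width 2*d in half along the feature dimension and gates one half with the other. A tiny CPU-only shape check, with sizes borrowed from NUM_TOKENS and D:

import torch
import torch.nn.functional as F

x = torch.randn(7, 2 * 512)              # 7 tokens, d = 512
x1, x2 = x.chunk(chunks=2, dim=1)
ref = F.silu(x1) * x2
assert ref.shape == (7, 512)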

+ 7 - 3
tests/models/test_models.py

@@ -4,6 +4,7 @@ MODELS = [
     "EleutherAI/pythia-70m-deduped",
 ]
 
+
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [128])
@@ -20,13 +21,16 @@ def test_models(
     del hf_model
 
     aphrodite_model = aphrodite_runner(model, dtype=dtype)
-    aphrodite_outputs = aphrodite_model.generate_greedy(example_prompts, max_tokens)
+    aphrodite_outputs = aphrodite_model.generate_greedy(
+        example_prompts, max_tokens)
     del aphrodite_model
 
     for i in range(len(example_prompts)):
         hf_output_ids, hf_output_str = hf_outputs[i]
         aphrodite_output_ids, aphrodite_output_str = aphrodite_outputs[i]
         assert hf_output_str == aphrodite_output_str, (
-            f"Test{i}:\nHF: {hf_output_str!r}\nAphrodite: {aphrodite_output_str!r}")
+            f"Test{i}:\nHF: {hf_output_str!r}\nAphrodite: {aphrodite_output_str!r}"
+        )
         assert hf_output_ids == aphrodite_output_ids, (
-            f"Test{i}:\nHF: {hf_output_ids}\nAphrodite: {aphrodite_output_ids}")
+            f"Test{i}:\nHF: {hf_output_ids}\nAphrodite: {aphrodite_output_ids}"
+        )

+ 17 - 11
tests/samplers/test_samplers.py

@@ -19,18 +19,19 @@ class MockLogitsSampler(Sampler):
 
     def forward(self, *args, **kwargs):
         with patch("aphrodite.modeling.layers.sampler._prune_hidden_states",
-                    lambda x, y: x):
+                   lambda x, y: x):
             with patch("aphrodite.modeling.layers.sampler._get_logits",
-                      lambda *args, **kwargs: self.fake_logits):
+                       lambda *args, **kwargs: self.fake_logits):
                 return super().forward(*args, **kwargs)
-            
+
+
 def _prepare_test(
     batch_size: int
 ) -> Tuple[torch.Tensor, torch.Tensor, MockLogitsSampler, Worker]:
     vocab_size = 32000
     input_tensor = torch.rand((batch_size, 1024),
-                             device='cuda',
-                             dtype=torch.float16)
+                              device='cuda',
+                              dtype=torch.float16)
     fake_logits = torch.full((batch_size, vocab_size),
                              1e-2,
                              device=input_tensor.device,
@@ -40,8 +41,10 @@ def _prepare_test(
     worker.block_size = 16
     return input_tensor, fake_logits, sampler, worker
 
+
 RANDOM_SEEDS = list(range(128))
 
+
 @pytest.mark.parametrize("seed", RANDOM_SEEDS)
 def test_sampler_all_greedy(seed: int):
     set_random_seed(seed)
@@ -58,16 +61,17 @@ def test_sampler_all_greedy(seed: int):
                 sampling_params=SamplingParams(temperature=0, ),
                 block_tables={0: [1]},
             ))
-    
+
     _, _, input_metadata = worker._prepare_inputs(seq_group_metadata_list)
     sampler_output = sampler(embedding=None,
-                            hidden_states=input_tensor,
-                            input_metadata=input_metadata)
+                             hidden_states=input_tensor,
+                             input_metadata=input_metadata)
     expected = torch.argmax(fake_logits, dim=-1)
     for i, sequence_output in enumerate(sampler_output):
         for nth_output in sequence_output:
             assert nth_output.output_token == expected[i].item()
 
+
 @pytest.mark.parametrize("seed", RANDOM_SEEDS)
 def test_sampler_all_random(seed: int):
     set_random_seed(seed)
@@ -76,7 +80,7 @@ def test_sampler_all_random(seed: int):
 
     for i in range(batch_size):
         fake_logits[i, i] = 1e2
-    
+
     seq_group_metadata_list = []
     for i in range(batch_size):
         seq_group_metadata_list.append(
@@ -98,6 +102,7 @@ def test_sampler_all_random(seed: int):
         for nth_output in sequence_output:
             assert nth_output.output_token == i
 
+
 @pytest.mark.parametrize("seed", RANDOM_SEEDS)
 def test_sampler_all_beam(seed: int):
     set_random_seed(seed)
@@ -123,6 +128,7 @@ def test_sampler_all_beam(seed: int):
             hidden_states=input_tensor,
             input_metadata=input_metadata)
 
+
 @pytest.mark.parametrize("seed", RANDOM_SEEDS)
 def test_sampler_mixed(seed: int):
     set_random_seed(seed)
@@ -156,7 +162,7 @@ def test_sampler_mixed(seed: int):
             sampling_params = SamplingParams(temperature=0,
                                              use_beam_search=True,
                                              best_of=2)
-        
+
         for idx in range(n):
             fake_logits[i, i + idx] = 1e2
             expected_tokens.append(i + idx)
@@ -168,7 +174,7 @@ def test_sampler_mixed(seed: int):
                 sampling_params=sampling_params,
                 block_tables={0: [1]},
             ))
-        
+
     _, _, input_metadata = worker._prepare_inputs(seq_group_metadata_list)
     sampler_output = sampler(embedding=None,
                              hidden_states=input_tensor,

+ 40 - 31
tests/serving.py

@@ -23,15 +23,10 @@ def sample_requests(
     with open(dataset_path) as f:
         dataset = json.load(f)
     # Filter out the conversations with less than 2 turns.
-    dataset = [
-        data for data in dataset
-        if len(data["conversations"]) >= 2
-    ]
+    dataset = [data for data in dataset if len(data["conversations"]) >= 2]
     # Only keep the first two turns of each conversation.
-    dataset = [
-        (data["conversations"][0]["value"], data["conversations"][1]["value"])
-        for data in dataset
-    ]
+    dataset = [(data["conversations"][0]["value"],
+                data["conversations"][1]["value"]) for data in dataset]
 
     # Tokenize the prompts and completions.
     prompts = [prompt for prompt, _ in dataset]
@@ -120,7 +115,8 @@ async def send_request(
     timeout = aiohttp.ClientTimeout(total=3 * 3600)
     async with aiohttp.ClientSession(timeout=timeout) as session:
         while True:
-            async with session.post(api_url, headers=headers, json=pload) as response:
+            async with session.post(api_url, headers=headers,
+                                    json=pload) as response:
                 chunks = []
                 async for chunk, _ in response.content.iter_chunks():
                     chunks.append(chunk)
@@ -147,9 +143,9 @@ async def benchmark(
     tasks: List[asyncio.Task] = []
     async for request in get_request(input_requests, request_rate):
         prompt, prompt_len, output_len = request
-        task = asyncio.create_task(send_request(backend, api_url, prompt,
-                                                prompt_len, output_len,
-                                                best_of, use_beam_search))
+        task = asyncio.create_task(
+            send_request(backend, api_url, prompt, prompt_len, output_len,
+                         best_of, use_beam_search))
         tasks.append(task)
     await asyncio.gather(*tasks)
 
@@ -160,12 +156,14 @@ def main(args: argparse.Namespace):
     np.random.seed(args.seed)
 
     api_url = f"http://{args.host}:{args.port}/api/v1/generate"
-    tokenizer = get_tokenizer(args.tokenizer, trust_remote_code=args.trust_remote_code)
+    tokenizer = get_tokenizer(args.tokenizer,
+                              trust_remote_code=args.trust_remote_code)
     input_requests = sample_requests(args.dataset, args.num_prompts, tokenizer)
 
     benchmark_start_time = time.perf_counter()
-    asyncio.run(benchmark(args.backend, api_url, input_requests, args.best_of,
-                          args.use_beam_search, args.request_rate))
+    asyncio.run(
+        benchmark(args.backend, api_url, input_requests, args.best_of,
+                  args.use_beam_search, args.request_rate))
     benchmark_end_time = time.perf_counter()
     benchmark_time = benchmark_end_time - benchmark_start_time
     print(f"Total time: {benchmark_time:.2f} s")
@@ -179,10 +177,8 @@ def main(args: argparse.Namespace):
         for prompt_len, output_len, latency in REQUEST_LATENCY
     ])
     print(f"Average latency per token: {avg_per_token_latency:.2f} s")
-    avg_per_output_token_latency = np.mean([
-        latency / output_len
-        for _, output_len, latency in REQUEST_LATENCY
-    ])
+    avg_per_output_token_latency = np.mean(
+        [latency / output_len for _, output_len, latency in REQUEST_LATENCY])
     print("Average latency per output token: "
           f"{avg_per_output_token_latency:.2f} s")
 
@@ -190,27 +186,40 @@ def main(args: argparse.Namespace):
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
         description="Benchmark the online serving throughput.")
-    parser.add_argument("--backend", type=str, default="aphrodite",
+    parser.add_argument("--backend",
+                        type=str,
+                        default="aphrodite",
                         choices=["aphrodite", "tgi"])
     parser.add_argument("--host", type=str, default="localhost")
     parser.add_argument("--port", type=int, default=2242)
-    parser.add_argument("--dataset", type=str, required=True,
+    parser.add_argument("--dataset",
+                        type=str,
+                        required=True,
                         help="Path to the dataset.")
-    parser.add_argument("--tokenizer", type=str, required=True,
+    parser.add_argument("--tokenizer",
+                        type=str,
+                        required=True,
                         help="Name or path of the tokenizer.")
-    parser.add_argument("--best-of", type=int, default=1,
+    parser.add_argument("--best-of",
+                        type=int,
+                        default=1,
                         help="Generates `best_of` sequences per prompt and "
-                             "returns the best one.")
+                        "returns the best one.")
     parser.add_argument("--use-beam-search", action="store_true")
-    parser.add_argument("--num-prompts", type=int, default=1000,
+    parser.add_argument("--num-prompts",
+                        type=int,
+                        default=1000,
                         help="Number of prompts to process.")
-    parser.add_argument("--request-rate", type=float, default=float("inf"),
+    parser.add_argument("--request-rate",
+                        type=float,
+                        default=float("inf"),
                         help="Number of requests per second. If this is inf, "
-                             "then all the requests are sent at time 0. "
-                             "Otherwise, we use Poisson process to synthesize "
-                             "the request arrival times.")
+                        "then all the requests are sent at time 0. "
+                        "Otherwise, we use Poisson process to synthesize "
+                        "the request arrival times.")
     parser.add_argument("--seed", type=int, default=0)
-    parser.add_argument('--trust-remote-code', action='store_true',
+    parser.add_argument('--trust-remote-code',
+                        action='store_true',
                         help='trust remote code from huggingface')
     args = parser.parse_args()
-    main(args)
+    main(args)
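
The --request-rate help text above refers to synthesizing request arrivals from a Poisson process; under that assumption the inter-arrival gaps are exponential with mean 1/rate. A small sketch with an invented rate:

import numpy as np

request_rate = 4.0                                    # requests per second (illustrative)
rng = np.random.default_rng(0)
gaps = rng.exponential(1.0 / request_rate, size=5)    # i.i.d. inter-arrival times
arrival_times = np.cumsum(gaps)                       # seconds after the first request is queued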

+ 6 - 4
tests/throughput.py

@@ -12,6 +12,7 @@ from tqdm import tqdm
 from aphrodite import LLM, SamplingParams
 from aphrodite.transformers_utils.tokenizer import get_tokenizer
 
+
 def sample_requests(
     dataset_path: str,
     num_requests: int,
@@ -170,9 +171,10 @@ def main(args: argparse.Namespace):
 
     if args.backend == "aphrodite":
         elapsed_time = run_aphrodite(requests, args.model, args.tokenizer,
-                                args.quantization, args.tensor_parallel_size,
-                                args.seed, args.n, args.use_beam_search,
-                                args.trust_remote_code, args.dtype)
+                                     args.quantization,
+                                     args.tensor_parallel_size, args.seed,
+                                     args.n, args.use_beam_search,
+                                     args.trust_remote_code, args.dtype)
     elif args.backend == "hf":
         assert args.tensor_parallel_size == 1
         elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
@@ -245,4 +247,4 @@ if __name__ == "__main__":
     if args.tokenizer is None:
         args.tokenizer = args.model
 
-    main(args)
+    main(args)