Ver Fonte

chore: bump bitsandbytes version to latest; enable cuda graphs for 4bit bnb (#1123)

AlpinDale há 1 mês
pai
commit
349a612338

+ 23 - 6
aphrodite/common/config.py

@@ -260,7 +260,7 @@ class ModelConfig:
         self._verify_embedding_mode()
         self._verify_quantization()
         self._verify_cuda_graph()
-
+        self._verify_bnb_config()
     def _init_multimodal_config(
         self, limit_mm_per_prompt: Optional[Mapping[str, int]]
     ) -> Optional["MultiModalConfig"]:
@@ -450,6 +450,28 @@ class ModelConfig:
         self.max_seq_len_to_capture = min(self.max_seq_len_to_capture,
                                           self.max_model_len)
 
+    def _verify_bnb_config(self) -> None:
+        """
+        The current version of bitsandbytes (0.45.1) with 8-bit models does not 
+        yet support CUDA graph.
+        """
+        is_bitsandbytes = self.quantization == "bitsandbytes"
+        has_quantization_config = (getattr(self.hf_config,
+                                           "quantization_config", None)
+                                   is not None)
+        is_8bit = (self.hf_config.quantization_config.get(
+            "load_in_8bit", False) if has_quantization_config else False)
+        if all([
+                is_bitsandbytes,
+                has_quantization_config,
+                is_8bit,
+                not self.enforce_eager,
+        ]):
+            logger.warning(
+                "CUDA graph is not supported on Bitsandbytes 8bit yet, "
+                "falling back to eager mode.")
+            self.enforce_eager = True
+
     def verify_async_output_proc(self, parallel_config, speculative_config,
                                  device_config) -> None:
         if not self.use_async_output_proc:
@@ -511,11 +533,6 @@ class ModelConfig:
                 f" architectures: {_PP_SUPPORTED_MODELS}. You are using "
                 f"the following architecture: {architectures}.")
 
-        if self.quantization == "bitsandbytes" and self.enforce_eager is False:
-            logger.warning("CUDA graph is not supported on BitAndBytes yet, "
-                           "fallback to the eager mode.")
-            self.enforce_eager = True
-
         if pipeline_parallel_size > 1 and self.use_async_output_proc:
             logger.warning("Async output processor is not supported with "
                            "pipeline parallelism currently. Disabling it.")

+ 4 - 4
aphrodite/modeling/model_loader/loader.py

@@ -856,12 +856,12 @@ class BitsAndBytesModelLoader(BaseModelLoader):
         # only load the bitsandbytes module when needed
         try:
             import bitsandbytes
-            if bitsandbytes.__version__ < "0.42.0":
+            if bitsandbytes.__version__ < "0.45.1":
                 raise ImportError("bitsandbytes version is wrong. Please "
-                                  "install bitsandbytes>=0.42.0.")
+                                  "install bitsandbytes>=0.45.1.")
         except ImportError as err:
-            raise ImportError("Please install bitsandbytes>=0.42.0 via "
-                              "`pip install bitsandbytes>=0.42.0` to use "
+            raise ImportError("Please install bitsandbytes>=0.45.1 via "
+                              "`pip install bitsandbytes>=0.45.1` to use "
                               "bitsandbytes quantizer.") from err
 
         hf_weights_files, use_safetensors = self._prepare_weights(

+ 4 - 4
aphrodite/quantization/bitsandbytes.py

@@ -114,12 +114,12 @@ class BitsAndBytesLinearMethod(LinearMethodBase):
     def __init__(self, quant_config: BitsAndBytesConfig):
         try:
             import bitsandbytes
-            if bitsandbytes.__version__ < "0.42.0":
+            if bitsandbytes.__version__ < "0.45.1":
                 raise ImportError("bitsandbytes version is wrong. Please "
-                                  "install bitsandbytes>=0.42.0.")
+                                  "install bitsandbytes>=0.45.1.")
         except ImportError as err:
-            raise ImportError("Please install bitsandbytes>=0.42.0 via "
-                              "`pip install bitsandbytes>=0.42.0` to use "
+            raise ImportError("Please install bitsandbytes>=0.45.1 via "
+                              "`pip install bitsandbytes>=0.45.1` to use "
                               "bitsandbytes quantizer.") from err
 
         self.quant_config = quant_config

+ 1 - 1
tests/quantization/test_bitsandbytes.py

@@ -95,7 +95,7 @@ def validate_generated_texts(hf_runner,
     with aphrodite_runner(model_name,
                      quantization='bitsandbytes',
                      load_format='bitsandbytes',
-                     enforce_eager=True,
+                     enforce_eager=False,
                      gpu_memory_utilization=0.8) as llm:
         aphrodite_outputs = llm.generate_greedy(prompts, 8)
         aphrodite_logs = log_generated_texts(prompts, aphrodite_outputs,