Ver Fonte

chore: bump bitsandbytes version to latest; enable cuda graphs for 4bit bnb (#1123)

AlpinDale há 1 mês
pai
commit
349a612338

+ 23 - 6
aphrodite/common/config.py

@@ -260,7 +260,7 @@ class ModelConfig:
         self._verify_embedding_mode()
         self._verify_quantization()
         self._verify_cuda_graph()
-
+        self._verify_bnb_config()
     def _init_multimodal_config(
         self, limit_mm_per_prompt: Optional[Mapping[str, int]]
     ) -> Optional["MultiModalConfig"]:
@@ -450,6 +450,28 @@ class ModelConfig:
         self.max_seq_len_to_capture = min(self.max_seq_len_to_capture,
                                           self.max_model_len)
 
+    def _verify_bnb_config(self) -> None:
+        """
+        The current version of bitsandbytes (0.45.1) with 8-bit models does not 
+        yet support CUDA graph.
+        """
+        is_bitsandbytes = self.quantization == "bitsandbytes"
+        has_quantization_config = (getattr(self.hf_config,
+                                           "quantization_config", None)
+                                   is not None)
+        is_8bit = (self.hf_config.quantization_config.get(
+            "load_in_8bit", False) if has_quantization_config else False)
+        if all([
+                is_bitsandbytes,
+                has_quantization_config,
+                is_8bit,
+                not self.enforce_eager,
+        ]):
+            logger.warning(
+                "CUDA graph is not supported on Bitsandbytes 8bit yet, "
+                "falling back to eager mode.")
+            self.enforce_eager = True
+
     def verify_async_output_proc(self, parallel_config, speculative_config,
                                  device_config) -> None:
         if not self.use_async_output_proc:
@@ -511,11 +533,6 @@ class ModelConfig:
                 f" architectures: {_PP_SUPPORTED_MODELS}. You are using "
                 f"the following architecture: {architectures}.")
 
-        if self.quantization == "bitsandbytes" and self.enforce_eager is False:
-            logger.warning("CUDA graph is not supported on BitAndBytes yet, "
-                           "fallback to the eager mode.")
-            self.enforce_eager = True
-
         if pipeline_parallel_size > 1 and self.use_async_output_proc:
             logger.warning("Async output processor is not supported with "
                            "pipeline parallelism currently. Disabling it.")

+ 4 - 4
aphrodite/modeling/model_loader/loader.py

@@ -856,12 +856,12 @@ class BitsAndBytesModelLoader(BaseModelLoader):
         # only load the bitsandbytes module when needed
         try:
             import bitsandbytes
-            if bitsandbytes.__version__ < "0.42.0":
+            if bitsandbytes.__version__ < "0.45.1":
                 raise ImportError("bitsandbytes version is wrong. Please "
-                                  "install bitsandbytes>=0.42.0.")
+                                  "install bitsandbytes>=0.45.1.")
         except ImportError as err:
-            raise ImportError("Please install bitsandbytes>=0.42.0 via "
-                              "`pip install bitsandbytes>=0.42.0` to use "
+            raise ImportError("Please install bitsandbytes>=0.45.1 via "
+                              "`pip install bitsandbytes>=0.45.1` to use "
                               "bitsandbytes quantizer.") from err
 
         hf_weights_files, use_safetensors = self._prepare_weights(

+ 4 - 4
aphrodite/quantization/bitsandbytes.py

@@ -114,12 +114,12 @@ class BitsAndBytesLinearMethod(LinearMethodBase):
     def __init__(self, quant_config: BitsAndBytesConfig):
         try:
             import bitsandbytes
-            if bitsandbytes.__version__ < "0.42.0":
+            if bitsandbytes.__version__ < "0.45.1":
                 raise ImportError("bitsandbytes version is wrong. Please "
-                                  "install bitsandbytes>=0.42.0.")
+                                  "install bitsandbytes>=0.45.1.")
         except ImportError as err:
-            raise ImportError("Please install bitsandbytes>=0.42.0 via "
-                              "`pip install bitsandbytes>=0.42.0` to use "
+            raise ImportError("Please install bitsandbytes>=0.45.1 via "
+                              "`pip install bitsandbytes>=0.45.1` to use "
                               "bitsandbytes quantizer.") from err
 
         self.quant_config = quant_config

+ 1 - 1
tests/quantization/test_bitsandbytes.py

@@ -95,7 +95,7 @@ def validate_generated_texts(hf_runner,
     with aphrodite_runner(model_name,
                      quantization='bitsandbytes',
                      load_format='bitsandbytes',
-                     enforce_eager=True,
+                     enforce_eager=False,
                      gpu_memory_utilization=0.8) as llm:
         aphrodite_outputs = llm.generate_greedy(prompts, 8)
         aphrodite_logs = log_generated_texts(prompts, aphrodite_outputs,