|
@@ -17,17 +17,27 @@ from aphrodite.distributed import (divide,
|
|
|
tensor_model_parallel_all_reduce)
|
|
|
from aphrodite.modeling.parameter import (BaseAphroditeParameter,
|
|
|
PackedAphroditeParameter,
|
|
|
- PerTensorScaleParameter)
|
|
|
+ PackedColumnParameter,
|
|
|
+ PerTensorScaleParameter,
|
|
|
+ RowAphroditeParameter)
|
|
|
# yapf: enable
|
|
|
from aphrodite.modeling.utils import set_weight_attrs
|
|
|
from aphrodite.quantization.base_config import (QuantizationConfig,
|
|
|
QuantizeMethodBase)
|
|
|
|
|
|
WEIGHT_LOADER_V2_SUPPORTED = [
|
|
|
- "CompressedTensorsLinearMethod", "GPTQMarlinLinearMethod",
|
|
|
- "AWQMarlinLinearMethod", "AWQLinearMethod", "HQQMarlinMethod",
|
|
|
- "Fp8LinearMethod", "MarlinLinearMethod", "QQQLinearMethod",
|
|
|
- "GPTQMarlin24LinearMethod", "TPUInt8LinearMethod"
|
|
|
+ "AWQLinearMethod",
|
|
|
+ "AWQMarlinLinearMethod",
|
|
|
+ "CompressedTensorsLinearMethod",
|
|
|
+ "FBGEMMFp8LinearMethod",
|
|
|
+ "Fp8LinearMethod",
|
|
|
+ "GPTQLinearMethod",
|
|
|
+ "GPTQMarlin24LinearMethod",
|
|
|
+ "GPTQMarlinLinearMethod",
|
|
|
+ "HQQMarlinMethod",
|
|
|
+ "MarlinLinearMethod",
|
|
|
+ "QQQLinearMethod",
|
|
|
+ "TPUInt8LinearMethod",
|
|
|
]
|
|
|
|
|
|
|
|
@@ -601,8 +611,9 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
|
|
|
# Special case for Quantization.
|
|
|
# If quantized, we need to adjust the offset and size to account
|
|
|
# for the packing.
|
|
|
- if isinstance(param, PackedAphroditeParameter
|
|
|
- ) and param.packed_dim == param.output_dim:
|
|
|
+ if isinstance(param, (PackedColumnParameter,
|
|
|
+ PackedAphroditeParameter,
|
|
|
+ )) and param.packed_dim == param.output_dim:
|
|
|
shard_size, shard_offset = \
|
|
|
param.adjust_shard_indexes_for_packing(
|
|
|
shard_size=shard_size, shard_offset=shard_offset)
|
|
@@ -621,7 +632,7 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
|
|
|
param.load_merged_column_weight(loaded_weight=loaded_weight,
|
|
|
shard_id=0)
|
|
|
return
|
|
|
- elif type(param) is BaseAphroditeParameter:
|
|
|
+ elif type(param) in (RowAphroditeParameter, BaseAphroditeParameter):
|
|
|
param.load_merged_column_weight(loaded_weight=loaded_weight)
|
|
|
return
|
|
|
self._load_fused_module_from_checkpoint(param, loaded_weight)
|
|
@@ -760,8 +771,9 @@ class QKVParallelLinear(ColumnParallelLinear):
|
|
|
# Special case for Quantization.
|
|
|
# If quantized, we need to adjust the offset and size to account
|
|
|
# for the packing.
|
|
|
- if isinstance(param, PackedAphroditeParameter
|
|
|
- ) and param.packed_dim == param.output_dim:
|
|
|
+ if isinstance(param, (PackedColumnParameter,
|
|
|
+ PackedAphroditeParameter,
|
|
|
+ )) and param.packed_dim == param.output_dim:
|
|
|
shard_size, shard_offset = \
|
|
|
param.adjust_shard_indexes_for_packing(
|
|
|
shard_size=shard_size, shard_offset=shard_offset)
|
|
@@ -777,11 +789,10 @@ class QKVParallelLinear(ColumnParallelLinear):
|
|
|
loaded_shard_id: Optional[str] = None):
|
|
|
if loaded_shard_id is None: # special case for certain models
|
|
|
if isinstance(param, PerTensorScaleParameter):
|
|
|
- param.load_merged_column_weight(loaded_weight=loaded_weight,
|
|
|
- shard_id=0)
|
|
|
+ param.load_qkv_weight(loaded_weight=loaded_weight, shard_id=0)
|
|
|
return
|
|
|
- elif type(param) is BaseAphroditeParameter:
|
|
|
- param.load_merged_column_weight(loaded_weight=loaded_weight)
|
|
|
+ elif type(param) in (RowAphroditeParameter, BaseAphroditeParameter):
|
|
|
+ param.load_qkv_weight(loaded_weight=loaded_weight)
|
|
|
return
|
|
|
self._load_fused_module_from_checkpoint(param, loaded_weight)
|
|
|
return
|