Support loading of larger models with on-the-fly quantization (#3061)

This commit is contained in:
Ke Wen
2025-01-22 21:33:17 -08:00
committed by GitHub
parent 8b84e69f25
commit 862bcff833
6 changed files with 116 additions and 14 deletions

View File

@@ -185,9 +185,12 @@ class ModelRunner:
self.load_model()
# Apply torchao quantization
apply_torchao_config_to_model(
self.model, global_server_args_dict["torchao_config"]
)
torchao_applied = getattr(self.model, "torchao_applied", False)
# In layered loading, torchao may have already been applied
if not torchao_applied:
apply_torchao_config_to_model(
self.model, global_server_args_dict["torchao_config"]
)
# Apply torch TP if the model supports it
supports_torch_tp = getattr(self.model, "supports_torch_tp", False)