port fp8 mixtral (#460)

This commit is contained in:
Lianmin Zheng
2024-05-21 11:46:35 -07:00
committed by GitHub
parent 19d2135cb8
commit 0fafc5606b
6 changed files with 633 additions and 118 deletions

View File

@@ -15,6 +15,7 @@ class ServerArgs:
chat_template: Optional[str] = None
trust_remote_code: bool = True
context_length: Optional[int] = None
quantization: Optional[str] = None
# Port
host: str = "127.0.0.1"
@@ -135,6 +136,12 @@ class ServerArgs:
default=ServerArgs.context_length,
help="The model's maximum context length. Defaults to None (will use the value from the model's config.json instead).",
)
parser.add_argument(
"--quantization",
type=str,
default=ServerArgs.quantization,
help="The quantization method.",
)
parser.add_argument(
"--mem-fraction-static",
type=float,