Update Readme (#660)
Co-authored-by: Lianmin Zheng <lianminzheng@gmail.com>
This commit is contained in:
@@ -166,6 +166,15 @@ class ServerArgs:
|
||||
"--quantization",
|
||||
type=str,
|
||||
default=ServerArgs.quantization,
|
||||
choices=[
|
||||
"awq",
|
||||
"fp8",
|
||||
"gptq",
|
||||
"marlin",
|
||||
"gptq_marlin",
|
||||
"squeezellm",
|
||||
"bitsandbytes",
|
||||
],
|
||||
help="The quantization method.",
|
||||
)
|
||||
parser.add_argument(
|
||||
@@ -243,13 +252,13 @@ class ServerArgs:
|
||||
parser.add_argument(
|
||||
"--show-time-cost",
|
||||
action="store_true",
|
||||
help="Show time cost of custom marks",
|
||||
help="Show time cost of custom marks.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--api-key",
|
||||
type=str,
|
||||
default=ServerArgs.api_key,
|
||||
help="Set API key of the server",
|
||||
help="Set API key of the server.",
|
||||
)
|
||||
|
||||
# Data parallelism
|
||||
@@ -285,17 +294,17 @@ class ServerArgs:
|
||||
parser.add_argument(
|
||||
"--disable-flashinfer",
|
||||
action="store_true",
|
||||
help="Disable flashinfer inference kernels",
|
||||
help="Disable flashinfer inference kernels.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--disable-radix-cache",
|
||||
action="store_true",
|
||||
help="Disable RadixAttention",
|
||||
help="Disable RadixAttention for prefix caching.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--disable-regex-jump-forward",
|
||||
action="store_true",
|
||||
help="Disable regex jump-forward",
|
||||
help="Disable regex jump-forward.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--disable-cuda-graph",
|
||||
|
||||
Reference in New Issue
Block a user