From 25f653a739377c0811131719440af45250ef9cf6 Mon Sep 17 00:00:00 2001 From: ModelHub XC Date: Wed, 3 Jun 2026 19:44:14 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=EF=BC=8C=E7=94=B1ModelHub=20XC=E7=A4=BE=E5=8C=BA=E6=8F=90?= =?UTF-8?q?=E4=BE=9B=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model: RedHatAI/Mistral-Small-24B-Instruct-2501-quantized.w8a8 Source: Original Platform --- .gitattributes | 38 +++ README.md | 394 +++++++++++++++++++++++++++++++ config.json | 3 + configuration.json | 1 + generation_config.json | 8 + model-00001-of-00006.safetensors | 3 + model-00002-of-00006.safetensors | 3 + model-00003-of-00006.safetensors | 3 + model-00004-of-00006.safetensors | 3 + model-00005-of-00006.safetensors | 3 + model-00006-of-00006.safetensors | 3 + model.safetensors.index.json | 3 + recipe.yaml | 20 ++ special_tokens_map.json | 3 + tekken.json | 3 + tokenizer.json | 3 + tokenizer_config.json | 3 + 17 files changed, 497 insertions(+) create mode 100644 .gitattributes create mode 100644 README.md create mode 100644 config.json create mode 100644 configuration.json create mode 100644 generation_config.json create mode 100644 model-00001-of-00006.safetensors create mode 100644 model-00002-of-00006.safetensors create mode 100644 model-00003-of-00006.safetensors create mode 100644 model-00004-of-00006.safetensors create mode 100644 model-00005-of-00006.safetensors create mode 100644 model-00006-of-00006.safetensors create mode 100644 model.safetensors.index.json create mode 100644 recipe.yaml create mode 100644 special_tokens_map.json create mode 100644 tekken.json create mode 100644 tokenizer.json create mode 100644 tokenizer_config.json diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..d8d7406 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,38 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs 
-text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text + +tekken.json filter=lfs diff=lfs merge=lfs -text \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..7b37148 --- /dev/null +++ b/README.md @@ -0,0 +1,394 @@ +--- +language: +- en +- fr +- de +- es +- it +- pt +- zh +- ja +- ru +- ko +base_model: +- mistralai/Mistral-Small-24B-Instruct-2501 +pipeline_tag: text-generation +tags: +- mistral +- mistral-small +- quantized +- W8A8 +- vllm +- conversational +- 
text-generation-inference +- compressed-tensors +license: apache-2.0 +license_name: apache-2.0 +name: RedHatAI/Mistral-Small-24B-Instruct-2501-quantized.w8a8 +description: This model was obtained by quantizing the weights and activations of Mistral-Small-24B-Instruct-2501 to INT8 data type. +readme: https://huggingface.co/RedHatAI/Mistral-Small-24B-Instruct-2501-quantized.w8a8/main/README.md +tasks: +- text-to-text +provider: Red Hat +license_link: https://www.apache.org/licenses/LICENSE-2.0 +validated_on: + - RHOAI 2.20 + - RHAIIS 3.0 + - RHELAI 1.5 +--- + +

+ Mistral-Small-24B-Instruct-2501-quantized.w8a8 + Model Icon +

+ + +Validated Badge + + +## Model Overview +- **Model Architecture:** Mistral3ForConditionalGeneration + - **Input:** Text / Image + - **Output:** Text +- **Model Optimizations:** + - **Activation quantization:** INT8 + - **Weight quantization:** INT8 +- **Intended Use Cases:** It is ideal for: + - Fast-response conversational agents. + - Low-latency function calling. + - Subject matter experts via fine-tuning. + - Local inference for hobbyists and organizations handling sensitive data. + - Programming and math reasoning. + - Long document understanding. + - Visual understanding. +- **Out-of-scope:** Use in any manner that violates applicable laws or regulations (including trade compliance laws). Use in languages not officially supported by the model. +- **Release Date:** 03/03/2025 +- **Version:** 1.0 +- **Validated on:** RHOAI 2.20, RHAIIS 3.0, RHELAI 1.5 +- **Model Developers:** Red Hat (Neural Magic) + + +### Model Optimizations + +This model was obtained by quantizing activations and weights of [Mistral-Small-24B-Instruct-2501](https://huggingface.co/mistralai/Mistral-Small-24B-Instruct-2501) to INT8 data type. +This optimization reduces the number of bits used to represent weights and activations from 16 to 8, reducing GPU memory requirements (by approximately 50%) and increasing matrix-multiply compute throughput (by approximately 2x). +Weight quantization also reduces disk size requirements by approximately 50%. + +Only weights and activations of the linear operators within transformer blocks are quantized. +Weights are quantized with a symmetric static per-channel scheme, whereas activations are quantized with a symmetric dynamic per-token scheme. +A combination of the [SmoothQuant](https://arxiv.org/abs/2211.10438) and [GPTQ](https://arxiv.org/abs/2210.17323) algorithms is applied for quantization, as implemented in the [llm-compressor](https://github.com/vllm-project/llm-compressor) library. + + +## Deployment + +1.
Initialize vLLM server: +``` +vllm serve RedHatAI/Mistral-Small-24B-Instruct-2501-quantized.w8a8 --tensor_parallel_size 1 --tokenizer_mode mistral +``` + +2. Send requests to the server: + +```python +from openai import OpenAI + +# Modify OpenAI's API key and API base to use vLLM's API server. +openai_api_key = "EMPTY" +openai_api_base = "http://<your-server-host>:8000/v1" + +client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, +) + +model = "RedHatAI/Mistral-Small-24B-Instruct-2501-quantized.w8a8" + + +messages = [ + {"role": "user", "content": "Explain quantum mechanics clearly and concisely."}, +] + +outputs = client.chat.completions.create( + model=model, + messages=messages, +) + +generated_text = outputs.choices[0].message.content +print(generated_text) +``` + +
+ Deploy on Red Hat AI Inference Server + +```bash +podman run --rm -it --device nvidia.com/gpu=all -p 8000:8000 \ + --ipc=host \ +--env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \ +--env "HF_HUB_OFFLINE=0" -v ~/.cache/vllm:/home/vllm/.cache \ +--name=vllm \ +registry.access.redhat.com/rhaiis/rh-vllm-cuda \ +vllm serve \ +--tensor-parallel-size 8 \ +--max-model-len 32768 \ +--enforce-eager --model RedHatAI/Mistral-Small-24B-Instruct-2501-quantized.w8a8 +``` +See [Red Hat AI Inference Server documentation](https://docs.redhat.com/en/documentation/red_hat_ai_inference_server/) for more details. +
+ +
+ Deploy on Red Hat Enterprise Linux AI + +```bash +# Download model from Red Hat Registry via docker +# Note: This downloads the model to ~/.cache/instructlab/models unless --model-dir is specified. +ilab model download --repository docker://registry.redhat.io/rhelai1/mistral-small-24b-instruct-2501-quantized-w8a8:1.5 +``` + +```bash +# Serve model via ilab +ilab model serve --model-path ~/.cache/instructlab/models/mistral-small-24b-instruct-2501-quantized-w8a8 + +# Chat with model +ilab model chat --model ~/.cache/instructlab/models/mistral-small-24b-instruct-2501-quantized-w8a8 +``` +See [Red Hat Enterprise Linux AI documentation](https://docs.redhat.com/en/documentation/red_hat_enterprise_linux_ai/1.5) for more details. +
+ +
+ Deploy on Red Hat Openshift AI + +```yaml +# Setting up vllm server with ServingRuntime +# Save as: vllm-servingruntime.yaml +apiVersion: serving.kserve.io/v1alpha1 +kind: ServingRuntime +metadata: + name: vllm-cuda-runtime # OPTIONAL CHANGE: set a unique name + annotations: + openshift.io/display-name: vLLM NVIDIA GPU ServingRuntime for KServe + opendatahub.io/recommended-accelerators: '["nvidia.com/gpu"]' + labels: + opendatahub.io/dashboard: 'true' +spec: + annotations: + prometheus.io/port: '8080' + prometheus.io/path: '/metrics' + multiModel: false + supportedModelFormats: + - autoSelect: true + name: vLLM + containers: + - name: kserve-container + image: quay.io/modh/vllm:rhoai-2.20-cuda # CHANGE if needed. If AMD: quay.io/modh/vllm:rhoai-2.20-rocm + command: + - python + - -m + - vllm.entrypoints.openai.api_server + args: + - "--port=8080" + - "--model=/mnt/models" + - "--served-model-name={{.Name}}" + env: + - name: HF_HOME + value: /tmp/hf_home + ports: + - containerPort: 8080 + protocol: TCP +``` + +```yaml +# Attach model to vllm server. This is an NVIDIA template +# Save as: inferenceservice.yaml +apiVersion: serving.kserve.io/v1beta1 +kind: InferenceService +metadata: + annotations: + openshift.io/display-name: mistral-small-24b-instruct-2501-quantized-w8a8 # OPTIONAL CHANGE + serving.kserve.io/deploymentMode: RawDeployment + name: mistral-small-24b-instruct-2501-quantized-w8a8 # specify model name.
This value will be used to invoke the model in the payload + labels: + opendatahub.io/dashboard: 'true' +spec: + predictor: + maxReplicas: 1 + minReplicas: 1 + model: + modelFormat: + name: vLLM + name: '' + resources: + limits: + cpu: '2' # this is model specific + memory: 8Gi # this is model specific + nvidia.com/gpu: '1' # this is accelerator specific + requests: # same comment for this block + cpu: '1' + memory: 4Gi + nvidia.com/gpu: '1' + runtime: vllm-cuda-runtime # must match the ServingRuntime name above + storageUri: oci://registry.redhat.io/rhelai1/modelcar-mistral-small-24b-instruct-2501-quantized-w8a8:1.5 + tolerations: + - effect: NoSchedule + key: nvidia.com/gpu + operator: Exists +``` + +```bash +# make sure first to be in the project where you want to deploy the model +# oc project <project-name> + +# apply both resources to run model + +# Apply the ServingRuntime +oc apply -f vllm-servingruntime.yaml + +# Apply the InferenceService +oc apply -f inferenceservice.yaml +``` + +```bash +# Replace <inference-service-name> and <cluster-domain> below: +# - Run `oc get inferenceservice` to find your URL if unsure. + +# Call the server using curl: +curl https://<inference-service-name>-predictor-default.<cluster-domain>/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "mistral-small-24b-instruct-2501-quantized-w8a8", + "stream": true, + "stream_options": { + "include_usage": true + }, + "max_tokens": 1, + "messages": [ + { + "role": "user", + "content": "How can a bee fly when its wings are so small?" + } + ] +}' + +``` + +See [Red Hat Openshift AI documentation](https://docs.redhat.com/en/documentation/red_hat_openshift_ai/2025) for more details. +
+ +## Creation + +
+ Creation details + This model was created with [llm-compressor](https://github.com/vllm-project/llm-compressor) by running the code snippet below. + + + ```python + from transformers import AutoTokenizer, AutoModelForCausalLM + from llmcompressor.modifiers.quantization import GPTQModifier + from llmcompressor.modifiers.smoothquant import SmoothQuantModifier + from llmcompressor.transformers import oneshot + from datasets import load_dataset + + # Load model + model_stub = "mistralai/Mistral-Small-24B-Instruct-2501" + model_name = model_stub.split("/")[-1] + + num_samples = 1024 + max_seq_len = 8192 + + tokenizer = AutoTokenizer.from_pretrained(model_stub) + + model = AutoModelForCausalLM.from_pretrained( + model_stub, + device_map="auto", + torch_dtype="auto", + ) + + # Data processing + def preprocess_text(example): + text = tokenizer.apply_chat_template(example["messages"], tokenize=False, add_generation_prompt=False) + return tokenizer(text, padding=False, max_length=max_seq_len, truncation=True) + + ds = load_dataset("neuralmagic/calibration", name="LLM", split="train").select(range(num_samples)) + ds = ds.map(preprocess_text, remove_columns=ds.column_names) + + # Configure the quantization algorithm and scheme + recipe = [ + SmoothQuantModifier( + smoothing_strength=0.9, + mappings=[ + [["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"], "re:.*input_layernorm"], + [["re:.*gate_proj", "re:.*up_proj"], "re:.*post_attention_layernorm"], + [["re:.*down_proj"], "re:.*up_proj"], + ], + ), + GPTQModifier( + ignore=["lm_head"], + sequential_targets=["MistralDecoderLayer"], + dampening_frac=0.1, + targets="Linear", + scheme="W8A8", + ), + ] + + # Apply quantization + oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=max_seq_len, + num_calibration_samples=num_samples + ) + + # Save to disk in compressed-tensors format + save_path = model_name + "-quantized.w8a8" + model.save_pretrained(save_path) + tokenizer.save_pretrained(save_path) + print(f"Model
and tokenizer saved to: {save_path}") + ``` +
+ + + +## Evaluation + +The model was evaluated on OpenLLM Leaderboard [V1](https://huggingface.co/spaces/open-llm-leaderboard-old/open_llm_leaderboard) and [V2](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/), using the following commands: + +OpenLLM Leaderboard V1: +``` +lm_eval \ + --model vllm \ + --model_args pretrained="RedHatAI/Mistral-Small-24B-Instruct-2501-quantized.w8a8",dtype=auto,add_bos_token=True,max_model_len=4096,tensor_parallel_size=1,gpu_memory_utilization=0.8,enable_chunked_prefill=True,trust_remote_code=True \ + --tasks openllm \ + --write_out \ + --batch_size auto \ + --output_path output_dir \ + --show_config +``` + +OpenLLM Leaderboard V2: +``` +lm_eval \ + --model vllm \ + --model_args pretrained="RedHatAI/Mistral-Small-24B-Instruct-2501-quantized.w8a8",dtype=auto,add_bos_token=False,max_model_len=4096,tensor_parallel_size=1,gpu_memory_utilization=0.8,enable_chunked_prefill=True,trust_remote_code=True \ + --apply_chat_template \ + --fewshot_as_multiturn \ + --tasks leaderboard \ + --write_out \ + --batch_size auto \ + --output_path output_dir \ + --show_config + +``` + +### Accuracy + +#### OpenLLM Leaderboard V1 evaluation scores + +| Metric | mistralai/Mistral-Small-24B-Instruct-2501 | RedHatAI/Mistral-Small-24B-Instruct-2501-quantized.w8a8 | +|-----------------------------------------|:---------------------------------:|:-------------------------------------------:| +| ARC-Challenge (Acc-Norm, 25-shot) | 72.18 | 68.86 | +| GSM8K (Strict-Match, 5-shot) | 90.14 | 90.00 | +| HellaSwag (Acc-Norm, 10-shot) | 85.05 | 85.06 | +| MMLU (Acc, 5-shot) | 80.69 | 80.25 | +| TruthfulQA (MC2, 0-shot) | 65.55 | 65.69 | +| Winogrande (Acc, 5-shot) | 83.11 | 81.69 | +| **Average Score** | **79.45** | **78.59** | +| **Recovery (%)** | **100.00** | **98.92** | diff --git a/config.json b/config.json new file mode 100644 index 0000000..2997044 --- /dev/null +++ b/config.json @@ -0,0 +1,3 @@ +version
https://git-lfs.github.com/spec/v1 +oid sha256:1eeb1e8c88cd9f8b8c8333998160f50b1ca0b760b4d0577c688b2b953d43359d +size 1747 diff --git a/configuration.json b/configuration.json new file mode 100644 index 0000000..4aef15d --- /dev/null +++ b/configuration.json @@ -0,0 +1 @@ +{"framework": "pytorch", "task": "image-text-to-text", "allow_remote": true} \ No newline at end of file diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..48021ec --- /dev/null +++ b/generation_config.json @@ -0,0 +1,8 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": 2, + "temperature": 0.15, + "transformers_version": "4.49.0" +} diff --git a/model-00001-of-00006.safetensors b/model-00001-of-00006.safetensors new file mode 100644 index 0000000..ede164b --- /dev/null +++ b/model-00001-of-00006.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc32b2bab92804d315eb42d4e03f796596c2b24b2d9b34636e57eaeba2e22c9a +size 4898056368 diff --git a/model-00002-of-00006.safetensors b/model-00002-of-00006.safetensors new file mode 100644 index 0000000..c7cfe68 --- /dev/null +++ b/model-00002-of-00006.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98e29a90c01f9e217b5c101d68f157e84e20464867c10b7d453710dd207f0c77 +size 4835544720 diff --git a/model-00003-of-00006.safetensors b/model-00003-of-00006.safetensors new file mode 100644 index 0000000..ed86c68 --- /dev/null +++ b/model-00003-of-00006.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd955b7feb43181e3c0495bb057b969c0d10264775f6ce944a526f5b7eb0d4bb +size 4982400728 diff --git a/model-00004-of-00006.safetensors b/model-00004-of-00006.safetensors new file mode 100644 index 0000000..1b6143b --- /dev/null +++ b/model-00004-of-00006.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:03ccad2ea3c2bfe86e9d27f52a5a087140543a3f263ae49e8a45b0ff774476b4 +size 4998137560 diff --git a/model-00005-of-00006.safetensors b/model-00005-of-00006.safetensors new file mode 100644 index 0000000..cb3ba72 --- /dev/null +++ b/model-00005-of-00006.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42c03c7ac1003499657d8ec94242e04a122828a1033f19721577c385e4222fa0 +size 3865305096 diff --git a/model-00006-of-00006.safetensors b/model-00006-of-00006.safetensors new file mode 100644 index 0000000..40415b3 --- /dev/null +++ b/model-00006-of-00006.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58034435463df5dfb748849bb5002d84b1ad1a1b308dfe26f5420e3e0a340cba +size 1342177408 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000..01f534f --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05039e40f7d6767bfcb8ce5e73ac130fb6677e8389245713f5d75dcf9dbc8a1b +size 54304 diff --git a/recipe.yaml b/recipe.yaml new file mode 100644 index 0000000..3d92f93 --- /dev/null +++ b/recipe.yaml @@ -0,0 +1,20 @@ +quant_stage: + quant_modifiers: + SmoothQuantModifier: + smoothing_strength: 0.9 + mappings: + - - ['re:.*q_proj', 're:.*k_proj', 're:.*v_proj'] + - re:.*input_layernorm + - - ['re:.*gate_proj', 're:.*up_proj'] + - re:.*post_attention_layernorm + - - ['re:.*down_proj'] + - re:.*up_proj + GPTQModifier: + ignore: [lm_head] + dampening_frac: 0.09999999999999999 + config_groups: + group_0: + targets: [Linear] + weights: {num_bits: 8, type: int, symmetric: true, strategy: channel, observer: mse} + input_activations: {num_bits: 8, type: int, symmetric: true, strategy: token, dynamic: true, + observer: memoryless} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..e730fbf --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:5bc5d1c106bf86895473568ef5211f3fde8415e6fdee039223d6db8ca52cbe01 +size 21311 diff --git a/tekken.json b/tekken.json new file mode 100644 index 0000000..f973db9 --- /dev/null +++ b/tekken.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4b90a968dbc67ef3975129d0b78a2e3cbb6bea340ab9205f22e8a0308b1ffc5 +size 14801223 diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..4135d4b --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b76085f9923309d873994d444989f7eb6ec074b06f25b58f1e8d7b7741070949 +size 17078037 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..9afdb72 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68451517cb759df19bda82ea07c4db6606f87a4bde1bb895b41bc4968126a9b7 +size 199699