llama : add gpt-oss (#15091)
* oai moe
* compat with new checkpoint
* add attn sink impl
* add rope scaling yarn
* logits match with latest transformers code
* wip chat template
* rm trailing space
* use ggml_scale_bias
* rm redundant is_swa_all
* convert interleaved gate_up
* graph : fix activation function to match reference (#7)
* vocab : handle o200k_harmony special tokens
* ggml : add attention sinks support (#1)
* llama : add attn sinks
* ggml : add attn sinks
* cuda : add attn sinks
* vulkan : add support for sinks in softmax
  remove unnecessary return
* ggml : add fused swiglu_oai op (#11)
* ggml : add fused swiglu_oai op
* Update ggml/src/ggml-cpu/ops.cpp
  Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* update CUDA impl
* cont : metal impl
* add vulkan impl
* test-backend-ops : more test cases, clean up
* llama : remove unfused impl
* remove extra lines

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>

* repack mxfp4 upon conversion
* clean up a bit
* enable thinking
* add quick hack to render only some special tokens
* fix bf16 conversion
* remove vocab hack
* webui ok
* support chat parsing for gpt-oss
* fix webui
* direct mapping mxfp4, FINALLY
* force using mxfp4
* properly use lazy tensor
* ggml : add mxfp4
  ggml : use e8m0 conversion instead of powf
  Co-authored-by: Diego Devesa <slarengh@gmail.com>
  change kvalues_mxfp4 table to match e2m1 (#6)
  metal : remove quantization for now (not used)
  cuda : fix disabled CUDA graphs due to ffn moe bias
  vulkan : add support for mxfp4
  cont : add cm2 dequant
* ggml : add ggml_add_id (#13)
* ggml : add ggml_add_id
* add cuda impl
* llama : add weight support check for add_id
* perf opt
* add vulkan impl
* rename cuda files
* add metal impl
* allow in-place ggml_add_id
* llama : keep biases on CPU with --cpu-moe
* llama : fix compile error
  ggml-ci
* cuda : add fallback for __nv_cvt_e8m0_to_bf16raw
  ggml-ci
* cleanup
  ggml-ci
* sycl : fix supports_op for MXFP4
  ggml-ci
* fix Unknown reasoning format
* ggml-cpu : fix AVX build
  ggml-ci
* fix hip build
  ggml-ci
* cuda : add mxfp4 dequantization support for cuBLAS
  ggml-ci
* ggml-cpu : fix mxfp4 fallback definitions for some architectures
  ggml-ci
* cuda : fix version required for __nv_cvt_e8m0_to_bf16raw

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: slaren <slarengh@gmail.com>
The diff below (which appears to be from ggml/src/ggml.c) shows the core ggml-side changes: the MXFP4 type, the new ADD_ID op, the fused SWIGLU_OAI activation, and the attention-sink hooks.
```diff
@@ -582,9 +582,6 @@ FILE * ggml_fopen(const char * fname, const char * mode) {
 #endif
 }
 
 static void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc);
 static void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc);
 static void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc);
 
 static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
     [GGML_TYPE_I8] = {
```
```diff
@@ -690,6 +687,14 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .is_quantized   = true,
         .from_float_ref = (ggml_from_float_t) quantize_row_q8_1_ref,
     },
+    [GGML_TYPE_MXFP4] = {
+        .type_name      = "mxfp4",
+        .blck_size      = QK_MXFP4,
+        .type_size      = sizeof(block_mxfp4),
+        .is_quantized   = true,
+        .to_float       = (ggml_to_float_t) dequantize_row_mxfp4,
+        .from_float_ref = (ggml_from_float_t) quantize_row_mxfp4_ref,
+    },
     [GGML_TYPE_Q2_K] = {
         .type_name      = "q2_K",
         .blck_size      = QK_K,
```
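For context, MXFP4 is the OCP microscaling FP4 format: blocks of 32 E2M1 values (sign, 2-bit exponent, 1-bit mantissa) behind one shared E8M0 power-of-two scale. Below is a minimal dequantization sketch; the struct layout and nibble ordering are assumptions for illustration (ggml's actual `block_mxfp4` and `kvalues_mxfp4` table may pack differently), but the E2M1 value set {0, 0.5, 1, 1.5, 2, 3, 4, 6} and the 2^(e-127) scale are the standard definition.

```c
#include <stdint.h>
#include <math.h>

#define QK_MXFP4_SKETCH 32  // assumed block size (32 elements per block)

// Assumed layout for illustration: one E8M0 scale byte + 16 packed E2M1 nibbles.
typedef struct {
    uint8_t e;                        // E8M0 exponent, scale = 2^(e - 127)
    uint8_t qs[QK_MXFP4_SKETCH / 2];  // two 4-bit E2M1 values per byte
} block_mxfp4_sketch;

// The 8 non-negative E2M1 magnitudes; bit 3 of each nibble is the sign.
static const float e2m1_table[8] = { 0.0f, 0.5f, 1.0f, 1.5f, 2.0f, 3.0f, 4.0f, 6.0f };

static void dequantize_block_mxfp4_sketch(const block_mxfp4_sketch * x, float * y) {
    const float d = ldexpf(1.0f, (int) x->e - 127); // E8M0 is a pure power of two
    for (int i = 0; i < QK_MXFP4_SKETCH / 2; ++i) {
        const uint8_t lo = x->qs[i] & 0x0F;  // nibble order is an assumption;
        const uint8_t hi = x->qs[i] >> 4;    // ggml may interleave differently
        y[2*i + 0] = d * e2m1_table[lo & 7] * ((lo & 8) ? -1.0f : 1.0f);
        y[2*i + 1] = d * e2m1_table[hi & 7] * ((hi & 8) ? -1.0f : 1.0f);
    }
}
```

Because the scale is a pure power of two, dequantization needs only the 8 magnitudes plus an exponent shift, which is why the commit message's switch to a direct e8m0 conversion avoids `powf` entirely.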
```diff
@@ -917,6 +922,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
 
     "DUP",
     "ADD",
+    "ADD_ID",
     "ADD1",
     "ACC",
     "SUB",
```
```diff
@@ -1010,13 +1016,14 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "GLU",
 };
 
-static_assert(GGML_OP_COUNT == 86, "GGML_OP_COUNT != 86");
+static_assert(GGML_OP_COUNT == 87, "GGML_OP_COUNT != 87");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
 
     "x",
     "x+y",
+    "x[i]+y",
     "x+y",
     "view(x,nb,offset)+=y->x",
     "x-y",
```
```diff
@@ -1110,7 +1117,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "glu(x)",
 };
 
-static_assert(GGML_OP_COUNT == 86, "GGML_OP_COUNT != 86");
+static_assert(GGML_OP_COUNT == 87, "GGML_OP_COUNT != 87");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
```
||||
@@ -1140,11 +1147,12 @@ static const char * GGML_GLU_OP_NAME[GGML_GLU_OP_COUNT] = {
|
||||
"REGLU",
|
||||
"GEGLU",
|
||||
"SWIGLU",
|
||||
"SWIGLU_OAI",
|
||||
"GEGLU_ERF",
|
||||
"GEGLU_QUICK",
|
||||
};
|
||||
|
||||
static_assert(GGML_GLU_OP_COUNT == 5, "GGML_GLU_OP_COUNT != 5");
|
||||
static_assert(GGML_GLU_OP_COUNT == 6, "GGML_GLU_OP_COUNT != 6");
|
||||
|
||||
|
||||
static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
|
||||
```diff
@@ -1312,6 +1320,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
         case GGML_FTYPE_MOSTLY_Q5_0:  wtype = GGML_TYPE_Q5_0;  break;
         case GGML_FTYPE_MOSTLY_Q5_1:  wtype = GGML_TYPE_Q5_1;  break;
         case GGML_FTYPE_MOSTLY_Q8_0:  wtype = GGML_TYPE_Q8_0;  break;
+        case GGML_FTYPE_MOSTLY_MXFP4: wtype = GGML_TYPE_MXFP4; break;
         case GGML_FTYPE_MOSTLY_Q2_K:  wtype = GGML_TYPE_Q2_K;  break;
         case GGML_FTYPE_MOSTLY_Q3_K:  wtype = GGML_TYPE_Q3_K;  break;
         case GGML_FTYPE_MOSTLY_Q4_K:  wtype = GGML_TYPE_Q4_K;  break;
```
```diff
@@ -1962,6 +1971,27 @@ struct ggml_tensor * ggml_add_cast(
     return ggml_add_cast_impl(ctx, a, b, type);
 }
 
+struct ggml_tensor * ggml_add_id(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        struct ggml_tensor  * ids) {
+
+    GGML_ASSERT(a->ne[0] == b->ne[0]);
+    GGML_ASSERT(a->ne[1] == ids->ne[0]);
+    GGML_ASSERT(a->ne[2] == ids->ne[1]);
+    GGML_ASSERT(ids->type == GGML_TYPE_I32);
+
+    struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
+
+    result->op     = GGML_OP_ADD_ID;
+    result->src[0] = a;
+    result->src[1] = b;
+    result->src[2] = ids;
+
+    return result;
+}
+
 // ggml_add1
 
 static struct ggml_tensor * ggml_add1_impl(
```
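The asserts fix the shapes but not the computation, which lives in the backends. A scalar sketch of the intended semantics, inferred from the asserts and from the op's use for per-expert FFN biases (cf. `--cpu-moe` in the commit message): each row of `a` gets the row of `b` selected by the matching entry of `ids`.

```c
#include <stdint.h>

// Reference sketch (semantics inferred, not the backend code):
// dst[:, i1, i2] = a[:, i1, i2] + b[:, ids[i1, i2]]
static void add_id_ref(float * dst, const float * a, const float * b,
                       const int32_t * ids, int64_t ne0, int64_t ne1, int64_t ne2) {
    for (int64_t i2 = 0; i2 < ne2; ++i2) {
        for (int64_t i1 = 0; i1 < ne1; ++i1) {
            const int32_t row = ids[i2*ne1 + i1];          // expert index for this row
            const float * b_row = b + (int64_t) row * ne0; // selected bias row
            for (int64_t i0 = 0; i0 < ne0; ++i0) {
                dst[(i2*ne1 + i1)*ne0 + i0] = a[(i2*ne1 + i1)*ne0 + i0] + b_row[i0];
            }
        }
    }
}
```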
```diff
@@ -2812,6 +2842,19 @@ struct ggml_tensor * ggml_geglu_quick_split(
     return ggml_glu_impl(ctx, a, b, GGML_GLU_OP_GEGLU_QUICK, false);
 }
 
+struct ggml_tensor * ggml_swiglu_oai(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        float                 alpha,
+        float                 limit) {
+    struct ggml_tensor * result = ggml_glu_impl(ctx, a, b, GGML_GLU_OP_SWIGLU_OAI, false);
+    ggml_set_op_params_f32(result, 2, alpha);
+    ggml_set_op_params_f32(result, 3, limit);
+
+    return result;
+}
+
 // ggml_norm
 
 static struct ggml_tensor * ggml_norm_impl(
```
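The builder only stores `alpha` and `limit` in the op params; the per-element math is computed by each backend. A scalar reference sketch follows, assuming the clamping behavior of the gpt-oss reference activation (the formula details and the constants alpha = 1.702, limit = 7.0 are assumptions taken from that reference, not from this diff):

```c
#include <math.h>

// Scalar sketch of the fused SWIGLU_OAI per-element math (assumed from the
// gpt-oss reference activation; gpt-oss uses alpha = 1.702, limit = 7.0).
static float swiglu_oai_ref(float x, float g, float alpha, float limit) {
    x = fminf(x, limit);                // gate branch clamped from above only
    g = fmaxf(fminf(g, limit), -limit); // linear branch clamped both ways
    const float sig = 1.0f / (1.0f + expf(-alpha * x));
    return (x * sig) * (g + 1.0f);      // x * sigmoid(alpha*x) * (g + 1)
}
```

Fusing this into one GLU op avoids materializing the clamped gate and sigmoid as separate graph nodes, which is why the commit removes the unfused implementation.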
```diff
@@ -3779,6 +3822,22 @@ struct ggml_tensor * ggml_soft_max_ext(
     return ggml_soft_max_impl(ctx, a, mask, scale, max_bias, false);
 }
 
+void ggml_soft_max_add_sinks(
+        struct ggml_tensor * a,
+        struct ggml_tensor * sinks) {
+    if (!sinks) {
+        a->src[2] = NULL;
+        return;
+    }
+
+    GGML_ASSERT(a->op == GGML_OP_SOFT_MAX);
+    GGML_ASSERT(a->src[2] == NULL);
+    GGML_ASSERT(a->src[0]->ne[2] == sinks->ne[0]);
+    GGML_ASSERT(sinks->type == GGML_TYPE_F32);
+
+    a->src[2] = sinks;
+}
+
 // ggml_soft_max_ext_back
 
 static struct ggml_tensor * ggml_soft_max_ext_back_impl(
```
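An attention sink is an extra per-head logit that participates in the softmax normalization without producing an output weight, absorbing probability mass that would otherwise be forced onto real positions. A scalar sketch of one softmax row with a sink (semantics assumed from the design; the actual kernels fuse this into the soft_max and flash_attn implementations):

```c
#include <math.h>

// Scalar sketch: the sink logit joins the running max and the normalizer,
// but no output weight is emitted for it.
static void softmax_row_with_sink(float * p, const float * logits, int n, float sink) {
    float vmax = sink;
    for (int i = 0; i < n; ++i) vmax = fmaxf(vmax, logits[i]);

    float sum = expf(sink - vmax); // sink contributes to the denominator only
    for (int i = 0; i < n; ++i) {
        p[i] = expf(logits[i] - vmax);
        sum += p[i];
    }
    for (int i = 0; i < n; ++i) p[i] /= sum;
}
```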
```diff
@@ -4812,6 +4871,22 @@ enum ggml_prec ggml_flash_attn_ext_get_prec(
     return (enum ggml_prec) prec_i32;
 }
 
+void ggml_flash_attn_ext_add_sinks(
+        struct ggml_tensor * a,
+        struct ggml_tensor * sinks) {
+    if (!sinks) {
+        a->src[4] = NULL;
+        return;
+    }
+
+    GGML_ASSERT(a->op == GGML_OP_FLASH_ATTN_EXT);
+    GGML_ASSERT(a->src[4] == NULL);
+    GGML_ASSERT(a->src[0]->ne[2] == sinks->ne[0]);
+    GGML_ASSERT(sinks->type == GGML_TYPE_F32);
+
+    a->src[4] = sinks;
+}
+
 // ggml_flash_attn_back
 
 struct ggml_tensor * ggml_flash_attn_back(
```
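Both hooks mutate an already-built node rather than creating a new one, so call sites attach the sinks right after constructing the softmax (or flash-attention) node. A hypothetical call-site helper (the function name and surrounding graph code are illustrative; only the `ggml_soft_max_ext` and `ggml_soft_max_add_sinks` calls come from the API above):

```c
#include "ggml.h"

// Hypothetical helper: build the softmax node, then attach per-head sinks.
// `sinks` is F32 with one logit per head (ne[0] == n_head), matching the
// asserts in the diff; it may be NULL, in which case src[2] is cleared.
static struct ggml_tensor * attn_probs_with_sinks(
        struct ggml_context * ctx,
        struct ggml_tensor  * kq,     // raw attention scores, ne[2] == n_head
        struct ggml_tensor  * mask,   // optional KQ mask, may be NULL
        struct ggml_tensor  * sinks,  // F32 per-head sink logits, may be NULL
        float                 scale) {
    struct ggml_tensor * probs = ggml_soft_max_ext(ctx, kq, mask, scale, /*max_bias=*/0.0f);
    ggml_soft_max_add_sinks(probs, sinks); // stores sinks in src[2], per the code above
    return probs;
}
```

Passing the sinks through an extra `src` slot keeps the op signatures stable, so backends that do not yet handle sinks simply see an unexpected non-NULL source and can reject the op in `supports_op`.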
```diff
@@ -6872,6 +6947,7 @@ size_t ggml_quantize_chunk(
         case GGML_TYPE_Q5_0:  result = quantize_q5_0 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_Q5_1:  result = quantize_q5_1 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_Q8_0:  result = quantize_q8_0 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_MXFP4: result = quantize_mxfp4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_Q2_K:  result = quantize_q2_K (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_Q3_K:  result = quantize_q3_K (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_Q4_K:  result = quantize_q4_K (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
```
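With this dispatch entry in place, MXFP4 is reachable through the public quantization entry point like any other type. A usage sketch (the helper name is hypothetical; passing a NULL importance matrix assumes MXFP4 does not require one):

```c
#include "ggml.h"
#include <stdlib.h>

// Hypothetical helper: quantize `nrows` rows of f32 weights to MXFP4 via the
// public entry point. The caller owns and frees the returned buffer.
static void * quantize_rows_mxfp4(const float * src, int64_t nrows, int64_t n_per_row) {
    const size_t row_size = ggml_row_size(GGML_TYPE_MXFP4, n_per_row);
    void * dst = malloc((size_t) nrows * row_size);
    // `start` is an element offset into src; 0 quantizes from the beginning
    const size_t written = ggml_quantize_chunk(GGML_TYPE_MXFP4, src, dst,
                                               /*start=*/0, nrows, n_per_row,
                                               /*imatrix=*/NULL);
    (void) written; // equals nrows * row_size on success
    return dst;
}
```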