{
  "sglang": {
    "llama": {
      "gemm|nvjet": "gemm",
      "fused_moe_kernel|GroupProblemShape|group_gemm_starts|bmm_|GemmUniversal": "moe_gemm",
      "moe|sigmoid": "moe",
      "CatArrayBatched|prepare_inputs": "prepare_next",
      "ncclDevKernel|cross_device_reduce": "nccl_and_custom_ar",
      "_norm_|Norm": "norm",
      "topk": "topk",
      "act_and_mul_": "activation",
      "Rotary": "rope",
      "SoftMax": "softmax",
      "flash|fmha": "attn",
      "elementwise": "elementwise",
      "fp8_quant|cvt_|quantize": "quantize",
      "reduce_kernel": "reduce",
      "triton": "triton_kernel",
      "CUDA mem": "non-gpu-H_D_memops",
      ".*": "misc"
    },
    "ds": {
      "block_fp8_matmul": "block_fp8_gemm",
      "gemm|matmul|nvjet": "gemm",
      "fused_moe_kernel": "moe_gemm",
      "moe|expert|sigmoid": "moe",
      "CatArrayBatched|write_req_to": "prepare_next",
      "ncclDevKernel|cross_device_reduce|all_gather": "nccl_and_custom_ar",
      "Norm": "norm",
      "topk": "topk",
      "activation|act_and_mul": "activation",
      "compute_position_kernel": "rope",
      "elementwise": "elementwise",
      "fp8_quant|quant_fp8|quantize": "quantize",
      "SoftMax": "softmax",
      "reduce": "reduce",
      "_fwd_|create_flash|::mla::|KVCache": "attn",
      "CUDA mem": "non-gpu-H_D_memops",
      ".*": "misc"
    },
    "gpt-oss": {
      "gemm|nvjet": "gemm",
      "fused_moe_kernel|_group_gemm|GroupProblemShape|GemmUniversal|bmm_|matmul_ogs_|_topk_forward|_combined_routing|_sum_bitmatrix_rows|_compute_writeback_idx": "moe_gemm",
      "moe|sigmoid": "moe",
      "CatArrayBatched|prepare_inputs": "prepare_next",
      "_norm_|Norm": "norm",
      "ncclDevKernel|cross_device_reduce|allreduce": "nccl_and_custom_ar",
      "topk|TopK": "topk",
      "act_and_mul_": "activation",
      "Rotary": "rope",
      "SoftMax": "softmax",
      "flash|fmha": "attn",
      "elementwise": "elementwise",
      "fp8_quant|cvt_|quantize": "quantize",
      "reduce_kernel": "reduce",
      "triton": "triton_kernel",
      "CUDA mem": "non-gpu-H_D_memops",
      ".*": "misc"
    }
  }
}