103 lines
1.6 KiB
JSON
103 lines
1.6 KiB
JSON
|
|
{
|
||
|
|
"a_ps": [
|
||
|
|
[
|
||
|
|
"dp",
|
||
|
|
"fsdp"
|
||
|
|
],
|
||
|
|
"sp",
|
||
|
|
"tp",
|
||
|
|
null
|
||
|
|
],
|
||
|
|
"architectures": [
|
||
|
|
"LlamaForCausalLM"
|
||
|
|
],
|
||
|
|
"attention_bias": false,
|
||
|
|
"attention_dropout": 0.0,
|
||
|
|
"axis_dims": [
|
||
|
|
1,
|
||
|
|
-1,
|
||
|
|
1,
|
||
|
|
1
|
||
|
|
],
|
||
|
|
"axis_names": [
|
||
|
|
"dp",
|
||
|
|
"fsdp",
|
||
|
|
"tp",
|
||
|
|
"sp"
|
||
|
|
],
|
||
|
|
"b_ps": [
|
||
|
|
[
|
||
|
|
"dp",
|
||
|
|
"fsdp"
|
||
|
|
],
|
||
|
|
null,
|
||
|
|
null,
|
||
|
|
null
|
||
|
|
],
|
||
|
|
"backend": null,
|
||
|
|
"bits": null,
|
||
|
|
"bos_token_id": 1,
|
||
|
|
"c_max_position_embeddings": 2048,
|
||
|
|
"easy_method": "train",
|
||
|
|
"embd_pdrop": 0.0,
|
||
|
|
"eos_token_id": 2,
|
||
|
|
"fcm_max_ratio": 0.0,
|
||
|
|
"fcm_min_ratio": 0.0,
|
||
|
|
"flash_attn_key_chunk_size": 1024,
|
||
|
|
"flash_attn_query_chunk_size": 1024,
|
||
|
|
"freq_max_position_embeddings": 2048,
|
||
|
|
"hidden_act": "silu",
|
||
|
|
"hidden_size": 2048,
|
||
|
|
"initializer_range": 0.02,
|
||
|
|
"intermediate_size": 5632,
|
||
|
|
"k_ps": [
|
||
|
|
[
|
||
|
|
"dp",
|
||
|
|
"fsdp"
|
||
|
|
],
|
||
|
|
"sp",
|
||
|
|
"tp",
|
||
|
|
null
|
||
|
|
],
|
||
|
|
"max_position_embeddings": 2048,
|
||
|
|
"model_type": "llama",
|
||
|
|
"num_attention_heads": 32,
|
||
|
|
"num_hidden_layers": 22,
|
||
|
|
"num_key_value_heads": 4,
|
||
|
|
"number_rep_kv": 1,
|
||
|
|
"pretraining_tp": 1,
|
||
|
|
"q_ps": [
|
||
|
|
[
|
||
|
|
"dp",
|
||
|
|
"fsdp"
|
||
|
|
],
|
||
|
|
"sp",
|
||
|
|
"tp",
|
||
|
|
null
|
||
|
|
],
|
||
|
|
"resid_pdrop": 0.0,
|
||
|
|
"rms_norm_eps": 1e-05,
|
||
|
|
"rope_scaling": null,
|
||
|
|
"rope_theta": 10000.0,
|
||
|
|
"scan_layers": true,
|
||
|
|
"scan_mlp_chunk_size": 1024,
|
||
|
|
"tie_word_embeddings": false,
|
||
|
|
"torch_dtype": "float16",
|
||
|
|
"transformers_version": "4.36.2",
|
||
|
|
"use_cache": true,
|
||
|
|
"use_flash_attention": false,
|
||
|
|
"use_pjit_attention_force": false,
|
||
|
|
"use_sacn_mlp": false,
|
||
|
|
"use_shard_map": false,
|
||
|
|
"v_ps": [
|
||
|
|
[
|
||
|
|
"dp",
|
||
|
|
"fsdp"
|
||
|
|
],
|
||
|
|
"sp",
|
||
|
|
"tp",
|
||
|
|
null
|
||
|
|
],
|
||
|
|
"vocab_size": 32000
|
||
|
|
}
|