llama.cpp LLM_ARCH_DEEPSEEK and LLM_ARCH_DEEPSEEK2

  • 1. `LLM_ARCH_DEEPSEEK` and `LLM_ARCH_DEEPSEEK2` in `enum llm_arch` and `LLM_ARCH_NAMES`
  • 2. `LLM_ARCH_DEEPSEEK` and `LLM_ARCH_DEEPSEEK2` in `LLM_TENSOR_NAMES` and tensor creation
  • 3. `struct ggml_cgraph * build_deepseek()` and `struct ggml_cgraph * build_deepseek2()`
  • References

It is not appropriate to talk up Chinese large language models while putting down American large language models.

Water is the main chemical component of the human body, accounting for roughly 50% to 70% of body weight; the water content of large language models is not low either.

A large language model is merely a tool that serves people; it should help people think independently, not reinforce their biases.

llama.cpp
https://github.com/ggerganov/llama.cpp

1. LLM_ARCH_DEEPSEEK and LLM_ARCH_DEEPSEEK2 in enum llm_arch and LLM_ARCH_NAMES

/home/yongqiang/llm_work/llama_cpp_25_01_05/llama.cpp/src/llama-arch.h
/home/yongqiang/llm_work/llama_cpp_25_01_05/llama.cpp/src/llama-arch.cpp

  • LLM_ARCH_DEEPSEEK and LLM_ARCH_DEEPSEEK2 in enum llm_arch
//
// gguf constants (sync with gguf.py)
//

enum llm_arch {
    LLM_ARCH_LLAMA,
    LLM_ARCH_DECI,
    LLM_ARCH_FALCON,
    LLM_ARCH_BAICHUAN,
    LLM_ARCH_GROK,
    LLM_ARCH_GPT2,
    LLM_ARCH_GPTJ,
    LLM_ARCH_GPTNEOX,
    LLM_ARCH_MPT,
    LLM_ARCH_STARCODER,
    LLM_ARCH_REFACT,
    LLM_ARCH_BERT,
    LLM_ARCH_NOMIC_BERT,
    LLM_ARCH_JINA_BERT_V2,
    LLM_ARCH_BLOOM,
    LLM_ARCH_STABLELM,
    LLM_ARCH_QWEN,
    LLM_ARCH_QWEN2,
    LLM_ARCH_QWEN2MOE,
    LLM_ARCH_QWEN2VL,
    LLM_ARCH_PHI2,
    LLM_ARCH_PHI3,
    LLM_ARCH_PHIMOE,
    LLM_ARCH_PLAMO,
    LLM_ARCH_CODESHELL,
    LLM_ARCH_ORION,
    LLM_ARCH_INTERNLM2,
    LLM_ARCH_MINICPM,
    LLM_ARCH_MINICPM3,
    LLM_ARCH_GEMMA,
    LLM_ARCH_GEMMA2,
    LLM_ARCH_STARCODER2,
    LLM_ARCH_MAMBA,
    LLM_ARCH_XVERSE,
    LLM_ARCH_COMMAND_R,
    LLM_ARCH_COHERE2,
    LLM_ARCH_DBRX,
    LLM_ARCH_OLMO,
    LLM_ARCH_OLMO2,
    LLM_ARCH_OLMOE,
    LLM_ARCH_OPENELM,
    LLM_ARCH_ARCTIC,
    LLM_ARCH_DEEPSEEK,
    LLM_ARCH_DEEPSEEK2,
    LLM_ARCH_CHATGLM,
    LLM_ARCH_BITNET,
    LLM_ARCH_T5,
    LLM_ARCH_T5ENCODER,
    LLM_ARCH_JAIS,
    LLM_ARCH_NEMOTRON,
    LLM_ARCH_EXAONE,
    LLM_ARCH_RWKV6,
    LLM_ARCH_RWKV6QWEN2,
    LLM_ARCH_GRANITE,
    LLM_ARCH_GRANITE_MOE,
    LLM_ARCH_CHAMELEON,
    LLM_ARCH_WAVTOKENIZER_DEC,
    LLM_ARCH_UNKNOWN,
};
  • { LLM_ARCH_DEEPSEEK, "deepseek" } and { LLM_ARCH_DEEPSEEK2, "deepseek2" }
static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_LLAMA,            "llama"            },
    { LLM_ARCH_DECI,             "deci"             },
    { LLM_ARCH_FALCON,           "falcon"           },
    { LLM_ARCH_GROK,             "grok"             },
    { LLM_ARCH_GPT2,             "gpt2"             },
    { LLM_ARCH_GPTJ,             "gptj"             },
    { LLM_ARCH_GPTNEOX,          "gptneox"          },
    { LLM_ARCH_MPT,              "mpt"              },
    { LLM_ARCH_BAICHUAN,         "baichuan"         },
    { LLM_ARCH_STARCODER,        "starcoder"        },
    { LLM_ARCH_REFACT,           "refact"           },
    { LLM_ARCH_BERT,             "bert"             },
    { LLM_ARCH_NOMIC_BERT,       "nomic-bert"       },
    { LLM_ARCH_JINA_BERT_V2,     "jina-bert-v2"     },
    { LLM_ARCH_BLOOM,            "bloom"            },
    { LLM_ARCH_STABLELM,         "stablelm"         },
    { LLM_ARCH_QWEN,             "qwen"             },
    { LLM_ARCH_QWEN2,            "qwen2"            },
    { LLM_ARCH_QWEN2MOE,         "qwen2moe"         },
    { LLM_ARCH_QWEN2VL,          "qwen2vl"          },
    { LLM_ARCH_PHI2,             "phi2"             },
    { LLM_ARCH_PHI3,             "phi3"             },
    { LLM_ARCH_PHIMOE,           "phimoe"           },
    { LLM_ARCH_PLAMO,            "plamo"            },
    { LLM_ARCH_CODESHELL,        "codeshell"        },
    { LLM_ARCH_ORION,            "orion"            },
    { LLM_ARCH_INTERNLM2,        "internlm2"        },
    { LLM_ARCH_MINICPM,          "minicpm"          },
    { LLM_ARCH_MINICPM3,         "minicpm3"         },
    { LLM_ARCH_GEMMA,            "gemma"            },
    { LLM_ARCH_GEMMA2,           "gemma2"           },
    { LLM_ARCH_STARCODER2,       "starcoder2"       },
    { LLM_ARCH_MAMBA,            "mamba"            },
    { LLM_ARCH_XVERSE,           "xverse"           },
    { LLM_ARCH_COMMAND_R,        "command-r"        },
    { LLM_ARCH_COHERE2,          "cohere2"          },
    { LLM_ARCH_DBRX,             "dbrx"             },
    { LLM_ARCH_OLMO,             "olmo"             },
    { LLM_ARCH_OLMO2,            "olmo2"            },
    { LLM_ARCH_OLMOE,            "olmoe"            },
    { LLM_ARCH_OPENELM,          "openelm"          },
    { LLM_ARCH_ARCTIC,           "arctic"           },
    { LLM_ARCH_DEEPSEEK,         "deepseek"         },
    { LLM_ARCH_DEEPSEEK2,        "deepseek2"        },
    { LLM_ARCH_CHATGLM,          "chatglm"          },
    { LLM_ARCH_BITNET,           "bitnet"           },
    { LLM_ARCH_T5,               "t5"               },
    { LLM_ARCH_T5ENCODER,        "t5encoder"        },
    { LLM_ARCH_JAIS,             "jais"             },
    { LLM_ARCH_NEMOTRON,         "nemotron"         },
    { LLM_ARCH_EXAONE,           "exaone"           },
    { LLM_ARCH_RWKV6,            "rwkv6"            },
    { LLM_ARCH_RWKV6QWEN2,       "rwkv6qwen2"       },
    { LLM_ARCH_GRANITE,          "granite"          },
    { LLM_ARCH_GRANITE_MOE,      "granitemoe"       },
    { LLM_ARCH_CHAMELEON,        "chameleon"        },
    { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
    { LLM_ARCH_UNKNOWN,          "(unknown)"        },
};
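
Both tables follow one simple pattern: an `llm_arch` enum value keyed into a `std::map` of human-readable names, and when a GGUF file is loaded its `general.architecture` string is matched against these names to recover the enum value. Below is a minimal, self-contained sketch of that pattern, trimmed to the two DeepSeek entries; the definitions are stand-ins for illustration, not the actual llama.cpp headers.

#include <cstdio>
#include <map>
#include <string>

// Trimmed stand-ins for the llama.cpp definitions shown above.
enum llm_arch { LLM_ARCH_DEEPSEEK, LLM_ARCH_DEEPSEEK2, LLM_ARCH_UNKNOWN };

static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_DEEPSEEK,  "deepseek"  },
    { LLM_ARCH_DEEPSEEK2, "deepseek2" },
    { LLM_ARCH_UNKNOWN,   "(unknown)" },
};

// Reverse lookup: GGUF "general.architecture" string -> enum value.
static llm_arch arch_from_string(const std::string & name) {
    for (const auto & [arch, arch_name] : LLM_ARCH_NAMES) {
        if (name == arch_name) {
            return arch;
        }
    }
    return LLM_ARCH_UNKNOWN;
}

int main() {
    // "deepseek2" is what a DeepSeek-V2/V3 GGUF stores as its architecture.
    std::printf("deepseek2 -> %d\n", (int) arch_from_string("deepseek2"));
    return 0;
}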

2. LLM_ARCH_DEEPSEEK and LLM_ARCH_DEEPSEEK2 in LLM_TENSOR_NAMES and tensor creation

/home/yongqiang/llm_work/llama_cpp_25_01_05/llama.cpp/src/llama-arch.cpp

  • LLM_ARCH_DEEPSEEK and LLM_ARCH_DEEPSEEK2 in LLM_TENSOR_NAMES
static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
    // ... entries for LLM_ARCH_LLAMA through LLM_ARCH_ARCTIC omitted here ...
    {
        LLM_ARCH_DEEPSEEK,
        {
            { LLM_TENSOR_TOKEN_EMBD,         "token_embd" },
            { LLM_TENSOR_OUTPUT_NORM,        "output_norm" },
            { LLM_TENSOR_OUTPUT,             "output" },
            { LLM_TENSOR_ROPE_FREQS,         "rope_freqs" },
            { LLM_TENSOR_ATTN_NORM,          "blk.%d.attn_norm" },
            { LLM_TENSOR_ATTN_Q,             "blk.%d.attn_q" },
            { LLM_TENSOR_ATTN_K,             "blk.%d.attn_k" },
            { LLM_TENSOR_ATTN_V,             "blk.%d.attn_v" },
            { LLM_TENSOR_ATTN_OUT,           "blk.%d.attn_output" },
            { LLM_TENSOR_ATTN_ROT_EMBD,      "blk.%d.attn_rot_embd" },
            { LLM_TENSOR_FFN_GATE_INP,       "blk.%d.ffn_gate_inp" },
            { LLM_TENSOR_FFN_NORM,           "blk.%d.ffn_norm" },
            { LLM_TENSOR_FFN_GATE,           "blk.%d.ffn_gate" },
            { LLM_TENSOR_FFN_DOWN,           "blk.%d.ffn_down" },
            { LLM_TENSOR_FFN_UP,             "blk.%d.ffn_up" },
            { LLM_TENSOR_FFN_GATE_EXPS,      "blk.%d.ffn_gate_exps" },
            { LLM_TENSOR_FFN_DOWN_EXPS,      "blk.%d.ffn_down_exps" },
            { LLM_TENSOR_FFN_UP_EXPS,        "blk.%d.ffn_up_exps" },
            { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
            { LLM_TENSOR_FFN_GATE_SHEXP,     "blk.%d.ffn_gate_shexp" },
            { LLM_TENSOR_FFN_DOWN_SHEXP,     "blk.%d.ffn_down_shexp" },
            { LLM_TENSOR_FFN_UP_SHEXP,       "blk.%d.ffn_up_shexp" },
        },
    },
    {
        LLM_ARCH_DEEPSEEK2,
        {
            { LLM_TENSOR_TOKEN_EMBD,         "token_embd" },
            { LLM_TENSOR_OUTPUT_NORM,        "output_norm" },
            { LLM_TENSOR_OUTPUT,             "output" },
            { LLM_TENSOR_ATTN_NORM,          "blk.%d.attn_norm" },
            { LLM_TENSOR_ATTN_Q_A_NORM,      "blk.%d.attn_q_a_norm" },
            { LLM_TENSOR_ATTN_KV_A_NORM,     "blk.%d.attn_kv_a_norm" },
            { LLM_TENSOR_ATTN_Q,             "blk.%d.attn_q" },
            { LLM_TENSOR_ATTN_Q_A,           "blk.%d.attn_q_a" },
            { LLM_TENSOR_ATTN_Q_B,           "blk.%d.attn_q_b" },
            { LLM_TENSOR_ATTN_KV_A_MQA,      "blk.%d.attn_kv_a_mqa" },
            { LLM_TENSOR_ATTN_KV_B,          "blk.%d.attn_kv_b" },
            { LLM_TENSOR_ATTN_OUT,           "blk.%d.attn_output" },
            { LLM_TENSOR_FFN_NORM,           "blk.%d.ffn_norm" },
            { LLM_TENSOR_FFN_GATE,           "blk.%d.ffn_gate" },
            { LLM_TENSOR_FFN_UP,             "blk.%d.ffn_up" },
            { LLM_TENSOR_FFN_DOWN,           "blk.%d.ffn_down" },
            { LLM_TENSOR_FFN_GATE_INP,       "blk.%d.ffn_gate_inp" },
            { LLM_TENSOR_FFN_GATE_EXPS,      "blk.%d.ffn_gate_exps" },
            { LLM_TENSOR_FFN_DOWN_EXPS,      "blk.%d.ffn_down_exps" },
            { LLM_TENSOR_FFN_UP_EXPS,        "blk.%d.ffn_up_exps" },
            { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
            { LLM_TENSOR_FFN_GATE_SHEXP,     "blk.%d.ffn_gate_shexp" },
            { LLM_TENSOR_FFN_DOWN_SHEXP,     "blk.%d.ffn_down_shexp" },
            { LLM_TENSOR_FFN_UP_SHEXP,       "blk.%d.ffn_up_shexp" },
            { LLM_TENSOR_FFN_EXP_PROBS_B,    "blk.%d.exp_probs_b" },
        },
    },
    // ... entries for LLM_ARCH_CHATGLM through LLM_ARCH_UNKNOWN omitted here ...
};
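
Each `%d` placeholder above is expanded with the block (layer) index when the loader builds a concrete GGUF tensor name, and a `weight` or `bias` suffix is appended. The sketch below shows that expansion with a hypothetical helper; the real formatting lives in the `LLM_TN` machinery in llama-arch.cpp.

#include <cstdio>
#include <string>

// Hypothetical helper mirroring what the LLM_TN functor produces:
// format string + block index + suffix -> concrete tensor name.
static std::string tensor_name(const char * fmt, int bid, const char * suffix) {
    char buf[256];
    std::snprintf(buf, sizeof(buf), fmt, bid);
    return std::string(buf) + "." + suffix;
}

int main() {
    // LLM_ARCH_DEEPSEEK2, block 3, MLA compressed-KV projection weight:
    std::printf("%s\n", tensor_name("blk.%d.attn_kv_a_mqa", 3, "weight").c_str());
    // prints: blk.3.attn_kv_a_mqa.weight
    return 0;
}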
  • case LLM_ARCH_DEEPSEEK and case LLM_ARCH_DEEPSEEK2 in the tensor-creation switch in llama-model.cpp (excerpt below)

/home/yongqiang/llm_work/llama_cpp_25_01_05/llama.cpp/src/llama-model.cpp

...const auto tn = LLM_TN(arch);switch (arch) {case LLM_ARCH_LLAMA:case LLM_ARCH_REFACT:case LLM_ARCH_MINICPM:case LLM_ARCH_GRANITE:case LLM_ARCH_GRANITE_MOE:{tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);// outputoutput_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);// if output is NULL, init from the input tok embedif (output == NULL) {output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);}for (int i = 0; i < n_layer; ++i) {auto & layer = layers[i];layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);// optional bias tensorslayer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},     TENSOR_NOT_REQUIRED);layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd},     TENSOR_NOT_REQUIRED);layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));}else {layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? 
TENSOR_DUPLICATED : 0));}if (n_expert == 0) {layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);// optional MLP biaslayer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);} else {layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff, n_expert}, TENSOR_NOT_REQUIRED);layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff, n_embd, n_expert}, 0);layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff, n_expert}, 0);}}} break;case LLM_ARCH_DECI:{tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);// outputoutput_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);// if output is NULL, init from the input tok embedif (output == NULL) {output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);}for (int i = 0; i < n_layer; ++i) {auto & layer = layers[i];const int64_t n_embd_k_gqa  = hparams.n_embd_k_gqa(i);const int64_t n_embd_v_gqa  = hparams.n_embd_v_gqa(i);const int64_t n_embd_gqa    = hparams.n_embd_v_gqa(i);const int64_t n_ff          = hparams.n_ff(i);const int64_t n_head        = hparams.n_head(i);const int64_t n_head_kv     = hparams.n_head_kv(i);if (n_head_kv == 0 && n_head > 0) {// linear attention for DeciLMCausalModellayer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);}else if (n_head_kv > 0) {layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);}// optional bias tensorslayer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},     TENSOR_NOT_REQUIRED);layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd},     TENSOR_NOT_REQUIRED);layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? 
TENSOR_DUPLICATED : 0));}else {layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));}layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);// optional MLP biaslayer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);}} break;case LLM_ARCH_MINICPM3:{const int64_t n_embd_head_qk_rope = hparams.n_rot;const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;const int64_t q_lora_rank  = hparams.n_lora_q;const int64_t kv_lora_rank = hparams.n_lora_kv;tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);// outputoutput_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);// if output is NULL, init from the input tok embedif (output == NULL) {output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);}for (int i = 0; i < n_layer; ++i) {auto & layer = layers[i];layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0);layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);layer.wkv_b     = create_tensor(tn(LLM_TENSOR_ATTN_KV_B,     "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);layer.wo        = create_tensor(tn(LLM_TENSOR_ATTN_OUT,      "weight", i), {              n_head * (                      n_embd_head_v), n_embd}, 0);layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), { n_embd_head_qk_rope/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head_qk_rope/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? 
TENSOR_DUPLICATED : 0));}} break;case LLM_ARCH_GROK:{if (n_expert == 0) {throw std::runtime_error("Grok model cannot have zero experts");}tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);// outputoutput_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);// if output is NULL, init from the input tok embedif (output == NULL) {output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);}for (int i = 0; i < n_layer; ++i) {auto & layer = layers[i];layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);layer.attn_out_norm   = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff, n_embd, n_expert}, 0);layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff, n_expert}, 0);layer.layer_out_norm   = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);}} break;case LLM_ARCH_DBRX:{if (n_expert == 0) {throw std::runtime_error("DBRX model cannot have zero experts");}tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);// outputoutput_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);for (int i = 0; i < n_layer; ++i) {auto & layer = layers[i];layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff,   n_expert}, 0);layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff,   n_embd, n_expert}, 0);layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff,   n_expert}, 0);}} break;case LLM_ARCH_BAICHUAN:{tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);{output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);}for (int i = 0; i < n_layer; ++i) {auto & layer = layers[i];layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);layer.wk = 
case LLM_ARCH_BAICHUAN:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        {
            output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
            output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
        }

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];

            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
        }
    } break;
case LLM_ARCH_FALCON:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

        // output
        {
            output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
            output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);

            output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
            if (!output) {
                output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // needs to be on GPU
            }
        }

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];

            layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);

            layer.attn_norm_2   = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
            layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);

            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
            layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
        }
    } break;
case LLM_ARCH_STARCODER:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD,   "weight"), {n_embd, n_ctx_train}, 0);

        // output
        {
            output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
            output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
            output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
            if (!output) {
                // needs to be on GPU
                output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
            }
        }

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];

            layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);

            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
            layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, 0);

            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
            layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, 0);

            layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);

            layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, 0);

            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i),   {n_ff}, 0);
        }
    } break;
case LLM_ARCH_BERT:
case LLM_ARCH_NOMIC_BERT:
    {
        tok_embd  = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, 0);
        type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0);

        if (arch == LLM_ARCH_BERT) {
            pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);

            cls   = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
            cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"),   {n_embd},         TENSOR_NOT_REQUIRED);

            cls_out   = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, 1}, TENSOR_NOT_REQUIRED);
            cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"),   {1},         TENSOR_NOT_REQUIRED);
        }

        tok_norm   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
        tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"),   {n_embd}, 0);

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];

            if (arch == LLM_ARCH_BERT) {
                layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
                layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i),   {n_embd}, 0);

                layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
                layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i),   {n_embd_gqa}, 0);

                layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
                layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i),   {n_embd_gqa}, 0);
            } else {
                layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
            }

            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

            layer.attn_out_norm   = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
            layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i),   {n_embd}, 0);

            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);

            if (arch == LLM_ARCH_BERT) {
                layer.bo         = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
                layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff}, 0);
                layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
            } else {
                layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
            }

            layer.layer_out_norm   = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
            layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i),   {n_embd}, 0);
        }
    } break;
case LLM_ARCH_JINA_BERT_V2:
    {
        tok_embd  = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, 0); // word_embeddings
        type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0); // token_type_embeddings

        tok_norm   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0); // LayerNorm
        tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"),   {n_embd}, 0); // LayerNorm bias

        cls   = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, 1}, TENSOR_NOT_REQUIRED);
        cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"),   {1},         TENSOR_NOT_REQUIRED);

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i]; // JinaBertLayer

            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i),   {n_embd}, 0);

            layer.attn_q_norm   = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
            layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias",   i), {n_embd}, TENSOR_NOT_REQUIRED);

            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias",   i), {n_embd_gqa}, 0);

            layer.attn_k_norm   = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
            layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias",   i), {n_embd}, TENSOR_NOT_REQUIRED);

            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias",   i), {n_embd_gqa}, 0);

            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); // output_dense
            layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias",   i), {n_embd}, 0); // output_dense

            layer.attn_out_norm   = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0); // output_norm
            layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias",   i), {n_embd}, 0);

            layer.attn_norm_2   = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
            layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias",   i), {n_embd}, TENSOR_NOT_REQUIRED);

            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);

            layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias",   i), {n_embd}, 0);

            layer.layer_out_norm   = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
            layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias",   i), {n_embd}, 0);
        }
    } break;
case LLM_ARCH_BLOOM:
    {
        tok_embd   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,      "weight"), {n_embd, n_vocab}, 0);
        tok_norm   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
        tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"),   {n_embd}, 0);

        // output
        output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
        output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];

            layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias",   i), {n_embd}, 0);

            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
            layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias",   i), {n_embd + 2*n_embd_gqa}, 0);

            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
            layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias",   i), {n_embd}, 0);

            layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias",   i), {n_embd}, 0);

            layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias",   i), {n_embd}, 0);

            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias",   i), {n_ff}, 0);
        }
    } break;
case LLM_ARCH_MPT:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD,   "weight"), {n_embd, n_ctx_train}, TENSOR_NOT_REQUIRED);

        // output
        output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, TENSOR_NOT_REQUIRED);

        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
        if (!output) {
            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // needs to be on GPU
        }

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];

            layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);

            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
            layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);

            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
            layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);

            layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);

            layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);

            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i),   {n_ff}, TENSOR_NOT_REQUIRED);

            layer.attn_q_norm   = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
            layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias",   i), {n_embd}, TENSOR_NOT_REQUIRED);

            layer.attn_k_norm   = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
            layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias",   i), {n_embd}, TENSOR_NOT_REQUIRED);

            // AWQ ScaleActivation layer
            layer.ffn_act = create_tensor(tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, TENSOR_NOT_REQUIRED);
        }
    } break;
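// Note: two loader flags recur throughout this switch. TENSOR_NOT_REQUIRED marks
// tensors that may be absent from a given GGUF file (create_tensor then returns NULL
// and the graph-build code must check for it, as the MPT biases above do), while
// TENSOR_DUPLICATED registers an already-loaded tensor under a second role, e.g.
// reusing token_embd.weight as the output head when the checkpoint ships without a
// separate lm_head.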
case LLM_ARCH_STABLELM:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

        // output
        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
        output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];

            layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);

            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

            // optional bias tensors, present in Stable LM 2 1.6B
            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd},     TENSOR_NOT_REQUIRED);
            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);

            // optional q and k layernorms, present in StableLM 2 12B
            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head},    TENSOR_NOT_REQUIRED);
            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, TENSOR_NOT_REQUIRED);

            // optional FFN norm, not present in StableLM 2 12B which uses parallel residual
            layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
            layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);

            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
        }
    } break;
case LLM_ARCH_QWEN:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];

            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd*3}, 0);
            layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd*3}, 0);
            layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff/2}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff/2, n_embd}, 0);
            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff/2}, 0);
        }
    } break;
"weight", i), {  n_ff, n_embd}, 0);layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);}} break;case LLM_ARCH_QWEN2MOE:{tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);// outputoutput_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);for (int i = 0; i < n_layer; ++i) {auto & layer = layers[i];layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);// optional bias tensorslayer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd}, 0);layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, 0);layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, 0);layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);if (n_expert == 0) {throw std::runtime_error("n_expert must be > 0 for QWEN2MOE");}if (n_expert_used == 0) {throw std::runtime_error("n_expert_used must be > 0 for QWEN2MOE");}// MoE branchconst int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);// Shared expert branchconst int64_t n_ff_shexp = hparams.n_ff_shexp ? 
case LLM_ARCH_QWEN2MOE:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];

            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

            // optional bias tensors
            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);

            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);

            if (n_expert == 0) {
                throw std::runtime_error("n_expert must be > 0 for QWEN2MOE");
            }
            if (n_expert_used == 0) {
                throw std::runtime_error("n_expert_used must be > 0 for QWEN2MOE");
            }

            // MoE branch
            const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;

            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff_exp, n_expert}, 0);

            // Shared expert branch
            const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;

            layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd}, 0);
            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp}, 0);
            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_shexp}, 0);
        }
    } break;
case LLM_ARCH_PHI2:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

        // output
        output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
        output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
        output_b      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "bias"),   {n_vocab}, 0);

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];

            layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);

            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
            layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);

            if (layer.wqkv == nullptr) {
                layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
                layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i),   {n_embd}, 0);

                layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
                layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i),   {n_embd_gqa}, 0);

                layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
                layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i),   {n_embd_gqa}, 0);
            }

            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
            layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, 0);

            layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, 0);

            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i),   {n_ff}, 0);
        }
    } break;
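// Note: the per-layer rope_long/rope_short tensors loaded by Phi-3 (below), PhiMoE
// and MiniCPM3 hold the long/short RoPE frequency-scaling factors, {n_embd_head/2}
// values, i.e. one per rotated dimension pair. The flag expression
// TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0) lets every layer reuse the
// single copy of the factors stored in the GGUF file.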
case LLM_ARCH_PHI3:
    {
        const int64_t n_embd_head = n_embd / n_head;

        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];

            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
            layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, 2*n_ff}, 0);

            layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), {n_embd_head/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
            layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_embd_head/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
        }
    } break;
case LLM_ARCH_PHIMOE:
    {
        const int64_t n_embd_head = n_embd / n_head;

        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

        // output
        output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
        output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
        output_b      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "bias"),   {n_vocab}, 0);

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];

            layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias",   i), {n_embd}, 0);

            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
            if (layer.wqkv == nullptr) {
                layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
                layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias",   i), {n_embd}, 0);

                layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
                layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias",   i), {n_embd_gqa}, 0);

                layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
                layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias",   i), {n_embd_gqa}, 0);
            }
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
            layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias",   i), {n_embd}, 0);

            layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias",   i), {n_embd}, 0);

            layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert},         0);
            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff,   n_expert}, 0);
            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff,   n_embd, n_expert}, 0);
            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff,   n_expert}, 0);

            layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), {n_embd_head/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
            layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_embd_head/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
        }
    } break;
case LLM_ARCH_PLAMO:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];

            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
        }
    } break;
case LLM_ARCH_GPT2:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD,   "weight"), {n_embd, n_ctx_train}, 0);

        // output
        output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
        output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];

            layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);

            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
            layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, 0);

            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
            layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, 0);

            layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);

            layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, 0);

            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i),   {n_ff}, 0);
        }
    } break;
case LLM_ARCH_CODESHELL:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

        // output
        output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
        output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];

            layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);

            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
            layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, 0);

            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
            layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, 0);

            layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);

            layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, 0);

            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i),   {n_ff}, 0);
        }
    } break;
case LLM_ARCH_ORION:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

        output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
        output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];

            layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);

            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

            layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);

            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
        }
    } break;
case LLM_ARCH_INTERNLM2:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];

            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            // layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
        }
    } break;
case LLM_ARCH_GEMMA:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output      = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];

            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);

            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
        }
    } break;
case LLM_ARCH_GEMMA2:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output      = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];

            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);

            layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);

            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);

            layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
        }
    } break;
case LLM_ARCH_STARCODER2:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

        // output
        output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);

        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
        // if output is NULL, init from the input tok embed
        if (output == NULL) {
            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
        }

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];

            layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);

            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

            // optional bias tensors
            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd}, 0);
            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, 0);
            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, 0);
            layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);

            layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);

            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);

            // optional bias tensors
            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
            layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff}, 0);
        }
    } break;
case LLM_ARCH_MAMBA:
    {
        const int64_t d_conv  = hparams.ssm_d_conv;
        const int64_t d_inner = hparams.ssm_d_inner;
        const int64_t d_state = hparams.ssm_d_state;
        const int64_t dt_rank = hparams.ssm_dt_rank;

        // only an expansion factor of 2 is supported for now
        if (2 * n_embd != d_inner) {
            throw std::runtime_error("only an expansion factor of 2 is supported for now");
        }

        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);

        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
        // if output is NULL, init from the input tok embed, duplicated to allow offloading
        if (output == NULL) {
            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
        }

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];

            // norm
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

            layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2*d_inner}, 0);

            layer.ssm_conv1d   = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner}, 0);
            layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i),   {d_inner}, 0);

            layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {d_inner, dt_rank + 2*d_state}, 0);

            layer.ssm_dt   = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_rank, d_inner}, 0);
            layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i),   {d_inner}, 0);

            // no "weight" suffix for these
            layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner}, 0);
            layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {d_inner}, 0);

            // out_proj
            layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
        }
    } break;
case LLM_ARCH_XVERSE:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];

            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
        }
    } break;
"weight", i), {  n_ff, n_embd}, 0);layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);}} break;case LLM_ARCH_COMMAND_R:{tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);// outputoutput_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);// init output from the input tok embedoutput = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);for (int i = 0; i < n_layer; ++i) {auto & layer = layers[i];layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);if (n_layer >= 64){layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, 0);layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, 0);}layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);}} break;case LLM_ARCH_COHERE2:{tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);// outputoutput_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);// init output from the input tok embedoutput      = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab },TENSOR_DUPLICATED);for (int i = 0; i < n_layer; ++i) {auto & layer = layers[i];layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd }, 0);layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);}}break;case LLM_ARCH_OLMO:  // adapted from LLM_ARCH_LLAMA with norm params removed{tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);// outputoutput = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);// if output is NULL, init from the input tok embedif (output == NULL) {output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);}for (int i = 0; i < n_layer; ++i) {auto & layer = layers[i];layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);layer.ffn_down = 
case LLM_ARCH_OLMO2:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];

            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

            layer.attn_q_norm    = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
            layer.attn_k_norm    = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, 0);
            layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);

            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);

            layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
        }
    } break;
case LLM_ARCH_OLMOE:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];

            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, 0);

            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);

            if (n_expert == 0) {
                throw std::runtime_error("n_expert must be > 0");
            }
            if (n_expert_used == 0) {
                throw std::runtime_error("n_expert_used must be > 0");
            }

            // MoE branch
            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff, n_expert}, 0);
        }
    } break;
case LLM_ARCH_OPENELM:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        // init output from the input tok embed
        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);

        for (int i = 0; i < n_layer; ++i) {
            const int64_t n_head     = hparams.n_head(i);
            const int64_t n_head_qkv = 2*hparams.n_head_kv(i) + n_head;
            const int64_t n_ff       = hparams.n_ff(i);

            auto & layer = layers[i];

            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_head_qkv*n_embd_head_k}, 0);

            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);

            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head*n_embd_head_k, n_embd}, 0);

            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
        }
    } break;
case LLM_ARCH_GPTNEOX:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

        // output
        output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
        output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];

            layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);

            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
            layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, 0);

            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
            layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, 0);

            layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);

            layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, 0);

            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i),   {n_ff}, 0);
        }
    } break;
case LLM_ARCH_ARCTIC:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);

        // if output is NULL, init from the input tok embed
        if (output == NULL) {
            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
        }

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];

            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_embd}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_embd, n_embd}, 0);
            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_embd}, 0);

            layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
            layer.ffn_norm_exps = create_tensor(tn(LLM_TENSOR_FFN_NORM_EXPS, "weight", i), {n_embd}, 0);
            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff, n_expert}, 0);
        }
    } break;
case LLM_ARCH_DEEPSEEK:
    {
        const int64_t n_ff_exp        = hparams.n_ff_exp;
        const int64_t n_expert_shared = hparams.n_expert_shared;

        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];

            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

            if (i < (int) hparams.n_layer_dense_lead) {
                layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
                layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
            } else {
                layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);

                if (n_expert == 0) {
                    throw std::runtime_error("n_expert must be > 0");
                }
                if (n_expert_used == 0) {
                    throw std::runtime_error("n_expert_used must be > 0");
                }

                // MoE branch
                layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
                layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
                layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff_exp, n_expert}, 0);

                // Shared expert branch
                layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
                layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_exp * n_expert_shared, n_embd}, 0);
                layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
            }
        }
    } break;
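// Note: a hedged worked example for the DEEPSEEK2 shapes below, assuming the
// published DeepSeek-V2-Lite config (n_layer = 27 -- exactly what the is_lite check
// keys on -- n_embd = 2048, n_head = 16, kv_lora_rank = 512,
// n_embd_head_qk_rope = 64, n_embd_head_qk_nope = 128, n_embd_head_v = 128):
//   wkv_a_mqa : {2048, 512 + 64}        joint KV compression plus the shared rope key
//   wkv_b     : {512, 16 * (128 + 128)} decompression back to per-head K and V
//   wo        : {16 * 128, 2048}
// The lite model defines no q_lora_rank, so wq is loaded directly instead of the
// wq_a/wq_b pair.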
case LLM_ARCH_DEEPSEEK2:
    {
        const bool is_lite = (hparams.n_layer == 27);

        const int64_t n_embd_head_qk_rope = hparams.n_rot;
        const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;

        const int64_t q_lora_rank  = hparams.n_lora_q;
        const int64_t kv_lora_rank = hparams.n_lora_kv;

        const int64_t n_ff_exp        = hparams.n_ff_exp;
        const int64_t n_expert_shared = hparams.n_expert_shared;

        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];

            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

            if (!is_lite) {
                layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
            }

            layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);

            if (!is_lite) {
                layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
                layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0);
            } else {
                layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
            }

            layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
            layer.wkv_b     = create_tensor(tn(LLM_TENSOR_ATTN_KV_B,     "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
            layer.wo        = create_tensor(tn(LLM_TENSOR_ATTN_OUT,      "weight", i), {n_head * (n_embd_head_v), n_embd}, 0);

            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

            if (i < (int) hparams.n_layer_dense_lead) {
                layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
                layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
            } else {
                layer.ffn_gate_inp    = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,    "weight", i), {n_embd, n_expert}, 0);
                layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias",   i), {n_expert}, TENSOR_NOT_REQUIRED);

                if (n_expert == 0) {
                    throw std::runtime_error("n_expert must be > 0");
                }
                if (n_expert_used == 0) {
                    throw std::runtime_error("n_expert_used must be > 0");
                }

                // MoE branch
                layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
                layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
                layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff_exp, n_expert}, 0);

                // Shared expert branch
                layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
                layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_exp * n_expert_shared, n_embd}, 0);
                layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
            }
        }
    } break;
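// Note: in the DEEPSEEK2 loader above, ffn_exp_probs_b is marked TENSOR_NOT_REQUIRED;
// as far as I can tell it only exists for checkpoints whose router adds a per-expert
// bias to the gating scores (DeepSeek-V3-style routing), so older DeepSeek-V2 GGUF
// files simply yield NULL here and the graph build can skip it.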
{1}, TENSOR_NOT_REQUIRED);layer.wv       = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);layer.wv_scale = create_tensor(tn(LLM_TENSOR_ATTN_V,   "scale",  i), {1}, TENSOR_NOT_REQUIRED);layer.wo       = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);layer.wo_scale = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "scale",  i), {1}, TENSOR_NOT_REQUIRED);layer.ffn_norm     = create_tensor(tn(LLM_TENSOR_FFN_NORM,     "weight", i), {n_embd}, 0);layer.ffn_sub_norm = create_tensor(tn(LLM_TENSOR_FFN_SUB_NORM, "weight", i), {n_ff}, 0);layer.ffn_gate       = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);layer.ffn_gate_scale = create_tensor(tn(LLM_TENSOR_FFN_GATE, "scale",  i), {1}, TENSOR_NOT_REQUIRED);layer.ffn_down       = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);layer.ffn_down_scale = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "scale",  i), {1}, TENSOR_NOT_REQUIRED);layer.ffn_up         = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);layer.ffn_up_scale   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "scale",  i), {1}, TENSOR_NOT_REQUIRED);}} break;case LLM_ARCH_T5:{const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);// outputoutput_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);output_norm     = create_tensor(tn(LLM_TENSOR_DEC_OUTPUT_NORM, "weight"), {n_embd}, 0);output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);// if output is NULL, init from the input tok embedif (output == NULL) {output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);}for (int i = 0; i < n_layer; ++i) {auto & layer = layers[i];layer.attn_norm_enc  = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM,  "weight", i), {n_embd}, 0);layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q,   "weight", i), {n_embd, n_embd_k_gqa}, 0);layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0);layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd,   n_ff}, TENSOR_NOT_REQUIRED);layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);layer.ffn_up_enc   = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);layer.attn_norm  = create_tensor(tn(LLM_TENSOR_DEC_ATTN_NORM,  "weight", i), {n_embd}, 0);layer.attn_rel_b = create_tensor(tn(LLM_TENSOR_DEC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);layer.wq = create_tensor(tn(LLM_TENSOR_DEC_ATTN_Q,   "weight", i), {n_embd, n_embd_k_gqa}, 0);layer.wk = create_tensor(tn(LLM_TENSOR_DEC_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);layer.wv = create_tensor(tn(LLM_TENSOR_DEC_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);layer.wo = create_tensor(tn(LLM_TENSOR_DEC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);layer.attn_norm_cross  = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_NORM,  "weight", i), {n_embd}, 0);// this 
tensor seems to be unused in HF transformers implementationlayer.attn_rel_b_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);layer.wq_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_Q,   "weight", i), {n_embd, n_embd_k_gqa}, 0);layer.wk_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);layer.wv_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);layer.wo_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);layer.ffn_norm = create_tensor(tn(LLM_TENSOR_DEC_FFN_NORM, "weight", i), {n_embd}, 0);layer.ffn_gate = create_tensor(tn(LLM_TENSOR_DEC_FFN_GATE, "weight", i), {n_embd,   n_ff}, TENSOR_NOT_REQUIRED);layer.ffn_down = create_tensor(tn(LLM_TENSOR_DEC_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);layer.ffn_up   = create_tensor(tn(LLM_TENSOR_DEC_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);}} break;case LLM_ARCH_T5ENCODER:{const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);// outputoutput_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);// if output is NULL, init from the input tok embedif (output == NULL) {output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);}for (int i = 0; i < n_layer; ++i) {auto & layer = layers[i];layer.attn_norm_enc  = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM,  "weight", i), {n_embd}, 0);layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q,   "weight", i), {n_embd, n_embd_k_gqa}, 0);layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0);layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd,   n_ff}, TENSOR_NOT_REQUIRED);layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);layer.ffn_up_enc   = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);}} break;case LLM_ARCH_JAIS:{tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);// outputoutput_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);for (int i = 0; i < n_layer; ++i) {auto & layer = layers[i];layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM,   "weight", i), {n_embd}, 0);layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM,   "bias", i),   {n_embd}, 0);layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, 0);layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, 
"bias", i),   {n_embd}, 0);layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, 0);layer.ffn_gate   = create_tensor(tn(LLM_TENSOR_FFN_GATE,   "weight", i), {n_embd, n_ff}, 0);layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE,   "bias", i),   {n_ff}, 0);layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff}, 0);}} break;case LLM_ARCH_CHATGLM:{tok_embd   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,      "weight"), {n_embd, n_vocab}, 0);// outputoutput_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);for (int i = 0; i < n_layer; ++i) {auto & layer = layers[i];layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, 0);layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff * 2}, 0);layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);}} break;case LLM_ARCH_NEMOTRON:{tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);// outputoutput_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);output        = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);for (int i = 0; i < n_layer; ++i) {auto & layer = layers[i];layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);// optional bias tensorslayer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},     TENSOR_NOT_REQUIRED);layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd},     TENSOR_NOT_REQUIRED);layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);// optional MLP biaslayer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);layer.ffn_up_b   = 
create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);}} break;case LLM_ARCH_EXAONE:{tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);// outputoutput_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);for (int i = 0; i < n_layer; ++i) {auto & layer = layers[i];layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM,   "weight", i), {n_embd}, 0);layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));layer.ffn_gate   = create_tensor(tn(LLM_TENSOR_FFN_GATE,   "weight", i), {n_embd,   n_ff}, 0);layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN,   "weight", i), {  n_ff, n_embd}, 0);layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,     "weight", i), {n_embd,   n_ff}, 0);}} break;case LLM_ARCH_RWKV6:{tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);// Block 0, LN0tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);// outputoutput_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);const int time_mix_extra_dim = hparams.time_mix_extra_dim;const int time_decay_extra_dim = hparams.time_decay_extra_dim;const int head_size = hparams.wkv_head_size;const int attn_hidden_size = n_embd;const int ffn_size = hparams.n_ff_arr[0];for (int i = 0; i < n_layer; ++i) {auto & layer = layers[i];layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);layer.attn_norm_2   = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0);layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i),   {n_embd}, 0);layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5}, 0);layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);layer.time_mix_lerp_w = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_W, "weight", i), {n_embd, 1, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);layer.time_mix_lerp_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);layer.time_mix_lerp_v = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_V, "weight", i), {n_embd, 1, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);layer.time_mix_lerp_r = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);layer.time_mix_lerp_g = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_G, "weight", 
i), {n_embd, 1, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, llama_model_loader::TENSOR_NOT_REQUIRED);GGML_ASSERT(!(layer.time_mix_lerp_fused == NULL && layer.time_mix_lerp_w == NULL));layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, 0);layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0);layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0);layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0);layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, 0);layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, 0);layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);layer.channel_mix_lerp_k = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, 0);layer.channel_mix_lerp_r = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, 0);layer.channel_mix_key = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_KEY, "weight", i), {n_embd, ffn_size}, 0);layer.channel_mix_value = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_VALUE, "weight", i), {ffn_size, n_embd}, 0);layer.channel_mix_receptance = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "weight", i), {n_embd, n_embd}, 0);}} break;case LLM_ARCH_RWKV6QWEN2:{tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);const int time_mix_extra_dim = hparams.time_mix_extra_dim;const int time_decay_extra_dim = hparams.time_decay_extra_dim;const int head_size = hparams.wkv_head_size;const int attn_hidden_size = n_embd;const int n_head_kv = hparams.n_head_kv();int attn_key_value_size;if (n_head_kv == 0 || attn_hidden_size / head_size == n_head_kv) {attn_key_value_size = attn_hidden_size;} else {attn_key_value_size = n_head_kv * head_size;}for (int i = 0; i < n_layer; ++i) {auto & layer = layers[i];layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5}, 0);layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, 0);layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / 
head_size}, llama_model_loader::TENSOR_NOT_REQUIRED);layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0);layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0);layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {n_embd, attn_key_value_size}, 0);layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {n_embd, attn_key_value_size}, 0);layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0);// optional bias tensorslayer.time_mix_key_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "bias", i), {attn_key_value_size}, llama_model_loader::TENSOR_NOT_REQUIRED);layer.time_mix_value_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "bias", i), {attn_key_value_size}, llama_model_loader::TENSOR_NOT_REQUIRED);layer.time_mix_receptance_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "bias", i), {attn_hidden_size}, llama_model_loader::TENSOR_NOT_REQUIRED);layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);}} break;case LLM_ARCH_CHAMELEON:{tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);// outputoutput_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);// if output is NULL, init from the input tok embedif (output == NULL) {output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);}for (int i = 0; i < n_layer; ++i) {auto & layer = layers[i];layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, 0);layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, 0);layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i),  {n_embd_head_k, n_head}, TENSOR_NOT_REQUIRED);layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i),  {n_embd_head_k, n_head_kv}, TENSOR_NOT_REQUIRED);layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,  
 n_ff}, 0);}} break;case LLM_ARCH_WAVTOKENIZER_DEC:{tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hparams.n_embd_features, n_vocab}, 0);conv1d   = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, hparams.n_embd_features, hparams.posnet.n_embd}, 0);conv1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"),   {1, hparams.posnet.n_embd}, 0);// posnet{const int64_t n_embd = hparams.posnet.n_embd;for (uint32_t i = 0; i < hparams.posnet.n_layer; ++i) {auto & layer = layers[i].posnet;// posnet:////  - resnet//  - resnet//  - attn//  - resnet//  - resnet//  - norm//switch (i) {case 0:case 1:case 3:case 4:{layer.norm1   = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", i), {1, n_embd}, 0);layer.norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias",   i), {1, n_embd}, 0);layer.conv1   = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", i), {3, n_embd, n_embd}, 0);layer.conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias",   i), {1, n_embd}, 0);layer.norm2   = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", i), {1, n_embd}, 0);layer.norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias",   i), {1, n_embd}, 0);layer.conv2   = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", i), {3, n_embd, n_embd}, 0);layer.conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias",   i), {1, n_embd}, 0);} break;case 2:{layer.attn_norm   = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias",   i), {1, n_embd}, 0);layer.attn_q      = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q,    "weight", i), {1, n_embd, n_embd}, 0);layer.attn_q_b    = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q,    "bias",   i), {1, n_embd}, 0);layer.attn_k      = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K,    "weight", i), {1, n_embd, n_embd}, 0);layer.attn_k_b    = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K,    "bias",   i), {1, n_embd}, 0);layer.attn_v      = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V,    "weight", i), {1, n_embd, n_embd}, 0);layer.attn_v_b    = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V,    "bias",   i), {1, n_embd}, 0);layer.attn_o      = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT,  "weight", i), {1, n_embd, n_embd}, 0);layer.attn_o_b    = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT,  "bias",   i), {1, n_embd}, 0);} break;case 5:{layer.norm   = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);layer.norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias",   i), {1, n_embd}, 0);} break;default: GGML_ABORT("unknown posnet layer");};}}GGML_ASSERT(hparams.posnet.n_embd == hparams.convnext.n_embd);tok_norm   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {hparams.posnet.n_embd}, 0);tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"),   {hparams.posnet.n_embd}, 0);// convnext{const int64_t n_embd = hparams.convnext.n_embd;for (uint32_t i = 0; i < hparams.convnext.n_layer; ++i) {auto & layer = layers[i].convnext;layer.dw     = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW,    "weight", i), {7, 1, n_embd}, 0);layer.dw_b   = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW,    "bias",   i), {1, n_embd}, 0);layer.norm   = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM,  "weight", i), {n_embd}, 0);layer.norm_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM,  "bias",   i), {n_embd}, 0);layer.pw1    = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1,   "weight", i), {n_embd, n_ff}, 0);layer.pw1_b  = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1,   "bias",   i), {n_ff}, 
0);layer.pw2    = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2,   "weight", i), {n_ff, n_embd}, 0);layer.pw2_b  = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2,   "bias",   i), {n_embd}, 0);layer.gamma  = create_tensor(tn(LLM_TENSOR_CONVNEXT_GAMMA, "weight", i), {n_embd}, 0);}// outputoutput_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);}output   = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, n_embd}, 0);output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"),   {n_embd}, 0);} break;default:throw std::runtime_error("unknown architecture");}
...
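
The MLA tensor shapes above are easier to read with concrete numbers. The sketch below recomputes the attention weight shapes that this loader would request for a hypothetical lite-style configuration (n_embd = 2048, n_head = 16, n_embd_head_k = 192, n_embd_head_v = 128, n_rot = 64, kv_lora_rank = 512). These values are illustrative assumptions loosely modeled on DeepSeek-V2-Lite, not numbers read from a GGUF file.

    // Standalone sketch: the DeepSeek2 (MLA) attention tensor shapes that the
    // loader above would create. All hparams values are illustrative
    // assumptions (roughly DeepSeek-V2-Lite-like), not read from a model file.
    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t n_embd        = 2048;
        const int64_t n_head        = 16;
        const int64_t n_embd_head_k = 192;  // qk_nope (128) + qk_rope (64)
        const int64_t n_embd_head_v = 128;
        const int64_t n_rot         = 64;   // hparams.n_rot
        const int64_t kv_lora_rank  = 512;  // hparams.n_lora_kv

        const int64_t n_embd_head_qk_rope = n_rot;
        const int64_t n_embd_head_qk_nope = n_embd_head_k - n_rot;

        // same shape expressions as the create_tensor() calls above
        std::printf("wkv_a_mqa : {%lld, %lld}\n", (long long) n_embd,
                (long long) (kv_lora_rank + n_embd_head_qk_rope));              // {2048, 576}
        std::printf("wkv_b     : {%lld, %lld}\n", (long long) kv_lora_rank,
                (long long) (n_head * (n_embd_head_qk_nope + n_embd_head_v)));  // {512, 4096}
        std::printf("wo        : {%lld, %lld}\n", (long long) (n_head * n_embd_head_v),
                (long long) n_embd);                                            // {2048, 2048}
        return 0;
    }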

3. struct ggml_cgraph * build_deepseek() and struct ggml_cgraph * build_deepseek2()

/home/yongqiang/llm_work/llama_cpp_25_01_05/llama.cpp/src/llama.cpp

  • struct ggml_cgraph * build_deepseek()
    struct ggml_cgraph * build_deepseek() {
        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);

        // mutable variable, needed during the last layer of the computation to skip unused tokens
        int32_t n_tokens = this->n_tokens;

        const int64_t n_embd_head = hparams.n_embd_head_v;
        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
        GGML_ASSERT(n_embd_head == hparams.n_rot);

        struct ggml_tensor * cur;
        struct ggml_tensor * inpL;

        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);

        // inp_pos - contains the positions
        struct ggml_tensor * inp_pos = build_inp_pos();

        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();

        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;

        for (int il = 0; il < n_layer; ++il) {
            struct ggml_tensor * inpSA = inpL;

            // norm
            cur = llm_build_norm(ctx0, inpL, hparams,
                    model.layers[il].attn_norm, NULL,
                    LLM_NORM_RMS, cb, il);
            cb(cur, "attn_norm", il);

            // self-attention
            {
                // rope freq factors for llama3; may return nullptr for llama2 and other models
                struct ggml_tensor * rope_factors = build_rope_factors(il);

                // compute Q and K and RoPE them
                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
                cb(Qcur, "Qcur", il);
                if (model.layers[il].bq) {
                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
                    cb(Qcur, "Qcur", il);
                }

                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
                cb(Kcur, "Kcur", il);
                if (model.layers[il].bk) {
                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
                    cb(Kcur, "Kcur", il);
                }

                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
                cb(Vcur, "Vcur", il);
                if (model.layers[il].bv) {
                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
                    cb(Vcur, "Vcur", il);
                }

                Qcur = ggml_rope_ext(
                        ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                        ext_factor, attn_factor, beta_fast, beta_slow);
                cb(Qcur, "Qcur", il);

                Kcur = ggml_rope_ext(
                        ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                        ext_factor, attn_factor, beta_fast, beta_slow);
                cb(Kcur, "Kcur", il);

                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
                        model.layers[il].wo, model.layers[il].bo,
                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
            }

            if (il == n_layer - 1) {
                // skip computing output for unused tokens
                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
                n_tokens = n_outputs;
                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
            }

            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
            cb(ffn_inp, "ffn_inp", il);

            cur = llm_build_norm(ctx0, ffn_inp, hparams,
                    model.layers[il].ffn_norm, NULL,
                    LLM_NORM_RMS, cb, il);
            cb(cur, "ffn_norm", il);

            if ((uint32_t) il < hparams.n_layer_dense_lead) {
                cur = llm_build_ffn(ctx0, lctx, cur,
                        model.layers[il].ffn_up,   NULL, NULL,
                        model.layers[il].ffn_gate, NULL, NULL,
                        model.layers[il].ffn_down, NULL, NULL,
                        NULL,
                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
                cb(cur, "ffn_out", il);
            } else {
                // MoE branch
                ggml_tensor * moe_out =
                    llm_build_moe_ffn(ctx0, lctx, cur,
                            model.layers[il].ffn_gate_inp,
                            model.layers[il].ffn_up_exps,
                            model.layers[il].ffn_gate_exps,
                            model.layers[il].ffn_down_exps,
                            nullptr,
                            n_expert, n_expert_used,
                            LLM_FFN_SILU, false,
                            false, hparams.expert_weights_scale,
                            LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
                            cb, il);
                cb(moe_out, "ffn_moe_out", il);

                // FFN shared expert
                {
                    ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, cur,
                            model.layers[il].ffn_up_shexp,   NULL, NULL,
                            model.layers[il].ffn_gate_shexp, NULL, NULL,
                            model.layers[il].ffn_down_shexp, NULL, NULL,
                            NULL,
                            LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
                    cb(ffn_shexp, "ffn_shexp", il);

                    cur = ggml_add(ctx0, moe_out, ffn_shexp);
                    cb(cur, "ffn_out", il);
                }
            }

            cur = ggml_add(ctx0, cur, ffn_inp);
            cur = lctx.cvec.apply_to(ctx0, cur, il);
            cb(cur, "l_out", il);

            // input for next layer
            inpL = cur;
        }

        cur = inpL;

        cur = llm_build_norm(ctx0, cur, hparams,
                model.output_norm, NULL,
                LLM_NORM_RMS, cb, -1);
        cb(cur, "result_norm", -1);

        // lm_head
        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
        cb(cur, "result_output", -1);

        ggml_build_forward_expand(gf, cur);

        return gf;
    }
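
In build_deepseek(), the first hparams.n_layer_dense_lead layers run a plain dense FFN, and every later layer runs the routed MoE branch plus an always-on shared expert whose output is added on top. The dense path and each expert use the same SiLU-gated FFN: LLM_FFN_SILU with LLM_FFN_PAR corresponds to down(silu(gate(x)) * up(x)). A minimal scalar sketch of that semantics, with toy 1-D weights standing in for the ggml matrix multiplications (not the real implementation):

    // Scalar sketch of the SiLU-gated, parallel FFN used by the dense layers
    // and by every expert above: down( silu(gate(x)) * up(x) ).
    // Toy 1-D "weights" stand in for the real ggml matrix ops.
    #include <cmath>
    #include <cstdio>

    static float silu(float x) { return x / (1.0f + std::exp(-x)); }

    // LLM_FFN_SILU + LLM_FFN_PAR semantics for a single scalar "channel"
    static float ffn_silu_par(float x, float w_gate, float w_up, float w_down) {
        return w_down * (silu(w_gate * x) * (w_up * x));
    }

    int main() {
        const float x = 0.5f;

        // stand-in for the routed-expert output (llm_build_moe_ffn) plus the
        // always-on shared expert, summed exactly as in the MoE branch above
        const float moe_out   = ffn_silu_par(x, 1.2f, 0.8f, 0.5f);
        const float ffn_shexp = ffn_silu_par(x, 0.7f, 1.1f, 0.3f);

        std::printf("ffn_out = %f\n", moe_out + ffn_shexp); // cur = moe_out + ffn_shexp
        return 0;
    }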
  • struct ggml_cgraph * build_deepseek2()
    struct ggml_cgraph * build_deepseek2() {
        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);

        // mutable variable, needed during the last layer of the computation to skip unused tokens
        int32_t n_tokens = this->n_tokens;

        bool is_lite = (hparams.n_layer == 27);

        // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
        // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
        const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
        const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(hparams.n_embd_head_k));
        const float attn_factor_scaled = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));

        const uint32_t n_embd_head_qk_rope = hparams.n_rot;
        const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
        const uint32_t kv_lora_rank = hparams.n_lora_kv;

        struct ggml_tensor * cur;
        struct ggml_tensor * inpL;

        // {n_embd, n_tokens}
        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);

        // inp_pos - contains the positions
        struct ggml_tensor * inp_pos = build_inp_pos();

        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();

        for (int il = 0; il < n_layer; ++il) {
            struct ggml_tensor * inpSA = inpL;

            // norm
            cur = llm_build_norm(ctx0, inpL, hparams,
                    model.layers[il].attn_norm, NULL,
                    LLM_NORM_RMS, cb, il);
            cb(cur, "attn_norm", il);

            // self_attention
            {
                struct ggml_tensor * q = NULL;
                if (!is_lite) {
                    // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
                    q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
                    cb(q, "q", il);

                    q = llm_build_norm(ctx0, q, hparams,
                            model.layers[il].attn_q_a_norm, NULL,
                            LLM_NORM_RMS, cb, il);
                    cb(q, "q", il);

                    // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
                    q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
                    cb(q, "q", il);
                } else {
                    q = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
                    cb(q, "q", il);
                }

                // split into {n_head * n_embd_head_qk_nope, n_tokens}
                struct ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
                        ggml_row_size(q->type, hparams.n_embd_head_k),
                        ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
                        0);
                cb(q_nope, "q_nope", il);

                // and {n_head * n_embd_head_qk_rope, n_tokens}
                struct ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
                        ggml_row_size(q->type, hparams.n_embd_head_k),
                        ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
                        ggml_row_size(q->type, n_embd_head_qk_nope));
                cb(q_pe, "q_pe", il);

                // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
                struct ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
                cb(kv_pe_compresseed, "kv_pe_compresseed", il);

                // split into {kv_lora_rank, n_tokens}
                struct ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens,
                        kv_pe_compresseed->nb[1],
                        0);
                cb(kv_compressed, "kv_compressed", il);

                // and {n_embd_head_qk_rope, n_tokens}
                struct ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens,
                        kv_pe_compresseed->nb[1],
                        kv_pe_compresseed->nb[1],
                        ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
                cb(k_pe, "k_pe", il);

                kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm
                kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams,
                        model.layers[il].attn_kv_a_norm, NULL,
                        LLM_NORM_RMS, cb, il);
                cb(kv_compressed, "kv_compressed", il);

                // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
                struct ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
                cb(kv, "kv", il);

                // split into {n_head * n_embd_head_qk_nope, n_tokens}
                struct ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
                        ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
                        ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
                        0);
                cb(k_nope, "k_nope", il);

                // and {n_head * n_embd_head_v, n_tokens}
                struct ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
                        ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
                        ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
                        ggml_row_size(kv->type, (n_embd_head_qk_nope)));
                cb(v_states, "v_states", il);

                v_states = ggml_cont(ctx0, v_states);
                cb(v_states, "v_states", il);

                v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
                        ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
                        0);
                cb(v_states, "v_states", il);

                q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
                q_pe = ggml_rope_ext(
                        ctx0, q_pe, inp_pos, nullptr,
                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                        ext_factor, attn_factor_scaled, beta_fast, beta_slow);
                cb(q_pe, "q_pe", il);

                // shared RoPE key
                k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
                k_pe = ggml_rope_ext(
                        ctx0, k_pe, inp_pos, nullptr,
                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                        ext_factor, attn_factor_scaled, beta_fast, beta_slow);
                cb(k_pe, "k_pe", il);

                struct ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0);
                cb(q_states, "q_states", il);

                struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
                cb(k_states, "k_states", il);

                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
                        model.layers[il].wo, NULL,
                        k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
            }

            if (il == n_layer - 1) {
                // skip computing output for unused tokens
                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
                n_tokens = n_outputs;
                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
            }

            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
            cb(ffn_inp, "ffn_inp", il);

            cur = llm_build_norm(ctx0, ffn_inp, hparams,
                    model.layers[il].ffn_norm, NULL,
                    LLM_NORM_RMS, cb, il);
            cb(cur, "ffn_norm", il);

            if ((uint32_t) il < hparams.n_layer_dense_lead) {
                cur = llm_build_ffn(ctx0, lctx, cur,
                        model.layers[il].ffn_up,   NULL, NULL,
                        model.layers[il].ffn_gate, NULL, NULL,
                        model.layers[il].ffn_down, NULL, NULL,
                        NULL,
                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
                cb(cur, "ffn_out", il);
            } else {
                // MoE branch
                ggml_tensor * moe_out =
                    llm_build_moe_ffn(ctx0, lctx, cur,
                            model.layers[il].ffn_gate_inp,
                            model.layers[il].ffn_up_exps,
                            model.layers[il].ffn_gate_exps,
                            model.layers[il].ffn_down_exps,
                            model.layers[il].ffn_exp_probs_b,
                            n_expert, n_expert_used,
                            LLM_FFN_SILU, hparams.expert_weights_norm,
                            true, hparams.expert_weights_scale,
                            (enum llama_expert_gating_func_type) hparams.expert_gating_func,
                            cb, il);
                cb(moe_out, "ffn_moe_out", il);

                // FFN shared expert
                {
                    ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, cur,
                            model.layers[il].ffn_up_shexp,   NULL, NULL,
                            model.layers[il].ffn_gate_shexp, NULL, NULL,
                            model.layers[il].ffn_down_shexp, NULL, NULL,
                            NULL,
                            LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
                    cb(ffn_shexp, "ffn_shexp", il);

                    cur = ggml_add(ctx0, moe_out, ffn_shexp);
                    cb(cur, "ffn_out", il);
                }
            }

            cur = ggml_add(ctx0, cur, ffn_inp);
            cur = lctx.cvec.apply_to(ctx0, cur, il);
            cb(cur, "l_out", il);

            // input for next layer
            inpL = cur;
        }

        cur = inpL;

        cur = llm_build_norm(ctx0, cur, hparams,
                model.output_norm, NULL,
                LLM_NORM_RMS, cb, -1);
        cb(cur, "result_norm", -1);

        // lm_head
        cur = ggml_mul_mat(ctx0, model.output, cur);
        cb(cur, "result_output", -1);

        ggml_build_forward_expand(gf, cur);

        return gf;
    }
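
The pre-scaling at the top of build_deepseek2() is the one non-obvious numeric step: YaRN's magnitude correction mscale is folded into kq_scale once, and a compensating attn_factor_scaled is handed to ggml_rope_ext. The sketch below evaluates the same three formulas under assumed inputs (attn_factor = 1.0, rope_yarn_log_mul = 0.1, freq_scale = 1/40, n_embd_head_k = 192); the inputs are illustrative, not values taken from a model file.

    // Sketch of the YaRN pre-scaling in build_deepseek2(); see
    // https://github.com/ggerganov/llama.cpp/discussions/7416.
    // All inputs are illustrative assumptions, not model values.
    #include <cmath>
    #include <cstdio>

    int main() {
        const float attn_factor       = 1.0f;
        const float rope_yarn_log_mul = 0.1f;         // hparams.rope_yarn_log_mul (assumed)
        const float freq_scale        = 1.0f / 40.0f; // inverse RoPE scaling factor (assumed)
        const float n_embd_head_k     = 192.0f;

        // same expressions as in build_deepseek2()
        const float mscale   = attn_factor * (1.0f + rope_yarn_log_mul * std::log(1.0f / freq_scale));
        const float kq_scale = 1.0f * mscale * mscale / std::sqrt(n_embd_head_k);
        const float attn_factor_scaled = 1.0f / (1.0f + 0.1f * std::log(1.0f / freq_scale));

        std::printf("mscale             = %f\n", mscale);
        std::printf("kq_scale           = %f\n", kq_scale);
        std::printf("attn_factor_scaled = %f\n", attn_factor_scaled);
        return 0;
    }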
  • case LLM_ARCH_DEEPSEEK: and case LLM_ARCH_DEEPSEEK2:
    switch (model.arch) {
        case LLM_ARCH_LLAMA:
        case LLM_ARCH_MINICPM:
        case LLM_ARCH_GRANITE:
        case LLM_ARCH_GRANITE_MOE:
            { result = llm.build_llama(); } break;
        case LLM_ARCH_DECI:
            { result = llm.build_deci(); } break;
        case LLM_ARCH_BAICHUAN:
            { result = llm.build_baichuan(); } break;
        case LLM_ARCH_FALCON:
            { result = llm.build_falcon(); } break;
        case LLM_ARCH_GROK:
            { result = llm.build_grok(); } break;
        case LLM_ARCH_STARCODER:
            { result = llm.build_starcoder(); } break;
        case LLM_ARCH_REFACT:
            { result = llm.build_refact(); } break;
        case LLM_ARCH_BERT:
        case LLM_ARCH_JINA_BERT_V2:
        case LLM_ARCH_NOMIC_BERT:
            { result = llm.build_bert(); } break;
        case LLM_ARCH_BLOOM:
            { result = llm.build_bloom(); } break;
        case LLM_ARCH_MPT:
            { result = llm.build_mpt(); } break;
        case LLM_ARCH_STABLELM:
            { result = llm.build_stablelm(); } break;
        case LLM_ARCH_QWEN:
            { result = llm.build_qwen(); } break;
        case LLM_ARCH_QWEN2:
            { result = llm.build_qwen2(); } break;
        case LLM_ARCH_QWEN2VL:
            { lctx.n_pos_per_token = 4; result = llm.build_qwen2vl(); } break;
        case LLM_ARCH_QWEN2MOE:
            { result = llm.build_qwen2moe(); } break;
        case LLM_ARCH_PHI2:
            { result = llm.build_phi2(); } break;
        case LLM_ARCH_PHI3:
        case LLM_ARCH_PHIMOE:
            { result = llm.build_phi3(); } break;
        case LLM_ARCH_PLAMO:
            { result = llm.build_plamo(); } break;
        case LLM_ARCH_GPT2:
            { result = llm.build_gpt2(); } break;
        case LLM_ARCH_CODESHELL:
            { result = llm.build_codeshell(); } break;
        case LLM_ARCH_ORION:
            { result = llm.build_orion(); } break;
        case LLM_ARCH_INTERNLM2:
            { result = llm.build_internlm2(); } break;
        case LLM_ARCH_MINICPM3:
            { result = llm.build_minicpm3(); } break;
        case LLM_ARCH_GEMMA:
            { result = llm.build_gemma(); } break;
        case LLM_ARCH_GEMMA2:
            { result = llm.build_gemma2(); } break;
        case LLM_ARCH_STARCODER2:
            { result = llm.build_starcoder2(); } break;
        case LLM_ARCH_MAMBA:
            { result = llm.build_mamba(); } break;
        case LLM_ARCH_XVERSE:
            { result = llm.build_xverse(); } break;
        case LLM_ARCH_COMMAND_R:
            { result = llm.build_command_r(); } break;
        case LLM_ARCH_COHERE2:
            { result = llm.build_cohere2(); } break;
        case LLM_ARCH_DBRX:
            { result = llm.build_dbrx(); } break;
        case LLM_ARCH_OLMO:
            { result = llm.build_olmo(); } break;
        case LLM_ARCH_OLMO2:
            { result = llm.build_olmo2(); } break;
        case LLM_ARCH_OLMOE:
            { result = llm.build_olmoe(); } break;
        case LLM_ARCH_OPENELM:
            { result = llm.build_openelm(); } break;
        case LLM_ARCH_GPTNEOX:
            { result = llm.build_gptneox(); } break;
        case LLM_ARCH_ARCTIC:
            { result = llm.build_arctic(); } break;
        case LLM_ARCH_DEEPSEEK:
            { result = llm.build_deepseek(); } break;
        case LLM_ARCH_DEEPSEEK2:
            { result = llm.build_deepseek2(); } break;
        case LLM_ARCH_CHATGLM:
            { result = llm.build_chatglm(); } break;
        case LLM_ARCH_BITNET:
            { result = llm.build_bitnet(); } break;
        case LLM_ARCH_T5:
            { if (lctx.is_encoding) { result = llm.build_t5_enc(); } else { result = llm.build_t5_dec(); } } break;
        case LLM_ARCH_T5ENCODER:
            { result = llm.build_t5_enc(); } break;
        case LLM_ARCH_JAIS:
            { result = llm.build_jais(); } break;
        case LLM_ARCH_NEMOTRON:
            { result = llm.build_nemotron(); } break;
        case LLM_ARCH_EXAONE:
            { result = llm.build_exaone(); } break;
        case LLM_ARCH_RWKV6:
            { result = llm.build_rwkv6(); } break;
        case LLM_ARCH_RWKV6QWEN2:
            { result = llm.build_rwkv6qwen2(); } break;
        case LLM_ARCH_CHAMELEON:
            { result = llm.build_chameleon(); } break;
        case LLM_ARCH_WAVTOKENIZER_DEC:
            { result = llm.build_wavtokenizer_dec(); } break;
        default:
            GGML_ABORT("fatal error");
    }
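
Note how the two llm_build_moe_ffn calls differ: build_deepseek() routes with plain softmax gating and no expert bias (nullptr, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, no weight normalization), while build_deepseek2() passes ffn_exp_probs_b, hparams.expert_weights_norm and hparams.expert_gating_func. A toy sketch of the DeepSeek2-style selection, in which the bias influences which experts are chosen but not their weights, and the chosen weights are renormalized; this is an illustrative reading of those parameters, not the ggml implementation:

    // Toy sketch: top-k expert selection with an additive selection bias
    // (cf. ffn_exp_probs_b) and renormalized weights (cf. expert_weights_norm).
    // Sizes and values are made up for illustration.
    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <numeric>
    #include <vector>

    int main() {
        const std::vector<float> logits = {0.2f, 1.5f, -0.3f, 0.9f}; // n_expert = 4 (toy)
        const std::vector<float> bias   = {0.0f, -0.5f, 0.4f, 0.1f}; // selection bias (toy)
        const int n_expert_used = 2;

        // gating probabilities over the experts (softmax here for simplicity)
        std::vector<float> probs(logits.size());
        float sum = 0.0f;
        for (size_t i = 0; i < logits.size(); ++i) { probs[i] = std::exp(logits[i]); sum += probs[i]; }
        for (float & p : probs) { p /= sum; }

        // select top-k by probs + bias, but weight each chosen expert by its
        // unbiased prob, then renormalize the selected weights
        std::vector<int> idx(probs.size());
        std::iota(idx.begin(), idx.end(), 0);
        std::partial_sort(idx.begin(), idx.begin() + n_expert_used, idx.end(),
                [&](int a, int b) { return probs[a] + bias[a] > probs[b] + bias[b]; });

        float wsum = 0.0f;
        for (int k = 0; k < n_expert_used; ++k) { wsum += probs[idx[k]]; }
        for (int k = 0; k < n_expert_used; ++k) {
            std::printf("expert %d weight %f\n", idx[k], probs[idx[k]] / wsum);
        }
        return 0;
    }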

References

[1] Yongqiang Cheng, https://yongqiang.blog.csdn.net/
[2] huggingface/gguf, https://github.com/huggingface/huggingface.js/tree/main/packages/gguf
