llama : Support llama 4 text-only (#12791)
* llama4 conversion
* initial support, no chat template
* clean up a bit
* fix tokenizer conversion
* correct hparams
* try this
* fix shexp
* ffn_inp_normed
* chat template
* clean up model conversion
* add_bos
* add scale_before_ffn
* fix order
* weight_before_ffn
* llm_graph_input_attn_temp
* add chunk attn mask
* build_inp_attn_scale()
* add comment about ggml_repeat
* clarify comments
* fix build
parent 82974011f3
commit 1466621e73
17 changed files with 532 additions and 22 deletions
@@ -59,6 +59,22 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
     }
 }
 
+void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
+    if (ubatch->pos && attn_scale) {
+        const int64_t n_tokens = ubatch->n_tokens;
+
+        std::vector<float> attn_scale_data(n_tokens, 0.0f);
+        for (int i = 0; i < n_tokens; ++i) {
+            const float pos = ubatch->pos[i];
+            attn_scale_data[i] = std::log(
+                std::floor((pos + 1.0f) / n_attn_temp_floor_scale) + 1.0
+            ) * f_attn_temp_scale + 1.0;
+        }
+
+        ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*n_pos_per_token*ggml_element_size(attn_scale));
+    }
+}
+
 void llm_graph_input_pos_bucket::set_input(const llama_ubatch * ubatch) {
     if (pos_bucket) {
         const int64_t n_tokens = ubatch->n_tokens;
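For reference, the formula above gives each token a position-dependent attention temperature: the scale grows logarithmically with the 1-based position, stepped every n_attn_temp_floor_scale tokens. A minimal standalone sketch of the same expression, assuming illustrative hparam values (8192 and 0.1 are placeholders here, not read from a model):

    #include <cmath>
    #include <cstdio>

    int main() {
        // assumed stand-ins for hparams.n_attn_temp_floor_scale and hparams.f_attn_temp_scale
        const float n_attn_temp_floor_scale = 8192.0f;
        const float f_attn_temp_scale       = 0.1f;

        const int positions[] = {0, 8191, 8192, 65536, 262144};
        for (int pos : positions) {
            // same expression as in llm_graph_input_attn_temp::set_input() above
            const float scale = std::log(std::floor((pos + 1.0f) / n_attn_temp_floor_scale) + 1.0f) * f_attn_temp_scale + 1.0f;
            std::printf("pos=%6d  attn_scale=%.4f\n", pos, scale);
        }
        return 0;
    }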
@@ -458,9 +474,17 @@ void llm_graph_input_attn_kv_unified::set_input(const llama_ubatch * ubatch) {
                     }
 
                     // may need to cut off old tokens for sliding window
+                    // TODO @ngxson : we are currently re-using the swa logic to store the chunked mask, we should rename SWA to something more generic like "aux mask"
                     if (data_swa) {
-                        if (pos - kv_self->cells[i].pos >= (int32_t)hparams.n_swa) {
-                            f = -INFINITY;
-                        }
+                        if (hparams.n_attn_chunk) {
+                            llama_pos pos_chunk_start = (pos / hparams.n_attn_chunk) * hparams.n_attn_chunk;
+                            if (kv_self->cells[i].pos < pos_chunk_start || pos < pos_chunk_start) {
+                                f = -INFINITY;
+                            }
+                        } else {
+                            if (pos - kv_self->cells[i].pos >= (int32_t)hparams.n_swa) {
+                                f = -INFINITY;
+                            }
+                        }
                         data_swa[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f;
                     }
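The chunked branch masks every KV cell whose position falls before the start of the query token's chunk, so attention never crosses a chunk boundary; causality is enforced separately in the surrounding loop. A toy sketch of the resulting mask pattern, assuming a made-up chunk size of 4 (Llama 4 reads the real value from hparams.n_attn_chunk):

    #include <cstdio>

    int main() {
        const int n_attn_chunk = 4; // illustrative only, not the real hparam value
        const int n_tokens     = 8;

        for (int pos = 0; pos < n_tokens; ++pos) {
            // same chunk-start computation as in the diff above
            const int pos_chunk_start = (pos / n_attn_chunk) * n_attn_chunk;
            for (int kv = 0; kv < n_tokens; ++kv) {
                // masked if the KV cell is outside the query's chunk or in the future
                const bool masked = kv < pos_chunk_start || kv > pos;
                std::printf("%c", masked ? '.' : 'x');
            }
            std::printf("   pos=%d, chunk starts at %d\n", pos, pos_chunk_start);
        }
        return 0;
    }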
@@ -812,8 +836,9 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
          float w_scale,
          llama_expert_gating_func_type gating_op,
          int il) const {
-    int64_t n_embd = cur->ne[0];
-    int64_t n_tokens = cur->ne[1];
+    const int64_t n_embd = cur->ne[0];
+    const int64_t n_tokens = cur->ne[1];
+    const bool weight_before_ffn = arch == LLM_ARCH_LLAMA4; // for llama4, we apply the sigmoid-ed weights before the FFN
 
     ggml_tensor * logits = build_lora_mm(gate_inp, cur); // [n_expert, n_tokens]
     cb(logits, "ffn_moe_logits", il);
@@ -841,6 +866,12 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         cb(selection_probs, "ffn_moe_probs_biased", il);
     }
 
+    // llama4 doesn't have exp_probs_b, and sigmoid is only used after top_k
+    // see: https://github.com/meta-llama/llama-models/blob/699a02993512fb36936b1b0741e13c06790bcf98/models/llama4/moe.py#L183-L198
+    if (arch == LLM_ARCH_LLAMA4) {
+        selection_probs = logits;
+    }
+
     // select experts
     ggml_tensor * selected_experts = ggml_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
     cb(selected_experts->src[0], "ffn_moe_argsort", il);
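As the comment notes, Llama 4 routes on the raw router logits and applies the sigmoid gate only to the experts that survive top-k, whereas other MoE archs may normalize the logits before selection. A plain C++ sketch of that ordering, standing in for the ggml_top_k and gating-function graph ops; the logit values are made up:

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <numeric>
    #include <vector>

    int main() {
        const std::vector<float> logits = {0.3f, -1.2f, 2.1f, 0.7f}; // one token, 4 experts
        const int n_expert_used = 2;

        // top-k expert indices by raw logit (no softmax/sigmoid before selection)
        std::vector<int> idx(logits.size());
        std::iota(idx.begin(), idx.end(), 0);
        std::partial_sort(idx.begin(), idx.begin() + n_expert_used, idx.end(),
                          [&](int a, int b) { return logits[a] > logits[b]; });

        // sigmoid gate is computed only for the selected experts, after top-k
        for (int k = 0; k < n_expert_used; ++k) {
            const int e = idx[k];
            const float w = 1.0f / (1.0f + std::exp(-logits[e]));
            std::printf("expert %d selected, gate weight %.4f\n", e, w);
        }
        return 0;
    }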
@@ -867,6 +898,15 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
     }
 
     cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens);
 
+    if (weight_before_ffn) {
+        // TODO: this is a workaround as we don't yet have a repeat op that takes custom dim (ggml_repeat_4d)
+        ggml_tensor * repeated = ggml_new_tensor_3d(ctx0, cur->type, n_embd, n_expert_used, n_tokens);
+        repeated = ggml_repeat(ctx0, cur, repeated); // [n_embd, n_expert_used, n_tokens]
+        cur = ggml_mul(ctx0, repeated, weights);
+        cb(cur, "ffn_moe_weighted", il);
+    }
+
     ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
     cb(up, "ffn_moe_up", il);
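On the repeat workaround: ggml_repeat broadcasts its first argument into the shape of its second, so a freshly allocated [n_embd, n_expert_used, n_tokens] tensor only serves as the target shape, and each token's activation is duplicated once per selected expert. A hedged helper-style sketch of that pattern (the function name and comments are mine, not part of the PR):

    #include "ggml.h"

    // cur is expected as [n_embd, 1, n_tokens]; returns [n_embd, n_expert_used, n_tokens]
    ggml_tensor * repeat_per_expert(ggml_context * ctx, ggml_tensor * cur, int64_t n_expert_used) {
        // allocate a tensor with the desired shape and let ggml_repeat broadcast into it
        ggml_tensor * shape = ggml_new_tensor_3d(ctx, cur->type, cur->ne[0], n_expert_used, cur->ne[2]);
        return ggml_repeat(ctx, cur, shape);
    }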
@@ -894,7 +934,10 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
     ggml_tensor * experts = build_lora_mm_id(down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens]
     cb(experts, "ffn_moe_down", il);
 
-    experts = ggml_mul(ctx0, experts, weights);
+    if (!weight_before_ffn) {
+        experts = ggml_mul(ctx0, experts, weights);
+        cb(cur, "ffn_moe_weighted", il);
+    }
 
     // aggregate experts
     ggml_tensor * moe_out = nullptr;
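Together with the weight_before_ffn flag introduced earlier, this hunk makes the gate placement arch-dependent: Llama 4 scales the expert input before the FFN, every other arch keeps scaling the expert output after it. Because the expert FFN is nonlinear, the two orders are not equivalent, as this toy 1-dimensional sketch shows (the weights and the mini SwiGLU "expert" are made up):

    #include <cmath>
    #include <cstdio>

    // toy 1-dimensional expert FFN: down * (silu(w_gate * x) * (w_up * x))
    static float expert_ffn(float x) {
        const float gate = 0.8f * x;
        const float up   = 1.5f * x;
        const float silu = gate / (1.0f + std::exp(-gate));
        return 0.5f * (silu * up);
    }

    int main() {
        const float x = 1.0f;   // token activation
        const float w = 0.25f;  // routing weight (already sigmoid-ed)

        const float before = expert_ffn(w * x); // Llama 4: weight applied before the FFN
        const float after  = w * expert_ffn(x); // default: weight applied to the FFN output

        std::printf("weight before FFN: %.5f\n", before);
        std::printf("weight after  FFN: %.5f\n", after);
        return 0;
    }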
@@ -914,6 +957,8 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         moe_out = ggml_cont(ctx0, moe_out);
     }
 
+    cb(moe_out, "ffn_moe_out", il);
+
     return moe_out;
 }
 
@@ -981,6 +1026,19 @@ ggml_tensor * llm_graph_context::build_inp_pos() const {
     return cur;
 }
 
+ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
+    auto inp = std::make_unique<llm_graph_input_attn_temp>(n_pos_per_token(), hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);
+
+    auto & cur = inp->attn_scale;
+
+    cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens*n_pos_per_token());
+    ggml_set_input(cur);
+
+    res->add_input(std::move(inp));
+
+    return cur;
+}
+
 ggml_tensor * llm_graph_context::build_inp_out_ids() const {
     auto inp = std::make_unique<llm_graph_input_out_ids>(hparams, cparams, n_outputs);
 
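build_inp_attn_scale() allocates the per-token scale as a [1, 1, n_tokens] F32 input so it can presumably be broadcast-multiplied onto the query tensor later in the graph; ggml_mul broadcasts its second operand across any dimension where it has size 1. A hedged sketch of that consumption pattern (the helper name and tensor shapes are illustrative, not lifted from the PR):

    #include "ggml.h"

    // q is assumed to be [n_embd_head, n_head, n_tokens]; attn_scale is [1, 1, n_tokens]
    ggml_tensor * scale_queries(ggml_context * ctx, ggml_tensor * q, ggml_tensor * attn_scale) {
        // ggml_mul broadcasts attn_scale across the head-size and head-count dims
        return ggml_mul(ctx, q, attn_scale);
    }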