1. forward
1.1 Review
encode(tokenizer, prompt, 1, 0, prompt_tokens, &num_prompt_tokens);
At the end of the encode function:
(gdb) p *n_tokens
$3 = 2
(gdb) p *tokens@3
$6 = {1, 22172, 417}
Since *n_tokens is 2, only the first two entries of the @3 print are live tokens: 1 is the BOS token, and 22172 encodes "hello" (as decode will confirm below); the trailing 417 is simply whatever sits in memory past the end of the valid tokens.
After the encode call:
(gdb) print num_prompt_tokens
$11 = 2
(gdb) print *prompt_tokens@3
$13 = {1, 22172, 417}
The main job of the encode function is to convert input natural-language text (UTF-8 encoded) into a sequence of tokens over a specific vocabulary (maintained by the Tokenizer struct). The function first makes sure the tokenizer's vocabulary is sorted and initialized, then walks the input text character by character, identifying subword units and merging adjacent token pairs that match merge rules predefined in the vocabulary.
The steps are as follows:
First, it checks whether the tokenizer's vocabulary has been sorted and initialized, and does so if needed.
It allocates a temporary buffer for candidate merged token pairs and initializes the count of generated tokens to 0.
If the corresponding parameter is set, it adds the beginning-of-sequence (BOS) token to the tokens array.
For each character of the input text, it reads and decodes bytes according to the UTF-8 encoding, appending them to the temporary buffer until a complete Unicode codepoint has been assembled.
For each complete codepoint, it looks up its index in the sorted vocabulary. If found, the corresponding token is appended to the tokens array; otherwise, following the byte-fallback strategy, each raw byte is appended as its own token.
It then repeatedly tries to merge adjacent tokens in the tokens array, using the vocabulary score of the merged string to decide: whenever a merge with a higher score exists, the tokens array is updated (see the sketch below).
Finally, if the corresponding parameter is set, it adds the end-of-sequence (EOS) token to the tokens array.
The whole process adapts raw text to the tokenized format the model expects, so that the Transformer can consume and generate sequences over the given vocabulary.
In summary, this code calls encode to tokenize the given prompt, stores the resulting token sequence in prompt_tokens, and returns the number of tokens through num_prompt_tokens; a BOS token is prepended and no EOS token is appended.
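A minimal sketch of that greedy merge loop, modeled on llama2.c's encode (assumptions: str_lookup is the binary search over t->sorted_vocab, and str_buffer is a preallocated scratch buffer large enough to hold two concatenated token strings):

// greedily merge the best-scoring adjacent pair until no merge is possible
while (1) {
    float best_score = -1e10;
    int best_id = -1, best_idx = -1;
    for (int i = 0; i < (*n_tokens) - 1; i++) {
        // concatenate the two token strings and look the result up in the vocab
        sprintf(str_buffer, "%s%s", t->vocab[tokens[i]], t->vocab[tokens[i+1]]);
        int id = str_lookup(str_buffer, t->sorted_vocab, t->vocab_size);
        if (id != -1 && t->vocab_scores[id] > best_score) {
            best_score = t->vocab_scores[id];
            best_id = id;
            best_idx = i;
        }
    }
    if (best_idx == -1) break; // no mergeable pair left in the vocab, we're done
    // replace the pair with the merged token and shift the tail left
    tokens[best_idx] = best_id;
    for (int i = best_idx + 1; i < (*n_tokens) - 1; i++) {
        tokens[i] = tokens[i + 1];
    }
    (*n_tokens)--;
}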
1.2 forward
The call:
float* logits = forward(transformer, token, pos);
It implements one step of the Transformer's forward pass for the given token at position pos and returns the logits for the next token: the classifier weights map the final hidden state onto the vocabulary, producing an unnormalized score for every word (softmax later turns these into a probability distribution).
float* forward(Transformer* transformer, int token, int pos) {

    // First, grab the config, weights and run state from the Transformer struct.
    // a few convenience variables
    Config* p = &transformer->config;
    TransformerWeights* w = &transformer->weights;
    RunState* s = &transformer->state;
    float *x = s->x;
    int dim = p->dim;
    int kv_dim = (p->dim * p->n_kv_heads) / p->n_heads;
    int kv_mul = p->n_heads / p->n_kv_heads; // integer multiplier of the kv sharing in multiquery
    int hidden_dim = p->hidden_dim;
    int head_size = dim / p->n_heads;

    // copy the token embedding into x
    // content_row points at the start of this token's embedding vector: dim is
    // the size of each embedding, so token * dim offsets into the embedding table.
    // memcpy then copies that embedding into x, the part of the run state that
    // holds the activations for the current position; this embedding is the
    // initial input to the network.
    float* content_row = w->token_embedding_table + token * dim;
    memcpy(x, content_row, dim*sizeof(*x));

    // forward all the layers
    for(unsigned long long l = 0; l < p->n_layers; l++) {

        // attention rmsnorm
        rmsnorm(s->xb, x, w->rms_att_weight + l*dim, dim);

        // key and value point to the kv cache
        int loff = l * p->seq_len * kv_dim; // kv cache layer offset for convenience
        s->k = s->key_cache + loff + pos * kv_dim;
        s->v = s->value_cache + loff + pos * kv_dim;

        // qkv matmuls for this position
        matmul(s->q, s->xb, w->wq + l*dim*dim, dim, dim);
        matmul(s->k, s->xb, w->wk + l*dim*kv_dim, dim, kv_dim);
        matmul(s->v, s->xb, w->wv + l*dim*kv_dim, dim, kv_dim);

        // RoPE relative positional encoding: complex-valued rotate q and k in each head
        for (int i = 0; i < dim; i+=2) {
            int head_dim = i % head_size;
            float freq = 1.0f / powf(10000.0f, head_dim / (float)head_size);
            float val = pos * freq;
            float fcr = cosf(val);
            float fci = sinf(val);
            int rotn = i < kv_dim ? 2 : 1; // how many vectors? 2 = q & k, 1 = q only
            for (int v = 0; v < rotn; v++) {
                float* vec = v == 0 ? s->q : s->k; // the vector to rotate (query or key)
                float v0 = vec[i];
                float v1 = vec[i+1];
                vec[i]   = v0 * fcr - v1 * fci;
                vec[i+1] = v0 * fci + v1 * fcr;
            }
        }

        // multihead attention. iterate over all heads
        int h;
        #pragma omp parallel for private(h)
        for (h = 0; h < p->n_heads; h++) {
            // get the query vector for this head
            float* q = s->q + h * head_size;
            // attention scores for this head
            float* att = s->att + h * p->seq_len;
            // iterate over all timesteps, including the current one
            for (int t = 0; t <= pos; t++) {
                // get the key vector for this head and at this timestep
                float* k = s->key_cache + loff + t * kv_dim + (h / kv_mul) * head_size;
                // calculate the attention score as the dot product of q and k
                float score = 0.0f;
                for (int i = 0; i < head_size; i++) {
                    score += q[i] * k[i];
                }
                score /= sqrtf(head_size);
                // save the score to the attention buffer
                att[t] = score;
            }

            // softmax the scores to get attention weights, from 0..pos inclusively
            softmax(att, pos + 1);

            // weighted sum of the values, store back into xb
            float* xb = s->xb + h * head_size;
            memset(xb, 0, head_size * sizeof(float)); // zero the memory xb points to
            for (int t = 0; t <= pos; t++) {
                // get the value vector for this head and at this timestep
                float* v = s->value_cache + loff + t * kv_dim + (h / kv_mul) * head_size;
                // get the attention weight for this timestep
                float a = att[t];
                // accumulate the weighted value into xb
                for (int i = 0; i < head_size; i++) {
                    xb[i] += a * v[i];
                }
            }
        }

        // final matmul to get the output of the attention
        matmul(s->xb2, s->xb, w->wo + l*dim*dim, dim, dim);

        // residual connection back into x
        for (int i = 0; i < dim; i++) {
            x[i] += s->xb2[i];
        }

        // ffn rmsnorm
        rmsnorm(s->xb, x, w->rms_ffn_weight + l*dim, dim);

        // Now for FFN in PyTorch we have: self.w2(F.silu(self.w1(x)) * self.w3(x))
        // first calculate self.w1(x) and self.w3(x)
        matmul(s->hb, s->xb, w->w1 + l*dim*hidden_dim, dim, hidden_dim);
        matmul(s->hb2, s->xb, w->w3 + l*dim*hidden_dim, dim, hidden_dim);

        // SwiGLU non-linearity
        for (int i = 0; i < hidden_dim; i++) {
            float val = s->hb[i];
            // silu(x)=x*σ(x), where σ(x) is the logistic sigmoid
            val *= (1.0f / (1.0f + expf(-val)));
            // elementwise multiply with w3(x)
            val *= s->hb2[i];
            s->hb[i] = val;
        }

        // final matmul to get the output of the ffn
        matmul(s->xb, s->hb, w->w2 + l*dim*hidden_dim, hidden_dim, dim);

        // residual connection
        for (int i = 0; i < dim; i++) {
            x[i] += s->xb[i];
        }
    }

    // final rmsnorm
    rmsnorm(x, x, w->rms_final_weight, dim);

    // classifier into logits
    matmul(s->logits, x, w->wcls, p->dim, p->vocab_size);
    return s->logits;
}
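One step worth spelling out: the RoPE loop above rotates each consecutive pair (vec[i], vec[i+1]) of q (and of k while i < kv_dim) by a position-dependent angle. In formula form, matching fcr/fci in the code:

$$
\begin{pmatrix} x_i' \\ x_{i+1}' \end{pmatrix}
=
\begin{pmatrix} \cos(\mathrm{pos}\cdot\omega_i) & -\sin(\mathrm{pos}\cdot\omega_i) \\ \sin(\mathrm{pos}\cdot\omega_i) & \cos(\mathrm{pos}\cdot\omega_i) \end{pmatrix}
\begin{pmatrix} x_i \\ x_{i+1} \end{pmatrix},
\qquad
\omega_i = 10000^{-(i \bmod \mathrm{head\_size})/\mathrm{head\_size}}
$$

Because the angle depends only on pos and the within-head dimension, the dot product of a rotated query and a rotated key depends only on their relative positions, which is the point of RoPE.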
Explanation 1
The config parameters:
(gdb) p *p
$16 = {dim = 288, hidden_dim = 768, n_layers = 6, n_heads = 6, n_kv_heads = 6, vocab_size = 32000, seq_len = 256}
The weights:
(gdb) p *w
$17 = {token_embedding_table = 0x7f7ffcc0d01c, rms_att_weight = 0x7f7ffef3501c, rms_ffn_weight = 0x7f7fff6ceb1c, wq = 0x7f7ffef36b1c, wk = 0x7f7fff11cb1c, wv = 0x7f7fff302b1c, wo = 0x7f7fff4e8b1c, w1 = 0x7f7fff6d061c, w2 = 0x7f7fffbe061c, w3 = 0x7f80000f061c, rms_final_weight = 0x7f800060061c, wcls = 0x7f7ffcc0d01c}
The run state:
(gdb) p *s
$18 = {x = 0x55bde82af480, xb = 0x55bde82af910, xb2 = 0x55bde82afda0, hb = 0x55bde82b0230, hb2 = 0x55bde82b0e40, q = 0x55bde82b1a50, k = 0x0, v = 0x0, att = 0x55bde82b1ee0, logits = 0x55bde82b36f0, key_cache = 0x7f7ffca5c010, value_cache = 0x7f7ffc8ab010}
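Note that k and v print as 0x0 here: they are not separate buffers but are pointed into key_cache/value_cache inside the layer loop of forward. From this config, the convenience variables computed at the top of forward work out to:

kv_dim    = dim * n_kv_heads / n_heads = 288 * 6 / 6 = 288
kv_mul    = n_heads / n_kv_heads       = 6 / 6 = 1   (no kv sharing: full multi-head attention)
head_size = dim / n_heads              = 288 / 6 = 48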
Explanation 2
rmsnorm(s->xb, x, w->rms_att_weight + l*dim, dim);
void rmsnorm(float* o, float* x, float* weight, int size) {
    // calculate sum of squares: accumulate the square of every element of x
    float ss = 0.0f;
    for (int j = 0; j < size; j++) {
        ss += x[j] * x[j];
    }
    ss /= size;
    ss += 1e-5f;
    ss = 1.0f / sqrtf(ss);
    // normalize and scale
    for (int j = 0; j < size; j++) {
        o[j] = weight[j] * (ss * x[j]); // weight[j] is a learned scale parameter
    }
}
(gdb) print *weight
$24 = 0.798294365
(gdb) print *x
$25 = -0.0170963854
This computes the RMS normalization (Root Mean Square Normalization, [formula](https://blog.csdn.net/lichunericli/article/details/136109344)) of the input vector x, stores the result in the output vector o, and scales the normalized values by the learned weight vector.
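In formula form, matching the code above (with n = size and ε = 1e-5 added for numerical stability):

$$ o_j = w_j \cdot \frac{x_j}{\sqrt{\frac{1}{n}\sum_{i=1}^{n} x_i^2 + \varepsilon}} $$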
Explanation 3
int loff = l * p->seq_len * kv_dim; // kv cache layer offset for convenience
s->k = s->key_cache + loff + pos * kv_dim;
s->v = s->value_cache + loff + pos * kv_dim;
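// Worked example, using the config printed in Explanation 1 above
// (dim = 288, n_heads = n_kv_heads = 6, seq_len = 256, hence kv_dim = 288):
// for layer l = 2 and position pos = 5,
//   loff = 2 * 256 * 288 = 147456
//   s->k = s->key_cache   + 147456 + 5 * 288
//   s->v = s->value_cache + 147456 + 5 * 288
// i.e. each layer owns a seq_len * kv_dim block of the cache, and each
// position owns one kv_dim-sized row inside that block.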
// First compute loff, the offset of this layer's (l) block in the kv cache;
// adding pos * kv_dim then locates the key/value vectors for the current position.

// qkv matmuls for this position
matmul(s->q, s->xb, w->wq + l*dim*dim, dim, dim);
void matmul(float* xout, float* x, float* w, int n, int d) // a linear-layer mapping
// s->q is the output, s->xb the input, w->wq + l*dim*dim points at this layer's
// query weight matrix, and the two dim arguments give the input and output lengths.
// W (d,n) @ x (n,) -> xout (d,)

At the end of forward, the same matmul maps the final hidden state onto the vocabulary:

// classifier into logits
matmul(s->logits, x, w->wcls, p->dim, p->vocab_size);
return s->logits;
Through this matrix multiplication, the model projects the information in x onto the entire vocabulary, producing a (1, p->vocab_size) vector in which each element is the predicted score (logit) for one vocabulary word.
(gdb) print *s->logits
$28 = -6.79079819
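For reference, the body of matmul is a plain row-major matrix-vector product. Reproduced here from the llama2.c source; treat it as a sketch if your copy differs:

void matmul(float* xout, float* x, float* w, int n, int d) {
    // W (d,n) @ x (n,) -> xout (d,)
    // by far the most time is spent inside this little function
    int i;
    #pragma omp parallel for private(i)
    for (i = 0; i < d; i++) {
        float val = 0.0f;
        for (int j = 0; j < n; j++) {
            val += w[i * n + j] * x[j]; // dot product of row i of W with x
        }
        xout[i] = val;
    }
}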
Back in the main generation loop:

// advance the state machine
if (pos < num_prompt_tokens - 1) {
    // if we are still processing the input prompt, force the next prompt token
    next = prompt_tokens[pos + 1];
} else {
    // otherwise sample the next token from the logits
    next = sample(sampler, logits);
}
pos++;

// data-dependent terminating condition: the BOS (=1) token delimits sequences
if (next == 1) { break; }

// print the token as string, decode it with the Tokenizer object
char* piece = decode(tokenizer, token, next);
Explanation
if (pos < num_prompt_tokens - 1) {
    // if we are still processing the input prompt, force the next prompt token
    next = prompt_tokens[pos + 1];

If pos is still inside the input prompt, the next token is forced to be the following prompt token: we are still consuming the prompt, not generating.

} else {
    // otherwise sample the next token from the logits
    next = sample(sampler, logits);
}

Otherwise, the next token is sampled from the logits produced by the forward pass. This is the actual generation step; the chosen token is then passed to decode.
1.3 decode
char* decode(Tokenizer* t, int prev_token, int token) {
    char *piece = t->vocab[token];
    // following BOS (1) token, sentencepiece decoder strips any leading whitespace (see PR #89)
    if (prev_token == 1 && piece[0] == ' ') { piece++; }
    // careful, some tokens designate raw bytes, and look like e.g. '<0x01>'
    // parse this and convert and return the actual byte
    unsigned char byte_val;
    if (sscanf(piece, "<0x%02hhX>", &byte_val) == 1) {
        piece = (char*)t->byte_pieces + byte_val * 2;
    }
    return piece;
}
Explanation
453 char* decode(Tokenizer* t, int prev_token, int token) {
(gdb) print *t
$35 = {vocab = 0x7f7ffc86c010, vocab_scores = 0x55bde82d2b00, sorted_vocab = 0x7f7ffc7af010, vocab_size = 32000, max_token_length = 27, byte_pieces = "\000\000\001\000\002\000\003\000\004\000\005\000\006\000\a\000\b\000\t\000\n\000\v\000\f\000\r\000\016\000\017\000\020\000\021\000\022\000\023\000\024\000\025\000\026\000\027\000\030\000\031\000\032\000\033\000\034\000\035\000\036\000\037\000 \000!\000\"\000#\000$\000%\000&\000'\000(\000)\000*\000+\000,\000-\000.\000/\000\060\000\061\000\062\000\063\000\064\000\065\000\066\000\067\000\070\000\071\000:\000;\000<\000=\000>\000?\000@\000A\000B\000C\000D\000E\000F\000G\000H\000I\000J\000K\000L\000M\000N\000O\000P\000Q\000R\000S\000T\000U\000V\000W\000X\000Y\000Z\000[\000\\\000]\000^\000_\000`\000a\000b\000c\000"...}
(gdb) print prev_token
$36 = 1
(gdb) print token
$37 = 22172
(gdb) print piece
$42 = 0x55bde83a02b1 "hello"
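Tying this together, a small hypothetical usage sketch consistent with the gdb session above:

// decoding token 22172 right after BOS (prev_token = 1): the vocab entry
// presumably begins with a space (sentencepiece's '▁'), which is stripped
// after BOS, so "hello" is returned.
char* piece = decode(tokenizer, 1, 22172); // -> "hello"

// raw-byte tokens take the other path: a vocab string such as "<0x0A>" is
// parsed by sscanf into byte_val = 10, and decode returns
// t->byte_pieces + 10 * 2, a two-byte { '\n', '\0' } entry in the table
// printed above.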
1.4 sample
int sample(Sampler* sampler, float* logits) {
    // sample the token given the logits and some hyperparameters
    int next;
    if (sampler->temperature == 0.0f) {
        // greedy argmax sampling: take the token with the highest probability
        next = sample_argmax(logits, sampler->vocab_size);
    } else {
        // apply the temperature to the logits
        for (int q=0; q<sampler->vocab_size; q++) { logits[q] /= sampler->temperature; }
        // apply softmax to the logits to get the probabilities for next token
        softmax(logits, sampler->vocab_size);
        // flip a (float) coin (this is our source of entropy for sampling)
        float coin = random_f32(&sampler->rng_state);
        // we sample from this distribution to get the next token
        if (sampler->topp <= 0 || sampler->topp >= 1) {
            // simply sample from the predicted probability distribution
            next = sample_mult(logits, sampler->vocab_size, coin);
        } else {
            // top-p (nucleus) sampling, clamping the least likely tokens to zero
            next = sample_topp(logits, sampler->vocab_size, sampler->topp, sampler->probindex, coin);
        }
    }
    return next;
}
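sample() relies on softmax() to turn the temperature-scaled logits into probabilities; it is the same function used on the attention scores in forward. Reproduced from run.c; treat it as a sketch if your copy differs:

void softmax(float* x, int size) {
    // find max value (for numerical stability when exponentiating)
    float max_val = x[0];
    for (int i = 1; i < size; i++) {
        if (x[i] > max_val) { max_val = x[i]; }
    }
    // exponentiate and sum
    float sum = 0.0f;
    for (int i = 0; i < size; i++) {
        x[i] = expf(x[i] - max_val);
        sum += x[i];
    }
    // normalize so the entries sum to 1
    for (int i = 0; i < size; i++) {
        x[i] /= sum;
    }
}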
Explanation
if (sampler->temperature == 0.0f) {
    // greedy argmax sampling: take the token with the highest probability
    next = sample_argmax(logits, sampler->vocab_size);

When the sampler's temperature is 0.0f, greedy (argmax) selection is used: the token with the highest logit is chosen directly as the next token (see the sketch below).
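sample_argmax itself is a simple linear scan; again reproduced from run.c as a sketch:

int sample_argmax(float* probabilities, int n) {
    // return the index that has the highest probability (or logit: argmax is
    // unchanged by the monotonic softmax, which is why sample() can skip the
    // softmax entirely in the temperature-0 branch)
    int max_i = 0;
    float max_p = probabilities[0];
    for (int i = 1; i < n; i++) {
        if (probabilities[i] > max_p) {
            max_i = i;
            max_p = probabilities[i];
        }
    }
    return max_i;
}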