encode(tokenizer, prompt, 1, 0, prompt_tokens, &num_prompt_tokens);
在encode函数结尾处:
(gdb) p *n_tokens
$3 = 2
(gdb) p *tokens@3
$6 = {1, 22172, 417}
(gdb) print num_prompt_tokens
$11 = 2
(gdb) print *prompt_tokens@3
$13 = {1, 22172, 417}
1.2 forward
float* logits = forward(transformer, token, pos);
它实现了一个Transformer模型在给定token和位置pos上的前向传播过程,并返回下一个token的logits(未归一化的预测得分,经过softmax之后才得到概率分布)。最后一步使用分类器权重wcls将最终的隐藏状态映射到logits,词汇表中的每个token对应其中一个得分。
/*
 * Run the transformer for one token at sequence position `pos`.
 *
 * The token's embedding row is copied into the activation buffer s->x, every
 * layer (attention + feed-forward, each with a residual connection) is applied
 * in turn, and the final activation is projected through w->wcls.
 *
 * Side effects: for every layer, the key/value vectors computed for `pos` are
 * written into s->key_cache / s->value_cache; the scratch buffers of the
 * RunState (x, xb, xb2, hb, hb2, q, att, logits) are overwritten.
 *
 * Returns s->logits, a buffer owned by the run state (p->vocab_size entries
 * are written by the final matmul). The values are raw scores; no softmax is
 * applied here.
 */
float* forward(Transformer* transformer, int token, int pos) {
    // grab the config, the weights and the run state from the Transformer struct
    // a few convenience variables
    Config* p = &transformer->config;
    TransformerWeights* w = &transformer->weights;
    RunState* s = &transformer->state;
    float *x = s->x;
    int dim = p->dim;
    int kv_dim = (p->dim * p->n_kv_heads) / p->n_heads;
    int kv_mul = p->n_heads / p->n_kv_heads; // integer multiplier of the kv sharing in multiquery
    int hidden_dim = p->hidden_dim;
    int head_size = dim / p->n_heads;

    // copy the token embedding into x
    // content_row points at the start of this token's embedding: each row of the
    // table holds dim floats, so the row for `token` starts at token * dim.
    // The memcpy makes that row the initial activation x, i.e. the network input.
    float* content_row = w->token_embedding_table + token * dim;
    memcpy(x, content_row, dim*sizeof(*x));

    // forward all the layers
    for(unsigned long long l = 0; l < p->n_layers; l++) {

        // attention rmsnorm
        rmsnorm(s->xb, x, w->rms_att_weight + l*dim, dim);

        // key and value point to the kv cache
        int loff = l * p->seq_len * kv_dim; // kv cache layer offset for convenience
        s->k = s->key_cache + loff + pos * kv_dim;
        s->v = s->value_cache + loff + pos * kv_dim;

        // qkv matmuls for this position
        // (s->k and s->v point into the cache, so k and v are stored there directly)
        matmul(s->q, s->xb, w->wq + l*dim*dim, dim, dim);
        matmul(s->k, s->xb, w->wk + l*dim*kv_dim, dim, kv_dim);
        matmul(s->v, s->xb, w->wv + l*dim*kv_dim, dim, kv_dim);

        // RoPE relative positional encoding: complex-valued rotate q and k in each head
        for (int i = 0; i < dim; i+=2) {
            int head_dim = i % head_size;
            float freq = 1.0f / powf(10000.0f, head_dim / (float)head_size);
            float val = pos * freq;
            float fcr = cosf(val);
            float fci = sinf(val);
            int rotn = i < kv_dim ? 2 : 1; // how many vectors? 2 = q & k, 1 = q only
            for (int v = 0; v < rotn; v++) {
                float* vec = v == 0 ? s->q : s->k; // the vector to rotate (query or key)
                float v0 = vec[i];
                float v1 = vec[i+1];
                vec[i]   = v0 * fcr - v1 * fci;
                vec[i+1] = v0 * fci + v1 * fcr;
            }
        }

        // multihead attention. iterate over all heads
        int h;
        #pragma omp parallel for private(h)
        for (h = 0; h < p->n_heads; h++) {
            // get the query vector for this head
            float* q = s->q + h * head_size;
            // attention scores for this head
            float* att = s->att + h * p->seq_len;
            // iterate over all timesteps, including the current one
            for (int t = 0; t <= pos; t++) {
                // get the key vector for this head and at this timestep
                float* k = s->key_cache + loff + t * kv_dim + (h / kv_mul) * head_size;
                // calculate the attention score as the dot product of q and k
                float score = 0.0f;
                for (int i = 0; i < head_size; i++) {
                    score += q[i] * k[i];
                }
                score /= sqrtf(head_size);
                // save the score to the attention buffer
                att[t] = score;
            }

            // softmax the scores to get attention weights, from 0..pos inclusively
            softmax(att, pos + 1);

            // weighted sum of the values, store back into xb
            float* xb = s->xb + h * head_size;
            memset(xb, 0, head_size * sizeof(float)); // zero this head's slice of xb before accumulating
            for (int t = 0; t <= pos; t++) {
                // get the value vector for this head and at this timestep
                float* v = s->value_cache + loff + t * kv_dim + (h / kv_mul) * head_size;
                // get the attention weight for this timestep
                float a = att[t];
                // accumulate the weighted value into xb
                for (int i = 0; i < head_size; i++) {
                    xb[i] += a * v[i];
                }
            }
        }

        // final matmul to get the output of the attention
        matmul(s->xb2, s->xb, w->wo + l*dim*dim, dim, dim);

        // residual connection back into x
        for (int i = 0; i < dim; i++) {
            x[i] += s->xb2[i];
        }

        // ffn rmsnorm
        rmsnorm(s->xb, x, w->rms_ffn_weight + l*dim, dim);

        // Now for FFN in PyTorch we have: self.w2(F.silu(self.w1(x)) * self.w3(x))
        // first calculate self.w1(x) and self.w3(x)
        matmul(s->hb, s->xb, w->w1 + l*dim*hidden_dim, dim, hidden_dim);
        matmul(s->hb2, s->xb, w->w3 + l*dim*hidden_dim, dim, hidden_dim);

        // SwiGLU non-linearity
        for (int i = 0; i < hidden_dim; i++) {
            float val = s->hb[i];
            // silu(x) = x * sigmoid(x), where sigmoid is the logistic function
            val *= (1.0f / (1.0f + expf(-val)));
            // elementwise multiply with w3(x)
            val *= s->hb2[i];
            s->hb[i] = val;
        }

        // final matmul to get the output of the ffn
        matmul(s->xb, s->hb, w->w2 + l*dim*hidden_dim, hidden_dim, dim);

        // residual connection
        for (int i = 0; i < dim; i++) {
            x[i] += s->xb[i];
        }
    }

    // final rmsnorm
    rmsnorm(x, x, w->rms_final_weight, dim);

    // classifier into logits
    matmul(s->logits, x, w->wcls, p->dim, p->vocab_size);
    return s->logits;
}
(gdb) p *p 配置参数
$16 = {dim = 288, hidden_dim = 768, n_layers = 6, n_heads = 6, n_kv_heads = 6, vocab_size = 32000, seq_len = 256}
(gdb) p *w 权重
$17 = {token_embedding_table = 0x7f7ffcc0d01c, rms_att_weight = 0x7f7ffef3501c, rms_ffn_weight = 0x7f7fff6ceb1c, wq = 0x7f7ffef36b1c, wk = 0x7f7fff11cb1c, wv = 0x7f7fff302b1c, wo = 0x7f7fff4e8b1c, w1 = 0x7f7fff6d061c, w2 = 0x7f7fffbe061c, w3 = 0x7f80000f061c, rms_final_weight = 0x7f800060061c, wcls = 0x7f7ffcc0d01c}
(gdb) p *s 运行状态
$18 = {x = 0x55bde82af480, xb = 0x55bde82af910, xb2 = 0x55bde82afda0, hb = 0x55bde82b0230, hb2 = 0x55bde82b0e40, q = 0x55bde82b1a50, k = 0x0, v = 0x0, att = 0x55bde82b1ee0, logits = 0x55bde82b36f0, key_cache = 0x7f7ffca5c010, value_cache = 0x7f7ffc8ab010}
rmsnorm(s->xb, x, w->rms_att_weight + l*dim, dim);
void rmsnorm(float* o, float* x, float* weight, int size) {// calculate sum of squaresfloat ss = 0.0f;for (int j = 0; j < size; j++) {ss += x[j] * x[j]; //计算输入向量x的平方和(sum of squares): 遍历向量x中的所有元素,将每个元素与自身相乘后累加到变量ss上。}ss /= size;ss += 1e-5f;ss = 1.0f / sqrtf(ss);// normalize and scalefor (int j = 0; j < size; j++) {o[j] = weight[j] * (ss * x[j]); //weight[j] 可学习参数}
(gdb) print *weight
$24 = 0.798294365
(gdb) print *x
$25 = -0.0170963854
计算输入向量x的RMS归一化(Root Mean Square Normalization)[公式](https://blog.csdn.net/lichunericli/article/details/136109344)版本,并将结果存储在输出向量o中:$o_j = w_j \cdot x_j \big/ \sqrt{\tfrac{1}{n}\sum_{i} x_i^2 + 10^{-5}}$。其中weight是可学习的权重向量(不是矩阵),用来对归一化后的向量逐元素缩放。
int loff = l * p->seq_len * kv_dim; // kv cache layer offset for convenience
s->k = s->key_cache + loff + pos * kv_dim;
s->v = s->value_cache + loff + pos * kv_dim;
//首先,计算key和value缓存(kv cache)在当前层(l)的起始偏移量 loff = l * seq_len * kv_dim;再加上 pos * kv_dim,就定位到该层当前序列位置(pos)的
//key和value向量。s->k、s->v 直接指向缓存中的这一位置,所以后面matmul的结果会直接写入kv cache。
// qkv matmuls for this position
matmul(s->q, s->xb, w->wq + l*dim*dim, dim, dim);
void matmul(float* xout, float* x, float* w, int n, int d) //线性层映射??
// s->q输出的结果, s->xb输入, w->wq + l*dim*dim权重数据指针,dim, dim表示输入、输出向量的长度
// W (d,n) @ x (n,) -> xout (d,)// classifier into logitsmatmul(s->logits, x, w->wcls, p->dim, p->vocab_size);return s->logits;
通过矩阵乘法,模型将从 x 中提取出的信息映射到整个词汇表上,生成一个维度为 (1, p->vocab_size) 的向量,这个向量的每个元素对应词汇表中一个词的预测得分(logit)。
(gdb) print *s->logits
$28 = -6.79079819
// advance the state machineif (pos < num_prompt_tokens - 1) {// if we are still processing the input prompt, force the next prompt tokennext = prompt_tokens[pos + 1];} else {// otherwise sample the next token from the logitsnext = sample(sampler, logits);}pos++;// data-dependent terminating condition: the BOS (=1) token delimits sequencesif (next == 1) { break; }// print the token as string, decode it with the Tokenizer objectchar* piece = decode(tokenizer, token, next);
if (pos < num_prompt_tokens - 1) {// if we are still processing the input prompt, force the next prompt tokennext = prompt_tokens[pos + 1];
然后decode// otherwise sample the next token from the logitsnext = sample(sampler, logits);
1.3 decode
/*
 * Return the string piece for `token`, given the token that preceded it.
 *
 * The returned pointer refers to storage inside the tokenizer (either an entry
 * of t->vocab or a slot of t->byte_pieces); nothing is allocated here.
 */
char* decode(Tokenizer* t, int prev_token, int token) {
    char *piece = t->vocab[token];
    // following BOS (1) token, sentencepiece decoder strips any leading whitespace (see PR #89)
    if (prev_token == 1 && piece[0] == ' ') { piece++; }
    // careful, some tokens designate raw bytes, and look like e.g. '<0x01>'
    // parse this and convert and return the actual byte
    unsigned char byte_val;
    if (sscanf(piece, "<0x%02hhX>", &byte_val) == 1) {
        // each byte owns a 2-char slot in byte_pieces, hence the stride of 2
        piece = (char*)t->byte_pieces + byte_val * 2;
    }
    return piece;
}
453 char* decode(Tokenizer* t, int prev_token, int token) {
(gdb) print *t
$35 = {vocab = 0x7f7ffc86c010, vocab_scores = 0x55bde82d2b00, sorted_vocab = 0x7f7ffc7af010, vocab_size = 32000, max_token_length = 27, byte_pieces = "\000\000\001\000\002\000\003\000\004\000\005\000\006\000\a\000\b\000\t\000\n\000\v\000\f\000\r\000\016\000\017\000\020\000\021\000\022\000\023\000\024\000\025\000\026\000\027\000\030\000\031\000\032\000\033\000\034\000\035\000\036\000\037\000 \000!\000\"\000#\000$\000%\000&\000'\000(\000)\000*\000+\000,\000-\000.\000/\000\060\000\061\000\062\000\063\000\064\000\065\000\066\000\067\000\070\000\071\000:\000;\000<\000=\000>\000?\000@\000A\000B\000C\000D\000E\000F\000G\000H\000I\000J\000K\000L\000M\000N\000O\000P\000Q\000R\000S\000T\000U\000V\000W\000X\000Y\000Z\000[\000\\\000]\000^\000_\000`\000a\000b\000c\000"...}
(gdb) print prev_token
$36 = 1
(gdb) print token
$37 = 22172
(gdb) print piece
$42 = 0x55bde83a02b1 "hello"
1.4 sample
/*
 * Pick the next token id from `logits` according to the sampler settings.
 *
 * - temperature == 0: greedy, delegates to sample_argmax.
 * - otherwise: logits are divided by the temperature and passed through
 *   softmax IN PLACE (the caller's buffer is modified), one random float is
 *   drawn, and the token is chosen by sample_mult when topp is outside (0, 1),
 *   or by sample_topp when it is inside.
 */
int sample(Sampler* sampler, float* logits) {
    // sample the token given the logits and some hyperparameters
    int next;
    if (sampler->temperature == 0.0f) {
        // greedy argmax sampling: take the token with the highest probability
        next = sample_argmax(logits, sampler->vocab_size);
    } else {
        // apply the temperature to the logits
        for (int q=0; q<sampler->vocab_size; q++) { logits[q] /= sampler->temperature; }
        // apply softmax to the logits to get the probabilities for next token
        softmax(logits, sampler->vocab_size);
        // flip a (float) coin (this is our source of entropy for sampling)
        float coin = random_f32(&sampler->rng_state);
        // we sample from this distribution to get the next token
        if (sampler->topp <= 0 || sampler->topp >= 1) {
            // simply sample from the predicted probability distribution
            next = sample_mult(logits, sampler->vocab_size, coin);
        } else {
            // top-p (nucleus) sampling, clamping the least likely tokens to zero
            next = sample_topp(logits, sampler->vocab_size, sampler->topp, sampler->probindex, coin);
        }
    }
    return next;
}
if (sampler->temperature == 0.0f) {// greedy argmax sampling: take the token with the highest probabilitynext = sample_argmax(logits, sampler->vocab_size);
当采样器的温度 sampler->temperature 等于 0.0f 时:使用贪婪选择策略(argmax),直接选取 logit 值最高的 token 作为下一个 token。