Prompt caching: LLM tokens are 10x cheaper, but how?
Pseudocode
prompt = "What is the meaning of life?";
tokens = tokenizers(prompt);
while (true) {
embeddings = embed(tokens);
for ([attention, feedforward] of transformers) {
embeddings = attention(embeddings);
embeddings = feedforward(embeddings);
}
output_token = output(embeddings);
if (output_token === END_TOKEN) {
break;
}
tokens.push(output_token);
}
print(decode(tokens));
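The loop relies on an output step that turns the final embeddings back into a single token. The article doesn't show it; a minimal greedy sketch, where the WOUT projection matrix and the argmax helper are assumptions made purely for illustration, could look like this:

// Assumed for illustration: learned during training, projects an
// embedding onto one score (logit) per vocabulary entry.
const WOUT = [[...], [...], [...]];

function output(embeddings) {
  // Only the last position's embedding is needed to
  // predict the next token.
  const last = embeddings[embeddings.length - 1];
  const logits = last * WOUT;
  // Greedy decoding: take the highest-scoring token
  // (argmax assumed as a helper).
  return argmax(logits);
}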
Embedding
// Created during training, never changes during inference.
const EMBEDDINGS = [...];

function embed(tokens) {
  return tokens.map(token => {
    return EMBEDDINGS[token];
  });
}
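As a quick illustration (the token ids below are made up), embedding a prompt is just one table lookup per token:

// Hypothetical ids for "Mary", "had", "a"
const tokens = [1042, 301, 7];
const embeddings = embed(tokens);
// => three n-dimensional arrays, one per token,
//    copied straight out of EMBEDDINGS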
encodePosition
const EMBEDDINGS = [...];

// Input: array of tokens (integers)
// Output: array of n-dimensional embedding arrays
function embed(tokens) {
  return tokens.map((token, i) => {
    const embeddings = EMBEDDINGS[token];
    // Mix each token's position into its embedding so that
    // the model can tell word order apart.
    return encodePosition(embeddings, i);
  });
}
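encodePosition itself isn't defined here. A simplified sketch, assuming a sinusoidal scheme in the spirit of the original transformer paper (the exact formula below is an assumption, not the article's code):

// Sketch only: sinusoidal positional encoding.
function encodePosition(embedding, position) {
  return embedding.map((value, dim) => {
    // Lower dimensions oscillate quickly, higher ones slowly,
    // giving every position a distinct signature.
    const frequency = 1 / Math.pow(10000, dim / embedding.length);
    const offset = dim % 2 === 0
      ? Math.sin(position * frequency)
      : Math.cos(position * frequency);
    return value + offset;
  });
}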
The attention mechanism
// Similar to EMBEDDINGS from the pseudocode
// earlier, WQ and WK are learned during
// training and do not change during inference.
//
// These are both n*n matrices, where n is the
// number of embedding dimensions. In our example
// above, n = 3.
const WQ = [[...], [...], [...]];
const WK = [[...], [...], [...]];

// The input embeddings look like this:
// [
//   [-0.1,  0.1, -0.3], // Mary
//   [ 1.0, -0.5, -0.6], // had
//   [ 0.0,  0.8,  0.6], // a
//   [ 0.5, -0.7,  1.0]  // little
// ]
function attentionWeights(embeddings) {
  const Q = embeddings * WQ;
  const K = embeddings * WK;
  // Attention scores: how strongly each token should
  // attend to every other token.
  const scores = Q * transpose(K);
  // Mask: a token may only attend to itself and the
  // tokens before it.
  const masked = mask(scores);
  return softmax(masked);
}
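mask and softmax aren't defined in the snippet above. Roughly, and as an assumption about what a typical decoder-style implementation does, they would look like this:

// Causal mask: push every score where a token would be looking
// at a token that comes after it down to -Infinity.
function mask(scores) {
  return scores.map((row, i) => {
    return row.map((score, j) => (j <= i ? score : -Infinity));
  });
}

// Row-wise softmax: turn each row of masked scores into weights
// that are positive and sum to 1. Math.exp(-Infinity) is 0, so
// masked positions get a weight of exactly 0.
function softmax(scores) {
  return scores.map(row => {
    const exps = row.map(score => Math.exp(score));
    const total = exps.reduce((sum, e) => sum + e, 0);
    return exps.map(e => e / total);
  });
}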
Mixing embeddings
Like WQ and WK, the WV matrix is learned during training and fixed at inference time. WV lets the model filter out irrelevant features from each embedding before the embeddings are mixed together.
// Learned during training, doesn't change
// during inference. This is also an n*n matrix,
// where n is the number of embedding dimensions.
const WV = [[...], [...], ...];

function attention(embeddings) {
  const V = embeddings * WV;
  // This is the `attentionWeights` function from
  // the section above. We're wrapping it in
  // this `attention` function.
  const weights = attentionWeights(embeddings);
  // Each output row is a weighted mix of the value vectors,
  // using the attention weights computed above.
  return weights * V;
}
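Putting it together on the example above: feeding the four "Mary had a little" embeddings through attention gives back four embeddings of the same shape, where each row now also carries information from the unmasked tokens before it. A rough usage sketch, reusing the example values from earlier:

const embeddings = [
  [-0.1,  0.1, -0.3], // Mary
  [ 1.0, -0.5, -0.6], // had
  [ 0.0,  0.8,  0.6], // a
  [ 0.5, -0.7,  1.0]  // little
];

// Still a 4 x 3 matrix, but every row is now a weighted
// mix of itself and the rows above it.
const mixed = attention(embeddings);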