
ref: Prompt caching: 10x cheaper LLM tokens, but how? | ngrok blog

Prompt caching: 10x cheaper LLM tokens, but how?

Pseudocode

prompt = "What is the meaning of life?";

tokens = tokenizers(prompt);
while (true) {
    embeddings = embed(tokens);
    for ([attention, feedforward] of transformers) {
        embeddings = attention(embeddings);
        embeddings = feedforward(embeddings);
    }
    output_token = output(embeddings);
    if (output_token === END_TOKEN) {
        break;
    }
    tokens.push(output_token);
}

print(decode(tokens));
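`decode` is the inverse of `tokenize`: it maps token ids back to text. A minimal sketch, assuming a simple id-to-string `VOCABULARY` table (a hypothetical name; real BPE tokenizers are more involved, but the idea is the same):

// Hypothetical vocabulary table mapping token ids to strings.
const VOCABULARY = [...];

function decode(tokens) {
    return tokens.map(token => VOCABULARY[token]).join("");
}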

Embedding

You can explore how text maps to token ids with the Tiktokenizer web tool.

// Created during training, never changes during inference.
const EMBEDDINGS = [...];

function embed(tokens) {
    return tokens.map(token => {
        return EMBEDDINGS[token];
    });
}
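Concretely, the lookup behaves like this (a toy 3-dimensional table with invented values; real models use thousands of dimensions and tens of thousands of rows):

// Toy embedding table, values invented for illustration.
const EMBEDDINGS = [
    [0.2, -0.1, 0.5],   // token 0
    [-0.3, 0.8, 0.1],   // token 1
    [0.7, 0.0, -0.4],   // token 2
];

embed([2, 0]);
// => [[0.7, 0.0, -0.4], [0.2, -0.1, 0.5]]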

encodePosition

const EMBEDDINGS = [...];

// Input: array of token ids (integers)
// Output: array of n-dimensional embedding vectors,
// each adjusted for its position in the sequence.
function embed(tokens) {
    return tokens.map((token, i) => {
        const embedding = EMBEDDINGS[token];
        return encodePosition(embedding, i);
    });
}
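The post leaves `encodePosition` abstract. One common concrete choice is the sinusoidal encoding from the original transformer paper (Vaswani et al., 2017), added element-wise to the embedding; a minimal sketch, assuming n-dimensional embeddings:

// One possible implementation of encodePosition: sinusoidal
// positional encoding. Each dimension oscillates at a different
// frequency, so every position gets a unique fingerprint that
// is simply added to the token's embedding.
function encodePosition(embedding, position) {
    const n = embedding.length;
    return embedding.map((value, dim) => {
        const freq = 1 / Math.pow(10000, (2 * Math.floor(dim / 2)) / n);
        const angle = position * freq;
        const offset = dim % 2 === 0 ? Math.sin(angle) : Math.cos(angle);
        return value + offset;
    });
}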

Attention

// Similar to EMBEDDINGS from the pseudocode
// earlier, WQ and WK are learned during 
// training and do not change during inference.
// 
// These are both n*n matrices, where n is the
// number of embedding dimensions. In our example
// above, n = 3.
const WQ = [[...], [...], [...]];
const WK = [[...], [...], [...]];

// The input embeddings look like this:
// [
//   [-0.1, 0.1, -0.3], // Mary
//   [1.0, -0.5, -0.6], // had
//   [0.0, 0.8, 0.6],   // a
//   [0.5, -0.7, 1.0]   // little
// ]
function attentionWeights(embeddings) {
    // `*` denotes matrix multiplication throughout.
    const Q = embeddings * WQ;
    const K = embeddings * WK;
    // Attention scores: how strongly each token relates
    // to every other token.
    const scores = Q * transpose(K);
    // Causal mask: a token may only attend to itself
    // and to earlier tokens.
    const masked = mask(scores);
    // Normalize each row into weights that sum to 1.
    // (Real implementations also scale the scores by
    // 1/sqrt(n) before the softmax; omitted here.)
    return softmax(masked);
}
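`mask` and `softmax` are used above without definitions; a minimal sketch of both, assuming `scores` is a plain 2-D array:

// Causal mask: set every score above the diagonal to -Infinity
// so that softmax assigns those positions zero weight.
function mask(scores) {
    return scores.map((row, i) =>
        row.map((score, j) => (j > i ? -Infinity : score))
    );
}

// Row-wise softmax: exponentiate and normalize each row so its
// entries are positive and sum to 1.
function softmax(scores) {
    return scores.map(row => {
        const max = Math.max(...row);   // for numerical stability
        const exps = row.map(s => Math.exp(s - max));
        const sum = exps.reduce((a, b) => a + b, 0);
        return exps.map(e => e / sum);
    });
}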

Mixing embeddings

The WV matrix, also fixed at training time, lets the model filter out irrelevant features before the embeddings are mixed.

// Learned during training, doesn't change 
// during inference. This is also an n*n matrix,
// where n is the number of embedding dimensions.
const WV = [[...], [...], [...]];

function attention(embeddings) {
    const V = embeddings * WV;
    // `attentionWeights` is the function from the
    // section above; `attention` wraps it.
    const weights = attentionWeights(embeddings);
    // Each output row is a weighted mix of the value
    // vectors of the tokens it is allowed to see.
    return weights * V;
}
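Because of the causal mask, the K and V rows computed for earlier tokens never change as new tokens are generated, and that is the property prompt caching exploits. A minimal sketch of a per-layer KV cache, reusing the pseudocode conventions above (`attentionStep` and the cache arrays are illustrative names, not any provider's actual implementation):

// Illustrative per-layer KV cache. Under the causal mask, the
// k and v rows of earlier tokens never change, so we keep them
// and compute entries only for the newly appended token.
const cachedK = [];
const cachedV = [];

function attentionStep(newEmbedding) {
    const q = newEmbedding * WQ;
    cachedK.push(newEmbedding * WK);
    cachedV.push(newEmbedding * WV);
    // The new token attends to every cached position. No mask
    // is needed: only past tokens are in the cache.
    const scores = q * transpose(cachedK);
    return softmax(scores) * cachedV;
}

Generating a token then touches each past position once, instead of recomputing the full score matrix over the whole sequence at every step.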
