// llama.cpp "simple" example source code

#include "llama.h"                  // 包含 llama 的 API 定义
#include <cstdio>
#include <cstring>
#include <string>
#include <vector>

// Print a short command-line usage summary for this example to stdout.
//   argc - argument count (not used when formatting the message)
//   argv - argument vector; argv[0] is printed as the program name
static void print_usage(int argc, char ** argv) {
    static_cast<void>(argc); // only argv[0] is needed below

    const char * program = argv[0];

    fputs("\nexample usage:\n", stdout);
    printf("\n    %s -m model.gguf [-n n_predict] [-ngl n_gpu_layers] [prompt]\n", program);
    fputs("\n", stdout);
}

int main(int argc, char ** argv) {
    // 用于存储模型路径
    std::string model_path;
    // 默认 prompt 内容；用户可以通过命令行传入
    std::string prompt = "Hello my name is";
    // 指定 GPU offload 的层数，默认值为 99
    int ngl = 99;
    // 指定要预测的 token 数量，默认值为 32
    int n_predict = 32;

    // 解析命令行参数
    {
        int i = 1;
        // 遍历所有传入参数
        for (; i < argc; i++) {
            // 如果参数为 -m 表示模型的路径 
            if (strcmp(argv[i], "-m") == 0) {
                if (i + 1 < argc) {
                    model_path = argv[++i];  // 下一个参数即为模型文件路径
                } else {
                    print_usage(argc, argv);
                    return 1;
                }
            } 
            // 如果参数为 -n 表示预测 token 数量
            else if (strcmp(argv[i], "-n") == 0) {
                if (i + 1 < argc) {
                    try {
                        n_predict = std::stoi(argv[++i]); // 将字符串转换为整型
                    } catch (...) {
                        print_usage(argc, argv);
                        return 1;
                    }
                } else {
                    print_usage(argc, argv);
                    return 1;
                }
            } 
            // 如果参数为 -ngl 表示 GPU 层数
            else if (strcmp(argv[i], "-ngl") == 0) {
                if (i + 1 < argc) {
                    try {
                        ngl = std::stoi(argv[++i]);
                    } catch (...) {
                        print_usage(argc, argv);
                        return 1;
                    }
                } else {
                    print_usage(argc, argv);
                    return 1;
                }
            } 
            // 剩下参数当做 prompt 处理
            else {
                break;
            }
        }
        // 如果模型路径为空，则打印用法并退出
        if (model_path.empty()) {
            print_usage(argc, argv);
            return 1;
        }
        // 如果还有其他参数，拼接成完整的 prompt
        if (i < argc) {
            prompt = argv[i++];
            for (; i < argc; i++) {
                prompt += " ";
                prompt += argv[i];
            }
        }
    }

    // 加载所有动态后端（例如 BLAS、CUDA 后端等）
    ggml_backend_load_all();

    // 初始化模型参数，使用默认参数并根据命令行设置 GPU 层数
    llama_model_params model_params = llama_model_default_params();
    model_params.n_gpu_layers = ngl;

    // 通过模型文件路径加载模型
    // llama_model_load_from_file 是 llama.cpp 的 API，用于从 gguf 格式的模型文件构造一个 llama_model 实例
    llama_model * model = llama_model_load_from_file(model_path.c_str(), model_params);

    if (model == NULL) { // 如果加载失败则打印错误信息
        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
        return 1;
    }

    // 获取模型对应的词汇表。调用 llama_model_get_vocab 获取指向词汇表（llama_vocab）的指针
    const llama_vocab * vocab = llama_model_get_vocab(model);

    // 接下来对 prompt 进行分词处理
    // 使用 llama_tokenize 先计算 prompt 经过分词后包含的 token 个数。返回值为负值表示实际 token 数目
    const int n_prompt = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, true, true);

    // 分配存储 token 的内存，使用 vector 来存储分词结果
    std::vector<llama_token> prompt_tokens(n_prompt);
    // 实际对 prompt 进行分词，将 token 存入 prompt_tokens 数组中
    if (llama_tokenize(vocab, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true, true) < 0) {
        fprintf(stderr, "%s: error: failed to tokenize the prompt\n", __func__);
        return 1;
    }

    // 初始化模型的上下文
    // 首先设置 context 的参数，调用 llama_context_default_params 获得默认值
    llama_context_params ctx_params = llama_context_default_params();
    // 设置 n_ctx 为 prompt token 数量加预测 token 数量减 1
    ctx_params.n_ctx = n_prompt + n_predict - 1;
    // n_batch 表示每次 llama_decode 调用最多处理的 token 数量，设置为 prompt 的 token 数
    ctx_params.n_batch = n_prompt;
    // 启用性能计数器，用于内部统计运行时间等数据
    ctx_params.no_perf = false;

    // 创建一个 context 实例，关联已加载的模型。llama_init_from_model 使用模型和 ctx_params 初始化 transformer
    llama_context * ctx = llama_init_from_model(model, ctx_params);

    if (ctx == NULL) {
        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
        return 1;
    }

    // 初始化采样器，负责根据模型输出概率选择下一个 token
    auto sparams = llama_sampler_chain_default_params();
    sparams.no_perf = false;
    // llama_sampler_chain_init 返回一个采样器链的指针
    llama_sampler * smpl = llama_sampler_chain_init(sparams);

    // 将一个贪婪采样器加入采样器链， llama_sampler_init_greedy 实现了简单的 greedy 策略
    llama_sampler_chain_add(smpl, llama_sampler_init_greedy());

    // 输出 prompt 的 token 对应的文本，逐个 token 转换为文本拼接打印
    for (auto id : prompt_tokens) {
        char buf[128];
        // llama_token_to_piece 将 token id 转换为对应的字符串文字，保存到 buf
        int n = llama_token_to_piece(vocab, id, buf, sizeof(buf), 0, true);
        if (n < 0) {
            fprintf(stderr, "%s: error: failed to convert token to piece\n", __func__);
            return 1;
        }
        // 利用 std::string 构造字符串后打印
        std::string s(buf, n);
        printf("%s", s.c_str());
    }

    // 将 prompt tokens 打包成一个 batch，供 llama_decode 时使用
    llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());

    // 记录处理开始时间，以便之后输出生成 token 的速度
    const auto t_main_start = ggml_time_us();
    int n_decode = 0;           // 解码 token 的计数器
    llama_token new_token_id;   // 用于存储新采样的 token

    // 主循环：逐步生成新的 token
    // 循环条件：当前生成的 token 数量（n_pos）加上当前 batch 中的 token 数量小于总 token 数量（prompt + 预测不超过 n_predict）
    for (int n_pos = 0; n_pos + batch.n_tokens < n_prompt + n_predict; ) {
        // 调用 llama_decode 运行 transformer 模型，将当前 batch 的 token 送入模型计算
        if (llama_decode(ctx, batch)) {
            fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
            return 1;
        }

        // 更新处理位置
        n_pos += batch.n_tokens;

        // 采样下一个 token
        {
            // llama_sampler_sample 根据当前 context 的输出概率分布选择 token
            new_token_id = llama_sampler_sample(smpl, ctx, -1);

            // 检查是否为生成结束标记。 llama_vocab_is_eog 检查 token 是否为结束符（end-of-generation）
            if (llama_vocab_is_eog(vocab, new_token_id)) {
                break;
            }

            char buf[128];
            // 将采样的 token id 转回对应的字符串
            int n = llama_token_to_piece(vocab, new_token_id, buf, sizeof(buf), 0, true);
            if (n < 0) {
                fprintf(stderr, "%s: error: failed to convert token to piece\n", __func__);
                return 1;
            }
            std::string s(buf, n);
            // 打印该 token 的文本并刷新输出缓冲区，确保实时显示
            printf("%s", s.c_str());
            fflush(stdout);

            // 为下次迭代创建一个只有新 token 的 batch
            batch = llama_batch_get_one(&new_token_id, 1);

            n_decode += 1;
        }
    }

    // 打印换行符
    printf("\n");

    // 记录处理结束时间
    const auto t_main_end = ggml_time_us();

    // 输出解码过程的统计数据，如 token 数量、用时和速度（token/s）
    fprintf(stderr, "%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
            __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));

    fprintf(stderr, "\n");
    // 输出采样器和 context 的性能统计数据
    llama_perf_sampler_print(smpl);
    llama_perf_context_print(ctx);
    fprintf(stderr, "\n");

    // 释放分配的采样器、context 和模型资源，避免内存泄漏
    llama_sampler_free(smpl);
    llama_free(ctx);
    llama_model_free(model);

    return 0;
}
// THE END