ane.cpp/generate.h at main · skyfallsin/ane.cpp · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
#pragma once

#include "models/llm/qwen3_5.h"
#include "core/tokenizer.h"
#include "core/sampling.h"
#include <functional>
#include <string>
#include <vector>
#include <utility>

namespace ane_lm {

struct GenerationResponse {
    std::string text;
    int token = 0;
    int prompt_tokens = 0;
    double prompt_tps = 0.0;
    int generation_tokens = 0;
    double generation_tps = 0.0;
};

struct DraftModelContext {
    LLMModel* model = nullptr;
    Tokenizer* tokenizer = nullptr;
};

// Multi-turn: accepts full message history
void stream_generate(
    LLMModel& model,
    Tokenizer& tokenizer,
    const std::vector<std::pair<std::string, std::string>>& messages,
    int max_tokens = 0,
    bool enable_thinking = false,
    const SamplingParams& sampling = {},
    std::function<void(const GenerationResponse&)> callback = nullptr,
    DraftModelContext* draft = nullptr);

// Single-prompt convenience overload
void stream_generate(
    LLMModel& model,
    Tokenizer& tokenizer,
    const std::string& prompt,
    int max_tokens = 0,
    bool enable_thinking = false,
    const SamplingParams& sampling = {},
    std::function<void(const GenerationResponse&)> callback = nullptr,
    DraftModelContext* draft = nullptr);

} // namespace ane_lm