Commit 897cacc

fixes : speculative KV cache + llama worst-case graph
1 parent 466b513 commit 897cacc

3 files changed (+12, -19 lines)


examples/parallel/parallel.cpp

+5, -15
@@ -80,7 +80,7 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-    const int n_clients = 16;
+    const int n_clients = 4;
 
 #ifndef LOG_DISABLE_LOGS
     log_set_target(log_filename_generator("parallel", "log"));
@@ -116,10 +116,6 @@ int main(int argc, char ** argv) {
     std::vector<llama_token_data> candidates;
     candidates.reserve(n_vocab);
 
-    auto t_main_start = ggml_time_us();
-
-    int64_t n_tokens_total = 0;
-
     llama_seq_id g_seq_id = 0;
 
     std::vector<llama_token> batch_token;
@@ -203,6 +199,9 @@ int main(int argc, char ** argv) {
                 continue;
             }
 
+            //printf("client %d, seq %d, token %d, pos %d, batch %d\n",
+            //        client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch);
+
             const llama_token id = llama_sample_token(ctx, NULL, NULL, params, client.last_tokens, candidates, client.i_batch - i);
 
             if (client.t_start_gen == 0) {
@@ -233,9 +232,7 @@ int main(int argc, char ** argv) {
 
                 const auto t_main_end = ggml_time_us();
 
-                n_tokens_total += client.n_decoded - client.n_prompt;
-
-                printf("\033[1mClient %2d, seq %4d, prompt %4d t, response %4d t, speed: PP %5.2f t/s, TG %5.2f, AVG %5.2f \033[0m: \n\nInput: %s\nResponse: %s\n\n",
+                printf("\033[1mClient %2d, seq %4d, prompt %4d t, response %4d t, speed: PP %5.2f t/s, TG %5.2f t/s, AVG %5.2f t/s \033[0m: \n\nInput: %s\nResponse: %s\n\n",
                         client.id, client.seq_id, client.n_prompt, client.n_decoded - client.n_prompt,
                         (double) (client.n_prompt) / (client.t_start_gen - client.t_start_prompt) * 1e6,
                         (double) (client.n_decoded - client.n_prompt) / (t_main_end - client.t_start_gen) * 1e6,
@@ -249,13 +246,6 @@ int main(int argc, char ** argv) {
                     client.i_batch = -1;
                 }
             }
-
-            static bool is_first = true;
-            if (is_first) {
-                t_main_start = ggml_time_us();
-                n_tokens_total = 0;
-                is_first = false;
-            }
         }
 
     LOG_TEE("\n\n");
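
The PP and TG figures in the printf above are token counts divided by ggml_time_us() intervals, with the * 1e6 factor converting microseconds into tokens per second. A worked sketch of that arithmetic, with illustrative values that are not taken from the commit:

    // Illustrative only: the per-client PP (prompt processing) speed computation.
    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t n_prompt    = 100;     // client.n_prompt (assumed value)
        const int64_t t_prompt_us = 250000;  // client.t_start_gen - client.t_start_prompt (assumed)

        // 100 tokens / 250000 us * 1e6 = 400 tokens per second
        const double pp_speed = (double) n_prompt / t_prompt_us * 1e6;

        printf("PP %5.2f t/s\n", pp_speed);
        return 0;
    }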

examples/speculative/speculative.cpp

+3
@@ -172,6 +172,7 @@ int main(int argc, char ** argv) {
             LOG("out of drafted tokens\n");
         }
 
+        llama_kv_cache_rm_seq(ctx_dft, 0, n_past_dft, n_ctx);
         llama_decode(ctx_dft, llama_batch_get_one(&id, 1, n_past_dft, 0), params.n_threads);
         ++n_past_dft;
 
@@ -256,6 +257,7 @@ int main(int argc, char ** argv) {
         }
 
         // evaluate the drafted token on the draft model
+        llama_kv_cache_rm_seq(ctx_dft, 0, n_past_cur, n_ctx);
         llama_decode(ctx_dft, llama_batch_get_one(&drafted.back(), 1, n_past_cur, 0), params.n_threads);
         ++n_past_cur;
 
@@ -265,6 +267,7 @@ int main(int argc, char ** argv) {
         }
 
         // evaluate the target model on the drafted tokens
+        llama_kv_cache_rm_seq(ctx_tgt, 0, n_past_tgt, n_ctx);
         llama_decode(ctx_tgt, llama_batch_get_one(drafted.data(), drafted.size(), n_past_tgt, 0), params.n_threads);
         ++n_past_tgt;
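
The three insertions above apply one idiom: before llama_decode evaluates new tokens at position n_past_dft / n_past_cur / n_past_tgt, any KV-cache cells that an earlier, partially rejected draft left behind for sequence 0 in the range [n_past, n_ctx) are removed, so the new tokens cannot attend to stale entries. A minimal sketch of that idiom using the API calls shown in this diff (the helper name decode_one_at is hypothetical, not part of the repo):

    // Sketch only: wraps the pattern added to speculative.cpp at this commit.
    #include "llama.h"

    static void decode_one_at(llama_context * ctx, llama_token id,
                              int n_past, int n_ctx, int n_threads) {
        // drop leftover cells of sequence 0 in [n_past, n_ctx) from a rejected draft
        llama_kv_cache_rm_seq(ctx, 0, n_past, n_ctx);

        // evaluate the single token at position n_past, sequence 0
        llama_decode(ctx, llama_batch_get_one(&id, 1, n_past, 0), n_threads);
    }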

llama.cpp

+4, -4
@@ -2604,7 +2604,7 @@ static struct ggml_cgraph * llm_build_llama(
     const int n_gpu_layers = model.n_gpu_layers;
 
     const int32_t n_tokens = batch.n_tokens;
-    const int32_t n_kv = llama_kv_cache_cell_max(kv_self);
+    const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : llama_kv_cache_cell_max(kv_self);
 
     //printf("n_kv = %d\n", n_kv);
 
@@ -2775,7 +2775,7 @@ static struct ggml_cgraph * llm_build_llama(
             offload_func_kq(Kcur);
             ggml_set_name(Kcur, "Kcur");
 
-            struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale);
+            struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale);
             offload_func_kq(Qcur);
             ggml_set_name(Qcur, "Qcur");
 
@@ -6677,9 +6677,9 @@ struct llama_context * llama_new_context_with_model(
         ctx->alloc = ggml_allocr_new_measure(tensor_alignment);
 
         // build worst-case graph
-        uint32_t n_tokens = std::max((int)hparams.n_ctx, params.n_batch);
+        const uint32_t n_tokens = std::min((int) hparams.n_ctx, params.n_batch);
         llama_token token = llama_token_bos(ctx); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
-        ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, 0, 0));
+        ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, hparams.n_ctx - n_tokens, 0));
 
 #ifdef GGML_USE_METAL
         if (params.n_gpu_layers > 0) {
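
Both llama.cpp changes size the measurement pass for the worst case the graph can later see: while ggml_allocr_is_measure(lctx.alloc) is true, n_kv is forced to n_ctx - n_tokens instead of llama_kv_cache_cell_max(kv_self), the dummy batch is capped at std::min(n_ctx, n_batch) (the previous std::max could request more tokens than a real llama_decode call ever carries), and the batch is placed at the end of the context window. A short sketch of that sizing with assumed values, not taken from the commit:

    // Illustrative worst-case sizing; the n_ctx and n_batch values are assumptions.
    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    int main() {
        const int32_t n_ctx   = 2048;  // hparams.n_ctx  (assumed)
        const int32_t n_batch = 512;   // params.n_batch (assumed)

        // a single decode call carries at most n_batch tokens, so min() is the
        // largest batch the measured graph must accommodate
        const int32_t n_tokens = std::min(n_ctx, n_batch);     // 512

        // placing the dummy batch at the end of the context maximizes the KV
        // range the graph has to cover during measurement
        const int32_t pos  = n_ctx - n_tokens;                  // 1536
        const int32_t n_kv = n_ctx - n_tokens;                  // n_kv seen while measuring

        printf("n_tokens = %d, pos = %d, n_kv = %d\n", n_tokens, pos, n_kv);
        return 0;
    }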
