Skip to content

Commit e4640d8

Browse files
lookup: add print for drafting performance (ggml-org#5450)
1 parent 907e08c commit e4640d8

File tree

1 file changed

+11
-1
lines changed

1 file changed

+11
-1
lines changed

examples/lookup/lookup.cpp

+11-1
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,9 @@
11
#include "common.h"
2+
#include "ggml.h"
23
#include "llama.h"
34

45
#include <cmath>
6+
#include <cstdint>
57
#include <cstdio>
68
#include <string>
79
#include <vector>
@@ -73,6 +75,8 @@ int main(int argc, char ** argv){
7375
int n_drafted = 0;
7476
int n_accept = 0;
7577

78+
int64_t t_draft_us = 0;
79+
7680
int n_past = inp.size();
7781

7882
bool has_eos = false;
@@ -160,7 +164,7 @@ int main(int argc, char ** argv){
160164

161165
// generate n_pred tokens through prompt lookup
162166
auto prompt_lookup = [&]() -> void {
163-
int inp_size = inp.size();
167+
const int inp_size = inp.size();
164168
for (int ngram_size = ngram_max ; ngram_size > ngram_min; --ngram_size){
165169
const llama_token * ngram = &inp[inp_size - ngram_size];
166170

@@ -191,8 +195,12 @@ int main(int argc, char ** argv){
191195
return;
192196
};
193197

198+
const int64_t t_start_draft_us = ggml_time_us();
199+
194200
prompt_lookup();
195201

202+
t_draft_us += ggml_time_us() - t_start_draft_us;
203+
196204
llama_decode(ctx, batch_tgt);
197205
++n_past;
198206

@@ -210,6 +218,8 @@ int main(int argc, char ** argv){
210218
LOG_TEE("n_draft = %d\n", n_draft);
211219
LOG_TEE("n_predict = %d\n", n_predict);
212220
LOG_TEE("n_drafted = %d\n", n_drafted);
221+
LOG_TEE("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n",
222+
t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us));
213223
LOG_TEE("n_accept = %d\n", n_accept);
214224
LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
215225

0 commit comments

Comments (0)