1
1
#include "common.h"
2
+ #include "ggml.h"
2
3
#include "llama.h"
3
4
4
5
#include <cmath>
6
+ #include <cstdint>
5
7
#include <cstdio>
6
8
#include <string>
7
9
#include <vector>
@@ -73,6 +75,8 @@ int main(int argc, char ** argv){
73
75
int n_drafted = 0 ;
74
76
int n_accept = 0 ;
75
77
78
+ int64_t t_draft_us = 0 ;
79
+
76
80
int n_past = inp.size ();
77
81
78
82
bool has_eos = false ;
@@ -160,7 +164,7 @@ int main(int argc, char ** argv){
160
164
161
165
// generate n_pred tokens through prompt lookup
162
166
auto prompt_lookup = [&]() -> void {
163
- int inp_size = inp.size ();
167
+ const int inp_size = inp.size ();
164
168
for (int ngram_size = ngram_max ; ngram_size > ngram_min; --ngram_size){
165
169
const llama_token * ngram = &inp[inp_size - ngram_size];
166
170
@@ -191,8 +195,12 @@ int main(int argc, char ** argv){
191
195
return ;
192
196
};
193
197
198
+ const int64_t t_start_draft_us = ggml_time_us ();
199
+
194
200
prompt_lookup ();
195
201
202
+ t_draft_us += ggml_time_us () - t_start_draft_us;
203
+
196
204
llama_decode (ctx, batch_tgt);
197
205
++n_past;
198
206
@@ -210,6 +218,8 @@ int main(int argc, char ** argv){
210
218
LOG_TEE (" n_draft = %d\n " , n_draft);
211
219
LOG_TEE (" n_predict = %d\n " , n_predict);
212
220
LOG_TEE (" n_drafted = %d\n " , n_drafted);
221
+ LOG_TEE (" t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n " ,
222
+ t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us));
213
223
LOG_TEE (" n_accept = %d\n " , n_accept);
214
224
LOG_TEE (" accept = %.3f%%\n " , 100.0f * n_accept / n_drafted);
215
225
0 commit comments