
Commit 687a024

ggerganov authored and olexiyb committed on Nov 23, 2023
sync : ggml (backend v2) (ggml-org#3912)
* sync : ggml (backend v2) (wip)
* sync : migrate examples and llama.cpp to dynamic graphs (wip)
* sync : update tests + fix max op params to 64 ggml-ci
* sync : ggml-cuda ggml-ci
* llama : fix save/load state context size ggml-ci
* sync : try to fix build on tvOS
* sync : pass custom graph sizes in training examples
* sync : update graph copies to new ggml API
* sync : update sync-ggml.sh with new files
* scripts : fix header in sync script
* train : fix context size calculations
* llama : increase inference graph size up to 4096 nodes
* train : allocate grads for backward graphs
* train : allocate grads for gb_tmp
1 parent a00c424 commit 687a024

22 files changed: +1986 -856 lines changed
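The change that recurs throughout this diff is the move from ggml's old fixed-size, by-value graphs to dynamically allocated graphs created inside a ggml_context. A minimal sketch of the before/after pattern, using the same calls that appear in the hunks below (the ctx, out, work_buffer, and n_threads names are placeholders, not code from this commit):

    // before (backend v1): graph returned by value, passed around by address
    // struct ggml_cgraph gf = ggml_build_forward(out);
    // ggml_graph_compute_helper(work_buffer, &gf, n_threads);

    // after (backend v2): graph allocated in the context and passed by pointer
    struct ggml_cgraph * gf = ggml_new_graph(ctx);          // default capacity: GGML_DEFAULT_GRAPH_SIZE nodes
    ggml_build_forward_expand(gf, out);                      // build the forward graph from the output tensor
    ggml_graph_compute_helper(work_buffer, gf, n_threads);   // note: pointer, no '&' needed

Callers that index into the graph switch from gf.nodes[i] to gf->nodes[i] accordingly.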
 

Diff for: common/train.cpp (+1)

@@ -32,6 +32,7 @@ struct train_state * init_train_state() {
     state->opt = new struct ggml_opt_context;
     state->opt->ctx = NULL;
     state->opt->params = ggml_opt_default_params(GGML_OPT_ADAM);
+    state->opt->params.graph_size = LLAMA_TRAIN_MAX_NODES;
     state->opt->loss_after = 0.0f;

     return state;

Diff for: common/train.h (+2)

@@ -9,6 +9,8 @@
 #include "ggml.h"
 #include "llama.h"

+#define LLAMA_TRAIN_MAX_NODES 16384
+
 typedef std::string mt19937_state;

 struct train_state {
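Both the constant above and the new graph_size optimizer parameter come from this sync; the training code routes the larger node budget through ggml_opt_params. A short sketch mirroring the common/train.cpp hunk above (the opt pointer is a placeholder):

    // allow the optimizer to build forward/backward graphs with up to 16384 nodes
    opt->params = ggml_opt_default_params(GGML_OPT_ADAM);
    opt->params.graph_size = LLAMA_TRAIN_MAX_NODES;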

Diff for: examples/benchmark/benchmark-matmult.cpp (+12 -9)

@@ -171,7 +171,8 @@ int main(int argc, char ** argv) {
     struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2);

     // printf("Creating compute graph\n");
-    struct ggml_cgraph gf = ggml_build_forward(m11xm2);
+    struct ggml_cgraph * gf = ggml_new_graph(ctx);
+    ggml_build_forward_expand(gf, m11xm2);

     printf("n_threads=%i\n", benchmark_params.n_threads);

@@ -180,9 +181,9 @@

     std::vector<uint8_t> work_buffer;

-    ggml_graph_compute_helper(work_buffer, &gf, benchmark_params.n_threads);
+    ggml_graph_compute_helper(work_buffer, gf, benchmark_params.n_threads);

-    TENSOR_DUMP(gf.nodes[0]);
+    TENSOR_DUMP(gf->nodes[0]);

     printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype));

@@ -200,7 +201,8 @@
     struct ggml_tensor * q31 = ggml_mul_mat(ctx, q11, m2);

     // printf("Creating compute graph\n");
-    struct ggml_cgraph gf31 = ggml_build_forward(q31);
+    struct ggml_cgraph * gf31 = ggml_new_graph(ctx);
+    ggml_build_forward_expand(gf31, q31);

     // Set up a second graph computation to make sure we override the CPU cache lines
     // printf("Creating new tensor q12 & Running quantize\n");

@@ -211,7 +213,8 @@
     struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);

     //printf("Creating compute graph\n");
-    struct ggml_cgraph gf32 = ggml_build_forward(q32);
+    struct ggml_cgraph * gf32 = ggml_new_graph(ctx);
+    ggml_build_forward_expand(gf32, q32);
     printf("n_threads=%i\n", benchmark_params.n_threads);

     const int dimx = sizex;

@@ -223,7 +226,7 @@


     // Let's use the F32 result from above as a reference for the quantized multiplication
-    float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]);
+    float sum_of_F32_reference = tensor_sum_elements(gf->nodes[0]);

     printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");
     printf("=====================================================================================\n");

@@ -233,7 +236,7 @@

        long long int start = ggml_time_us();
        //printf("Running ggml_graph_compute\n");
-       ggml_graph_compute_helper(work_buffer, &gf31, benchmark_params.n_threads);
+       ggml_graph_compute_helper(work_buffer, gf31, benchmark_params.n_threads);

        long long int stop = ggml_time_us();
        long long int usec = stop-start;

@@ -251,7 +254,7 @@

        // Check that the matrix multiplication result is in the right ballpark
        // We cannot use the exact value from the F32 multiplication because the quantizuation will be slightly different
-       float sum_of_Q4_result = tensor_sum_elements(gf31.nodes[0]);
+       float sum_of_Q4_result = tensor_sum_elements(gf31->nodes[0]);
        float delta = std::abs(sum_of_Q4_result - sum_of_F32_reference);
        float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; // Let's accept an epsilon of 10^-6

@@ -266,7 +269,7 @@
        }

        // Running a different graph computation to make sure we override the CPU cache lines
-       ggml_graph_compute_helper(work_buffer, &gf32, benchmark_params.n_threads);
+       ggml_graph_compute_helper(work_buffer, gf32, benchmark_params.n_threads);
    }
    printf("\n");
    printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations));

Diff for: examples/export-lora/export-lora.cpp (+2 -2)

@@ -240,7 +240,7 @@ static struct lora_data * load_lora(struct lora_info * info) {
    }

    struct ggml_init_params params_ggml;
-   params_ggml.mem_size = ggml_tensor_overhead() * GGML_MAX_NODES;
+   params_ggml.mem_size = ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE;
    params_ggml.mem_buffer = NULL;
    params_ggml.no_alloc = true;
    result->ctx = ggml_init(params_ggml);

@@ -334,7 +334,7 @@ static bool apply_lora(struct ggml_tensor * tensor, struct lora_data * lora, int
    float scaling = lora->info.scale * (float)lora->lora_alpha / (float)lora->lora_r;

    struct ggml_init_params params;
-   params.mem_size = GGML_OBJECT_SIZE + GGML_GRAPH_SIZE + ggml_tensor_overhead()*4 + GGML_MEM_ALIGN*5;
+   params.mem_size = GGML_OBJECT_SIZE + ggml_graph_overhead() + ggml_tensor_overhead()*4 + GGML_MEM_ALIGN*5;
    params.mem_buffer = NULL;
    params.no_alloc = true;
    struct ggml_context * ctx = NULL;
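export-lora now sizes its metadata-only contexts with the new overhead helpers instead of the removed GGML_MAX_NODES and GGML_GRAPH_SIZE constants. A sketch of the sizing idiom used here and in clip.cpp below (variable names are placeholders):

    // context that holds only tensor/graph metadata; data buffers are managed elsewhere
    struct ggml_init_params params;
    params.mem_size   = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE  // per-tensor bookkeeping
                      + ggml_graph_overhead();                          // one default-sized graph
    params.mem_buffer = NULL;
    params.no_alloc   = true;
    struct ggml_context * ctx = ggml_init(params);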

Diff for: examples/finetune/finetune.cpp (+11 -12)

@@ -772,7 +772,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
    if (enable_checkpointing) {
        ggml_build_backward_gradient_checkpointing(ctx, gf, gb, gb_tmp, checkpoints.data(), (int) checkpoints.size());
    } else {
-       *gb = *gf;
+       ggml_graph_cpy(gf, gb);
        ggml_build_backward_expand(ctx, gf, gb, true);
    }

@@ -1615,6 +1615,7 @@ int main(int argc, char ** argv) {
    opt->params = ggml_opt_default_params(GGML_OPT_ADAM);
    opt->params.print_forward_graph = false;
    opt->params.print_backward_graph = false;
+   opt->params.graph_size = LLAMA_TRAIN_MAX_NODES;
    opt->params.n_threads = params.common.n_threads;
    opt->params.past = params.common.opt_past;
    opt->params.delta = params.common.opt_delta;

@@ -1741,11 +1742,9 @@
    ggml_allocr_free(alloc);

    // context for compute tensors without their data
-   size_t estimated_compute_size_wo_data = (
-       ggml_tensor_overhead()*GGML_MAX_NODES*2
-       + (GGML_OBJECT_SIZE+GGML_GRAPH_SIZE)*(
-           params.common.use_checkpointing ? 3 : 2
-       )
+   const size_t estimated_compute_size_wo_data = (
+       2*LLAMA_TRAIN_MAX_NODES*ggml_tensor_overhead() +
+       (params.common.use_checkpointing ? 3 : 2)*(GGML_OBJECT_SIZE+ggml_graph_overhead_custom(LLAMA_TRAIN_MAX_NODES, true))
    );
    struct ggml_init_params ctx_compute_params = {
        estimated_compute_size_wo_data, // mem_size

@@ -1768,11 +1767,11 @@
    for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) {
        ctx_compute = ggml_init(ctx_compute_params);
        alloc = ggml_allocr_new_measure(tensor_alignment);
-       gf = ggml_new_graph(ctx_compute);
+       gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
        gf->order = (enum ggml_cgraph_eval_order) order;
-       gb = ggml_new_graph(ctx_compute);
+       gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
        gb_tmp = params.common.use_checkpointing
-           ? ggml_new_graph(ctx_compute)
+           ? ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true)
            : NULL;
        loss = llama_build_lora_finetune_graphs(
            &model, &lora, alloc, ctx_compute,

@@ -1801,11 +1800,11 @@
    mem_compute_data.resize(max_compute_size);
    ctx_compute = ggml_init(ctx_compute_params);
    alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment);
-   gf = ggml_new_graph(ctx_compute);
+   gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
    gf->order = best_order;
-   gb = ggml_new_graph(ctx_compute);
+   gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
    gb_tmp = params.common.use_checkpointing
-       ? ggml_new_graph(ctx_compute)
+       ? ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true)
        : NULL;
    loss = llama_build_lora_finetune_graphs(
        &model, &lora, alloc, ctx_compute,
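The finetune example (and train-text-from-scratch below) needs graphs far larger than the default plus gradient storage, so the graphs are now created with an explicit node budget. A sketch of the pattern from the hunks above, with ctx_compute and the surrounding tensors supplied by the example:

    // forward and backward graphs sized for training, with gradient bookkeeping enabled
    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
    struct ggml_cgraph * gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);

    // without checkpointing, the backward graph starts as a copy of the forward graph
    // (the old code copied the struct by value: *gb = *gf)
    ggml_graph_cpy(gf, gb);
    ggml_build_backward_expand(ctx_compute, gf, gb, true);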

Diff for: examples/llava/clip.cpp (+1 -1)

@@ -664,7 +664,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
    // measure mem requirement and allocate
    {
        static const size_t tensor_alignment = 32;
-       new_clip->buf_compute.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead());
+       new_clip->buf_compute.resize(ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead());
        new_clip->alloc = ggml_allocr_new_measure(tensor_alignment);
        clip_image_f32_batch batch;
        batch.size = 1;

Diff for: examples/metal/metal.cpp (+5 -5)

@@ -34,7 +34,7 @@ int main(int argc, char ** argv) {
    struct ggml_context * ctx_data = NULL;
    struct ggml_context * ctx_eval = NULL;

-   struct ggml_cgraph gf = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval);
+   struct ggml_cgraph * gf = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval);

    // this allocates all Metal resources and memory buffers
    auto * ctx_metal = ggml_metal_init(1);

@@ -46,21 +46,21 @@

    // main
    {
-       struct ggml_tensor * input = ggml_graph_get_tensor(&gf, "embd");
+       struct ggml_tensor * input = ggml_graph_get_tensor(gf, "embd");
        *(int32_t *) input->data = 1; // BOS

        ggml_metal_set_tensor(ctx_metal, input);

        // warmup
-       ggml_metal_graph_compute(ctx_metal, &gf);
+       ggml_metal_graph_compute(ctx_metal, gf);

        const int n_iter = 16;

        const int64_t t0 = ggml_time_us();

        // the actual inference happens here
        for (int i = 0; i < n_iter; ++i) {
-           ggml_metal_graph_compute(ctx_metal, &gf);
+           ggml_metal_graph_compute(ctx_metal, gf);
        }

        const int64_t t1 = ggml_time_us();

@@ -70,7 +70,7 @@

    // debug output
    {
-       struct ggml_tensor * logits = gf.nodes[gf.n_nodes - 1];
+       struct ggml_tensor * logits = gf->nodes[gf->n_nodes - 1];
        ggml_metal_get_tensor(ctx_metal, logits);

        float * ptr = (float *) ggml_get_data(logits);

Diff for: examples/train-text-from-scratch/train-text-from-scratch.cpp (+11 -12)

@@ -436,7 +436,7 @@ static struct ggml_tensor * llama_build_train_graphs(
    if (enable_checkpointing) {
        ggml_build_backward_gradient_checkpointing(ctx, gf, gb, gb_tmp, checkpoints.data(), (int) checkpoints.size());
    } else {
-       *gb = *gf;
+       ggml_graph_cpy(gf, gb);
        ggml_build_backward_expand(ctx, gf, gb, true);
    }

@@ -1006,6 +1006,7 @@ int main(int argc, char ** argv) {
    opt->params = ggml_opt_default_params(GGML_OPT_ADAM);
    opt->params.print_forward_graph = false;
    opt->params.print_backward_graph = false;
+   opt->params.graph_size = LLAMA_TRAIN_MAX_NODES;
    opt->params.n_threads = params.common.n_threads;
    opt->params.past = params.common.opt_past;
    opt->params.delta = params.common.opt_delta;

@@ -1108,11 +1109,9 @@
    ggml_allocr_free(alloc);

    // context for compute tensors without their data
-   size_t estimated_compute_size_wo_data = (
-       ggml_tensor_overhead()*GGML_MAX_NODES*2
-       + (GGML_OBJECT_SIZE+GGML_GRAPH_SIZE)*(
-           params.common.use_checkpointing ? 3 : 2
-       )
+   const size_t estimated_compute_size_wo_data = (
+       2*LLAMA_TRAIN_MAX_NODES*ggml_tensor_overhead() +
+       (params.common.use_checkpointing ? 3 : 2)*(GGML_OBJECT_SIZE+ggml_graph_overhead_custom(LLAMA_TRAIN_MAX_NODES, true))
    );
    struct ggml_init_params ctx_compute_params = {
        estimated_compute_size_wo_data, // mem_size

@@ -1135,11 +1134,11 @@
    for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) {
        ctx_compute = ggml_init(ctx_compute_params);
        alloc = ggml_allocr_new_measure(tensor_alignment);
-       gf = ggml_new_graph(ctx_compute);
+       gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
        gf->order = (enum ggml_cgraph_eval_order) order;
-       gb = ggml_new_graph(ctx_compute);
+       gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
        gb_tmp = params.common.use_checkpointing
-           ? ggml_new_graph(ctx_compute)
+           ? ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true)
            : NULL;
        loss = llama_build_train_graphs(
            &model, alloc, ctx_compute,

@@ -1168,11 +1167,11 @@
    mem_compute_data.resize(max_compute_size);
    ctx_compute = ggml_init(ctx_compute_params);
    alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment);
-   gf = ggml_new_graph(ctx_compute);
+   gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
    gf->order = best_order;
-   gb = ggml_new_graph(ctx_compute);
+   gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
    gb_tmp = params.common.use_checkpointing
-       ? ggml_new_graph(ctx_compute)
+       ? ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true)
        : NULL;
    loss = llama_build_train_graphs(
        &model, alloc, ctx_compute,
