
Commit 65bbb1a

add LLAMA_SCHED_MAX_COPIES to configure the number of input copies for pipeline parallelism
default increased to 4 (from 2); changing this value may improve performance for some systems, but increases memory usage
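Since this is a compile-time setting, the number of copies has to be chosen when building. A minimal sketch of how it could be overridden with either build system, using the previous default of 2 purely as an example value (these command lines are illustrative, not part of this commit):

# CMake: LLAMA_SCHED_MAX_COPIES is a cache variable (see CMakeLists.txt below)
cmake -B build -DLLAMA_SCHED_MAX_COPIES=2
cmake --build build

# Make: the new ifdef forwards it as -DGGML_SCHED_MAX_COPIES=<value>
make LLAMA_SCHED_MAX_COPIES=2

Either path defines GGML_SCHED_MAX_COPIES for ggml-backend.c; if it is left undefined, the #ifndef block added there falls back to 4.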

5 files changed: +54 -30 lines

CMakeLists.txt

+3
@@ -118,6 +118,7 @@ option(LLAMA_SYCL "llama: use SYCL"
 option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF)
 set(LLAMA_SYCL_TARGET "INTEL" CACHE STRING "llama: sycl target device")
 option(LLAMA_CPU_HBM "llama: use memkind for CPU HBM" OFF)
+set(LLAMA_SCHED_MAX_COPIES "4" CACHE STRING "llama: max input copies for pipeline parallelism")
 
 option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
@@ -147,6 +148,8 @@ set(THREADS_PREFER_PTHREAD_FLAG ON)
 find_package(Threads REQUIRED)
 include(CheckCXXCompilerFlag)
 
+add_compile_definitions(GGML_SCHED_MAX_COPIES=${LLAMA_SCHED_MAX_COPIES})
+
 # enable libstdc++ assertions for debug builds
 if (CMAKE_SYSTEM_NAME MATCHES "Linux")
     add_compile_definitions($<$<CONFIG:Debug>:_GLIBCXX_ASSERTIONS>)

Makefile

+4
@@ -167,6 +167,10 @@ ifeq ($(UNAME_S),OpenBSD)
 MK_CPPFLAGS += -D_BSD_SOURCE
 endif
 
+ifdef LLAMA_SCHED_MAX_COPIES
+MK_CPPFLAGS += -DGGML_SCHED_MAX_COPIES=$(LLAMA_SCHED_MAX_COPIES)
+endif
+
 ifdef LLAMA_DEBUG
 MK_CFLAGS += -O0 -g
 MK_CXXFLAGS += -O0 -g

ggml-backend.c

+43 -28

@@ -387,7 +387,7 @@ void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event)
 
 // backend registry
 
-#define GGML_MAX_BACKENDS_REG 16
+#define GGML_SCHED_MAX_BACKENDS_REG 16
 
 struct ggml_backend_reg {
     char name[128];
@@ -396,7 +396,7 @@ struct ggml_backend_reg {
     void * user_data;
 };
 
-static struct ggml_backend_reg ggml_backend_registry[GGML_MAX_BACKENDS_REG];
+static struct ggml_backend_reg ggml_backend_registry[GGML_SCHED_MAX_BACKENDS_REG];
 static size_t ggml_backend_registry_count = 0;
 
 GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data);
@@ -441,7 +441,7 @@ GGML_CALL static void ggml_backend_registry_init(void) {
 }
 
 GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
-    GGML_ASSERT(ggml_backend_registry_count < GGML_MAX_BACKENDS_REG);
+    GGML_ASSERT(ggml_backend_registry_count < GGML_SCHED_MAX_BACKENDS_REG);
 
     size_t id = ggml_backend_registry_count;
 
@@ -993,16 +993,27 @@ static bool ggml_is_view_op(enum ggml_op op) {
 
 // scheduler
 
-#define GGML_MAX_BACKENDS 16
-#define GGML_MAX_SPLITS 256
-#define GGML_MAX_SPLIT_INPUTS 16
-#define GGML_MAX_COPIES 2
+#ifndef GGML_SCHED_MAX_BACKENDS
+#define GGML_SCHED_MAX_BACKENDS 16
+#endif
+
+#ifndef GGML_SCHED_MAX_SPLITS
+#define GGML_SCHED_MAX_SPLITS 256
+#endif
+
+#ifndef GGML_SCHED_MAX_SPLIT_INPUTS
+#define GGML_SCHED_MAX_SPLIT_INPUTS 16
+#endif
+
+#ifndef GGML_SCHED_MAX_COPIES
+#define GGML_SCHED_MAX_COPIES 4
+#endif
 
 struct ggml_backend_sched_split {
     int backend_id;
     int i_start;
     int i_end;
-    struct ggml_tensor * inputs[GGML_MAX_SPLIT_INPUTS];
+    struct ggml_tensor * inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
     int n_inputs;
     // graph view of this split
    struct ggml_cgraph graph;
@@ -1014,15 +1025,15 @@ struct ggml_backend_sched {
 
     int n_backends;
 
-    ggml_backend_t backends[GGML_MAX_BACKENDS];
-    ggml_backend_buffer_type_t bufts[GGML_MAX_BACKENDS];
+    ggml_backend_t backends[GGML_SCHED_MAX_BACKENDS];
+    ggml_backend_buffer_type_t bufts[GGML_SCHED_MAX_BACKENDS];
     ggml_gallocr_t galloc;
 
     // hash keys of the nodes in the graph
     struct ggml_hash_set hash_set;
     // hash values
     int * tensor_backend_id;
-    struct ggml_tensor * (* tensor_copies)[GGML_MAX_BACKENDS][GGML_MAX_COPIES];
+    struct ggml_tensor * (* tensor_copies)[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
 
     int * node_backend_ids; // [graph_size]
     int * leaf_backend_ids; // [graph_size]
@@ -1031,14 +1042,14 @@ struct ggml_backend_sched {
     struct ggml_cgraph * graph;
 
     // graph splits
-    struct ggml_backend_sched_split splits[GGML_MAX_SPLITS];
+    struct ggml_backend_sched_split splits[GGML_SCHED_MAX_SPLITS];
     int n_splits;
 
     // pipeline parallelism support
     int n_copies;
     int cur_copy;
-    ggml_backend_event_t events[GGML_MAX_BACKENDS][GGML_MAX_COPIES];
-    struct ggml_tensor * graph_inputs[GGML_MAX_SPLIT_INPUTS];
+    ggml_backend_event_t events[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
+    struct ggml_tensor * graph_inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
     int n_graph_inputs;
 
     struct ggml_context * ctx;
@@ -1047,12 +1058,12 @@ struct ggml_backend_sched {
     void * callback_eval_user_data;
 
     // align context_buffer to GGML_MEM_ALIGN
-    #ifdef _MSC_VER
+#ifdef _MSC_VER
     __declspec(align(GGML_MEM_ALIGN))
-    #else
+#else
     __attribute__((aligned(GGML_MEM_ALIGN)))
-    #endif
-    char context_buffer[GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
+#endif
+    char context_buffer[GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
 };
 
 #define hash_id(tensor) ggml_hash_find_or_insert(sched->hash_set, tensor)
@@ -1089,7 +1100,7 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, co
 }
 
 #if 0
-static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug only
+static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
 #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
 #define GET_CAUSE(node) causes[hash_id(node)]
 #else
@@ -1395,7 +1406,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             if (tensor_backend_id != cur_backend_id) {
                 sched->splits[cur_split].i_end = i;
                 cur_split++;
-                GGML_ASSERT(cur_split < GGML_MAX_SPLITS);
+                GGML_ASSERT(cur_split < GGML_SCHED_MAX_SPLITS);
                 sched->splits[cur_split].backend_id = tensor_backend_id;
                 sched->splits[cur_split].i_start = i;
                 sched->splits[cur_split].n_inputs = 0;
@@ -1433,7 +1444,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                        SET_CAUSE(tensor_copy, "4.cpy");
                    }
                    int n_graph_inputs = sched->n_graph_inputs++;
-                   GGML_ASSERT(n_graph_inputs < GGML_MAX_SPLIT_INPUTS);
+                   GGML_ASSERT(n_graph_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
                    sched->graph_inputs[n_graph_inputs] = src;
                }
            }
@@ -1455,7 +1466,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                        SET_CAUSE(tensor_copy, "4.cpy");
                    }
                    int n_inputs = sched->splits[cur_split].n_inputs++;
-                   GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
+                   GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
                    sched->splits[cur_split].inputs[n_inputs] = src;
                }
                node->src[j] = sched->tensor_copies[id][cur_backend_id][sched->cur_copy];
@@ -1507,7 +1518,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
 
     // create copies of the graph for each split
    // TODO: avoid this copy
-    struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_MAX_SPLIT_INPUTS, false);
+    struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS, false);
     for (int i = 0; i < sched->n_splits; i++) {
         struct ggml_backend_sched_split * split = &sched->splits[i];
         split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
@@ -1683,23 +1694,23 @@ ggml_backend_sched_t ggml_backend_sched_new(
         size_t graph_size,
         bool parallel) {
     GGML_ASSERT(n_backends > 0);
-    GGML_ASSERT(n_backends <= GGML_MAX_BACKENDS);
+    GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
     GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
 
     struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);
 
     // initialize hash table
-    sched->hash_set = ggml_hash_set_new(graph_size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
+    sched->hash_set = ggml_hash_set_new(graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS);
     sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size);
     sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size);
     sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), graph_size);
     sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0]), graph_size);
 
     sched->n_backends = n_backends;
 
-    sched->n_copies = parallel ? GGML_MAX_COPIES : 1;
+    sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
 
-    GGML_ASSERT(sched->n_copies <= GGML_MAX_COPIES);
+    GGML_ASSERT(sched->n_copies <= GGML_SCHED_MAX_COPIES);
 
     for (int b = 0; b < n_backends; b++) {
         sched->backends[b] = backends[b];
@@ -1764,7 +1775,7 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
 }
 
 bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
-    GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
+    GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS);
 
     ggml_backend_sched_split_graph(sched, graph);
 
@@ -1812,6 +1823,10 @@ int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
     return sched->n_splits;
 }
 
+int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched) {
+    return sched->n_copies;
+}
+
 size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
     int backend_index = ggml_backend_sched_backend_id(sched, backend);
     GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);

ggml-backend.h

+1
@@ -184,6 +184,7 @@ extern "C" {
 
     // Get the number of splits of the last graph
     GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
+    GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);
 
     GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
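The new getter exposes the effective copy count at runtime, which llama.cpp uses below for its startup log. A minimal sketch of calling it from user code, assuming a scheduler that was already created with ggml_backend_sched_new (setup omitted; the helper name is illustrative):

#include <stdio.h>
#include "ggml-backend.h"

// print how many input copies the scheduler rotates through:
// GGML_SCHED_MAX_COPIES when pipeline parallelism is enabled, 1 otherwise
static void print_sched_copies(ggml_backend_sched_t sched) {
    int n_copies = ggml_backend_sched_get_n_copies(sched);
    printf("scheduler input copies: %d\n", n_copies);
}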

llama.cpp

+3 -2

@@ -13005,10 +13005,11 @@ struct llama_context * llama_new_context_with_model(
         // currently this is only implemented in the CUDA backend
         pipeline_parallel = false;
 #endif
+        ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES, pipeline_parallel);
+
         if (pipeline_parallel) {
-            LLAMA_LOG_INFO("%s: pipeline parallelism enabled\n", __func__);
+            LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(ctx->sched));
         }
-        ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES, pipeline_parallel);
 
         // build worst-case graph
         int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_ubatch);
