Skip to content

Commit fe680e3

Browse files
ggerganov and slaren
authored
sync : ggml (new ops, tests, backend, etc.) (ggml-org#4359)
* sync : ggml (part 1) * sync : ggml (part 2, CUDA) * sync : ggml (part 3, Metal) * ggml : build fixes ggml-ci * cuda : restore lost changes * cuda : restore lost changes (StableLM rope) * cmake : enable separable compilation for CUDA ggml-ci * ggml-cuda : remove device side dequantize * Revert "cmake : enable separable compilation for CUDA" This reverts commit 09e35d0. * cuda : remove assert for rope * tests : add test-backend-ops * ggml : fix bug in ggml_concat * ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()` * ci : try to fix macOS * ggml-backend : remove backend self-registration * ci : disable Metal for macOS cmake build ggml-ci * metal : fix "supports family" call * metal : fix assert * metal : print resource path ggml-ci --------- Co-authored-by: slaren <[email protected]>
1 parent bcc0eb4 commit fe680e3

20 files changed

+4568
-962
lines changed

.github/workflows/build.yml

+11-4
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,9 @@ jobs:
143143
cd build
144144
ctest --verbose
145145
146+
# TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
147+
# how to debug it.
148+
# ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124
146149
macOS-latest-make:
147150
runs-on: macos-latest
148151

@@ -160,14 +163,18 @@ jobs:
160163
- name: Build
161164
id: make_build
162165
run: |
163-
make -j $(sysctl -n hw.logicalcpu)
166+
LLAMA_NO_METAL=1 make -j $(sysctl -n hw.logicalcpu)
164167
165168
- name: Test
166169
id: make_test
167170
run: |
168-
make tests -j $(sysctl -n hw.logicalcpu)
169-
make test -j $(sysctl -n hw.logicalcpu)
171+
LLAMA_NO_METAL=1 make tests -j $(sysctl -n hw.logicalcpu)
172+
LLAMA_NO_METAL=1 make test -j $(sysctl -n hw.logicalcpu)
170173
174+
# TODO: build with LLAMA_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know
175+
# how to debug it.
176+
# ref: https://github.com/ggerganov/llama.cpp/actions/runs/7132125951/job/19422043567?pr=4359#step:5:6584
177+
# would be great if we fix these
171178
macOS-latest-cmake:
172179
runs-on: macos-latest
173180

@@ -188,7 +195,7 @@ jobs:
188195
sysctl -a
189196
mkdir build
190197
cd build
191-
cmake ..
198+
cmake -DLLAMA_METAL=OFF ..
192199
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
193200
194201
- name: Test

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -101,3 +101,4 @@ poetry.toml
101101
/tests/test-tokenizer-1-llama
102102
/tests/test-tokenizer-1-bpe
103103
/tests/test-rope
104+
/tests/test-backend-ops

CMakeLists.txt

+7-7
Original file line numberDiff line numberDiff line change
@@ -97,9 +97,9 @@ option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging"
9797
option(LLAMA_MPI "llama: use MPI" OFF)
9898
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
9999

100-
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
101-
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
102-
option(LLAMA_BUILD_SERVER "llama: build server example" ON)
100+
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
101+
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
102+
option(LLAMA_BUILD_SERVER "llama: build server example" ON)
103103

104104
# Required for relocatable CMake package
105105
include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
@@ -662,11 +662,11 @@ add_library(ggml OBJECT
662662
ggml-backend.h
663663
ggml-quants.c
664664
ggml-quants.h
665-
${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
665+
${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
666666
${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
667-
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
668-
${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
669-
${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
667+
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
668+
${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
669+
${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
670670
)
671671

672672
target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})

Makefile

+5-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,8 @@ BUILD_TARGETS = \
88
TEST_TARGETS = \
99
tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
1010
tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
11-
tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope
11+
tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \
12+
tests/test-backend-ops
1213

1314
# Code coverage output files
1415
COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -746,3 +747,6 @@ tests/test-rope: tests/test-rope.cpp ggml.o $(OBJS)
746747

747748
tests/test-c.o: tests/test-c.c llama.h
748749
$(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@
750+
751+
tests/test-backend-ops: tests/test-backend-ops.cpp ggml.o $(OBJS)
752+
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

ggml-alloc.c

+42-7
Original file line numberDiff line numberDiff line change
@@ -168,10 +168,6 @@ static void ggml_tallocr_free_tensor(ggml_tallocr_t alloc, struct ggml_tensor *
168168
size = aligned_offset(NULL, size, alloc->alignment);
169169
AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks);
170170

171-
if (!alloc->measure) {
172-
ggml_backend_buffer_free_tensor(alloc->buffer, tensor);
173-
}
174-
175171
#ifdef GGML_ALLOCATOR_DEBUG
176172
remove_allocated_tensor(alloc, tensor);
177173
#endif
@@ -237,7 +233,7 @@ void ggml_tallocr_reset(ggml_tallocr_t alloc) {
237233
}
238234

239235
ggml_tallocr_t ggml_tallocr_new(void * data, size_t size, size_t alignment) {
240-
struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(NULL, data, size);
236+
struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(data, size);
241237

242238
ggml_tallocr_t alloc = (ggml_tallocr_t)malloc(sizeof(struct ggml_tallocr));
243239

@@ -449,7 +445,6 @@ static ggml_tallocr_t node_tallocr(ggml_gallocr_t galloc, struct ggml_tensor * n
449445
static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool update_backend) {
450446
ggml_tallocr_t alloc = node_tallocr(galloc, view);
451447

452-
//printf("init_view: %s from src %s\n", view->name, view->view_src->name);
453448
GGML_ASSERT(view->view_src != NULL && view->view_src->data != NULL);
454449
if (update_backend) {
455450
view->backend = view->view_src->backend;
@@ -459,7 +454,7 @@ static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool upd
459454

460455
// FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
461456
// due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
462-
assert(ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->backend == alloc->buffer->backend);
457+
assert(ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->buft == alloc->buffer->buft);
463458

464459
if (!alloc->measure) {
465460
ggml_backend_buffer_init_tensor(alloc->buffer, view);
@@ -765,3 +760,43 @@ size_t ggml_allocr_max_size(ggml_allocr_t alloc) {
765760
size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph) {
766761
return ggml_gallocr_alloc_graph(alloc->galloc, alloc->talloc, graph);
767762
}
763+
764+
// utils
765+
ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
766+
GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
767+
768+
size_t alignment = ggml_backend_buft_get_alignment(buft);
769+
770+
size_t nbytes = 0;
771+
for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
772+
if (t->data == NULL && t->view_src == NULL) {
773+
nbytes += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
774+
}
775+
}
776+
777+
if (nbytes == 0) {
778+
fprintf(stderr, "%s: no tensors to allocate\n", __func__);
779+
return NULL;
780+
}
781+
782+
ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, nbytes);
783+
ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer);
784+
785+
for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
786+
if (t->data == NULL) {
787+
if (t->view_src == NULL) {
788+
ggml_tallocr_alloc(tallocr, t);
789+
} else {
790+
ggml_backend_view_init(buffer, t);
791+
}
792+
}
793+
}
794+
795+
ggml_tallocr_free(tallocr);
796+
797+
return buffer;
798+
}
799+
800+
ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend) {
801+
return ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_get_default_buffer_type(backend));
802+
}

ggml-alloc.h

+7
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ extern "C" {
88

99
struct ggml_backend;
1010
struct ggml_backend_buffer;
11+
struct ggml_backend_buffer_type;
1112

1213
//
1314
// Legacy API
@@ -80,6 +81,12 @@ GGML_API void ggml_gallocr_alloc_graph_n(
8081
struct ggml_hash_set hash_set,
8182
ggml_tallocr_t * hash_node_talloc);
8283

84+
85+
// Utils
86+
// Create a buffer and allocate all the tensors in a ggml_context
87+
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, struct ggml_backend_buffer_type * buft);
88+
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, struct ggml_backend * backend);
89+
8390
#ifdef __cplusplus
8491
}
8592
#endif

ggml-backend-impl.h

+46-21
Original file line numberDiff line numberDiff line change
@@ -12,31 +12,50 @@ extern "C" {
1212
// Backend buffer
1313
//
1414

15+
// buffer type
16+
typedef void * ggml_backend_buffer_type_context_t;
17+
18+
struct ggml_backend_buffer_type_i {
19+
ggml_backend_buffer_t (*alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
20+
size_t (*get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
21+
size_t (*get_alloc_size) (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
22+
bool (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
23+
};
24+
25+
struct ggml_backend_buffer_type {
26+
struct ggml_backend_buffer_type_i iface;
27+
ggml_backend_buffer_type_context_t context;
28+
};
29+
30+
// buffer
1531
typedef void * ggml_backend_buffer_context_t;
1632

1733
struct ggml_backend_buffer_i {
18-
void (*free_buffer) (ggml_backend_buffer_t buffer);
19-
void * (*get_base) (ggml_backend_buffer_t buffer); // get base pointer
20-
size_t (*get_alloc_size)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-allocation callback
21-
void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // post-allocation callback
22-
void (*free_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-free callback
34+
void (*free_buffer)(ggml_backend_buffer_t buffer);
35+
//void (*reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
36+
void * (*get_base) (ggml_backend_buffer_t buffer);
37+
void (*init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
38+
void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
39+
void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
40+
// (optional) copy tensor between different buffer-type, allow for single-copy tranfers
41+
void (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
42+
void (*cpy_tensor_to) (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
2343
};
2444

2545
struct ggml_backend_buffer {
26-
struct ggml_backend_buffer_i iface;
27-
28-
ggml_backend_t backend;
46+
struct ggml_backend_buffer_i iface;
47+
ggml_backend_buffer_type_t buft;
2948
ggml_backend_buffer_context_t context;
30-
3149
size_t size;
3250
};
3351

34-
GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(
35-
struct ggml_backend * backend,
52+
ggml_backend_buffer_t ggml_backend_buffer_init(
53+
ggml_backend_buffer_type_t buft,
3654
struct ggml_backend_buffer_i iface,
3755
ggml_backend_buffer_context_t context,
3856
size_t size);
3957

58+
4059
//
4160
// Backend
4261
//
@@ -49,20 +68,17 @@ extern "C" {
4968
void (*free)(ggml_backend_t backend);
5069

5170
// buffer allocation
52-
ggml_backend_buffer_t (*alloc_buffer)(ggml_backend_t backend, size_t size);
71+
ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend);
5372

54-
// get buffer alignment
55-
size_t (*get_alignment)(ggml_backend_t backend);
56-
57-
// tensor data access
58-
// these functions can be asynchronous, helper functions are provided for synchronous access that automatically call synchronize
73+
// (optional) asynchroneous tensor data access
5974
void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
6075
void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
61-
void (*synchronize) (ggml_backend_t backend);
6276

63-
// (optional) copy tensor between different backends, allow for single-copy tranfers
64-
void (*cpy_tensor_from)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
65-
void (*cpy_tensor_to) (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
77+
// (optional) asynchroneous tensor copy
78+
void (*cpy_tensor_from_async)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
79+
void (*cpy_tensor_to_async) (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
80+
81+
void (*synchronize) (ggml_backend_t backend);
6682

6783
// compute graph with a plan
6884
ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
@@ -82,6 +98,15 @@ extern "C" {
8298
ggml_backend_context_t context;
8399
};
84100

101+
102+
//
103+
// Backend registry
104+
//
105+
106+
typedef ggml_backend_t (*ggml_backend_init_fn)(const char * params, void * user_data);
107+
108+
void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);
109+
85110
#ifdef __cplusplus
86111
}
87112
#endif

0 commit comments

Comments
 (0)