Commit 58970a4

Leverage mmap for offloading tensors to GPU (#1597)
* Rebase to latest
* Show progress
* Add assert to make sure we only allocate temp buffer for non-CPU backend tensor

Co-authored-by: Johannes Gäßler <[email protected]>
1 parent 8c0a10e commit 58970a4

File tree

5 files changed: +55, -114 lines
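
The gist of the change: instead of each GPU backend reopening the model file and fseek/fread-ing every tensor into a temporary host buffer, the loader now hands the backend a host pointer into the already-mmap'd model file (or into a temporary buffer when mmap is disabled), and the backend simply copies from that pointer to the device. A minimal standalone sketch of that pattern, assuming POSIX mmap and the CUDA runtime; the file name, offset and size below are hypothetical placeholders, not values from the commit:

// Sketch only: copy one tensor's bytes straight from an mmap'd model file to the GPU,
// replacing the old fseek/fread-into-malloc'd-buffer path.
#include <cstdio>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
#include <cuda_runtime.h>

int main() {
    const char * fname        = "model.bin"; // hypothetical model file
    const size_t tensor_off   = 0;           // hypothetical byte offset of the tensor in the file
    const size_t tensor_size  = 4096;        // hypothetical tensor size in bytes

    int fd = open(fname, O_RDONLY);
    if (fd < 0) { perror("open"); return 1; }

    struct stat st;
    if (fstat(fd, &st) != 0) { perror("fstat"); return 1; }

    // map the whole file read-only; pages are faulted in on demand
    void * addr = mmap(NULL, (size_t) st.st_size, PROT_READ, MAP_SHARED, fd, 0);
    if (addr == MAP_FAILED) { perror("mmap"); return 1; }

    // device buffer for this tensor
    void * d_buf = nullptr;
    cudaMalloc(&d_buf, tensor_size);

    // no intermediate malloc/fread: the source pointer is just an offset into the mapping
    cudaMemcpy(d_buf, (const char *) addr + tensor_off, tensor_size, cudaMemcpyHostToDevice);

    cudaFree(d_buf);
    munmap(addr, (size_t) st.st_size);
    close(fd);
    return 0;
}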

ggml-cuda.cu  (+3, -20)

@@ -1713,8 +1713,7 @@ void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
     (void) dst;
 }
 
-void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensor, const size_t offset) {
-    FILE * fp = fopen(fname, "rb");
+void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
     int nrows = ggml_nrows(tensor);
     const size_t nb1 = tensor->nb[1];
     ggml_backend backend = tensor->backend;
@@ -1748,35 +1747,19 @@ void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensor, const
 
         int64_t nrows_split = row_high - row_low;
 
-        const size_t offset_split = offset + row_low*nb1;
+        const size_t offset_split = row_low*nb1;
         const size_t size = ggml_nbytes_split(tensor, nrows_split);
 
         void * buf;
         CUDA_CHECK(cudaMalloc(&buf, size));
-        void * buf_host = malloc(size);
-
-#ifdef _WIN32
-        int ret = _fseeki64(fp, (__int64) offset_split, SEEK_SET);
-#else
-        int ret = fseek(fp, (long) offset_split, SEEK_SET);
-#endif
-        GGML_ASSERT(ret == 0); // same
-
-        size_t ret2 = fread(buf_host, size, 1, fp);
-        if (ret2 != 1) {
-            fprintf(stderr, "unexpectedly reached end of file");
-            exit(1);
-        }
+        void * buf_host = (char*)data + offset_split;
 
         cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);
-        cudaDeviceSynchronize();
 
-        free(buf_host);
         extra->data_device[id] = buf;
     }
 
     tensor->extra = extra;
-    fclose(fp);
 }
 
 void ggml_cuda_free_data(struct ggml_tensor * tensor) {
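
In the split-tensor path above, the only bookkeeping left is pointer arithmetic: the slice of rows assigned to each device now starts at data + row_low*nb1 instead of at a file offset. A tiny illustration with made-up numbers (row_low, row_high and nb1 are hypothetical; in the real code ggml_nbytes_split computes the exact size for quantized types):

// Sketch: where one device's slice of a row-split tensor begins and how large it is.
#include <cstddef>
#include <cstdio>

int main() {
    const size_t nb1      = 4096; // bytes per row (tensor->nb[1])
    const size_t row_low  = 1024; // first row assigned to this device
    const size_t row_high = 2048; // one past the last row for this device

    const size_t offset_split = row_low * nb1;               // byte offset into the host data
    const size_t size         = (row_high - row_low) * nb1;  // rough slice size (rows * row stride)

    std::printf("cudaMemcpy source = (char*)data + %zu, %zu bytes\n", offset_split, size);
    return 0;
}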

ggml-cuda.h  (+2, -1)

@@ -24,7 +24,8 @@ void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
 void * ggml_cuda_host_malloc(size_t size);
 void ggml_cuda_host_free(void * ptr);
 
-void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensors, size_t offset);
+void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
+
 void ggml_cuda_free_data(struct ggml_tensor * tensor);
 void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
 void ggml_cuda_set_main_device(int main_device);

ggml-opencl.cpp  (+3, -32)

@@ -1167,7 +1167,7 @@ size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct g
     return 0;
 }
 
-void ggml_cl_transform_tensor(ggml_tensor * tensor) {
+void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
     const int64_t ne0 = tensor->ne[0];
     const int64_t ne1 = tensor->ne[1];
     const int64_t ne2 = tensor->ne[2];
@@ -1179,6 +1179,7 @@ void ggml_cl_transform_tensor(ggml_tensor * tensor) {
     size_t q_size;
     cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size);
 
+    tensor->data = data;
     // copy tensor to device
     for (int64_t i3 = 0; i3 < ne3; i3++) {
         for (int64_t i2 = 0; i2 < ne2; i2++) {
@@ -1190,35 +1191,5 @@ void ggml_cl_transform_tensor(ggml_tensor * tensor) {
     CL_CHECK(clFinish(queue));
 
     tensor->data = dst;
-    tensor->backend = GGML_BACKEND_GPU;
-}
-
-void ggml_cl_load_data(const char * fname, struct ggml_tensor * tensor, const size_t offset) {
-    cl_int err;
-    FILE * fp = fopen(fname, "rb");
-
-    const size_t size = ggml_nbytes(tensor);
-
-    cl_mem dst;
-    CL_CHECK((dst = clCreateBuffer(context, CL_MEM_READ_ONLY, size, nullptr, &err), err));
-    void * buf_host = malloc(size);
-
-#ifdef _WIN32
-    int ret = _fseeki64(fp, (__int64) offset, SEEK_SET);
-#else
-    int ret = fseek(fp, (long) offset, SEEK_SET);
-#endif
-    GGML_ASSERT(ret == 0); // same
-
-    size_t ret2 = fread(buf_host, size, 1, fp);
-    if (ret2 != 1) {
-        fprintf(stderr, "unexpectedly reached end of file");
-        exit(1);
-    }
-
-    clEnqueueWriteBuffer(queue, dst, CL_TRUE, 0, size, buf_host, 0, nullptr, nullptr);
-
-    tensor->data = dst;
-    free(buf_host);
-    fclose(fp);
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 }

ggml-opencl.h  (+1, -2)

@@ -18,8 +18,7 @@ void ggml_cl_host_free(void * ptr);
 
 void ggml_cl_free_data(const struct ggml_tensor* tensor);
 
-void ggml_cl_transform_tensor(struct ggml_tensor * tensor);
-void ggml_cl_load_data(const char * fname, struct ggml_tensor * tensor, size_t offset);
+void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor);
 
 #ifdef __cplusplus
 }
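
With both backend headers now exposing the same transform_tensor(data, tensor) shape, the loader can dispatch on the tensor's backend once the bytes are resident in host memory; the llama.cpp hunks below do exactly this inside load_all_data. A condensed sketch of that dispatch, written against the repository headers at this commit; the wrapper name offload_tensor and the use_mmap flag are illustrative, not part of the commit:

// Sketch of the per-tensor dispatch performed in load_all_data. `host_data`
// points either into the mmap'd model file or into a temporary malloc'd
// buffer (the non-mmap case), mirroring the commit's logic.
#include <cstdlib>
#include "ggml.h"
#if defined(GGML_USE_CUBLAS)
#include "ggml-cuda.h"
#elif defined(GGML_USE_CLBLAST)
#include "ggml-opencl.h"
#endif

static void offload_tensor(void * host_data, struct ggml_tensor * tensor, bool use_mmap) {
    switch (tensor->backend) {
        case GGML_BACKEND_CPU:
            tensor->data = host_data;                       // CPU tensors just point at the host bytes
            break;
#if defined(GGML_USE_CUBLAS)
        case GGML_BACKEND_GPU:
        case GGML_BACKEND_GPU_SPLIT:
            ggml_cuda_transform_tensor(host_data, tensor);  // copies the bytes to the device(s)
            if (!use_mmap) {
                free(host_data);                            // temp buffer is no longer needed
            }
            break;
#elif defined(GGML_USE_CLBLAST)
        case GGML_BACKEND_GPU:
            ggml_cl_transform_tensor(host_data, tensor);
            if (!use_mmap) {
                free(host_data);
            }
            break;
#endif
        default:
            break;
    }
}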

llama.cpp  (+46, -59)

@@ -707,6 +707,9 @@ struct llama_model_loader {
 
     struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) {
         struct ggml_tensor * tensor;
+        if (backend != GGML_BACKEND_CPU) {
+            ggml_set_no_alloc(ggml_ctx, true);
+        }
         if (lt.ne.size() == 2) {
             tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
         } else {
@@ -716,6 +719,9 @@ struct llama_model_loader {
         ggml_set_name(tensor, lt.name.c_str());
         LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
 
+        if (backend != GGML_BACKEND_CPU) {
+            ggml_set_no_alloc(ggml_ctx, use_mmap);
+        }
         tensor->backend = backend;
         lt.ggml_tensor = tensor;
         num_ggml_tensors_created++;
@@ -731,6 +737,7 @@ struct llama_model_loader {
     void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
         size_t data_size = 0;
         size_t prefetch_size = 0;
+        size_t lock_size = 0;
         for (const llama_load_tensor & lt : tensors_map.tensors) {
             data_size += lt.size;
             if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
@@ -740,32 +747,56 @@ struct llama_model_loader {
 
         if (use_mmap) {
             mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
-            if (!lmlock) {
-                // Don't call the callback since the actual loading will be lazy
-                // and we can't measure it.
-                progress_callback = NULL;
-            }
             if (lmlock) {
                 lmlock->init(mapping->addr);
             }
         }
 
         size_t done_size = 0;
         for (llama_load_tensor & lt : tensors_map.tensors) {
-            if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) {
-                continue;
-            }
             if (progress_callback) {
                 progress_callback((float) done_size / data_size, progress_callback_user_data);
            }
             LLAMA_ASSERT(lt.ggml_tensor); // unused tensors should have been caught by load_data already
             lt.data = (uint8_t *) lt.ggml_tensor->data;
+
+            // allocate temp buffer if not using mmap
+            if (!use_mmap && lt.data == NULL) {
+                GGML_ASSERT(lt.ggml_tensor->backend != GGML_BACKEND_CPU);
+                lt.data = (uint8_t*)malloc(ggml_nbytes(lt.ggml_tensor));
+            }
+
             load_data_for(lt);
-            lt.ggml_tensor->data = lt.data;
-            done_size += lt.size;
-            if (use_mmap && lmlock) {
-                lmlock->grow_to(done_size);
+
+            switch(lt.ggml_tensor->backend) {
+                case GGML_BACKEND_CPU:
+                    lt.ggml_tensor->data = lt.data;
+                    if (use_mmap && lmlock) {
+                        lock_size += lt.size;
+                        lmlock->grow_to(lock_size);
+                    }
+                    break;
+#if defined(GGML_USE_CUBLAS)
+                case GGML_BACKEND_GPU:
+                case GGML_BACKEND_GPU_SPLIT:
+                    ggml_cuda_transform_tensor(lt.data, lt.ggml_tensor);
+                    if (!use_mmap) {
+                        free(lt.data);
+                    }
+                    break;
+#elif defined(GGML_USE_CLBLAST)
+                case GGML_BACKEND_GPU:
+                    ggml_cl_transform_tensor(lt.data, lt.ggml_tensor);
+                    if (!use_mmap) {
+                        free(lt.data);
+                    }
+                    break;
+#endif
+                default:
+                    continue;
             }
+
+            done_size += lt.size;
         }
     }
 
@@ -1141,7 +1172,7 @@ static void llama_model_load_internal(
             if (backend == GGML_BACKEND_GPU) {
                 vram_weights +=
                     ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
-                    ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.attention_norm) +
+                    ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
                     ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
             }
         }
@@ -1196,58 +1227,14 @@ static void llama_model_load_internal(
         model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
     }
 
-    ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
-
 #if defined(GGML_USE_CUBLAS)
     {
         ggml_cuda_set_tensor_split(tensor_split);
-
-        size_t done_size = 0;
-        size_t data_size = 0;
-        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-            data_size += lt.size;
-            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
-                done_size += lt.size;
-            }
-        }
-        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-            ggml_backend backend = lt.ggml_tensor->backend;
-            if (backend != GGML_BACKEND_GPU && backend != GGML_BACKEND_GPU_SPLIT) {
-                continue;
-            }
-            if (progress_callback) {
-                progress_callback((float) done_size / data_size, progress_callback_user_data);
-            }
-            ggml_cuda_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
-            done_size += lt.size;
-        }
-    }
-#elif defined(GGML_USE_CLBLAST)
-    {
-        size_t done_size = 0;
-        size_t data_size = 0;
-        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-            data_size += lt.size;
-            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
-                done_size += lt.size;
-            }
-        }
-        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-            if (lt.ggml_tensor->backend != GGML_BACKEND_GPU) {
-                continue;
-            }
-            if (progress_callback) {
-                progress_callback((float) done_size / data_size, progress_callback_user_data);
-            }
-            ggml_cl_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
-            done_size += lt.size;
-        }
     }
-#else
-    (void) n_batch;
-    (void) tensor_split;
 #endif
 
+    ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
+
     if (progress_callback) {
         progress_callback(1.0f, progress_callback_user_data);
     }
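
One detail from the get_tensor_for hunk above worth spelling out: for tensors destined for a GPU backend, ggml is put into no_alloc mode before the tensor is created, so only metadata is set up and no host buffer is reserved for weights that will live in VRAM; afterwards the flag is restored to use_mmap, since with mmap the CPU tensors also skip host allocation (their data points into the mapping). A small sketch of that pattern against the ggml API at this commit; the wrapper name and parameters are illustrative, not part of the commit:

// Sketch: create a 2D weight tensor whose data will be provided later
// (an mmap offset, a temp buffer, or a device copy) instead of being
// allocated inside the ggml context.
#include <cstdint>
#include "ggml.h"

static struct ggml_tensor * new_weight_tensor(struct ggml_context * ctx,
                                              enum ggml_type type,
                                              int64_t ne0, int64_t ne1,
                                              bool on_gpu, bool use_mmap) {
    if (on_gpu) {
        ggml_set_no_alloc(ctx, true);       // metadata only: no host buffer for GPU weights
    }
    struct ggml_tensor * t = ggml_new_tensor_2d(ctx, type, ne0, ne1);
    if (on_gpu) {
        ggml_set_no_alloc(ctx, use_mmap);   // restore: mmap'd CPU tensors also skip allocation
    }
    return t;
}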
