@@ -707,6 +707,9 @@ struct llama_model_loader {
 
     struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) {
         struct ggml_tensor * tensor;
+        if (backend != GGML_BACKEND_CPU) {
+            ggml_set_no_alloc(ggml_ctx, true);
+        }
         if (lt.ne.size() == 2) {
             tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
         } else {
@@ -716,6 +719,9 @@ struct llama_model_loader {
         ggml_set_name(tensor, lt.name.c_str());
         LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
 
+        if (backend != GGML_BACKEND_CPU) {
+            ggml_set_no_alloc(ggml_ctx, use_mmap);
+        }
         tensor->backend = backend;
         lt.ggml_tensor = tensor;
         num_ggml_tensors_created++;
@@ -731,6 +737,7 @@ struct llama_model_loader {
     void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
         size_t data_size = 0;
         size_t prefetch_size = 0;
+        size_t lock_size = 0;
         for (const llama_load_tensor & lt : tensors_map.tensors) {
             data_size += lt.size;
             if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
@@ -740,32 +747,56 @@ struct llama_model_loader {
 
         if (use_mmap) {
             mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
-            if (!lmlock) {
-                // Don't call the callback since the actual loading will be lazy
-                // and we can't measure it.
-                progress_callback = NULL;
-            }
             if (lmlock) {
                 lmlock->init(mapping->addr);
             }
         }
 
         size_t done_size = 0;
         for (llama_load_tensor & lt : tensors_map.tensors) {
-            if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) {
-                continue;
-            }
             if (progress_callback) {
                 progress_callback((float) done_size / data_size, progress_callback_user_data);
             }
             LLAMA_ASSERT(lt.ggml_tensor); // unused tensors should have been caught by load_data already
             lt.data = (uint8_t *) lt.ggml_tensor->data;
+
+            // allocate temp buffer if not using mmap
+            if (!use_mmap && lt.data == NULL) {
+                GGML_ASSERT(lt.ggml_tensor->backend != GGML_BACKEND_CPU);
+                lt.data = (uint8_t *)malloc(ggml_nbytes(lt.ggml_tensor));
+            }
+
             load_data_for(lt);
-            lt.ggml_tensor->data = lt.data;
-            done_size += lt.size;
-            if (use_mmap && lmlock) {
-                lmlock->grow_to(done_size);
+
+            switch (lt.ggml_tensor->backend) {
+                case GGML_BACKEND_CPU:
+                    lt.ggml_tensor->data = lt.data;
+                    if (use_mmap && lmlock) {
+                        lock_size += lt.size;
+                        lmlock->grow_to(lock_size);
+                    }
+                    break;
+#if defined(GGML_USE_CUBLAS)
+                case GGML_BACKEND_GPU:
+                case GGML_BACKEND_GPU_SPLIT:
+                    ggml_cuda_transform_tensor(lt.data, lt.ggml_tensor);
+                    if (!use_mmap) {
+                        free(lt.data);
+                    }
+                    break;
+#elif defined(GGML_USE_CLBLAST)
+                case GGML_BACKEND_GPU:
+                    ggml_cl_transform_tensor(lt.data, lt.ggml_tensor);
+                    if (!use_mmap) {
+                        free(lt.data);
+                    }
+                    break;
+#endif
+                default:
+                    continue;
             }
+
+            done_size += lt.size;
         }
     }
 
@@ -1141,7 +1172,7 @@ static void llama_model_load_internal(
             if (backend == GGML_BACKEND_GPU) {
                 vram_weights +=
                     ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
-                    ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.attention_norm) +
+                    ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
                     ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
             }
         }
@@ -1196,58 +1227,14 @@ static void llama_model_load_internal(
         model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
     }
 
-    ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
-
 #if defined(GGML_USE_CUBLAS)
     {
         ggml_cuda_set_tensor_split(tensor_split);
-
-        size_t done_size = 0;
-        size_t data_size = 0;
-        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-            data_size += lt.size;
-            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
-                done_size += lt.size;
-            }
-        }
-        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-            ggml_backend backend = lt.ggml_tensor->backend;
-            if (backend != GGML_BACKEND_GPU && backend != GGML_BACKEND_GPU_SPLIT) {
-                continue;
-            }
-            if (progress_callback) {
-                progress_callback((float) done_size / data_size, progress_callback_user_data);
-            }
-            ggml_cuda_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
-            done_size += lt.size;
-        }
-    }
-#elif defined(GGML_USE_CLBLAST)
-    {
-        size_t done_size = 0;
-        size_t data_size = 0;
-        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-            data_size += lt.size;
-            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
-                done_size += lt.size;
-            }
-        }
-        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-            if (lt.ggml_tensor->backend != GGML_BACKEND_GPU) {
-                continue;
-            }
-            if (progress_callback) {
-                progress_callback((float) done_size / data_size, progress_callback_user_data);
-            }
-            ggml_cl_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
-            done_size += lt.size;
-        }
     }
-#else
-    (void) n_batch;
-    (void) tensor_split;
 #endif
 
+    ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
+
     if (progress_callback) {
         progress_callback(1.0f, progress_callback_user_data);
     }
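Note (illustrative, not part of the diff): the two ggml_set_no_alloc calls in get_tensor_for work because, while a ggml context has no_alloc enabled, ggml_new_tensor_* only creates the tensor metadata and leaves tensor->data NULL, so the loader can later point it at an mmap'd region, a temporary malloc buffer, or device memory. The following minimal standalone sketch of that pattern assumes ggml.h from this source tree is on the include path; the context size and tensor shape are arbitrary example values.

// no_alloc sketch: create a tensor with no backing data, then attach an external buffer
#include <stdlib.h>
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,  // arena for tensor metadata only
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // with no_alloc set, ggml records the tensor's type/shape but leaves t->data == NULL
    ggml_set_no_alloc(ctx, true);
    struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4096, 4096);
    ggml_set_no_alloc(ctx, false);

    // the caller supplies the backing buffer, as load_all_data() does above
    // (mmap'd file data, a temp malloc buffer, or a pointer managed by the GPU backend)
    t->data = malloc(ggml_nbytes(t));

    free(t->data);
    ggml_free(ctx);
    return 0;
}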