@@ -181,11 +181,11 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
    do {                                                                                \
        cudaError_t err_ = (err);                                                       \
        if (err_ != cudaSuccess) {                                                      \
-           int id;                                                                     \
-           cudaGetDevice(&id);                                                         \
+           int dev_id;                                                                 \
+           cudaGetDevice(&dev_id);                                                     \
            fprintf(stderr, "\nCUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
                cudaGetErrorString(err_));                                              \
-           fprintf(stderr, "current device: %d\n", id);                                \
+           fprintf(stderr, "current device: %d\n", dev_id);                            \
            exit(1);                                                                    \
        }                                                                               \
    } while (0)
@@ -195,11 +195,11 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
    do {                                                                        \
        cublasStatus_t err_ = (err);                                            \
        if (err_ != CUBLAS_STATUS_SUCCESS) {                                    \
-           int id;                                                             \
-           cudaGetDevice(&id);                                                 \
+           int dev_id;                                                         \
+           cudaGetDevice(&dev_id);                                             \
            fprintf(stderr, "\ncuBLAS error %d at %s:%d: %s\n",                 \
                    err_, __FILE__, __LINE__, cublasGetStatusString(err_));     \
-           fprintf(stderr, "current device: %d\n", id);                        \
+           fprintf(stderr, "current device: %d\n", dev_id);                    \
            exit(1);                                                            \
        }                                                                       \
    } while (0)
@@ -465,6 +465,7 @@ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUA

#define MAX_STREAMS 8
static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { nullptr };
+static cudaMemPool_t g_cudaMemPools[GGML_CUDA_MAX_DEVICES] = { nullptr };

struct ggml_tensor_extra_gpu {
    void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
@@ -5772,6 +5773,16 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
    return ptr;
}

+static void * ggml_cuda_pool_malloc_async(size_t size, size_t * actual_size, int id, cudaStream_t stream) {
+    if (g_cudaMemPools[id] == nullptr) {
+        return ggml_cuda_pool_malloc(size, actual_size);
+    }
+    void * ptr;
+    CUDA_CHECK(cudaMallocFromPoolAsync(&ptr, size, g_cudaMemPools[id], stream));
+    *actual_size = size;
+    return ptr;
+}
+
static void ggml_cuda_pool_free(void * ptr, size_t size) {
    scoped_spin_lock lock(g_cuda_pool_lock);
    int id;
@@ -5790,6 +5801,13 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
}

+static void ggml_cuda_pool_free_async(void * ptr, size_t actual_size, int id, cudaStream_t stream) {
+    if (g_cudaMemPools[id] == nullptr) {
+        return ggml_cuda_pool_free(ptr, actual_size);
+    }
+    CUDA_CHECK(cudaFreeAsync(ptr, stream));
+}
+
void ggml_init_cublas() {
    static bool initialized = false;
@@ -5844,6 +5862,13 @@ void ggml_init_cublas() {
        // create cublas handle
        CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id]));
        CUBLAS_CHECK(cublasSetMathMode(g_cublas_handles[id], CUBLAS_TF32_TENSOR_OP_MATH));
+
+       // configure memory pool
+       cudaError_t err = cudaDeviceGetMemPool(&g_cudaMemPools[id], id);
+       if (err == cudaSuccess) {
+           size_t threshold = UINT64_MAX;
+           CUDA_CHECK(cudaMemPoolSetAttribute(g_cudaMemPools[id], cudaMemPoolAttrReleaseThreshold, &threshold));
+       }
    }

    // configure logging to stdout
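
For reference, the stream-ordered allocation pattern these helpers wrap can be exercised on its own. The following sketch is illustrative only and not part of the patch; it assumes device 0, a CUDA 11.2+ runtime with memory-pool support, and applies the same release-threshold setting that ggml_init_cublas configures above.

// Minimal sketch (assumption-laden, not from this patch): configure a device
// memory pool and do a stream-ordered allocate/use/free round trip.
#include <cstdio>
#include <cstdint>
#include <cstdlib>
#include <cuda_runtime.h>

#define CHECK(call) do { cudaError_t e_ = (call); if (e_ != cudaSuccess) { \
    fprintf(stderr, "CUDA error %s at %s:%d\n", cudaGetErrorString(e_), __FILE__, __LINE__); exit(1); } } while (0)

int main() {
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // Keep freed blocks cached in the pool instead of returning them to the
    // driver, which is what the UINT64_MAX release threshold above achieves.
    cudaMemPool_t pool;
    CHECK(cudaDeviceGetMemPool(&pool, 0));
    uint64_t threshold = UINT64_MAX;
    CHECK(cudaMemPoolSetAttribute(pool, cudaMemPoolAttrReleaseThreshold, &threshold));

    // Stream-ordered allocation: the buffer is valid for work enqueued on
    // `stream` after this call; no device-wide synchronization is needed.
    void * buf = nullptr;
    const size_t size = 1 << 20;
    CHECK(cudaMallocFromPoolAsync(&buf, size, pool, stream));
    CHECK(cudaMemsetAsync(buf, 0, size, stream));

    // Stream-ordered free: the block returns to the pool only after the
    // preceding work on `stream` has completed.
    CHECK(cudaFreeAsync(buf, stream));

    CHECK(cudaStreamSynchronize(stream));
    CHECK(cudaStreamDestroy(stream));
    return 0;
}
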
@@ -6437,7 +6462,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
            const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
            GGML_ASSERT(to_fp16_cuda != nullptr);
            size_t ne = row_diff*ne00;
-           src0_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src0_as);
+           src0_as_f16 = (half *) ggml_cuda_pool_malloc_async(ne * sizeof(half), &src0_as, id, stream);
            to_fp16_cuda(src0_dd_i, src0_as_f16, ne, stream);
        }
        const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16;
@@ -6448,13 +6473,12 @@ inline void ggml_cuda_op_mul_mat_cublas(
            const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
            GGML_ASSERT(to_fp16_cuda != nullptr);
            size_t ne = src1_ncols*ne10;
-           src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src1_as);
+           src1_as_f16 = (half *) ggml_cuda_pool_malloc_async(ne * sizeof(half), &src1_as, id, stream);
            to_fp16_cuda(src1_ddf_i, src1_as_f16, ne, stream);
        }
        const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddq_i : src1_as_f16;
-
-       size_t dst_as = 0;
-       half * dst_f16 = (half *) ggml_cuda_pool_malloc(row_diff*src1_ncols * sizeof(half), &dst_as);
+       size_t dst_f16_as = 0;
+       half * dst_f16 = (half *) ggml_cuda_pool_malloc_async(row_diff*src1_ncols * sizeof(half), &dst_f16_as, id, stream);

        const half alpha_f16 = 1.0f;
        const half beta_f16 = 0.0f;
@@ -6472,14 +6496,15 @@ inline void ggml_cuda_op_mul_mat_cublas(
        const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
        to_fp32_cuda(dst_f16, dst_dd_i, row_diff*src1_ncols, stream);

-       ggml_cuda_pool_free(dst_f16, dst_as);
+       if (dst_f16_as != 0) {
+           ggml_cuda_pool_free_async(dst_f16, dst_f16_as, id, stream);
+       }

        if (src0_as != 0) {
-           ggml_cuda_pool_free(src0_as_f16, src0_as);
+           ggml_cuda_pool_free_async(src0_as_f16, src0_as, id, stream);
        }
-
        if (src1_as != 0) {
-           ggml_cuda_pool_free(src1_as_f16, src1_as);
+           ggml_cuda_pool_free_async(src1_as_f16, src1_as, id, stream);
        }
    }
    else {
@@ -6489,7 +6514,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
        if (src0->type != GGML_TYPE_F32) {
            const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
            GGML_ASSERT(to_fp32_cuda != nullptr);
-           src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); // NOLINT
+           src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc_async(row_diff*ne00 * sizeof(float), &src0_as, id, stream); // NOLINT
            to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream);
        }
        const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32;
@@ -6506,7 +6531,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
                    &beta, dst_dd_i, ldc));

        if (src0_as != 0) {
-           ggml_cuda_pool_free(src0_ddq_as_f32, src0_as);
+           ggml_cuda_pool_free_async(src0_ddq_as_f32, src0_as, id, stream);
        }
    }

@@ -6929,29 +6954,30 @@ static void ggml_cuda_op_mul_mat(
            src0_dd[id] = (char *) src0_extra->data_device[id];
        } else {
            const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
-           src0_dd[id] = (char *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_as[id]);
+           src0_dd[id] = (char *) ggml_cuda_pool_malloc_async(ggml_nbytes(src0), &src0_as[id], id, stream);
        }

        if (src1_on_device && src1_is_contiguous) {
            src1_ddf[id] = (float *) src1_extra->data_device[id];
        } else {
-           src1_ddf[id] = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf[id]);
+           src1_ddf[id] = (float *) ggml_cuda_pool_malloc_async(ggml_nbytes(src1), &src1_asf[id], id, stream);
        }

        if (convert_src1_to_q8_1) {
-           src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]);
+           const size_t size_dst_ddq = nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs;
+           src1_ddq[id] = (char *) ggml_cuda_pool_malloc_async(size_dst_ddq, &src1_asq[id], id, stream);

            if (src1_on_device && src1_is_contiguous) {
                quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream);
-               CUDA_CHECK(cudaGetLastError());
+               // CUDA_CHECK(cudaGetLastError());
            }
        }

        if (dst_on_device) {
            dst_dd[id] = (float *) dst_extra->data_device[id];
        } else {
            const size_t size_dst_ddf = split ? (row_high[id]-row_low[id])*ne1*sizeof(float) : ggml_nbytes(dst);
-           dst_dd[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_as[id]);
+           dst_dd[id] = (float *) ggml_cuda_pool_malloc_async(size_dst_ddf, &dst_as[id], id, stream);
        }
    }

@@ -7077,24 +7103,6 @@ static void ggml_cuda_op_mul_mat(
        }
    }

-   for (int64_t id = 0; id < g_device_count; ++id) {
-       CUDA_CHECK(ggml_cuda_set_device(id));
-
-       // free buffers again when done
-       if (src0_as[id] > 0) {
-           ggml_cuda_pool_free(src0_dd[id], src0_as[id]);
-       }
-       if (src1_asf[id] > 0) {
-           ggml_cuda_pool_free(src1_ddf[id], src1_asf[id]);
-       }
-       if (src1_asq[id] > 0) {
-           ggml_cuda_pool_free(src1_ddq[id], src1_asq[id]);
-       }
-       if (dst_as[id] > 0) {
-           ggml_cuda_pool_free(dst_dd[id], dst_as[id]);
-       }
-   }
-
    // main device waits for all other devices to be finished
    if (split && g_device_count > 1) {
        int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
@@ -7112,6 +7120,21 @@ static void ggml_cuda_op_mul_mat(
        CUDA_CHECK(ggml_cuda_set_device(g_main_device));
        CUDA_CHECK(cudaDeviceSynchronize());
    }
+
+   for (int64_t id = 0; id < g_device_count; ++id) {
+       if (src0_as[id] > 0) {
+           ggml_cuda_pool_free_async(src0_dd[id], src0_as[id], id, g_cudaStreams[id][0]);
+       }
+       if (src1_asf[id] > 0) {
+           ggml_cuda_pool_free_async(src1_ddf[id], src1_asf[id], id, g_cudaStreams[id][0]);
+       }
+       if (src1_asq[id] > 0) {
+           ggml_cuda_pool_free_async(src1_ddq[id], src1_asq[id], id, g_cudaStreams[id][0]);
+       }
+       if (dst_as[id] > 0) {
+           ggml_cuda_pool_free_async(dst_dd[id], dst_as[id], id, g_cudaStreams[id][0]);
+       }
+   }
}

static void ggml_cuda_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -7298,11 +7321,11 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
    GGML_ASSERT(to_fp16_cuda != nullptr);

    size_t src1_as = 0;
-   half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as);
+   half * src1_as_f16 = (half *) ggml_cuda_pool_malloc_async(ne1 * sizeof(half), &src1_as, id, main_stream);
    to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);

    size_t dst_as = 0;
-   half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
+   half * dst_f16 = (half *) ggml_cuda_pool_malloc_async(ne * sizeof(half), &dst_as, id, main_stream);

    GGML_ASSERT(ne12 % ne02 == 0);
    GGML_ASSERT(ne13 % ne03 == 0);
@@ -7349,10 +7372,9 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
    } else {
        // use cublasGemmBatchedEx
        const int ne23 = ne12*ne13;
-
-       void ** ptrs_as = nullptr;
+       // allocate device memory for pointers
        size_t ptrs_s = 0;
-       ptrs_as = (void **) ggml_cuda_pool_malloc(3*ne23*sizeof(void *), &ptrs_s);
+       void ** ptrs_as = (void **) ggml_cuda_pool_malloc_async(3*ne23*sizeof(void *), &ptrs_s, id, main_stream);

        dim3 block_dims(ne13, ne12);
        k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
@@ -7365,7 +7387,6 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
                dst->nb[2], dst->nb[3],
                r2, r3);
        CUDA_CHECK(cudaGetLastError());
-
        CUBLAS_CHECK(
        cublasGemmBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
                ne01, ne11, ne10,
@@ -7375,16 +7396,21 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
                ne23,
                CUBLAS_COMPUTE_16F,
                CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-
-       ggml_cuda_pool_free(ptrs_as, ptrs_s);
+       // free device memory for pointers
+       if (ptrs_s != 0) {
+           ggml_cuda_pool_free_async(ptrs_as, ptrs_s, id, main_stream);
+       }
    }
#endif

    const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
    to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
-
-   ggml_cuda_pool_free(src1_as_f16, src1_as);
-   ggml_cuda_pool_free(dst_f16, dst_as);
+   if (src1_as != 0) {
+       ggml_cuda_pool_free_async(src1_as_f16, src1_as, id, main_stream);
+   }
+   if (dst_as != 0) {
+       ggml_cuda_pool_free_async(dst_f16, dst_as, id, main_stream);
+   }
}

static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
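
A note on the fallback path: ggml_cuda_pool_malloc_async and ggml_cuda_pool_free_async revert to the legacy pool whenever g_cudaMemPools[id] is nullptr, which is what the err == cudaSuccess guard in ggml_init_cublas leaves in place on older drivers. A minimal sketch of how that capability could also be probed explicitly is below; ggml_cuda_query_mem_pool is a hypothetical helper name for illustration, not part of this patch.

// Hypothetical helper (illustration only): return the device's current memory
// pool, or nullptr when stream-ordered allocation is unavailable, matching the
// "nullptr means fall back to the legacy pool" convention used above.
#include <cuda_runtime.h>

static cudaMemPool_t ggml_cuda_query_mem_pool(int device) {
    int supported = 0;
    if (cudaDeviceGetAttribute(&supported, cudaDevAttrMemoryPoolsSupported, device) != cudaSuccess || !supported) {
        return nullptr; // older driver or GPU: caller should use cudaMalloc/cudaFree
    }
    cudaMemPool_t pool = nullptr;
    if (cudaDeviceGetMemPool(&pool, device) != cudaSuccess) {
        return nullptr;
    }
    return pool;
}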