@@ -59,8 +59,8 @@ typedef float2 dfloat2;
#endif // GGML_CUDA_DMMV_F16

typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
- typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
- typedef void (*dot_kernel_k_t)(const void * vx, const int ib, const int iqs, const float * y, float & v);
+ typedef void (*to_fp32_cuda_t)(const void * __restrict__ x, float * __restrict__ y, int k, cudaStream_t stream);
+ typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
typedef void (*ggml_cuda_op_t)(
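For context on what these qualifier changes buy: a minimal toy kernel (not part of this commit or of ggml-cuda.cu) showing the non-aliasing guarantee that `__restrict__` gives the compiler, which lets it keep loaded values in registers across writes through the other pointer.

```cuda
// Toy kernel, for illustration only (not from ggml-cuda.cu).
// With __restrict__, the compiler may assume x and y never alias,
// so x[i] can be loaded once and kept in a register even though y is written.
static __global__ void scale_add(const float * __restrict__ x,
                                 float * __restrict__ y,
                                 const float alpha, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;
    if (i >= k) {
        return;
    }
    const float xi = x[i];   // single load, safe to cache in a register
    y[i] = alpha*xi + y[i];  // the store through y cannot invalidate xi
}
```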
@@ -131,7 +131,7 @@ typedef struct {
} block_q8_1;
static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding");

- typedef float (*vec_dot_q_cuda_t)(const void * vbq, const block_q8_1 * bq8_1, const int iqs);
+ typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs);

// ================================= k-quants
@@ -407,7 +407,7 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in

// ================================== k-quants

- static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {
+ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float * __restrict__ yy) {

const int i = blockIdx.x;
const block_q2_K * x = (const block_q2_K *) vx;
@@ -440,7 +440,7 @@ static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {

}

- static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {
+ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, float * __restrict__ yy) {

const int i = blockIdx.x;
const block_q3_K * x = (const block_q3_K *) vx;
@@ -504,7 +504,7 @@ static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t
}
#endif

- static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
+ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float * __restrict__ yy) {
const block_q4_K * x = (const block_q4_K *) vx;

const int i = blockIdx.x;
@@ -544,7 +544,7 @@ static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
#endif
}

- static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
+ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float * __restrict__ yy) {
const block_q5_K * x = (const block_q5_K *) vx;

const int i = blockIdx.x;
@@ -590,7 +590,7 @@ static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
#endif
}

- static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
+ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, float * __restrict__ yy) {
const block_q6_K * x = (const block_q6_K *) vx;

const int i = blockIdx.x;
@@ -634,7 +634,7 @@ static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
#endif
}

- static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
+ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {

static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
@@ -742,7 +742,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float
}
}

- static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
+ static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {

const int row = blockIdx.y*blockDim.y + threadIdx.y;
if (row > nrows) return;
@@ -846,7 +846,7 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float
}
}

- static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
+ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {

const int row = blockIdx.y*blockDim.y + threadIdx.y;
if (row > nrows) return;
@@ -949,7 +949,7 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float
}
}

- static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float * yy, float * dst, const int ncols) {
+ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols) {

const int row = blockIdx.x;
const int num_blocks_per_row = ncols / QK_K;
@@ -1053,7 +1053,7 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float
}
}

- static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
+ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {

static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
@@ -1171,7 +1171,7 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
v.y = x[ib + iqs + 1];
}

- static __global__ void quantize_q8_1(const float * x, void * vy, const int k) {
+ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int k) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;

if (i >= k) {
@@ -1207,7 +1207,7 @@ static __global__ void quantize_q8_1(const float * x, void * vy, const int k) {
}

template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
- static __global__ void dequantize_block(const void * vx, float * y, const int k) {
+ static __global__ void dequantize_block(const void * __restrict__ vx, float * __restrict__ y, const int k) {
const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;

if (i >= k) {
@@ -1227,7 +1227,7 @@ static __global__ void dequantize_block(const void * vx, float * y, const int k)
y[iybs + iqs + y_offset] = v.y;
}

- static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
+ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
@@ -1252,7 +1252,7 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * vbq, cons
#endif // __CUDA_ARCH__ >= 600
}

- static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
+ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
@@ -1277,7 +1277,7 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * vbq, cons
#endif // __CUDA_ARCH__ >= 600
}

- static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
+ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
@@ -1312,7 +1312,7 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * vbq, cons
#endif // __CUDA_ARCH__ >= 600
}

- static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
+ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
@@ -1346,7 +1346,7 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * vbq, cons
#endif // __CUDA_ARCH__ >= 600
}

- static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
+ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
@@ -1366,7 +1366,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * vbq, cons
}

template <int qk, int qi, typename block_q_t, vec_dot_q_cuda_t vec_dot_q_cuda>
- static __global__ void mul_mat_vec_q(const void * vx, const void * vy, float * dst, const int ncols, const int nrows) {
+ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
const int row = blockIdx.y*blockDim.y + threadIdx.y;

if (row >= nrows) {
@@ -1404,7 +1404,7 @@ static __global__ void mul_mat_vec_q(const void * vx, const void * vy, float * d
}

template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
- static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows) {
+ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
// qk = quantized weights per x block
// qr = number of quantized weights per data value in x block
const int row = blockIdx.y*blockDim.y + threadIdx.y;
@@ -1471,7 +1471,7 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y,
}
}

- static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
+ static __global__ void mul_mat_p021_f16_f32(const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
const half * x = (const half *) vx;

const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
@@ -1518,7 +1518,7 @@ static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, fl
}

static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
- const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
+ const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x,
const int row_stride_x, const int channel_stride_x) {

const half * x = (const half *) vx;
@@ -2355,10 +2355,7 @@ inline void ggml_cuda_op_mul_mat_vec(
src0->type == GGML_TYPE_Q5_1 ||
src0->type == GGML_TYPE_Q8_0;

- // The integer intrinsics used in mul_mat_vec_q are available with compute capability 6.
- // However, they have bad performance with Pascal cards.
- // Therefore, in a multi GPU setting decide at runtime which GPUs should use mul_mat_vec_q.
- const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= 700 && mul_mat_vec_q_implemented;
+ const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= 600 && mul_mat_vec_q_implemented;
#endif

if (use_mul_mat_vec_q) {
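For reference, a hedged sketch of the runtime gate the last hunk relies on. The diff itself does not show how `g_compute_capabilities` is filled; the encoding below (`major*100 + minor*10`, so `>= 600` means compute capability 6.0 or newer, i.e. Pascal and later, where the removed comment says the integer intrinsics used by `mul_mat_vec_q` become available) is an assumption for illustration.

```cuda
// Sketch only: one plausible way a per-device compute-capability table
// like g_compute_capabilities could be populated (assumed encoding:
// 100*major + 10*minor, e.g. 610 for a CC 6.1 card).
#include <cuda_runtime.h>

static int g_compute_capabilities[16];  // hypothetical fixed-size table

static void init_compute_capabilities(int device_count) {
    for (int id = 0; id < device_count; ++id) {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, id);
        g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
    }
}
```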