@@ -433,6 +433,8 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define CUDA_MUL_BLOCK_SIZE 256
 #define CUDA_GELU_BLOCK_SIZE 256
 #define CUDA_SILU_BLOCK_SIZE 256
+#define CUDA_RELU_BLOCK_SIZE 256
+#define CUDA_SQR_BLOCK_SIZE 256
 #define CUDA_CPY_BLOCK_SIZE 32
 #define CUDA_SCALE_BLOCK_SIZE 256
 #define CUDA_CLAMP_BLOCK_SIZE 256
@@ -553,6 +555,24 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
     dst[i] = x[i] / (1.0f + expf(-x[i]));
 }
 
+static __global__ void relu_f32(const float * x, float * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = fmaxf(x[i], 0);
+}
+
+static __global__ void sqr_f32(const float * x, float * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = x[i] * x[i];
+}
+
 static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
@@ -4759,6 +4779,16 @@ static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_
     silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }
 
+static void relu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
+    relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
+static void sqr_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_SQR_BLOCK_SIZE - 1) / CUDA_SQR_BLOCK_SIZE;
+    sqr_f32<<<num_blocks, CUDA_SQR_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
 static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % WARP_SIZE == 0);
     if (ncols < 1024) {
@@ -6128,6 +6158,34 @@ inline void ggml_cuda_op_silu(
     (void) src1_dd;
 }
 
+inline void ggml_cuda_op_relu(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    relu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_sqr(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    sqr_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
 inline void ggml_cuda_op_norm(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -7160,6 +7218,14 @@ static void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, g
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu);
 }
 
+static void ggml_cuda_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_relu);
+}
+
+static void ggml_cuda_sqr(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sqr);
+}
+
 static void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm);
 }
@@ -7891,6 +7957,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
                 case GGML_UNARY_OP_SILU:
                     func = ggml_cuda_silu;
                     break;
+                case GGML_UNARY_OP_RELU:
+                    func = ggml_cuda_relu;
+                    break;
                 default:
                     return false;
             } break;
@@ -7909,6 +7978,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
         case GGML_OP_SCALE:
            func = ggml_cuda_scale;
            break;
+        case GGML_OP_SQR:
+           func = ggml_cuda_sqr;
+           break;
        case GGML_OP_CLAMP:
            if (!any_on_device) {
                return false;
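
Below the diff, for reference only: a minimal standalone harness that is not part of the commit. It launches ReLU and square kernels written exactly as relu_f32/sqr_f32 above, with the same 256-thread blocks and the same (k + block_size - 1) / block_size grid rounding used by relu_f32_cuda/sqr_f32_cuda, and checks the results against a CPU reference. The host-side setup, test sizes, and tolerance are illustrative assumptions; compile with nvcc.

// Standalone sketch, not part of the patch: exercise the new kernels on a
// small buffer and compare against a CPU reference.
#include <cmath>
#include <cstdio>
#include <vector>
#include <cuda_runtime.h>

#define BLOCK_SIZE 256 // same value as CUDA_RELU_BLOCK_SIZE / CUDA_SQR_BLOCK_SIZE

// kernels copied verbatim from the diff above
static __global__ void relu_f32(const float * x, float * dst, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;
    if (i >= k) {
        return;
    }
    dst[i] = fmaxf(x[i], 0);
}

static __global__ void sqr_f32(const float * x, float * dst, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;
    if (i >= k) {
        return;
    }
    dst[i] = x[i] * x[i];
}

int main() {
    const int k = 1000; // deliberately not a multiple of the block size
    std::vector<float> h_x(k), h_relu(k), h_sqr(k);
    for (int i = 0; i < k; ++i) {
        h_x[i] = 0.01f*i - 5.0f; // mix of negative and positive inputs
    }

    float * d_x;
    float * d_dst;
    cudaMalloc(&d_x,   k*sizeof(float));
    cudaMalloc(&d_dst, k*sizeof(float));
    cudaMemcpy(d_x, h_x.data(), k*sizeof(float), cudaMemcpyHostToDevice);

    // same ceil-division grid sizing as the *_f32_cuda launchers in the patch
    const int num_blocks = (k + BLOCK_SIZE - 1) / BLOCK_SIZE;

    relu_f32<<<num_blocks, BLOCK_SIZE>>>(d_x, d_dst, k);
    cudaMemcpy(h_relu.data(), d_dst, k*sizeof(float), cudaMemcpyDeviceToHost);

    sqr_f32<<<num_blocks, BLOCK_SIZE>>>(d_x, d_dst, k);
    cudaMemcpy(h_sqr.data(), d_dst, k*sizeof(float), cudaMemcpyDeviceToHost);

    // CPU reference check with a loose tolerance (illustrative only)
    int n_bad = 0;
    for (int i = 0; i < k; ++i) {
        n_bad += std::fabs(h_relu[i] - std::fmax(h_x[i], 0.0f)) > 1e-5f;
        n_bad += std::fabs(h_sqr[i]  - h_x[i]*h_x[i])           > 1e-5f;
    }
    printf("%s (%d mismatches)\n", n_bad == 0 ? "ok" : "FAILED", n_bad);

    cudaFree(d_x);
    cudaFree(d_dst);
    return n_bad == 0 ? 0 : 1;
}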