Skip to content

Commit c5fa4ed

Browse files
committed
sycl : reuse quantum blocks
1 parent 470e865 commit c5fa4ed

File tree

2 files changed

+11
-150
lines changed

2 files changed

+11
-150
lines changed

ggml-common.h

+10
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,16 @@ typedef half2 ggml_half2;
3737

3838
#define GGML_COMMON_AGGR data
3939

40+
#define GGML_COMMON_DECL
41+
#elif defined(GGML_COMMON_DECL_SYCL)
42+
#include <sycl/half_type.hpp>
43+
#include <cstdint>
44+
45+
typedef sycl::half ggml_half;
46+
typedef sycl::half2 ggml_half2;
47+
48+
#define GGML_COMMON_AGGR data
49+
4050
#define GGML_COMMON_DECL
4151
#endif
4252

ggml-sycl.cpp

+1-150
Original file line numberDiff line numberDiff line change
@@ -36,9 +36,6 @@
3636
#include "ggml.h"
3737
#include "ggml-backend-impl.h"
3838

39-
#define GGML_COMMON_IMPL_SYCL
40-
#include "ggml-common.h"
41-
4239
/*
4340
Following definition copied from DPCT head files, which are used by ggml-sycl.cpp
4441
*/
@@ -3147,6 +3144,7 @@ namespace dpct
31473144

31483145
} // COPY from DPCT head files
31493146

3147+
#define GGML_COMMON_DECL_SYCL
31503148
#define GGML_COMMON_IMPL_SYCL
31513149
#include "ggml-common.h"
31523150

@@ -3315,66 +3313,6 @@ typedef void (*ggml_sycl_op_flatten_t)(const ggml_tensor *src0,
33153313
const float *src1_dd, float *dst_dd,
33163314
const dpct::queue_ptr &main_stream);
33173315

3318-
// QK = number of values after dequantization
3319-
// QR = QK / number of values before dequantization
3320-
// QI = number of 32 bit integers before dequantization
3321-
3322-
#define QK4_0 32
3323-
#define QR4_0 2
3324-
#define QI4_0 (QK4_0 / (4 * QR4_0))
3325-
typedef struct dpct_type_block_q4_0 {
3326-
sycl::half d; // delta
3327-
uint8_t qs[QK4_0 / 2]; // nibbles / quants
3328-
} block_q4_0;
3329-
static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");
3330-
3331-
#define QK4_1 32
3332-
#define QR4_1 2
3333-
#define QI4_1 (QK4_1 / (4 * QR4_1))
3334-
typedef struct dpct_type_block_q4_1 {
3335-
sycl::half2 dm; // dm.x = delta, dm.y = min
3336-
uint8_t qs[QK4_1 / 2]; // nibbles / quants
3337-
} block_q4_1;
3338-
static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
3339-
3340-
#define QK5_0 32
3341-
#define QR5_0 2
3342-
#define QI5_0 (QK5_0 / (4 * QR5_0))
3343-
typedef struct dpct_type_block_q5_0 {
3344-
sycl::half d; // delta
3345-
uint8_t qh[4]; // 5-th bit of quants
3346-
uint8_t qs[QK5_0 / 2]; // nibbles / quants
3347-
} block_q5_0;
3348-
static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
3349-
3350-
#define QK5_1 32
3351-
#define QR5_1 2
3352-
#define QI5_1 (QK5_1 / (4 * QR5_1))
3353-
typedef struct dpct_type_block_q5_1 {
3354-
sycl::half2 dm; // dm.x = delta, dm.y = min
3355-
uint8_t qh[4]; // 5-th bit of quants
3356-
uint8_t qs[QK5_1 / 2]; // nibbles / quants
3357-
} block_q5_1;
3358-
static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
3359-
3360-
#define QK8_0 32
3361-
#define QR8_0 1
3362-
#define QI8_0 (QK8_0 / (4 * QR8_0))
3363-
typedef struct dpct_type_block_q8_0 {
3364-
sycl::half d; // delta
3365-
int8_t qs[QK8_0]; // quants
3366-
} block_q8_0;
3367-
static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
3368-
3369-
#define QK8_1 32
3370-
#define QR8_1 1
3371-
#define QI8_1 (QK8_1 / (4 * QR8_1))
3372-
typedef struct dpct_type_block_q8_1 {
3373-
sycl::half2 ds; // ds.x = delta, ds.y = sum
3374-
int8_t qs[QK8_0]; // quants
3375-
} block_q8_1;
3376-
static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding");
3377-
33783316
typedef float (*vec_dot_q_sycl_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs);
33793317
typedef void (*allocate_tiles_sycl_t)(int **x_ql, sycl::half2 **x_dm,
33803318
int **x_qh, int **x_sc);
@@ -3410,93 +3348,6 @@ typedef struct dpct_type_block_q2_K {
34103348
} block_q2_K;
34113349
static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
34123350

3413-
#define QR3_K 4
3414-
#define QI3_K (QK_K / (4*QR3_K))
3415-
typedef struct dpct_type_block_q3_K {
3416-
uint8_t hmask[QK_K/8]; // quants - high bit
3417-
uint8_t qs[QK_K/4]; // quants - low 2 bits
3418-
#ifdef GGML_QKK_64
3419-
uint8_t scales[2]; // scales, quantized with 8 bits
3420-
#else
3421-
uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
3422-
#endif
3423-
sycl::half d; // super-block scale
3424-
} block_q3_K;
3425-
//static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + K_SCALE_SIZE, "wrong q3_K block size/padding");
3426-
3427-
#define QR4_K 2
3428-
#define QI4_K (QK_K / (4*QR4_K))
3429-
#ifdef GGML_QKK_64
3430-
typedef struct {
3431-
sycl::half dm[2]; // super-block scales/mins
3432-
uint8_t scales[2]; // 4-bit block scales/mins
3433-
uint8_t qs[QK_K/2]; // 4--bit quants
3434-
} block_q4_K;
3435-
static_assert(sizeof(block_q4_K) == sizeof(sycl::half2) + QK_K/2 + 2, "wrong q4_K block size/padding");
3436-
#else
3437-
typedef struct dpct_type_block_q4_K {
3438-
sycl::half2 dm; // super-block scale for quantized scales/mins
3439-
uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
3440-
uint8_t qs[QK_K/2]; // 4--bit quants
3441-
} block_q4_K;
3442-
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
3443-
#endif
3444-
3445-
#define QR5_K 2
3446-
#define QI5_K (QK_K / (4*QR5_K))
3447-
#ifdef GGML_QKK_64
3448-
typedef struct {
3449-
sycl::half d; // super-block scale
3450-
int8_t scales[QK_K/16]; // block scales
3451-
uint8_t qh[QK_K/8]; // quants, high bit
3452-
uint8_t qs[QK_K/2]; // quants, low 4 bits
3453-
} block_q5_K;
3454-
static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
3455-
#else
3456-
typedef struct dpct_type_block_q5_K {
3457-
sycl::half2 dm; // super-block scale for quantized scales/mins
3458-
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
3459-
uint8_t qh[QK_K/8]; // quants, high bit
3460-
uint8_t qs[QK_K/2]; // quants, low 4 bits
3461-
} block_q5_K;
3462-
static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
3463-
#endif
3464-
3465-
#define QR6_K 2
3466-
#define QI6_K (QK_K / (4*QR6_K))
3467-
typedef struct dpct_type_block_q6_K {
3468-
uint8_t ql[QK_K/2]; // quants, lower 4 bits
3469-
uint8_t qh[QK_K/4]; // quants, upper 2 bits
3470-
int8_t scales[QK_K/16]; // scales
3471-
sycl::half d; // delta
3472-
} block_q6_K;
3473-
static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding");
3474-
3475-
#define QR2_XXS 8
3476-
#define QI2_XXS (QK_K / (4*QR2_XXS))
3477-
typedef struct dpct_type_block_iq2_xxs {
3478-
sycl::half d;
3479-
uint16_t qs[QK_K/8];
3480-
} block_iq2_xxs;
3481-
static_assert(sizeof(block_iq2_xxs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t), "wrong iq2_xxs block size/padding");
3482-
3483-
#define QR2_XS 8
3484-
#define QI2_XS (QK_K / (4*QR2_XS))
3485-
typedef struct dpct_type_block_iq2_xs {
3486-
sycl::half d;
3487-
uint16_t qs[QK_K/8];
3488-
uint8_t scales[QK_K/32];
3489-
} block_iq2_xs;
3490-
static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");
3491-
3492-
#define QR3_XXS 8
3493-
#define QI3_XXS (QK_K / (4*QR3_XXS))
3494-
typedef struct dpct_type_block_iq3_xxs {
3495-
sycl::half d;
3496-
uint8_t qs[3*(QK_K/8)];
3497-
} block_iq3_xxs;
3498-
static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
3499-
35003351
#define WARP_SIZE 32
35013352
#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
35023353

0 commit comments

Comments
 (0)