|
36 | 36 | #include "ggml.h"
|
37 | 37 | #include "ggml-backend-impl.h"
|
38 | 38 |
|
39 |
| -#define GGML_COMMON_IMPL_SYCL |
40 |
| -#include "ggml-common.h" |
41 |
| - |
42 | 39 | /*
|
43 | 40 | Following definition copied from DPCT head files, which are used by ggml-sycl.cpp
|
44 | 41 | */
|
@@ -3147,6 +3144,7 @@ namespace dpct
|
3147 | 3144 |
|
3148 | 3145 | } // COPY from DPCT head files
|
3149 | 3146 |
|
| 3147 | +#define GGML_COMMON_DECL_SYCL |
3150 | 3148 | #define GGML_COMMON_IMPL_SYCL
|
3151 | 3149 | #include "ggml-common.h"
|
3152 | 3150 |
|
@@ -3315,66 +3313,6 @@ typedef void (*ggml_sycl_op_flatten_t)(const ggml_tensor *src0,
|
3315 | 3313 | const float *src1_dd, float *dst_dd,
|
3316 | 3314 | const dpct::queue_ptr &main_stream);
|
3317 | 3315 |
|
3318 |
| -// QK = number of values after dequantization |
3319 |
| -// QR = QK / number of values before dequantization |
3320 |
| -// QI = number of 32 bit integers before dequantization |
3321 |
| - |
3322 |
| -#define QK4_0 32 |
3323 |
| -#define QR4_0 2 |
3324 |
| -#define QI4_0 (QK4_0 / (4 * QR4_0)) |
3325 |
| -typedef struct dpct_type_block_q4_0 { |
3326 |
| - sycl::half d; // delta |
3327 |
| - uint8_t qs[QK4_0 / 2]; // nibbles / quants |
3328 |
| -} block_q4_0; |
3329 |
| -static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding"); |
3330 |
| - |
3331 |
| -#define QK4_1 32 |
3332 |
| -#define QR4_1 2 |
3333 |
| -#define QI4_1 (QK4_1 / (4 * QR4_1)) |
3334 |
| -typedef struct dpct_type_block_q4_1 { |
3335 |
| - sycl::half2 dm; // dm.x = delta, dm.y = min |
3336 |
| - uint8_t qs[QK4_1 / 2]; // nibbles / quants |
3337 |
| -} block_q4_1; |
3338 |
| -static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding"); |
3339 |
| - |
3340 |
| -#define QK5_0 32 |
3341 |
| -#define QR5_0 2 |
3342 |
| -#define QI5_0 (QK5_0 / (4 * QR5_0)) |
3343 |
| -typedef struct dpct_type_block_q5_0 { |
3344 |
| - sycl::half d; // delta |
3345 |
| - uint8_t qh[4]; // 5-th bit of quants |
3346 |
| - uint8_t qs[QK5_0 / 2]; // nibbles / quants |
3347 |
| -} block_q5_0; |
3348 |
| -static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding"); |
3349 |
| - |
3350 |
| -#define QK5_1 32 |
3351 |
| -#define QR5_1 2 |
3352 |
| -#define QI5_1 (QK5_1 / (4 * QR5_1)) |
3353 |
| -typedef struct dpct_type_block_q5_1 { |
3354 |
| - sycl::half2 dm; // dm.x = delta, dm.y = min |
3355 |
| - uint8_t qh[4]; // 5-th bit of quants |
3356 |
| - uint8_t qs[QK5_1 / 2]; // nibbles / quants |
3357 |
| -} block_q5_1; |
3358 |
| -static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding"); |
3359 |
| - |
3360 |
| -#define QK8_0 32 |
3361 |
| -#define QR8_0 1 |
3362 |
| -#define QI8_0 (QK8_0 / (4 * QR8_0)) |
3363 |
| -typedef struct dpct_type_block_q8_0 { |
3364 |
| - sycl::half d; // delta |
3365 |
| - int8_t qs[QK8_0]; // quants |
3366 |
| -} block_q8_0; |
3367 |
| -static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding"); |
3368 |
| - |
3369 |
| -#define QK8_1 32 |
3370 |
| -#define QR8_1 1 |
3371 |
| -#define QI8_1 (QK8_1 / (4 * QR8_1)) |
3372 |
| -typedef struct dpct_type_block_q8_1 { |
3373 |
| - sycl::half2 ds; // ds.x = delta, ds.y = sum |
3374 |
| - int8_t qs[QK8_0]; // quants |
3375 |
| -} block_q8_1; |
3376 |
| -static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding"); |
3377 |
| - |
3378 | 3316 | typedef float (*vec_dot_q_sycl_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs);
|
3379 | 3317 | typedef void (*allocate_tiles_sycl_t)(int **x_ql, sycl::half2 **x_dm,
|
3380 | 3318 | int **x_qh, int **x_sc);
|
@@ -3410,93 +3348,6 @@ typedef struct dpct_type_block_q2_K {
|
3410 | 3348 | } block_q2_K;
|
3411 | 3349 | static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
|
3412 | 3350 |
|
3413 |
| -#define QR3_K 4 |
3414 |
| -#define QI3_K (QK_K / (4*QR3_K)) |
3415 |
| -typedef struct dpct_type_block_q3_K { |
3416 |
| - uint8_t hmask[QK_K/8]; // quants - high bit |
3417 |
| - uint8_t qs[QK_K/4]; // quants - low 2 bits |
3418 |
| -#ifdef GGML_QKK_64 |
3419 |
| - uint8_t scales[2]; // scales, quantized with 8 bits |
3420 |
| -#else |
3421 |
| - uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits |
3422 |
| -#endif |
3423 |
| - sycl::half d; // super-block scale |
3424 |
| -} block_q3_K; |
3425 |
| -//static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + K_SCALE_SIZE, "wrong q3_K block size/padding"); |
3426 |
| - |
3427 |
| -#define QR4_K 2 |
3428 |
| -#define QI4_K (QK_K / (4*QR4_K)) |
3429 |
| -#ifdef GGML_QKK_64 |
3430 |
| -typedef struct { |
3431 |
| - sycl::half dm[2]; // super-block scales/mins |
3432 |
| - uint8_t scales[2]; // 4-bit block scales/mins |
3433 |
| - uint8_t qs[QK_K/2]; // 4--bit quants |
3434 |
| -} block_q4_K; |
3435 |
| -static_assert(sizeof(block_q4_K) == sizeof(sycl::half2) + QK_K/2 + 2, "wrong q4_K block size/padding"); |
3436 |
| -#else |
3437 |
| -typedef struct dpct_type_block_q4_K { |
3438 |
| - sycl::half2 dm; // super-block scale for quantized scales/mins |
3439 |
| - uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits |
3440 |
| - uint8_t qs[QK_K/2]; // 4--bit quants |
3441 |
| -} block_q4_K; |
3442 |
| -static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding"); |
3443 |
| -#endif |
3444 |
| - |
3445 |
| -#define QR5_K 2 |
3446 |
| -#define QI5_K (QK_K / (4*QR5_K)) |
3447 |
| -#ifdef GGML_QKK_64 |
3448 |
| -typedef struct { |
3449 |
| - sycl::half d; // super-block scale |
3450 |
| - int8_t scales[QK_K/16]; // block scales |
3451 |
| - uint8_t qh[QK_K/8]; // quants, high bit |
3452 |
| - uint8_t qs[QK_K/2]; // quants, low 4 bits |
3453 |
| -} block_q5_K; |
3454 |
| -static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding"); |
3455 |
| -#else |
3456 |
| -typedef struct dpct_type_block_q5_K { |
3457 |
| - sycl::half2 dm; // super-block scale for quantized scales/mins |
3458 |
| - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits |
3459 |
| - uint8_t qh[QK_K/8]; // quants, high bit |
3460 |
| - uint8_t qs[QK_K/2]; // quants, low 4 bits |
3461 |
| -} block_q5_K; |
3462 |
| -static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding"); |
3463 |
| -#endif |
3464 |
| - |
3465 |
| -#define QR6_K 2 |
3466 |
| -#define QI6_K (QK_K / (4*QR6_K)) |
3467 |
| -typedef struct dpct_type_block_q6_K { |
3468 |
| - uint8_t ql[QK_K/2]; // quants, lower 4 bits |
3469 |
| - uint8_t qh[QK_K/4]; // quants, upper 2 bits |
3470 |
| - int8_t scales[QK_K/16]; // scales |
3471 |
| - sycl::half d; // delta |
3472 |
| -} block_q6_K; |
3473 |
| -static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding"); |
3474 |
| - |
3475 |
| -#define QR2_XXS 8 |
3476 |
| -#define QI2_XXS (QK_K / (4*QR2_XXS)) |
3477 |
| -typedef struct dpct_type_block_iq2_xxs { |
3478 |
| - sycl::half d; |
3479 |
| - uint16_t qs[QK_K/8]; |
3480 |
| -} block_iq2_xxs; |
3481 |
| -static_assert(sizeof(block_iq2_xxs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t), "wrong iq2_xxs block size/padding"); |
3482 |
| - |
3483 |
| -#define QR2_XS 8 |
3484 |
| -#define QI2_XS (QK_K / (4*QR2_XS)) |
3485 |
| -typedef struct dpct_type_block_iq2_xs { |
3486 |
| - sycl::half d; |
3487 |
| - uint16_t qs[QK_K/8]; |
3488 |
| - uint8_t scales[QK_K/32]; |
3489 |
| -} block_iq2_xs; |
3490 |
| -static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding"); |
3491 |
| - |
3492 |
| -#define QR3_XXS 8 |
3493 |
| -#define QI3_XXS (QK_K / (4*QR3_XXS)) |
3494 |
| -typedef struct dpct_type_block_iq3_xxs { |
3495 |
| - sycl::half d; |
3496 |
| - uint8_t qs[3*(QK_K/8)]; |
3497 |
| -} block_iq3_xxs; |
3498 |
| -static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding"); |
3499 |
| - |
3500 | 3351 | #define WARP_SIZE 32
|
3501 | 3352 | #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
|
3502 | 3353 |
|
|
0 commit comments