3
3
#if defined(GGML_COMMON_DECL_C)
4
4
#include < stdint.h>
5
5
6
- typedef uint16_t ggml_fp16_t ;
6
+ typedef uint16_t ggml_half;
7
+ typedef uint32_t ggml_half2;
8
+
9
+ #define GGML_COMMON_AGGR
7
10
8
11
#define GGML_COMMON_DECL
9
12
#elif defined(GGML_COMMON_DECL_METAL)
10
13
#include < metal_stdlib>
11
14
12
- typedef half ggml_fp16_t ;
15
+ typedef half ggml_half;
16
+ typedef half2 ggml_half2;
17
+
18
+ #define GGML_COMMON_AGGR
13
19
14
20
#define GGML_COMMON_DECL
15
21
#elif defined(GGML_COMMON_DECL_CUDA)
22
+ #include < cuda_fp16.h>
16
23
#include < cstdint>
17
24
18
- typedef half ggml_fp16_t ;
25
+ typedef half ggml_half;
26
+ typedef half2 ggml_half2;
27
+
28
+ #define GGML_COMMON_AGGR data
19
29
20
30
#define GGML_COMMON_DECL
21
31
#endif
@@ -40,60 +50,75 @@ typedef half ggml_fp16_t;
40
50
#define QI4_0 (QK4_0 / (4 * QR4_0))
41
51
#define QR4_0 2
42
52
typedef struct {
43
- ggml_fp16_t d; // delta
44
- uint8_t qs[QK4_0 / 2 ]; // nibbles / quants
53
+ ggml_half d; // delta
54
+ uint8_t qs[QK4_0 / 2 ]; // nibbles / quants
45
55
} block_q4_0;
46
- static_assert (sizeof (block_q4_0) == sizeof(ggml_fp16_t ) + QK4_0 / 2, "wrong q4_0 block size/padding");
56
+ static_assert (sizeof (block_q4_0) == sizeof(ggml_half ) + QK4_0 / 2, "wrong q4_0 block size/padding");
47
57
48
58
#define QK4_1 32
49
59
#define QI4_1 (QK4_1 / (4 * QR4_1))
50
60
#define QR4_1 2
51
61
typedef struct {
52
- ggml_fp16_t d; // delta
53
- ggml_fp16_t m; // min
54
- uint8_t qs[QK4_1 / 2 ]; // nibbles / quants
62
+ union {
63
+ struct {
64
+ ggml_half d; // delta
65
+ ggml_half m; // min
66
+ } GGML_COMMON_AGGR;
67
+ ggml_half2 dm;
68
+ };
69
+ uint8_t qs[QK4_1 / 2 ]; // nibbles / quants
55
70
} block_q4_1;
56
- static_assert (sizeof (block_q4_1) == 2 * sizeof(ggml_fp16_t ) + QK4_1 / 2, "wrong q4_1 block size/padding");
71
+ static_assert (sizeof (block_q4_1) == 2 * sizeof(ggml_half ) + QK4_1 / 2, "wrong q4_1 block size/padding");
57
72
58
73
#define QK5_0 32
59
74
#define QI5_0 (QK5_0 / (4 * QR5_0))
60
75
#define QR5_0 2
61
76
typedef struct {
62
- ggml_fp16_t d; // delta
77
+ ggml_half d; // delta
63
78
uint8_t qh[4 ]; // 5-th bit of quants
64
79
uint8_t qs[QK5_0 / 2 ]; // nibbles / quants
65
80
} block_q5_0;
66
- static_assert (sizeof (block_q5_0) == sizeof(ggml_fp16_t ) + sizeof(uint32_t ) + QK5_0 / 2, "wrong q5_0 block size/padding");
81
+ static_assert (sizeof (block_q5_0) == sizeof(ggml_half ) + sizeof(uint32_t ) + QK5_0 / 2, "wrong q5_0 block size/padding");
67
82
68
83
#define QK5_1 32
69
84
#define QI5_1 (QK5_1 / (4 * QR5_1))
70
85
#define QR5_1 2
71
86
typedef struct {
72
- ggml_fp16_t d; // delta
73
- ggml_fp16_t m; // min
87
+ union {
88
+ struct {
89
+ ggml_half d; // delta
90
+ ggml_half m; // min
91
+ } GGML_COMMON_AGGR;
92
+ ggml_half2 dm;
93
+ };
74
94
uint8_t qh[4 ]; // 5-th bit of quants
75
95
uint8_t qs[QK5_1 / 2 ]; // nibbles / quants
76
96
} block_q5_1;
77
- static_assert (sizeof (block_q5_1) == 2 * sizeof(ggml_fp16_t ) + sizeof(uint32_t ) + QK5_1 / 2, "wrong q5_1 block size/padding");
97
+ static_assert (sizeof (block_q5_1) == 2 * sizeof(ggml_half ) + sizeof(uint32_t ) + QK5_1 / 2, "wrong q5_1 block size/padding");
78
98
79
99
#define QK8_0 32
80
100
#define QI8_0 (QK8_0 / (4 * QR8_0))
81
101
#define QR8_0 1
82
102
typedef struct {
83
- ggml_fp16_t d; // delta
84
- int8_t qs[QK8_0]; // quants
103
+ ggml_half d; // delta
104
+ int8_t qs[QK8_0]; // quants
85
105
} block_q8_0;
86
- static_assert (sizeof (block_q8_0) == sizeof(ggml_fp16_t ) + QK8_0, "wrong q8_0 block size/padding");
106
+ static_assert (sizeof (block_q8_0) == sizeof(ggml_half ) + QK8_0, "wrong q8_0 block size/padding");
87
107
88
108
#define QK8_1 32
89
109
#define QI8_1 (QK8_1 / (4 * QR8_1))
90
110
#define QR8_1 1
91
111
typedef struct {
92
- float d; // delta
93
- float s; // d * sum(qs[i])
94
- int8_t qs[QK8_1]; // quants
112
+ union {
113
+ struct {
114
+ ggml_half xxxd; // delta
115
+ ggml_half xxxs; // d * sum(qs[i])
116
+ } GGML_COMMON_AGGR;
117
+ ggml_half2 ds;
118
+ };
119
+ int8_t qs[QK8_1]; // quants
95
120
} block_q8_1;
96
- static_assert (sizeof (block_q8_1) == 2*sizeof(float ) + QK8_1, "wrong q8_1 block size/padding");
121
+ static_assert (sizeof (block_q8_1) == 2*sizeof(ggml_half ) + QK8_1, "wrong q8_1 block size/padding");
97
122
98
123
//
99
124
// Super-block quantization structures
@@ -117,10 +142,15 @@ static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block s
117
142
typedef struct {
118
143
uint8_t scales[QK_K/16 ]; // scales and mins, quantized with 4 bits
119
144
uint8_t qs[QK_K/4 ]; // quants
120
- ggml_fp16_t d; // super-block scale for quantized scales
121
- ggml_fp16_t dmin; // super-block scale for quantized mins
145
+ union {
146
+ struct {
147
+ ggml_half d; // super-block scale for quantized scales
148
+ ggml_half dmin; // super-block scale for quantized mins
149
+ } GGML_COMMON_AGGR;
150
+ ggml_half2 dm;
151
+ };
122
152
} block_q2_K;
123
- static_assert (sizeof (block_q2_K) == 2*sizeof(ggml_fp16_t ) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
153
+ static_assert (sizeof (block_q2_K) == 2*sizeof(ggml_half ) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
124
154
125
155
// 3-bit quantization
126
156
// weight is represented as x = a * q
@@ -130,20 +160,20 @@ static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "w
130
160
#define QR3_K 4
131
161
#ifdef GGML_QKK_64
132
162
typedef struct {
133
- uint8_t hmask[QK_K/8 ]; // quants - high bit
134
- uint8_t qs[QK_K/4 ]; // quants - low 2 bits
163
+ uint8_t hmask[QK_K/8 ]; // quants - high bit
164
+ uint8_t qs[QK_K/4 ]; // quants - low 2 bits
135
165
uint8_t scales[2 ];
136
- ggml_fp16_t d; // super-block scale
166
+ ggml_half d; // super-block scale
137
167
} block_q3_K;
138
- static_assert (sizeof (block_q3_K) == sizeof(ggml_fp16_t ) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding");
168
+ static_assert (sizeof (block_q3_K) == sizeof(ggml_half ) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding");
139
169
#else
140
170
typedef struct {
141
- uint8_t hmask[QK_K/8 ]; // quants - high bit
142
- uint8_t qs[QK_K/4 ]; // quants - low 2 bits
143
- uint8_t scales[12 ]; // scales, quantized with 6 bits
144
- ggml_fp16_t d; // super-block scale
171
+ uint8_t hmask[QK_K/8 ]; // quants - high bit
172
+ uint8_t qs[QK_K/4 ]; // quants - low 2 bits
173
+ uint8_t scales[12 ]; // scales, quantized with 6 bits
174
+ ggml_half d; // super-block scale
145
175
} block_q3_K;
146
- static_assert (sizeof (block_q3_K) == sizeof(ggml_fp16_t ) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
176
+ static_assert (sizeof (block_q3_K) == sizeof(ggml_half ) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
147
177
#endif
148
178
149
179
// 4-bit quantization
@@ -154,19 +184,24 @@ static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 +
154
184
#define QR4_K 2
155
185
#ifdef GGML_QKK_64
156
186
typedef struct {
157
- ggml_fp16_t d[2 ]; // super-block scales/mins
158
- uint8_t scales[2 ]; // 4-bit block scales/mins
159
- uint8_t qs[QK_K/2 ]; // 4--bit quants
187
+ ggml_half d[2 ]; // super-block scales/mins
188
+ uint8_t scales[2 ]; // 4-bit block scales/mins
189
+ uint8_t qs[QK_K/2 ]; // 4--bit quants
160
190
} block_q4_K;
161
- static_assert (sizeof (block_q4_K) == 2*sizeof(ggml_fp16_t ) + QK_K/2 + 2, "wrong q4_K block size/padding");
191
+ static_assert (sizeof (block_q4_K) == 2*sizeof(ggml_half ) + QK_K/2 + 2, "wrong q4_K block size/padding");
162
192
#else
163
193
typedef struct {
164
- ggml_fp16_t d; // super-block scale for quantized scales
165
- ggml_fp16_t dmin; // super-block scale for quantized mins
194
+ union {
195
+ struct {
196
+ ggml_half d; // super-block scale for quantized scales
197
+ ggml_half dmin; // super-block scale for quantized mins
198
+ } GGML_COMMON_AGGR;
199
+ ggml_half2 dm;
200
+ };
166
201
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
167
- uint8_t qs[QK_K/2 ]; // 4--bit quants
202
+ uint8_t qs[QK_K/2 ]; // 4--bit quants
168
203
} block_q4_K;
169
- static_assert (sizeof (block_q4_K) == 2*sizeof(ggml_fp16_t ) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
204
+ static_assert (sizeof (block_q4_K) == 2*sizeof(ggml_half ) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
170
205
#endif
171
206
172
207
// 5-bit quantization
@@ -177,21 +212,26 @@ static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/
177
212
#define QR5_K 2
178
213
#ifdef GGML_QKK_64
179
214
typedef struct {
180
- ggml_fp16_t d; // super-block scale
181
- int8_t scales[QK_K/16 ]; // 8-bit block scales
182
- uint8_t qh[QK_K/8 ]; // quants, high bit
183
- uint8_t qs[QK_K/2 ]; // quants, low 4 bits
215
+ ggml_half d; // super-block scale
216
+ int8_t scales[QK_K/16 ]; // 8-bit block scales
217
+ uint8_t qh[QK_K/8 ]; // quants, high bit
218
+ uint8_t qs[QK_K/2 ]; // quants, low 4 bits
184
219
} block_q5_K;
185
- static_assert (sizeof (block_q5_K) == sizeof(ggml_fp16_t ) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
220
+ static_assert (sizeof (block_q5_K) == sizeof(ggml_half ) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
186
221
#else
187
222
typedef struct {
188
- ggml_fp16_t d; // super-block scale for quantized scales
189
- ggml_fp16_t dmin; // super-block scale for quantized mins
190
- uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
191
- uint8_t qh[QK_K/8 ]; // quants, high bit
192
- uint8_t qs[QK_K/2 ]; // quants, low 4 bits
223
+ union {
224
+ struct {
225
+ ggml_half d; // super-block scale for quantized scales
226
+ ggml_half dmin; // super-block scale for quantized mins
227
+ } GGML_COMMON_AGGR;
228
+ ggml_half2 dm;
229
+ };
230
+ uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
231
+ uint8_t qh[QK_K/8 ]; // quants, high bit
232
+ uint8_t qs[QK_K/2 ]; // quants, low 4 bits
193
233
} block_q5_K;
194
- static_assert (sizeof (block_q5_K) == 2*sizeof(ggml_fp16_t ) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
234
+ static_assert (sizeof (block_q5_K) == 2*sizeof(ggml_half ) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
195
235
#endif
196
236
197
237
// 6-bit quantization
@@ -204,9 +244,9 @@ typedef struct {
204
244
uint8_t ql[QK_K/2 ]; // quants, lower 4 bits
205
245
uint8_t qh[QK_K/4 ]; // quants, upper 2 bits
206
246
int8_t scales[QK_K/16 ]; // scales, quantized with 8 bits
207
- ggml_fp16_t d; // super-block scale
247
+ ggml_half d; // super-block scale
208
248
} block_q6_K;
209
- static_assert (sizeof (block_q6_K) == sizeof(ggml_fp16_t ) + QK_K / 16 + 3*QK_K/4, "wrong q6_K block size/padding");
249
+ static_assert (sizeof (block_q6_K) == sizeof(ggml_half ) + QK_K / 16 + 3*QK_K/4, "wrong q6_K block size/padding");
210
250
211
251
// This is only used for intermediate quantization and dot products
212
252
typedef struct {
@@ -222,42 +262,42 @@ static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_
222
262
#define QI2_XXS (QK_K / (4 *QR2_XXS))
223
263
#define QR2_XXS 8
224
264
typedef struct {
225
- ggml_fp16_t d;
265
+ ggml_half d;
226
266
uint16_t qs[QK_K/8 ];
227
267
} block_iq2_xxs;
228
- static_assert (sizeof (block_iq2_xxs) == sizeof(ggml_fp16_t ) + QK_K/8*sizeof(uint16_t ), "wrong iq2_xxs block size/padding");
268
+ static_assert (sizeof (block_iq2_xxs) == sizeof(ggml_half ) + QK_K/8*sizeof(uint16_t ), "wrong iq2_xxs block size/padding");
229
269
230
270
// 2.3125 bpw quants
231
271
#define QI2_XS (QK_K / (4 *QR2_XS))
232
272
#define QR2_XS 8
233
273
typedef struct {
234
- ggml_fp16_t d;
274
+ ggml_half d;
235
275
uint16_t qs[QK_K/8 ];
236
276
uint8_t scales[QK_K/32 ];
237
277
} block_iq2_xs;
238
- static_assert (sizeof (block_iq2_xs) == sizeof(ggml_fp16_t ) + QK_K/8*sizeof(uint16_t ) + QK_K/32, "wrong iq2_xs block size/padding");
278
+ static_assert (sizeof (block_iq2_xs) == sizeof(ggml_half ) + QK_K/8*sizeof(uint16_t ) + QK_K/32, "wrong iq2_xs block size/padding");
239
279
240
280
// 2.5625 bpw quants
241
281
#define QI2_S (QK_K / (4 *QR2_S))
242
282
#define QR2_S 8
243
283
typedef struct {
244
- ggml_fp16_t d;
284
+ ggml_half d;
245
285
uint8_t qs[QK_K/4 ];
246
286
uint8_t qh[QK_K/32 ];
247
287
uint8_t scales[QK_K/32 ];
248
288
} block_iq2_s;
249
- static_assert (sizeof (block_iq2_s) == sizeof(ggml_fp16_t ) + QK_K/4 + QK_K/16, "wrong iq2_s block size/padding");
289
+ static_assert (sizeof (block_iq2_s) == sizeof(ggml_half ) + QK_K/4 + QK_K/16, "wrong iq2_s block size/padding");
250
290
251
291
// (Almost) "true" 3-bit quantization.
252
292
// Due to the need to use blocks as per ggml design, it ends up using
253
293
// 3.0625 bpw because of the 16-bit scale for each block of 256.
254
294
#define QI3_XXS (QK_K / (4 *QR3_XXS))
255
295
#define QR3_XXS 8
256
296
typedef struct {
257
- ggml_fp16_t d;
297
+ ggml_half d;
258
298
uint8_t qs[3 *QK_K/8 ];
259
299
} block_iq3_xxs;
260
- static_assert (sizeof (block_iq3_xxs) == sizeof(ggml_fp16_t ) + 3*(QK_K/8 ), "wrong iq3_xxs block size/padding");
300
+ static_assert (sizeof (block_iq3_xxs) == sizeof(ggml_half ) + 3*(QK_K/8 ), "wrong iq3_xxs block size/padding");
261
301
262
302
// 3.4375 bpw
263
303
#if QK_K == 64
@@ -268,32 +308,32 @@ static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong
268
308
#define QI3_XS (QK_K / (4 *QR3_XS))
269
309
#define QR3_XS 8
270
310
typedef struct {
271
- ggml_fp16_t d;
311
+ ggml_half d;
272
312
uint8_t qs[QK_K/4 ];
273
313
uint8_t qh[QK_K/32 ];
274
314
uint8_t signs[QK_K/8 ];
275
315
uint8_t scales[IQ3S_N_SCALE];
276
316
} block_iq3_s;
277
- static_assert (sizeof (block_iq3_s) == sizeof(ggml_fp16_t ) + 13*(QK_K/32 ) + IQ3S_N_SCALE, "wrong iq3_s block size/padding");
317
+ static_assert (sizeof (block_iq3_s) == sizeof(ggml_half ) + 13*(QK_K/32 ) + IQ3S_N_SCALE, "wrong iq3_s block size/padding");
278
318
279
319
#define QI1_S (QK_K / (4 *QR1_S))
280
320
#define QR1_S 8
281
321
typedef struct {
282
- ggml_fp16_t d;
322
+ ggml_half d;
283
323
uint8_t qs[QK_K/8 ];
284
324
uint8_t scales[QK_K/16 ];
285
325
} block_iq1_s;
286
- static_assert (sizeof (block_iq1_s) == sizeof(ggml_fp16_t ) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");
326
+ static_assert (sizeof (block_iq1_s) == sizeof(ggml_half ) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");
287
327
288
328
// Non-linear quants
289
329
#define QK4_NL 32
290
330
#define QI4_NL (QK4_NL / (4 *QR4_NL))
291
331
#define QR4_NL 2
292
332
typedef struct {
293
- ggml_fp16_t d;
333
+ ggml_half d;
294
334
uint8_t qs[QK4_NL/2 ];
295
335
} block_iq4_nl;
296
- static_assert (sizeof (block_iq4_nl) == sizeof(ggml_fp16_t ) + QK4_NL/2, "wrong iq4_nl block size/padding");
336
+ static_assert (sizeof (block_iq4_nl) == sizeof(ggml_half ) + QK4_NL/2, "wrong iq4_nl block size/padding");
297
337
298
338
#if QK_K == 64
299
339
#define block_iq4_xs block_iq4_nl
@@ -304,12 +344,12 @@ static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4
304
344
#define QI4_XS (QK_K / (4 *QR4_XS))
305
345
#define QR4_XS 8
306
346
typedef struct {
307
- ggml_fp16_t d;
347
+ ggml_half d;
308
348
uint16_t scales_h;
309
349
uint8_t scales_l[QK_K/64 ];
310
350
uint8_t qs[QK_K/2 ];
311
351
} block_iq4_xs;
312
- static_assert (sizeof (block_iq4_xs) == sizeof(ggml_fp16_t ) + sizeof(uint16_t ) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
352
+ static_assert (sizeof (block_iq4_xs) == sizeof(ggml_half ) + sizeof(uint16_t ) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
313
353
#endif
314
354
315
355
#endif // GGML_COMMON_DECL
0 commit comments