Skip to content

Commit 605560d

Browse files
committed
Merge 'origin/master' into hipblas
2 parents 127f68e + 089b1c9 commit 605560d

12 files changed

+670
-1706
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -44,5 +44,6 @@ zig-cache/
4444

4545
ppl-*.txt
4646
qnt-*.txt
47+
perf-*.txt
4748

4849
examples/jeopardy/results.txt

README.md

+14-21
Original file line numberDiff line numberDiff line change
@@ -7,18 +7,10 @@
77

88
Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
99

10-
## ⚠️ TEMPORARY NOTICE ABOUT UPCOMING BREAKING CHANGE ⚠️
11-
12-
**The quantization formats will soon be updated: https://github.com/ggerganov/llama.cpp/pull/1305**
13-
14-
**All `ggml` model files using the old format will not work with the latest `llama.cpp` code after that change is merged**
15-
16-
---
17-
1810
**Hot topics:**
1911

12+
- Quantization formats `Q4` and `Q5` have changed - requantize any old models [(info)](https://github.com/ggerganov/llama.cpp/pull/1405)
2013
- [Roadmap May 2023](https://github.com/ggerganov/llama.cpp/discussions/1220)
21-
- [New quantization methods](https://github.com/ggerganov/llama.cpp#quantization)
2214

2315
<details>
2416
<summary>Table of Contents</summary>
@@ -95,6 +87,7 @@ as the main playground for developing new features for the [ggml](https://github
9587
- Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
9688
- Node.js: [hlhr202/llama-node](https://github.com/hlhr202/llama-node)
9789
- Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
90+
- C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
9891

9992
**UI:**
10093

@@ -338,18 +331,18 @@ As the models are currently fully loaded into memory, you will need adequate dis
338331
339332
Several quantization methods are supported. They differ in the resulting model disk size and inference speed.
340333
341-
| Model | Measure | F16 | Q4_0 | Q4_1 | Q4_2 | Q5_0 | Q5_1 | Q8_0 |
342-
|------:|--------------|-------:|-------:|-------:|-------:|-------:|-------:|-------:|
343-
| 7B | perplexity | 5.9066 | 6.1620 | 6.0910 | 6.1466 | 5.9862 | 5.9481 | 5.9069 |
344-
| 7B | file size | 13.0G | 4.0G | 4.8G | 4.0G | 4.4G | 4.8G | 7.1G |
345-
| 7B | ms/tok @ 4th | 128 | 56 | 61 | 84 | 91 | 95 | 75 |
346-
| 7B | ms/tok @ 8th | 128 | 47 | 55 | 48 | 53 | 59 | 75 |
347-
| 7B | bits/weight | 16.0 | 5.0 | 6.0 | 5.0 | 5.5 | 6.0 | 9.0 |
348-
| 13B | perplexity | 5.2543 | 5.3863 | 5.3607 | 5.3513 | 5.2856 | 5.2706 | 5.2548 |
349-
| 13B | file size | 25.0G | 7.6G | 9.1G | 7.6G | 8.4G | 9.1G | 14G |
350-
| 13B | ms/tok @ 4th | 239 | 104 | 113 | 160 | 176 | 185 | 141 |
351-
| 13B | ms/tok @ 8th | 240 | 85 | 99 | 97 | 108 | 117 | 147 |
352-
| 13B | bits/weight | 16.0 | 5.0 | 6.0 | 5.0 | 5.5 | 6.0 | 9.0 |
334+
| Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 |
335+
|------:|--------------|-------:|-------:|-------:|-------:|-------:|-------:|
336+
| 7B | perplexity | 5.9066 | 6.1620 | 6.0910 | 5.9862 | 5.9481 | 5.9069 |
337+
| 7B | file size | 13.0G | 4.0G | 4.8G | 4.4G | 4.8G | 7.1G |
338+
| 7B | ms/tok @ 4th | 128 | 50 | 54 | 75 | 83 | 75 |
339+
| 7B | ms/tok @ 8th | 123 | 44 | 52 | 53 | 58 | 72 |
340+
| 7B | bits/weight | 16.0 | 5.0 | 6.0 | 5.5 | 6.0 | 9.0 |
341+
| 13B | perplexity | 5.2543 | 5.3863 | 5.3607 | 5.2856 | 5.2706 | 5.2548 |
342+
| 13B | file size | 25.0G | 7.6G | 9.1G | 8.4G | 9.1G | 14G |
343+
| 13B | ms/tok @ 4th | 239 | 93 | 101 | 150 | 164 | 141 |
344+
| 13B | ms/tok @ 8th | 240 | 81 | 96 | 96 | 104 | 136 |
345+
| 13B | bits/weight | 16.0 | 5.0 | 6.0 | 5.5 | 6.0 | 9.0 |
353346
354347
### Perplexity (measuring model quality)
355348

SHA256SUMS

+16-12
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,27 @@
11
700df0d3013b703a806d2ae7f1bfb8e59814e3d06ae78be0c66368a50059f33d models/7B/consolidated.00.pth
22
666a4bb533b303bdaf89e1b6a3b6f93535d868de31d903afdc20983dc526c847 models/7B/ggml-model-f16.bin
3-
99aeb35f26b577fa2732716cca4d8b5ada39a78ea9b2dca2651fc632b5d101b6 models/7B/ggml-model-q4_0.bin
4-
cc061458339a3eb8bcecbf0a825e9924fb7d1a8150f63cd5d091caa99215aafe models/7B/ggml-model-q4_1.bin
5-
25b050337a87344da687a7f2adddc03bd99b7f6c140450e836649f3585fb6496 models/7B/ggml-model-q4_2.bin
3+
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/7B/ggml-model-q4_0.bin
4+
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/7B/ggml-model-q4_1.bin
5+
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/7B/ggml-model-q5_0.bin
6+
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/7B/ggml-model-q5_1.bin
67
7e89e242ddc0dd6f060b43ca219ce8b3e8f08959a72cb3c0855df8bb04d46265 models/7B/params.json
78
745bf4e29a4dd6f411e72976d92b452da1b49168a4f41c951cfcc8051823cf08 models/13B/consolidated.00.pth
89
d5ccbcc465c71c0de439a5aeffebe8344c68a519bce70bc7f9f92654ee567085 models/13B/consolidated.01.pth
910
2b206e9b21fb1076f11cafc624e2af97c9e48ea09312a0962153acc20d45f808 models/13B/ggml-model-f16.bin
10-
eecb575d325d935157761172e2bf05984dad216eb2b06777b73463cf9b818bab models/13B/ggml-model-q4_0.bin
11-
d9581b5b88e5622532fe897c9f9b0e67a317d22dd27a6f90fa4ab8c6d23ccdbb models/13B/ggml-model-q4_1.bin
12-
75a218a47df03f5f96354656329864613abcb67779412b9bc2282b28c1c3cbaa models/13B/ggml-model-q4_2.bin
11+
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/13B/ggml-model-q4_0.bin
12+
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/13B/ggml-model-q4_1.bin
13+
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/13B/ggml-model-q5_0.bin
14+
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/13B/ggml-model-q5_1.bin
1315
4ab77bec4d4405ccb66a97b282574c89a94417e3c32e5f68f37e2876fc21322f models/13B/params.json
1416
e23294a58552d8cdec5b7e8abb87993b97ea6eced4178ff2697c02472539d067 models/30B/consolidated.00.pth
1517
4e077b7136c7ae2302e954860cf64930458d3076fcde9443f4d0e939e95903ff models/30B/consolidated.01.pth
1618
24a87f01028cbd3a12de551dcedb712346c0b5cbdeff1454e0ddf2df9b675378 models/30B/consolidated.02.pth
1719
1adfcef71420886119544949767f6a56cb6339b4d5fcde755d80fe68b49de93b models/30B/consolidated.03.pth
1820
7e1b524061a9f4b27c22a12d6d2a5bf13b8ebbea73e99f218809351ed9cf7d37 models/30B/ggml-model-f16.bin
19-
517b9e525742c42b5478a6280a4b41ec66f46298c57aba7f0453d491682fe42d models/30B/ggml-model-q4_0.bin
20-
7b75ac615fa369ee593493a7e6ef87542bf0350255db928b22c5a24f6d598bcd models/30B/ggml-model-q4_1.bin
21-
aadbc9cf806313a55be570f62884eed289d30c313fac3b7838717e01bd553204 models/30B/ggml-model-q4_2.bin
21+
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/30B/ggml-model-q4_0.bin
22+
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/30B/ggml-model-q4_1.bin
23+
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/30B/ggml-model-q5_0.bin
24+
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/30B/ggml-model-q5_1.bin
2225
2c07118ea98d69dbe7810d88520e30288fa994751b337f8fca02b171955f44cb models/30B/params.json
2326
135c563f6b3938114458183afb01adc9a63bef3d8ff7cccc3977e5d3664ecafe models/65B/consolidated.00.pth
2427
9a600b37b19d38c7e43809485f70d17d1dc12206c07efa83bc72bb498a568bde models/65B/consolidated.01.pth
@@ -29,8 +32,9 @@ a287c0dfe49081626567c7fe87f74cce5831f58e459b427b5e05567641f47b78 models/65B/con
2932
72b4eba67a1a3b18cb67a85b70f8f1640caae9b40033ea943fb166bd80a7b36b models/65B/consolidated.06.pth
3033
d27f5b0677d7ff129ceacd73fd461c4d06910ad7787cf217b249948c3f3bc638 models/65B/consolidated.07.pth
3134
60758f2384d74e423dffddfd020ffed9d3bb186ebc54506f9c4a787d0f5367b0 models/65B/ggml-model-f16.bin
32-
01672072136f8be6ca9d7cebe5f86ed316e8b85851b9fe3de951809233cea4f2 models/65B/ggml-model-q4_0.bin
33-
4743a28aac3e5f32a6e838a815f51d3779de44fbbe251d745251e66c23c5950f models/65B/ggml-model-q4_1.bin
34-
1b6f6588d0e2ecfe6c4d849088e48e5e3083466b962daa32e3261363e21fc5e9 models/65B/ggml-model-q4_2.bin
35+
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/65B/ggml-model-q4_0.bin
36+
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/65B/ggml-model-q4_1.bin
37+
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/65B/ggml-model-q5_0.bin
38+
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/65B/ggml-model-q5_1.bin
3539
999ed1659b469ccc2a941714c0a9656fa571d17c9f7c8c7589817ca90edef51b models/65B/params.json
3640
9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 models/tokenizer.model

examples/quantize/quantize.cpp

+5-6
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,11 @@
77
#include <string>
88

99
static const std::map<std::string, llama_ftype> LLAMA_FTYPE_MAP = {
10-
{"q4_0", LLAMA_FTYPE_MOSTLY_Q4_0},
11-
{"q4_1", LLAMA_FTYPE_MOSTLY_Q4_1},
12-
{"q4_2", LLAMA_FTYPE_MOSTLY_Q4_2},
13-
{"q5_0", LLAMA_FTYPE_MOSTLY_Q5_0},
14-
{"q5_1", LLAMA_FTYPE_MOSTLY_Q5_1},
15-
{"q8_0", LLAMA_FTYPE_MOSTLY_Q8_0},
10+
{"q4_0", LLAMA_FTYPE_MOSTLY_Q4_0},
11+
{"q4_1", LLAMA_FTYPE_MOSTLY_Q4_1},
12+
{"q5_0", LLAMA_FTYPE_MOSTLY_Q5_0},
13+
{"q5_1", LLAMA_FTYPE_MOSTLY_Q5_1},
14+
{"q8_0", LLAMA_FTYPE_MOSTLY_Q8_0},
1615
};
1716

1817
bool try_parse_ftype(const std::string & ftype_str, llama_ftype & ftype, std::string & ftype_str_out) {

ggml-cuda.cu

+36-95
Original file line numberDiff line numberDiff line change
@@ -94,13 +94,6 @@ typedef struct {
9494
} block_q4_1;
9595
static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
9696

97-
#define QK4_2 16
98-
typedef struct {
99-
half d; // delta
100-
uint8_t qs[QK4_2 / 2]; // nibbles / quants
101-
} block_q4_2;
102-
static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding");
103-
10497
#define QK5_0 32
10598
typedef struct {
10699
half d; // delta
@@ -126,147 +119,102 @@ typedef struct {
126119
static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding");
127120

128121
static __global__ void dequantize_block_q4_0(const void * vx, float * y) {
122+
static const int qk = QK4_0;
123+
129124
const block_q4_0 * x = (const block_q4_0 *) vx;
130125

131126
const int i = blockIdx.x;
132127

133128
const float d = x[i].d;
134129

135-
const uint8_t * pp = x[i].qs;
136-
137-
for (int l = 0; l < QK4_0; l += 2) {
138-
const uint8_t vi = pp[l/2];
139-
140-
const int8_t vi0 = vi & 0xf;
141-
const int8_t vi1 = vi >> 4;
130+
for (int j = 0; j < qk/2; ++j) {
131+
const int x0 = (x[i].qs[j] & 0xf) - 8;
132+
const int x1 = (x[i].qs[j] >> 4) - 8;
142133

143-
const float v0 = (vi0 - 8)*d;
144-
const float v1 = (vi1 - 8)*d;
145-
146-
y[i*QK4_0 + l + 0] = v0;
147-
y[i*QK4_0 + l + 1] = v1;
134+
y[i*qk + j + 0 ] = x0*d;
135+
y[i*qk + j + qk/2] = x1*d;
148136
}
149137
}
150138

151139
static __global__ void dequantize_block_q4_1(const void * vx, float * y) {
140+
static const int qk = QK4_1;
141+
152142
const block_q4_1 * x = (const block_q4_1 *) vx;
153143

154144
const int i = blockIdx.x;
155145

156146
const float d = x[i].d;
157147
const float m = x[i].m;
158148

159-
const uint8_t * pp = x[i].qs;
160-
161-
for (int l = 0; l < QK4_1; l += 2) {
162-
const uint8_t vi = pp[l/2];
163-
164-
const int8_t vi0 = vi & 0xf;
165-
const int8_t vi1 = vi >> 4;
149+
for (int j = 0; j < qk/2; ++j) {
150+
const int x0 = (x[i].qs[j] & 0xf);
151+
const int x1 = (x[i].qs[j] >> 4);
166152

167-
const float v0 = vi0*d + m;
168-
const float v1 = vi1*d + m;
169-
170-
y[i*QK4_1 + l + 0] = v0;
171-
y[i*QK4_1 + l + 1] = v1;
172-
}
173-
}
174-
175-
static __global__ void dequantize_block_q4_2(const void * vx, float * y) {
176-
const block_q4_2 * x = (const block_q4_2 *) vx;
177-
178-
const int i = blockIdx.x;
179-
180-
const float d = x[i].d;
181-
182-
const uint8_t * pp = x[i].qs;
183-
184-
for (int l = 0; l < QK4_2; l += 2) {
185-
const uint8_t vi = pp[l/2];
186-
187-
const int8_t vi0 = vi & 0xf;
188-
const int8_t vi1 = vi >> 4;
189-
190-
const float v0 = (vi0 - 8)*d;
191-
const float v1 = (vi1 - 8)*d;
192-
193-
y[i*QK4_2 + l + 0] = v0;
194-
y[i*QK4_2 + l + 1] = v1;
153+
y[i*qk + j + 0 ] = x0*d + m;
154+
y[i*qk + j + qk/2] = x1*d + m;
195155
}
196156
}
197157

198158
static __global__ void dequantize_block_q5_0(const void * vx, float * y) {
159+
static const int qk = QK5_0;
160+
199161
const block_q5_0 * x = (const block_q5_0 *) vx;
200162

201163
const int i = blockIdx.x;
202164

203165
const float d = x[i].d;
204166

205-
const uint8_t * pp = x[i].qs;
206-
207167
uint32_t qh;
208168
memcpy(&qh, x[i].qh, sizeof(qh));
209169

210-
for (int l = 0; l < QK5_0; l += 2) {
211-
const uint8_t vi = pp[l/2];
212-
213-
const int8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4;
214-
const int8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4;
170+
for (int j = 0; j < qk/2; ++j) {
171+
const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
172+
const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
215173

216-
const int8_t vi0 = ((vi & 0xf) | vh0);
217-
const int8_t vi1 = ((vi >> 4) | vh1);
174+
const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16;
175+
const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16;
218176

219-
const float v0 = (vi0 - 16)*d;
220-
const float v1 = (vi1 - 16)*d;
221-
222-
y[i*QK5_0 + l + 0] = v0;
223-
y[i*QK5_0 + l + 1] = v1;
177+
y[i*qk + j + 0 ] = x0*d;
178+
y[i*qk + j + qk/2] = x1*d;
224179
}
225180
}
226181

227182
static __global__ void dequantize_block_q5_1(const void * vx, float * y) {
183+
static const int qk = QK5_1;
184+
228185
const block_q5_1 * x = (const block_q5_1 *) vx;
229186

230187
const int i = blockIdx.x;
231188

232189
const float d = x[i].d;
233190
const float m = x[i].m;
234191

235-
const uint8_t * pp = x[i].qs;
236-
237192
uint32_t qh;
238193
memcpy(&qh, x[i].qh, sizeof(qh));
239194

240-
for (int l = 0; l < QK5_1; l += 2) {
241-
const uint8_t vi = pp[l/2];
242-
243-
const int8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4;
244-
const int8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4;
195+
for (int j = 0; j < qk/2; ++j) {
196+
const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
197+
const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
245198

246-
const int8_t vi0 = (vi & 0xf) | vh0;
247-
const int8_t vi1 = (vi >> 4) | vh1;
199+
const int x0 = (x[i].qs[j] & 0xf) | xh_0;
200+
const int x1 = (x[i].qs[j] >> 4) | xh_1;
248201

249-
const float v0 = vi0*d + m;
250-
const float v1 = vi1*d + m;
251-
252-
y[i*QK5_1 + l + 0] = v0;
253-
y[i*QK5_1 + l + 1] = v1;
202+
y[i*qk + j + 0 ] = x0*d + m;
203+
y[i*qk + j + qk/2] = x1*d + m;
254204
}
255205
}
256206

257207
static __global__ void dequantize_block_q8_0(const void * vx, float * y) {
208+
static const int qk = QK8_0;
209+
258210
const block_q8_0 * x = (const block_q8_0 *) vx;
259211

260212
const int i = blockIdx.x;
261213

262214
const float d = x[i].d;
263215

264-
const int8_t * pp = x[i].qs;
265-
266-
for (int l = 0; l < QK8_0; l++) {
267-
const int8_t vi = pp[l];
268-
269-
y[i*QK8_0 + l] = vi*d;
216+
for (int j = 0; j < qk; ++j) {
217+
y[i*qk + j] = x[i].qs[j]*d;
270218
}
271219
}
272220

@@ -280,11 +228,6 @@ static void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStre
280228
dequantize_block_q4_1<<<nb, 1, 0, stream>>>(vx, y);
281229
}
282230

283-
static void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
284-
const int nb = k / QK4_2;
285-
dequantize_block_q4_2<<<nb, 1, 0, stream>>>(vx, y);
286-
}
287-
288231
static void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
289232
const int nb = k / QK5_0;
290233
dequantize_block_q5_0<<<nb, 1, 0, stream>>>(vx, y);
@@ -319,8 +262,6 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
319262
return dequantize_row_q4_0_cuda;
320263
case GGML_TYPE_Q4_1:
321264
return dequantize_row_q4_1_cuda;
322-
case GGML_TYPE_Q4_2:
323-
return dequantize_row_q4_2_cuda;
324265
case GGML_TYPE_Q5_0:
325266
return dequantize_row_q5_0_cuda;
326267
case GGML_TYPE_Q5_1:

0 commit comments

Comments
 (0)