@@ -1,5 +1,7 @@
 #include <stdint.h>
+#include <stdio.h>
 #include <cuda_fp16.h>
+#include <atomic>
 #include "ggml-cuda.h"
 
 typedef uint16_t ggml_fp16_t;
@@ -29,14 +31,12 @@ static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding");
 
 #define QK4_3 16
 typedef struct {
-    __half d;               // delta
-    __half m;               // min
-    uint8_t qs[QK4_3 / 2];  // nibbles / quants
+    __half  d;              // delta
+    __half  m;              // min
+    uint8_t qs[QK4_3 / 2];  // nibbles / quants
 } block_q4_3;
 static_assert(sizeof(block_q4_3) == 2 * sizeof(ggml_fp16_t) + QK4_3 / 2, "wrong q4_3 block size/padding");
 
-
-
 static __global__ void dequantize_block_q4_0(const void * vx, float * y) {
     const block_q4_0 * x = (const block_q4_0 *) vx;
 
@@ -131,24 +131,98 @@ static __global__ void dequantize_block_q4_3(const void * vx, float * y) {
     }
 }
 
-extern "C" {
-    __host__ void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
-        const int nb = k / QK4_0;
-        dequantize_block_q4_0<<<nb, 1, 0, stream>>>(vx, y);
-    }
+void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
+    const int nb = k / QK4_0;
+    dequantize_block_q4_0<<<nb, 1, 0, stream>>>(vx, y);
+}
+
+void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
+    const int nb = k / QK4_1;
+    dequantize_block_q4_1<<<nb, 1, 0, stream>>>(vx, y);
+}
+
+void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
+    const int nb = k / QK4_2;
+    dequantize_block_q4_2<<<nb, 1, 0, stream>>>(vx, y);
+}
+
+void dequantize_row_q4_3_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
+    const int nb = k / QK4_3;
+    dequantize_block_q4_3<<<nb, 1, 0, stream>>>(vx, y);
+}
 
-    __host__ void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
-        const int nb = k / QK4_1;
-        dequantize_block_q4_1<<<nb, 1, 0, stream>>>(vx, y);
+// buffer pool for cuda
+#define MAX_CUDA_BUFFERS 16
+
+struct scoped_spin_lock {
+    std::atomic_flag& lock;
+    scoped_spin_lock(std::atomic_flag& lock) : lock(lock) {
+        while (lock.test_and_set(std::memory_order_acquire)) {
+            ; // spin
+        }
+    }
+    ~scoped_spin_lock() {
+        lock.clear(std::memory_order_release);
+    }
+    scoped_spin_lock(const scoped_spin_lock&) = delete;
+    scoped_spin_lock& operator=(const scoped_spin_lock&) = delete;
+};
+
+struct cuda_buffer {
+    void * ptr = nullptr;
+    size_t size = 0;
+};
+
+static cuda_buffer g_cuda_buffer_pool[MAX_CUDA_BUFFERS];
+static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
+
+void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
+    scoped_spin_lock lock(g_cuda_pool_lock);
+
+    for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
+        cuda_buffer& b = g_cuda_buffer_pool[i];
+        if (b.size >= size && b.ptr != nullptr) {
+            void * ptr = b.ptr;
+            *actual_size = b.size;
+            b.ptr = nullptr;
+            b.size = 0;
+            return ptr;
+        }
     }
+    void * ptr;
+    CUDA_CHECK(cudaMalloc((void **) &ptr, size));
+    *actual_size = size;
+    return ptr;
+}
+
+void ggml_cuda_pool_free(void * ptr, size_t size) {
+    scoped_spin_lock lock(g_cuda_pool_lock);
 
-    __host__ void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
-        const int nb = k / QK4_2;
-        dequantize_block_q4_2<<<nb, 1, 0, stream>>>(vx, y);
+    for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
+        cuda_buffer& b = g_cuda_buffer_pool[i];
+        if (b.ptr == nullptr) {
+            b.ptr = ptr;
+            b.size = size;
+            return;
+        }
     }
+    fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
+    CUDA_CHECK(cudaFree(ptr));
+}
+
+cublasHandle_t g_cublasH = NULL;
+cudaStream_t g_cudaStream = NULL;
+
+void ggml_init_cublas(void) {
+    if (g_cublasH == NULL) {
+        // create cublas handle, bind a stream
+        CUBLAS_CHECK(cublasCreate(&g_cublasH));
+
+        CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStream, cudaStreamNonBlocking));
+
+        CUBLAS_CHECK(cublasSetStream(g_cublasH, g_cudaStream));
 
-    __host__ void dequantize_row_q4_3_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
-        const int nb = k / QK4_3;
-        dequantize_block_q4_3<<<nb, 1, 0, stream>>>(vx, y);
+        // configure logging to stdout
+        // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, NULL));
     }
 }
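
Note (reviewer sketch, not part of the commit): the intended call pattern for the new pool is to wrap temporary device allocations so that repeated calls reuse buffers instead of hitting cudaMalloc/cudaFree each time. A minimal sketch, assuming CUDA_CHECK from ggml-cuda.h, a device pointer holding k / QK4_0 q4_0 blocks, and a hypothetical name example_dequantize:

    // dequantize one row of q4_0 data into a pooled float buffer (illustrative)
    static void example_dequantize(const void * dev_q4_0, int k, cudaStream_t stream) {
        size_t actual_size;  // may come back larger than requested when a pooled buffer is reused
        float * y = (float *) ggml_cuda_pool_malloc(k * sizeof(float), &actual_size);

        dequantize_row_q4_0_cuda(dev_q4_0, y, k, stream);   // async kernel launch on `stream`
        CUDA_CHECK(cudaStreamSynchronize(stream));          // wait before reading or reusing y

        ggml_cuda_pool_free(y, actual_size);                // return to the pool, not cudaFree
    }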
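The pool itself is guarded by scoped_spin_lock, a plain RAII spin lock over std::atomic_flag: the constructor busy-waits on test_and_set with acquire ordering, and the destructor clears the flag with release ordering, so the lock is released on every path out of the scope. A minimal sketch of the pattern with hypothetical names (not from the commit):

    static std::atomic_flag g_example_lock = ATOMIC_FLAG_INIT;
    static int g_example_count = 0;

    static void example_touch_shared_state(void) {
        scoped_spin_lock lock(g_example_lock); // spins until the flag is acquired
        ++g_example_count;                     // critical section
    }                                          // destructor clears the flag here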
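ggml_init_cublas() is written to be idempotent (the NULL handle check makes repeat calls a no-op), so callers can invoke it before any cuBLAS work; everything then shares the one handle and the non-blocking g_cudaStream. A call-order sketch using the standard cublasSgemm signature; the function name and matrix sizes are illustrative, and the column-major leading dimensions assume no transposition:

    // C (m x n) = A (m x k) * B (k x n), all column-major device pointers
    static void example_sgemm(const float * dA, const float * dB, float * dC, int m, int n, int k) {
        ggml_init_cublas(); // safe to call more than once

        const float alpha = 1.0f;
        const float beta  = 0.0f;
        CUBLAS_CHECK(cublasSgemm(g_cublasH, CUBLAS_OP_N, CUBLAS_OP_N,
                                 m, n, k,
                                 &alpha, dA, m, dB, k,
                                 &beta,  dC, m));

        CUDA_CHECK(cudaStreamSynchronize(g_cudaStream)); // results are visible after this
    }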