Commit ba0c7c7

Vulkan k-quant mmq and ggml-backend offload functionality (ggml-org#6155)
* Fix Vulkan no-kv-offload incoherence
* Add k-quant mul mat mat shaders
* Rework working buffer allocation; reduces VRAM use noticeably. Clean up cpu assist code, replaced with the ggml-backend offload function
* Default to all dedicated GPUs
* Add fallback to integrated GPUs if no dedicated GPUs are found
* Add debug info on which device is allocating memory
* Fix Intel dequant issue; fix validation issue
* Fix Vulkan GGML_OP_GET_ROWS implementation
* Clean up merge artifacts
* Remove Vulkan warning
1 parent d48ccf3 commit ba0c7c7
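The headline change is that Vulkan execution now goes through the common ggml-backend interface rather than the `*_cpu_assist` hooks deleted in the diffs below. Here is a minimal sketch of what a caller does under the new scheme; only `ggml_backend_vk_init` appears in this commit's header diff, and the rest is standard ggml-backend usage of this era (`ggml_backend_alloc_ctx_tensors`, `ggml_backend_graph_compute`), so treat this as an illustration, not code from the commit:

```c
// Minimal sketch: run a matmul on the Vulkan backend via ggml-backend.
// Assumes the ggml-backend API of this era; error handling omitted.
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "ggml-vulkan.h"

int main(void) {
    ggml_backend_t backend = ggml_backend_vk_init(0); // Vulkan device 0

    struct ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead() * 8 + ggml_graph_overhead(),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true, // tensor data will live in a backend buffer
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 64);
    struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 64);
    struct ggml_tensor * c = ggml_mul_mat(ctx, a, b);

    // one backend buffer (device memory) for all tensors in the context
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);

    // upload inputs here with ggml_backend_tensor_set(a, data, 0, nbytes)

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, c);
    ggml_backend_graph_compute(backend, gf); // no cpu-assist hooks involved

    ggml_backend_buffer_free(buf);
    ggml_free(ctx);
    ggml_backend_free(backend);
    return 0;
}
```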

7 files changed: +37,776 -15,352 lines

README.md (-9 lines)

@@ -636,15 +636,6 @@ Building the program with BLAS support may lead to some performance improvements
 
 #### Vulkan
 
-> [!WARNING]
->
-> Vulkan support has been broken in https://github.com/ggerganov/llama.cpp/pull/6122
-> due to relying on `GGML_OP_GET_ROWS` which is not yet properly supported by the Vulkan backend,
-> but should be fixed relatively soon (possibly in https://github.com/ggerganov/llama.cpp/pull/6155
-> (ref: https://github.com/ggerganov/llama.cpp/pull/6122#issuecomment-2015327635)).
->
-> Meanwhile, if you want to use the Vulkan backend, you should use the commit right before the breaking change, https://github.com/ggerganov/llama.cpp/commit/55c1b2a3bbd470e9e2a3a0618b92cf64a885f806
-
 **With docker**:
 
 You don't need to install Vulkan SDK. It will be installed inside the container.
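For context on the warning being removed: `GGML_OP_GET_ROWS` is the op that token-embedding lookups lower to in llama.cpp, so a backend that cannot run it correctly breaks basic inference. A hypothetical fragment showing how the op arises (assumes an existing `ggml_context * ctx`; the sizes are illustrative, not from the commit):

```c
// Hypothetical fragment: an embedding lookup lowers to GGML_OP_GET_ROWS.
struct ggml_tensor * embd = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_K, 4096, 32000); // token embeddings
struct ggml_tensor * ids  = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 8);            // 8 token ids
struct ggml_tensor * rows = ggml_get_rows(ctx, embd, ids);                        // -> GGML_OP_GET_ROWS node
```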

ggml-vulkan-shaders.hpp (+37,199 -14,939 lines)

Large diff not rendered.
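This generated header carries the compiled shader blobs, including the new k-quant mul mat shaders. For orientation, this is the CPU-side block layout those shaders have to decode, mirroring the Q4_K definition in ggml's quantization code (for `QK_K == 256`); the shader side follows the same layout when dequantizing inside the matrix-multiplication pipelines:

```c
#define QK_K 256
#define K_SCALE_SIZE 12

// Q4_K super-block: 256 weights stored as 4-bit quants in 8 sub-blocks,
// with 6-bit sub-block scales/mins packed into `scales`.
typedef struct {
    ggml_half d;                  // super-block scale for the quantized scales
    ggml_half dmin;               // super-block scale for the quantized mins
    uint8_t scales[K_SCALE_SIZE]; // packed 6-bit scales and mins
    uint8_t qs[QK_K / 2];         // 4-bit quants, two per byte
} block_q4_K;
```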

ggml-vulkan.cpp (+328 -307 lines)

Large diff not rendered.
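Two of the behavioral changes live in this file: the working-buffer allocation rework and the new device-selection default. A sketch of the "dedicated GPUs first, integrated only as a fallback" policy in plain Vulkan C API terms; the actual implementation in ggml-vulkan.cpp is structured differently, and the cap of 16 here just mirrors `GGML_VK_MAX_DEVICES` from the header:

```c
#include <vulkan/vulkan.h>

// Sketch: prefer discrete (dedicated) GPUs; use integrated GPUs only when
// no discrete GPU exists. Illustrative, not the commit's implementation.
static int pick_devices(VkInstance instance, VkPhysicalDevice * out, uint32_t max_out) {
    uint32_t count = 0;
    vkEnumeratePhysicalDevices(instance, &count, NULL);
    VkPhysicalDevice devs[16];
    if (count > 16) count = 16;
    vkEnumeratePhysicalDevices(instance, &count, devs);

    uint32_t n = 0;
    // first pass: collect all discrete GPUs
    for (uint32_t i = 0; i < count && n < max_out; i++) {
        VkPhysicalDeviceProperties props;
        vkGetPhysicalDeviceProperties(devs[i], &props);
        if (props.deviceType == VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU) {
            out[n++] = devs[i];
        }
    }
    // fallback: integrated GPUs, only if no discrete GPU was found
    if (n == 0) {
        for (uint32_t i = 0; i < count && n < max_out; i++) {
            VkPhysicalDeviceProperties props;
            vkGetPhysicalDeviceProperties(devs[i], &props);
            if (props.deviceType == VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU) {
                out[n++] = devs[i];
            }
        }
    }
    return (int) n;
}
```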

ggml-vulkan.h (-11 lines)

@@ -11,17 +11,6 @@ extern "C" {
 #define GGML_VK_MAX_DEVICES 16
 
 GGML_API void ggml_vk_instance_init(void);
-GGML_API void ggml_vk_init_cpu_assist(void);
-
-GGML_API void ggml_vk_preallocate_buffers_graph_cpu_assist(struct ggml_tensor * node);
-GGML_API void ggml_vk_preallocate_buffers_cpu_assist(void);
-GGML_API void ggml_vk_build_graph_cpu_assist(struct ggml_tensor * node, bool last_node);
-GGML_API bool ggml_vk_compute_forward_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor);
-#ifdef GGML_VULKAN_CHECK_RESULTS
-void ggml_vk_check_results_1_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor);
-#endif
-GGML_API void ggml_vk_graph_cleanup_cpu_assist(void);
-GGML_API void ggml_vk_free_cpu_assist(void);
 
 // backend API
 GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num);
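The removed `*_cpu_assist` entry points let the CPU graph executor hand individual tensors to Vulkan; their replacement for mixed CPU/GPU execution is the generic backend scheduler. A sketch, assuming the `ggml_backend_sched` API of this period (the trailing `parallel` flag was a recent addition at the time):

```c
#include "ggml-backend.h"
#include "ggml-vulkan.h"

// Sketch: hybrid CPU/Vulkan execution via the backend scheduler, which
// subsumes the deleted cpu-assist path. Assumes the sched API of this era.
static ggml_backend_sched_t init_sched(void) {
    static ggml_backend_t backends[2];
    backends[0] = ggml_backend_vk_init(0); // preferred: Vulkan device 0
    backends[1] = ggml_backend_cpu_init(); // fallback for unsupported ops

    // NULL buffer types -> each backend's default;
    // final `false` -> no pipeline parallelism
    return ggml_backend_sched_new(backends, NULL, 2, GGML_DEFAULT_GRAPH_SIZE, false);
}

// per graph, the scheduler then splits nodes between Vulkan and CPU:
//   ggml_backend_sched_graph_compute(sched, graph);
```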

ggml.c (-35 lines)

@@ -278,8 +278,6 @@ inline static void * ggml_calloc(size_t num, size_t size) {
 #include <Accelerate/Accelerate.h>
 #if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
 #include "ggml-opencl.h"
-#elif defined(GGML_USE_VULKAN)
-#include "ggml-vulkan.h"
 #endif
 #elif defined(GGML_USE_OPENBLAS)
 #if defined(GGML_BLAS_USE_MKL)
@@ -289,8 +287,6 @@ inline static void * ggml_calloc(size_t num, size_t size) {
 #endif
 #elif defined(GGML_USE_CLBLAST)
 #include "ggml-opencl.h"
-#elif defined(GGML_USE_VULKAN)
-#include "ggml-vulkan.h"
 #endif
 
 // floating point type used to accumulate sums
@@ -2717,8 +2713,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
 
 #if defined(GGML_USE_CLBLAST)
     ggml_cl_init();
-#elif defined(GGML_USE_VULKAN)
-    ggml_vk_init_cpu_assist();
 #endif
 
     ggml_setup_op_has_task_pass();
@@ -16128,20 +16122,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
         return;
     }
 
-#if defined(GGML_USE_VULKAN)
-    const bool skip_cpu = ggml_vk_compute_forward_cpu_assist(params, tensor);
-#ifdef GGML_VULKAN_CHECK_RESULTS
-    if (skip_cpu) {
-        ggml_vk_check_results_1_cpu_assist(params, tensor);
-    }
-#endif
-    if (skip_cpu) {
-        return;
-    }
-    GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU);
-    GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU);
-#endif // GGML_USE_VULKAN
-
     switch (tensor->op) {
         case GGML_OP_DUP:
             {
@@ -18617,17 +18597,6 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
         }
     }
 
-#ifdef GGML_USE_VULKAN
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_vk_preallocate_buffers_graph_cpu_assist(cgraph->nodes[i]);
-    }
-    ggml_vk_preallocate_buffers_cpu_assist();
-
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_vk_build_graph_cpu_assist(cgraph->nodes[i], i == cgraph->n_nodes - 1);
-    }
-#endif
-
     const int n_threads = cplan->n_threads;
 
     struct ggml_compute_state_shared state_shared = {
@@ -18684,10 +18653,6 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
         }
     }
 
-#ifdef GGML_USE_VULKAN
-    ggml_vk_graph_cleanup_cpu_assist();
-#endif
-
     // performance stats (graph)
     {
         int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles;
