1 file changed, 6 insertions(+), 5 deletions(-)

@@ -5164,11 +5164,12 @@ static int llama_decode_internal(
 
     // If all tensors can be run on the GPU then using more than 1 thread is detrimental.
     const bool full_offload_supported =
-        model.arch == LLM_ARCH_LLAMA    ||
-        model.arch == LLM_ARCH_BAICHUAN ||
-        model.arch == LLM_ARCH_FALCON   ||
-        model.arch == LLM_ARCH_REFACT   ||
-        model.arch == LLM_ARCH_MPT;
+        model.arch == LLM_ARCH_LLAMA     ||
+        model.arch == LLM_ARCH_BAICHUAN  ||
+        model.arch == LLM_ARCH_FALCON    ||
+        model.arch == LLM_ARCH_REFACT    ||
+        model.arch == LLM_ARCH_MPT       ||
+        model.arch == LLM_ARCH_STARCODER;
 
     const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
     if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
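For context, the two flags gate the CPU thread count inside llama_decode_internal: when every layer of a supported architecture is offloaded to the GPU via cuBLAS, the remaining CPU-side work is trivial, and extra threads only add synchronization overhead (per the comment in the hunk). A minimal sketch of the consumer of these flags, assuming the surrounding code simply clamps the thread count; the variable names are taken from the hunk's context, but the clamp to 1 is an assumption based on the comment, not shown in this diff:

    // Sketch (assumed): with full GPU offload, more than one CPU thread
    // is detrimental, so force single-threaded graph evaluation.
    if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
        n_threads = 1; // assumption: clamp per the comment above
    }

Adding LLM_ARCH_STARCODER to full_offload_supported lets fully offloaded StarCoder models take the same single-thread path; the whitespace-only changes on the existing lines just realign the || operators to the longer enum name.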