
Commit cbdc1f3

Merge remote-tracking branch 'upstream/concedo'
2 parents: 5b838d4 + 9731682

12 files changed: +541, -519 lines

CMakeLists.txt (+1, -1)

@@ -100,7 +100,7 @@ if (LLAMA_CUBLAS)
     if (LLAMA_CUDA_DMMV_F16)
         set(CMAKE_CUDA_ARCHITECTURES "60;61") # needed for f16 CUDA intrinsics
     else()
-        set(CMAKE_CUDA_ARCHITECTURES "37;86") # lowest CUDA 12 standard + lowest for integer intrinsics
+        set(CMAKE_CUDA_ARCHITECTURES "37;52;61") # lowest CUDA 12 standard + lowest for integer intrinsics
     endif()
 endif()
 message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
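
A note on the arch list: each value is a CUDA compute capability, and 6.1 (Pascal) is the first one that exposes the __dp4a int8 dot-product intrinsic, i.e. the "integer intrinsics" the comment refers to. Below is a minimal, illustrative sketch of the guard pattern such kernels typically use; this is not the actual ggml-cuda.cu code, just the standard intrinsic plus a scalar fallback.

#include <cstdint>

// Illustrative: 4-way int8 dot product with a pre-Pascal fallback.
// __dp4a only exists on compute capability 6.1 and newer, so older
// targets compile the scalar path instead.
__device__ int dot4_i8(int a, int b, int acc) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 610
    return __dp4a(a, b, acc);                 // hardware int8 dot product
#else
    const int8_t * va = (const int8_t *) &a;  // reinterpret the packed bytes
    const int8_t * vb = (const int8_t *) &b;
    for (int i = 0; i < 4; ++i) {
        acc += va[i] * vb[i];                 // same result, computed in scalar
    }
    return acc;
#endif
}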

Makefile (+7, -4)

@@ -378,13 +378,16 @@ grammar-parser.o: examples/grammar-parser.cpp examples/grammar-parser.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 expose.o: expose.cpp expose.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
-gpttype_adapter_failsafe.o: gpttype_adapter.cpp
+
+# idiotic "for easier compilation"
+GPTTYPE_ADAPTER = gpttype_adapter.cpp otherarch/llama_v2.cpp llama.cpp otherarch/utils.cpp otherarch/gptj_v1.cpp otherarch/gptj_v2.cpp otherarch/gptj_v3.cpp otherarch/gpt2_v1.cpp otherarch/gpt2_v2.cpp otherarch/gpt2_v3.cpp otherarch/rwkv_v2.cpp otherarch/rwkv_v3.cpp otherarch/neox_v2.cpp otherarch/neox_v3.cpp otherarch/mpt_v3.cpp ggml.h ggml-cuda.h llama.h llama-util.h
+gpttype_adapter_failsafe.o: $(GPTTYPE_ADAPTER)
 	$(CXX) $(CXXFLAGS) $(FAILSAFE_FLAGS) -c $< -o $@
-gpttype_adapter.o: gpttype_adapter.cpp
+gpttype_adapter.o: $(GPTTYPE_ADAPTER)
 	$(CXX) $(CXXFLAGS) -c $< -o $@
-gpttype_adapter_clblast.o: gpttype_adapter.cpp
+gpttype_adapter_clblast.o: $(GPTTYPE_ADAPTER)
 	$(CXX) $(CXXFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
-gpttype_adapter_cublas.o: gpttype_adapter.cpp
+gpttype_adapter_cublas.o: $(GPTTYPE_ADAPTER)
 	$(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@
 
 clean:
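
Note on mechanics: in each recipe, $< expands to the first prerequisite only, so the compiler is still handed just gpttype_adapter.cpp. The remaining entries in GPTTYPE_ADAPTER are there for dependency tracking: the adapter is apparently built as one large translation unit that pulls in the model sources directly (hence the "for easier compilation" comment), so its objects must be rebuilt whenever any of those files change. Before this change the targets depended only on gpttype_adapter.cpp, and edits to llama.cpp or the otherarch/*.cpp files left stale objects behind.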

expose.h (+2)

@@ -2,6 +2,7 @@
 
 const int stop_token_max = 10;
 const int ban_token_max = 10;
+const int tensor_split_max = 16;
 // match kobold's sampler list and order
 enum samplers
 {
@@ -46,6 +47,7 @@ struct load_model_inputs
     const float rope_freq_scale = 1.0f;
     const float rope_freq_base = 10000.0f;
     const char * banned_tokens[ban_token_max];
+    const float tensor_split[tensor_split_max];
 };
 struct generation_inputs
 {
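
The new field presumably follows the same convention as llama.cpp's tensor_split: each entry is a relative proportion of the model to place on that GPU, normalized by the sum of all entries. A small standalone sketch of that interpretation (hypothetical helper, not code from this repo):

#include <cstdio>

const int tensor_split_max = 16;  // mirrors the constant added above

// Report roughly how many of n_layers each GPU would receive for a split.
void print_split(const float split[tensor_split_max], int n_layers) {
    float sum = 0.0f;
    for (int i = 0; i < tensor_split_max; ++i) sum += split[i];
    if (sum <= 0.0f) {
        printf("no split set: everything stays on the primary GPU\n");
        return;
    }
    for (int i = 0; i < tensor_split_max; ++i) {
        if (split[i] > 0.0f) {
            printf("GPU %d: ~%.0f layers\n", i, n_layers * split[i] / sum);
        }
    }
}

int main() {
    float split[tensor_split_max] = {3.0f, 1.0f};  // 3:1 ratio across two GPUs
    print_split(split, 40);                        // GPU 0: ~30, GPU 1: ~10
    return 0;
}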

ggml-cuda.cu (+17, -12)

@@ -4014,18 +4014,23 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
             }
             func = ggml_cuda_mul;
             break;
-        case GGML_OP_GELU:
-            if (!any_on_device) {
-                return false;
-            }
-            func = ggml_cuda_gelu;
-            break;
-        case GGML_OP_SILU:
-            if (!any_on_device) {
-                return false;
-            }
-            func = ggml_cuda_silu;
-            break;
+        case GGML_OP_UNARY:
+            switch (ggml_get_unary_op(tensor)) {
+                case GGML_UNARY_OP_GELU:
+                    if (!any_on_device) {
+                        return false;
+                    }
+                    func = ggml_cuda_gelu;
+                    break;
+                case GGML_UNARY_OP_SILU:
+                    if (!any_on_device) {
+                        return false;
+                    }
+                    func = ggml_cuda_silu;
+                    break;
+                default:
+                    return false;
+            } break;
         case GGML_OP_NORM:
             if (!any_on_device) {
                 return false;
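
The reshuffle here tracks upstream ggml folding the element-wise activations into a single GGML_OP_UNARY op code, with the concrete function recovered per tensor via ggml_get_unary_op. A simplified sketch of the resulting two-level dispatch, using stand-in types rather than ggml's actual definitions:

#include <cstdio>

// Stand-ins for ggml's enums: one op code, many sub-ops.
enum op_t       { OP_NONE, OP_UNARY, OP_NORM };
enum unary_op_t { UNARY_GELU, UNARY_SILU, UNARY_RELU };

struct tensor_t {
    op_t       op;
    unary_op_t unary_op;  // ggml packs this into the tensor's op params
};

// Mirrors the hunk above: outer switch on the op, inner switch on the
// sub-op, returning false for sub-ops that have no GPU kernel.
bool dispatch(const tensor_t & t) {
    switch (t.op) {
        case OP_UNARY:
            switch (t.unary_op) {
                case UNARY_GELU: printf("run gelu kernel\n"); return true;
                case UNARY_SILU: printf("run silu kernel\n"); return true;
                default:         return false;  // e.g. RELU: CPU fallback
            }
        default:
            return false;
    }
}

int main() {
    tensor_t t = { OP_UNARY, UNARY_SILU };
    printf("handled on GPU: %s\n", dispatch(t) ? "yes" : "no");
    return 0;
}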

ggml-metal.m (+53, -43)

@@ -519,48 +519,56 @@ void ggml_metal_graph_compute(
 
                 [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
             } break;
-        case GGML_OP_SILU:
-            {
-                if (encoder == nil) {
-                    encoder = [command_buffer computeCommandEncoder];
-                }
-
-                [encoder setComputePipelineState:ctx->pipeline_silu];
-                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
-
-                const int64_t n = ggml_nelements(dst);
-
-                [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-            } break;
-        case GGML_OP_RELU:
-            {
-                if (encoder == nil) {
-                    encoder = [command_buffer computeCommandEncoder];
-                }
-
-                [encoder setComputePipelineState:ctx->pipeline_relu];
-                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
-
-                const int64_t n = ggml_nelements(dst);
-
-                [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+        case GGML_OP_UNARY:
+            switch (ggml_get_unary_op(gf->nodes[i])) {
+                case GGML_UNARY_OP_SILU:
+                    {
+                        if (encoder == nil) {
+                            encoder = [command_buffer computeCommandEncoder];
+                        }
+
+                        [encoder setComputePipelineState:ctx->pipeline_silu];
+                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                        [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
+
+                        const int64_t n = ggml_nelements(dst);
+
+                        [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                    } break;
+                case GGML_UNARY_OP_RELU:
+                    {
+                        if (encoder == nil) {
+                            encoder = [command_buffer computeCommandEncoder];
+                        }
+
+                        [encoder setComputePipelineState:ctx->pipeline_relu];
+                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                        [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
+
+                        const int64_t n = ggml_nelements(dst);
+
+                        [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                    } break;
+                case GGML_UNARY_OP_GELU:
+                    {
+                        if (encoder == nil) {
+                            encoder = [command_buffer computeCommandEncoder];
+                        }
+
+                        [encoder setComputePipelineState:ctx->pipeline_gelu];
+                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                        [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
+
+                        const int64_t n = ggml_nelements(dst);
+
+                        [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                    } break;
+                default:
+                    {
+                        fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
+                        GGML_ASSERT(false);
+                    }
             } break;
-        case GGML_OP_GELU:
-            {
-                if (encoder == nil) {
-                    encoder = [command_buffer computeCommandEncoder];
-                }
-
-                [encoder setComputePipelineState:ctx->pipeline_gelu];
-                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
-
-                const int64_t n = ggml_nelements(dst);
-
-                [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-            } break;
 
         case GGML_OP_SOFT_MAX:
             {
@@ -979,8 +987,10 @@ void ggml_metal_graph_compute(
                 [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
             } break;
         default:
-            fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
-            GGML_ASSERT(false);
+            {
+                fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
+                GGML_ASSERT(false);
+            }
     }
 }
 
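
The Metal backend mirrors the same consolidation, switching on ggml_get_unary_op(gf->nodes[i]) inside a single GGML_OP_UNARY case. Unlike the CUDA path, which returns false so an unsupported op can fall back to the CPU, an unimplemented unary op here hits the nested default and asserts. The second hunk only wraps the outer default handler in braces to match the block style of the new nested one.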
