Skip to content

Commit a913ad9

Browse files
committed
Merge remote-tracking branch 'upstream/concedo'
2 parents 46fa845 + 93c4b2a commit a913ad9

File tree

8 files changed

+176
-97
lines changed

8 files changed

+176
-97
lines changed

Diff for: Makefile

+8-3
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,10 @@ endif
7575
ifeq ($(UNAME_S),Darwin)
7676
CFLAGS += -pthread
7777
CXXFLAGS += -pthread
78+
CLANG_VER = $(shell clang -v 2>&1 | head -n 1 | awk 'BEGIN {FS="[. ]"};{print $$1 $$2 $$4}')
79+
ifeq ($(CLANG_VER),Appleclang15)
80+
LDFLAGS += -ld_classic
81+
endif
7882
endif
7983
ifeq ($(UNAME_S),FreeBSD)
8084
CFLAGS += -pthread
@@ -116,7 +120,11 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
116120
FULLCFLAGS += -mavx2 -msse3 -mfma -mf16c -mavx
117121
else
118122
# if not on windows, they are clearly building it themselves, so lets just use whatever is supported
123+
ifdef LLAMA_COLAB
124+
CFLAGS += -mavx2 -msse3 -mfma -mf16c -mavx
125+
else
119126
CFLAGS += -march=native -mtune=native
127+
endif
120128
endif
121129
endif
122130
ifneq ($(filter ppc64%,$(UNAME_M)),)
@@ -205,9 +213,6 @@ ifdef LLAMA_HIPBLAS
205213
LLAMA_CUDA_MMV_Y ?= 1
206214
LLAMA_CUDA_KQUANTS_ITER ?= 2
207215
HIPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C)
208-
# ifdef LLAMA_CUDA_FORCE_DMMV
209-
# HIPFLAGS += -DGGML_CUDA_FORCE_DMMV
210-
# endif # LLAMA_CUDA_FORCE_DMMV
211216
HIPLDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib -lhipblas -lamdhip64 -lrocblas
212217
HIP_OBJS += ggml-cuda.o ggml_v2-cuda.o ggml_v2-cuda-legacy.o
213218
ggml-cuda.o: HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) \

Diff for: cmake/FindSIMD.cmake

+100
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
# Self-test C programs for the SIMD feature probes below.  Each snippet
# exercises an intrinsic from the instruction-set extension it is named
# after, so a successful compile-and-run of ${<type>_CODE} proves both
# compiler and host-CPU support for <type>.
include(CheckCSourceRuns)

set(AVX_CODE "
#include <immintrin.h>
int main()
{
    __m256 a;
    a = _mm256_set1_ps(0);
    return 0;
}
")

set(AVX512_CODE "
#include <immintrin.h>
int main()
{
    __m512i a = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
                                0, 0, 0, 0, 0, 0, 0, 0,
                                0, 0, 0, 0, 0, 0, 0, 0,
                                0, 0, 0, 0, 0, 0, 0, 0,
                                0, 0, 0, 0, 0, 0, 0, 0,
                                0, 0, 0, 0, 0, 0, 0, 0,
                                0, 0, 0, 0, 0, 0, 0, 0,
                                0, 0, 0, 0, 0, 0, 0, 0);
    __m512i b = a;
    __mmask64 equality_mask = _mm512_cmp_epi8_mask(a, b, _MM_CMPINT_EQ);
    return 0;
}
")

set(AVX2_CODE "
#include <immintrin.h>
int main()
{
    __m256i a = {0};
    a = _mm256_abs_epi16(a);
    __m256i x;
    _mm256_extract_epi64(x, 0); // we rely on this in our AVX2 code
    return 0;
}
")

set(FMA_CODE "
#include <immintrin.h>
int main()
{
    __m256 acc = _mm256_setzero_ps();
    const __m256 d = _mm256_setzero_ps();
    const __m256 p = _mm256_setzero_ps();
    acc = _mm256_fmadd_ps( d, p, acc );
    return 0;
}
")
55+
macro(check_sse type flags)
56+
set(__FLAG_I 1)
57+
set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
58+
foreach (__FLAG ${flags})
59+
if (NOT ${type}_FOUND)
60+
set(CMAKE_REQUIRED_FLAGS ${__FLAG})
61+
check_c_source_runs("${${type}_CODE}" HAS_${type}_${__FLAG_I})
62+
if (HAS_${type}_${__FLAG_I})
63+
set(${type}_FOUND TRUE CACHE BOOL "${type} support")
64+
set(${type}_FLAGS "${__FLAG}" CACHE STRING "${type} flags")
65+
endif()
66+
math(EXPR __FLAG_I "${__FLAG_I}+1")
67+
endif()
68+
endforeach()
69+
set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
70+
71+
if (NOT ${type}_FOUND)
72+
set(${type}_FOUND FALSE CACHE BOOL "${type} support")
73+
set(${type}_FLAGS "" CACHE STRING "${type} flags")
74+
endif()
75+
76+
mark_as_advanced(${type}_FOUND ${type}_FLAGS)
77+
endmacro()
78+
# Run the probes and translate the cached *_FOUND results into the
# LLAMA_* switches consumed by the main build.
#
# flags are for MSVC only!  On other compilers the leading " " entry
# (an effectively empty flag) is what actually gets exercised.
#
# NOTE: the *_FOUND names are tested directly — if() auto-dereferences
# variable names, so the previous `if (NOT ${AVX_FOUND})` form expanded
# the value twice (and would error out on an unset/empty variable).
check_sse("AVX" " ;/arch:AVX")
if (AVX_FOUND)
    set(LLAMA_AVX ON)
else()
    set(LLAMA_AVX OFF)
endif()

# AVX2 support is only claimed when FMA also works, since the AVX2 code
# paths use fused multiply-add.
check_sse("AVX2" " ;/arch:AVX2")
check_sse("FMA" " ;/arch:AVX2")
if (AVX2_FOUND AND FMA_FOUND)
    set(LLAMA_AVX2 ON)
else()
    set(LLAMA_AVX2 OFF)
endif()

check_sse("AVX512" " ;/arch:AVX512")
if (AVX512_FOUND)
    set(LLAMA_AVX512 ON)
else()
    set(LLAMA_AVX512 OFF)
endif()

Diff for: colab.ipynb

+5-2
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@
5151
"Model = \"https://huggingface.co/KoboldAI/LLaMA2-13B-Tiefighter-GGUF/resolve/main/LLaMA2-13B-Tiefighter.Q4_K_M.gguf\" #@param [\"https://huggingface.co/KoboldAI/LLaMA2-13B-Tiefighter-GGUF/resolve/main/LLaMA2-13B-Tiefighter.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/MythoMax-L2-13B-GGUF/resolve/main/mythomax-l2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/ReMM-SLERP-L2-13B-GGUF/resolve/main/remm-slerp-l2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/Xwin-LM-13B-v0.2-GGUF/resolve/main/xwin-lm-13b-v0.2.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/Stheno-L2-13B-GGUF/resolve/main/stheno-l2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF/resolve/main/mistral-7b-instruct-v0.1.Q4_K_S.gguf\"]{allow-input: true}\r\n",
5252
"Layers = 43 #@param [43]{allow-input: true}\r\n",
5353
"ContextSize = 4096 #@param [4096] {allow-input: true}\r\n",
54+
"ForceRebuild = False #@param {type:\"boolean\"}\r\n",
5455
"\r\n",
5556
"import os\r\n",
5657
"if not os.path.isfile(\"/opt/bin/nvidia-smi\"):\r\n",
@@ -61,13 +62,15 @@
6162
"%cd /content/koboldcpp\r\n",
6263
"kvers = !(cat koboldcpp.py | grep 'KcppVersion = ' | cut -d '\"' -f2)\r\n",
6364
"kvers = kvers[0]\r\n",
65+
"if ForceRebuild:\r\n",
66+
" kvers = \"force_rebuild\"\r\n",
6467
"!echo Finding prebuilt binary for {kvers}\r\n",
6568
"!wget -O dlfile.tmp https://kcppcolab.concedo.workers.dev/?{kvers} && mv dlfile.tmp koboldcpp_cublas.so\r\n",
6669
"!test -f koboldcpp_cublas.so && echo Prebuilt Binary Exists || echo Prebuilt Binary Does Not Exist\r\n",
67-
"!test -f koboldcpp_cublas.so && echo Build Skipped || make koboldcpp_cublas LLAMA_CUBLAS=1\r\n",
70+
"!test -f koboldcpp_cublas.so && echo Build Skipped || make koboldcpp_cublas LLAMA_CUBLAS=1 LLAMA_COLAB=1\r\n",
6871
"!cp koboldcpp_cublas.so koboldcpp_cublas.dat\r\n",
6972
"!apt install aria2 -y\r\n",
70-
"!aria2c -x 10 -o model.ggml --allow-overwrite=true --file-allocation=none $Model\r\n",
73+
"!aria2c -x 10 -o model.ggml --summary-interval=5 --download-result=default --allow-overwrite=true --file-allocation=none $Model\r\n",
7174
"!python koboldcpp.py model.ggml --usecublas 0 mmq --multiuser --gpulayers $Layers --contextsize $ContextSize --hordeconfig concedo 1 1 --remotetunnel\r\n"
7275
]
7376
}

0 commit comments

Comments (0)