
Commit 78200a9
Merge branch 'master' into HEAD
2 parents: 3374ff7 + f93af02

50 files changed: +4134 -1028 lines

.dockerignore (+3)

@@ -1,6 +1,9 @@
 *.o
 *.a
 .cache/
+.git/
+.github/
+.gitignore
 .vs/
 .vscode/
 .DS_Store

.github/workflows/build.yml (+8 -8)

@@ -188,7 +188,7 @@ jobs:
           sysctl -a
           mkdir build
           cd build
-          cmake -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF ..
+          cmake ..
           cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)

       - name: Test
@@ -265,17 +265,17 @@ jobs:
       matrix:
         include:
           - build: 'noavx'
-            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON'
+            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON'
           - build: 'avx2'
-            defines: '-DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
           - build: 'avx'
-            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
+            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
           - build: 'avx512'
-            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
           - build: 'clblast'
-            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
+            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
           - build: 'openblas'
-            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
+            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'

     steps:
       - name: Clone
@@ -414,7 +414,7 @@ jobs:
       run: |
         mkdir build
         cd build
-        cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON -DBUILD_SHARED_LIBS=ON
+        cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON -DBUILD_SHARED_LIBS=ON
        cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}

      - name: Determine tag name
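
Note: because LLAMA_NATIVE now defaults to ON in CMake (see CMakeLists.txt below), every Windows matrix entry passes -DLLAMA_NATIVE=OFF so the published binaries only use the instruction set their name advertises. A rough local equivalent of the 'avx2' entry, as a hedged sketch (working directory and job count are assumptions, not part of the workflow):

    mkdir build
    cd build
    # portable build: no -march=native, default AVX2/FMA/F16C code paths
    cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON
    cmake --build . --config Release -j 4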

.gitignore (+3 -1)

@@ -40,6 +40,7 @@ models-mnt
 /embedding
 /gguf
 /gguf-llama-simple
+/infill
 /libllama.so
 /llama-bench
 /main
@@ -90,4 +91,5 @@ tests/test-quantize-perf
 tests/test-sampling
 tests/test-tokenizer-0-llama
 tests/test-tokenizer-0-falcon
-tests/test-tokenizer-1
+tests/test-tokenizer-1-llama
+tests/test-tokenizer-1-bpe

CMakeLists.txt (+43 -23)

@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.12) # Don't bump this version for no reason
+cmake_minimum_required(VERSION 3.13) # for add_link_options
 project("llama.cpp" C CXX)

 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@@ -44,7 +44,7 @@ endif()

 # general
 option(LLAMA_STATIC "llama: static link libraries" OFF)
-option(LLAMA_NATIVE "llama: enable -march=native flag" OFF)
+option(LLAMA_NATIVE "llama: enable -march=native flag" ON)
 option(LLAMA_LTO "llama: enable link time optimization" OFF)

 # debug
@@ -58,15 +58,21 @@ option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer"
 option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)

 # instruction set specific
-option(LLAMA_AVX "llama: enable AVX" ON)
-option(LLAMA_AVX2 "llama: enable AVX2" ON)
-option(LLAMA_AVX512 "llama: enable AVX512" OFF)
-option(LLAMA_AVX512_VBMI "llama: enable AVX512-VBMI" OFF)
-option(LLAMA_AVX512_VNNI "llama: enable AVX512-VNNI" OFF)
-option(LLAMA_FMA "llama: enable FMA" ON)
+if (LLAMA_NATIVE)
+    set(INS_ENB OFF)
+else()
+    set(INS_ENB ON)
+endif()
+
+option(LLAMA_AVX "llama: enable AVX" ${INS_ENB})
+option(LLAMA_AVX2 "llama: enable AVX2" ${INS_ENB})
+option(LLAMA_AVX512 "llama: enable AVX512" OFF)
+option(LLAMA_AVX512_VBMI "llama: enable AVX512-VBMI" OFF)
+option(LLAMA_AVX512_VNNI "llama: enable AVX512-VNNI" OFF)
+option(LLAMA_FMA "llama: enable FMA" ${INS_ENB})
 # in MSVC F16C is implied with AVX2/AVX512
 if (NOT MSVC)
-    option(LLAMA_F16C "llama: enable F16C" ON)
+    option(LLAMA_F16C "llama: enable F16C" ${INS_ENB})
 endif()

 # 3rd party libs
@@ -343,8 +349,9 @@ if (LLAMA_MPI)
     set(GGML_SOURCES_MPI ggml-mpi.c ggml-mpi.h)
     add_compile_definitions(GGML_USE_MPI)
     add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS})
-    set(cxx_flags ${cxx_flags} -Wno-cast-qual)
-    set(c_flags ${c_flags} -Wno-cast-qual)
+    if (NOT MSVC)
+        add_compile_options(-Wno-cast-qual)
+    endif()
     set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${MPI_C_LIBRARIES})
     set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${MPI_C_INCLUDE_DIRS})
     # Even if you're only using the C header, C++ programs may bring in MPI
@@ -418,10 +425,11 @@ if (LLAMA_ALL_WARNINGS)
         set(c_flags -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int
                     -Werror=implicit-function-declaration)
         set(cxx_flags -Wmissing-declarations -Wmissing-noreturn)
+        set(host_cxx_flags "")

         if (CMAKE_C_COMPILER_ID MATCHES "Clang")
             set(warning_flags ${warning_flags} -Wunreachable-code-break -Wunreachable-code-return)
-            set(cxx_flags ${cxx_flags} -Wmissing-prototypes -Wextra-semi)
+            set(host_cxx_flags ${host_cxx_flags} -Wmissing-prototypes -Wextra-semi)

             if (
                 (CMAKE_C_COMPILER_ID STREQUAL "Clang" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 3.8.0) OR
@@ -431,27 +439,38 @@ if (LLAMA_ALL_WARNINGS)
             endif()
         elseif (CMAKE_C_COMPILER_ID STREQUAL "GNU")
             set(c_flags ${c_flags} -Wdouble-promotion)
-            set(cxx_flags ${cxx_flags} -Wno-array-bounds)
+            set(host_cxx_flags ${host_cxx_flags} -Wno-array-bounds)

             if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 7.1.0)
-                set(cxx_flags ${cxx_flags} -Wno-format-truncation)
+                set(host_cxx_flags ${host_cxx_flags} -Wno-format-truncation)
             endif()
             if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8.1.0)
-                set(cxx_flags ${cxx_flags} -Wextra-semi)
+                set(host_cxx_flags ${host_cxx_flags} -Wextra-semi)
             endif()
         endif()
     else()
         # todo : msvc
     endif()

-    add_compile_options(
-        ${warning_flags}
-        "$<$<COMPILE_LANGUAGE:C>:${c_flags}>"
-        "$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags}>"
-    )
+    set(c_flags ${c_flags} ${warning_flags})
+    set(cxx_flags ${cxx_flags} ${warning_flags})
+    add_compile_options("$<$<COMPILE_LANGUAGE:C>:${c_flags}>"
+                        "$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags} ${host_cxx_flags}>")

 endif()

+if (NOT MSVC)
+    set(cuda_flags -Wno-pedantic)
+endif()
+set(cuda_flags ${cxx_flags} -use_fast_math ${cuda_flags})
+
+list(JOIN host_cxx_flags " " cuda_host_flags) # pass host compiler flags as a single argument
+if (NOT cuda_host_flags STREQUAL "")
+    set(cuda_flags ${cuda_flags} -Xcompiler ${cuda_host_flags})
+endif()
+
+add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${cuda_flags}>")
+
 if (WIN32)
     add_compile_definitions(_CRT_SECURE_NO_WARNINGS)

@@ -491,9 +510,6 @@ if (NOT MSVC)
     if (LLAMA_GPROF)
         add_compile_options(-pg)
     endif()
-    if (LLAMA_NATIVE)
-        add_compile_options(-march=native)
-    endif()
 endif()

 if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") OR ("${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "arm64"))
@@ -548,6 +564,9 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GE
         add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX>)
     endif()
 else()
+    if (LLAMA_NATIVE)
+        add_compile_options(-march=native)
+    endif()
     if (LLAMA_F16C)
         add_compile_options(-mf16c)
     endif()
@@ -705,6 +724,7 @@ set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR}
 set(LLAMA_BUILD_NUMBER ${BUILD_NUMBER})
 set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT})
 set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER})
+get_directory_property(LLAMA_TRANSIENT_DEFINES COMPILE_DEFINITIONS)

 configure_package_config_file(
     ${CMAKE_CURRENT_SOURCE_DIR}/scripts/LlamaConfig.cmake.in
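
Note: with LLAMA_NATIVE defaulting to ON, the per-instruction-set options (LLAMA_AVX, LLAMA_AVX2, LLAMA_FMA, LLAMA_F16C) now default to ON only when native tuning is disabled, via the new INS_ENB helper, and -march=native is applied in the architecture-specific branch instead of unconditionally on non-MSVC builds. A minimal configure sketch (build directory layout assumed):

    # default: tune for the build machine (-march=native on non-MSVC compilers)
    mkdir build && cd build
    cmake ..

    # portable build: disable native tuning, then pick instruction sets explicitly
    cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_AVX512=ON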

Makefile (+11 -3)

@@ -1,8 +1,8 @@
 # Define the default target now so that it is always the first target
-BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml simple batched save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative benchmark-matmult parallel finetune export-lora tests/test-c.o
+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml simple batched save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative infill benchmark-matmult parallel finetune export-lora tests/test-c.o

 # Binaries only useful for tests
-TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama
+TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe

 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -62,9 +62,11 @@ test: $(TEST_TARGETS)
		if [ "$$test_target" = "tests/test-tokenizer-0-llama" ]; then \
			./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \
		elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \
-			continue; \
+			./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
		elif [ "$$test_target" = "tests/test-tokenizer-1-llama" ]; then \
			continue; \
+		elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \
+			continue; \
		else \
			echo "Running test $$test_target..."; \
			./$$test_target; \
@@ -543,6 +545,9 @@ main: examples/main/main.cpp build-info.h ggml.
	@echo '==== Run ./main -h for help. ===='
	@echo

+infill: examples/infill/infill.cpp build-info.h ggml.o llama.o common.o console.o grammar-parser.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
 simple: examples/simple/simple.cpp build-info.h ggml.o llama.o common.o $(OBJS)
	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

@@ -667,6 +672,9 @@ tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp build-info.h gg
 tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp build-info.h ggml.o llama.o common.o $(OBJS)
	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

+tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
 tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp build-info.h ggml.o llama.o common.o $(OBJS)
	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

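
Note: the Makefile gains the new infill example and the split tokenizer-1 tests, and `make test` now runs the Falcon tokenizer test against models/ggml-vocab-falcon.gguf instead of skipping it. A minimal sketch, assuming the vocab files are already present under models/:

    # build just the new fill-in-the-middle example
    make infill

    # run the test targets (tokenizer-1-llama and tokenizer-1-bpe are still skipped here)
    make test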

README.md (+6 -2)

@@ -11,7 +11,8 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++

 ### Hot topics

-- Parallel decoding + continuous batching support incoming: [#3228](https://github.com/ggerganov/llama.cpp/pull/3228) \
+- ‼️ Breaking change: `rope_freq_base` and `rope_freq_scale` must be set to zero to use the model default values: [#3401](https://github.com/ggerganov/llama.cpp/pull/3401)
+- Parallel decoding + continuous batching support added: [#3228](https://github.com/ggerganov/llama.cpp/pull/3228) \
   **Devs should become familiar with the new API**
 - Local Falcon 180B inference on Mac Studio

@@ -92,7 +93,8 @@ as the main playground for developing new features for the [ggml](https://github
 - [X] [WizardLM](https://github.com/nlpxucan/WizardLM)
 - [X] [Baichuan-7B](https://huggingface.co/baichuan-inc/baichuan-7B) and its derivations (such as [baichuan-7b-sft](https://huggingface.co/hiyouga/baichuan-7b-sft))
 - [X] [Aquila-7B](https://huggingface.co/BAAI/Aquila-7B) / [AquilaChat-7B](https://huggingface.co/BAAI/AquilaChat-7B)
-- [X] Mistral AI v0.1
+- [X] [Starcoder models](https://github.com/ggerganov/llama.cpp/pull/3187)
+- [X] [Mistral AI v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)

 **Bindings:**

@@ -662,6 +664,8 @@ PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \

 The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md).

+For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets you write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community, please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one.
+
 ### Instruction mode with Alpaca

 1. First, download the `ggml` Alpaca model into the `./models` folder
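
Note on the `rope_freq_base` / `rope_freq_scale` breaking change (#3401): a value of zero now means "use the values stored in the model's GGUF metadata". A hedged example using the existing CLI options (the model path is a placeholder):

    # let the model's metadata decide the RoPE parameters
    ./main -m models/your-model.gguf --rope-freq-base 0 --rope-freq-scale 0 -p "Hello"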

common/common.cpp (+3)

@@ -389,6 +389,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         params.interactive_first = true;
     } else if (arg == "-ins" || arg == "--instruct") {
         params.instruct = true;
+    } else if (arg == "--infill") {
+        params.infill = true;
     } else if (arg == "--multiline-input") {
         params.multiline_input = true;
     } else if (arg == "--simple-io") {
@@ -921,6 +923,7 @@ std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_to
         result += piece;
     }

+    // NOTE: the original tokenizer decodes bytes after collecting the pieces.
     return result;
 }


common/common.h (+1)

@@ -120,6 +120,7 @@ struct gpt_params {
     bool use_mlock = false; // use mlock to keep model in memory
     bool numa = false; // attempt optimizations that help on some NUMA systems
     bool verbose_prompt = false; // print prompt tokens before generation
+    bool infill = false; // use infill mode
 };

 bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
