
Commit 3db70b5

Merge 'origin/master' into hipblas
2 parents 2ec4466 + 7568d1a commit 3db70b5

24 files changed: +924 -321 lines changed

.devops/tools.sh

+9 -9

@@ -10,13 +10,13 @@ shift
 # Join the remaining arguments into a single string
 arg2="$@"
 
-if [[ $arg1 == '--convert' || $arg1 == '-c' ]]; then
-    python3 ./convert.py $arg2
-elif [[ $arg1 == '--quantize' || $arg1 == '-q' ]]; then
-    ./quantize $arg2
-elif [[ $arg1 == '--run' || $arg1 == '-r' ]]; then
-    ./main $arg2
-elif [[ $arg1 == '--all-in-one' || $arg1 == '-a' ]]; then
+if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
+    python3 ./convert.py "$arg2"
+elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
+    ./quantize "$arg2"
+elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
+    ./main "$arg2"
+elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
     echo "Converting PTH to GGML..."
     for i in `ls $1/$2/ggml-model-f16.bin*`; do
         if [ -f "${i/f16/q4_0}" ]; then

@@ -26,8 +26,8 @@ elif [[ $arg1 == '--all-in-one' || $arg1 == '-a' ]]; then
             ./quantize "$i" "${i/f16/q4_0}" q4_0
         fi
     done
-elif [[ $arg1 == '--server' || $arg1 == '-s' ]]; then
-    ./server $arg2
+elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
+    ./server "$arg2"
 else
     echo "Unknown command: $arg1"
     echo "Available commands: "

Makefile

+12 -8

@@ -151,14 +151,11 @@ ifdef LLAMA_MPI
     CFLAGS += -DGGML_USE_MPI -Wno-cast-qual
     CXXFLAGS += -DGGML_USE_MPI -Wno-cast-qual
     OBJS += ggml-mpi.o
-
-ggml-mpi.o: ggml-mpi.c ggml-mpi.h
-    $(CC) $(CFLAGS) -c $< -o $@
 endif # LLAMA_MPI
 
 ifdef LLAMA_OPENBLAS
-    CFLAGS  += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas -I/usr/include/openblas
-    LDFLAGS += -lopenblas
+    CFLAGS  += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags openblas)
+    LDFLAGS += $(shell pkg-config --libs openblas)
 endif # LLAMA_OPENBLAS
 
 ifdef LLAMA_BLIS

@@ -247,9 +244,6 @@ ifdef LLAMA_METAL
     CXXFLAGS += -DGGML_USE_METAL
     LDFLAGS  += -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
     OBJS     += ggml-metal.o
-
-ggml-metal.o: ggml-metal.m ggml-metal.h
-    $(CC) $(CFLAGS) -c $< -o $@
 endif # LLAMA_METAL
 
 ifneq ($(filter aarch64%,$(UNAME_M)),)

@@ -274,6 +268,16 @@ ifneq ($(filter armv8%,$(UNAME_M)),)
     CFLAGS += -mfp16-format=ieee -mno-unaligned-access
 endif
 
+ifdef LLAMA_METAL
+ggml-metal.o: ggml-metal.m ggml-metal.h
+    $(CC) $(CFLAGS) -c $< -o $@
+endif # LLAMA_METAL
+
+ifdef LLAMA_MPI
+ggml-mpi.o: ggml-mpi.c ggml-mpi.h
+    $(CC) $(CFLAGS) -c $< -o $@
+endif # LLAMA_MPI
+
 ifdef LLAMA_NO_K_QUANTS
 k_quants.o: k_quants.c k_quants.h
     $(CC) $(CFLAGS) -c $< -o $@
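The OpenBLAS change above swaps hard-coded include paths for whatever `pkg-config` reports for the locally installed library. To see what the build will pick up on a given machine (output varies by distro; the values below are only examples):

```bash
# Requires an openblas .pc file to be installed (e.g. via a libopenblas dev package)
pkg-config --cflags openblas   # e.g. -I/usr/include/openblas
pkg-config --libs openblas     # e.g. -lopenblas
```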

README.md

+1 -1

@@ -640,7 +640,7 @@ Please verify the [sha256 checksums](SHA256SUMS) of all downloaded model files t
 
 ```bash
 # run the verification script
-python3 .\scripts\verify-checksum-models.py
+./scripts/verify-checksum-models.py
 ```
 
 - On linux or macOS it is also possible to run the following commands to verify if you have all possible latest files in your self-installed `./models` subdirectory:

build.zig

+21 -11

@@ -1,9 +1,19 @@
 const std = @import("std");
+const commit_hash = @embedFile(".git/refs/heads/master");
 
-// Zig Version: 0.11.0-dev.3379+629f0d23b
+// Zig Version: 0.11.0-dev.3986+e05c242cd
 pub fn build(b: *std.build.Builder) void {
     const target = b.standardTargetOptions(.{});
     const optimize = b.standardOptimizeOption(.{});
+
+    const config_header = b.addConfigHeader(
+        .{ .style = .blank, .include_path = "build-info.h" },
+        .{
+            .BUILD_NUMBER = 0,
+            .BUILD_COMMIT = commit_hash[0 .. commit_hash.len - 1], // omit newline
+        },
+    );
+
     const lib = b.addStaticLibrary(.{
         .name = "llama",
         .target = target,

@@ -13,24 +23,21 @@ pub fn build(b: *std.build.Builder) void {
     lib.linkLibCpp();
     lib.addIncludePath(".");
     lib.addIncludePath("./examples");
-    lib.addCSourceFiles(&.{
-        "ggml.c",
-    }, &.{"-std=c11"});
-    lib.addCSourceFiles(&.{
-        "llama.cpp",
-    }, &.{"-std=c++11"});
+    lib.addConfigHeader(config_header);
+    lib.addCSourceFiles(&.{"ggml.c"}, &.{"-std=c11"});
+    lib.addCSourceFiles(&.{"llama.cpp"}, &.{"-std=c++11"});
     b.installArtifact(lib);
 
     const examples = .{
         "main",
         "baby-llama",
         "embedding",
-        // "metal",
+        "metal",
         "perplexity",
         "quantize",
         "quantize-stats",
         "save-load-state",
-        // "server",
+        "server",
         "simple",
         "train-text-from-scratch",
     };

@@ -43,16 +50,19 @@ pub fn build(b: *std.build.Builder) void {
         });
         exe.addIncludePath(".");
         exe.addIncludePath("./examples");
+        exe.addConfigHeader(config_header);
         exe.addCSourceFiles(&.{
-            std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{example_name, example_name}),
+            std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{ example_name, example_name }),
             "examples/common.cpp",
         }, &.{"-std=c++11"});
         exe.linkLibrary(lib);
         b.installArtifact(exe);
+
         const run_cmd = b.addRunArtifact(exe);
         run_cmd.step.dependOn(b.getInstallStep());
         if (b.args) |args| run_cmd.addArgs(args);
-        const run_step = b.step("run_" ++ example_name, "Run the app");
+
+        const run_step = b.step("run-" ++ example_name, "Run the app");
         run_step.dependOn(&run_cmd.step);
     }
 }
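Since the per-example run step is renamed from `run_<name>` to `run-<name>`, running an example through the Zig build presumably looks like the following (model path and prompt are placeholders):

```bash
zig build                       # builds the library and all examples, now including metal and server
zig build run-main -- -m ./models/7B/ggml-model-q4_0.bin -p "Hello"
```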

examples/common.cpp

+18 -1

@@ -168,6 +168,18 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.n_ctx = std::stoi(argv[i]);
+        } else if (arg == "--rope-freq-base") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.rope_freq_base = std::stof(argv[i]);
+        } else if (arg == "--rope-freq-scale") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.rope_freq_scale = std::stof(argv[i]);
         } else if (arg == "--memory-f32") {
             params.memory_f16 = false;
         } else if (arg == "--top-p") {

@@ -285,6 +297,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
            }
            params.lora_adapter = argv[i];
+            params.use_mmap = false;
        } else if (arg == "--lora-base") {
            if (++i >= argc) {
                invalid_param = true;

@@ -492,6 +505,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, " --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
     fprintf(stderr, " --cfg-smooth-factor N smooth factor between old and new logits (default: %f, 1.0 = no smoothing)\n", params.cfg_smooth_factor);
     fprintf(stderr, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
+    fprintf(stderr, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
+    fprintf(stderr, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
     fprintf(stderr, " --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
     fprintf(stderr, " --no-penalize-nl do not penalize newline token\n");
     fprintf(stderr, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");

@@ -520,7 +535,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, " --mtest compute maximum memory usage\n");
     fprintf(stderr, " --export export the computation graph to 'llama.ggml'\n");
     fprintf(stderr, " --verbose-prompt print prompt before generation\n");
-    fprintf(stderr, " --lora FNAME apply LoRA adapter\n");
+    fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
     fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
     fprintf(stderr, " -m FNAME, --model FNAME\n");
     fprintf(stderr, " model path (default: %s)\n", params.model.c_str());

@@ -572,6 +587,8 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     lparams.use_mlock  = params.use_mlock;
     lparams.logits_all = params.perplexity;
     lparams.embedding  = params.embedding;
+    lparams.rope_freq_base  = params.rope_freq_base;
+    lparams.rope_freq_scale = params.rope_freq_scale;
 
     return lparams;
 }
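For context on the two new parameters: in rotary position embeddings the rotation angle of the i-th dimension pair at position p is derived from a base frequency, and the options added here adjust that angle roughly as sketched below (a standard description of RoPE frequency/linear scaling, not text from this diff):

$$\theta_i(p) = \texttt{rope\_freq\_scale} \cdot p \cdot \texttt{rope\_freq\_base}^{-2i/d}, \qquad i = 0, \dots, \tfrac{d}{2}-1$$

so lowering `rope_freq_scale` below 1.0 (position interpolation) or raising `rope_freq_base` above 10000 slows the rotation and lets a model attend over a longer context than it was trained on.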

examples/common.h

+2

@@ -32,6 +32,8 @@ struct gpt_params {
     int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
     float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
     int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
+    float rope_freq_base  = 10000.0f; // RoPE base frequency
+    float rope_freq_scale = 1.0f;     // RoPE frequency scaling factor
 
     // sampling parameters
     std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens

examples/embd-input/README.md

+1 -1

@@ -17,7 +17,7 @@ make
 import torch
 
 bin_path = "../LLaVA-13b-delta-v1-1/pytorch_model-00003-of-00003.bin"
-pth_path = "./examples/embd_input/llava_projection.pth"
+pth_path = "./examples/embd-input/llava_projection.pth"
 
 dic = torch.load(bin_path)
 used_key = ["model.mm_projector.weight","model.mm_projector.bias"]

examples/embd-input/llava.py

+1 -1

@@ -59,7 +59,7 @@ def chat_with_image(self, image, question):
 # Also here can use pytorch_model-00003-of-00003.bin directly.
 a.load_projection(os.path.join(
     os.path.dirname(__file__) ,
-    "llava_projetion.pth"))
+    "llava_projection.pth"))
 respose = a.chat_with_image(
     Image.open("./media/llama1-logo.png").convert('RGB'),
     "what is the text in the picture?")

examples/main/README.md

+1 -1

@@ -293,5 +293,5 @@ These options provide extra functionality and customization when running the LLa
 - `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
 - `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
 - `-lv, --low-vram`: Do not allocate a VRAM scratch buffer for holding temporary results. Reduces VRAM usage at the cost of performance, particularly prompt processing speed. Requires cuBLAS.
-- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model. This allows you to adapt the pretrained model to specific tasks or domains.
+- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
 - `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.

examples/main/main.cpp

+10 -2

@@ -84,9 +84,17 @@ int main(int argc, char ** argv) {
         return 0;
     }
 
+    if (params.rope_freq_base != 10000.0) {
+        fprintf(stderr, "%s: warning: changing RoPE frequency base to %g (default 10000.0)\n", __func__, params.rope_freq_base);
+    }
+
+    if (params.rope_freq_scale != 1.0) {
+        fprintf(stderr, "%s: warning: scaling RoPE frequency by %g (default 1.0)\n", __func__, params.rope_freq_scale);
+    }
+
     if (params.n_ctx > 2048) {
-        fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
-                "expect poor results\n", __func__, params.n_ctx);
+        fprintf(stderr, "%s: warning: base model only supports context sizes no greater than 2048 tokens (%d specified);"
+                " you are on your own\n", __func__, params.n_ctx);
     } else if (params.n_ctx < 8) {
         fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__);
         params.n_ctx = 8;
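With the warnings above in place, an extended-context run would pass the new flags alongside `-c`; for example (model path and scale value chosen only for illustration):

```bash
# Roughly double the usable context by halving the RoPE frequency scale
./main -m ./models/7B/ggml-model-q4_0.bin -c 4096 --rope-freq-scale 0.5 -p "Once upon a time"
```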

examples/server/README.md

+2 -1

@@ -16,7 +16,7 @@ Command line options:
 - `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended.
 - `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped.
 - `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed.
-- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model. This allows you to adapt the pretrained model to specific tasks or domains.
+- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
 - `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
 - `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`.
 - `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`.

@@ -66,6 +66,7 @@ Using [curl](https://curl.se/). On Windows `curl.exe` should be available in the
 ```sh
 curl --request POST \
     --url http://localhost:8080/completion \
+    --header "Content-Type: application/json" \
     --data '{"prompt": "Building a website can be done in 10 simple steps:","n_predict": 128}'
 ```
 

examples/server/chat.sh

+2

@@ -32,6 +32,7 @@ tokenize() {
         --silent \
         --request POST \
         --url "${API_URL}/tokenize" \
+        --header "Content-Type: application/json" \
         --data-raw "$(jq -ns --arg content "$1" '{content:$content}')" \
         | jq '.tokens[]'
 }

@@ -64,6 +65,7 @@ chat_completion() {
         --no-buffer \
         --request POST \
         --url "${API_URL}/completion" \
+        --header "Content-Type: application/json" \
         --data-raw "${DATA}")
 
     printf "\n"

examples/server/server.cpp

+20 -1

@@ -608,6 +608,8 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     fprintf(stderr, " -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
     fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
     fprintf(stderr, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
+    fprintf(stderr, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
+    fprintf(stderr, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
     fprintf(stderr, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
     fprintf(stderr, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
     fprintf(stderr, " not recommended: doubles context memory required and no measurable increase in quality\n");

@@ -632,7 +634,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
     fprintf(stderr, " -a ALIAS, --alias ALIAS\n");
     fprintf(stderr, " set an alias for the model, will be added as `model` field in completion response\n");
-    fprintf(stderr, " --lora FNAME apply LoRA adapter\n");
+    fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
     fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
     fprintf(stderr, " --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
     fprintf(stderr, " --port PORT port to listen (default (default: %d)\n", sparams.port);

@@ -722,6 +724,22 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
         }
         params.n_ctx = std::stoi(argv[i]);
     }
+    else if (arg == "--rope-freq-base")
+    {
+        if (++i >= argc) {
+            invalid_param = true;
+            break;
+        }
+        params.rope_freq_base = std::stof(argv[i]);
+    }
+    else if (arg == "--rope-freq-scale")
+    {
+        if (++i >= argc) {
+            invalid_param = true;
+            break;
+        }
+        params.rope_freq_scale = std::stof(argv[i]);
+    }
     else if (arg == "--memory-f32" || arg == "--memory_f32")
     {
         params.memory_f16 = false;

@@ -820,6 +838,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
             break;
         }
         params.lora_adapter = argv[i];
+        params.use_mmap = false;
     }
     else if (arg == "--lora-base")
     {
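The server accepts the same two RoPE flags, so an extended-context server could be started along these lines (model path and values are illustrative only):

```bash
./server -m ./models/7B/ggml-model-q4_0.bin -c 4096 --rope-freq-scale 0.5 --port 8080
```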

flake.nix

+4

@@ -43,6 +43,8 @@
           "-DLLAMA_METAL=ON"
         ]);
         installPhase = ''
+          runHook preInstall
+
           mkdir -p $out/bin
           mv bin/* $out/bin/
           mv $out/bin/main $out/bin/llama

@@ -51,6 +53,8 @@
           echo "#!${llama-python}/bin/python" > $out/bin/convert.py
           cat ${./convert.py} >> $out/bin/convert.py
           chmod +x $out/bin/convert.py
+
+          runHook postInstall
         '';
         meta.mainProgram = "llama";
       };
