
Commit af99c6f

llama : remove memory_f16 and kv_f16 flags
1 parent 4adb1d6 commit af99c6f

6 files changed (+0, -16 lines)


common/common.cpp (-6)

@@ -278,8 +278,6 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.yarn_beta_slow = std::stof(argv[i]);
-        } else if (arg == "--memory-f32") {
-            params.memory_f16 = false;
         } else if (arg == "--top-p") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -804,8 +802,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast);
     printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
     printf(" --no-penalize-nl do not penalize newline token\n");
-    printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
-    printf("              not recommended: doubles context memory required and no measurable increase in quality\n");
     printf(" --temp N temperature (default: %.1f)\n", (double)sparams.temp);
     printf(" --logits-all return logits for all tokens in the batch (default: disabled)\n");
     printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n");
@@ -948,7 +944,6 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
     cparams.mul_mat_q = params.mul_mat_q;
     cparams.seed = params.seed;
-    cparams.f16_kv = params.memory_f16;
     cparams.logits_all = params.logits_all;
     cparams.embedding = params.embedding;
     cparams.rope_scaling_type = params.rope_scaling_type;
@@ -1375,7 +1370,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     }
     fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
     fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
-    fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false");
     fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
     fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
     fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);

common/common.h (-1)

@@ -98,7 +98,6 @@ struct gpt_params {
     size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score

     bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS
-    bool memory_f16 = true; // use f16 instead of f32 for memory kv
     bool random_prompt = false; // do not randomize prompt if none provided
     bool use_color = false; // use color to distinguish generations and inputs
     bool interactive = false; // interactive mode

examples/quantize-stats/quantize-stats.cpp (-1)

@@ -321,7 +321,6 @@ int main(int argc, char ** argv) {
     auto cparams = llama_context_default_params();
     cparams.n_ctx = 256;
     cparams.seed = 1;
-    cparams.f16_kv = false;

     ctx = llama_new_context_with_model(model, cparams);
examples/server/server.cpp (-4)

@@ -2108,10 +2108,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
            }
            params.yarn_beta_slow = std::stof(argv[i]);
        }
-       else if (arg == "--memory-f32" || arg == "--memory_f32")
-       {
-           params.memory_f16 = false;
-       }
        else if (arg == "--threads" || arg == "-t")
        {
            if (++i >= argc)

llama.cpp (-3)

@@ -8583,7 +8583,6 @@ struct llama_context_params llama_context_default_params() {
         /*.type_k =*/ GGML_TYPE_F16,
         /*.type_v =*/ GGML_TYPE_F16,
         /*.mul_mat_q =*/ true,
-        /*.f16_kv =*/ true,
         /*.logits_all =*/ false,
         /*.embedding =*/ false,
         /*.offload_kqv =*/ true,
@@ -8737,8 +8736,6 @@ struct llama_context * llama_new_context_with_model(
     ctx->rng = std::mt19937(params.seed);
     ctx->logits_all = params.logits_all;

-    //const ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
-
     const ggml_type type_k = params.type_k;
     const ggml_type type_v = params.type_v;

llama.h (-1)

@@ -196,7 +196,6 @@ extern "C" {

        // Keep the booleans together to avoid misalignment during copy-by-value.
        bool mul_mat_q;   // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
-       bool f16_kv;      // use fp16 for KV cache, fp32 otherwise
        bool logits_all;  // the llama_eval() call computes all logits, not just the last one
        bool embedding;   // embedding mode only
        bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
