Commit 62944df

Bugfix: Remove f16_kv, add offload_kqv field (#1019)
F16_KV appears to have been removed here: ggml-org/llama.cpp@af99c6f

This addresses two issues:

- #995, which requests adding the KV cache offloading param
- #1006, a NULL ptr exception when using the embeddings (introduced by leaving f16_kv in the fields struct)
1 parent 37da8e8 commit 62944df
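The #1006 crash comes down to struct layout: llama_cpp.py mirrors the C llama_context_params with a ctypes Structure, and ctypes computes each field's byte offset from the order of _fields_. Keeping the removed f16_kv entry shifts logits_all, embedding, and everything after them, so the Python side and the C library can disagree about which byte each name refers to. A minimal sketch with toy structs (not the real llama.cpp layout) showing the offset drift:

import ctypes

class CSideParams(ctypes.Structure):
    # Layout as the C library defines it after f16_kv was removed upstream
    _fields_ = [
        ("mul_mat_q", ctypes.c_bool),
        ("logits_all", ctypes.c_bool),
        ("embedding", ctypes.c_bool),
        ("offload_kqv", ctypes.c_bool),
    ]

class StaleMirror(ctypes.Structure):
    # Python-side mirror that still declares the removed f16_kv field
    _fields_ = [
        ("mul_mat_q", ctypes.c_bool),
        ("f16_kv", ctypes.c_bool),
        ("logits_all", ctypes.c_bool),
        ("embedding", ctypes.c_bool),
    ]

# Same field name, different byte offset: reads and writes land on the wrong member.
print(CSideParams.embedding.offset)   # 2
print(StaleMirror.embedding.offset)   # 3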

File tree

3 files changed: +3 -10 lines

llama_cpp/llama.py
llama_cpp/llama_cpp.py
llama_cpp/server/app.py

llama_cpp/llama.py

-5 lines changed

@@ -751,7 +751,6 @@ def __init__(
         yarn_beta_slow: float = 1.0,
         yarn_orig_ctx: int = 0,
         mul_mat_q: bool = True,
-        f16_kv: bool = True,
         logits_all: bool = False,
         embedding: bool = False,
         # Sampling Params
@@ -817,7 +816,6 @@ def __init__(
             yarn_beta_fast: YaRN low correction dim
             yarn_beta_slow: YaRN high correction dim
             yarn_orig_ctx: YaRN original context size
-            f16_kv: Use fp16 for KV cache, fp32 otherwise
             logits_all: Return logits for all tokens, not just the last token. Must be True for completion to return logprobs.
             embedding: Embedding mode only.
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
@@ -904,7 +902,6 @@ def __init__(
         )
         self.context_params.yarn_orig_ctx = yarn_orig_ctx if yarn_orig_ctx != 0 else 0
         self.context_params.mul_mat_q = mul_mat_q
-        # self.context_params.f16_kv = f16_kv
         self.context_params.logits_all = logits_all
         self.context_params.embedding = embedding

@@ -2155,7 +2152,6 @@ def __getstate__(self):
             yarn_beta_slow=self.context_params.yarn_beta_slow,
             yarn_orig_ctx=self.context_params.yarn_orig_ctx,
             mul_mat_q=self.context_params.mul_mat_q,
-            f16_kv=self.context_params.f16_kv,
             logits_all=self.context_params.logits_all,
             embedding=self.context_params.embedding,
             # Sampling Params
@@ -2198,7 +2194,6 @@ def __setstate__(self, state):
             yarn_beta_slow=state["yarn_beta_slow"],
             yarn_orig_ctx=state["yarn_orig_ctx"],
             mul_mat_q=state["mul_mat_q"],
-            f16_kv=state["f16_kv"],
             logits_all=state["logits_all"],
             embedding=state["embedding"],
             # Sampling Params
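The __getstate__/__setstate__ hunks matter because that pair is what pickling and copying a Llama instance go through: the dropped f16_kv key has to disappear from both sides so the saved state stays in step with the trimmed constructor. A small check, sketched under the assumption of a placeholder model path and an accessible model file:

from llama_cpp import Llama
import pickle

llm = Llama(model_path="./models/model.gguf")   # placeholder path
state = llm.__getstate__()
assert "f16_kv" not in state                    # key removed by this commit
clone = pickle.loads(pickle.dumps(llm))         # __setstate__ re-runs __init__, reloading the model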

llama_cpp/llama_cpp.py

+3 -3 lines changed

@@ -432,9 +432,9 @@ class llama_context_params(Structure):
         type_k (int): data type for K cache
         type_v (int): data type for V cache
         mul_mat_q (bool): if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
-        f16_kv (bool): use fp16 for KV cache, fp32 otherwise
         logits_all (bool): the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
-        embedding (bool): embedding mode only"""
+        embedding (bool): embedding mode only
+        offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU"""
     _fields_ = [
         ("seed", c_uint32),
         ("n_ctx", c_uint32),
@@ -452,9 +452,9 @@ class llama_context_params(Structure):
         ("type_k", c_int),
         ("type_v", c_int),
         ("mul_mat_q", c_bool),
-        ("f16_kv", c_bool),
         ("logits_all", c_bool),
         ("embedding", c_bool),
+        ("offload_kqv", c_bool),
     ]
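With the field list realigned, callers on the low-level bindings can toggle the new flag on the context params before creating a context. A rough sketch, assuming the low-level entry points as they existed around this commit (llama_model_default_params, llama_load_model_from_file, llama_context_default_params, llama_new_context_with_model) and a placeholder model path; backend init/cleanup and error handling are omitted:

import llama_cpp

model_params = llama_cpp.llama_model_default_params()
model = llama_cpp.llama_load_model_from_file(b"./models/model.gguf", model_params)

ctx_params = llama_cpp.llama_context_default_params()
ctx_params.offload_kqv = True   # field added by this commit
ctx_params.embedding = True     # no longer misread now that f16_kv is gone
ctx = llama_cpp.llama_new_context_with_model(model, ctx_params)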
llama_cpp/server/app.py

-2 lines changed

@@ -98,7 +98,6 @@ class Settings(BaseSettings):
     mul_mat_q: bool = Field(
         default=True, description="if true, use experimental mul_mat_q kernels"
     )
-    f16_kv: bool = Field(default=True, description="Whether to use f16 key/value.")
     logits_all: bool = Field(default=True, description="Whether to return logits.")
     embedding: bool = Field(default=True, description="Whether to use embeddings.")
     # Sampling Params
@@ -408,7 +407,6 @@ def create_app(settings: Optional[Settings] = None):
        yarn_beta_slow=settings.yarn_beta_slow,
        yarn_orig_ctx=settings.yarn_orig_ctx,
        mul_mat_q=settings.mul_mat_q,
-       f16_kv=settings.f16_kv,
        logits_all=settings.logits_all,
        embedding=settings.embedding,
        # Sampling Params
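On the high-level and server side nothing new is exposed yet; the visible effect is that the server's Settings drop f16_kv and the embeddings path from #1006 works again without any f16_kv plumbing. A quick sketch of that path, with a placeholder model path and assuming the usual OpenAI-style response shape from create_embedding:

from llama_cpp import Llama

# Usage that issue #1006 reported as raising a NULL ptr exception.
llm = Llama(model_path="./models/model.gguf", embedding=True)
emb = llm.create_embedding("The quick brown fox")
print(len(emb["data"][0]["embedding"]))   # dimensionality of the embedding vector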
