@@ -10096,9 +10096,6 @@ struct llm_build_context {
                     cur = ggml_add (ctx0, ggml_mul_mat(ctx0, model.cls, inp), model.cls_b);
                     cur = ggml_tanh(ctx0, cur);
                     cur = ggml_add (ctx0, ggml_mul_mat(ctx0, model.cls_out, cur), model.cls_out_b);
-
-                    // broadcast across the embedding size to make it compatible with the llama_get_embeddings API
-                    cur = ggml_repeat(ctx0, cur, inp);
                 } break;
             default:
                 {
@@ -16831,7 +16828,6 @@ static int llama_decode_internal(
                 case LLAMA_POOLING_TYPE_MEAN:
                 case LLAMA_POOLING_TYPE_CLS:
                 case LLAMA_POOLING_TYPE_LAST:
-                case LLAMA_POOLING_TYPE_RANK:
                     {
                         // extract sequence embeddings (cleared before processing each batch)
                         auto & embd_seq_out = lctx.embd_seq;
@@ -16845,6 +16841,20 @@ static int llama_decode_internal(
                             ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float));
                         }
                     } break;
+                case LLAMA_POOLING_TYPE_RANK:
+                    {
+                        // extract the rank score - a single float per sequence
+                        auto & embd_seq_out = lctx.embd_seq;
+
+                        for (uint32_t s = 0; s < ubatch.n_seqs; ++s) {
+                            const llama_seq_id seq_id = ubatch.seq_id[s][0];
+                            if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
+                                continue;
+                            }
+                            embd_seq_out[seq_id].resize(1);
+                            ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (seq_id)*sizeof(float), sizeof(float));
+                        }
+                    } break;
                 case LLAMA_POOLING_TYPE_UNSPECIFIED:
                     {
                         GGML_ABORT("unknown pooling type");
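
For context, a minimal caller-side sketch of how the rank score produced by the LLAMA_POOLING_TYPE_RANK branch above could be read back through the public API. This sketch is not part of the commit: the model path, the query/document prompt format, and the variable names are illustrative assumptions; only the llama.h calls themselves (llama_tokenize, llama_batch_init, llama_decode, llama_get_embeddings_seq, ...) are existing API. With RANK pooling the per-sequence buffer holds a single float instead of an n_embd-sized embedding, so score[0] is the reranking score.

    // rank_score_sketch.cpp - hypothetical usage example, not part of this change
    #include "llama.h"

    #include <cstdio>
    #include <string>
    #include <vector>

    int main(int argc, char ** argv) {
        if (argc < 2) {
            fprintf(stderr, "usage: %s <reranker-model.gguf>\n", argv[0]);
            return 1;
        }

        llama_backend_init();

        llama_model_params mparams = llama_model_default_params();
        llama_model * model = llama_load_model_from_file(argv[1], mparams);
        if (model == nullptr) {
            return 1;
        }

        llama_context_params cparams = llama_context_default_params();
        cparams.embeddings   = true;                     // request pooled outputs instead of logits
        cparams.pooling_type = LLAMA_POOLING_TYPE_RANK;  // exercise the RANK extraction path
        llama_context * ctx = llama_new_context_with_model(model, cparams);
        if (ctx == nullptr) {
            return 1;
        }

        // a single query/document pair; the separator format is model-specific (assumption)
        const std::string text = "what is panda?</s><s>The giant panda is a bear native to China.";

        std::vector<llama_token> tokens(text.size() + 8);
        const int n_tokens = llama_tokenize(model, text.c_str(), (int) text.size(),
                                            tokens.data(), (int) tokens.size(),
                                            /*add_special=*/true, /*parse_special=*/true);
        if (n_tokens < 0) {
            return 1;
        }
        tokens.resize(n_tokens);

        // put the whole pair into sequence 0 and enable output for every token,
        // since the pooling code operates on the per-token outputs of the sequence
        llama_batch batch = llama_batch_init(n_tokens, 0, 1);
        for (int i = 0; i < n_tokens; ++i) {
            batch.token   [i]    = tokens[i];
            batch.pos     [i]    = i;
            batch.n_seq_id[i]    = 1;
            batch.seq_id  [i][0] = 0;
            batch.logits  [i]    = true;
        }
        batch.n_tokens = n_tokens;

        if (llama_decode(ctx, batch) != 0) {
            return 1;
        }

        // with LLAMA_POOLING_TYPE_RANK the per-sequence buffer holds a single float
        const float * score = llama_get_embeddings_seq(ctx, 0);
        if (score != nullptr) {
            printf("rank score: %.4f\n", score[0]);
        }

        llama_batch_free(batch);
        llama_free(ctx);
        llama_free_model(model);
        llama_backend_free();

        return 0;
    }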