From 0f3f883e246eee6613a3a540bf7f5982e5cfe7e6 Mon Sep 17 00:00:00 2001 From: Tai An Date: Sat, 9 May 2026 06:18:02 -0700 Subject: [PATCH] fix(embed): mark all tokens for output to suppress llama.cpp 'overriding' warning (#2208) --- llama_cpp/llama.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 752c25dd3..7712446e7 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1040,7 +1040,8 @@ def embed( # get pooling information pooling_type = self.pooling_type() - logits_all = pooling_type == llama_cpp.LLAMA_POOLING_TYPE_NONE + # All tokens need outputs for embeddings; llama.cpp otherwise logs an "overriding" warning per input. + logits_all = True if self.context_params.embeddings is False: raise RuntimeError(