llama : remove llama_kv_cache_view API + remove deprecated #13653


Merged: 1 commit, May 20, 2025
common/arg.cpp (9 changes: 1 addition & 8 deletions)
@@ -1452,7 +1452,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params) {
params.swa_full = true;
}
));
).set_env("LLAMA_ARG_SWA_FULL"));
add_opt(common_arg(
{"--no-context-shift"},
string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
@@ -2065,13 +2065,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.grp_attn_w = value;
}
).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_MAIN}));
add_opt(common_arg(
{"-dkvc", "--dump-kv-cache"},
"verbose print of the KV cache",
[](common_params & params) {
params.dump_kv_cache = true;
}
));
add_opt(common_arg(
{"-nkvo", "--no-kv-offload"},
"disable KV offload",
common/common.cpp (75 changes: 0 additions & 75 deletions)
@@ -1329,81 +1329,6 @@ std::string common_detokenize(const struct llama_vocab * vocab, const std::vecto
return text;
}

//
// KV cache utils
//

void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";

printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);

llama_kv_cache_view_cell * c_curr = view.cells;
llama_seq_id * cs_curr = view.cells_sequences;

for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
if (i % row_size == 0) {
printf("\n%5d: ", i);
}
int seq_count = 0;
for (int j = 0; j < view.n_seq_max; j++) {
if (cs_curr[j] >= 0) { seq_count++; }
}
putchar(slot_chars[std::min(sizeof(slot_chars) - 2, size_t(seq_count))]);
}

printf("\n=== Done dumping\n");
}

void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) {
static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";

printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);

std::unordered_map<llama_seq_id, size_t> seqs;
llama_kv_cache_view_cell * c_curr = view.cells;
llama_seq_id * cs_curr = view.cells_sequences;

for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
for (int j = 0; j < view.n_seq_max; j++) {
if (cs_curr[j] < 0) { continue; }
if (seqs.find(cs_curr[j]) == seqs.end()) {
if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
const size_t sz = seqs.size();
seqs[cs_curr[j]] = sz;
}
}
if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
}

printf("=== Sequence legend: ");
for (const auto & it : seqs) {
printf("%zu=%d, ", it.second, it.first);
}
printf("'+'=other sequence ids");

c_curr = view.cells;
cs_curr = view.cells_sequences;
for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
if (i % row_size == 0) {
printf("\n%5d: ", i);
}
for (int j = 0; j < view.n_seq_max; j++) {
if (cs_curr[j] >= 0) {
const auto & it = seqs.find(cs_curr[j]);
putchar(it != seqs.end() ? int(slot_chars[it->second]) : '+');
} else {
putchar('.');
}
}
putchar(' ');
}

printf("\n=== Done dumping\n");
}

//
// Embedding utils
//
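Note: the helpers deleted above rendered the cache as a character grid, one character per cell encoding how many sequences occupy it ('.' for empty, then digits and letters, '+' for overflow). A minimal, self-contained sketch of just that mapping, with hypothetical per-cell counts standing in for a real view:

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdio>

int main() {
    // Same encoding table the removed common_kv_cache_dump_view used:
    // '.' = empty cell, '1'-'9' and letters = that many sequences, '+' = more.
    static const char slot_chars[] =
        ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";

    // Hypothetical per-cell sequence counts (a real view reads these from the cache).
    const int seq_counts[] = {0, 1, 1, 2, 3, 0, 0, 70};

    for (int seq_count : seq_counts) {
        // sizeof(slot_chars) - 2 indexes '+' (sizeof includes the trailing '\0'),
        // so any count past the table clamps to the overflow character.
        std::putchar(slot_chars[std::min(sizeof(slot_chars) - 2,
                                         static_cast<std::size_t>(seq_count))]);
    }
    std::putchar('\n'); // prints ".1123..+"
    return 0;
}
```

The `_seqs` variant removed above used the same clamping idea but assigned one legend character per sequence id rather than per count, printing '+' for ids beyond the legend.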
common/common.h (11 changes: 0 additions & 11 deletions)
@@ -330,7 +330,6 @@ struct common_params {
bool use_mlock = false; // use mlock to keep model in memory
bool verbose_prompt = false; // print prompt tokens before generation
bool display_prompt = true; // print prompt before generation
bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
bool no_kv_offload = false; // disable KV offloading
bool warmup = true; // warmup run
bool check_tensors = false; // validate tensor data
@@ -622,16 +621,6 @@ std::string common_detokenize(
const std::vector<llama_token> & tokens,
bool special = true);

//
// KV cache utils
//

// Dump the KV cache view with the number of sequences per cell.
void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);

// Dump the KV cache view showing individual sequences in each cell (long output).
void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);

//
// Embedding utils
//
examples/lookahead/lookahead.cpp (13 changes: 0 additions & 13 deletions)
@@ -50,8 +50,6 @@ int main(int argc, char ** argv) {
const int N = 5; // n-gram size
const int G = 15; // max verification n-grams

const bool dump_kv_cache = params.dump_kv_cache;

// init llama.cpp
llama_backend_init();
llama_numa_init(params.numa);
@@ -152,9 +150,6 @@ int main(int argc, char ** argv) {
// here we keep adding new n-grams as we go
ngram_container ngrams_observed(llama_vocab_n_tokens(vocab), N, G);

// debug
struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, W + G + 1);

const auto t_dec_start = ggml_time_us();

// sample first token
@@ -172,12 +167,6 @@ int main(int argc, char ** argv) {
}

while (true) {
// debug
if (dump_kv_cache) {
llama_kv_cache_view_update(ctx, &kvc_view);
common_kv_cache_dump_view_seqs(kvc_view, 40);
}

// build the mask from https://lmsys.org/blog/2023-11-21-lookahead-decoding/
//
// Example for W = 5, N = 4, G = 2:
@@ -473,8 +462,6 @@ int main(int argc, char ** argv) {

common_sampler_free(smpl);

llama_kv_cache_view_free(&kvc_view);

llama_batch_free(batch);

llama_backend_free();
examples/lookup/lookup.cpp (11 changes: 0 additions & 11 deletions)
@@ -24,8 +24,6 @@ int main(int argc, char ** argv){
// max. number of additional tokens to draft if match is found
const int n_draft = params.speculative.n_max;

const bool dump_kv_cache = params.dump_kv_cache;

// init llama.cpp
llama_backend_init();
llama_numa_init(params.numa);
@@ -110,18 +108,9 @@ int main(int argc, char ** argv){

llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, 1);

// debug
struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, 1);

const auto t_dec_start = ggml_time_us();

while (true) {
// debug
if (dump_kv_cache) {
llama_kv_cache_view_update(ctx, &kvc_view);
common_kv_cache_dump_view_seqs(kvc_view, 40);
}

// print current draft sequence
LOG_DBG("drafted %s\n", string_from(ctx, draft).c_str());

examples/parallel/parallel.cpp (9 changes: 0 additions & 9 deletions)
@@ -178,8 +178,6 @@ int main(int argc, char ** argv) {
// insert new requests as soon as the previous one is done
const bool cont_batching = params.cont_batching;

const bool dump_kv_cache = params.dump_kv_cache;

// is the system prompt shared in the cache
const bool is_sp_shared = params.is_pp_shared;

@@ -241,8 +239,6 @@ int main(int argc, char ** argv) {
int32_t n_total_gen = 0;
int32_t n_cache_miss = 0;

struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, n_clients);

const auto t_main_start = ggml_time_us();

LOG_INF("%s: Simulating parallel requests from clients:\n", __func__);
@@ -272,11 +268,6 @@ int main(int argc, char ** argv) {
LOG_INF("Processing requests ...\n\n");

while (true) {
if (dump_kv_cache) {
llama_kv_cache_view_update(ctx, &kvc_view);
common_kv_cache_dump_view_seqs(kvc_view, 40);
}

common_batch_clear(batch);

// decode any currently ongoing sequences
include/llama.h (114 changes: 0 additions & 114 deletions)
@@ -608,72 +608,13 @@ extern "C" {
// KV cache
//

// TODO: start using struct llama_kv_cache

// Information associated with an individual cell in the KV cache view.
struct llama_kv_cache_view_cell {
// The position for this cell. Takes KV cache shifts into account.
// May be negative if the cell is not populated.
llama_pos pos;
};

// An updateable view of the KV cache.
struct llama_kv_cache_view {
// Number of KV cache cells. This will be the same as the context size.
int32_t n_cells;

// Maximum number of sequences that can exist in a cell. It's not an error
// if there are more sequences in a cell than this value, however they will
// not be visible in the view cells_sequences.
int32_t n_seq_max;

// Number of tokens in the cache. For example, if there are two populated
// cells, the first with 1 sequence id in it and the second with 2 sequence
// ids then you'll have 3 tokens.
int32_t token_count;

// Number of populated cache cells.
int32_t used_cells;

// Maximum contiguous empty slots in the cache.
int32_t max_contiguous;

// Index to the start of the max_contiguous slot range. Can be negative
// when cache is full.
int32_t max_contiguous_idx;

// Information for an individual cell.
struct llama_kv_cache_view_cell * cells;

// The sequences for each cell. There will be n_seq_max items per cell.
llama_seq_id * cells_sequences;
};

// Create an empty KV cache view. (use only for debugging purposes)
LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max);

// Free a KV cache view. (use only for debugging purposes)
LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);

// Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
// TODO: change signature to llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct llama_context * ctx)
LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);
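Note: for reference, the debugging workflow this view API supported, reconstructed from the code removed from the examples above (`ctx`, `n_clients`, and `dump_kv_cache` come from examples/parallel/parallel.cpp; this is a sketch of the removed usage, not code that compiles after this PR):

```cpp
// Reconstructed from the debug code this PR removes from the examples.
struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, n_clients);

while (true) {
    if (dump_kv_cache) {
        llama_kv_cache_view_update(ctx, &kvc_view);   // refresh the snapshot
        common_kv_cache_dump_view_seqs(kvc_view, 40); // print 40 cells per row
    }
    // ... decode / sample as in the example ...
}

llama_kv_cache_view_free(&kvc_view);
```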

///

// Returns the number of tokens in the KV cache (slow, use only for debug)
// If a KV cell has multiple sequences assigned to it, it will be counted multiple times
LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx);

DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx),
"use llama_kv_self_n_tokens instead");

// Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx);

DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx),
"use llama_kv_self_used_cells instead");

// Clear the KV cache - both cell info is erased and KV data is zeroed
LLAMA_API void llama_kv_self_clear(
struct llama_context * ctx);
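Note: the two counters kept above differ exactly as their comments describe: a cell holding k sequence ids contributes k to the token count but only 1 to the used-cell count. A minimal sketch, assuming an initialized `llama_context * ctx`:

```cpp
// Debug-only occupancy queries (both documented above as slow).
const int32_t n_tokens = llama_kv_self_n_tokens(ctx);   // one per (cell, sequence id) pair
const int32_t n_used   = llama_kv_self_used_cells(ctx); // each populated cell counted once

// With two populated cells holding 1 and 2 sequence ids (the example from the
// removed view struct's docs), this would print: tokens=3, used_cells=2.
printf("tokens=%d, used_cells=%d\n", n_tokens, n_used);
```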
@@ -756,61 +697,6 @@ extern "C" {
// Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
LLAMA_API void llama_kv_self_update(struct llama_context * ctx);

DEPRECATED(LLAMA_API void llama_kv_cache_clear(
struct llama_context * ctx),
"use llama_kv_self_clear instead");

DEPRECATED(LLAMA_API bool llama_kv_cache_seq_rm(
struct llama_context * ctx,
llama_seq_id seq_id,
llama_pos p0,
llama_pos p1),
"use llama_kv_self_seq_rm instead");

DEPRECATED(LLAMA_API void llama_kv_cache_seq_cp(
struct llama_context * ctx,
llama_seq_id seq_id_src,
llama_seq_id seq_id_dst,
llama_pos p0,
llama_pos p1),
"use llama_kv_self_seq_cp instead");

DEPRECATED(LLAMA_API void llama_kv_cache_seq_keep(
struct llama_context * ctx,
llama_seq_id seq_id),
"use llama_kv_self_seq_keep instead");

DEPRECATED(LLAMA_API void llama_kv_cache_seq_add(
struct llama_context * ctx,
llama_seq_id seq_id,
llama_pos p0,
llama_pos p1,
llama_pos delta),
"use llama_kv_self_seq_add instead");

DEPRECATED(LLAMA_API void llama_kv_cache_seq_div(
struct llama_context * ctx,
llama_seq_id seq_id,
llama_pos p0,
llama_pos p1,
int d),
"use llama_kv_self_seq_div instead");

DEPRECATED(LLAMA_API llama_pos llama_kv_cache_seq_pos_max(
struct llama_context * ctx,
llama_seq_id seq_id),
"use llama_kv_self_seq_pos_max instead");

DEPRECATED(LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx),
"use llama_kv_self_defrag instead");

DEPRECATED(LLAMA_API bool llama_kv_cache_can_shift(const struct llama_context * ctx),
"use llama_kv_self_can_shift instead");

DEPRECATED(LLAMA_API void llama_kv_cache_update(struct llama_context * ctx),
"use llama_kv_self_update instead");

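Note: migration off the wrappers deleted above is mechanical: each deprecated `llama_kv_cache_*` entry point maps one-to-one to the `llama_kv_self_*` function named in its deprecation message, with identical arguments. A before/after sketch, assuming an initialized `ctx`, a valid `seq_id`, and positions `p0`/`p1`:

```cpp
// Before this PR (deprecated wrappers, now removed):
//   llama_kv_cache_clear(ctx);
//   llama_kv_cache_seq_rm(ctx, seq_id, p0, p1);
//   llama_kv_cache_defrag(ctx);
//   llama_kv_cache_update(ctx);

// After (the llama_kv_self_* API this header keeps):
llama_kv_self_clear(ctx);                  // erase cell info and zero KV data
llama_kv_self_seq_rm(ctx, seq_id, p0, p1); // same arguments as the removed wrapper
llama_kv_self_defrag(ctx);
llama_kv_self_update(ctx);                 // apply K-shifts, defragmentation, etc.
```

The same one-to-one renaming applies to the `seq_cp`, `seq_keep`, `seq_add`, `seq_div`, `seq_pos_max`, and `can_shift` wrappers deleted above.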

//
// State / sessions
//