@@ -3142,6 +3142,7 @@ static struct ggml_cgraph * llm_build_llama(
3142
3142
if (batch.token ) {
3143
3143
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d (ctx0, GGML_TYPE_I32, n_tokens);
3144
3144
ggml_set_name (inp_tokens, " inp_tokens" );
3145
+ ggml_allocr_alloc (lctx.alloc , inp_tokens);
3145
3146
3146
3147
inpL = ggml_get_rows (ctx0, model.tok_embeddings , inp_tokens);
3147
3148
} else {
@@ -3156,19 +3157,23 @@ static struct ggml_cgraph * llm_build_llama(
3156
3157
// KQ_scale
3157
3158
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d (ctx0, GGML_TYPE_F32, 1 );
3158
3159
ggml_set_name (KQ_scale, " KQ_scale" );
3160
+ ggml_allocr_alloc (lctx.alloc , KQ_scale);
3159
3161
3160
3162
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
3161
3163
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d (ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1 );
3162
3164
ggml_set_name (KQ_mask, " KQ_mask" );
3165
+ ggml_allocr_alloc (lctx.alloc , KQ_mask);
3163
3166
3164
3167
// KQ_pos - contains the positions
3165
3168
struct ggml_tensor * KQ_pos = ggml_new_tensor_1d (ctx0, GGML_TYPE_I32, n_tokens);
3166
3169
ggml_set_name (KQ_pos, " KQ_pos" );
3170
+ ggml_allocr_alloc (lctx.alloc , KQ_pos);
3167
3171
3168
3172
// shift the entire K-cache if needed
3169
3173
if (do_rope_shift) {
3170
3174
struct ggml_tensor * K_shift = ggml_new_tensor_1d (ctx0, GGML_TYPE_I32, n_ctx);
3171
3175
ggml_set_name (K_shift, " K_shift" );
3176
+ ggml_allocr_alloc (lctx.alloc , K_shift);
3172
3177
3173
3178
for (int il = 0 ; il < n_layer; ++il) {
3174
3179
struct ggml_tensor * tmp =
@@ -5523,7 +5528,7 @@ static struct ggml_cgraph * llama_build_graph(
5523
5528
}
5524
5529
5525
5530
// allocate memory and set the values for the input tensors of the graph
5526
- llama_build_graph_input (lctx, batch, result);
5531
+ // llama_build_graph_input(lctx, batch, result);
5527
5532
5528
5533
// auto t_start = std::chrono::high_resolution_clock::now();
5529
5534
0 commit comments