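tmp: allocate llm_build_llama input tensors inline via ggml_allocr_alloc; disable llama_build_graph_input call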

ggerganov · ggerganov · commit 66a54bfe2e91 · 2023-10-29T10:59:09.000+02:00
diff --git a/llama.cpp b/llama.cpp
@@ -3142,6 +3142,7 @@ static struct ggml_cgraph * llm_build_llama(
     if (batch.token) {
         struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
         ggml_set_name(inp_tokens, "inp_tokens");
+        ggml_allocr_alloc(lctx.alloc, inp_tokens);
 
         inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
     } else {
@@ -3156,19 +3157,23 @@ static struct ggml_cgraph * llm_build_llama(
     // KQ_scale
     struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
     ggml_set_name(KQ_scale, "KQ_scale");
+    ggml_allocr_alloc(lctx.alloc, KQ_scale);
 
     // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
     struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
     ggml_set_name(KQ_mask, "KQ_mask");
+    ggml_allocr_alloc(lctx.alloc, KQ_mask);
 
     // KQ_pos - contains the positions
     struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
     ggml_set_name(KQ_pos, "KQ_pos");
+    ggml_allocr_alloc(lctx.alloc, KQ_pos);
 
     // shift the entire K-cache if needed
     if (do_rope_shift) {
         struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
         ggml_set_name(K_shift, "K_shift");
+        ggml_allocr_alloc(lctx.alloc, K_shift);
 
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * tmp =
@@ -5523,7 +5528,7 @@ static struct ggml_cgraph * llama_build_graph(
     }
 
     // allocate memory and set the values for the input tensors of the graph
-    llama_build_graph_input(lctx, batch, result);
+    //llama_build_graph_input(lctx, batch, result);
 
     //auto t_start = std::chrono::high_resolution_clock::now();