@@ -4076,8 +4076,6 @@ static struct ggml_cgraph * llm_build_mpt(
4076
4076
const int32_t n_kv = ggml_allocr_is_measure (lctx.alloc ) ? n_ctx : kv_self.n ;
4077
4077
const int32_t kv_head = ggml_allocr_is_measure (lctx.alloc ) ? n_ctx - n_tokens : kv_self.head ;
4078
4078
4079
- const bool do_rope_shift = ggml_allocr_is_measure (lctx.alloc ) || kv_self.has_shift ;
4080
-
4081
4079
// printf("kv_head = %d, n_kv = %d, n_tokens = %d, n_ctx = %d, is_measure = %d, has_shift = %d\n",
4082
4080
// kv_head, n_kv, n_tokens, n_ctx, ggml_allocr_is_measure(lctx.alloc), kv_self.has_shift);
4083
4081
@@ -4176,34 +4174,6 @@ static struct ggml_cgraph * llm_build_mpt(
4176
4174
}
4177
4175
}
4178
4176
4179
- // shift the entire K-cache if needed
4180
- // TODO: Do we need to handle it? (MPT uses alibi instead of rope)
4181
- /* if (do_rope_shift) {
4182
- struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
4183
- offload_func_kq(K_shift);
4184
- ggml_set_name(K_shift, "K_shift");
4185
- ggml_allocr_alloc(lctx.alloc, K_shift);
4186
- if (!ggml_allocr_is_measure(lctx.alloc)) {
4187
- int * data = (int *) K_shift->data;
4188
- for (int i = 0; i < n_ctx; ++i) {
4189
- data[i] = kv_self.cells[i].delta;
4190
- }
4191
- }
4192
-
4193
- for (int il = 0; il < n_layer; ++il) {
4194
- struct ggml_tensor * tmp =
4195
- ggml_rope_custom_inplace(ctx0,
4196
- ggml_view_3d(ctx0, kv_self.k,
4197
- n_embd_head, n_head_kv, n_ctx,
4198
- ggml_element_size(kv_self.k)*n_embd_head,
4199
- ggml_element_size(kv_self.k)*n_embd_gqa,
4200
- ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
4201
- K_shift, n_embd_head, 2, 0, freq_base, freq_scale);
4202
- offload_func_kq(tmp);
4203
- ggml_build_forward_expand(gf, tmp);
4204
- }
4205
- }*/
4206
-
4207
4177
for (int il = 0 ; il < n_layer; ++il) {
4208
4178
struct ggml_tensor * attn_norm;
4209
4179
@@ -4306,7 +4276,7 @@ static struct ggml_cgraph * llm_build_mpt(
4306
4276
4307
4277
// TODO: replace with ggml_add()
4308
4278
struct ggml_tensor * KQ_scaled_alibi =
4309
- ggml_alibi (ctx0, KQ_scaled, std::max (kv_head, n_kv - n_tokens) , n_head, max_alibi_bias);
4279
+ ggml_alibi (ctx0, KQ_scaled, 0 , n_head, max_alibi_bias);
4310
4280
offload_func_kq (KQ_scaled_alibi);
4311
4281
ggml_set_name (KQ_scaled_alibi, " KQ_scaled_alibi" );
4312
4282
0 commit comments