Simplify common_prefix_length logic and drop use of common_prefix_evaluated bool.

mattpulver · mattpulver · commit 40c940352fbf · 2023-08-16T10:56:15.000-04:00
diff --git a/llama.cpp b/llama.cpp
@@ -2901,8 +2901,10 @@ struct llama_beam {
     }
     // Shift off first n tokens and discard them.
     void shift_tokens(size_t const n) {
-        std::copy(tokens.begin() + n, tokens.end(), tokens.begin());
-        tokens.resize(tokens.size() - n);
+        if (n) {
+            std::copy(tokens.begin() + n, tokens.end(), tokens.begin());
+            tokens.resize(tokens.size() - n);
+        }
     }
     llama_beam_view view() const { return {tokens.data(), tokens.size(), p, eos}; }
 };
@@ -2963,8 +2965,6 @@ struct beam_search {
 
     // Re-calculated on each loop iteration
     size_t common_prefix_length;
-    // true iff llama_eval() has been called with non-empty common prefix in current loop iteration.
-    bool common_prefix_evaluated;
 
     // Used to communicate to/from callback on beams state.
     std::vector<llama_beam_view> beam_views;
@@ -2996,11 +2996,6 @@ struct beam_search {
     void fill_next_beams_by_top_probabilities(llama_beam& beam) {
         // Min-heaps use a greater-than comparator.
         auto const comp = [](llama_beam const& a, llama_beam const& b) { return a.p > b.p; };
-        if (common_prefix_evaluated) {
-            // llama_eval was already called during this iteration
-            // with the common token prefix, so shift it off this beam.
-            beam.shift_tokens(common_prefix_length);
-        }
         if (beam.eos) {
             // beam is at end-of-sentence, so just copy it to next_beams if its probability is high enough.
             if (next_beams.size() < n_beams) {
@@ -3017,11 +3012,6 @@ struct beam_search {
             // beam is not at end-of-sentence, so branch with next top_k tokens.
             if (!beam.tokens.empty()) {
                 llama_eval(ctx, beam.tokens.data(), beam.tokens.size(), n_past, n_threads);
-                if (!common_prefix_evaluated && common_prefix_length) {
-                    beam.shift_tokens(common_prefix_length);
-                    n_past += common_prefix_length;
-                    common_prefix_evaluated = true;
-                }
             }
             logit_info logit_info(ctx);
             std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
@@ -3076,9 +3066,7 @@ struct beam_search {
     // Side effect: set common_prefix_length = find_common_prefix_length();
     llama_beams_state get_beams_state(bool const last_call) {
         for (size_t i=0 ; i<beams.size() ; ++i) {
-            //beam_views[i] = beams[i].view();
-            auto view = beams.at(i).view();
-            beam_views.at(i) = view;  // capacity 0
+            beam_views[i] = beams[i].view();
         }
         common_prefix_length = find_common_prefix_length();
         return {beam_views.data(), beams.size(), common_prefix_length, last_call};
@@ -3094,12 +3082,16 @@ struct beam_search {
         auto const not_eos = [](llama_beam const& beam) { return !beam.eos; };
         for (int i=0 ; i<n_predict && std::any_of(beams.begin(),beams.end(),not_eos) &&
                        !beams[top_beam_index()].eos ; ++i) {
-            callback(callback_state, get_beams_state(false));
+            callback(callback_state, get_beams_state(false));  // Sets common_prefix_length
             update_beams_from_beam_views();   // Update values (p,eos) that callback may have changed.
-            common_prefix_evaluated = false;  // Any common prefix has not yet been llama_eval()ed.
+            if (common_prefix_length) {
+                llama_eval(ctx, beams[0].tokens.data(), common_prefix_length, n_past, n_threads);
+                n_past += common_prefix_length;
+            }
             // Zero-out next_beam probabilities to place them last in following min-heap.
             std::for_each(next_beams.begin(), next_beams.end(), [](llama_beam& beam) { beam.p = 0.0f; });
             for (llama_beam& beam : beams) {
+                beam.shift_tokens(common_prefix_length);
                 fill_next_beams_by_top_probabilities(beam);
             }
             // next_beams become the beams of next/final iteration. Swap them to re-use memory.