Cleanup: Encapsulate beam search functions into struct beam_search.

mattpulver · mattpulver · commit 2d9fdbd7a19f · 2023-08-02T10:59:34.000-04:00
diff --git a/llama.cpp b/llama.cpp
@@ -36,6 +36,7 @@
 #include <ctime>
 #include <cinttypes>
 #include <fstream>
+#include <functional>
 #include <random>
 #include <map>
 #include <unordered_map>
@@ -2882,10 +2883,10 @@ struct beam {
     float p;  // Cumulative beam probability (renormalized with each token)
     // end-of-sentence
     bool eos() const { return !tokens.empty() && tokens.back() == llama_token_eos(); }
-    // Shift off first n tokens to the end of trunk.
-    void shift_tokens(std::vector<llama_token>& trunk, int const n) {
-        trunk.resize(trunk.size() + n);
-        std::copy(tokens.begin(), tokens.begin() + n, trunk.end() - n);
+    // Shift off first n tokens to the end of dest.
+    void shift_tokens(std::vector<llama_token>& dest, int const n) {
+        dest.resize(dest.size() + n);
+        std::copy(tokens.begin(), tokens.begin() + n, dest.end() - n);
         shift_tokens(n);
     }
     // Shift off first n tokens and discard them.
@@ -2895,9 +2896,9 @@ struct beam {
     }
 };
 
-void out_beam(std::ostream& os, llama_context* ctx, beam const& b) {
-    os << "p(" << b.p << ") eos(" << std::boolalpha << b.eos() << ") tokens(";
-    for (llama_token const token_id : b.tokens) {
+void out_beam(std::ostream& os, llama_context* ctx, beam const& beam) {
+    os << "p(" << beam.p << ") eos(" << std::boolalpha << beam.eos() << ") tokens(";
+    for (llama_token const token_id : beam.tokens) {
         os << llama_token_to_str(ctx, token_id);
     }
     os << ')';
@@ -2948,139 +2949,163 @@ struct logit_info {
     }
 };
 
-// Track beams common prefix and when llama_eval has been applied with it.
-struct beams_state {
+struct beam_search {
+    llama_context * ctx;
     int beam_width;
+    int n_past;
+    int n_predict;
+    int n_threads;
+    std::vector<beam> beams;
+    std::vector<beam> next_beams;
+
+    // Re-calculated on each loop iteration
     int common_prefix_length;
-    int& n_past;
-    bool shifted;
-    std::vector<llama_token> trunk;  // Save token prefix common to all beams here
-    beams_state(int beam_width, int& n_past, int n_predict)
-      : beam_width(beam_width)
-      , n_past(n_past) {
-        trunk.reserve(n_predict);
-    }
-
-    // Set common_prefix_length based on beams.
-    void find_common_prefix(std::vector<beam>& beams) {
-        shifted = false;
-        common_prefix_length = int(beams[0].tokens.size());
+    // true iff llama_eval() has been called with common prefix in current loop iteration.
+    bool common_prefix_evaluated;
+    // Save token prefix common to all beams here
+    std::vector<llama_token> response;
+
+    beam_search(llama_context * ctx, int beam_width, int n_past, int n_predict, int n_threads)
+      : ctx(ctx)
+      , beam_width(beam_width)
+      , n_past(n_past)
+      , n_predict(n_predict)
+      , n_threads(n_threads) {
+        beams.reserve(beam_width);
+        next_beams.reserve(beam_width);
+    }
+
+    // Find common_prefix_length based on beams.
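+    // Requires that beams is not empty. NOTE(review): returns at the first mismatching beam; later beams are not compared - confirm intended.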
+    int find_common_prefix_length() {
+        int common_prefix_length = int(beams[0].tokens.size());
         for (int i=1 ; i<int(beams.size()) ; ++i) {
             int const j_max = std::min(common_prefix_length, int(beams[i].tokens.size()));
             for (int j=0 ; j<j_max ; ++j) {
                 if (beams[0].tokens[j] != beams[i].tokens[j]) {
-                    common_prefix_length = j;
-                    break;
+                    return j;
                 }
             }
         }
-    }
-};
-
-void fill_next_beams_by_top_probabilities(llama_context* ctx, std::vector<beam>& next_beams,
-    beam& b, beams_state& beams_state, int const n_threads) {
-    auto const comp = [](beam const& a, beam const& b) { return a.p > b.p; };
-    if (beams_state.shifted) {
-        // llama_eval was already called during this iteration
-        // with the common token prefix, so shift it off this beam.
-        b.shift_tokens(beams_state.common_prefix_length);
-    }
-    if (b.eos()) {
-        // b is at end-of-sentence, so just copy it to next_beams if its probability is high enough.
-        if (next_beams.size() < static_cast<size_t>(beams_state.beam_width)) {
-            next_beams.push_back(b);
-            if (next_beams.size() == static_cast<size_t>(beams_state.beam_width)) {
-                std::make_heap(next_beams.begin(), next_beams.end(), comp);
-            }
-        } else if (next_beams.front().p < b.p) {
-            std::pop_heap(next_beams.begin(), next_beams.end(), comp);
-            next_beams.back() = std::move(b);
-            std::push_heap(next_beams.begin(), next_beams.end(), comp);
+        return common_prefix_length;
+    }
+
+    // Min-heaps are used to efficiently gather the top-k elements (k=beam_width).
+    // The repetitive patterns below reflect the 2 stages of heaps:
+    //  * Gather elements until the vector is full, then call std::make_heap() on it.
+    //  * If the heap is full and a new element is found that should be included,
+    //    pop off the least element, replace it with the new, then push it into the heap.
+    void fill_next_beams_by_top_probabilities(beam& b) {
+        // Min-heaps use a greater-than comparator.
+        auto const comp = [](beam const& a, beam const& b) { return a.p > b.p; };
+        if (common_prefix_evaluated) {
+            // llama_eval was already called during this iteration
+            // with the common token prefix, so shift it off this beam.
+            b.shift_tokens(common_prefix_length);
         }
-    } else {
-        // b is not at end-of-sentence, so branch with next top_k tokens.
-        if (!b.tokens.empty()) {
-            llama_eval(ctx, b.tokens.data(), b.tokens.size(), beams_state.n_past, n_threads);
-            if (!beams_state.shifted && beams_state.common_prefix_length) {
-                b.shift_tokens(beams_state.trunk, beams_state.common_prefix_length);
-                beams_state.n_past += beams_state.common_prefix_length;
-                beams_state.shifted = true;
-            }
-        }
-        logit_info li(ctx);
-        std::vector<llama_token_data> next_tokens = li.top_k(beams_state.beam_width);
-        int i=0;
-        if (next_beams.size() < static_cast<size_t>(beams_state.beam_width)) {
-            for (; next_beams.size() < static_cast<size_t>(beams_state.beam_width) ; ++i) {
-                beam next_beam = b;
-                next_beam.tokens.push_back(next_tokens[i].id);
-                next_beam.p *= li.probability_from_logit(next_tokens[i].logit);
-                next_beams.push_back(std::move(next_beam));
-            }
-            std::make_heap(next_beams.begin(), next_beams.end(), comp);
-        } else {
-            for (; next_beams.front().p == 0.0f ; ++i) {
+        if (b.eos()) {
+            // beam is at end-of-sentence, so just copy it to next_beams if its probability is high enough.
+            if (next_beams.size() < static_cast<size_t>(beam_width)) {
+                next_beams.push_back(b);
+                if (next_beams.size() == static_cast<size_t>(beam_width)) {
+                    std::make_heap(next_beams.begin(), next_beams.end(), comp);
+                }
+            } else if (next_beams.front().p < b.p) {
                 std::pop_heap(next_beams.begin(), next_beams.end(), comp);
-                next_beams.back() = b;
-                next_beams.back().tokens.push_back(next_tokens[i].id);
-                next_beams.back().p *= li.probability_from_logit(next_tokens[i].logit);
+                next_beams.back() = std::move(b);
                 std::push_heap(next_beams.begin(), next_beams.end(), comp);
             }
+        } else {
+            // beam is not at end-of-sentence, so branch with next top_k tokens.
+            if (!b.tokens.empty()) {
+                llama_eval(ctx, b.tokens.data(), b.tokens.size(), n_past, n_threads);
+                if (!common_prefix_evaluated && common_prefix_length) {
+                    b.shift_tokens(response, common_prefix_length);
+                    n_past += common_prefix_length;
+                    common_prefix_evaluated = true;
+                }
+            }
+            logit_info logit_info(ctx);
+            std::vector<llama_token_data> next_tokens = logit_info.top_k(beam_width);
+            int i=0;
+            if (next_beams.size() < static_cast<size_t>(beam_width)) {
+                for (; next_beams.size() < static_cast<size_t>(beam_width) ; ++i) {
+                    beam next_beam = b;
+                    next_beam.tokens.push_back(next_tokens[i].id);
+                    next_beam.p *= logit_info.probability_from_logit(next_tokens[i].logit);
+                    next_beams.push_back(std::move(next_beam));
+                }
+                std::make_heap(next_beams.begin(), next_beams.end(), comp);
+            } else {
+                for (; next_beams.front().p == 0.0f ; ++i) {
+                    std::pop_heap(next_beams.begin(), next_beams.end(), comp);
+                    next_beams.back() = b;
+                    next_beams.back().tokens.push_back(next_tokens[i].id);
+                    next_beams.back().p *= logit_info.probability_from_logit(next_tokens[i].logit);
+                    std::push_heap(next_beams.begin(), next_beams.end(), comp);
+                }
+            }
+            for (; i < beam_width ; ++i) {
+                float const next_p = b.p * logit_info.probability_from_logit(next_tokens[i].logit);
+                if (next_beams.front().p < next_p) {
+                    std::pop_heap(next_beams.begin(), next_beams.end(), comp);
+                    next_beams.back() = b;
+                    next_beams.back().tokens.push_back(next_tokens[i].id);
+                    next_beams.back().p = next_p;
+                    std::push_heap(next_beams.begin(), next_beams.end(), comp);
+                }
+            }
         }
-        for (; i < beams_state.beam_width ; ++i) {
-            float const next_p = b.p * li.probability_from_logit(next_tokens[i].logit);
-            if (next_beams.front().p < next_p) {
-                std::pop_heap(next_beams.begin(), next_beams.end(), comp);
-                next_beams.back() = b;
-                next_beams.back().tokens.push_back(next_tokens[i].id);
-                next_beams.back().p = next_p;
-                std::push_heap(next_beams.begin(), next_beams.end(), comp);
+    }
+
+    // Loop:
+    //  * while i < n_predict
+    //  * until all of the beams have reached end-of-sentence
+    //  * until the highest probability beam is at end-of-sentence
+    //    (since all other beam probabilities can only decrease)
+    void loop(std::function<void(std::vector<beam>&)> const callback) {
+        beams.push_back({{}, 1.0f});
+        auto const eos = [](beam const& beam) { return beam.eos(); };
+        for (int i=0 ; i<n_predict && !std::all_of(beams.begin(),beams.end(),eos) && !eos(top_beam()) ; ++i) {
+            common_prefix_evaluated = false;
+            common_prefix_length = find_common_prefix_length();
+            for (beam& beam : beams) {
+                fill_next_beams_by_top_probabilities(beam);
             }
+            beams.swap(next_beams);
+            renormalize_beam_probabilities(beams);
+            std::for_each(next_beams.begin(), next_beams.end(), [](beam& beam) { beam.p = 0.0f; });
+            callback(beams);
         }
+        beam& top_b = top_beam();
+        top_b.shift_tokens(response, top_b.tokens.size());
     }
-}
 
-// As beams grow, the cumulative probabilities decrease.
-// Renormalize them to avoid floating point underflow.
-void renormalize_beam_probabilities(std::vector<beam>& beams) {
-    auto const sum_p = [](float sum, beam& b) { return sum + b.p; };
-    float const inv_sum = 1.0f / std::accumulate(beams.begin(), beams.end(), 0.0f, sum_p);
-    std::for_each(beams.begin(), beams.end(), [inv_sum](beam& b) { b.p *= inv_sum; });
-}
+    // As beams grow, the cumulative probabilities decrease.
+    // Renormalize them to avoid floating point underflow.
+    static void renormalize_beam_probabilities(std::vector<beam>& beams) {
+        auto const sum_p = [](float sum, beam& beam) { return sum + beam.p; };
+        float const inv_sum = 1.0f / std::accumulate(beams.begin(), beams.end(), 0.0f, sum_p);
+        std::for_each(beams.begin(), beams.end(), [inv_sum](beam& beam) { beam.p *= inv_sum; });
+    }
 
-// Return beam with highest probability.
-beam& top_beam(std::vector<beam>& beams) {
-    auto const by_p = [](beam const& a, beam const& b) { return a.p < b.p; };
-    return *std::max_element(beams.begin(), beams.end(), by_p);
-}
+    // Return beam with highest probability.
+    beam& top_beam() {
+        auto const by_p = [](beam const& a, beam const& b) { return a.p < b.p; };
+        return *std::max_element(beams.begin(), beams.end(), by_p);
+    }
+};
 
-// This is deterministic, but can be made probabilistic in
-// fill_next_beams_by_top_probabilities() by randomly selecting from all next_beams.
 // Not thread-safe.
-const char* llama_beam_search(llama_context * ctx, int const beam_width,
+const char* llama_beam_search(llama_context * ctx, int beam_width,
                               int n_past, int const n_predict, int const n_threads) {
     static std::string beam_search_response;
     assert(ctx);
     const int64_t t_start_sample_us = ggml_time_us();
 
-    std::vector<beam> beams;
-    beams.reserve(beam_width);
-    beams.push_back({{}, 1.0});
-    std::vector<beam> next_beams;
-    next_beams.reserve(beam_width);
-    beams_state beams_state(beam_width, n_past, n_predict);
-    // Loop while there are any beams that have not yet reached end-of-sentence.
-    // If the highest probability beam is at end-of-sentence, then finish since all other
-    // beam probabilities can only decrease.
-    auto const eos = [](beam const& b) { return b.eos(); };
-    for (int i=0 ; i<n_predict && !eos(top_beam(beams)) && !std::all_of(beams.begin(),beams.end(),eos) ; ++i) {
-        beams_state.find_common_prefix(beams);
-        for (beam& b : beams) {
-            fill_next_beams_by_top_probabilities(ctx, next_beams, b, beams_state, n_threads);
-        }
-        beams.swap(next_beams);
-        std::for_each(next_beams.begin(), next_beams.end(), [](beam& b) { b.p = 0.0f; });
-        renormalize_beam_probabilities(beams);
+    beam_search beam_search(ctx, beam_width, n_past, n_predict, n_threads);
+
+    beam_search.loop([&](std::vector<beam>& beams) {
 #if 1 // DEBUG: print current beams for this iteration
         std::cout << "\n\nCurrent beams:\n";
         for (size_t j=0 ; j < beams.size() ; ++j) {
@@ -3091,13 +3116,11 @@ const char* llama_beam_search(llama_context * ctx, int const beam_width,
 #else
         std::cout << '.' << std::flush;  // Show progress
 #endif
-    }
+    });
 
-    beam& top_b = top_beam(beams);
-    top_b.shift_tokens(beams_state.trunk, top_b.tokens.size());
     // Save beam sentence to beam_search_response. Is there a better way?
     std::ostringstream oss;
-    for (llama_token const token : beams_state.trunk) {
+    for (llama_token const token : beam_search.response) {
         oss << llama_token_to_str(ctx, token);
     }
     beam_search_response = oss.str();