 #include <ctime>
 #include <cinttypes>
 #include <fstream>
-#include <functional>
 #include <random>
 #include <map>
 #include <unordered_map>
@@ -2876,7 +2875,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
     ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
 }
 
-struct beam {
+struct llama_beam {
     std::vector<llama_token> tokens;
     float p;  // Cumulative beam probability (renormalized relative to all beams)
     // end-of-sentence
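This hunk cuts off at the end-of-sentence comment, but later hunks call eos() and shift_tokens() on a beam. A minimal sketch of what those members plausibly look like (hypothetical bodies, assuming an end-of-sentence beam is one whose last token is the EOS token and that shift_tokens() drops an already-evaluated prefix from the front):

    // Sketch only (not part of the diff): members the later hunks rely on.
    // Assumes llama_token_eos() identifies the end-of-sentence token.
    bool eos() const {
        return !tokens.empty() && tokens.back() == llama_token_eos();
    }
    // Drop the first n tokens, e.g. a common prefix that llama_eval() already consumed.
    void shift_tokens(size_t const n) {
        tokens.erase(tokens.begin(), tokens.begin() + n);
    }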
@@ -2939,16 +2938,16 @@ struct beam_search {
     int n_past;
     int n_predict;
     int n_threads;
-    std::vector<beam> beams;
-    std::vector<beam> next_beams;
+    std::vector<llama_beam> beams;
+    std::vector<llama_beam> next_beams;
 
     // Re-calculated on each loop iteration
     size_t common_prefix_length;
     // true iff llama_eval() has been called with non-empty common prefix in current loop iteration.
     bool common_prefix_evaluated;
 
-    // Temporary memory used by beams_state to pass back via callback.
-    std::vector<beam_view> beam_views;
+    // Temporary memory used by llama_beams_state to pass back via callback.
+    std::vector<llama_beam_view> beam_views;
 
     beam_search(llama_context * ctx, size_t beam_width, int n_past, int n_predict, int n_threads)
       : ctx(ctx)
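get_beams_state() further down calls find_common_prefix_length(), which this diff does not show. A minimal sketch under the assumption that it simply measures how many leading tokens all current beams have in common:

    // Sketch only: longest token prefix shared by every beam in `beams`.
    size_t find_common_prefix_length() {
        size_t len = beams.empty() ? 0 : beams.front().tokens.size();
        for (size_t i = 1; i < beams.size(); ++i) {
            size_t j = 0;
            while (j < len && j < beams[i].tokens.size() &&
                   beams[i].tokens[j] == beams.front().tokens[j]) {
                ++j;
            }
            len = j;
        }
        return len;
    }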
@@ -2974,32 +2973,32 @@ struct beam_search {
     //  * Gather elements until the vector is full, then call std::make_heap() on it.
     //  * If the heap is full and a new element is found that should be included, pop the
     //    least element to the back(), replace it with the new, then push it into the heap.
-    void fill_next_beams_by_top_probabilities(beam & b) {
+    void fill_next_beams_by_top_probabilities(llama_beam & beam) {
         // Min-heaps use a greater-than comparator.
-        auto const comp = [](beam const & a, beam const & b) { return a.p > b.p; };
+        auto const comp = [](llama_beam const & a, llama_beam const & b) { return a.p > b.p; };
         if (common_prefix_evaluated) {
             // llama_eval was already called during this iteration
             // with the common token prefix, so shift it off this beam.
-            b.shift_tokens(common_prefix_length);
+            beam.shift_tokens(common_prefix_length);
         }
-        if (b.eos()) {
+        if (beam.eos()) {
             // beam is at end-of-sentence, so just copy it to next_beams if its probability is high enough.
             if (next_beams.size() < beam_width) {
-                next_beams.push_back(std::move(b));
+                next_beams.push_back(std::move(beam));
                 if (next_beams.size() == beam_width) {
                     std::make_heap(next_beams.begin(), next_beams.end(), comp);
                 }
-            } else if (next_beams.front().p < b.p) {
+            } else if (next_beams.front().p < beam.p) {
                 std::pop_heap(next_beams.begin(), next_beams.end(), comp);
-                next_beams.back() = std::move(b);
+                next_beams.back() = std::move(beam);
                 std::push_heap(next_beams.begin(), next_beams.end(), comp);
             }
         } else {
             // beam is not at end-of-sentence, so branch with next top_k tokens.
-            if (!b.tokens.empty()) {
-                llama_eval(ctx, b.tokens.data(), b.tokens.size(), n_past, n_threads);
+            if (!beam.tokens.empty()) {
+                llama_eval(ctx, beam.tokens.data(), beam.tokens.size(), n_past, n_threads);
                 if (!common_prefix_evaluated && common_prefix_length) {
-                    b.shift_tokens(common_prefix_length);
+                    beam.shift_tokens(common_prefix_length);
                     n_past += common_prefix_length;
                     common_prefix_evaluated = true;
                 }
@@ -3009,7 +3008,7 @@ struct beam_search {
             size_t i = 0;
             if (next_beams.size() < beam_width) {
                 for (; next_beams.size() < beam_width ; ++i) {
-                    beam next_beam = b;
+                    llama_beam next_beam = beam;
                     next_beam.tokens.push_back(next_tokens[i].id);
                     next_beam.p *= logit_info.probability_from_logit(next_tokens[i].logit);
                     next_beams.push_back(std::move(next_beam));
@@ -3018,17 +3017,17 @@ struct beam_search {
             } else {
                 for (; next_beams.front().p == 0.0f ; ++i) {
                     std::pop_heap(next_beams.begin(), next_beams.end(), comp);
-                    next_beams.back() = b;
+                    next_beams.back() = beam;
                     next_beams.back().tokens.push_back(next_tokens[i].id);
                     next_beams.back().p *= logit_info.probability_from_logit(next_tokens[i].logit);
                     std::push_heap(next_beams.begin(), next_beams.end(), comp);
                 }
             }
             for (; i < beam_width ; ++i) {
-                float const next_p = b.p * logit_info.probability_from_logit(next_tokens[i].logit);
+                float const next_p = beam.p * logit_info.probability_from_logit(next_tokens[i].logit);
                 if (next_beams.front().p < next_p) {
                     std::pop_heap(next_beams.begin(), next_beams.end(), comp);
-                    next_beams.back() = b;
+                    next_beams.back() = beam;
                     next_beams.back().tokens.push_back(next_tokens[i].id);
                     next_beams.back().p = next_p;
                     std::push_heap(next_beams.begin(), next_beams.end(), comp);
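All three loops above apply the bounded min-heap pattern described in the comment before fill_next_beams_by_top_probabilities(): fill next_beams up to beam_width, make_heap() it once, then for each better candidate pop the smallest element to the back(), overwrite it, and push it back into the heap. Stripped of the beam bookkeeping, the pattern looks roughly like this (illustrative sketch, not taken from the diff):

    #include <algorithm>
    #include <vector>

    // Keep the k largest values seen so far in a min-heap (greater-than comparator).
    void keep_top_k(std::vector<float> & heap, size_t const k, float const candidate) {
        auto const comp = [](float a, float b) { return a > b; };  // min-heap
        if (heap.size() < k) {
            heap.push_back(candidate);
            if (heap.size() == k) {
                std::make_heap(heap.begin(), heap.end(), comp);
            }
        } else if (heap.front() < candidate) {               // front() is the current minimum
            std::pop_heap(heap.begin(), heap.end(), comp);   // move the minimum to back()
            heap.back() = candidate;                         // replace it with the new value
            std::push_heap(heap.begin(), heap.end(), comp);  // restore the heap property
        }
    }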
@@ -3055,9 +3054,9 @@ struct beam_search {
 
     // Construct beams_state to send back to caller via the callback function.
     // Side effect: set common_prefix_length = find_common_prefix_length();
-    beams_state get_beams_state(bool const last_call) {
+    llama_beams_state get_beams_state(bool const last_call) {
         for (size_t i = 0 ; i < beams.size() ; ++i) {
-            beam_views[i] = beam_view{beams[i].tokens.data(), beams[i].tokens.size(), beams[i].p};
+            beam_views[i] = llama_beam_view{beams[i].tokens.data(), beams[i].tokens.size(), beams[i].p};
         }
         common_prefix_length = find_common_prefix_length();
         return {beam_views.data(), beams.size(), common_prefix_length, last_call};
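get_beams_state() fills beam_views with non-owning views and returns them by aggregate initialization, so the renamed llama_beam_view and llama_beams_state types presumably mirror the initializer order used here. A sketch of that layout (field names are guesses inferred from this hunk, not shown in the diff):

    // Hypothetical layout matching the brace-initializers above.
    struct llama_beam_view {
        const llama_token * tokens;  // beams[i].tokens.data()
        size_t n_tokens;             // beams[i].tokens.size()
        float p;                     // cumulative beam probability
    };
    struct llama_beams_state {
        llama_beam_view * beam_views;  // beam_views.data()
        size_t n_beams;                // beams.size()
        size_t common_prefix_length;   // tokens currently shared by all beams
        bool last_call;                // true on the final callback invocation
    };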
@@ -3070,10 +3069,10 @@ struct beam_search {
     // (since all other beam probabilities can only decrease)
     void loop(llama_beam_search_callback_fn_t const callback, void * const callback_state) {
         beams.push_back({{}, 1.0f});  // Start with one empty beam w/ probability = 1.0.
-        auto const not_eos = [](beam const & beam) { return !beam.eos(); };
+        auto const not_eos = [](llama_beam const & beam) { return !beam.eos(); };
         for (int i = 0 ; i < n_predict && std::any_of(beams.begin(), beams.end(), not_eos) &&
                        !beams[top_beam_index()].eos() ; ++i) {
-            beam_search_control const control = callback(callback_state, get_beams_state(false));
+            llama_beam_search_control const control = callback(callback_state, get_beams_state(false));
             if (control.collapse_to < beams.size()) {
                 // Caller has manually selected a specific beam. Collapse beams into it.
                 collapse_beams(control.collapse_to);
@@ -3082,30 +3081,30 @@ struct beam_search {
                 break;
             }
             common_prefix_evaluated = false;
-            for (beam & beam : beams) {
+            for (llama_beam & beam : beams) {
                 fill_next_beams_by_top_probabilities(beam);
             }
             beams.swap(next_beams);
             renormalize_beam_probabilities(beams);
-            std::for_each(next_beams.begin(), next_beams.end(), [](beam & beam) { beam.p = 0.0f; });
+            std::for_each(next_beams.begin(), next_beams.end(), [](llama_beam & beam) { beam.p = 0.0f; });
         }
         collapse_beams(top_beam_index());
         callback(callback_state, get_beams_state(true));
     }
 
     // As beams grow, the cumulative probabilities decrease.
     // Renormalize them to avoid floating point underflow.
-    static void renormalize_beam_probabilities(std::vector<beam> & beams) {
-        auto const sum_p = [](float sum, beam & beam) { return sum + beam.p; };
+    static void renormalize_beam_probabilities(std::vector<llama_beam> & beams) {
+        auto const sum_p = [](float sum, llama_beam & beam) { return sum + beam.p; };
         float const inv_sum = 1.0f / std::accumulate(beams.begin(), beams.end(), 0.0f, sum_p);
-        std::for_each(beams.begin(), beams.end(), [=](beam & beam) { beam.p *= inv_sum; });
+        std::for_each(beams.begin(), beams.end(), [=](llama_beam & beam) { beam.p *= inv_sum; });
     }
 
     // Return index of highest ranking beam by (probability,eos()).
     // In other words choose most probable beam. In case of ties, choose beam at end-of-sentence.
     // Assumes beams is non-empty.
     size_t top_beam_index() {
-        auto const by_p_and_eos = [](beam const & a, beam const & b) {
+        auto const by_p_and_eos = [](llama_beam const & a, llama_beam const & b) {
             return a.p < b.p || (a.p == b.p && a.eos() < b.eos()); };
         return std::max_element(beams.begin(), beams.end(), by_p_and_eos) - beams.begin();
     }
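For context, loop() drives the caller through the callback: each iteration passes a llama_beams_state and expects a llama_beam_search_control back, and a collapse_to value that is >= the number of beams is treated as "no manual selection" by the check above. A hedged sketch of a minimal caller-side callback under those assumptions (the exact callback typedef and any further control fields are not shown in this diff):

    // Sketch only: log each beam and never force an early collapse.
    // Assumes <cstdio> and the struct layouts sketched earlier.
    static llama_beam_search_control beam_search_logger(void * callback_state, llama_beams_state const beams_state) {
        (void) callback_state;
        for (size_t i = 0; i < beams_state.n_beams; ++i) {
            llama_beam_view const & bv = beams_state.beam_views[i];
            fprintf(stderr, "beam %zu: p=%.6f n_tokens=%zu\n", i, bv.p, bv.n_tokens);
        }
        llama_beam_search_control control = {};
        control.collapse_to = beams_state.n_beams;  // >= n_beams: do not collapse
        return control;
    }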