 #include <ctime>
 #include <cinttypes>
 #include <fstream>
-#include <functional>
 #include <random>
 #include <map>
 #include <unordered_map>
@@ -2876,7 +2875,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
     ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
 }
 
-struct beam {
+struct llama_beam {
     std::vector<llama_token> tokens;
     float p;  // Cumulative beam probability (renormalized relative to all beams)
     // end-of-sentence
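This hunk cuts off at the end-of-sentence comment, but later hunks call eos() and shift_tokens() on a beam. A minimal sketch of what those members plausibly look like (hypothetical bodies, assuming an end-of-sentence beam is one whose last token is the EOS token and that shift_tokens() drops an already-evaluated prefix from the front):

    // Sketch only (not part of the diff): members the later hunks rely on.
    // Assumes llama_token_eos() identifies the end-of-sentence token.
    bool eos() const {
        return !tokens.empty() && tokens.back() == llama_token_eos();
    }
    // Drop the first n tokens, e.g. a common prefix that llama_eval() already consumed.
    void shift_tokens(size_t const n) {
        tokens.erase(tokens.begin(), tokens.begin() + n);
    }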
@@ -2939,16 +2938,16 @@ struct beam_search {
     int n_past;
     int n_predict;
     int n_threads;
-    std::vector<beam> beams;
-    std::vector<beam> next_beams;
+    std::vector<llama_beam> beams;
+    std::vector<llama_beam> next_beams;
 
     // Re-calculated on each loop iteration
     size_t common_prefix_length;
     // true iff llama_eval() has been called with non-empty common prefix in current loop iteration.
     bool common_prefix_evaluated;
 
-    // Temporary memory used by beams_state to pass back via callback.
-    std::vector<beam_view> beam_views;
+    // Temporary memory used by llama_beams_state to pass back via callback.
+    std::vector<llama_beam_view> beam_views;
 
     beam_search(llama_context * ctx, size_t beam_width, int n_past, int n_predict, int n_threads)
       : ctx(ctx)
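get_beams_state() further down calls find_common_prefix_length(), which this diff does not show. A minimal sketch under the assumption that it simply measures how many leading tokens all current beams have in common:

    // Sketch only: longest token prefix shared by every beam in `beams`.
    size_t find_common_prefix_length() {
        size_t len = beams.empty() ? 0 : beams.front().tokens.size();
        for (size_t i = 1; i < beams.size(); ++i) {
            size_t j = 0;
            while (j < len && j < beams[i].tokens.size() &&
                   beams[i].tokens[j] == beams.front().tokens[j]) {
                ++j;
            }
            len = j;
        }
        return len;
    }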
@@ -2974,32 +2973,32 @@ struct beam_search {
     //  * Gather elements until the vector is full, then call std::make_heap() on it.
     //  * If the heap is full and a new element is found that should be included, pop the
     //    least element to the back(), replace it with the new, then push it into the heap.
-    void fill_next_beams_by_top_probabilities(beam & b) {
+    void fill_next_beams_by_top_probabilities(llama_beam & beam) {
         // Min-heaps use a greater-than comparator.
-        auto const comp = [](beam const & a, beam const & b) { return a.p > b.p; };
+        auto const comp = [](llama_beam const & a, llama_beam const & b) { return a.p > b.p; };
         if (common_prefix_evaluated) {
             // llama_eval was already called during this iteration
             // with the common token prefix, so shift it off this beam.
-            b.shift_tokens(common_prefix_length);
+            beam.shift_tokens(common_prefix_length);
         }
-        if (b.eos()) {
+        if (beam.eos()) {
             // beam is at end-of-sentence, so just copy it to next_beams if its probability is high enough.
             if (next_beams.size() < beam_width) {
-                next_beams.push_back(std::move(b));
+                next_beams.push_back(std::move(beam));
                 if (next_beams.size() == beam_width) {
                     std::make_heap(next_beams.begin(), next_beams.end(), comp);
                 }
-            } else if (next_beams.front().p < b.p) {
+            } else if (next_beams.front().p < beam.p) {
                 std::pop_heap(next_beams.begin(), next_beams.end(), comp);
-                next_beams.back() = std::move(b);
+                next_beams.back() = std::move(beam);
                 std::push_heap(next_beams.begin(), next_beams.end(), comp);
             }
         } else {
             // beam is not at end-of-sentence, so branch with next top_k tokens.
-            if (!b.tokens.empty()) {
-                llama_eval(ctx, b.tokens.data(), b.tokens.size(), n_past, n_threads);
+            if (!beam.tokens.empty()) {
+                llama_eval(ctx, beam.tokens.data(), beam.tokens.size(), n_past, n_threads);
                 if (!common_prefix_evaluated && common_prefix_length) {
-                    b.shift_tokens(common_prefix_length);
+                    beam.shift_tokens(common_prefix_length);
                     n_past += common_prefix_length;
                     common_prefix_evaluated = true;
                 }
@@ -3009,7 +3008,7 @@ struct beam_search {
             size_t i = 0;
             if (next_beams.size() < beam_width) {
                 for (; next_beams.size() < beam_width ; ++i) {
-                    beam next_beam = b;
+                    llama_beam next_beam = beam;
                     next_beam.tokens.push_back(next_tokens[i].id);
                     next_beam.p *= logit_info.probability_from_logit(next_tokens[i].logit);
                     next_beams.push_back(std::move(next_beam));
@@ -3018,17 +3017,17 @@ struct beam_search {
             } else {
                 for (; next_beams.front().p == 0.0f ; ++i) {
                     std::pop_heap(next_beams.begin(), next_beams.end(), comp);
-                    next_beams.back() = b;
+                    next_beams.back() = beam;
                     next_beams.back().tokens.push_back(next_tokens[i].id);
                     next_beams.back().p *= logit_info.probability_from_logit(next_tokens[i].logit);
                     std::push_heap(next_beams.begin(), next_beams.end(), comp);
                 }
             }
             for (; i < beam_width ; ++i) {
-                float const next_p = b.p * logit_info.probability_from_logit(next_tokens[i].logit);
+                float const next_p = beam.p * logit_info.probability_from_logit(next_tokens[i].logit);
                 if (next_beams.front().p < next_p) {
                     std::pop_heap(next_beams.begin(), next_beams.end(), comp);
-                    next_beams.back() = b;
+                    next_beams.back() = beam;
                     next_beams.back().tokens.push_back(next_tokens[i].id);
                     next_beams.back().p = next_p;
                     std::push_heap(next_beams.begin(), next_beams.end(), comp);
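All three loops above apply the bounded min-heap pattern described in the comment before fill_next_beams_by_top_probabilities(): fill next_beams up to beam_width, make_heap() it once, then for each better candidate pop the smallest element to the back(), overwrite it, and push it back into the heap. Stripped of the beam bookkeeping, the pattern looks roughly like this (illustrative sketch, not taken from the diff):

    #include <algorithm>
    #include <vector>

    // Keep the k largest values seen so far in a min-heap (greater-than comparator).
    void keep_top_k(std::vector<float> & heap, size_t const k, float const candidate) {
        auto const comp = [](float a, float b) { return a > b; };  // min-heap
        if (heap.size() < k) {
            heap.push_back(candidate);
            if (heap.size() == k) {
                std::make_heap(heap.begin(), heap.end(), comp);
            }
        } else if (heap.front() < candidate) {               // front() is the current minimum
            std::pop_heap(heap.begin(), heap.end(), comp);   // move the minimum to back()
            heap.back() = candidate;                         // replace it with the new value
            std::push_heap(heap.begin(), heap.end(), comp);  // restore the heap property
        }
    }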
@@ -3055,9 +3054,9 @@ struct beam_search {
 
     // Construct beams_state to send back to caller via the callback function.
     // Side effect: set common_prefix_length = find_common_prefix_length();
-    beams_state get_beams_state(bool const last_call) {
+    llama_beams_state get_beams_state(bool const last_call) {
         for (size_t i = 0 ; i < beams.size() ; ++i) {
-            beam_views[i] = beam_view{beams[i].tokens.data(), beams[i].tokens.size(), beams[i].p};
+            beam_views[i] = llama_beam_view{beams[i].tokens.data(), beams[i].tokens.size(), beams[i].p};
         }
         common_prefix_length = find_common_prefix_length();
         return {beam_views.data(), beams.size(), common_prefix_length, last_call};
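get_beams_state() fills beam_views with non-owning views and returns them by aggregate initialization, so the renamed llama_beam_view and llama_beams_state types presumably mirror the initializer order used here. A sketch of that layout (field names are guesses inferred from this hunk, not shown in the diff):

    // Hypothetical layout matching the brace-initializers above.
    struct llama_beam_view {
        const llama_token * tokens;  // beams[i].tokens.data()
        size_t n_tokens;             // beams[i].tokens.size()
        float p;                     // cumulative beam probability
    };
    struct llama_beams_state {
        llama_beam_view * beam_views;  // beam_views.data()
        size_t n_beams;                // beams.size()
        size_t common_prefix_length;   // tokens currently shared by all beams
        bool last_call;                // true on the final callback invocation
    };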
@@ -3070,10 +3069,10 @@ struct beam_search {
     // (since all other beam probabilities can only decrease)
     void loop(llama_beam_search_callback_fn_t const callback, void * const callback_state) {
         beams.push_back({{}, 1.0f});  // Start with one empty beam w/ probability = 1.0.
-        auto const not_eos = [](beam const & beam) { return !beam.eos(); };
+        auto const not_eos = [](llama_beam const & beam) { return !beam.eos(); };
         for (int i = 0 ; i < n_predict && std::any_of(beams.begin(), beams.end(), not_eos) &&
                        !beams[top_beam_index()].eos() ; ++i) {
-            beam_search_control const control = callback(callback_state, get_beams_state(false));
+            llama_beam_search_control const control = callback(callback_state, get_beams_state(false));
             if (control.collapse_to < beams.size()) {
                 // Caller has manually selected a specific beam. Collapse beams into it.
                 collapse_beams(control.collapse_to);
@@ -3082,30 +3081,30 @@ struct beam_search {
                 break;
             }
             common_prefix_evaluated = false;
-            for (beam & beam : beams) {
+            for (llama_beam & beam : beams) {
                 fill_next_beams_by_top_probabilities(beam);
             }
             beams.swap(next_beams);
             renormalize_beam_probabilities(beams);
-            std::for_each(next_beams.begin(), next_beams.end(), [](beam & beam) { beam.p = 0.0f; });
+            std::for_each(next_beams.begin(), next_beams.end(), [](llama_beam & beam) { beam.p = 0.0f; });
         }
         collapse_beams(top_beam_index());
         callback(callback_state, get_beams_state(true));
     }
 
     // As beams grow, the cumulative probabilities decrease.
     // Renormalize them to avoid floating point underflow.
-    static void renormalize_beam_probabilities(std::vector<beam> & beams) {
-        auto const sum_p = [](float sum, beam & beam) { return sum + beam.p; };
+    static void renormalize_beam_probabilities(std::vector<llama_beam> & beams) {
+        auto const sum_p = [](float sum, llama_beam & beam) { return sum + beam.p; };
         float const inv_sum = 1.0f / std::accumulate(beams.begin(), beams.end(), 0.0f, sum_p);
-        std::for_each(beams.begin(), beams.end(), [=](beam & beam) { beam.p *= inv_sum; });
+        std::for_each(beams.begin(), beams.end(), [=](llama_beam & beam) { beam.p *= inv_sum; });
     }
 
     // Return index of highest ranking beam by (probability,eos()).
     // In other words choose most probable beam. In case of ties, choose beam at end-of-sentence.
     // Assumes beams is non-empty.
     size_t top_beam_index() {
-        auto const by_p_and_eos = [](beam const & a, beam const & b) {
+        auto const by_p_and_eos = [](llama_beam const & a, llama_beam const & b) {
             return a.p < b.p || (a.p == b.p && a.eos() < b.eos()); };
         return std::max_element(beams.begin(), beams.end(), by_p_and_eos) - beams.begin();
     }
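For context, loop() drives the caller through the callback: each iteration passes a llama_beams_state and expects a llama_beam_search_control back, and a collapse_to value that is >= the number of beams is treated as "no manual selection" by the check above. A hedged sketch of a minimal caller-side callback under those assumptions (the exact callback typedef and any further control fields are not shown in this diff):

    // Sketch only: log each beam and never force an early collapse.
    // Assumes <cstdio> and the struct layouts sketched earlier.
    static llama_beam_search_control beam_search_logger(void * callback_state, llama_beams_state const beams_state) {
        (void) callback_state;
        for (size_t i = 0; i < beams_state.n_beams; ++i) {
            llama_beam_view const & bv = beams_state.beam_views[i];
            fprintf(stderr, "beam %zu: p=%.6f n_tokens=%zu\n", i, bv.p, bv.n_tokens);
        }
        llama_beam_search_control control = {};
        control.collapse_to = beams_state.n_beams;  // >= n_beams: do not collapse
        return control;
    }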