@@ -4335,7 +4335,7 @@ struct llama_beam {
     float p;  // Cumulative beam probability (renormalized relative to all beams)
     bool eos; // Initialize end-of-sentence to false. Callback sets this to true.
     // Sort beams by probability. In case of ties, prefer beams at eos.
-    bool operator<(llama_beam const& rhs) const {
+    bool operator<(llama_beam const & rhs) const {
         return std::make_tuple(p, eos) < std::make_tuple(rhs.p, rhs.eos);
     }
     // Shift off first n tokens and discard them.
@@ -4350,15 +4350,15 @@ struct llama_beam {
 
 // A struct for calculating logit-related info.
 struct logit_info {
-    float const* const logits;
+    float const * const logits;
     int const n_vocab;
     float const max_l;
     float const normalizer;
     struct sum_exp {
         float max_l;
         float operator()(float sum, float l) const { return sum + std::exp(l - max_l); }
     };
-    logit_info(llama_context* ctx)
+    logit_info(llama_context * ctx)
       : logits(llama_get_logits(ctx))
       , n_vocab(llama_n_vocab(ctx))
       , max_l(*std::max_element(logits, logits + n_vocab))
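Side note on the hunk above: max_l together with sum_exp implements the usual max-subtraction (log-sum-exp) trick, so the exponentials that feed normalizer never overflow. A minimal standalone sketch of the same idea, assuming nothing from llama.cpp (plain std::vector<float>, illustrative softmax() helper, not part of the library):

// Numerically stable softmax over a vector of logits.
// Standalone sketch of the max-subtraction trick used by logit_info above;
// softmax() and the use of std::vector are illustrative, not llama.cpp API.
#include <algorithm>
#include <cmath>
#include <numeric>
#include <vector>

std::vector<float> softmax(const std::vector<float> & logits) {
    // Assumes logits is non-empty.
    const float max_l = *std::max_element(logits.begin(), logits.end());
    // Sum of exp(l - max_l); the largest term is exp(0) == 1, so nothing overflows.
    const float sum = std::accumulate(logits.begin(), logits.end(), 0.0f,
        [max_l](float s, float l) { return s + std::exp(l - max_l); });
    std::vector<float> probs(logits.size());
    std::transform(logits.begin(), logits.end(), probs.begin(),
        [max_l, sum](float l) { return std::exp(l - max_l) / sum; });
    return probs;
}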
@@ -4376,7 +4376,7 @@ struct logit_info {
         for (llama_token token_id=0 ; token_id<k_min ; ++token_id) {
             min_heap.push_back(get_token_data(token_id));
         }
-        auto comp = [](llama_token_data const& a, llama_token_data const& b) { return a.logit > b.logit; };
+        auto comp = [](llama_token_data const & a, llama_token_data const & b) { return a.logit > b.logit; };
         std::make_heap(min_heap.begin(), min_heap.end(), comp);
         for (llama_token token_id=k_min ; token_id<n_vocab ; ++token_id) {
             if (min_heap.front().logit < logits[token_id]) {
@@ -4432,9 +4432,9 @@ struct beam_search {
     // * Gather elements until the vector is full, then call std::make_heap() on it.
     // * If the heap is full and a new element is found that should be included, pop the
     //   least element to the back(), replace it with the new, then push it into the heap.
-    void fill_next_beams_by_top_probabilities(llama_beam& beam) {
+    void fill_next_beams_by_top_probabilities(llama_beam & beam) {
         // Min-heaps use a greater-than comparator.
-        auto const comp = [](llama_beam const& a, llama_beam const& b) { return a.p > b.p; };
+        auto const comp = [](llama_beam const & a, llama_beam const & b) { return a.p > b.p; };
         if (beam.eos) {
             // beam is at end-of-sentence, so just copy it to next_beams if its probability is high enough.
             if (next_beams.size() < n_beams) {
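The comment block in this hunk describes the bounded min-heap pattern that both fill_next_beams_by_top_probabilities() and logit_info::top_k() rely on: keep the k best candidates in a vector arranged as a min-heap via a greater-than comparator, and when a better candidate arrives, pop the current minimum to the back, overwrite it, and push it back into the heap. A self-contained sketch of that pattern on plain floats (top_k_values() is an illustrative helper, not a llama.cpp function):

// Keep the k largest values from `values` using a fixed-size min-heap.
// Illustrative sketch of the pattern described in the comments above;
// top_k_values() is a made-up helper, not part of llama.cpp.
#include <algorithm>
#include <cstddef>
#include <vector>

std::vector<float> top_k_values(const std::vector<float> & values, std::size_t k) {
    k = std::min(k, values.size());
    std::vector<float> heap(values.begin(), values.begin() + k);
    // Min-heap: a greater-than comparator puts the smallest element at front().
    auto comp = [](float a, float b) { return a > b; };
    std::make_heap(heap.begin(), heap.end(), comp);
    for (std::size_t i = k; i < values.size(); ++i) {
        if (!heap.empty() && heap.front() < values[i]) {
            // Pop the least element to the back, replace it, then push it back into the heap.
            std::pop_heap(heap.begin(), heap.end(), comp);
            heap.back() = values[i];
            std::push_heap(heap.begin(), heap.end(), comp);
        }
    }
    return heap;  // the k largest values, in heap (not sorted) order
}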
@@ -4516,9 +4516,9 @@ struct beam_search {
     // * any of the beams have not yet reached end-of-sentence, AND
     // * the highest probability beam(s) (plural in case of ties) are not at end-of-sentence
     //   (since all other beam probabilities can only decrease)
-    void loop(llama_beam_search_callback_fn_t const callback, void* const callback_data) {
+    void loop(llama_beam_search_callback_fn_t const callback, void * const callback_data) {
         beams.push_back({{}, 1.0f, false});  // Start with one empty beam w/ probability = 1.0 and !eos.
-        auto const not_eos = [](llama_beam const& beam) { return !beam.eos; };
+        auto const not_eos = [](llama_beam const & beam) { return !beam.eos; };
         for (int i=0 ; i<n_predict && std::any_of(beams.begin(),beams.end(),not_eos) &&
                        !beams[top_beam_index()].eos ; ++i) {
             callback(callback_data, get_beams_state(false));  // Sets common_prefix_length
@@ -4528,8 +4528,8 @@ struct beam_search {
                 n_past += common_prefix_length;
             }
             // Zero-out next_beam probabilities to place them last in following min-heap.
-            std::for_each(next_beams.begin(), next_beams.end(), [](llama_beam& beam) { beam.p = 0.0f; });
-            for (llama_beam& beam : beams) {
+            std::for_each(next_beams.begin(), next_beams.end(), [](llama_beam & beam) { beam.p = 0.0f; });
+            for (llama_beam & beam : beams) {
                 beam.shift_tokens(common_prefix_length);
                 fill_next_beams_by_top_probabilities(beam);
             }
@@ -4543,10 +4543,10 @@ struct beam_search {
 
     // As beams grow, the cumulative probabilities decrease.
     // Renormalize them to avoid floating point underflow.
-    static void renormalize_beam_probabilities(std::vector<llama_beam>& beams) {
-        auto const sum_p = [](float sum, llama_beam& beam) { return sum + beam.p; };
+    static void renormalize_beam_probabilities(std::vector<llama_beam> & beams) {
+        auto const sum_p = [](float sum, llama_beam & beam) { return sum + beam.p; };
         float const inv_sum = 1.0f / std::accumulate(beams.begin(), beams.end(), 0.0f, sum_p);
-        std::for_each(beams.begin(), beams.end(), [=](llama_beam& beam) { beam.p *= inv_sum; });
+        std::for_each(beams.begin(), beams.end(), [=](llama_beam & beam) { beam.p *= inv_sum; });
     }
 
     // Assumes beams is non-empty. Uses llama_beam::operator<() for ordering.
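For intuition on the renormalization hunk above: cumulative beam probabilities shrink geometrically with each generated token, so the raw products would underflow a 32-bit float within a few dozen steps, while rescaling by the inverse sum each step preserves the beams' relative ordering inside a representable range. A toy demonstration with made-up per-token probabilities (not llama.cpp code):

// Why renormalization is needed: raw cumulative probabilities underflow quickly.
// Toy sketch with invented per-token probabilities, not llama.cpp code.
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
    std::vector<float> beam_p = {1.0f, 1.0f, 1.0f};  // three beams, renormalized each step
    for (int step = 0; step < 100; ++step) {
        // Pretend each beam extends with a token of roughly 1% probability.
        beam_p[0] *= 0.010f;
        beam_p[1] *= 0.011f;
        beam_p[2] *= 0.012f;
        // Without this renormalization the raw products (~0.01^100 = 1e-200) would have
        // collapsed to 0.0f long ago; float subnormals bottom out near 1e-45.
        const float inv_sum = 1.0f / std::accumulate(beam_p.begin(), beam_p.end(), 0.0f);
        for (float & p : beam_p) { p *= inv_sum; }
    }
    // The three beams remain nonzero and correctly ordered: p[0] < p[1] < p[2].
    std::printf("%g %g %g\n", beam_p[0], beam_p[1], beam_p[2]);
    return 0;
}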
@@ -4564,7 +4564,7 @@ struct beam_search {
 };
 
 void llama_beam_search(llama_context * ctx,
-                       llama_beam_search_callback_fn_t callback, void* callback_data,
+                       llama_beam_search_callback_fn_t callback, void * callback_data,
                        size_t n_beams, int n_past, int n_predict, int n_threads) {
     assert(ctx);
     const int64_t t_start_sample_us = ggml_time_us();