@@ -351,6 +351,7 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
     fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count);
 
     const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
+    fprintf(stderr, "================================= is_spm = %d\n", is_spm);
 
     // This is needed as usual for LLaMA models
     const bool add_bos = is_spm;
@@ -406,18 +407,30 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
     double acc = 0.0f;
     const int n_vocab = llama_n_vocab(ctx);
 
+    std::vector<std::vector<int>> ending_tokens(4);
+
     std::vector<float> tok_logits(n_vocab);
 
     for (size_t task_idx = 0; task_idx < hs_task_count; task_idx++) {
         // Tokenize the context to count tokens
         std::vector<int> context_embd = ::llama_tokenize(ctx, hs_data[task_idx].context, add_bos);
         size_t context_size = context_embd.size();
 
+        for (int i = 0; i < 4; ++i) {
+            ending_tokens[i] = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[i], add_bos);
+            for (int k = 0; k < int(context_size); ++k) {
+                if (ending_tokens[i][k] != context_embd[k]) {
+                    fprintf(stderr, "Oops: ending %d of task %d differs from context at position %d\n", i, int(task_idx), k);
+                    break;
+                }
+            }
+        }
+
         // Do the 1st ending
         // In this case we include the context when evaluating
-        auto query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[0], add_bos);
+        // auto query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[0], add_bos);
+        auto query_embd = ending_tokens[0];
         auto query_size = query_embd.size();
-        // printf("First query: %d\n",(int)query_size);
 
         // Stop if query wont fit the ctx window
         if (query_size > (size_t)params.n_ctx) {
@@ -462,7 +475,8 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
         for (size_t ending_idx = 1; ending_idx < 4; ending_idx++) {
 
             // Tokenize the query
-            query_embd = ::llama_tokenize(ctx, hs_data[task_idx].ending[ending_idx], false);
+            query_embd.resize(ending_tokens[ending_idx].size() - context_size);
+            std::memcpy(query_embd.data(), ending_tokens[ending_idx].data() + context_size, query_embd.size()*sizeof(int));
             query_size = query_embd.size();
 
             // Stop if query wont fit the ctx window