@@ -547,7 +547,7 @@ bool llama_eval(
    static void * buf = malloc(buf_size);

    if (mem_per_token > 0 && mem_per_token*N > buf_size) {
-        const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
+        const size_t buf_size_new = 1.3*(mem_per_token*N); // add 30% to account for ggml object overhead
        //fprintf(stderr, "\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);

        // reallocate
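
For context, this constant feeds a grow-on-demand scratch buffer: when the measured per-token memory times the batch size outgrows the current allocation, the buffer is resized with some headroom for ggml's own object bookkeeping. The following is a minimal standalone sketch of that pattern, not the actual llama_eval body; buf, buf_size, and ensure_buffer are local stand-ins invented for illustration.

#include <cstdio>
#include <cstdlib>

// Grow-on-demand scratch buffer with headroom.
// The 1.3 factor mirrors the diff's "add 30% to account for ggml object overhead".
static void * buf      = nullptr;
static size_t buf_size = 0;

static bool ensure_buffer(size_t mem_per_token, int N) {
    if (mem_per_token > 0 && mem_per_token*N > buf_size) {
        const size_t buf_size_new = 1.3*(mem_per_token*N);
        void * buf_new = realloc(buf, buf_size_new);
        if (buf_new == nullptr) {
            fprintf(stderr, "failed to allocate %zu bytes\n", buf_size_new);
            return false;
        }
        buf      = buf_new;
        buf_size = buf_size_new;
    }
    return true;
}
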
@@ -747,6 +747,49 @@ bool llama_eval(
    return true;
}

+std::vector<double> softmax(const std::vector<float>& logits) {
+    std::vector<double> probs(logits.size());
+    float max_logit = logits[0];
+    for (float v : logits) max_logit = std::max(max_logit, v);
+    double sum_exp = 0.0;
+    for (size_t i = 0; i < logits.size(); i++) {
+        // Subtract the maximum logit value from the current logit value for numerical stability
+        float logit = logits[i] - max_logit;
+        double exp_logit = std::exp(logit);
+        sum_exp += exp_logit;
+        probs[i] = exp_logit;
+    }
+    for (size_t i = 0; i < probs.size(); i++) probs[i] /= sum_exp;
+    return probs;
+}
+
+void perplexity(const gpt_vocab &vocab, const llama_model &model, const gpt_params &params, size_t mem_per_token) {
+    // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
+    // Run `./main --perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
+    // Output: `perplexity: 13.5106 [114/114]`
+    std::vector<gpt_vocab::id> tokens = ::llama_tokenize(vocab, params.prompt, true);
+
+    double nll = 0.0;
+    int seq_count = tokens.size() / params.n_ctx;
+    for (int i = 0; i < seq_count; ++i) {
+        int start = i * params.n_ctx;
+        int end = start + params.n_ctx - 1;
+        std::vector<gpt_vocab::id> embd(tokens.begin() + start, tokens.begin() + end);
+        std::vector<float> logits;
+        if (!llama_eval(model, params.n_threads, 0, embd, logits, mem_per_token)) {
+            fprintf(stderr, "Failed to predict\n");
+            return;
+        }
+        // Calculate probability of next token, given the previous ones.
+        double prob = softmax(logits)[tokens[end]];
+        nll += -std::log(prob);
+        // perplexity is e^(average negative log-likelihood)
+        printf("perplexity: %.4lf [%d/%d] \r", std::exp(nll / (i + 1)), i + 1, seq_count);
+        fflush(stdout);
+    }
+    printf("\n");
+}
+
static bool is_interacting = false;

#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
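
The two additions above can be exercised on their own. Below is a standalone sketch of the same arithmetic on made-up logits and token ids (everything here, including toy_softmax, is invented for illustration and does not call llama_eval): for each position, take the softmax probability of the true next token, accumulate its negative log, and report e^(average negative log-likelihood) as the running perplexity.

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <utility>
#include <vector>

static std::vector<double> toy_softmax(const std::vector<float> & logits) {
    std::vector<double> probs(logits.size());
    const float max_logit = *std::max_element(logits.begin(), logits.end());
    double sum_exp = 0.0;
    for (size_t i = 0; i < logits.size(); i++) {
        probs[i] = std::exp(logits[i] - max_logit); // subtract max for numerical stability
        sum_exp += probs[i];
    }
    for (double & p : probs) p /= sum_exp;
    return probs;
}

int main() {
    // each step: the model's logits over a toy 3-token vocabulary and the true next token id
    const std::vector<std::pair<std::vector<float>, int>> steps = {
        { {  2.0f, 0.5f, -1.0f }, 0 },
        { {  0.1f, 1.9f,  0.3f }, 1 },
        { { -0.5f, 0.2f,  2.5f }, 2 },
    };

    double nll = 0.0;
    for (size_t i = 0; i < steps.size(); i++) {
        const double prob = toy_softmax(steps[i].first)[steps[i].second];
        nll += -std::log(prob);
        // perplexity is e^(average negative log-likelihood)
        printf("perplexity: %.4lf [%zu/%zu]\n", std::exp(nll / (i + 1)), i + 1, steps.size());
    }
    return 0;
}

Subtracting the largest logit before exponentiating keeps std::exp well-behaved for large logits; the shift cancels in the normalization, so the resulting probabilities are unchanged.
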
@@ -815,7 +858,7 @@ int main(int argc, char ** argv) {
    // load the model
    {
        const int64_t t_start_us = ggml_time_us();
-        if (!llama_model_load(params.model, model, vocab, params.n_ctx)) {
+        if (!llama_model_load(params.model, model, vocab, params.n_ctx)) {
            fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
            return 1;
        }
@@ -830,13 +873,22 @@ int main(int argc, char ** argv) {
            params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
    }

+    std::vector<float> logits;
+
+    // determine the required inference memory per token:
+    size_t mem_per_token = 0;
+    llama_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
+
+    if (params.perplexity) {
+        perplexity(vocab, model, params, mem_per_token);
+        exit(0);
+    }
+
    int n_past = 0;

    int64_t t_sample_us = 0;
    int64_t t_predict_us = 0;

-    std::vector<float> logits;
-
    // Add a space in front of the first character to match OG llama tokenizer behavior
    params.prompt.insert(0, 1, ' ');
    // tokenize the prompt
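
perplexity() takes mem_per_token as an argument, so the probe call with the dummy ids { 0, 1, 2, 3 } has to run before the --perplexity branch; this hunk moves it up from its old spot later in main(), which the next hunk removes. A simplified, hypothetical sketch of that probe-then-dispatch ordering, with invented names (probe_eval, run_perplexity, run_interactive) standing in for the real calls:

#include <cstddef>
#include <vector>

static bool probe_eval(const std::vector<int> & dummy_tokens, size_t & mem_per_token) {
    mem_per_token = 1u << 20; // stand-in for the value a real eval call would report
    return true;
}

static void run_perplexity(size_t mem_per_token) { /* batch scoring, uses mem_per_token */ }
static void run_interactive(size_t mem_per_token) { /* token-by-token generation */ }

int main(int argc, char ** argv) {
    const bool perplexity_mode = argc > 1; // stand-in for params.perplexity

    size_t mem_per_token = 0;
    probe_eval({ 0, 1, 2, 3 }, mem_per_token); // measure before either mode needs it

    if (perplexity_mode) {
        run_perplexity(mem_per_token);
        return 0;
    }
    run_interactive(mem_per_token);
    return 0;
}
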
@@ -881,10 +933,6 @@ int main(int argc, char ** argv) {

    std::vector<gpt_vocab::id> embd;

-    // determine the required inference memory per token:
-    size_t mem_per_token = 0;
-    llama_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
-
    int last_n_size = params.repeat_last_n;
    std::vector<gpt_vocab::id> last_n_tokens(last_n_size);
    std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);