@@ -716,13 +716,16 @@ int llama_main(
     gpt_vocab vocab,
     llama_model model,
     int64_t t_load_us,
-    int64_t t_main_start_us) {
+    int64_t t_main_start_us,
+    FILE *instream,
+    FILE *outstream,
+    FILE *errstream) {
 
     if (params.seed < 0) {
         params.seed = time(NULL);
     }
 
-    fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
+    fprintf(errstream, "%s: seed = %d\n", __func__, params.seed);
 
     std::mt19937 rng(params.seed);
     if (params.random_prompt) {
@@ -764,13 +767,13 @@ int llama_main(
         params.interactive = true;
     }
 
-    fprintf(stderr, "\n");
-    fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
-    fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
+    fprintf(errstream, "\n");
+    fprintf(errstream, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
+    fprintf(errstream, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
     for (int i = 0; i < (int) embd_inp.size(); i++) {
-        fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str());
+        fprintf(errstream, "%6d -> '%s'\n", embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str());
     }
-    fprintf(stderr, "\n");
+    fprintf(errstream, "\n");
     if (params.interactive) {
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
         struct sigaction sigint_action;
@@ -782,19 +785,19 @@ int llama_main(
         signal(SIGINT, sigint_handler);
 #endif
 
-        fprintf(stderr, "%s: interactive mode on.\n", __func__);
+        fprintf(errstream, "%s: interactive mode on.\n", __func__);
 
         if (antiprompt_inp.size()) {
-            fprintf(stderr, "%s: reverse prompt: '%s'\n", __func__, params.antiprompt.c_str());
-            fprintf(stderr, "%s: number of tokens in reverse prompt = %zu\n", __func__, antiprompt_inp.size());
+            fprintf(errstream, "%s: reverse prompt: '%s'\n", __func__, params.antiprompt.c_str());
+            fprintf(errstream, "%s: number of tokens in reverse prompt = %zu\n", __func__, antiprompt_inp.size());
             for (int i = 0; i < (int) antiprompt_inp.size(); i++) {
-                fprintf(stderr, "%6d -> '%s'\n", antiprompt_inp[i], vocab.id_to_token.at(antiprompt_inp[i]).c_str());
+                fprintf(errstream, "%6d -> '%s'\n", antiprompt_inp[i], vocab.id_to_token.at(antiprompt_inp[i]).c_str());
             }
-            fprintf(stderr, "\n");
+            fprintf(errstream, "\n");
         }
     }
-    fprintf(stderr, "sampling parameters: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
-    fprintf(stderr, "\n\n");
+    fprintf(errstream, "sampling parameters: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
+    fprintf(errstream, "\n\n");
 
     std::vector<gpt_vocab::id> embd;
 
@@ -807,7 +810,7 @@ int llama_main(
     std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
 
     if (params.interactive) {
-        fprintf(stderr, "== Running in interactive mode. ==\n"
+        fprintf(errstream, "== Running in interactive mode. ==\n"
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
                " - Press Ctrl+C to interject at any time.\n"
 #endif
@@ -823,7 +826,7 @@ int llama_main(
 
     // set the color for the prompt which will be output initially
     if (params.use_color) {
-        printf(ANSI_COLOR_YELLOW);
+        fprintf(outstream, ANSI_COLOR_YELLOW);
     }
 
     while (remaining_tokens > 0 || params.interactive) {
@@ -832,7 +835,7 @@ int llama_main(
             const int64_t t_start_us = ggml_time_us();
 
             if (!llama_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) {
-                fprintf(stderr, "Failed to predict\n");
+                fprintf(errstream, "Failed to predict\n");
                 return 1;
             }
 
@@ -891,16 +894,16 @@ int llama_main(
 
             // reset color to default if we there is no pending user input
             if (!input_noecho && params.use_color && (int) embd_inp.size() == input_consumed) {
-                printf(ANSI_COLOR_RESET);
+                fprintf(outstream, ANSI_COLOR_RESET);
             }
         }
 
         // display text
        if (!input_noecho) {
             for (auto id : embd) {
-                printf("%s", vocab.id_to_token[id].c_str());
+                fprintf(outstream, "%s", vocab.id_to_token[id].c_str());
             }
-            fflush(stdout);
+            fflush(outstream);
         }
 
         // in interactive mode, and not currently processing queued inputs;
@@ -922,16 +925,16 @@ int llama_main(
                 // currently being interactive
                 bool another_line = true;
                 while (another_line) {
-                    fflush(stdout);
+                    fflush(outstream);
                     char buf[256] = {0};
                     int n_read;
-                    if (params.use_color) printf(ANSI_BOLD ANSI_COLOR_GREEN);
-                    if (scanf("%255[^\n]%n%*c", buf, &n_read) <= 0) {
+                    if (params.use_color) fprintf(outstream, ANSI_BOLD ANSI_COLOR_GREEN);
+                    if (fscanf(instream, "%255[^\n]%n%*c", buf, &n_read) <= 0) {
                         // presumable empty line, consume the newline
-                        std::ignore = scanf("%*c");
+                        std::ignore = fscanf(instream, "%*c");
                         n_read=0;
                     }
-                    if (params.use_color) printf(ANSI_COLOR_RESET);
+                    if (params.use_color) fprintf(outstream, ANSI_COLOR_RESET);
 
                     if (n_read > 0 && buf[n_read-1]=='\\') {
                         another_line = true;
@@ -964,7 +967,7 @@ int llama_main(
             if (params.interactive) {
                 is_interacting = true;
             } else {
-                fprintf(stderr, " [end of text]\n");
+                fprintf(errstream, " [end of text]\n");
                 break;
             }
         }
@@ -984,18 +987,18 @@ int llama_main(
     {
         const int64_t t_main_end_us = ggml_time_us();
 
-        fprintf(stderr, "\n\n");
-        fprintf(stderr, "%s: mem per token = %8zu bytes\n", __func__, mem_per_token);
-        fprintf(stderr, "%s:     load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
-        fprintf(stderr, "%s:   sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
-        fprintf(stderr, "%s:  predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
-        fprintf(stderr, "%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
+        fprintf(errstream, "\n\n");
+        fprintf(errstream, "%s: mem per token = %8zu bytes\n", __func__, mem_per_token);
+        fprintf(errstream, "%s:     load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
+        fprintf(errstream, "%s:   sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
+        fprintf(errstream, "%s:  predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
+        fprintf(errstream, "%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
     }
 
     ggml_free(model.ctx);
 
     if (params.use_color) {
-        printf(ANSI_COLOR_RESET);
+        fprintf(outstream, ANSI_COLOR_RESET);
     }
 
     return 0;
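
The hunks above thread explicit FILE* handles (instream, outstream, errstream) through llama_main in place of the hard-coded stdin, stdout, and stderr, so callers can route user input, generated text, and diagnostics independently. The sketch below is not part of the diff; it is a minimal illustration of how a caller might use the new signature, assuming the elided leading parameter is the gpt_params value referenced as params in the hunks, and that gpt_params, gpt_vocab, llama_model, and llama_main are visible through the project's existing headers. The helper names and file names are hypothetical.

// Illustrative caller only -- not from the diff. Assumes the project's headers
// declare gpt_params, gpt_vocab, llama_model and the llama_main signature shown above.
#include <cstdio>
#include <cstdint>

// Keep the old behaviour: read from stdin, print tokens to stdout, log to stderr.
int run_on_terminal(gpt_params params, gpt_vocab vocab, llama_model model,
                    int64_t t_load_us, int64_t t_main_start_us) {
    return llama_main(params, vocab, model, t_load_us, t_main_start_us,
                      stdin, stdout, stderr);
}

// Capture generated text and logs in files instead of the terminal
// (file names are arbitrary examples).
int run_captured(gpt_params params, gpt_vocab vocab, llama_model model,
                 int64_t t_load_us, int64_t t_main_start_us) {
    FILE *out = fopen("generation.txt", "w");
    FILE *err = fopen("llama.log", "w");
    if (!out || !err) {
        if (out) fclose(out);
        if (err) fclose(err);
        return 1;
    }
    const int ret = llama_main(params, vocab, model, t_load_us, t_main_start_us,
                               stdin, out, err);
    fclose(out);
    fclose(err);
    return ret;
}

Because the interactive loop now reads through instream rather than scanf on stdin, a host program can also feed it from a pipe or, on POSIX systems, a memory-backed stream from fmemopen, which makes the interactive path easier to drive from tests.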