@@ -1039,7 +1039,12 @@ namespace chatllm
             while (!aborted && !completed && (n_past + (int)curr_input_ids.size() < gen_config.max_length))
             {
                 std::vector<float> lm_logits;
-                generate_next_token(curr_input_ids, gen_config, lm_logits);
+                if (!generate_next_token(curr_input_ids, gen_config, lm_logits))
+                {
+                    ggml::log(GGML_LOG_LEVEL_ERROR, "Out of memory");
+                    aborted = true;
+                    break;
+                }
 
                 if (first_call)
                 {
@@ -1113,29 +1118,35 @@ namespace chatllm
         void text_embedding(const GenerationConfig &gen_config, const std::vector<int> &input_ids,
                             std::vector<float> &embedding) override
         {
-            run_model(input_ids, gen_config, 0, embedding);
+            auto r = run_model(input_ids, gen_config, 0, embedding);
+            if (!r) ggml::log(GGML_LOG_LEVEL_ERROR, "Out of memory");
         }
 
         float qa_rank(const GenerationConfig &gen_config, const std::vector<int> &input_ids) override
         {
             std::vector<float> output;
-            run_model(input_ids, gen_config, 0, output);
+            auto r = run_model(input_ids, gen_config, 0, output);
+            if (!r) ggml::log(GGML_LOG_LEVEL_ERROR, "Out of memory");
             CHATLLM_CHECK(output.size() == 1) << "ouput must be scaler";
 
             return output[0];
         }
 
-        void generate_next_token(const std::vector<int> &input_ids, const GenerationConfig &gen_config, std::vector<float> &lm_logits) override
+        bool generate_next_token(const std::vector<int> &input_ids, const GenerationConfig &gen_config, std::vector<float> &lm_logits) override
         {
             if (batch_input)
             {
-                run_model(input_ids, gen_config, n_past + n_past_offset, lm_logits);
+                return run_model(input_ids, gen_config, n_past + n_past_offset, lm_logits);
             }
             else
             {
                 int past = n_past + n_past_offset;
                 for (size_t i = 0; (i < input_ids.size()) & !aborted; i++, past++)
-                    run_model({input_ids[i]}, gen_config, past, lm_logits);
+                {
+                    if (!run_model({input_ids[i]}, gen_config, past, lm_logits))
+                        return false;
+                }
+                return true;
             }
         }
 
@@ -1218,7 +1229,7 @@ namespace chatllm
             return s;
         }
 
-        virtual void run_model(const std::vector<int> &input_ids,
+        virtual bool run_model(const std::vector<int> &input_ids,
                                const GenerationConfig &gen_config,
                                int past,
                                std::vector<float> &output)
@@ -1228,7 +1239,8 @@ namespace chatllm
                 initial_run = true;
                 int past = gen_config.max_length - (int)input_ids.size();
                 if (past < 0) past = 0;
-                CHATLLM_CHECK(before_initial_run(input_ids, gen_config, past)) << "failed to reserve memory.";
+                if (!before_initial_run(input_ids, gen_config, past))
+                    return false;
             }
 
             ForwardContext ctx(&backend_context);
@@ -1255,7 +1267,7 @@ namespace chatllm
 
             output.resize(ggml::nbytes(r) / sizeof(output[0]));
 
-            CHATLLM_CHECK(ctx.allocate()) << "failed to allocate memory for graph";
+            if (!ctx.allocate()) return false;
 
             Backend::write_tensor_data(input_ids_tensor, input_ids.data());
 
@@ -1270,6 +1282,8 @@ namespace chatllm
             Backend::read_tensor_data(r, output.data());
 
             ctx.reset();
+
+            return true;
         }
 
         virtual bool is_output_terminated(const std::vector<int> &output_ids, int &keep_idx, int &pop_output)
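Taken together, these hunks replace fatal `CHATLLM_CHECK` assertions on the model-run path with `bool` returns, so an allocation failure propagates from `run_model` up through `generate_next_token` to the generation loop, where it is logged and the run is aborted rather than crashing the process. A minimal standalone sketch of that propagation pattern, using made-up stand-ins (`try_allocate`, `decode_step`) instead of the real chatllm classes:

```cpp
// Sketch only: illustrates the error-propagation pattern this commit applies
// (bool returns instead of fatal checks). try_allocate() and decode_step()
// are hypothetical stand-ins, not functions from this repository.
#include <cstdio>
#include <vector>

static bool try_allocate(size_t bytes)
{
    // Stand-in for ctx.allocate(): pretend anything above 1 GiB fails.
    return bytes <= (size_t(1) << 30);
}

static bool decode_step(const std::vector<int> &input_ids, std::vector<float> &logits)
{
    // Stand-in for run_model(): report failure to the caller instead of asserting.
    if (!try_allocate(input_ids.size() * 4096 * sizeof(float)))
        return false;
    logits.assign(32000, 0.0f);   // dummy logits
    return true;
}

int main()
{
    std::vector<int> curr_input_ids(16, 1);
    std::vector<float> lm_logits;
    bool aborted = false;

    // Mirrors the generation loop: on failure, log, set aborted, and break.
    for (int step = 0; step < 8 && !aborted; step++)
    {
        if (!decode_step(curr_input_ids, lm_logits))
        {
            std::fprintf(stderr, "Out of memory\n");
            aborted = true;
            break;
        }
        curr_input_ids = {0};   // after the first step, feed one token at a time
    }
    return aborted ? 1 : 0;
}
```

The design choice matches the diff: the inner helpers only report success or failure, and only the outermost loop decides how to react (log and abort).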