
Commit d1e956e

[LLM] add port check and fix bugs (#10153) (#10195)
* [LLM] add port check and fix bugs
* [LLM] fix stream output
* [LLM] delete unused param
* [LLM] fix start script
1 parent f3e85d4 commit d1e956e

File tree

8 files changed: +36 -14 lines changed

llm/server/dockerfiles/Dockerfile_serving_cuda118_cudnn8

+1
@@ -8,6 +8,7 @@ RUN python3 -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.or
     && python3 -m pip install --no-cache-dir --force-reinstall https://paddle-qa.bj.bcebos.com/paddlenlp/wheel/2f85a64edd4aa9911c94ccb5ce53e83ac41ce22b/paddlenlp-3.0.0b3.post20250123-py3-none-any.whl \
     && python3 -m pip install --no-cache-dir --force-reinstall https://paddlepaddle-inference-banchmark.bj.bcebos.com/paddlenlp_ops-0.0.0-py3-none-any.whl \
     && python3 -m pip install --no-cache-dir sentencepiece pycryptodome tritonclient[all]==2.41.1 \
+    && apt update && apt install net-tools \
     && apt-get clean && rm -rf /var/lib/apt/lists/*

 # clone paddle & paddlenlp source code (the code version should match the versions installed above)

llm/server/dockerfiles/Dockerfile_serving_cuda124_cudnn9

+1
@@ -8,6 +8,7 @@ RUN python3 -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.or
     && python3 -m pip install --no-cache-dir --force-reinstall https://paddle-qa.bj.bcebos.com/paddlenlp/wheel/2f85a64edd4aa9911c94ccb5ce53e83ac41ce22b/paddlenlp-3.0.0b3.post20250123-py3-none-any.whl \
     && python3 -m pip install --no-cache-dir --force-reinstall https://paddlepaddle-inference-banchmark.bj.bcebos.com/paddlenlp_ops-0.0.0-py3-none-any.whl \
     && python3 -m pip install --no-cache-dir sentencepiece pycryptodome tritonclient[all]==2.41.1 \
+    && apt update && apt install net-tools \
     && apt-get clean && rm -rf /var/lib/apt/lists/*

 # clone paddle & paddlenlp source code (the code version should match the versions installed above)

llm/server/docs/deploy_usage_tutorial.md

+1
@@ -416,6 +416,7 @@ docker build --network=host -f ./dockerfiles/Dockerfile_serving_cuda124_cudnn9 -
 | USE_CACHE_KV_INT8 | int | Whether to use INT8 as the KV Cache type || 0 | Must be set to 1 for c8 quantized models |
 | MODEL_DIR | str | Model file path || /models/ | |
 | model_name | str | Model name ||| Used to support static-graph model download; see the documentation (#./static_models.md) for the exact names |
+| OUTPUT_LOG_TO_CONSOLE | str | Whether to redirect output to the console log file || 0 | |


 ## Recommended GPU memory related parameters
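The new OUTPUT_LOG_TO_CONSOLE switch documented above is consumed by start_server.sh (see the script change below): when it is "1", the server's output is redirected into log/console.log. As a rough, non-authoritative illustration of that semantics only (the shipped logic is the bash start script, and the trimmed tritonserver argument list here is a placeholder):

```python
import os
import subprocess

# Illustrative sketch: mirrors the OUTPUT_LOG_TO_CONSOLE behaviour of the start
# script; the real server is launched by start_server.sh, not by this code.
output_to_console = os.environ.get("OUTPUT_LOG_TO_CONSOLE", "0") == "1"

cmd = ["tritonserver", "--model-repository", "llm_model/"]  # placeholder arguments
if output_to_console:
    os.makedirs("log", exist_ok=True)
    with open("log/console.log", "w") as log_file:
        # Capture both stdout and stderr into log/console.log
        subprocess.run(cmd, stdout=log_file, stderr=subprocess.STDOUT)
else:
    # Default ("0"): let the server write to its own log files
    subprocess.run(cmd)
```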

llm/server/server/scripts/start_server.sh

+24-4
@@ -55,6 +55,21 @@ export SERVICE_GRPC_PORT=${GRPC_PORT:-${SERVICE_GRPC_PORT:-"8811"}}
 export INTER_PROC_PORT=${INTER_QUEUE_PORT:-${INTER_PROC_PORT:-"8813"}}
 export SERVICE_HTTP_PORT=${PUSH_MODE_HTTP_PORT:-${SERVICE_HTTP_PORT:-"9965"}}

+check_port_occupied() {
+    local port=$1
+    if netstat -tuln | grep -q ":${port}\b"; then
+        echo "PORT: ${port} occupied! Please change the port!"
+        exit 1
+    fi
+}
+
+check_port_occupied ${HEALTH_HTTP_PORT}
+check_port_occupied ${METRICS_HTTP_PORT}
+check_port_occupied ${SERVICE_GRPC_PORT}
+check_port_occupied ${INTER_PROC_PORT}
+check_port_occupied ${SERVICE_HTTP_PORT}
+
+

 if [ ! -d "llm_model" ];then
     ln -s /opt/source/PaddleNLP/llm/server/server/llm_model llm_model
@@ -83,15 +98,20 @@ else
     sleep ${SERVER_WAITTING_TIME:-"25"}
 fi

-
-
-tritonserver --exit-timeout-secs 100000 --cuda-memory-pool-byte-size 0:0 --cuda-memory-pool-byte-size 1:0 \
+OUTPUT_LOG_TO_CONSOLE=${OUTPUT_LOG_TO_CONSOLE:-"0"}
+# Set the log redirection based on whether logs should be output to the console
+LOG_REDIRECT=""
+# If OUTPUT_LOG_TO_CONSOLE is set to "1", redirect logs to the console log file
+if [ "$OUTPUT_LOG_TO_CONSOLE" == "1" ]; then
+    LOG_REDIRECT="> log/console.log 2>&1"
+fi
+eval tritonserver --exit-timeout-secs 100000 --cuda-memory-pool-byte-size 0:0 --cuda-memory-pool-byte-size 1:0 \
     --cuda-memory-pool-byte-size 2:0 --cuda-memory-pool-byte-size 3:0 --cuda-memory-pool-byte-size 4:0 \
     --cuda-memory-pool-byte-size 5:0 --cuda-memory-pool-byte-size 6:0 --cuda-memory-pool-byte-size 7:0 \
     --pinned-memory-pool-byte-size 0 --model-repository llm_model/ \
     --allow-http false \
     --grpc-port=${SERVICE_GRPC_PORT} \
     --metrics-port=${METRICS_HTTP_PORT} \
-    --log-file log/server.log --log-info true &
+    --log-file log/server.log --log-info true $LOG_REDIRECT &

 echo "The logs for the model service, please check" ${PWD}"/log/server.log and "${PWD}"/log/workerlog.0"

llm/server/server/server/data/processor.py

+3-1
@@ -183,7 +183,9 @@ def process_response(self, response_dict, **kwargs):
         response_dict["usage"] = {"completion_tokens" : response_dict["send_idx"] + 1}

         if is_end:
-            response_dict["tokens_all"] = self.clear_request_status(req_id)
+            self.clear_request_status(req_id)
+            token_ids = response_dict.get("tokens_all_ids", [])
+            response_dict["tokens_all"] = self.ids2tokens(token_ids, response_dict["req_id"])
         return response_dict

     def text2ids(self, text):
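The stream-output fix above stops assigning the return value of clear_request_status to tokens_all and instead detokenizes the accumulated tokens_all_ids on the final chunk. A minimal sketch of that final-chunk handling follows; only tokens_all_ids, tokens_all and req_id come from the diff, the callables are stand-ins for the processor's own ids2tokens and clear_request_status methods.

```python
# Minimal sketch of the post-fix behaviour; names outside the diff are hypothetical.
def finalize_stream_response(response_dict, ids2tokens, clear_request_status):
    clear_request_status(response_dict["req_id"])        # drop per-request state
    token_ids = response_dict.get("tokens_all_ids", [])  # ids accumulated during streaming
    # Detokenize the full id sequence so the last chunk carries the complete text.
    response_dict["tokens_all"] = ids2tokens(token_ids, response_dict["req_id"])
    return response_dict
```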

llm/server/server/server/engine/infer.py

+1-1
@@ -614,7 +614,7 @@ def run(self):
             engine_healthy_recorded_time_array,
         ) = self.initialize_engine_healthy_recorded_time_flag()
         engine_healthy_recorded_time_array[0] = time.time()
-        # infer_live_flag_shm = self.initialize_engine_live_flag()
+        infer_live_flag_shm = self.initialize_engine_live_flag()
         infer_seed_increment = paddle.full(shape=[self.args.max_batch_size, 1], fill_value=4, dtype="int64")
         # thread_executor = ThreadPoolExecutor(max_workers=1)
         real_bsz = None
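Re-enabling initialize_engine_live_flag means the engine again publishes a liveness flag when run() starts. The server's actual implementation is not shown in this commit; the following is only a conceptual sketch of such a flag using Python's shared_memory module, with a made-up name.

```python
# Conceptual sketch only; the real initialize_engine_live_flag lives in the
# server code and is not part of this diff. All names here are hypothetical.
from multiprocessing import shared_memory

def initialize_engine_live_flag(name: str = "engine_live_flag_demo"):
    shm = shared_memory.SharedMemory(create=True, size=1, name=name)
    shm.buf[0] = 1  # 1 means the inference engine is up
    return shm      # keep a reference so the flag lives as long as the process

def engine_is_alive(name: str = "engine_live_flag_demo") -> bool:
    try:
        shm = shared_memory.SharedMemory(name=name)
    except FileNotFoundError:
        return False
    alive = shm.buf[0] == 1
    shm.close()
    return alive
```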

llm/server/server/server/engine/token_processor.py

+3-6
@@ -105,13 +105,12 @@ def process_sampling_results(self):
         except Exception as e:
             model_server_logger.info("while get input_data error: {0} {1}".format(e, str(traceback.format_exc())))

-    def postprocess(self, batch_result, exist_finished_task=False):
+    def postprocess(self, batch_result):
         """
         single post-processing function

         Args:
             batch_result (list): batch results
-            exist_finished_task (bool): whether there is a finished task
         """
         result_dir = "./generate_token_results"
         if not os.path.exists(result_dir):
@@ -218,7 +217,6 @@ def _process_batch_output(self):
         accept_num = tokens[2 : batch + 2]

         batch_result = list()
-        exist_finished_task = False
         for i in range(batch):
             if self.resource_manager.stop_flags[i]:
                 continue
@@ -253,11 +251,10 @@
                     f"Speculate accept ratio: {1 - self.total_step * 1.0 / self.number_of_output_tokens}"
                     f" total step: {self.total_step}. total_output_token_num: {self.number_of_output_tokens}"
                 )
-                exist_finished_task = True
                 break
             batch_result.append(result)

-        self.postprocess(batch_result, exist_finished_task)
+        self.postprocess(batch_result)


 class WarmUpTokenProcessor(TokenProcessor):
@@ -270,7 +267,7 @@ def __init__(self, cfg):
         self._is_running = True
         self._is_blocking = True

-    def postprocess(self, batch_result, exist_finished_task=False):
+    def postprocess(self, batch_result):
         pass

     def process_sampling_results(self):
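Since postprocess drops the unused exist_finished_task parameter here and in the Triton processor below, any custom subclass that overrides it must switch to the one-argument form. A minimal, hypothetical override, assuming the TokenProcessor base class from this diff:

```python
# Hypothetical subclass shown only to illustrate the new one-argument signature.
class LoggingTokenProcessor(TokenProcessor):
    def postprocess(self, batch_result):
        # exist_finished_task is gone; the batch itself is the only input now.
        for result in batch_result:
            print("generated result for request:", result.get("req_id"))
```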

llm/server/server/server/triton_server.py

+2-2
@@ -111,12 +111,12 @@ def _cache_special_tokens(self, batch_result):
                 ["req_id"]] + batch_result[i]["token_scores"]
             del self.score_buffer[batch_result[i]["req_id"]]

-    def postprocess(self, batch_result, exist_finished_task=False):
+    def postprocess(self, batch_result):
         """
         single postprocess for triton
         """
         try:
-            self._cache_special_tokens(batch_result)
+            # self._cache_special_tokens(batch_result)
             self.cached_generated_tokens.put(batch_result)
         except Exception as e:
             model_server_logger.info(
