[LLM] add port check and fix bugs #10195

Merged
merged 1 commit on Mar 19, 2025
1 change: 1 addition & 0 deletions llm/server/dockerfiles/Dockerfile_serving_cuda118_cudnn8
@@ -8,6 +8,7 @@ RUN python3 -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.or
&& python3 -m pip install --no-cache-dir --force-reinstall https://paddle-qa.bj.bcebos.com/paddlenlp/wheel/2f85a64edd4aa9911c94ccb5ce53e83ac41ce22b/paddlenlp-3.0.0b3.post20250123-py3-none-any.whl \
&& python3 -m pip install --no-cache-dir --force-reinstall https://paddlepaddle-inference-banchmark.bj.bcebos.com/paddlenlp_ops-0.0.0-py3-none-any.whl \
&& python3 -m pip install --no-cache-dir sentencepiece pycryptodome tritonclient[all]==2.41.1 \
&& apt update && apt install net-tools \
&& apt-get clean && rm -rf /var/lib/apt/lists/*

# clone paddle & paddlenlp source code (the code version should match the versions installed above)
1 change: 1 addition & 0 deletions llm/server/dockerfiles/Dockerfile_serving_cuda124_cudnn9
@@ -8,6 +8,7 @@ RUN python3 -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.or
&& python3 -m pip install --no-cache-dir --force-reinstall https://paddle-qa.bj.bcebos.com/paddlenlp/wheel/2f85a64edd4aa9911c94ccb5ce53e83ac41ce22b/paddlenlp-3.0.0b3.post20250123-py3-none-any.whl \
&& python3 -m pip install --no-cache-dir --force-reinstall https://paddlepaddle-inference-banchmark.bj.bcebos.com/paddlenlp_ops-0.0.0-py3-none-any.whl \
&& python3 -m pip install --no-cache-dir sentencepiece pycryptodome tritonclient[all]==2.41.1 \
&& apt update && apt install net-tools \
&& apt-get clean && rm -rf /var/lib/apt/lists/*

# clone paddle & paddlenlp source code (the code version should match the versions installed above)
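
Both Dockerfiles gain net-tools, which supplies the netstat binary used by the new port check in start_server.sh. A quick way to confirm the tool made it into a freshly built image (the image tag below is illustrative, not one defined by this PR):

```bash
# Hypothetical tag: substitute whatever -t value you passed to docker build.
docker run --rm --entrypoint netstat llm-serving:cuda124-cudnn9 --version
```
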
1 change: 1 addition & 0 deletions llm/server/docs/deploy_usage_tutorial.md
@@ -416,6 +416,7 @@ docker build --network=host -f ./dockerfiles/Dockerfile_serving_cuda124_cudnn9 -
| USE_CACHE_KV_INT8 | int | Whether to use INT8 as the KV Cache type | No | 0 | Must be set to 1 for c8 quantized models |
| MODEL_DIR | str | Path to the model files | No | /models/ | |
| model_name | str | Model name | No | None | Used to support downloading static-graph models; see the documentation for the specific names (#./static_models.md) |
| OUTPUT_LOG_TO_CONSOLE | str | Whether to redirect server output to log/console.log | No | 0 | See the usage sketch after this section |


## Recommended GPU memory-related parameters
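
The OUTPUT_LOG_TO_CONSOLE switch documented in the table above is consumed by start_server.sh (see the script diff below). A minimal usage sketch, assuming the server is launched through that script from the server working directory (the path is illustrative):

```bash
# Assumption: run from the directory that contains the log/ folder.
export OUTPUT_LOG_TO_CONSOLE=1    # redirect tritonserver output into log/console.log
bash scripts/start_server.sh
```
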
28 changes: 24 additions & 4 deletions llm/server/server/scripts/start_server.sh
@@ -55,6 +55,21 @@ export SERVICE_GRPC_PORT=${GRPC_PORT:-${SERVICE_GRPC_PORT:-"8811"}}
export INTER_PROC_PORT=${INTER_QUEUE_PORT:-${INTER_PROC_PORT:-"8813"}}
export SERVICE_HTTP_PORT=${PUSH_MODE_HTTP_PORT:-${SERVICE_HTTP_PORT:-"9965"}}

check_port_occupied() {
local port=$1
if netstat -tuln | grep -q ":${port}\b"; then
echo "PORT: ${port} occupied! Please change the port!"
exit 1
fi
}

check_port_occupied ${HEALTH_HTTP_PORT}
check_port_occupied ${METRICS_HTTP_PORT}
check_port_occupied ${SERVICE_GRPC_PORT}
check_port_occupied ${INTER_PROC_PORT}
check_port_occupied ${SERVICE_HTTP_PORT}



if [ ! -d "llm_model" ];then
ln -s /opt/source/PaddleNLP/llm/server/server/llm_model llm_model
@@ -83,15 +98,20 @@ else
sleep ${SERVER_WAITTING_TIME:-"25"}
fi



tritonserver --exit-timeout-secs 100000 --cuda-memory-pool-byte-size 0:0 --cuda-memory-pool-byte-size 1:0 \
OUTPUT_LOG_TO_CONSOLE=${OUTPUT_LOG_TO_CONSOLE:-"0"}
# Set the log redirection based on whether logs should be output to the console
LOG_REDIRECT=""
# If OUTPUT_LOG_TO_CONSOLE is set to "1", redirect logs to the console log file
if [ "$OUTPUT_LOG_TO_CONSOLE" == "1" ]; then
LOG_REDIRECT="> log/console.log 2>&1"
fi
eval tritonserver --exit-timeout-secs 100000 --cuda-memory-pool-byte-size 0:0 --cuda-memory-pool-byte-size 1:0 \
--cuda-memory-pool-byte-size 2:0 --cuda-memory-pool-byte-size 3:0 --cuda-memory-pool-byte-size 4:0 \
--cuda-memory-pool-byte-size 5:0 --cuda-memory-pool-byte-size 6:0 --cuda-memory-pool-byte-size 7:0 \
--pinned-memory-pool-byte-size 0 --model-repository llm_model/ \
--allow-http false \
--grpc-port=${SERVICE_GRPC_PORT} \
--metrics-port=${METRICS_HTTP_PORT} \
--log-file log/server.log --log-info true &
--log-file log/server.log --log-info true $LOG_REDIRECT &

echo "The logs for the model service, please check" ${PWD}"/log/server.log and "${PWD}"/log/workerlog.0"
4 changes: 3 additions & 1 deletion llm/server/server/server/data/processor.py
@@ -183,7 +183,9 @@ def process_response(self, response_dict, **kwargs):
response_dict["usage"] = {"completion_tokens" : response_dict["send_idx"] + 1}

if is_end:
response_dict["tokens_all"] = self.clear_request_status(req_id)
self.clear_request_status(req_id)
token_ids = response_dict.get("tokens_all_ids", [])
response_dict["tokens_all"] = self.ids2tokens(token_ids, response_dict["req_id"])
return response_dict

def text2ids(self, text):
2 changes: 1 addition & 1 deletion llm/server/server/server/engine/infer.py
@@ -614,7 +614,7 @@ def run(self):
engine_healthy_recorded_time_array,
) = self.initialize_engine_healthy_recorded_time_flag()
engine_healthy_recorded_time_array[0] = time.time()
# infer_live_flag_shm = self.initialize_engine_live_flag()
infer_live_flag_shm = self.initialize_engine_live_flag()
infer_seed_increment = paddle.full(shape=[self.args.max_batch_size, 1], fill_value=4, dtype="int64")
# thread_executor = ThreadPoolExecutor(max_workers=1)
real_bsz = None
9 changes: 3 additions & 6 deletions llm/server/server/server/engine/token_processor.py
@@ -105,13 +105,12 @@ def process_sampling_results(self):
except Exception as e:
model_server_logger.info("while get input_data error: {0} {1}".format(e, str(traceback.format_exc())))

def postprocess(self, batch_result, exist_finished_task=False):
def postprocess(self, batch_result):
"""
single post-processing function

Args:
batch_result (list): batch results
exist_finished_task (bool): whether there is a finished task
"""
result_dir = "./generate_token_results"
if not os.path.exists(result_dir):
@@ -218,7 +217,6 @@ def _process_batch_output(self):
accept_num = tokens[2 : batch + 2]

batch_result = list()
exist_finished_task = False
for i in range(batch):
if self.resource_manager.stop_flags[i]:
continue
@@ -253,11 +251,10 @@ def _process_batch_output(self):
f"Speculate accept ratio: {1 - self.total_step * 1.0 / self.number_of_output_tokens}"
f" total step: {self.total_step}. total_output_token_num: {self.number_of_output_tokens}"
)
exist_finished_task = True
break
batch_result.append(result)

self.postprocess(batch_result, exist_finished_task)
self.postprocess(batch_result)


class WarmUpTokenProcessor(TokenProcessor):
@@ -270,7 +267,7 @@ def __init__(self, cfg):
self._is_running = True
self._is_blocking = True

def postprocess(self, batch_result, exist_finished_task=False):
def postprocess(self, batch_result):
pass

def process_sampling_results(self):
4 changes: 2 additions & 2 deletions llm/server/server/server/triton_server.py
@@ -111,12 +111,12 @@ def _cache_special_tokens(self, batch_result):
["req_id"]] + batch_result[i]["token_scores"]
del self.score_buffer[batch_result[i]["req_id"]]

def postprocess(self, batch_result, exist_finished_task=False):
def postprocess(self, batch_result):
"""
single postprocess for triton
"""
try:
self._cache_special_tokens(batch_result)
# self._cache_special_tokens(batch_result)
self.cached_generated_tokens.put(batch_result)
except Exception as e:
model_server_logger.info(