Skip to content

Commit 0a5b6ec

Browse files
authored
K8s: Improve Node checks for liveness probe and preStop hook (#2661)
Signed-off-by: Viet Nguyen Duc <[email protected]>
1 parent 5648285 commit 0a5b6ec

File tree

2 files changed

+34
-15
lines changed

2 files changed

+34
-15
lines changed

Diff for: charts/selenium-grid/configs/node/nodePreStop.sh

+22 -9
Original file line number | Diff line number | Diff line change
@@ -46,12 +46,15 @@ function signal_hub_to_drain_node() {
4646
BASIC_AUTH="$(echo -en "${SE_ROUTER_USERNAME}:${SE_ROUTER_PASSWORD}" | base64 -w0)"
4747
if [ -n "${grid_url}" ]; then
4848
if [ "${grid_check}" = "401" ]; then
49-
echo "$(date -u +"${ts_format}") [${probe_name}] - Hub/Router requires authentication. Please check SE_ROUTER_USERNAME and SE_ROUTER_PASSWORD."
49+
echo "$(date -u +"${ts_format}") [${probe_name}] - Hub/Router requires authentication. Please check env vars SE_ROUTER_USERNAME and SE_ROUTER_PASSWORD are given."
5050
elif [ "${grid_check}" = "404" ]; then
5151
echo "$(date -u +"${ts_format}") [${probe_name}] - Hub/Router endpoint could not be found. Please check the endpoint ${grid_url}"
52+
elif [ "${grid_check}" = "200" ]; then
53+
echo "$(date -u +"${ts_format}") [${probe_name}] - Hub/Router endpoint is reachable. Signaling Hub/Router to drain node"
54+
curl --noproxy "*" -m ${max_time} -k -X POST -H "Authorization: Basic ${BASIC_AUTH}" ${grid_url}/se/grid/distributor/node/${NODE_ID}/drain --header "${HEADERS}"
55+
else
56+
echo "$(date -u +"${ts_format}") [${probe_name}] - Hub/Router endpoint returns ${grid_check}. Skip signaling upstream."
5257
fi
53-
echo "$(date -u +"${ts_format}") [${probe_name}] - Signaling Hub/Router to drain node"
54-
curl --noproxy "*" -m ${max_time} -k -X POST -H "Authorization: Basic ${BASIC_AUTH}" ${grid_url}/se/grid/distributor/node/${NODE_ID}/drain --header "${HEADERS}"
5558
else
5659
echo "$(date -u +"${ts_format}") [${probe_name}] - There is no configured HUB/ROUTER host or SE_NODE_GRID_URL isn't set. ${probe_name} ignores to send drain request to upstream."
5760
fi
@@ -63,28 +66,38 @@ function signal_node_to_drain() {
6366
}
6467

6568
if curl --noproxy "*" -m ${max_time} -sfk ${SE_SERVER_PROTOCOL}://127.0.0.1:${SE_NODE_PORT}/status > ${tmp_node_file}; then
66-
NODE_ID=$(jq -r '.value.node.nodeId' ${tmp_node_file} || "")
69+
NODE_ID=$(jq -r '.value.node.nodeId' ${tmp_node_file} || echo "")
6770
if [ -n "${NODE_ID}" ]; then
6871
echo "$(date -u +"${ts_format}") [${probe_name}] - Current Node ID is: ${NODE_ID}"
6972
signal_hub_to_drain_node
7073
echo
7174
fi
7275
signal_node_to_drain
7376
# Wait for the current session to be finished if any
77+
check_attempts=0
7478
while true; do
7579
# Attempt the cURL request and capture the exit status
76-
endpoint_http_code=$(curl --noproxy "*" --retry ${retry_time} -m ${max_time} -sfk ${SE_SERVER_PROTOCOL}://127.0.0.1:${SE_NODE_PORT}/status -o ${tmp_node_file} -w "%{http_code}")
80+
endpoint_http_code=$(curl --noproxy "*" --retry ${retry_time} -m ${max_time} -sfk "${SE_SERVER_PROTOCOL}://127.0.0.1:${SE_NODE_PORT}/status" -o "${tmp_node_file}" -w "%{http_code}")
7781
endpoint_status=$?
7882
echo "$(date -u +"${ts_format}") [${probe_name}] - Fetch the Node status via cURL with exit status: ${endpoint_status}, HTTP code: ${endpoint_http_code}"
7983

80-
SLOT_HAS_SESSION=$(jq -e ".value.node.slots[]|select(.session != null).id.id" ${tmp_node_file} | tr -d '"' || "")
81-
if [ -z "${SLOT_HAS_SESSION}" ]; then
84+
SLOT_HAS_SESSION=$(jq -r '[.value.node.slots[]? | select(.session != null)] | length' "${tmp_node_file}" || echo 0)
85+
if [ "${SLOT_HAS_SESSION}" -eq 0 ] && [ "${endpoint_http_code}" = "200" ]; then
8286
echo "$(date -u +"${ts_format}") [${probe_name}] - There is no session running. Node is ready to be terminated."
83-
echo "$(date -u +"${ts_format}") [${probe_name}] - $(cat ${tmp_node_file} || "")"
87+
echo "$(date -u +"${ts_format}") [${probe_name}] - $(cat "${tmp_node_file}" || echo "")"
8488
echo
8589
exit 0
90+
elif [ "${endpoint_http_code}" != "200" ]; then
91+
echo "$(date -u +"${ts_format}") [${probe_name}] - Node endpoint returned status ${endpoint_http_code}, attempt one more time to confirm the status."
92+
check_attempts=$((check_attempts+1))
93+
if [ ${check_attempts} -ge 3 ]; then
94+
echo "$(date -u +"${ts_format}") [${probe_name}] - Node endpoint returned status ${endpoint_http_code} for serveral times. Assume that Node is ready to be terminated."
95+
exit 0
96+
fi
97+
sleep 2;
8698
else
87-
echo "$(date -u +"${ts_format}") [${probe_name}] - Node ${probe_name} is waiting for current session on slot ${SLOT_HAS_SESSION} to be finished. Node details: message: $(jq -r '.value.message' ${tmp_node_file} || "unknown"), availability: $(jq -r '.value.node.availability' ${tmp_node_file} || "unknown")"
99+
check_attempts=0
100+
echo "$(date -u +"${ts_format}") [${probe_name}] - Node is waiting for ${SLOT_HAS_SESSION} session(s) to be finished. Node details: message: $(jq -r '.value.message' "${tmp_node_file}" || echo "unknown"), availability: $(jq -r '.value.node.availability' "${tmp_node_file}" || echo "unknown")"
88101
sleep 2;
89102
fi
90103

Diff for: charts/selenium-grid/configs/node/nodeProbe.sh

+12 -6
Original file line number | Diff line number | Diff line change
@@ -27,11 +27,12 @@ function help_message() {
2727
echo "$(date -u +"${ts_format}") [${probe_name}] - If you believe Node is registered successfully but probe still report this message and fail for a long time. Workaround by set 'global.seleniumGrid.defaultNodeStartupProbe' to 'httpGet' and report us an issue for Chart improvement with your scenario."
2828
}
2929

30-
if curl --noproxy "*" -m ${max_time} -sfk ${SE_SERVER_PROTOCOL}://127.0.0.1:${SE_NODE_PORT}/status -o ${tmp_node_file}; then
31-
NODE_ID=$(jq -r '.value.node.nodeId' ${tmp_node_file} || "")
32-
NODE_STATUS=$(jq -r '.value.node.availability' ${tmp_node_file} || "")
30+
if curl --noproxy "*" -m ${max_time} -sfk "${SE_SERVER_PROTOCOL}://127.0.0.1:${SE_NODE_PORT}/status" -o "${tmp_node_file}"; then
31+
NODE_ID=$(jq -r '.value.node.nodeId' "${tmp_node_file}" || echo "")
32+
NODE_STATUS=$(jq -r '.value.node.availability' "${tmp_node_file}" || echo "")
33+
SLOT_HAS_SESSION=$(jq -r '[.value.node.slots[]? | select(.session != null)] | length' "${tmp_node_file}" || echo 0)
3334
if [ -n "${NODE_ID}" ]; then
34-
echo "$(date -u +"${ts_format}") [${probe_name}] - Node responds the ID: ${NODE_ID} with status: ${NODE_STATUS}"
35+
echo "$(date -u +"${ts_format}") [${probe_name}] - Node responds the ID: ${NODE_ID} with status: ${NODE_STATUS}. Number of ongoing sessions: ${SLOT_HAS_SESSION}"
3536
else
3637
echo "$(date -u +"${ts_format}") [${probe_name}] - Wait for the Node to report its status"
3738
exit 1
@@ -52,15 +53,20 @@ if curl --noproxy "*" -m ${max_time} -sfk ${SE_SERVER_PROTOCOL}://127.0.0.1:${SE
5253
echo "$(date -u +"${ts_format}") [${probe_name}] - There is no configured HUB/ROUTER host or SE_NODE_GRID_URL isn't set. ${probe_name} will not work as expected."
5354
fi
5455

55-
curl --noproxy "*" -m ${max_time} -H "Authorization: Basic ${BASIC_AUTH}" -sfk "${grid_url}/status" -o ${tmp_grid_file}
56-
GRID_NODE_ID=$(jq -e ".value.nodes[].id|select(. == \"${NODE_ID}\")" ${tmp_grid_file} | tr -d '"' || "")
56+
endpoint_http_code=$(curl --noproxy "*" -m ${max_time} -H "Authorization: Basic ${BASIC_AUTH}" -sfk "${grid_url}/status" -o "${tmp_grid_file}" -w "%{http_code}")
57+
GRID_NODE_ID=$(jq -e ".value.nodes[]?.id|select(. == \"${NODE_ID}\")" "${tmp_grid_file}" | tr -d '"' || echo "")
5758
if [ -n "${GRID_NODE_ID}" ]; then
5859
echo "$(date -u +"${ts_format}") [${probe_name}] - Grid responds a matched Node ID: ${GRID_NODE_ID}"
60+
elif [ "${endpoint_http_code}" != "200" ]; then
61+
echo "$(date -u +"${ts_format}") [${probe_name}] - Grid endpoint returns ${endpoint_http_code}. Skip checking upstream."
5962
fi
6063

6164
if [ -n "${NODE_ID}" ] && [ -n "${GRID_NODE_ID}" ] && [ "${NODE_ID}" = "${GRID_NODE_ID}" ]; then
6265
echo "$(date -u +"${ts_format}") [${probe_name}] - Node ID: ${NODE_ID} is found in the Grid. Node is ready."
6366
exit 0
67+
elif [ -n "${NODE_ID}" ] && [ "${endpoint_http_code}" != "200" ]; then
68+
echo "$(date -u +"${ts_format}") [${probe_name}] - Node ID: ${NODE_ID} report its status, but could not double check ID in Hub. Assume that Node is ready."
69+
exit 0
6470
else
6571
echo "$(date -u +"${ts_format}") [${probe_name}] - Node ID: ${NODE_ID} is not found in the Grid. Node is not ready."
6672
exit 1

0 commit comments

Comments
 (0)