Skip to content

Commit 89c16ee

Browse files
authored
Improve cluster liveness check (#6709)
1 parent 2cb5973 commit 89c16ee

File tree

1 file changed

+20
-1
lines changed

1 file changed

+20
-1
lines changed

ydb/tests/olap/lib/ydb_cluster.py

Lines changed: 20 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -36,7 +36,7 @@ def _get_cluster_nodes(cls):
3636
# headers['Authorization'] = token
3737
data = requests.get(url, headers=headers).json()
3838
nodes = data.get('Nodes', [])
39-
nodes_count = data.get('TotalNodes', len(nodes))
39+
nodes_count = int(data.get('TotalNodes', len(nodes)))
4040
return nodes, nodes_count
4141
except Exception as e:
4242
LOGGER.error(e)
@@ -120,6 +120,25 @@ def execute_single_result_query(cls, query, timeout=10):
120120
@allure.step('Check if YDB alive')
121121
def check_if_ydb_alive(cls, timeout=10):
122122
try:
123+
nodes, node_count = cls._get_cluster_nodes()
124+
if node_count == 0:
125+
return False
126+
if len(nodes) < node_count:
127+
LOGGER.error(f"{node_count - len(nodes)} nodes from {node_count} don't live")
128+
return False
129+
for n in nodes:
130+
ss = n.get('SystemState', {})
131+
name = ss.get("Host")
132+
start_time = int(ss.get('StartTime', int(time()) * 1000)) / 1000
133+
uptime = int(time()) - start_time
134+
if uptime < 15:
135+
LOGGER.error(f'Node {name} too yong: {uptime}')
136+
return False
137+
# if 'MemoryUsed' in ss and 'MemoryLimit' in ss:
138+
# used = int(ss['MemoryUsed'])
139+
# limit = int(ss['MemoryLimit'])
140+
# if used > 0.9 * limit:
141+
# LOGGER.error(f'Node {name} use too many rss: {used} from {limit}')
123142
cls.execute_single_result_query("select 1", timeout)
124143
return True
125144
except BaseException as ex:

0 commit comments

Comments
 (0)