
Commit c4270ff

test: fix cluster broken case (#390)
1 parent 334c283 commit c4270ff

4 files changed: +208 -110 lines changed


.github/workflows/test.yaml (+5 -5)

@@ -20,15 +20,14 @@ defaults:
 jobs:
   main:
     name: Main
-    timeout-minutes: 10
+    timeout-minutes: 15
     runs-on: ubuntu-latest
     strategy:
       fail-fast: false
       max-parallel: 10
       matrix:
         include:
           - {redis: '7.2', ruby: '3.3'}
-          - {task: test_cluster_broken, restart: 'no', startup: '6'}
           - {redis: '7.2', ruby: '3.3', compose: compose.ssl.yaml}
           - {redis: '7.2', ruby: '3.3', driver: 'hiredis'}
           - {redis: '7.2', ruby: '3.3', driver: 'hiredis', compose: compose.ssl.yaml}
@@ -38,14 +37,15 @@ jobs:
           - {task: test_cluster_state, pattern: 'ScaleReadRandom', compose: compose.valkey.yaml, redis: '8', replica: '2', startup: '9'}
           - {task: test_cluster_state, pattern: 'ScaleReadRandomWithPrimary', compose: compose.valkey.yaml, redis: '8', replica: '2', startup: '9'}
           - {task: test_cluster_state, pattern: 'ScaleReadLatency', compose: compose.valkey.yaml, redis: '8', replica: '2', startup: '9'}
-          - {ruby: 'jruby'}
-          - {ruby: 'truffleruby'}
-          - {task: test_cluster_down}
           - {redis: '8', ruby: '3.3', compose: compose.valkey.yaml, replica: '2'}
           - {redis: '7.2', ruby: '3.2', compose: compose.auth.yaml}
           - {redis: '7.0', ruby: '3.1'}
           - {redis: '6.2', ruby: '3.0'}
           - {redis: '5.0', ruby: '2.7'}
+          - {task: test_cluster_down}
+          - {task: test_cluster_broken, restart: 'no', startup: '6'}
+          - {ruby: 'jruby'}
+          - {ruby: 'truffleruby'}
           - {task: test_cluster_scale, pattern: 'Single', compose: compose.scale.yaml, startup: '8'}
           - {task: test_cluster_scale, pattern: 'Pipeline', compose: compose.scale.yaml, startup: '8'}
           - {task: test_cluster_scale, pattern: 'Transaction', compose: compose.scale.yaml, startup: '8'}

test/cluster_controller.rb (+30 -30)

@@ -94,7 +94,7 @@ def failover
     replica_info = rows.find { |row| row.primary_id == primary_info.id }

     wait_replication_delay(@clients, replica_size: @replica_size, timeout: @timeout)
-    replica_info.client.call('CLUSTER', 'FAILOVER', 'TAKEOVER')
+    replica_info.client.call_once('CLUSTER', 'FAILOVER', 'TAKEOVER')
     wait_failover(
       @clients,
       primary_node_key: primary_info.node_key,
@@ -117,24 +117,24 @@ def start_resharding(slot:, src_node_key:, dest_node_key:)
     dest_host, dest_port = dest_info.node_key.split(':')

     # @see https://redis.io/commands/cluster-setslot/#redis-cluster-live-resharding-explained
-    dest_client.call('CLUSTER', 'SETSLOT', slot, 'IMPORTING', src_node_id)
-    src_client.call('CLUSTER', 'SETSLOT', slot, 'MIGRATING', dest_node_id)
+    dest_client.call_once('CLUSTER', 'SETSLOT', slot, 'IMPORTING', src_node_id)
+    src_client.call_once('CLUSTER', 'SETSLOT', slot, 'MIGRATING', dest_node_id)

     db_idx = '0'
     timeout_msec = @timeout.to_i * 1000

-    number_of_keys = src_client.call('CLUSTER', 'COUNTKEYSINSLOT', slot)
-    keys = src_client.call('CLUSTER', 'GETKEYSINSLOT', slot, number_of_keys)
+    number_of_keys = src_client.call_once('CLUSTER', 'COUNTKEYSINSLOT', slot)
+    keys = src_client.call_once('CLUSTER', 'GETKEYSINSLOT', slot, number_of_keys)
     print_debug("#{src_client.config.host}:#{src_client.config.port} => #{dest_client.config.host}:#{dest_client.config.port} ... #{keys}")
     return if keys.empty?

     begin
-      src_client.call('MIGRATE', dest_host, dest_port, '', db_idx, timeout_msec, 'KEYS', *keys)
+      src_client.call_once('MIGRATE', dest_host, dest_port, '', db_idx, timeout_msec, 'KEYS', *keys)
     rescue ::RedisClient::CommandError => e
       raise unless e.message.start_with?('IOERR')

       # retry once
-      src_client.call('MIGRATE', dest_host, dest_port, '', db_idx, timeout_msec, 'REPLACE', 'KEYS', *keys)
+      src_client.call_once('MIGRATE', dest_host, dest_port, '', db_idx, timeout_msec, 'REPLACE', 'KEYS', *keys)
     end

     wait_replication_delay(@clients, replica_size: @replica_size, timeout: @timeout)
@@ -151,7 +151,7 @@ def finish_resharding(slot:, src_node_key:, dest_node_key:)
     rest = rows.reject { |r| r.replica? || r.client.equal?(src) || r.client.equal?(dest) }.map(&:client)

     ([dest, src] + rest).each do |cli|
-      cli.call('CLUSTER', 'SETSLOT', slot, 'NODE', id)
+      cli.call_once('CLUSTER', 'SETSLOT', slot, 'NODE', id)
       print_debug("#{cli.config.host}:#{cli.config.port} ... CLUSTER SETSLOT #{slot} NODE #{id}")
     rescue ::RedisClient::CommandError => e
       raise unless e.message.start_with?('ERR Please use SETSLOT only with masters.')
@@ -174,12 +174,12 @@ def scale_out(primary_url:, replica_url:)
     @shard_size += 1
     @number_of_replicas = @replica_size * @shard_size

-    primary.call('CLUSTER', 'MEET', target_host, target_port)
-    replica.call('CLUSTER', 'MEET', target_host, target_port)
+    primary.call_once('CLUSTER', 'MEET', target_host, target_port)
+    replica.call_once('CLUSTER', 'MEET', target_host, target_port)
     wait_meeting(@clients, max_attempts: @max_attempts)

-    primary_id = primary.call('CLUSTER', 'MYID')
-    replica.call('CLUSTER', 'REPLICATE', primary_id)
+    primary_id = primary.call_once('CLUSTER', 'MYID')
+    replica.call_once('CLUSTER', 'REPLICATE', primary_id)
     save_config(@clients)
     wait_for_cluster_to_be_ready(skip_clients: [primary, replica])

@@ -213,16 +213,16 @@ def scale_in
     threads = @clients.map do |cli|
       Thread.new(cli) do |c|
         c.pipelined do |pi|
-          pi.call('CLUSTER', 'FORGET', replica_info.id)
-          pi.call('CLUSTER', 'FORGET', primary_info.id)
+          pi.call_once('CLUSTER', 'FORGET', replica_info.id)
+          pi.call_once('CLUSTER', 'FORGET', primary_info.id)
         end
       rescue ::RedisClient::Error
         # ignore
       end
     end
     threads.each(&:join)
-    replica.call('CLUSTER', 'RESET', 'SOFT')
-    primary.call('CLUSTER', 'RESET', 'SOFT')
+    replica.call_once('CLUSTER', 'RESET', 'SOFT')
+    primary.call_once('CLUSTER', 'RESET', 'SOFT')
     @clients.reject! { |c| c.equal?(primary) || c.equal?(replica) }
     @shard_size -= 1
     @number_of_replicas = @replica_size * @shard_size
@@ -266,7 +266,7 @@ def close

   def flush_all_data(clients)
     clients.each do |c|
-      c.call('FLUSHALL')
+      c.call_once('FLUSHALL')
       print_debug("#{c.config.host}:#{c.config.port} ... FLUSHALL")
     rescue ::RedisClient::CommandError, ::RedisClient::ReadOnlyError
       # READONLY You can't write against a read only replica.
@@ -277,7 +277,7 @@ def flush_all_data(clients)

   def reset_cluster(clients)
     clients.each do |c|
-      c.call('CLUSTER', 'RESET', 'HARD')
+      c.call_once('CLUSTER', 'RESET', 'HARD')
       print_debug("#{c.config.host}:#{c.config.port} ... CLUSTER RESET HARD")
     rescue ::RedisClient::ConnectionError => e
       print_debug("#{c.config.host}:#{c.config.port} ... CLUSTER RESET HARD: #{e.class}: #{e.message}")
@@ -294,15 +294,15 @@ def assign_slots(clients, shard_size:)
     slot_idx = 0
     primaries.zip(slot_sizes).each do |c, s|
       slot_range = slot_idx..slot_idx + s - 1
-      c.call('CLUSTER', 'ADDSLOTS', *slot_range.to_a)
+      c.call_once('CLUSTER', 'ADDSLOTS', *slot_range.to_a)
       slot_idx += s
       print_debug("#{c.config.host}:#{c.config.port} ... CLUSTER ADDSLOTS #{slot_range.to_a}")
     end
   end

   def save_config_epoch(clients)
     clients.each_with_index do |c, i|
-      c.call('CLUSTER', 'SET-CONFIG-EPOCH', i + 1)
+      c.call_once('CLUSTER', 'SET-CONFIG-EPOCH', i + 1)
       print_debug("#{c.config.host}:#{c.config.port} ... CLUSTER SET-CONFIG-EPOCH #{i + 1}")
     rescue ::RedisClient::CommandError
       # ERR Node config epoch is already non-zero
@@ -315,7 +315,7 @@ def meet_each_other(clients)
     rows = parse_cluster_nodes(rows)
     target_host, target_port = rows.first.node_key.split(':')
     clients.drop(1).each do |c|
-      c.call('CLUSTER', 'MEET', target_host, target_port)
+      c.call_once('CLUSTER', 'MEET', target_host, target_port)
       print_debug("#{c.config.host}:#{c.config.port} ... CLUSTER MEET #{target_host}:#{target_port}")
     end
   end
@@ -335,19 +335,19 @@ def replicate(clients, shard_size:, replica_size:)
     replicas = take_replicas(clients, shard_size: shard_size)

     replicas.each_slice(replica_size).each_with_index do |subset, i|
-      primary_id = primaries[i].call('CLUSTER', 'MYID')
+      primary_id = primaries[i].call_once('CLUSTER', 'MYID')

       loop do
         begin
           subset.each do |replica|
-            replica.call('CLUSTER', 'REPLICATE', primary_id)
+            replica.call_once('CLUSTER', 'REPLICATE', primary_id)
             print_debug("#{replica.config.host}:#{replica.config.port} ... CLUSTER REPLICATE #{primaries[i].config.host}:#{primaries[i].config.port}")
           end
         rescue ::RedisClient::CommandError => e
           print_debug(e.message)
           # ERR Unknown node [node-id]
           sleep SLEEP_SEC
-          primary_id = primaries[i].call('CLUSTER', 'MYID')
+          primary_id = primaries[i].call_once('CLUSTER', 'MYID')
           next
         end

@@ -358,7 +358,7 @@ def replicate(clients, shard_size:, replica_size:)

   def save_config(clients)
     clients.each do |c|
-      c.call('CLUSTER', 'SAVECONFIG')
+      c.call_once('CLUSTER', 'SAVECONFIG')
       print_debug("#{c.config.host}:#{c.config.port} ... CLUSTER SAVECONFIG")
     end
   end
@@ -412,7 +412,7 @@ def wait_cluster_recovering(clients, max_attempts:, skip_clients: [])
     key = 0
     wait_for_state(clients, max_attempts: max_attempts) do |client|
       print_debug("#{client.config.host}:#{client.config.port} ... GET #{key}")
-      client.call('GET', key) if primary_client?(client) && !skip_clients.include?(client)
+      client.call_once('GET', key) if primary_client?(client) && !skip_clients.include?(client)
       true
     rescue ::RedisClient::CommandError => e
       if e.message.start_with?('CLUSTERDOWN')
@@ -443,11 +443,11 @@ def wait_for_state(clients, max_attempts:)
   end

   def hashify_cluster_info(client)
-    client.call('CLUSTER', 'INFO').split("\r\n").to_h { |v| v.split(':') }
+    client.call_once('CLUSTER', 'INFO').split("\r\n").to_h { |v| v.split(':') }
   end

   def fetch_cluster_nodes(client)
-    client.call('CLUSTER', 'NODES').split("\n").map(&:split)
+    client.call_once('CLUSTER', 'NODES').split("\n").map(&:split)
   end

   def associate_with_clients_and_nodes(clients)
@@ -502,11 +502,11 @@ def take_replicas(clients, shard_size:)
   end

   def primary_client?(client)
-    client.call('ROLE').first == 'master'
+    client.call_once('ROLE').first == 'master'
   end

   def replica_client?(client)
-    client.call('ROLE').first == 'slave'
+    client.call_once('ROLE').first == 'slave'
   end

   def print_debug(msg)
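
Throughout this file the commit swaps RedisClient#call for #call_once on one-shot cluster administration commands. The sketch below illustrates the difference, assuming the retry semantics documented by the redis-client gem; the host, port, reconnect count, and key are placeholder values, not part of this test suite.

require 'redis_client'

# Client configured to reconnect automatically after network errors.
config = RedisClient.config(host: '127.0.0.1', port: 6379, reconnect_attempts: 3)
client = config.new_client

# Idempotent read: if the connection drops, the client may reconnect and
# transparently re-issue the command, which is harmless here.
client.call('GET', 'key')

# Topology-changing command: an automatic retry could send it a second time
# against a node whose state has already moved on, so #call_once issues it
# at most once and surfaces the connection error instead of retrying.
client.call_once('CLUSTER', 'FAILOVER', 'TAKEOVER')

If the controller's clients enable reconnect_attempts, #call_once keeps commands such as CLUSTER FAILOVER, SETSLOT, MIGRATE, FORGET, and RESET from being silently re-issued after a dropped connection.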
