|
1 | 1 | # frozen_string_literal: true
|
2 | 2 |
|
3 | 3 | require 'testing_helper'
|
| 4 | +require 'securerandom' |
4 | 5 |
|
5 | 6 | class TestAgainstClusterBroken < TestingWrapper
|
6 | 7 | WAIT_SEC = 3
|
@@ -34,8 +35,55 @@ def test_a_primary_is_down
|
34 | 35 | do_test_a_node_is_down(sacrifice, number_of_keys: 10)
|
35 | 36 | end
|
36 | 37 |
|
# Checks that a command for a key served by a killed primary still succeeds
# once the cluster has failed over.
def test_reloading_on_connection_error
  doomed_primary = @controller.select_sacrifice_of_primary
  # Pick a key that is served by the node we are about to kill.
  key = generate_key_for_node(doomed_primary)
  @client.call('SET', key, 'foobar1')

  # Take the node offline and wait until it is no longer listed as a primary.
  kill_a_node_and_wait_for_failover(doomed_primary)

  # The client still believes the key lives on the dead node, so this write
  # first hits a broken connection; it is expected to reload the cluster
  # topology and then succeed against the new owner of the slot.
  reply = @client.call('SET', key, 'foobar2')
  assert_equal 'OK', reply
end
| 51 | + |
# Checks that a WATCH-ed transaction whose node dies mid-flight is retried
# and completes on the second run of the block.
def test_transaction_retry_on_connection_error
  doomed_primary = @controller.select_sacrifice_of_primary
  # Pick a key that is served by the node we are about to kill.
  key = generate_key_for_node(doomed_primary)
  @client.call('SET', key, 'foobar1')

  block_runs = 0
  # Open the transaction; the node is taken down from inside the block, i.e.
  # after the WATCH has already been sent.
  result = @client.multi(watch: [key]) do |txn|
    # Only the very first run of the block kills the node.
    if block_runs.zero?
      kill_a_node_and_wait_for_failover(doomed_primary)
    end
    block_runs += 1
    txn.call('SET', key, 'foobar2')
  end

  # Exactly one retry is expected: the block ran twice and the second run
  # committed the write.
  assert_equal ['OK'], result
  assert_equal 'foobar2', @client.call('GET', key)
  assert_equal 2, block_runs
end
| 72 | + |
37 | 73 | private
|
38 | 74 |
|
# Generates a random key whose hash slot is served by the given node.
#
# The node's own id (CLUSTER MYID) is matched against the first node entry
# of every CLUSTER SLOTS range, so only ranges for which the node is listed
# as primary are considered.
#
# @param conn a connection to a single cluster node; must respond to
#   +call+ with CLUSTER MYID / CLUSTER SLOTS / CLUSTER KEYSLOT
# @return [String] a random hex key living in one of the node's slots
# @raise [RuntimeError] if the node is not the primary of any slot range
#   (previously this case looped forever)
def generate_key_for_node(conn)
  # Figure out a slot on the sacrifice node, and a key in that slot.
  conn_id = conn.call('CLUSTER', 'MYID')
  # Keep the owned slots as Ranges instead of expanding them into one big
  # Array: membership checks stay cheap however many slots the node owns.
  conn_slots = conn.call('CLUSTER', 'SLOTS')
                   .select { |res| res[2][2] == conn_id }
                   .map { |res| (res[0]..res[1]) }
  # Without this guard the search below could never terminate.
  raise "Node #{conn_id} does not own any slots" if conn_slots.empty?

  loop do
    test_key = SecureRandom.hex
    slot = conn.call('CLUSTER', 'KEYSLOT', test_key)
    return test_key if conn_slots.any? { |range| range.cover?(slot) }
  end
end
| 86 | + |
39 | 87 | def wait_for_replication
|
40 | 88 | client_side_timeout = TEST_TIMEOUT_SEC + 1.0
|
41 | 89 | server_side_timeout = (TEST_TIMEOUT_SEC * 1000).to_i
|
@@ -78,6 +126,23 @@ def kill_a_node(sacrifice, kill_attempts:)
|
78 | 126 | assert_raises(::RedisClient::ConnectionError) { sacrifice.call('PING') }
|
79 | 127 | end
|
80 | 128 |
|
# Kills the given node and then blocks until another node's CLUSTER SLOTS
# output no longer lists it as the primary of any slot range.
#
# @param sacrifice the node connection to kill
# @return [nil]
# @raise [RuntimeError] if the node is still listed as a primary after
#   31 polls, with a one second sleep after each unsuccessful poll
def kill_a_node_and_wait_for_failover(sacrifice)
  # Any surviving node will do as the observer of the cluster state.
  observer = @controller.clients.find { |client| client != sacrifice }
  # The id has to be read before the node goes away.
  dead_node_id = sacrifice.call('CLUSTER', 'MYID')
  kill_a_node(sacrifice, kill_attempts: 10)

  31.times do
    # Done as soon as no slot range names the killed node as its primary.
    slot_ranges = observer.call('CLUSTER', 'SLOTS')
    return if slot_ranges.none? { |entry| entry[2][2] == dead_node_id }

    sleep 1
  end

  raise 'Timed out waiting for failover in kill_a_node_and_wait_for_failover'
end
| 145 | + |
81 | 146 | def wait_for_cluster_to_be_ready(wait_attempts:)
|
82 | 147 | loop do
|
83 | 148 | break if wait_attempts <= 0 || @client.call('PING') == 'PONG'
|
|
0 commit comments