Skip to content

Commit 384f3e3

Browse files
committed
Start of system test framework.
Scripts and runs system tests across the cluster in accordance with the user's localconfig.py. Includes a few very small tests for now. One tests a simple recovery of one object. Others test recovery when recovery masters fail. This patch also includes a way to push configuration options to the coordinator. For now it is used just to set testing options (that is, to ask the coordinator to crash some recovery masters).
1 parent 90660fd commit 384f3e3

33 files changed

+874
-28
lines changed

bindings/python/ramcloud.py

+8
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,11 @@ def from_param(param):
159159
ctypes.c_uint32, ctypes.c_uint32]
160160
so.rc_testing_fill.restype = status
161161

162+
so.rc_testing_set_runtime_option.argtypes = [client,
163+
ctypes.c_char_p,
164+
ctypes.c_char_p]
165+
so.rc_testing_set_runtime_option.restype = status
166+
162167
so.rc_testing_wait_for_all_tablets_normal.argtypes = [client]
163168
so.rc_testing_wait_for_all_tablets_normal.restype = None
164169

@@ -319,6 +324,9 @@ def testing_fill(self, table_id, id, object_count, object_size):
319324
object_count, object_size)
320325
self.handle_error(s)
321326

327+
def testing_set_runtime_option(self, option, value):
328+
so.rc_testing_set_runtime_option(self.client, option, value)
329+
322330
def testing_wait_for_all_tablets_normal(self):
323331
so.rc_testing_wait_for_all_tablets_normal(self.client)
324332

scripts/cluster.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -345,8 +345,9 @@ def __enter__(self):
345345
self.sandbox.__enter__()
346346
return self
347347

348-
def __exit__(self, exc_type, exc_value, exc_tb):
348+
def __exit__(self, exc_type=None, exc_value=None, exc_tb=None):
    """Leave the context by forwarding the exit to the sandbox.

    All three exception arguments default to ``None`` so the method can
    also be called directly without an active exception. The sandbox's
    own return value is ignored.
    """
    self.sandbox.__exit__(exc_type, exc_value, exc_tb)
    # Always report "not handled" so that any exception keeps propagating.
    return False
350351

351352
def run(
352353
num_servers=4, # Number of hosts on which to start

scripts/rawmetrics.py

+8-7
Original file line numberDiff line numberDiff line change
@@ -286,7 +286,6 @@ def dump_metric_info_code(self, out, path, counter):
286286
rpc.metric('enlistServerCount', 'number of invocations of ENLIST_SERVER RPC')
287287
rpc.metric('getServerListCount', 'number of invocations of GET_SERVER_LIST RPC')
288288
rpc.metric('getTabletMapCount', 'number of invocations of GET_TABLET_MAP RPC')
289-
rpc.metric('getServerStatistics', 'number of invocations of GET_SERVER_STATISTICS RPC')
290289
rpc.metric('recoverCount', 'number of invocations of RECOVER RPC')
291290
rpc.metric('hintServerDownCount', 'number of invocations of HINT_SERVER_DOWN RPC')
292291
rpc.metric('recoveryMasterFinishedCount', 'number of invocations of RECOVERY_MASTER_FINISHED_RPC')
@@ -310,14 +309,16 @@ def dump_metric_info_code(self, out, path, counter):
310309
rpc.metric('dropTabletOwnershipCount', 'number of invocations of DROP_TABLET_OWNERSHIP RPC')
311310
rpc.metric('takeTabletOwnershipCount', 'number of invocations of TAKE_TABLET_OWNERSHIP RPC')
312311
rpc.metric('backupAssignGroupCount', 'number of invocations of BACKUP_ASSIGN_GROUP RPC')
313-
rpc.metric('incrementRpcCount', 'number of invocations of INCREMENT RPC')
314-
rpc.metric('splitTabletRpcCount', 'number of invocations of SPLIT_TABLET RPC')
315312
rpc.metric('getHeadOfLogCount', 'number of invocations of GET_HEAD_OF_LOG RPC')
313+
rpc.metric('incrementRpcCount', 'number of invocations of INCREMENT RPC')
316314
rpc.metric('prepForMigrationCount', 'number of invocations of PREP_FOR_MIGRATION RPC')
317315
rpc.metric('receiveMigrationDataCount', 'number of invocations of RECEIVE_MIGRATION_DATA RPC')
318316
rpc.metric('reassignTabletOwnershipCount', 'number of invocations of REASSIGN_TABLET_OWNERSHIP RPC')
319317
rpc.metric('migrateTabletCount', 'number of invocations of MIGRATE_TABLET RPC')
320318
rpc.metric('isReplicaNeededCount', 'number of invocations of IS_REPLICA_NEEDED_RPC')
319+
rpc.metric('splitTabletCount', 'number of invocations of SPLIT_TABLET')
320+
rpc.metric('getServerStatisticsCount', 'number of invocations of GET_SERVER_STATISTICS')
321+
rpc.metric('setRuntimeOptionCount', 'number of invocations of SET_RUNTIME_OPTION')
321322
rpc.metric('illegalRpcCount', 'number of invocations of RPCs with illegal opcodes')
322323

323324
rpc.metric('rpc0Ticks', 'time spent executing RPC 0 (undefined)')
@@ -339,8 +340,6 @@ def dump_metric_info_code(self, out, path, counter):
339340
rpc.metric('enlistServerTicks', 'time spent executing ENLIST_SERVER RPC')
340341
rpc.metric('getServerListTicks', 'time spent executing GET_SERVER_LIST RPC')
341342
rpc.metric('getTabletMapTicks', 'time spent executing GET_TABLET_MAP RPC')
342-
rpc.metric('getServerStatisticsTicks', 'time spent executing GET_SERVER_STATISTICS RPC')
343-
rpc.metric('setTabletsTicks', 'time spent executing SET_TABLETS RPC')
344343
rpc.metric('recoverTicks', 'time spent executing RECOVER RPC')
345344
rpc.metric('hintServerDownTicks', 'time spent executing HINT_SERVER_DOWN RPC')
346345
rpc.metric('recoveryMasterFinishedTicks', 'time spent executing RECOVERY_MASTER_FINISHED RPC')
@@ -364,14 +363,16 @@ def dump_metric_info_code(self, out, path, counter):
364363
rpc.metric('dropTabletOwnershipTicks', 'number of invocations of DROP_TABLET_OWNERSHIP RPC')
365364
rpc.metric('takeTabletOwnershipTicks', 'number of invocations of TAKE_TABLET_OWNERSHIP RPC')
366365
rpc.metric('backupAssignGroupTicks', 'time spent executing BACKUP_ASSIGN_GROUP RPC')
367-
rpc.metric('incrementTicks', 'time spent executing INCREMENT RPC')
368-
rpc.metric('splitTabletTicks', 'time spent executing SPLIT_TABLET RPC')
369366
rpc.metric('getHeadOfLogTicks', 'time spent executing GET_HEAD_OF_LOG RPC')
367+
rpc.metric('incrementTicks', 'time spent executing INCREMENT RPC')
370368
rpc.metric('prepForMigrationTicks', 'time spent executing PREP_FOR_MIGRATION RPC')
371369
rpc.metric('receiveMigrationDataTicks', 'time spent executing RECEIVE_MIGRATION_DATA RPC')
372370
rpc.metric('reassignTabletOwnershipTicks', 'time spent executing REASSIGN_TABLET_OWNERSHIP RPC')
373371
rpc.metric('migrateTabletTicks', 'time spent executing MIGRATE_TABLET RPC')
374372
rpc.metric('isReplicaNeededTicks', 'time spent executing IS_REPLICA_NEEDED_RPC')
373+
rpc.metric('splitTabletTicks', 'time spent executing SPLIT_TABLET RPC')
374+
rpc.metric('getServerStatisticsTicks', 'time spent executing GET_SERVER_STATISTICS RPC')
375+
rpc.metric('setRuntimeOptionTicks', 'time spent executing SET_RUNTIME_OPTION RPC')
375376
rpc.metric('illegalRpcTicks', 'time spent executing RPCs with illegal opcodes')
376377

377378
transmit = Group('Transmit', 'metrics related to transmitting messages')

src/CRamCloud.cc

+13
Original file line numberDiff line numberDiff line change
@@ -273,6 +273,19 @@ rc_testing_fill(struct rc_client* client, uint64_t tableId,
273273
return STATUS_OK;
274274
}
275275

276+
/**
 * C-linkage wrapper that forwards a runtime-option update to the wrapped
 * client's testingSetRuntimeOption().
 *
 * \param client
 *      Handle whose wrapped client object issues the request.
 * \param option
 *      Name of the option to set.
 * \param value
 *      New value for the option, encoded as a string.
 * \return
 *      STATUS_OK if the call completed; otherwise the status carried by
 *      the ClientException that was caught.
 */
Status
rc_testing_set_runtime_option(struct rc_client* client,
                              const char* option,
                              const char* value)
{
    Status outcome = STATUS_OK;
    try {
        client->client->testingSetRuntimeOption(option, value);
    } catch (const ClientException& failure) {
        // Exceptions must not cross the C boundary; report the status.
        outcome = failure.status;
    }
    return outcome;
}
288+
276289
void
277290
rc_testing_wait_for_all_tablets_normal(struct rc_client* client)
278291
{

src/CRamCloud.h

+3-1
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,9 @@ RAMCloud::Status rc_testing_kill(struct rc_client* client, uint64_t tableId,
6666
RAMCloud::Status rc_testing_fill(struct rc_client* client, uint64_t tableId,
6767
const char* key, uint16_t keyLength,
6868
uint32_t objectCount, uint32_t objectSize);
69-
69+
RAMCloud::Status rc_testing_set_runtime_option(struct rc_client* client,
70+
const char* option,
71+
const char* value);
7072
void rc_testing_wait_for_all_tablets_normal(struct rc_client* client);
7173

7274
#ifdef __cplusplus

src/CoordinatorClient.cc

+28
Original file line numberDiff line numberDiff line change
@@ -446,6 +446,34 @@ CoordinatorClient::sendServerList(ServerId destination)
446446
checkStatus(HERE);
447447
}
448448

449+
/**
450+
* Sets a runtime option field on the coordinator to the indicated value.
451+
*
452+
* \param option
453+
* String name which corresponds to a member field in the RuntimeOptions
454+
* class (e.g. "failRecoveryMasters") whose value should be replaced with
455+
* the given value.
456+
* \param value
457+
* String which can be parsed into the type of the field indicated by
458+
* \a option. The format is specific to the type of each field but is
459+
* generally either a single value (e.g. "10", "word") or a collection
460+
* separated by spaces (e.g. "1 2 3", "first second"). See RuntimeOptions
461+
* for more information.
462+
*/
463+
void
464+
CoordinatorClient::setRuntimeOption(const char* option, const char* value)
465+
{
466+
Buffer req, resp;
467+
SetRuntimeOptionRpc::Request& reqHdr(
468+
allocHeader<SetRuntimeOptionRpc>(req));
469+
reqHdr.optionLength = downCast<uint32_t>(strlen(option) + 1);
470+
reqHdr.valueLength = downCast<uint32_t>(strlen(value) + 1);
471+
Buffer::Chunk::appendToBuffer(&req, option, reqHdr.optionLength);
472+
Buffer::Chunk::appendToBuffer(&req, value, reqHdr.valueLength);
473+
sendRecv<SetRuntimeOptionRpc>(session, req, resp);
474+
checkStatus(HERE);
475+
}
476+
449477
CoordinatorClient::SetMinOpenSegmentId::SetMinOpenSegmentId(
450478
CoordinatorClient& client,
451479
ServerId serverId,

src/CoordinatorClient.h

+1
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ class CoordinatorClient : public Client {
6666
bool successful);
6767
void setWill(uint64_t masterId, const ProtoBuf::Tablets& will);
6868
void sendServerList(ServerId destination);
69+
void setRuntimeOption(const char* option, const char* value);
6970

7071
class SetMinOpenSegmentId {
7172
public:

src/CoordinatorService.cc

+30-1
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,8 @@ CoordinatorService::CoordinatorService(Context& context)
3535
, nextTableId(0)
3636
, nextTableMasterIdx(0)
3737
, nextReplicationId(1)
38-
, recoveryManager(context, serverList, tabletMap)
38+
, runtimeOptions()
39+
, recoveryManager(context, serverList, tabletMap, &runtimeOptions)
3940
, forceServerDownForTesting(false)
4041
{
4142
recoveryManager.start();
@@ -95,6 +96,10 @@ CoordinatorService::dispatch(RpcOpcode opcode,
9596
callHandler<SendServerListRpc, CoordinatorService,
9697
&CoordinatorService::sendServerList>(rpc);
9798
break;
99+
case SetRuntimeOptionRpc::opcode:
100+
callHandler<SetRuntimeOptionRpc, CoordinatorService,
101+
&CoordinatorService::setRuntimeOption>(rpc);
102+
break;
98103
case ReassignTabletOwnershipRpc::opcode:
99104
callHandler<ReassignTabletOwnershipRpc, CoordinatorService,
100105
&CoordinatorService::reassignTabletOwnership>(rpc);
@@ -635,6 +640,30 @@ CoordinatorService::sendServerList(
635640
sendServerList(id);
636641
}
637642

643+
/**
644+
* Sets a runtime option field on the coordinator to the indicated value.
645+
* See CoordinatorClient::setRuntimeOption() for details.
646+
*
647+
* \copydetails Service::ping
648+
*/
649+
void
650+
CoordinatorService::setRuntimeOption(const SetRuntimeOptionRpc::Request& reqHdr,
651+
SetRuntimeOptionRpc::Response& respHdr,
652+
Rpc& rpc)
653+
{
654+
const char* option = getString(rpc.requestPayload, sizeof(reqHdr),
655+
reqHdr.optionLength);
656+
const char* value = getString(rpc.requestPayload,
657+
downCast<uint32_t>(sizeof(reqHdr) +
658+
reqHdr.optionLength),
659+
reqHdr.valueLength);
660+
try {
661+
runtimeOptions.set(option, value);
662+
} catch (const std::out_of_range& e) {
663+
respHdr.common.status = STATUS_OBJECT_DOESNT_EXIST;
664+
}
665+
}
666+
638667
/**
639668
* Assign a new replicationId to a backup, and inform the backup which nodes
640669
* are in its replication group.

src/CoordinatorService.h

+11
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
#include "RawMetrics.h"
2626
#include "Recovery.h"
2727
#include "Rpc.h"
28+
#include "RuntimeOptions.h"
2829
#include "ServerId.h"
2930
#include "Service.h"
3031
#include "TabletMap.h"
@@ -92,6 +93,9 @@ class CoordinatorService : public Service {
9293
void sendServerList(const SendServerListRpc::Request& reqHdr,
9394
SendServerListRpc::Response& respHdr,
9495
Rpc& rpc);
96+
void setRuntimeOption(const SetRuntimeOptionRpc::Request& reqHdr,
97+
SetRuntimeOptionRpc::Response& respHdr,
98+
Rpc& rpc);
9599
// - helper methods -
96100
bool assignReplicationGroup(uint64_t replicationId,
97101
const vector<ServerId>& replicationGroupIds);
@@ -151,6 +155,13 @@ class CoordinatorService : public Service {
151155
*/
152156
uint64_t nextReplicationId;
153157

158+
/**
159+
* Contains coordinator configuration options which can be modified while
160+
* the cluster is running. Currently mostly used for setting debugging
161+
* or testing parameters.
162+
*/
163+
RuntimeOptions runtimeOptions;
164+
154165
/**
155166
* Handles all master recovery details on behalf of the coordinator.
156167
*/

src/CoordinatorServiceTest.cc

+12
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
*/
1515

1616
#include "TestUtil.h"
17+
#include "ClientException.h"
1718
#include "CoordinatorClient.h"
1819
#include "CoordinatorService.h"
1920
#include "MasterService.h"
@@ -614,6 +615,17 @@ TEST_F(CoordinatorServiceTest, sendServerList_service) {
614615
"entries (version number 2)"));
615616
}
616617

618+
TEST_F(CoordinatorServiceTest, setRuntimeOption) {
    // A space-separated value should arrive as three queued entries.
    client->setRuntimeOption("failRecoveryMasters", "1 2 3");
    RuntimeOptions& options = service->runtimeOptions;
    ASSERT_EQ(3u, options.failRecoveryMasters.size());

    // Entries are popped in the order they were given; a fourth pop is
    // expected to yield 0.
    EXPECT_EQ(1u, options.popFailRecoveryMasters());
    EXPECT_EQ(2u, options.popFailRecoveryMasters());
    EXPECT_EQ(3u, options.popFailRecoveryMasters());
    EXPECT_EQ(0u, options.popFailRecoveryMasters());

    // An unrecognized option name surfaces as a client-side exception.
    EXPECT_THROW(client->setRuntimeOption("BAD", "1 2 3"),
                 ObjectDoesntExistException);
}
628+
617629
TEST_F(CoordinatorServiceTest, setMinOpenSegmentId) {
618630
EXPECT_THROW(client->setMinOpenSegmentId(ServerId(2, 2), 100),
619631
ClientException);

src/MakefragCoordinator

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
COORDINATOR_SRCFILES := \
22
src/CoordinatorService.cc \
3-
src/TabletMap.cc \
43
src/MasterRecoveryManager.cc \
4+
src/TabletMap.cc \
5+
src/RuntimeOptions.cc \
56
$(NULL)
67

78
COORDINATOR_OBJFILES := $(COORDINATOR_SRCFILES)

src/MakefragTest

+1
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ TESTS_SRCFILES := \
5959
src/ReplicaManagerTest.cc \
6060
src/ReplicatedSegmentTest.cc \
6161
src/RpcTest.cc \
62+
src/RuntimeOptionsTest.cc \
6263
src/SegmentTest.cc \
6364
src/SegmentIteratorTest.cc \
6465
src/ServerTest.cc \

src/MasterClient.cc

+14-2
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,21 @@ RejectRules defaultRejectRules;
2929
/**
3030
* Fill a master server with the given number of objects, each of the
3131
* same given size. Objects are added to all tables in the master in
32-
* a round-robin fashion.
32+
* a round-robin fashion. This method exists simply to quickly fill a
33+
* master for experiments.
3334
*
34-
* This method exists simply to quickly fill a master for experiments.
35+
* See MasterClient::fillWithTestData() for more information.
36+
*
37+
* \bug Will return an error if the master only owns part of a table
38+
* (because the hash of the fabricated keys may land in a region it
39+
* doesn't own).
40+
*
41+
* \param numObjects
42+
* Total number of objects to add to the server.
43+
* \param objectSize
44+
* Bytes of garbage data to place in each object not including the
45+
* key (the keys are ASCII strings starting with "0" and increasing
46+
* numerically in each table).
3547
*/
3648
void
3749
MasterClient::fillWithTestData(uint32_t numObjects, uint32_t objectSize)

src/MasterRecoveryManager.cc

+9-1
Original file line numberDiff line numberDiff line change
@@ -28,12 +28,17 @@ namespace RAMCloud {
2828
* Authoritative list of all servers in the system and their details.
2929
* \param tabletMap
3030
* Authoritative information about tablets and their mapping to servers.
31+
* \param runtimeOptions
32+
* Configuration options which are stored by the coordinator.
33+
* May be NULL for testing.
3134
*/
3235
MasterRecoveryManager::MasterRecoveryManager(Context& context,
3336
CoordinatorServerList& serverList,
34-
TabletMap& tabletMap)
37+
TabletMap& tabletMap,
38+
RuntimeOptions* runtimeOptions)
3539
: serverList(serverList)
3640
, tabletMap(tabletMap)
41+
, runtimeOptions(runtimeOptions)
3742
, thread()
3843
, waitingRecoveries()
3944
, activeRecoveries()
@@ -226,6 +231,9 @@ class MaybeStartRecoveryTask : public Task {
226231
"another recovery is active for the same ServerId",
227232
recovery->crashedServerId.getId());
228233
} else {
234+
if (mgr.runtimeOptions)
235+
recovery->testingFailRecoveryMasters =
236+
mgr.runtimeOptions->popFailRecoveryMasters();
229237
recovery->schedule();
230238
mgr.activeRecoveries[recovery->getRecoveryId()] = recovery;
231239
mgr.waitingRecoveries.pop();

src/MasterRecoveryManager.h

+10-1
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
#include "CoordinatorServerList.h"
2222
#include "ProtoBuf.h"
2323
#include "Recovery.h"
24+
#include "RuntimeOptions.h"
2425
#include "ServerTracker.h"
2526
#include "TabletMap.h"
2627
#include "Tub.h"
@@ -51,7 +52,8 @@ class MasterRecoveryManager : public Recovery::Owner
5152
PUBLIC:
5253
MasterRecoveryManager(Context& context,
5354
CoordinatorServerList& serverList,
54-
TabletMap& tabletMap);
55+
TabletMap& tabletMap,
56+
RuntimeOptions* runtimeOptions);
5557
~MasterRecoveryManager();
5658

5759
void start();
@@ -78,6 +80,13 @@ class MasterRecoveryManager : public Recovery::Owner
7880
/// Authoritative information about tablets and their mapping to servers.
7981
TabletMap& tabletMap;
8082

83+
/**
84+
* Contains coordinator configuration options which can be modified while
85+
* the cluster is running. Currently mostly used for setting debugging
86+
* or testing parameters.
87+
*/
88+
RuntimeOptions* runtimeOptions;
89+
8190
/**
8291
* Drives recoveries; wakes up whenever new recoveries are waiting
8392
* or active recoveries have new work to complete.

src/MasterRecoveryManagerTest.cc

+1-1
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ struct MasterRecoveryManagerTest : public ::testing::Test {
3434
: context()
3535
, serverList(context)
3636
, tabletMap()
37-
, mgr(context, serverList, tabletMap)
37+
, mgr(context, serverList, tabletMap, NULL)
3838
{
3939
Logger::get().setLogLevels(RAMCloud::SILENT_LOG_LEVEL);
4040
}

0 commit comments

Comments
 (0)