Skip to content

Commit fbd71be

Browse files
MB-64910 MB-61292: Change behavior of bucket deks callbacks
Change #1: set_active_key for buckets should treat enoent and not_supported as "bucket not found". When bucket is on disk, but not in memcached (e.g. when cluster membership is inactiveAdded or inactiveFailed), we can't push keys to memcached. If we treat it as error (behavior before this this change), we won't be able to modify encryption-at-rest settings because cb_cluster_secrets update_bucket_deks status will show error (issues list will not be empty). At the same time it seems ok to treat as ok, because memcached is not encrypting any data in this bucket, so it doesn't need new keys. When bucket is activated (e.g. we add node back to the cluster), ns_memcached will push actual keys to memcached in create_bucket. Change #2: Treat not_found in set_active_key as ok, but only when ns_memcached process doesn't exist before set_active_key attempt. This is important in order to avoid races when set_active_key and create_bucket are called in parallel. Basically the following scenario: 1. (process1) ns_memcached fetches old keys 2. (process2) set_active_dek is called (and gets not_found) 3. (process1) ns_memcached creates the bucket with old keys 4. (process1) ns_memcached crashes 5. (process2) we check if ns_memcached is running and return ok 6. Bucket is created with old keys Change #3: get_dek_id_in_use should return not_found when bucket doesn't exist or when memcached returns not_supported. Reasoning is the same as in change #1. Basically when there is no bucket in memcached, we should assume that all current deks are still in use and don't drop anything. The goal of the change is to not treat it as error basically, because it leads to the situation when we can't modify encryption-at-rest settings. Change-Id: I63cc3e2d7ddbadf5f5866c662858c0dd2d81b270 Reviewed-on: https://review.couchbase.org/c/ns_server/+/223510 Tested-by: Timofey Barmin <[email protected]> Well-Formed: Build Bot <[email protected]> Reviewed-by: Navdeep S Boparai <[email protected]>
1 parent 6371382 commit fbd71be

File tree

2 files changed

+71
-10
lines changed

2 files changed

+71
-10
lines changed

apps/ns_server/src/cb_cluster_secrets.erl

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1562,6 +1562,9 @@ garbage_collect_deks(Kind, Force, #state{deks = DeksInfo} = State) ->
15621562
%% The entity that uses deks does not exist.
15631563
%% Ignoring it here because we assume that deks will
15641564
%% be removed by maybe_update_deks
1565+
%% It is possible that bucket exists on disk, but not in
1566+
%% memcached. In this case it is important to not remove
1567+
%% any deks here. It is not an error either.
15651568
{ok, State};
15661569
{succ, {error, Reason}} ->
15671570
{error, State, Reason};

apps/ns_server/src/ns_memcached.erl

Lines changed: 68 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1886,7 +1886,39 @@ set_tls_config(Config) ->
18861886

18871887
set_active_dek_for_bucket(Bucket, _ActiveDek) ->
18881888
{ok, DeksSnapshot} = cb_crypto:fetch_deks_snapshot({bucketDek, Bucket}),
1889-
set_active_dek(Bucket, DeksSnapshot).
1889+
NsMemcachedExists = (whereis(server(Bucket)) =/= undefined),
1890+
case set_active_dek(Bucket, DeksSnapshot) of
1891+
ok -> ok;
1892+
{error, not_found} -> %% bucket does not exist
1893+
case NsMemcachedExists of
1894+
true ->
1895+
%% ns_memcached is running, we should retry when bucket
1896+
%% is created
1897+
{error, retry};
1898+
false ->
1899+
%% ns_memcached was not running before the call, so it is
1900+
%% safe to return ok
1901+
%% If bucket needs to be created, ns_memcached will create
1902+
%% it and use newest keys. If bucket doesn't need to be
1903+
%% created (e.g. it is failed over), there is no need to
1904+
%% notify memcached at all.
1905+
%% Note that it is important that we check if ns_memcached
1906+
%% is running before calling set_active_dek, otherwise it is
1907+
%% possible that ns_memcached creates the bucket with old
1908+
%% keys:
1909+
%% 1. ns_memcached fetches old keys
1910+
%% 2. this process calls set_active_dek (and gets not_found)
1911+
%% 3. ns_memcached creates the bucket with old keys
1912+
%% 4. ns_memcached crashes
1913+
%% 5. we check if ns_memcached is running and returning ok
1914+
?log_debug("Ignoring not_found when setting encryption "
1915+
"keys for bucket ~p (bucket doesn't seem to "
1916+
"exist in memcached)", [Bucket]),
1917+
ok
1918+
end;
1919+
{error, E} ->
1920+
{error, E}
1921+
end.
18901922

18911923
set_active_dek(TypeOrBucket, DeksSnapshot) ->
18921924
?log_debug("Setting active encryption key id for ~p: ~p...",
@@ -1905,10 +1937,24 @@ set_active_dek(TypeOrBucket, DeksSnapshot) ->
19051937
?log_debug("Setting encryption key for ~p succeeded",
19061938
[TypeOrBucket]),
19071939
ok;
1908-
{error, couldnt_connect_to_memcached} -> {error, retry};
1940+
{error, couldnt_connect_to_memcached} ->
1941+
?log_debug("Setting encryption key for ~p failed: "
1942+
"couldnt_connect_to_memcached", [TypeOrBucket]),
1943+
{error, retry};
19091944
%% It can happen during start, when bucket is not created yet
1910-
{error, {key_enoent, undefined}} -> {error, retry};
1911-
{error, {not_supported, undefined}} -> {error, retry};
1945+
{error, {key_enoent, undefined}} ->
1946+
?log_debug("Setting encryption key for ~p failed: "
1947+
"key_enoent", [TypeOrBucket]),
1948+
{error, not_found};
1949+
{error, {not_supported, undefined}} ->
1950+
?log_debug("Setting encryption key for ~p failed: "
1951+
"not_supported", [TypeOrBucket]),
1952+
%% memcached returns not supported for buckets
1953+
%% that exist on other nodes, but not on this node
1954+
%% (ns_server uploads terse bucket info for such
1955+
%% buckets), so from encryption perspective
1956+
%% it is the same as when bucket does not exist
1957+
{error, not_found};
19121958
{error, E} ->
19131959
?log_error("Setting encryption key for ~p failed: ~p",
19141960
[TypeOrBucket, E]),
@@ -1920,27 +1966,39 @@ sanitize_in_use_keys(InUseKeys) ->
19201966
(K) -> K
19211967
end, InUseKeys).
19221968

1923-
get_dek_ids_in_use(BucketOrType, StatsFn) ->
1969+
get_dek_ids_in_use(Bucket, StatsFn) ->
19241970
RV = perform_very_long_call(
19251971
fun (Sock) ->
19261972
case mc_binary:quick_stats(Sock, <<"encryption-key-ids">>,
19271973
StatsFn, []) of
19281974
{ok, Ids} ->
19291975
{reply, {ok, Ids}};
1976+
{memcached_error, not_supported, _} ->
1977+
%% memcached returns not supported for buckets
1978+
%% that exist on other nodes, but not on this node
1979+
%% (ns_server uploads terse bucket info for such
1980+
%% buckets), so from encryption perspective
1981+
%% it is the same as when bucket does not exist
1982+
?log_debug("Get deks in use for ~p returned "
1983+
"not_supported", [Bucket]),
1984+
{reply, {error, not_found}};
19301985
{memcached_error, Error, Msg} ->
19311986
?log_error("Failed to get dek ids in use for "
1932-
"~p: ~p", [BucketOrType, {Error, Msg}]),
1987+
"~p: ~p", [Bucket, {Error, Msg}]),
19331988
{reply, {error, Error}}
19341989
end
1935-
end, BucketOrType),
1990+
end, Bucket),
19361991

19371992
case RV of
19381993
{ok, _} -> RV;
1939-
{error, couldnt_connect_to_memcached} -> {error, retry};
1994+
{error, couldnt_connect_to_memcached} ->
1995+
{error, retry};
19401996
%% It can happen during start, when bucket is not created yet
19411997
{error, {select_bucket_failed,
1942-
{memcached_error, key_enoent, undefined}}} -> {error, retry};
1943-
{error, E} -> {error, E}
1998+
{memcached_error, key_enoent, undefined}}} ->
1999+
{error, not_found};
2000+
{error, E} ->
2001+
{error, E}
19442002
end.
19452003

19462004
get_dek_ids_in_use(Type) when Type =:= "@audit"; Type =:= "@logs" ->

0 commit comments

Comments
 (0)