3
3
#include < ydb/core/testlib/test_client.h>
4
4
#include < ydb/public/lib/deprecated/kicli/kicli.h>
5
5
6
+ #include < ydb/core/mind/hive/hive_events.h>
6
7
#include < ydb/core/node_whiteboard/node_whiteboard.h>
7
8
#include < ydb/core/blobstorage/base/blobstorage_events.h>
8
9
#include < ydb/core/tx/schemeshard/schemeshard.h>
@@ -68,7 +69,8 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
68
69
69
70
struct TTestVSlotInfo {
70
71
std::optional<NKikimrBlobStorage::EVDiskStatus> Status;
71
- ui32 Generation;
72
+ ui32 Generation = DEFAULT_GROUP_GENERATION;
73
+ NKikimrBlobStorage::EDriveStatus PDiskStatus = NKikimrBlobStorage::ACTIVE;
72
74
73
75
TTestVSlotInfo (std::optional<NKikimrBlobStorage::EVDiskStatus> status = NKikimrBlobStorage::READY,
74
76
ui32 generation = DEFAULT_GROUP_GENERATION)
@@ -77,7 +79,11 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
77
79
{
78
80
}
79
81
80
- TTestVSlotInfo (NKikimrBlobStorage::EVDiskStatus status) : Status(status), Generation(DEFAULT_GROUP_GENERATION) {}
82
+ TTestVSlotInfo (NKikimrBlobStorage::EVDiskStatus status, NKikimrBlobStorage::EDriveStatus pDiskStatus = NKikimrBlobStorage::ACTIVE)
83
+ : Status(status)
84
+ , PDiskStatus(pDiskStatus)
85
+ {
86
+ }
81
87
};
82
88
83
89
using TVDisks = TVector<TTestVSlotInfo>;
@@ -222,18 +228,20 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
222
228
entry->mutable_info ()->set_name (STORAGE_POOL_NAME);
223
229
}
224
230
225
- void AddPDisksToSysViewResponse (NSysView::TEvSysView::TEvGetPDisksResponse::TPtr* ev, size_t count , double occupancy) {
231
+ void AddPDisksToSysViewResponse (NSysView::TEvSysView::TEvGetPDisksResponse::TPtr* ev, const TVDisks& vslots , double occupancy) {
226
232
auto & record = (*ev)->Get ()->Record ;
227
233
auto entrySample = record.entries (0 );
228
234
record.clear_entries ();
229
235
auto pdiskId = PDISK_START_ID;
230
236
const size_t totalSize = 3'200'000'000'000ull ;
231
- for (size_t i = 0 ; i < count; ++i) {
237
+ const auto *descriptor = NKikimrBlobStorage::EDriveStatus_descriptor ();
238
+ for (const auto & vslot : vslots) {
232
239
auto * entry = record.add_entries ();
233
240
entry->CopyFrom (entrySample);
234
241
entry->mutable_key ()->set_pdiskid (pdiskId);
235
242
entry->mutable_info ()->set_totalsize (totalSize);
236
243
entry->mutable_info ()->set_availablesize ((1 - occupancy) * totalSize);
244
+ entry->mutable_info ()->set_statusv2 (descriptor->FindValueByNumber (vslot.PDiskStatus )->name ());
237
245
++pdiskId;
238
246
}
239
247
}
@@ -482,7 +490,7 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
482
490
}
483
491
case NSysView::TEvSysView::EvGetPDisksResponse: {
484
492
auto * x = reinterpret_cast <NSysView::TEvSysView::TEvGetPDisksResponse::TPtr*>(&ev);
485
- AddPDisksToSysViewResponse (x, vdisks. size () , occupancy);
493
+ AddPDisksToSysViewResponse (x, vdisks, occupancy);
486
494
break ;
487
495
}
488
496
case NSysView::TEvSysView::EvGetGroupsResponse: {
@@ -710,6 +718,14 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
710
718
CheckHcResultHasIssuesWithStatus (result, " STORAGE_GROUP" , Ydb::Monitoring::StatusFlag::RED, 1 );
711
719
}
712
720
721
+ Y_UNIT_TEST (YellowIssueReadyVDisksOnFaultyPDisks) {
722
+ auto result = RequestHcWithVdisks (NKikimrBlobStorage::TGroupStatus::PARTIAL, TVDisks{3 , {NKikimrBlobStorage::READY, NKikimrBlobStorage::FAULTY}});
723
+ Cerr << result.ShortDebugString () << Endl;
724
+ CheckHcResultHasIssuesWithStatus (result, " STORAGE_GROUP" , Ydb::Monitoring::StatusFlag::YELLOW, 1 );
725
+ CheckHcResultHasIssuesWithStatus (result, " STORAGE_GROUP" , Ydb::Monitoring::StatusFlag::ORANGE, 0 );
726
+ CheckHcResultHasIssuesWithStatus (result, " STORAGE_GROUP" , Ydb::Monitoring::StatusFlag::RED, 0 );
727
+ }
728
+
713
729
/* HC currently infers group status on its own, so it's never unknown
714
730
Y_UNIT_TEST(RedGroupIssueWhenUnknownGroupStatus) {
715
731
auto result = RequestHcWithVdisks(NKikimrBlobStorage::TGroupStatus::UNKNOWN, {});
@@ -1818,123 +1834,128 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
1818
1834
UNIT_ASSERT_VALUES_EQUAL (database_status.storage ().pools ()[0 ].id (), " static" );
1819
1835
}
1820
1836
1821
- void HiveSyncTest (bool syncPeriod) {
1837
+ Y_UNIT_TEST (ShardsLimit999) {
1838
+ ShardsQuotaTest (999 , 1000 , 1 , Ydb::Monitoring::StatusFlag::RED);
1839
+ }
1840
+
1841
+ Y_UNIT_TEST (ShardsLimit995) {
1842
+ ShardsQuotaTest (995 , 1000 , 1 , Ydb::Monitoring::StatusFlag::ORANGE);
1843
+ }
1844
+
1845
+ Y_UNIT_TEST (ShardsLimit905) {
1846
+ ShardsQuotaTest (905 , 1000 , 1 , Ydb::Monitoring::StatusFlag::YELLOW);
1847
+ }
1848
+
1849
+ Y_UNIT_TEST (ShardsLimit800) {
1850
+ ShardsQuotaTest (805 , 1000 , 0 , Ydb::Monitoring::StatusFlag::GREEN);
1851
+ }
1852
+
1853
+ Y_UNIT_TEST (ShardsNoLimit) {
1854
+ ShardsQuotaTest (105 , 0 , 0 , Ydb::Monitoring::StatusFlag::GREEN);
1855
+ }
1856
+
1857
+ bool HasDeadTabletIssue (const Ydb::Monitoring::SelfCheckResult& result) {
1858
+ for (const auto & issue_log : result.issue_log ()) {
1859
+ if (issue_log.level () == 4 && issue_log.type () == " TABLET" ) {
1860
+ return true ;
1861
+ }
1862
+ }
1863
+ return false ;
1864
+ }
1865
+
1866
+ Y_UNIT_TEST (TestTabletIsDead) {
1822
1867
TPortManager tp;
1823
1868
ui16 port = tp.GetPort (2134 );
1824
1869
ui16 grpcPort = tp.GetPort (2135 );
1825
1870
auto settings = TServerSettings (port)
1826
- .SetNodeCount (1 )
1871
+ .SetNodeCount (2 )
1827
1872
.SetDynamicNodeCount (1 )
1828
1873
.SetUseRealThreads (false )
1829
1874
.SetDomainName (" Root" );
1830
1875
TServer server (settings);
1831
1876
server.EnableGRpc (grpcPort);
1877
+
1832
1878
TClient client (settings);
1833
- TTestActorRuntime& runtime = *server.GetRuntime ();
1834
1879
1835
- ui32 dynNodeId = runtime.GetNodeId (1 );
1880
+ TTestActorRuntime* runtime = server.GetRuntime ();
1881
+ TActorId sender = runtime->AllocateEdgeActor ();
1836
1882
1837
- auto observerFunc = [&](TAutoPtr<IEventHandle>& ev) {
1838
- switch (ev->GetTypeRewrite ()) {
1839
- case TEvHive::EvResponseHiveInfo: {
1840
- auto *x = reinterpret_cast <TEvHive::TEvResponseHiveInfo::TPtr*>(&ev);
1841
- auto & record = (*x)->Get ()->Record ;
1842
- record.SetStartTimeTimestamp (0 );
1843
- if (syncPeriod) {
1844
- record.SetResponseTimestamp (NHealthCheck::TSelfCheckRequest::HIVE_SYNCHRONIZATION_PERIOD_MS / 2 );
1845
- } else {
1846
- record.SetResponseTimestamp (NHealthCheck::TSelfCheckRequest::HIVE_SYNCHRONIZATION_PERIOD_MS * 2 );
1847
- }
1848
- auto *tablet = record.MutableTablets ()->Add ();
1849
- tablet->SetTabletID (1 );
1850
- tablet->SetNodeID (dynNodeId);
1851
- tablet->SetTabletType (NKikimrTabletBase::TTabletTypes::DataShard);
1852
- tablet->SetVolatileState (NKikimrHive::TABLET_VOLATILE_STATE_BOOTING);
1853
- tablet->MutableObjectDomain ()->SetSchemeShard (SUBDOMAIN_KEY.OwnerId );
1854
- tablet->MutableObjectDomain ()->SetPathId (SUBDOMAIN_KEY.LocalPathId );
1855
- break ;
1856
- }
1857
- case TEvHive::EvResponseHiveNodeStats: {
1858
- auto *x = reinterpret_cast <TEvHive::TEvResponseHiveNodeStats::TPtr*>(&ev);
1859
- auto &record = (*x)->Get ()->Record ;
1860
- auto *nodeStats = record.MutableNodeStats ()->Add ();
1861
- nodeStats->SetNodeId (dynNodeId);
1862
- nodeStats->MutableNodeDomain ()->SetSchemeShard (SUBDOMAIN_KEY.OwnerId );
1863
- nodeStats->MutableNodeDomain ()->SetPathId (SUBDOMAIN_KEY.LocalPathId );
1864
- break ;
1865
- }
1866
- case NConsole::TEvConsole::EvGetTenantStatusResponse: {
1867
- auto *x = reinterpret_cast <NConsole::TEvConsole::TEvGetTenantStatusResponse::TPtr*>(&ev);
1868
- ChangeGetTenantStatusResponse (x, " /Root/database" );
1869
- break ;
1870
- }
1871
- case TEvTxProxySchemeCache::EvNavigateKeySetResult: {
1872
- auto *x = reinterpret_cast <TEvTxProxySchemeCache::TEvNavigateKeySetResult::TPtr*>(&ev);
1873
- TSchemeCacheNavigate::TEntry& entry ((*x)->Get ()->Request ->ResultSet .front ());
1874
- entry.Status = TSchemeCacheNavigate::EStatus::Ok;
1875
- entry.Kind = TSchemeCacheNavigate::EKind::KindExtSubdomain;
1876
- entry.Path = {" Root" , " database" };
1877
- entry.DomainInfo = MakeIntrusive<TDomainInfo>(SUBDOMAIN_KEY, SUBDOMAIN_KEY);
1883
+ server.SetupDynamicLocalService (2 , " Root" );
1884
+ server.StartPQTablets (1 );
1885
+ server.DestroyDynamicLocalService (2 );
1886
+ runtime->AdvanceCurrentTime (TDuration::Minutes (5 ));
1878
1887
1879
- break ;
1880
- }
1881
- }
1888
+ TAutoPtr<IEventHandle> handle;
1889
+ runtime->Send (new IEventHandle (NHealthCheck::MakeHealthCheckID (), sender, new NHealthCheck::TEvSelfCheckRequest (), 0 ));
1890
+ auto result = runtime->GrabEdgeEvent <NHealthCheck::TEvSelfCheckResult>(handle)->Result ;
1891
+ Cerr << result.ShortDebugString ();
1882
1892
1883
- return TTestActorRuntime::EEventAction::PROCESS;
1884
- };
1885
- runtime.SetObserverFunc (observerFunc);
1893
+ UNIT_ASSERT (HasDeadTabletIssue (result));
1894
+ }
1886
1895
1887
- TActorId sender = runtime.AllocateEdgeActor ();
1888
- TAutoPtr<IEventHandle> handle;
1896
+ Y_UNIT_TEST (TestBootingTabletIsNotDead) {
1897
+ TPortManager tp;
1898
+ ui16 port = tp.GetPort (2134 );
1899
+ ui16 grpcPort = tp.GetPort (2135 );
1900
+ auto settings = TServerSettings (port)
1901
+ .SetNodeCount (2 )
1902
+ .SetDynamicNodeCount (1 )
1903
+ .SetUseRealThreads (false )
1904
+ .SetDomainName (" Root" );
1905
+ TServer server (settings);
1906
+ server.EnableGRpc (grpcPort);
1889
1907
1890
- auto *request = new NHealthCheck::TEvSelfCheckRequest;
1891
- request->Request .set_return_verbose_status (true );
1892
- request->Database = " /Root/database" ;
1893
- runtime.Send (new IEventHandle (NHealthCheck::MakeHealthCheckID (), sender, request, 0 ));
1894
- const auto result = runtime.GrabEdgeEvent <NHealthCheck::TEvSelfCheckResult>(handle)->Result ;
1908
+ TClient client (settings);
1895
1909
1896
- Cerr << result.ShortDebugString () << Endl;
1910
+ TTestActorRuntime* runtime = server.GetRuntime ();
1911
+ TActorId sender = runtime->AllocateEdgeActor ();
1897
1912
1898
- UNIT_ASSERT_VALUES_EQUAL (result. database_status_size (), 1 );
1913
+ auto blockBoot = runtime-> AddObserver <NHive::TEvPrivate::TEvProcessBootQueue>([]( auto && ev) { ev. Reset (); } );
1899
1914
1900
- bool deadTabletIssueFoundInResult = false ;
1901
- for (const auto &issue_log : result.issue_log ()) {
1902
- if (issue_log.level () == 4 && issue_log.type () == " TABLET" ) {
1903
- UNIT_ASSERT_VALUES_EQUAL (issue_log.location ().compute ().tablet ().id ().size (), 1 );
1904
- UNIT_ASSERT_VALUES_EQUAL (issue_log.location ().compute ().tablet ().type (), " DataShard" );
1905
- deadTabletIssueFoundInResult = true ;
1906
- }
1907
- }
1915
+ server.SetupDynamicLocalService (2 , " Root" );
1916
+ server.StartPQTablets (1 , false );
1917
+ runtime->AdvanceCurrentTime (TDuration::Minutes (5 ));
1908
1918
1909
- UNIT_ASSERT_VALUES_EQUAL (syncPeriod, !deadTabletIssueFoundInResult);
1910
- }
1919
+ TAutoPtr<IEventHandle> handle;
1920
+ runtime->Send (new IEventHandle (NHealthCheck::MakeHealthCheckID (), sender, new NHealthCheck::TEvSelfCheckRequest (), 0 ));
1921
+ auto result = runtime->GrabEdgeEvent <NHealthCheck::TEvSelfCheckResult>(handle)->Result ;
1922
+ Cerr << result.ShortDebugString ();
1911
1923
1912
- Y_UNIT_TEST (HiveSyncPeriodIgnoresTabletsState) {
1913
- HiveSyncTest (true );
1924
+ UNIT_ASSERT (!HasDeadTabletIssue (result));
1914
1925
}
1915
1926
1916
- Y_UNIT_TEST (AfterHiveSyncPeriodReportsTabletsState) {
1917
- HiveSyncTest (false );
1918
- }
1927
+ Y_UNIT_TEST (TestReBootingTabletIsDead) {
1928
+ TPortManager tp;
1929
+ ui16 port = tp.GetPort (2134 );
1930
+ ui16 grpcPort = tp.GetPort (2135 );
1931
+ auto settings = TServerSettings (port)
1932
+ .SetNodeCount (2 )
1933
+ .SetDynamicNodeCount (2 )
1934
+ .SetUseRealThreads (false )
1935
+ .SetDomainName (" Root" );
1936
+ TServer server (settings);
1937
+ server.EnableGRpc (grpcPort);
1919
1938
1920
- Y_UNIT_TEST (ShardsLimit999) {
1921
- ShardsQuotaTest (999 , 1000 , 1 , Ydb::Monitoring::StatusFlag::RED);
1922
- }
1939
+ TClient client (settings);
1923
1940
1924
- Y_UNIT_TEST (ShardsLimit995) {
1925
- ShardsQuotaTest ( 995 , 1000 , 1 , Ydb::Monitoring::StatusFlag::ORANGE );
1926
- }
1941
+ TTestActorRuntime* runtime = server. GetRuntime ();
1942
+ runtime-> SetLogPriority (NKikimrServices::HIVE, NActors::NLog::PRI_TRACE );
1943
+ TActorId sender = runtime-> AllocateEdgeActor ();
1927
1944
1928
- Y_UNIT_TEST (ShardsLimit905) {
1929
- ShardsQuotaTest (905 , 1000 , 1 , Ydb::Monitoring::StatusFlag::YELLOW);
1930
- }
1931
1945
1932
- Y_UNIT_TEST (ShardsLimit800) {
1933
- ShardsQuotaTest (805 , 1000 , 0 , Ydb::Monitoring::StatusFlag::GREEN);
1934
- }
1946
+ server.SetupDynamicLocalService (2 , " Root" );
1947
+ server.StartPQTablets (1 , true );
1948
+ server.SetupDynamicLocalService (3 , " Root" );
1949
+ auto blockBoot = runtime->AddObserver <NHive::TEvPrivate::TEvProcessBootQueue>([](auto && ev) { ev.Reset (); });
1950
+ server.DestroyDynamicLocalService (2 );
1951
+ runtime->AdvanceCurrentTime (TDuration::Minutes (5 ));
1935
1952
1936
- Y_UNIT_TEST (ShardsNoLimit) {
1937
- ShardsQuotaTest (105 , 0 , 0 , Ydb::Monitoring::StatusFlag::GREEN);
1953
+ TAutoPtr<IEventHandle> handle;
1954
+ runtime->Send (new IEventHandle (NHealthCheck::MakeHealthCheckID (), sender, new NHealthCheck::TEvSelfCheckRequest (), 0 ));
1955
+ auto result = runtime->GrabEdgeEvent <NHealthCheck::TEvSelfCheckResult>(handle)->Result ;
1956
+ Cerr << result.ShortDebugString ();
1957
+
1958
+ UNIT_ASSERT (HasDeadTabletIssue (result));
1938
1959
}
1939
1960
}
1940
1961
}
0 commit comments