Skip to content

Commit 6b8d3f6

Browse files
tveaseydimitris-athanasiou
authored andcommitted
[ML] Trap out-of-bounds read initialising boosted tree training (#709) (#711)
1 parent 7495f75 commit 6b8d3f6

File tree

3 files changed

+50
-2
lines changed

3 files changed

+50
-2
lines changed

lib/maths/CBoostedTreeImpl.cc

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -414,8 +414,9 @@ CBoostedTreeImpl::gainAndCurvatureAtPercentile(double percentile,
414414
return {0.0, 0.0};
415415
}
416416

417-
std::size_t index{static_cast<std::size_t>(
418-
percentile * static_cast<double>(gains.size()) / 100.0 + 0.5)};
417+
std::size_t index{std::min(
418+
static_cast<std::size_t>(percentile * static_cast<double>(gains.size()) / 100.0 + 0.5),
419+
gains.size() - 1)};
419420
std::nth_element(gains.begin(), gains.begin() + index, gains.end());
420421
std::nth_element(curvatures.begin(), curvatures.begin() + index, curvatures.end());
421422

lib/maths/unittest/CBoostedTreeTest.cc

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -629,6 +629,50 @@ void CBoostedTreeTest::testIntegerRegressor() {
629629
CPPUNIT_ASSERT(modelRSquared > 0.99);
630630
}
631631

632+
void CBoostedTreeTest::testSingleSplit() {
633+
634+
// We were getting an out-of-bound read in initialization when running on
635+
// data with only two distinct values for the target variable. This test
636+
// fails intermittently without the fix.
637+
638+
test::CRandomNumbers rng;
639+
640+
std::size_t rows{100};
641+
std::size_t cols{2};
642+
643+
TDoubleVec x;
644+
rng.generateUniformSamples(0.0, 1.0, rows, x);
645+
for (auto& xi : x) {
646+
xi = std::floor(xi + 0.5);
647+
}
648+
649+
auto frame = core::makeMainStorageDataFrame(cols).first;
650+
frame->categoricalColumns({false, false});
651+
for (std::size_t i = 0; i < rows; ++i) {
652+
frame->writeRow([&](core::CDataFrame::TFloatVecItr column, std::int32_t&) {
653+
*(column++) = x[i];
654+
*column = 10.0 * x[i];
655+
});
656+
}
657+
frame->finishWritingRows();
658+
659+
auto regression = maths::CBoostedTreeFactory::constructFromParameters(
660+
1, std::make_unique<maths::boosted_tree::CMse>())
661+
.buildFor(*frame, cols - 1);
662+
663+
regression->train();
664+
665+
double modelBias;
666+
double modelRSquared;
667+
std::tie(modelBias, modelRSquared) = computeEvaluationMetrics(
668+
*frame, 0, rows, regression->columnHoldingPrediction(frame->numberColumns()),
669+
[](const TRowRef& row) { return 10.0 * row[0]; }, 0.0);
670+
671+
LOG_DEBUG(<< "bias = " << modelBias);
672+
LOG_DEBUG(<< " R^2 = " << modelRSquared);
673+
CPPUNIT_ASSERT(modelRSquared > 0.99);
674+
}
675+
632676
void CBoostedTreeTest::testTranslationInvariance() {
633677

634678
// We should get similar performance independent of fixed shifts for the target.
@@ -1070,6 +1114,8 @@ CppUnit::Test* CBoostedTreeTest::suite() {
10701114
&CBoostedTreeTest::testCategoricalRegressors));
10711115
suiteOfTests->addTest(new CppUnit::TestCaller<CBoostedTreeTest>(
10721116
"CBoostedTreeTest::testIntegerRegressor", &CBoostedTreeTest::testIntegerRegressor));
1117+
suiteOfTests->addTest(new CppUnit::TestCaller<CBoostedTreeTest>(
1118+
"CBoostedTreeTest::testSingleSplit", &CBoostedTreeTest::testSingleSplit));
10731119
suiteOfTests->addTest(new CppUnit::TestCaller<CBoostedTreeTest>(
10741120
"CBoostedTreeTest::testTranslationInvariance",
10751121
&CBoostedTreeTest::testTranslationInvariance));

lib/maths/unittest/CBoostedTreeTest.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ class CBoostedTreeTest : public CppUnit::TestFixture {
1919
void testConstantTarget();
2020
void testCategoricalRegressors();
2121
void testIntegerRegressor();
22+
void testSingleSplit();
2223
void testTranslationInvariance();
2324
void testDepthBasedRegularization();
2425
void testEstimateMemoryUsedByTrain();

0 commit comments

Comments
 (0)