Skip to content

Commit a40f542

Browse files
committed
[ML] Trap out-of-bounds read initialising boosted tree training (elastic#709)
1 parent aec1e08 commit a40f542

File tree

3 files changed

+50
-2
lines changed

3 files changed

+50
-2
lines changed

lib/maths/CBoostedTreeImpl.cc

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -364,8 +364,9 @@ CBoostedTreeImpl::gainAndCurvatureAtPercentile(double percentile,
364364
return {0.0, 0.0};
365365
}
366366

367-
std::size_t index{static_cast<std::size_t>(
368-
percentile * static_cast<double>(gains.size()) / 100.0 + 0.5)};
367+
std::size_t index{std::min(
368+
static_cast<std::size_t>(percentile * static_cast<double>(gains.size()) / 100.0 + 0.5),
369+
gains.size() - 1)};
369370
std::nth_element(gains.begin(), gains.begin() + index, gains.end());
370371
std::nth_element(curvatures.begin(), curvatures.begin() + index, curvatures.end());
371372

lib/maths/unittest/CBoostedTreeTest.cc

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -650,6 +650,50 @@ void CBoostedTreeTest::testIntegerRegressor() {
650650
CPPUNIT_ASSERT(modelRSquared > 0.99);
651651
}
652652

653+
void CBoostedTreeTest::testSingleSplit() {
654+
655+
// We were getting an out-of-bound read in initialization when running on
656+
// data with only two distinct values for the target variable. This test
657+
// fails intermittently without the fix.
658+
659+
test::CRandomNumbers rng;
660+
661+
std::size_t rows{100};
662+
std::size_t cols{2};
663+
664+
TDoubleVec x;
665+
rng.generateUniformSamples(0.0, 1.0, rows, x);
666+
for (auto& xi : x) {
667+
xi = std::floor(xi + 0.5);
668+
}
669+
670+
auto frame = core::makeMainStorageDataFrame(cols).first;
671+
frame->categoricalColumns({false, false});
672+
for (std::size_t i = 0; i < rows; ++i) {
673+
frame->writeRow([&](core::CDataFrame::TFloatVecItr column, std::int32_t&) {
674+
*(column++) = x[i];
675+
*column = 10.0 * x[i];
676+
});
677+
}
678+
frame->finishWritingRows();
679+
680+
auto regression = maths::CBoostedTreeFactory::constructFromParameters(
681+
1, std::make_unique<maths::boosted_tree::CMse>())
682+
.buildFor(*frame, cols - 1);
683+
684+
regression->train();
685+
686+
double modelBias;
687+
double modelRSquared;
688+
std::tie(modelBias, modelRSquared) = computeEvaluationMetrics(
689+
*frame, 0, rows, regression->columnHoldingPrediction(frame->numberColumns()),
690+
[](const TRowRef& row) { return 10.0 * row[0]; }, 0.0);
691+
692+
LOG_DEBUG(<< "bias = " << modelBias);
693+
LOG_DEBUG(<< " R^2 = " << modelRSquared);
694+
CPPUNIT_ASSERT(modelRSquared > 0.99);
695+
}
696+
653697
void CBoostedTreeTest::testTranslationInvariance() {
654698

655699
// We should get similar performance independent of fixed shifts for the target.
@@ -1036,6 +1080,8 @@ CppUnit::Test* CBoostedTreeTest::suite() {
10361080
&CBoostedTreeTest::testCategoricalRegressors));
10371081
suiteOfTests->addTest(new CppUnit::TestCaller<CBoostedTreeTest>(
10381082
"CBoostedTreeTest::testIntegerRegressor", &CBoostedTreeTest::testIntegerRegressor));
1083+
suiteOfTests->addTest(new CppUnit::TestCaller<CBoostedTreeTest>(
1084+
"CBoostedTreeTest::testSingleSplit", &CBoostedTreeTest::testSingleSplit));
10391085
suiteOfTests->addTest(new CppUnit::TestCaller<CBoostedTreeTest>(
10401086
"CBoostedTreeTest::testTranslationInvariance",
10411087
&CBoostedTreeTest::testTranslationInvariance));

lib/maths/unittest/CBoostedTreeTest.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ class CBoostedTreeTest : public CppUnit::TestFixture {
1919
void testConstantTarget();
2020
void testCategoricalRegressors();
2121
void testIntegerRegressor();
22+
void testSingleSplit();
2223
void testTranslationInvariance();
2324
void testEstimateMemoryUsedByTrain();
2425
void testProgressMonitoring();

0 commit comments

Comments
 (0)