diff --git a/lib/maths/CBoostedTreeImpl.cc b/lib/maths/CBoostedTreeImpl.cc index df570c90ec..692aff017c 100644 --- a/lib/maths/CBoostedTreeImpl.cc +++ b/lib/maths/CBoostedTreeImpl.cc @@ -414,8 +414,9 @@ CBoostedTreeImpl::gainAndCurvatureAtPercentile(double percentile, return {0.0, 0.0}; } - std::size_t index{static_cast( - percentile * static_cast(gains.size()) / 100.0 + 0.5)}; + std::size_t index{std::min( + static_cast(percentile * static_cast(gains.size()) / 100.0 + 0.5), + gains.size() - 1)}; std::nth_element(gains.begin(), gains.begin() + index, gains.end()); std::nth_element(curvatures.begin(), curvatures.begin() + index, curvatures.end()); diff --git a/lib/maths/unittest/CBoostedTreeTest.cc b/lib/maths/unittest/CBoostedTreeTest.cc index ce47890755..f45d735cfd 100644 --- a/lib/maths/unittest/CBoostedTreeTest.cc +++ b/lib/maths/unittest/CBoostedTreeTest.cc @@ -629,6 +629,50 @@ void CBoostedTreeTest::testIntegerRegressor() { CPPUNIT_ASSERT(modelRSquared > 0.99); } +void CBoostedTreeTest::testSingleSplit() { + + // We were getting an out-of-bound read in initialization when running on + // data with only two distinct values for the target variable. This test + // fails intermittently without the fix. + + test::CRandomNumbers rng; + + std::size_t rows{100}; + std::size_t cols{2}; + + TDoubleVec x; + rng.generateUniformSamples(0.0, 1.0, rows, x); + for (auto& xi : x) { + xi = std::floor(xi + 0.5); + } + + auto frame = core::makeMainStorageDataFrame(cols).first; + frame->categoricalColumns({false, false}); + for (std::size_t i = 0; i < rows; ++i) { + frame->writeRow([&](core::CDataFrame::TFloatVecItr column, std::int32_t&) { + *(column++) = x[i]; + *column = 10.0 * x[i]; + }); + } + frame->finishWritingRows(); + + auto regression = maths::CBoostedTreeFactory::constructFromParameters( + 1, std::make_unique()) + .buildFor(*frame, cols - 1); + + regression->train(); + + double modelBias; + double modelRSquared; + std::tie(modelBias, modelRSquared) = computeEvaluationMetrics( + *frame, 0, rows, regression->columnHoldingPrediction(frame->numberColumns()), + [](const TRowRef& row) { return 10.0 * row[0]; }, 0.0); + + LOG_DEBUG(<< "bias = " << modelBias); + LOG_DEBUG(<< " R^2 = " << modelRSquared); + CPPUNIT_ASSERT(modelRSquared > 0.99); +} + void CBoostedTreeTest::testTranslationInvariance() { // We should get similar performance independent of fixed shifts for the target. @@ -1070,6 +1114,8 @@ CppUnit::Test* CBoostedTreeTest::suite() { &CBoostedTreeTest::testCategoricalRegressors)); suiteOfTests->addTest(new CppUnit::TestCaller( "CBoostedTreeTest::testIntegerRegressor", &CBoostedTreeTest::testIntegerRegressor)); + suiteOfTests->addTest(new CppUnit::TestCaller( + "CBoostedTreeTest::testSingleSplit", &CBoostedTreeTest::testSingleSplit)); suiteOfTests->addTest(new CppUnit::TestCaller( "CBoostedTreeTest::testTranslationInvariance", &CBoostedTreeTest::testTranslationInvariance)); diff --git a/lib/maths/unittest/CBoostedTreeTest.h b/lib/maths/unittest/CBoostedTreeTest.h index aa7212d505..6c098bdc0b 100644 --- a/lib/maths/unittest/CBoostedTreeTest.h +++ b/lib/maths/unittest/CBoostedTreeTest.h @@ -19,6 +19,7 @@ class CBoostedTreeTest : public CppUnit::TestFixture { void testConstantTarget(); void testCategoricalRegressors(); void testIntegerRegressor(); + void testSingleSplit(); void testTranslationInvariance(); void testDepthBasedRegularization(); void testEstimateMemoryUsedByTrain();