Skip to content

Commit 60f710e

Browse files
authored
[ML] Fix a SIGSEGV in outlier detection (#503)
Backport #502
1 parent 0805a62 commit 60f710e

File tree

6 files changed

+118
-15
lines changed

6 files changed

+118
-15
lines changed

include/maths/CKdTree.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -271,7 +271,7 @@ class CKdTree {
271271
for (const auto& neighbour : neighbours) {
272272
result.push_back(neighbour.second);
273273
}
274-
} else if (n > m_Nodes.size()) {
274+
} else if (n >= m_Nodes.size()) {
275275
TDoubleVec distances;
276276
distances.reserve(m_Nodes.size());
277277
result.reserve(m_Nodes.size());

include/maths/COutliers.h

Lines changed: 25 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,9 @@ class CLof final : public CNearestNeighbourMethod<POINT, NEAREST_NEIGHBOURS> {
224224
void add(const POINT& point, const TPointVec& neighbours, TDouble1VecVec&) override {
225225
// This is called exactly once for each point therefore an element
226226
// of m_KDistances is only ever written by one thread.
227+
if (neighbours.size() < 2) {
228+
return;
229+
}
227230
std::size_t i{point.annotation()};
228231
std::size_t a(point == neighbours[0] ? 1 : 0);
229232
std::size_t b{std::min(this->k() + a - 1, neighbours.size() + a - 2)};
@@ -437,6 +440,14 @@ class CLdof final : public CNearestNeighbourMethod<POINT, NEAREST_NEIGHBOURS> {
437440
private:
438441
void add(const POINT& point, const std::vector<POINT>& neighbours, TDouble1VecVec& scores) override {
439442

443+
std::size_t dimension{las::dimension(point)};
444+
auto& score = scores[point.annotation()];
445+
score.assign(this->computeFeatureInfluence() ? dimension + 1 : 1, 0.0);
446+
447+
if (neighbours.size() < 2) {
448+
return;
449+
}
450+
440451
auto ldof = [](const TMeanAccumulator& d, const TMeanAccumulator& D) {
441452
return CBasicStatistics::mean(D) > 0.0
442453
? CBasicStatistics::mean(d) / CBasicStatistics::mean(D)
@@ -445,10 +456,6 @@ class CLdof final : public CNearestNeighbourMethod<POINT, NEAREST_NEIGHBOURS> {
445456

446457
std::size_t a(point == neighbours[0] ? 1 : 0);
447458
std::size_t b{std::min(this->k() + a - 1, neighbours.size() + a - 2)};
448-
std::size_t dimension{las::dimension(point)};
449-
450-
auto& score = scores[point.annotation()];
451-
score.resize(this->computeFeatureInfluence() ? dimension + 1 : 1);
452459

453460
TMeanAccumulator D;
454461
{
@@ -502,13 +509,17 @@ class CDistancekNN final : public CNearestNeighbourMethod<POINT, NEAREST_NEIGHBO
502509
private:
503510
void add(const POINT& point, const std::vector<POINT>& neighbours, TDouble1VecVec& scores) override {
504511

512+
auto& score = scores[point.annotation()];
513+
score.assign(this->computeFeatureInfluence() ? las::dimension(point) + 1 : 1, 0.0);
514+
515+
if (neighbours.size() < 2) {
516+
return;
517+
}
518+
505519
std::size_t k{std::min(this->k() + 1, neighbours.size() - 1) -
506520
(point == neighbours[0] ? 0 : 1)};
507521
const auto& kthNeighbour = neighbours[k];
508522

509-
auto& score = scores[point.annotation()];
510-
score.resize(this->computeFeatureInfluence() ? las::dimension(point) + 1 : 1);
511-
512523
double d{las::distance(point, kthNeighbour)};
513524
score[0] = d;
514525
for (std::size_t i = 1; i < score.size(); ++i) {
@@ -536,12 +547,16 @@ class CTotalDistancekNN final
536547
private:
537548
void add(const POINT& point, const std::vector<POINT>& neighbours, TDouble1VecVec& scores) override {
538549

539-
std::size_t a(point == neighbours[0] ? 1 : 0);
540-
std::size_t b{std::min(this->k() + a - 1, neighbours.size() + a - 2)};
541-
542550
auto& score = scores[point.annotation()];
543551
score.assign(this->computeFeatureInfluence() ? las::dimension(point) + 1 : 1, 0.0);
544552

553+
if (neighbours.size() < 2) {
554+
return;
555+
}
556+
557+
std::size_t a(point == neighbours[0] ? 1 : 0);
558+
std::size_t b{std::min(this->k() + a - 1, neighbours.size() + a - 2)};
559+
545560
for (std::size_t i = a; i <= b; ++i) {
546561
double d{las::distance(point, neighbours[i])};
547562
score[0] += d;

lib/maths/unittest/CKdTreeTest.cc

Lines changed: 35 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,8 @@ std::string print(const POINT& t) {
8989
}
9090

9191
void CKdTreeTest::testBuild() {
92-
const std::size_t numberTests = 200;
92+
93+
const std::size_t numberTests{200};
9394

9495
test::CRandomNumbers rng;
9596

@@ -123,6 +124,7 @@ void CKdTreeTest::testBuild() {
123124
}
124125

125126
void CKdTreeTest::testBuildWithMove() {
127+
126128
test::CRandomNumbers rng;
127129

128130
TDoubleVec samples;
@@ -157,7 +159,8 @@ void CKdTreeTest::testBuildWithMove() {
157159
}
158160

159161
void CKdTreeTest::testNearestNeighbour() {
160-
const std::size_t numberTests = 200u;
162+
163+
const std::size_t numberTests{200};
161164

162165
test::CRandomNumbers rng;
163166

@@ -202,7 +205,8 @@ void CKdTreeTest::testNearestNeighbour() {
202205
}
203206

204207
void CKdTreeTest::testNearestNeighbours() {
205-
const std::size_t numberTests = 200u;
208+
209+
const std::size_t numberTests{200};
206210

207211
test::CRandomNumbers rng;
208212

@@ -251,6 +255,32 @@ void CKdTreeTest::testNearestNeighbours() {
251255
}
252256
}
253257

258+
void CKdTreeTest::testRequestingEveryPoint() {
259+
260+
test::CRandomNumbers rng;
261+
262+
TDoubleVec samples;
263+
rng.generateUniformSamples(-100.0, 100.0, 5 * 5, samples);
264+
265+
TVector5Vec points;
266+
for (std::size_t j = 0u; j < samples.size(); j += 5) {
267+
points.emplace_back(&samples[j], &samples[j + 5]);
268+
}
269+
std::stable_sort(points.begin(), points.end());
270+
std::string expected{core::CContainerPrinter::print(points)};
271+
272+
maths::CKdTree<TVector5> kdTree;
273+
kdTree.build(points);
274+
CPPUNIT_ASSERT(kdTree.checkInvariants());
275+
276+
TVector5Vec neighbours;
277+
kdTree.nearestNeighbours(5, TVector5{0.0}, neighbours);
278+
std::stable_sort(neighbours.begin(), neighbours.end());
279+
280+
CPPUNIT_ASSERT_EQUAL(kdTree.size(), neighbours.size());
281+
CPPUNIT_ASSERT_EQUAL(expected, core::CContainerPrinter::print(neighbours));
282+
}
283+
254284
CppUnit::Test* CKdTreeTest::suite() {
255285
CppUnit::TestSuite* suiteOfTests = new CppUnit::TestSuite("CKdTreeTest");
256286

@@ -262,6 +292,8 @@ CppUnit::Test* CKdTreeTest::suite() {
262292
"CKdTreeTest::testNearestNeighbour", &CKdTreeTest::testNearestNeighbour));
263293
suiteOfTests->addTest(new CppUnit::TestCaller<CKdTreeTest>(
264294
"CKdTreeTest::testNearestNeighbours", &CKdTreeTest::testNearestNeighbours));
295+
suiteOfTests->addTest(new CppUnit::TestCaller<CKdTreeTest>(
296+
"CKdTreeTest::testRequestingEveryPoint", &CKdTreeTest::testRequestingEveryPoint));
265297

266298
return suiteOfTests;
267299
}

lib/maths/unittest/CKdTreeTest.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ class CKdTreeTest : public CppUnit::TestFixture {
1515
void testBuildWithMove();
1616
void testNearestNeighbour();
1717
void testNearestNeighbours();
18+
void testRequestingEveryPoint();
1819

1920
static CppUnit::Test* suite();
2021
};

lib/maths/unittest/COutliersTest.cc

Lines changed: 55 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,7 @@ void COutliersTest::testLof() {
231231
}
232232

233233
void COutliersTest::testDlof() {
234+
234235
// Test against definition without projecting.
235236

236237
test::CRandomNumbers rng;
@@ -270,6 +271,7 @@ void COutliersTest::testDlof() {
270271
}
271272

272273
void COutliersTest::testDistancekNN() {
274+
273275
// Test against definition without projecting.
274276

275277
test::CRandomNumbers rng;
@@ -302,6 +304,7 @@ void COutliersTest::testDistancekNN() {
302304
}
303305

304306
void COutliersTest::testTotalDistancekNN() {
307+
305308
// Test against definition without projecting.
306309

307310
test::CRandomNumbers rng;
@@ -339,6 +342,7 @@ void COutliersTest::testTotalDistancekNN() {
339342
}
340343

341344
void COutliersTest::testEnsemble() {
345+
342346
// Check error stats for scores, 0.1, 0.5 and 0.9. We should see precision increase
343347
// for higher scores but recall decrease.
344348
//
@@ -670,7 +674,7 @@ void COutliersTest::testMostlyDuplicate() {
670674
[i](const TSizeDoublePr& outlier_) { return i == outlier_.first; });
671675
TPoint point(1);
672676
point(0) = outlier != outliers.end() ? outlier->second : 0.0;
673-
points.push_back(point);
677+
points.push_back(std::move(point));
674678
}
675679

676680
for (std::size_t numberPartitions : {1, 3}) {
@@ -707,6 +711,54 @@ void COutliersTest::testMostlyDuplicate() {
707711
}
708712
}
709713

714+
void COutliersTest::testFewPoints() {
715+
716+
// Check there are no failures when there only a few points.
717+
718+
std::size_t rows{101};
719+
test::CRandomNumbers rng;
720+
721+
for (std::size_t numberPoints : {1, 2, 5}) {
722+
723+
LOG_DEBUG(<< "# points = " << numberPoints);
724+
725+
TPointVec points;
726+
TDoubleVec components;
727+
for (std::size_t i = 0; i < numberPoints; ++i) {
728+
TPoint point(rows);
729+
rng.generateUniformSamples(0.0, 10.0, rows, components);
730+
for (std::size_t j = 0; j < components.size(); ++j) {
731+
point(j) = components[j];
732+
}
733+
points.push_back(std::move(point));
734+
}
735+
736+
auto frame = test::CDataFrameTestUtils::toMainMemoryDataFrame(points);
737+
738+
maths::COutliers::SComputeParameters params{1, // Number threads
739+
1, // Number partitions,
740+
true, // Standardize columns
741+
maths::COutliers::E_Ensemble,
742+
0, // Compute number neighbours
743+
true, // Compute feature influences
744+
0.05}; // Outlier fraction
745+
maths::COutliers::compute(params, *frame);
746+
747+
bool passed{true};
748+
749+
frame->readRows(1, [&](core::CDataFrame::TRowItr beginRows,
750+
core::CDataFrame::TRowItr endRows) {
751+
for (auto row = beginRows; row != endRows; ++row) {
752+
// Check score is in range 0 to 1.
753+
LOG_DEBUG(<< "outlier score = " << (*row)[rows]);
754+
passed &= (*row)[rows] >= 0.0 && (*row)[rows] <= 1.0;
755+
}
756+
});
757+
758+
CPPUNIT_ASSERT(passed);
759+
}
760+
}
761+
710762
CppUnit::Test* COutliersTest::suite() {
711763
CppUnit::TestSuite* suiteOfTests = new CppUnit::TestSuite("COutliersTest");
712764

@@ -729,6 +781,8 @@ CppUnit::Test* COutliersTest::suite() {
729781
"COutliersTest::testProgressMonitoring", &COutliersTest::testProgressMonitoring));
730782
suiteOfTests->addTest(new CppUnit::TestCaller<COutliersTest>(
731783
"COutliersTest::testMostlyDuplicate", &COutliersTest::testMostlyDuplicate));
784+
suiteOfTests->addTest(new CppUnit::TestCaller<COutliersTest>(
785+
"COutliersTest::testFewPoints", &COutliersTest::testFewPoints));
732786

733787
return suiteOfTests;
734788
}

lib/maths/unittest/COutliersTest.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ class COutliersTest : public CppUnit::TestFixture {
2020
void testEstimateMemoryUsedByCompute();
2121
void testProgressMonitoring();
2222
void testMostlyDuplicate();
23+
void testFewPoints();
2324

2425
static CppUnit::Test* suite();
2526
};

0 commit comments

Comments
 (0)