Skip to content

[7.3][ML] Fix a SIGSEGV in outlier detection #503

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jun 19, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion include/maths/CKdTree.h
Original file line number Diff line number Diff line change
Expand Up @@ -271,7 +271,7 @@ class CKdTree {
for (const auto& neighbour : neighbours) {
result.push_back(neighbour.second);
}
} else if (n > m_Nodes.size()) {
} else if (n >= m_Nodes.size()) {
TDoubleVec distances;
distances.reserve(m_Nodes.size());
result.reserve(m_Nodes.size());
Expand Down
35 changes: 25 additions & 10 deletions include/maths/COutliers.h
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,9 @@ class CLof final : public CNearestNeighbourMethod<POINT, NEAREST_NEIGHBOURS> {
void add(const POINT& point, const TPointVec& neighbours, TDouble1VecVec&) override {
// This is called exactly once for each point therefore an element
// of m_KDistances is only ever written by one thread.
if (neighbours.size() < 2) {
return;
}
std::size_t i{point.annotation()};
std::size_t a(point == neighbours[0] ? 1 : 0);
std::size_t b{std::min(this->k() + a - 1, neighbours.size() + a - 2)};
Expand Down Expand Up @@ -430,6 +433,14 @@ class CLdof final : public CNearestNeighbourMethod<POINT, NEAREST_NEIGHBOURS> {
private:
void add(const POINT& point, const std::vector<POINT>& neighbours, TDouble1VecVec& scores) override {

std::size_t dimension{las::dimension(point)};
auto& score = scores[point.annotation()];
score.assign(this->computeFeatureInfluence() ? dimension + 1 : 1, 0.0);

if (neighbours.size() < 2) {
return;
}

auto ldof = [](const TMeanAccumulator& d, const TMeanAccumulator& D) {
return CBasicStatistics::mean(D) > 0.0
? CBasicStatistics::mean(d) / CBasicStatistics::mean(D)
Expand All @@ -438,10 +449,6 @@ class CLdof final : public CNearestNeighbourMethod<POINT, NEAREST_NEIGHBOURS> {

std::size_t a(point == neighbours[0] ? 1 : 0);
std::size_t b{std::min(this->k() + a - 1, neighbours.size() + a - 2)};
std::size_t dimension{las::dimension(point)};

auto& score = scores[point.annotation()];
score.resize(this->computeFeatureInfluence() ? dimension + 1 : 1);

TMeanAccumulator D;
{
Expand Down Expand Up @@ -495,13 +502,17 @@ class CDistancekNN final : public CNearestNeighbourMethod<POINT, NEAREST_NEIGHBO
private:
void add(const POINT& point, const std::vector<POINT>& neighbours, TDouble1VecVec& scores) override {

auto& score = scores[point.annotation()];
score.assign(this->computeFeatureInfluence() ? las::dimension(point) + 1 : 1, 0.0);

if (neighbours.size() < 2) {
return;
}

std::size_t k{std::min(this->k() + 1, neighbours.size() - 1) -
(point == neighbours[0] ? 0 : 1)};
const auto& kthNeighbour = neighbours[k];

auto& score = scores[point.annotation()];
score.resize(this->computeFeatureInfluence() ? las::dimension(point) + 1 : 1);

double d{las::distance(point, kthNeighbour)};
score[0] = d;
for (std::size_t i = 1; i < score.size(); ++i) {
Expand Down Expand Up @@ -529,12 +540,16 @@ class CTotalDistancekNN final
private:
void add(const POINT& point, const std::vector<POINT>& neighbours, TDouble1VecVec& scores) override {

std::size_t a(point == neighbours[0] ? 1 : 0);
std::size_t b{std::min(this->k() + a - 1, neighbours.size() + a - 2)};

auto& score = scores[point.annotation()];
score.assign(this->computeFeatureInfluence() ? las::dimension(point) + 1 : 1, 0.0);

if (neighbours.size() < 2) {
return;
}

std::size_t a(point == neighbours[0] ? 1 : 0);
std::size_t b{std::min(this->k() + a - 1, neighbours.size() + a - 2)};

for (std::size_t i = a; i <= b; ++i) {
double d{las::distance(point, neighbours[i])};
score[0] += d;
Expand Down
38 changes: 35 additions & 3 deletions lib/maths/unittest/CKdTreeTest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,8 @@ std::string print(const POINT& t) {
}

void CKdTreeTest::testBuild() {
const std::size_t numberTests = 200;

const std::size_t numberTests{200};

test::CRandomNumbers rng;

Expand Down Expand Up @@ -123,6 +124,7 @@ void CKdTreeTest::testBuild() {
}

void CKdTreeTest::testBuildWithMove() {

test::CRandomNumbers rng;

TDoubleVec samples;
Expand Down Expand Up @@ -157,7 +159,8 @@ void CKdTreeTest::testBuildWithMove() {
}

void CKdTreeTest::testNearestNeighbour() {
const std::size_t numberTests = 200u;

const std::size_t numberTests{200};

test::CRandomNumbers rng;

Expand Down Expand Up @@ -202,7 +205,8 @@ void CKdTreeTest::testNearestNeighbour() {
}

void CKdTreeTest::testNearestNeighbours() {
const std::size_t numberTests = 200u;

const std::size_t numberTests{200};

test::CRandomNumbers rng;

Expand Down Expand Up @@ -251,6 +255,32 @@ void CKdTreeTest::testNearestNeighbours() {
}
}

void CKdTreeTest::testRequestingEveryPoint() {

test::CRandomNumbers rng;

TDoubleVec samples;
rng.generateUniformSamples(-100.0, 100.0, 5 * 5, samples);

TVector5Vec points;
for (std::size_t j = 0u; j < samples.size(); j += 5) {
points.emplace_back(&samples[j], &samples[j + 5]);
}
std::stable_sort(points.begin(), points.end());
std::string expected{core::CContainerPrinter::print(points)};

maths::CKdTree<TVector5> kdTree;
kdTree.build(points);
CPPUNIT_ASSERT(kdTree.checkInvariants());

TVector5Vec neighbours;
kdTree.nearestNeighbours(5, TVector5{0.0}, neighbours);
std::stable_sort(neighbours.begin(), neighbours.end());

CPPUNIT_ASSERT_EQUAL(kdTree.size(), neighbours.size());
CPPUNIT_ASSERT_EQUAL(expected, core::CContainerPrinter::print(neighbours));
}

CppUnit::Test* CKdTreeTest::suite() {
CppUnit::TestSuite* suiteOfTests = new CppUnit::TestSuite("CKdTreeTest");

Expand All @@ -262,6 +292,8 @@ CppUnit::Test* CKdTreeTest::suite() {
"CKdTreeTest::testNearestNeighbour", &CKdTreeTest::testNearestNeighbour));
suiteOfTests->addTest(new CppUnit::TestCaller<CKdTreeTest>(
"CKdTreeTest::testNearestNeighbours", &CKdTreeTest::testNearestNeighbours));
suiteOfTests->addTest(new CppUnit::TestCaller<CKdTreeTest>(
"CKdTreeTest::testRequestingEveryPoint", &CKdTreeTest::testRequestingEveryPoint));

return suiteOfTests;
}
1 change: 1 addition & 0 deletions lib/maths/unittest/CKdTreeTest.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ class CKdTreeTest : public CppUnit::TestFixture {
void testBuildWithMove();
void testNearestNeighbour();
void testNearestNeighbours();
void testRequestingEveryPoint();

static CppUnit::Test* suite();
};
Expand Down
56 changes: 55 additions & 1 deletion lib/maths/unittest/COutliersTest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,7 @@ void COutliersTest::testLof() {
}

void COutliersTest::testDlof() {

// Test against definition without projecting.

test::CRandomNumbers rng;
Expand Down Expand Up @@ -270,6 +271,7 @@ void COutliersTest::testDlof() {
}

void COutliersTest::testDistancekNN() {

// Test against definition without projecting.

test::CRandomNumbers rng;
Expand Down Expand Up @@ -302,6 +304,7 @@ void COutliersTest::testDistancekNN() {
}

void COutliersTest::testTotalDistancekNN() {

// Test against definition without projecting.

test::CRandomNumbers rng;
Expand Down Expand Up @@ -339,6 +342,7 @@ void COutliersTest::testTotalDistancekNN() {
}

void COutliersTest::testEnsemble() {

// Check error stats for scores, 0.1, 0.5 and 0.9. We should see precision increase
// for higher scores but recall decrease.
//
Expand Down Expand Up @@ -670,7 +674,7 @@ void COutliersTest::testMostlyDuplicate() {
[i](const TSizeDoublePr& outlier_) { return i == outlier_.first; });
TPoint point(1);
point(0) = outlier != outliers.end() ? outlier->second : 0.0;
points.push_back(point);
points.push_back(std::move(point));
}

for (std::size_t numberPartitions : {1, 3}) {
Expand Down Expand Up @@ -707,6 +711,54 @@ void COutliersTest::testMostlyDuplicate() {
}
}

void COutliersTest::testFewPoints() {

// Check there are no failures when there only a few points.

std::size_t rows{101};
test::CRandomNumbers rng;

for (std::size_t numberPoints : {1, 2, 5}) {

LOG_DEBUG(<< "# points = " << numberPoints);

TPointVec points;
TDoubleVec components;
for (std::size_t i = 0; i < numberPoints; ++i) {
TPoint point(rows);
rng.generateUniformSamples(0.0, 10.0, rows, components);
for (std::size_t j = 0; j < components.size(); ++j) {
point(j) = components[j];
}
points.push_back(std::move(point));
}

auto frame = test::CDataFrameTestUtils::toMainMemoryDataFrame(points);

maths::COutliers::SComputeParameters params{1, // Number threads
1, // Number partitions,
true, // Standardize columns
maths::COutliers::E_Ensemble,
0, // Compute number neighbours
true, // Compute feature influences
0.05}; // Outlier fraction
maths::COutliers::compute(params, *frame);

bool passed{true};

frame->readRows(1, [&](core::CDataFrame::TRowItr beginRows,
core::CDataFrame::TRowItr endRows) {
for (auto row = beginRows; row != endRows; ++row) {
// Check score is in range 0 to 1.
LOG_DEBUG(<< "outlier score = " << (*row)[rows]);
passed &= (*row)[rows] >= 0.0 && (*row)[rows] <= 1.0;
}
});

CPPUNIT_ASSERT(passed);
}
}

CppUnit::Test* COutliersTest::suite() {
CppUnit::TestSuite* suiteOfTests = new CppUnit::TestSuite("COutliersTest");

Expand All @@ -729,6 +781,8 @@ CppUnit::Test* COutliersTest::suite() {
"COutliersTest::testProgressMonitoring", &COutliersTest::testProgressMonitoring));
suiteOfTests->addTest(new CppUnit::TestCaller<COutliersTest>(
"COutliersTest::testMostlyDuplicate", &COutliersTest::testMostlyDuplicate));
suiteOfTests->addTest(new CppUnit::TestCaller<COutliersTest>(
"COutliersTest::testFewPoints", &COutliersTest::testFewPoints));

return suiteOfTests;
}
1 change: 1 addition & 0 deletions lib/maths/unittest/COutliersTest.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ class COutliersTest : public CppUnit::TestFixture {
void testEstimateMemoryUsedByCompute();
void testProgressMonitoring();
void testMostlyDuplicate();
void testFewPoints();

static CppUnit::Test* suite();
};
Expand Down