diff --git a/include/maths/CKdTree.h b/include/maths/CKdTree.h index 33a2bb00a2..94f182f2f0 100644 --- a/include/maths/CKdTree.h +++ b/include/maths/CKdTree.h @@ -271,7 +271,7 @@ class CKdTree { for (const auto& neighbour : neighbours) { result.push_back(neighbour.second); } - } else if (n > m_Nodes.size()) { + } else if (n >= m_Nodes.size()) { TDoubleVec distances; distances.reserve(m_Nodes.size()); result.reserve(m_Nodes.size()); diff --git a/include/maths/COutliers.h b/include/maths/COutliers.h index a92eeb1024..318b8afc6c 100644 --- a/include/maths/COutliers.h +++ b/include/maths/COutliers.h @@ -224,6 +224,9 @@ class CLof final : public CNearestNeighbourMethod { void add(const POINT& point, const TPointVec& neighbours, TDouble1VecVec&) override { // This is called exactly once for each point therefore an element // of m_KDistances is only ever written by one thread. + if (neighbours.size() < 2) { + return; + } std::size_t i{point.annotation()}; std::size_t a(point == neighbours[0] ? 1 : 0); std::size_t b{std::min(this->k() + a - 1, neighbours.size() + a - 2)}; @@ -430,6 +433,14 @@ class CLdof final : public CNearestNeighbourMethod { private: void add(const POINT& point, const std::vector& neighbours, TDouble1VecVec& scores) override { + std::size_t dimension{las::dimension(point)}; + auto& score = scores[point.annotation()]; + score.assign(this->computeFeatureInfluence() ? dimension + 1 : 1, 0.0); + + if (neighbours.size() < 2) { + return; + } + auto ldof = [](const TMeanAccumulator& d, const TMeanAccumulator& D) { return CBasicStatistics::mean(D) > 0.0 ? CBasicStatistics::mean(d) / CBasicStatistics::mean(D) @@ -438,10 +449,6 @@ class CLdof final : public CNearestNeighbourMethod { std::size_t a(point == neighbours[0] ? 1 : 0); std::size_t b{std::min(this->k() + a - 1, neighbours.size() + a - 2)}; - std::size_t dimension{las::dimension(point)}; - - auto& score = scores[point.annotation()]; - score.resize(this->computeFeatureInfluence() ? dimension + 1 : 1); TMeanAccumulator D; { @@ -495,13 +502,17 @@ class CDistancekNN final : public CNearestNeighbourMethod& neighbours, TDouble1VecVec& scores) override { + auto& score = scores[point.annotation()]; + score.assign(this->computeFeatureInfluence() ? las::dimension(point) + 1 : 1, 0.0); + + if (neighbours.size() < 2) { + return; + } + std::size_t k{std::min(this->k() + 1, neighbours.size() - 1) - (point == neighbours[0] ? 0 : 1)}; const auto& kthNeighbour = neighbours[k]; - auto& score = scores[point.annotation()]; - score.resize(this->computeFeatureInfluence() ? las::dimension(point) + 1 : 1); - double d{las::distance(point, kthNeighbour)}; score[0] = d; for (std::size_t i = 1; i < score.size(); ++i) { @@ -529,12 +540,16 @@ class CTotalDistancekNN final private: void add(const POINT& point, const std::vector& neighbours, TDouble1VecVec& scores) override { - std::size_t a(point == neighbours[0] ? 1 : 0); - std::size_t b{std::min(this->k() + a - 1, neighbours.size() + a - 2)}; - auto& score = scores[point.annotation()]; score.assign(this->computeFeatureInfluence() ? las::dimension(point) + 1 : 1, 0.0); + if (neighbours.size() < 2) { + return; + } + + std::size_t a(point == neighbours[0] ? 1 : 0); + std::size_t b{std::min(this->k() + a - 1, neighbours.size() + a - 2)}; + for (std::size_t i = a; i <= b; ++i) { double d{las::distance(point, neighbours[i])}; score[0] += d; diff --git a/lib/maths/unittest/CKdTreeTest.cc b/lib/maths/unittest/CKdTreeTest.cc index d69c979f09..e62527126e 100644 --- a/lib/maths/unittest/CKdTreeTest.cc +++ b/lib/maths/unittest/CKdTreeTest.cc @@ -89,7 +89,8 @@ std::string print(const POINT& t) { } void CKdTreeTest::testBuild() { - const std::size_t numberTests = 200; + + const std::size_t numberTests{200}; test::CRandomNumbers rng; @@ -123,6 +124,7 @@ void CKdTreeTest::testBuild() { } void CKdTreeTest::testBuildWithMove() { + test::CRandomNumbers rng; TDoubleVec samples; @@ -157,7 +159,8 @@ void CKdTreeTest::testBuildWithMove() { } void CKdTreeTest::testNearestNeighbour() { - const std::size_t numberTests = 200u; + + const std::size_t numberTests{200}; test::CRandomNumbers rng; @@ -202,7 +205,8 @@ void CKdTreeTest::testNearestNeighbour() { } void CKdTreeTest::testNearestNeighbours() { - const std::size_t numberTests = 200u; + + const std::size_t numberTests{200}; test::CRandomNumbers rng; @@ -251,6 +255,32 @@ void CKdTreeTest::testNearestNeighbours() { } } +void CKdTreeTest::testRequestingEveryPoint() { + + test::CRandomNumbers rng; + + TDoubleVec samples; + rng.generateUniformSamples(-100.0, 100.0, 5 * 5, samples); + + TVector5Vec points; + for (std::size_t j = 0u; j < samples.size(); j += 5) { + points.emplace_back(&samples[j], &samples[j + 5]); + } + std::stable_sort(points.begin(), points.end()); + std::string expected{core::CContainerPrinter::print(points)}; + + maths::CKdTree kdTree; + kdTree.build(points); + CPPUNIT_ASSERT(kdTree.checkInvariants()); + + TVector5Vec neighbours; + kdTree.nearestNeighbours(5, TVector5{0.0}, neighbours); + std::stable_sort(neighbours.begin(), neighbours.end()); + + CPPUNIT_ASSERT_EQUAL(kdTree.size(), neighbours.size()); + CPPUNIT_ASSERT_EQUAL(expected, core::CContainerPrinter::print(neighbours)); +} + CppUnit::Test* CKdTreeTest::suite() { CppUnit::TestSuite* suiteOfTests = new CppUnit::TestSuite("CKdTreeTest"); @@ -262,6 +292,8 @@ CppUnit::Test* CKdTreeTest::suite() { "CKdTreeTest::testNearestNeighbour", &CKdTreeTest::testNearestNeighbour)); suiteOfTests->addTest(new CppUnit::TestCaller( "CKdTreeTest::testNearestNeighbours", &CKdTreeTest::testNearestNeighbours)); + suiteOfTests->addTest(new CppUnit::TestCaller( + "CKdTreeTest::testRequestingEveryPoint", &CKdTreeTest::testRequestingEveryPoint)); return suiteOfTests; } diff --git a/lib/maths/unittest/CKdTreeTest.h b/lib/maths/unittest/CKdTreeTest.h index bc3e1d16ab..14f9c8985f 100644 --- a/lib/maths/unittest/CKdTreeTest.h +++ b/lib/maths/unittest/CKdTreeTest.h @@ -15,6 +15,7 @@ class CKdTreeTest : public CppUnit::TestFixture { void testBuildWithMove(); void testNearestNeighbour(); void testNearestNeighbours(); + void testRequestingEveryPoint(); static CppUnit::Test* suite(); }; diff --git a/lib/maths/unittest/COutliersTest.cc b/lib/maths/unittest/COutliersTest.cc index 1c6a92e757..efd75469f3 100644 --- a/lib/maths/unittest/COutliersTest.cc +++ b/lib/maths/unittest/COutliersTest.cc @@ -231,6 +231,7 @@ void COutliersTest::testLof() { } void COutliersTest::testDlof() { + // Test against definition without projecting. test::CRandomNumbers rng; @@ -270,6 +271,7 @@ void COutliersTest::testDlof() { } void COutliersTest::testDistancekNN() { + // Test against definition without projecting. test::CRandomNumbers rng; @@ -302,6 +304,7 @@ void COutliersTest::testDistancekNN() { } void COutliersTest::testTotalDistancekNN() { + // Test against definition without projecting. test::CRandomNumbers rng; @@ -339,6 +342,7 @@ void COutliersTest::testTotalDistancekNN() { } void COutliersTest::testEnsemble() { + // Check error stats for scores, 0.1, 0.5 and 0.9. We should see precision increase // for higher scores but recall decrease. // @@ -670,7 +674,7 @@ void COutliersTest::testMostlyDuplicate() { [i](const TSizeDoublePr& outlier_) { return i == outlier_.first; }); TPoint point(1); point(0) = outlier != outliers.end() ? outlier->second : 0.0; - points.push_back(point); + points.push_back(std::move(point)); } for (std::size_t numberPartitions : {1, 3}) { @@ -707,6 +711,54 @@ void COutliersTest::testMostlyDuplicate() { } } +void COutliersTest::testFewPoints() { + + // Check there are no failures when there only a few points. + + std::size_t rows{101}; + test::CRandomNumbers rng; + + for (std::size_t numberPoints : {1, 2, 5}) { + + LOG_DEBUG(<< "# points = " << numberPoints); + + TPointVec points; + TDoubleVec components; + for (std::size_t i = 0; i < numberPoints; ++i) { + TPoint point(rows); + rng.generateUniformSamples(0.0, 10.0, rows, components); + for (std::size_t j = 0; j < components.size(); ++j) { + point(j) = components[j]; + } + points.push_back(std::move(point)); + } + + auto frame = test::CDataFrameTestUtils::toMainMemoryDataFrame(points); + + maths::COutliers::SComputeParameters params{1, // Number threads + 1, // Number partitions, + true, // Standardize columns + maths::COutliers::E_Ensemble, + 0, // Compute number neighbours + true, // Compute feature influences + 0.05}; // Outlier fraction + maths::COutliers::compute(params, *frame); + + bool passed{true}; + + frame->readRows(1, [&](core::CDataFrame::TRowItr beginRows, + core::CDataFrame::TRowItr endRows) { + for (auto row = beginRows; row != endRows; ++row) { + // Check score is in range 0 to 1. + LOG_DEBUG(<< "outlier score = " << (*row)[rows]); + passed &= (*row)[rows] >= 0.0 && (*row)[rows] <= 1.0; + } + }); + + CPPUNIT_ASSERT(passed); + } +} + CppUnit::Test* COutliersTest::suite() { CppUnit::TestSuite* suiteOfTests = new CppUnit::TestSuite("COutliersTest"); @@ -729,6 +781,8 @@ CppUnit::Test* COutliersTest::suite() { "COutliersTest::testProgressMonitoring", &COutliersTest::testProgressMonitoring)); suiteOfTests->addTest(new CppUnit::TestCaller( "COutliersTest::testMostlyDuplicate", &COutliersTest::testMostlyDuplicate)); + suiteOfTests->addTest(new CppUnit::TestCaller( + "COutliersTest::testFewPoints", &COutliersTest::testFewPoints)); return suiteOfTests; } diff --git a/lib/maths/unittest/COutliersTest.h b/lib/maths/unittest/COutliersTest.h index 31e0c6b1fb..bfe86a1dff 100644 --- a/lib/maths/unittest/COutliersTest.h +++ b/lib/maths/unittest/COutliersTest.h @@ -20,6 +20,7 @@ class COutliersTest : public CppUnit::TestFixture { void testEstimateMemoryUsedByCompute(); void testProgressMonitoring(); void testMostlyDuplicate(); + void testFewPoints(); static CppUnit::Test* suite(); };