From 145ccb8615748ff188253e6079b6dd51ea59c1cd Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Fri, 6 Mar 2020 17:54:01 +0000 Subject: [PATCH] Tighten up missing category handling --- include/core/CDataFrame.h | 5 +- include/maths/CDataFrameUtils.h | 18 ++ lib/core/CDataFrame.cc | 4 - lib/maths/CDataFrameCategoryEncoder.cc | 8 +- lib/maths/CDataFrameUtils.cc | 9 +- lib/maths/unittest/CDataFrameUtilsTest.cc | 222 ++++++++++++++++++++-- 6 files changed, 243 insertions(+), 23 deletions(-) diff --git a/include/core/CDataFrame.h b/include/core/CDataFrame.h index 7b7c8f7314..44f54aafac 100644 --- a/include/core/CDataFrame.h +++ b/include/core/CDataFrame.h @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -487,7 +488,9 @@ class CORE_EXPORT CDataFrame final { std::size_t numberColumns); //! Get the value to use for a missing element in a data frame. - static double valueOfMissing(); + static constexpr double valueOfMissing() { + return std::numeric_limits::quiet_NaN(); + } private: using TStrSizeUMap = boost::unordered_map; diff --git a/include/maths/CDataFrameUtils.h b/include/maths/CDataFrameUtils.h index 507f7f0a15..bf7e5661ea 100644 --- a/include/maths/CDataFrameUtils.h +++ b/include/maths/CDataFrameUtils.h @@ -122,9 +122,15 @@ class MATHS_EXPORT CDataFrameUtils : private core::CNonInstantiatable { COneHotCategoricalColumnValue(std::size_t column, std::size_t category) : CColumnValue{column}, m_Category{category} {} double operator()(const TRowRef& row) const override { + if (isMissing(row[this->column()])) { + return core::CDataFrame::valueOfMissing(); + } return static_cast(row[this->column()]) == m_Category ? 1.0 : 0.0; } double operator()(const TFloatVec& row) const override { + if (isMissing(row[this->column()])) { + return core::CDataFrame::valueOfMissing(); + } return static_cast(row[this->column()]) == m_Category ? 
1.0 : 0.0; } std::size_t hash() const override { return m_Category; } @@ -140,10 +146,16 @@ class MATHS_EXPORT CDataFrameUtils : private core::CNonInstantiatable { CFrequencyCategoricalColumnValue(std::size_t column, const TDoubleVec& frequencies) : CColumnValue{column}, m_Frequencies{&frequencies} {} double operator()(const TRowRef& row) const override { + if (isMissing(row[this->column()])) { + return core::CDataFrame::valueOfMissing(); + } std::size_t category{static_cast(row[this->column()])}; return (*m_Frequencies)[category]; } double operator()(const TFloatVec& row) const override { + if (isMissing(row[this->column()])) { + return core::CDataFrame::valueOfMissing(); + } std::size_t category{static_cast(row[this->column()])}; return (*m_Frequencies)[category]; } @@ -166,10 +178,16 @@ class MATHS_EXPORT CDataFrameUtils : private core::CNonInstantiatable { : CColumnValue{column}, m_RareCategories{&rareCategories}, m_TargetMeanValues{&targetMeanValues} { } double operator()(const TRowRef& row) const override { + if (isMissing(row[this->column()])) { + return core::CDataFrame::valueOfMissing(); + } std::size_t category{static_cast(row[this->column()])}; return this->isRare(category) ? 0.0 : (*m_TargetMeanValues)[category]; } double operator()(const TFloatVec& row) const override { + if (isMissing(row[this->column()])) { + return core::CDataFrame::valueOfMissing(); + } std::size_t category{static_cast(row[this->column()])}; return this->isRare(category) ? 0.0 : (*m_TargetMeanValues)[category]; } diff --git a/lib/core/CDataFrame.cc b/lib/core/CDataFrame.cc index 00b6ee0119..c65baba8ab 100644 --- a/lib/core/CDataFrame.cc +++ b/lib/core/CDataFrame.cc @@ -390,10 +390,6 @@ std::size_t CDataFrame::estimateMemoryUsage(bool inMainMemory, return inMainMemory ? 
numberRows * numberColumns * sizeof(float) : 0; } -double CDataFrame::valueOfMissing() { - return std::numeric_limits::quiet_NaN(); -} - CDataFrame::TRowFuncVecBoolPr CDataFrame::parallelApplyToAllRows(std::size_t numberThreads, std::size_t beginRows, diff --git a/lib/maths/CDataFrameCategoryEncoder.cc b/lib/maths/CDataFrameCategoryEncoder.cc index 8e657bd612..6cae28f83a 100644 --- a/lib/maths/CDataFrameCategoryEncoder.cc +++ b/lib/maths/CDataFrameCategoryEncoder.cc @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -452,7 +453,9 @@ EEncoding CDataFrameCategoryEncoder::COneHotEncoding::type() const { } double CDataFrameCategoryEncoder::COneHotEncoding::encode(double value) const { - return static_cast(value) == m_HotCategory; + return CDataFrameUtils::isMissing(value) + ? core::CDataFrame::valueOfMissing() + : static_cast(value) == m_HotCategory; } bool CDataFrameCategoryEncoder::COneHotEncoding::isBinary() const { @@ -503,6 +506,9 @@ EEncoding CDataFrameCategoryEncoder::CMappedEncoding::type() const { } double CDataFrameCategoryEncoder::CMappedEncoding::encode(double value) const { + if (CDataFrameUtils::isMissing(value)) { + return core::CDataFrame::valueOfMissing(); + } std::size_t category{static_cast(value)}; return category < m_Map.size() ? 
m_Map[category] : m_Fallback; } diff --git a/lib/maths/CDataFrameUtils.cc b/lib/maths/CDataFrameUtils.cc index 518bedbc28..092dc6c6b5 100644 --- a/lib/maths/CDataFrameUtils.cc +++ b/lib/maths/CDataFrameUtils.cc @@ -824,9 +824,11 @@ CDataFrameUtils::TSizeDoublePrVecVecVec CDataFrameUtils::categoricalMicWithColum 1, 0, frame.numberRows(), [&](TRowItr beginRows, TRowItr endRows) { for (auto row = beginRows; row != endRows; ++row) { + if (isMissing((*row)[i]) || isMissing(target(*row))) { + continue; + } std::size_t category{static_cast((*row)[i])}; - if (frequencies[i][category] >= minimumFrequency && - isMissing(target(*row)) == false) { + if (frequencies[i][category] >= minimumFrequency) { sampler.sample(*row); } } @@ -905,6 +907,9 @@ CDataFrameUtils::TSizeDoublePrVecVecVec CDataFrameUtils::categoricalMicWithColum encoders.clear(); for (const auto& sample : samples) { + if (isMissing(sample[i])) { + continue; + } std::size_t category{static_cast(sample[i])}; if (frequencies[i][category] >= minimumFrequency) { auto encoder = makeEncoder(i, i, category); diff --git a/lib/maths/unittest/CDataFrameUtilsTest.cc b/lib/maths/unittest/CDataFrameUtilsTest.cc index b3d8f87eba..2394334220 100644 --- a/lib/maths/unittest/CDataFrameUtilsTest.cc +++ b/lib/maths/unittest/CDataFrameUtilsTest.cc @@ -4,6 +4,7 @@ * you may not use this file except in compliance with the Elastic License. */ +#include #include #include @@ -604,6 +605,9 @@ BOOST_AUTO_TEST_CASE(testStratifiedCrossValidationRowMasks) { BOOST_AUTO_TEST_CASE(testMicWithColumn) { + // Test we get the exact MICe value when the number of rows is less than + // the target sample size. + test::CRandomNumbers rng; std::size_t capacity{500}; @@ -619,9 +623,6 @@ BOOST_AUTO_TEST_CASE(testMicWithColumn) { return core::makeMainStorageDataFrame(numberCols, capacity).first; }}; - // Test we get exactly the value we expect when the number of rows is less - // than the target sample size. 
- for (const auto& factory : {makeOnDisk, makeMainMemory}) { auto frame = factory(); @@ -662,8 +663,27 @@ BOOST_AUTO_TEST_CASE(testMicWithColumn) { BOOST_REQUIRE_EQUAL(core::CContainerPrinter::print(expected), core::CContainerPrinter::print(actual)); } +} + +BOOST_AUTO_TEST_CASE(testMicWithColumnWithMissing) { - // Test missing values. + // Test we get the exact MICe value with missing values when the number + // of rows is less than the target sample size. + + test::CRandomNumbers rng; + + std::size_t capacity{500}; + std::size_t numberRows{2000}; + std::size_t numberCols{4}; + + TFactoryFunc makeOnDisk{[=] { + return core::makeDiskStorageDataFrame(test::CTestTmpDir::tmpDir(), + numberCols, numberRows, capacity) + .first; + }}; + TFactoryFunc makeMainMemory{[=] { + return core::makeMainStorageDataFrame(numberCols, capacity).first; + }}; for (const auto& factory : {makeOnDisk, makeMainMemory}) { auto frame = factory(); @@ -677,9 +697,9 @@ BOOST_AUTO_TEST_CASE(testMicWithColumn) { rng.generateUniformSamples(-5.0, 5.0, 4, row); row[3] = 2.0 * row[0] - 1.5 * row[1] + 4.0 * row[2]; for (std::size_t j = 0; j < row.size(); ++j) { - TDoubleVec p; - rng.generateUniformSamples(0.0, 1.0, 1, p); - if (p[0] < 0.01) { + TDoubleVec u01; + rng.generateUniformSamples(0.0, 1.0, 1, u01); + if (u01[0] < 0.01) { row[j] = core::CDataFrame::valueOfMissing(); ++missing[j]; } @@ -722,6 +742,8 @@ BOOST_AUTO_TEST_CASE(testMicWithColumn) { BOOST_AUTO_TEST_CASE(testCategoryFrequencies) { + // Test we get the correct frequencies for each category. + std::size_t rows{5000}; std::size_t cols{4}; std::size_t capacity{500}; @@ -783,8 +805,73 @@ BOOST_AUTO_TEST_CASE(testCategoryFrequencies) { core::stopDefaultAsyncExecutor(); } +BOOST_AUTO_TEST_CASE(testCategoryFrequenciesWithMissing) { + + // Test we get the correct frequencies for each category with missing values. 
+ + std::size_t rows{5000}; + std::size_t cols{4}; + std::size_t capacity{500}; + double probabilityMissing{0.01}; + double missingStandardDeviation{ + std::sqrt(probabilityMissing * static_cast(rows))}; + + test::CRandomNumbers rng; + + TDoubleVecVec expectedFrequencies; + TDoubleVecVec values; + std::tie(expectedFrequencies, values) = generateCategoricalData( + rng, rows, cols, {10.0, 30.0, 1.0, 5.0, 15.0, 9.0, 20.0, 10.0}); + + TFactoryFunc makeOnDisk{[=] { + return core::makeDiskStorageDataFrame(test::CTestTmpDir::tmpDir(), cols, rows, capacity) + .first; + }}; + TFactoryFunc makeMainMemory{ + [=] { return core::makeMainStorageDataFrame(cols, capacity).first; }}; + + for (const auto& factory : {makeOnDisk, makeMainMemory}) { + + auto frame = factory(); + frame->categoricalColumns(TBoolVec{true, false, true, false}); + + TDoubleVec u01; + for (std::size_t i = 0; i < rows; ++i) { + frame->writeRow([&](core::CDataFrame::TFloatVecItr column, std::int32_t&) mutable { + for (std::size_t j = 0; j < cols; ++j, ++column) { + rng.generateUniformSamples(0.0, 1.0, 1, u01); + if (u01[0] < probabilityMissing) { + *column = core::CDataFrame::valueOfMissing(); + } else { + *column = values[j][i]; + } + } + }); + } + frame->finishWritingRows(); + + TDoubleVecVec actualFrequencies{maths::CDataFrameUtils::categoryFrequencies( + 1, *frame, maskAll(rows), {0, 1, 2, 3})}; + + BOOST_REQUIRE_EQUAL(std::size_t{4}, actualFrequencies.size()); + for (std::size_t i : {0, 2}) { + BOOST_REQUIRE_EQUAL(actualFrequencies.size(), expectedFrequencies.size()); + for (std::size_t j = 0; j < actualFrequencies[i].size(); ++j) { + BOOST_REQUIRE_CLOSE_ABSOLUTE( + expectedFrequencies[i][j], actualFrequencies[i][j], + 3.0 * missingStandardDeviation / static_cast(rows)); + } + } + for (std::size_t i : {1, 3}) { + BOOST_TEST_REQUIRE(actualFrequencies[i].empty()); + } + } +} + BOOST_AUTO_TEST_CASE(testMeanValueOfTargetForCategories) { + // Test we get the correct mean values for each category. 
+ std::size_t rows{2000}; std::size_t cols{4}; std::size_t capacity{500}; @@ -876,23 +963,32 @@ BOOST_AUTO_TEST_CASE(testMeanValueOfTargetForCategoriesWithMissing) { std::tie(frequencies, values) = generateCategoricalData( rng, rows, cols - 1, {10.0, 30.0, 1.0, 5.0, 15.0, 9.0, 20.0, 10.0}); - TDoubleVec uniform01; - rng.generateUniformSamples(0.0, 1.0, rows, uniform01); - values.resize(cols); values[cols - 1].resize(rows, 0.0); TMeanAccumulatorVecVec expectedMeans(cols, TMeanAccumulatorVec(8)); + TDoubleVec u01; for (std::size_t i = 0; i < rows; ++i) { - if (uniform01[i] < 0.9) { + for (std::size_t j = 0; j + 1 < cols; ++j) { + rng.generateUniformSamples(0.0, 1.0, 1, u01); + if (u01[0] < 0.01) { + values[j][i] = core::CDataFrame::valueOfMissing(); + } + } + rng.generateUniformSamples(0.0, 1.0, 1, u01); + if (u01[0] < 0.9) { for (std::size_t j = 0; j + 1 < cols; ++j) { - values[cols - 1][i] += values[j][i]; + if (maths::CDataFrameUtils::isMissing(values[j][i]) == false) { + values[cols - 1][i] += values[j][i]; + } } for (std::size_t j = 0; j + 1 < cols; ++j) { - expectedMeans[j][static_cast(values[j][i])].add( - values[cols - 1][i]); + if (maths::CDataFrameUtils::isMissing(values[j][i]) == false) { + expectedMeans[j][static_cast(values[j][i])].add( + values[cols - 1][i]); + } } } else { - values[cols - 1][i] = std::numeric_limits::quiet_NaN(); + values[cols - 1][i] = core::CDataFrame::valueOfMissing(); } } @@ -924,6 +1020,8 @@ BOOST_AUTO_TEST_CASE(testMeanValueOfTargetForCategoriesWithMissing) { BOOST_AUTO_TEST_CASE(testCategoryMicWithColumn) { + // Test one uncorrelated and one correlated categorical field MICe. 
+ std::size_t rows{5000}; std::size_t cols{4}; std::size_t capacity{2000}; @@ -997,6 +1095,9 @@ BOOST_AUTO_TEST_CASE(testCategoryMicWithColumn) { BOOST_TEST_REQUIRE(mics[0][0].second < 0.05); BOOST_TEST_REQUIRE(mics[2][0].second > 0.50); + // The expected order is a function of both the category frequency + // and its order since the target value is order + noise so the + // larger the order the smaller the noise, relatively. TSizeVec categoryOrder; for (const auto& category : mics[2]) { categoryOrder.push_back(category.first); @@ -1010,4 +1111,95 @@ BOOST_AUTO_TEST_CASE(testCategoryMicWithColumn) { core::stopDefaultAsyncExecutor(); } + +BOOST_AUTO_TEST_CASE(testCategoryMicWithColumnWithMissing) { + + std::size_t rows{5000}; + std::size_t cols{4}; + std::size_t capacity{2000}; + + test::CRandomNumbers rng; + + TDoubleVecVec frequencies; + TDoubleVecVec values; + std::tie(frequencies, values) = + generateCategoricalData(rng, rows, cols - 1, {20.0, 60.0, 5.0, 15.0, 1.0}); + + values.resize(cols); + rng.generateNormalSamples(0.0, 1.0, rows, values[cols - 1]); + TDoubleVec u01; + for (std::size_t i = 0; i < rows; ++i) { + values[cols - 1][i] += 2.0 * values[2][i]; + for (std::size_t j = 0; j < cols - 1; ++j) { + rng.generateUniformSamples(0.0, 1.0, 1, u01); + if (u01[0] < 0.01) { + values[j][i] = core::CDataFrame::valueOfMissing(); + } + } + } + + TFactoryFunc makeOnDisk{[=] { + return core::makeDiskStorageDataFrame(test::CTestTmpDir::tmpDir(), cols, rows, capacity) + .first; + }}; + TFactoryFunc makeMainMemory{ + [=] { return core::makeMainStorageDataFrame(cols, capacity).first; }}; + + for (const auto& factory : {makeOnDisk, makeMainMemory}) { + + auto frame = factory(); + + frame->categoricalColumns(TBoolVec{true, false, true, false}); + for (std::size_t i = 0; i < rows; ++i) { + frame->writeRow([&values, i, cols](core::CDataFrame::TFloatVecItr column, + std::int32_t&) { + for (std::size_t j = 0; j < cols; ++j, ++column) { + *column = values[j][i]; + } + }); + } + 
frame->finishWritingRows(); + + auto mics = maths::CDataFrameUtils::categoricalMicWithColumn( + maths::CDataFrameUtils::CMetricColumnValue{3}, 1, *frame, + maskAll(rows), {0, 1, 2}, + {{[](std::size_t, std::size_t sampleColumn, std::size_t category) { + return std::make_unique( + sampleColumn, category); + }, + 0.01}})[0]; + + LOG_DEBUG(<< "mics[0] = " << core::CContainerPrinter::print(mics[0])); + LOG_DEBUG(<< "mics[2] = " << core::CContainerPrinter::print(mics[2])); + + BOOST_REQUIRE_EQUAL(std::size_t{4}, mics.size()); + for (const auto& mic : mics) { + BOOST_TEST_REQUIRE(std::is_sorted( + mic.begin(), mic.end(), [](const auto& lhs, const auto& rhs) { + return maths::COrderings::lexicographical_compare( + -lhs.second, lhs.first, -rhs.second, rhs.first); + })); + } + for (std::size_t i : {0, 2}) { + BOOST_REQUIRE_EQUAL(std::size_t{5}, mics[i].size()); + } + for (std::size_t i : {1, 3}) { + BOOST_TEST_REQUIRE(mics[i].empty()); + } + + BOOST_TEST_REQUIRE(mics[0][0].second < 0.04); + BOOST_TEST_REQUIRE(mics[2][0].second > 0.49); + + // The expected order is a function of both the category frequency + // and its order since the target value is order + noise so the + // larger the order the smaller the noise, relatively. + TSizeVec categoryOrder; + for (const auto& category : mics[2]) { + categoryOrder.push_back(category.first); + } + BOOST_REQUIRE_EQUAL(std::string{"[1, 3, 0, 4, 2]"}, + core::CContainerPrinter::print(categoryOrder)); + } +} + BOOST_AUTO_TEST_SUITE_END()