diff --git a/lib/maths/CBayesianOptimisation.cc b/lib/maths/CBayesianOptimisation.cc index 58f6a07b66..672ca61e63 100644 --- a/lib/maths/CBayesianOptimisation.cc +++ b/lib/maths/CBayesianOptimisation.cc @@ -27,6 +27,8 @@ namespace ml { namespace maths { namespace { +const std::string VERSION_7_5_TAG{"7.5"}; + const std::string MIN_BOUNDARY_TAG{"min_boundary"}; const std::string MAX_BOUNDARY_TAG{"max_boundary"}; const std::string ERROR_VARIANCES_TAG{"error_variances"}; @@ -443,6 +445,7 @@ double CBayesianOptimisation::kernel(const TVector& a, const TVector& x, const T void CBayesianOptimisation::acceptPersistInserter(core::CStatePersistInserter& inserter) const { try { + core::CPersistUtils::persist(VERSION_7_5_TAG, "", inserter); inserter.insertValue(RNG_TAG, m_Rng.toString()); core::CPersistUtils::persist(MIN_BOUNDARY_TAG, m_MinBoundary, inserter); core::CPersistUtils::persist(MAX_BOUNDARY_TAG, m_MaxBoundary, inserter); @@ -460,39 +463,45 @@ void CBayesianOptimisation::acceptPersistInserter(core::CStatePersistInserter& i } bool CBayesianOptimisation::acceptRestoreTraverser(core::CStateRestoreTraverser& traverser) { - try { - do { - const std::string& name = traverser.name(); - RESTORE(RNG_TAG, m_Rng.fromString(traverser.value())) - RESTORE(MIN_BOUNDARY_TAG, - core::CPersistUtils::restore(MIN_BOUNDARY_TAG, m_MinBoundary, traverser)) - RESTORE(MAX_BOUNDARY_TAG, - core::CPersistUtils::restore(MAX_BOUNDARY_TAG, m_MaxBoundary, traverser)) - RESTORE(ERROR_VARIANCES_TAG, - core::CPersistUtils::restore(ERROR_VARIANCES_TAG, m_ErrorVariances, traverser)) - RESTORE(RANGE_SHIFT_TAG, - core::CPersistUtils::restore(RANGE_SHIFT_TAG, m_RangeShift, traverser)) - RESTORE(RANGE_SCALE_TAG, - core::CPersistUtils::restore(RANGE_SCALE_TAG, m_RangeScale, traverser)) - RESTORE(RESTARTS_TAG, - core::CPersistUtils::restore(RESTARTS_TAG, m_Restarts, traverser)) - RESTORE(KERNEL_PARAMETERS_TAG, - core::CPersistUtils::restore(KERNEL_PARAMETERS_TAG, - m_KernelParameters, traverser)) - 
RESTORE(MIN_KERNEL_COORDINATE_DISTANCE_SCALES_TAG, - core::CPersistUtils::restore( - MIN_KERNEL_COORDINATE_DISTANCE_SCALES_TAG, - m_MinimumKernelCoordinateDistanceScale, traverser)) - RESTORE(FUNCTION_MEAN_VALUES_TAG, - core::CPersistUtils::restore(FUNCTION_MEAN_VALUES_TAG, - m_FunctionMeanValues, traverser)) - } while (traverser.next()); - } catch (std::exception& e) { - LOG_ERROR(<< "Failed to restore state! " << e.what()); - return false; - } + if (traverser.name() == VERSION_7_5_TAG) { + try { + do { + const std::string& name = traverser.name(); + RESTORE(RNG_TAG, m_Rng.fromString(traverser.value())) + RESTORE(MIN_BOUNDARY_TAG, + core::CPersistUtils::restore(MIN_BOUNDARY_TAG, m_MinBoundary, traverser)) + RESTORE(MAX_BOUNDARY_TAG, + core::CPersistUtils::restore(MAX_BOUNDARY_TAG, m_MaxBoundary, traverser)) + RESTORE(ERROR_VARIANCES_TAG, + core::CPersistUtils::restore(ERROR_VARIANCES_TAG, + m_ErrorVariances, traverser)) + RESTORE(RANGE_SHIFT_TAG, + core::CPersistUtils::restore(RANGE_SHIFT_TAG, m_RangeShift, traverser)) + RESTORE(RANGE_SCALE_TAG, + core::CPersistUtils::restore(RANGE_SCALE_TAG, m_RangeScale, traverser)) + RESTORE(RESTARTS_TAG, + core::CPersistUtils::restore(RESTARTS_TAG, m_Restarts, traverser)) + RESTORE(KERNEL_PARAMETERS_TAG, + core::CPersistUtils::restore(KERNEL_PARAMETERS_TAG, + m_KernelParameters, traverser)) + RESTORE(MIN_KERNEL_COORDINATE_DISTANCE_SCALES_TAG, + core::CPersistUtils::restore( + MIN_KERNEL_COORDINATE_DISTANCE_SCALES_TAG, + m_MinimumKernelCoordinateDistanceScale, traverser)) + RESTORE(FUNCTION_MEAN_VALUES_TAG, + core::CPersistUtils::restore(FUNCTION_MEAN_VALUES_TAG, + m_FunctionMeanValues, traverser)) + } while (traverser.next()); + } catch (std::exception& e) { + LOG_ERROR(<< "Failed to restore state! " << e.what()); + return false; + } - return true; + return true; + } + LOG_ERROR(<< "Input error: unsupported state serialization version. 
Currently supported version: " + << VERSION_7_5_TAG); + return false; } std::size_t CBayesianOptimisation::memoryUsage() const { diff --git a/lib/maths/CBoostedTreeImpl.cc b/lib/maths/CBoostedTreeImpl.cc index 692aff017c..3e8d4201e6 100644 --- a/lib/maths/CBoostedTreeImpl.cc +++ b/lib/maths/CBoostedTreeImpl.cc @@ -961,13 +961,15 @@ std::size_t CBoostedTreeImpl::maximumTreeSize(std::size_t numberRows) const { const std::size_t CBoostedTreeImpl::PACKED_BIT_VECTOR_MAXIMUM_ROWS_PER_BYTE{256}; namespace { +const std::string VERSION_7_5_TAG{"7.5"}; + const std::string BAYESIAN_OPTIMIZATION_TAG{"bayesian_optimization"}; const std::string BEST_FOREST_TAG{"best_forest"}; const std::string BEST_FOREST_TEST_LOSS_TAG{"best_forest_test_loss"}; const std::string BEST_HYPERPARAMETERS_TAG{"best_hyperparameters"}; const std::string CURRENT_ROUND_TAG{"current_round"}; const std::string DEPENDENT_VARIABLE_TAG{"dependent_variable"}; -const std::string ENCODER_TAG{"encoder_tag"}; +const std::string ENCODER_TAG{"encoder"}; const std::string ETA_GROWTH_RATE_PER_TREE_TAG{"eta_growth_rate_per_tree"}; const std::string ETA_OVERRIDE_TAG{"eta_override"}; const std::string ETA_TAG{"eta"}; @@ -1034,6 +1036,7 @@ void CBoostedTreeImpl::SHyperparameters::acceptPersistInserter(core::CStatePersi } void CBoostedTreeImpl::acceptPersistInserter(core::CStatePersistInserter& inserter) const { + core::CPersistUtils::persist(VERSION_7_5_TAG, "", inserter); core::CPersistUtils::persist(BAYESIAN_OPTIMIZATION_TAG, *m_BayesianOptimization, inserter); core::CPersistUtils::persist(BEST_FOREST_TEST_LOSS_TAG, m_BestForestTestLoss, inserter); core::CPersistUtils::persist(CURRENT_ROUND_TAG, m_CurrentRound, inserter); @@ -1120,89 +1123,97 @@ bool CBoostedTreeImpl::SHyperparameters::acceptRestoreTraverser(core::CStateRest } bool CBoostedTreeImpl::acceptRestoreTraverser(core::CStateRestoreTraverser& traverser) { - do { - const std::string& name = traverser.name(); - RESTORE_NO_ERROR(BAYESIAN_OPTIMIZATION_TAG, - 
m_BayesianOptimization = - std::make_unique(traverser)) - RESTORE(BEST_FOREST_TEST_LOSS_TAG, - core::CPersistUtils::restore(BEST_FOREST_TEST_LOSS_TAG, - m_BestForestTestLoss, traverser)) - RESTORE(CURRENT_ROUND_TAG, - core::CPersistUtils::restore(CURRENT_ROUND_TAG, m_CurrentRound, traverser)) - RESTORE(DEPENDENT_VARIABLE_TAG, - core::CPersistUtils::restore(DEPENDENT_VARIABLE_TAG, - m_DependentVariable, traverser)) - RESTORE_NO_ERROR(ENCODER_TAG, - m_Encoder = std::make_unique(traverser)) - RESTORE(ETA_GROWTH_RATE_PER_TREE_TAG, - core::CPersistUtils::restore(ETA_GROWTH_RATE_PER_TREE_TAG, - m_EtaGrowthRatePerTree, traverser)) - RESTORE(ETA_TAG, core::CPersistUtils::restore(ETA_TAG, m_Eta, traverser)) - RESTORE(FEATURE_BAG_FRACTION_TAG, - core::CPersistUtils::restore(FEATURE_BAG_FRACTION_TAG, - m_FeatureBagFraction, traverser)) - RESTORE(FEATURE_DATA_TYPES_TAG, - core::CPersistUtils::restore(FEATURE_DATA_TYPES_TAG, - m_FeatureDataTypes, traverser)); - RESTORE(FEATURE_SAMPLE_PROBABILITIES_TAG, - core::CPersistUtils::restore(FEATURE_SAMPLE_PROBABILITIES_TAG, - m_FeatureSampleProbabilities, traverser)) - RESTORE(MAXIMUM_ATTEMPTS_TO_ADD_TREE_TAG, - core::CPersistUtils::restore(MAXIMUM_ATTEMPTS_TO_ADD_TREE_TAG, - m_MaximumAttemptsToAddTree, traverser)) - RESTORE(MAXIMUM_OPTIMISATION_ROUNDS_PER_HYPERPARAMETER_TAG, - core::CPersistUtils::restore( - MAXIMUM_OPTIMISATION_ROUNDS_PER_HYPERPARAMETER_TAG, - m_MaximumOptimisationRoundsPerHyperparameter, traverser)) - RESTORE(MAXIMUM_TREE_SIZE_MULTIPLIER_TAG, - core::CPersistUtils::restore(MAXIMUM_TREE_SIZE_MULTIPLIER_TAG, - m_MaximumTreeSizeMultiplier, traverser)) - RESTORE(MISSING_FEATURE_ROW_MASKS_TAG, - core::CPersistUtils::restore(MISSING_FEATURE_ROW_MASKS_TAG, - m_MissingFeatureRowMasks, traverser)) - RESTORE(NUMBER_FOLDS_TAG, - core::CPersistUtils::restore(NUMBER_FOLDS_TAG, m_NumberFolds, traverser)) - RESTORE(NUMBER_ROUNDS_TAG, - core::CPersistUtils::restore(NUMBER_ROUNDS_TAG, m_NumberRounds, traverser)) - 
RESTORE(NUMBER_SPLITS_PER_FEATURE_TAG, - core::CPersistUtils::restore(NUMBER_SPLITS_PER_FEATURE_TAG, - m_NumberSplitsPerFeature, traverser)) - RESTORE(NUMBER_THREADS_TAG, - core::CPersistUtils::restore(NUMBER_THREADS_TAG, m_NumberThreads, traverser)) - RESTORE(RANDOM_NUMBER_GENERATOR_TAG, m_Rng.fromString(traverser.value())) - RESTORE(REGULARIZATION_TAG, - core::CPersistUtils::restore(REGULARIZATION_TAG, m_Regularization, traverser)) - RESTORE(REGULARIZATION_OVERRIDE_TAG, - core::CPersistUtils::restore(REGULARIZATION_OVERRIDE_TAG, - m_RegularizationOverride, traverser)) - RESTORE(ROWS_PER_FEATURE_TAG, - core::CPersistUtils::restore(ROWS_PER_FEATURE_TAG, m_RowsPerFeature, traverser)) - RESTORE(TESTING_ROW_MASKS_TAG, - core::CPersistUtils::restore(TESTING_ROW_MASKS_TAG, m_TestingRowMasks, traverser)) - RESTORE(MAXIMUM_NUMBER_TREES_TAG, - core::CPersistUtils::restore(MAXIMUM_NUMBER_TREES_TAG, - m_MaximumNumberTrees, traverser)) - RESTORE(TRAINING_ROW_MASKS_TAG, - core::CPersistUtils::restore(TRAINING_ROW_MASKS_TAG, m_TrainingRowMasks, traverser)) - RESTORE(TRAINING_PROGRESS_TAG, - core::CPersistUtils::restore(TRAINING_PROGRESS_TAG, m_TrainingProgress, traverser)) - RESTORE(BEST_FOREST_TAG, - core::CPersistUtils::restore(BEST_FOREST_TAG, m_BestForest, traverser)) - RESTORE(BEST_HYPERPARAMETERS_TAG, - core::CPersistUtils::restore(BEST_HYPERPARAMETERS_TAG, - m_BestHyperparameters, traverser)) - RESTORE(ETA_OVERRIDE_TAG, - core::CPersistUtils::restore(ETA_OVERRIDE_TAG, m_EtaOverride, traverser)) - RESTORE(FEATURE_BAG_FRACTION_OVERRIDE_TAG, - core::CPersistUtils::restore(FEATURE_BAG_FRACTION_OVERRIDE_TAG, - m_FeatureBagFractionOverride, traverser)) - RESTORE(MAXIMUM_NUMBER_TREES_OVERRIDE_TAG, - core::CPersistUtils::restore(MAXIMUM_NUMBER_TREES_OVERRIDE_TAG, - m_MaximumNumberTreesOverride, traverser)) - RESTORE(LOSS_TAG, restoreLoss(m_Loss, traverser)) - } while (traverser.next()); - return true; + if (traverser.name() == VERSION_7_5_TAG) { + do { + const std::string& name 
= traverser.name(); + RESTORE_NO_ERROR(BAYESIAN_OPTIMIZATION_TAG, + m_BayesianOptimization = + std::make_unique(traverser)) + RESTORE(BEST_FOREST_TEST_LOSS_TAG, + core::CPersistUtils::restore(BEST_FOREST_TEST_LOSS_TAG, + m_BestForestTestLoss, traverser)) + RESTORE(CURRENT_ROUND_TAG, + core::CPersistUtils::restore(CURRENT_ROUND_TAG, m_CurrentRound, traverser)) + RESTORE(DEPENDENT_VARIABLE_TAG, + core::CPersistUtils::restore(DEPENDENT_VARIABLE_TAG, + m_DependentVariable, traverser)) + RESTORE_NO_ERROR(ENCODER_TAG, + m_Encoder = std::make_unique(traverser)) + RESTORE(ETA_GROWTH_RATE_PER_TREE_TAG, + core::CPersistUtils::restore(ETA_GROWTH_RATE_PER_TREE_TAG, + m_EtaGrowthRatePerTree, traverser)) + RESTORE(ETA_TAG, core::CPersistUtils::restore(ETA_TAG, m_Eta, traverser)) + RESTORE(FEATURE_BAG_FRACTION_TAG, + core::CPersistUtils::restore(FEATURE_BAG_FRACTION_TAG, + m_FeatureBagFraction, traverser)) + RESTORE(FEATURE_DATA_TYPES_TAG, + core::CPersistUtils::restore(FEATURE_DATA_TYPES_TAG, + m_FeatureDataTypes, traverser)); + RESTORE(FEATURE_SAMPLE_PROBABILITIES_TAG, + core::CPersistUtils::restore(FEATURE_SAMPLE_PROBABILITIES_TAG, + m_FeatureSampleProbabilities, traverser)) + RESTORE(MAXIMUM_ATTEMPTS_TO_ADD_TREE_TAG, + core::CPersistUtils::restore(MAXIMUM_ATTEMPTS_TO_ADD_TREE_TAG, + m_MaximumAttemptsToAddTree, traverser)) + RESTORE(MAXIMUM_OPTIMISATION_ROUNDS_PER_HYPERPARAMETER_TAG, + core::CPersistUtils::restore( + MAXIMUM_OPTIMISATION_ROUNDS_PER_HYPERPARAMETER_TAG, + m_MaximumOptimisationRoundsPerHyperparameter, traverser)) + RESTORE(MAXIMUM_TREE_SIZE_MULTIPLIER_TAG, + core::CPersistUtils::restore(MAXIMUM_TREE_SIZE_MULTIPLIER_TAG, + m_MaximumTreeSizeMultiplier, traverser)) + RESTORE(MISSING_FEATURE_ROW_MASKS_TAG, + core::CPersistUtils::restore(MISSING_FEATURE_ROW_MASKS_TAG, + m_MissingFeatureRowMasks, traverser)) + RESTORE(NUMBER_FOLDS_TAG, + core::CPersistUtils::restore(NUMBER_FOLDS_TAG, m_NumberFolds, traverser)) + RESTORE(NUMBER_ROUNDS_TAG, + 
core::CPersistUtils::restore(NUMBER_ROUNDS_TAG, m_NumberRounds, traverser)) + RESTORE(NUMBER_SPLITS_PER_FEATURE_TAG, + core::CPersistUtils::restore(NUMBER_SPLITS_PER_FEATURE_TAG, + m_NumberSplitsPerFeature, traverser)) + RESTORE(NUMBER_THREADS_TAG, + core::CPersistUtils::restore(NUMBER_THREADS_TAG, m_NumberThreads, traverser)) + RESTORE(RANDOM_NUMBER_GENERATOR_TAG, m_Rng.fromString(traverser.value())) + RESTORE(REGULARIZATION_TAG, + core::CPersistUtils::restore(REGULARIZATION_TAG, m_Regularization, traverser)) + RESTORE(REGULARIZATION_OVERRIDE_TAG, + core::CPersistUtils::restore(REGULARIZATION_OVERRIDE_TAG, + m_RegularizationOverride, traverser)) + RESTORE(ROWS_PER_FEATURE_TAG, + core::CPersistUtils::restore(ROWS_PER_FEATURE_TAG, m_RowsPerFeature, traverser)) + RESTORE(TESTING_ROW_MASKS_TAG, + core::CPersistUtils::restore(TESTING_ROW_MASKS_TAG, + m_TestingRowMasks, traverser)) + RESTORE(MAXIMUM_NUMBER_TREES_TAG, + core::CPersistUtils::restore(MAXIMUM_NUMBER_TREES_TAG, + m_MaximumNumberTrees, traverser)) + RESTORE(TRAINING_ROW_MASKS_TAG, + core::CPersistUtils::restore(TRAINING_ROW_MASKS_TAG, + m_TrainingRowMasks, traverser)) + RESTORE(TRAINING_PROGRESS_TAG, + core::CPersistUtils::restore(TRAINING_PROGRESS_TAG, + m_TrainingProgress, traverser)) + RESTORE(BEST_FOREST_TAG, + core::CPersistUtils::restore(BEST_FOREST_TAG, m_BestForest, traverser)) + RESTORE(BEST_HYPERPARAMETERS_TAG, + core::CPersistUtils::restore(BEST_HYPERPARAMETERS_TAG, + m_BestHyperparameters, traverser)) + RESTORE(ETA_OVERRIDE_TAG, + core::CPersistUtils::restore(ETA_OVERRIDE_TAG, m_EtaOverride, traverser)) + RESTORE(FEATURE_BAG_FRACTION_OVERRIDE_TAG, + core::CPersistUtils::restore(FEATURE_BAG_FRACTION_OVERRIDE_TAG, + m_FeatureBagFractionOverride, traverser)) + RESTORE(MAXIMUM_NUMBER_TREES_OVERRIDE_TAG, + core::CPersistUtils::restore(MAXIMUM_NUMBER_TREES_OVERRIDE_TAG, + m_MaximumNumberTreesOverride, traverser)) + RESTORE(LOSS_TAG, restoreLoss(m_Loss, traverser)) + } while (traverser.next()); + return 
true; + } + LOG_ERROR(<< "Input error: unsupported state serialization version. Currently supported version: " + << VERSION_7_5_TAG); + return false; } bool CBoostedTreeImpl::restoreLoss(CBoostedTree::TLossFunctionUPtr& loss, diff --git a/lib/maths/CDataFrameCategoryEncoder.cc b/lib/maths/CDataFrameCategoryEncoder.cc index 0de553b30c..e1f9166f63 100644 --- a/lib/maths/CDataFrameCategoryEncoder.cc +++ b/lib/maths/CDataFrameCategoryEncoder.cc @@ -38,6 +38,8 @@ const std::size_t CATEGORY_FOR_FREQUENCY_ENCODING{CATEGORY_FOR_METRICS - 1}; const std::size_t CATEGORY_FOR_TARGET_MEAN_ENCODING{CATEGORY_FOR_FREQUENCY_ENCODING - 1}; const std::size_t CATEGORY_FOR_DEPENDENT_VARIABLE{CATEGORY_FOR_TARGET_MEAN_ENCODING - 1}; +const std::string VERSION_7_5_TAG{"7.5"}; + const std::string ENCODING_VECTOR_TAG{"encoding_vector"}; const std::string ENCODING_INPUT_COLUMN_INDEX_TAG{"encoding_input_column_index"}; const std::string ENCODING_MIC_TAG{"encoding_mic"}; @@ -271,19 +273,25 @@ std::uint64_t CDataFrameCategoryEncoder::checksum(std::uint64_t seed) const { } void CDataFrameCategoryEncoder::acceptPersistInserter(core::CStatePersistInserter& inserter) const { + core::CPersistUtils::persist(VERSION_7_5_TAG, "", inserter); inserter.insertLevel(ENCODING_VECTOR_TAG, std::bind(&CDataFrameCategoryEncoder::persistEncodings, this, std::placeholders::_1)); } bool CDataFrameCategoryEncoder::acceptRestoreTraverser(core::CStateRestoreTraverser& traverser) { - do { - const std::string& name{traverser.name()}; - RESTORE(ENCODING_VECTOR_TAG, - traverser.traverseSubLevel(std::bind(&CDataFrameCategoryEncoder::restoreEncodings, - this, std::placeholders::_1))) - } while (traverser.next()); - return true; + if (traverser.name() == VERSION_7_5_TAG) { + do { + const std::string& name{traverser.name()}; + RESTORE(ENCODING_VECTOR_TAG, traverser.traverseSubLevel(std::bind( + &CDataFrameCategoryEncoder::restoreEncodings, + this, std::placeholders::_1))) + } while (traverser.next()); + return true; + } + 
LOG_ERROR(<< "Input error: unsupported state serialization version. Currently supported version: " + << VERSION_7_5_TAG); + return false; } void CDataFrameCategoryEncoder::persistEncodings(core::CStatePersistInserter& inserter) const { diff --git a/lib/maths/unittest/CBoostedTreeTest.cc b/lib/maths/unittest/CBoostedTreeTest.cc index f45d735cfd..b3adc4290f 100644 --- a/lib/maths/unittest/CBoostedTreeTest.cc +++ b/lib/maths/unittest/CBoostedTreeTest.cc @@ -18,8 +18,10 @@ #include #include +#include #include #include +#include #include using namespace ml; @@ -160,6 +162,21 @@ auto predictAndComputeEvaluationMetrics(const F& generateFunction, return std::make_pair(std::move(modelBias), std::move(modelRSquared)); } + +void readFileToStream(const std::string& filename, std::stringstream& stream) { + std::ifstream file(filename); + CPPUNIT_ASSERT(file.is_open()); + std::string str((std::istreambuf_iterator(file)), + std::istreambuf_iterator()); + stream << str; + stream.flush(); +} + +void clearFile(const std::string& filename) { + std::ofstream file; + file.open(filename, std::ofstream::out | std::ofstream::trunc); + file.close(); +} } void CBoostedTreeTest::testPiecewiseConstant() { @@ -1010,39 +1027,27 @@ void CBoostedTreeTest::testRestoreErrorHandling() { }; core::CLogger::CScopeSetFatalErrorHandler scope{errorHandler}; + const std::string logFile{"test.log"}; + + // log at level ERROR only + CPPUNIT_ASSERT(ml::core::CLogger::instance().reconfigureFromFile( + "testfiles/testLogErrors.boost.log.ini")); + std::size_t cols{3}; std::size_t capacity{50}; auto frame = core::makeMainStorageDataFrame(cols, capacity).first; std::stringstream errorInBayesianOptimisationState; - errorInBayesianOptimisationState - << "{\"bayesian_optimization\":" - "{\"min_boundary\":{\"dense_vector\":\"-9.191737e-1:-2.041179:-3.506558:1.025:2e-1\"}," - "\"max_boundary\":{\"dense_vector\":\"3.685997:2.563991:-1.203973:a:8e-1\"}," - 
"\"error_variances\":\"\",\"kernel_parameters\":{\"dense_vector\":\"1:1:1:1:1:1\"}," - "\"min_kernel_coordinate_distance_scales\":{\"dense_vector\":\"1e-3:1e-3:1e-3:1e-3:1e-3\"}," - "\"function_mean_values\":{\"d\":\"0\"}},\"best_forest_test_loss\":\"1.797693e308\"," - "\"current_round\":\"0\",\"dependent_variable\":\"2\",\"eta_growth_rate_per_tree\":\"1.05\"," - "\"eta\":\"1e-1\",\"feature_bag_fraction\":\"5e-1\",\"feature_sample_probabilities\":\"1:0:0\"," - "\"gamma\":\"1.298755\",\"lambda\":\"3.988485\",\"maximum_attempts_to_add_tree\":\"3\"," - "\"maximum_optimisation_rounds_per_hyperparameter\":\"3\",\"maximum_tree_size_fraction\":\"10\"," - "\"missing_feature_row_masks\":{\"d\":\"3\",\"a\":\"50:0:1:50\",\"a\":\"50:0:1:50\",\"a\":\"50:0:1:50\"}," - "\"number_folds\":\"2\",\"number_rounds\":\"15\",\"number_splits_per_feature\":\"40\"," - "\"number_threads\":\"1\",\"rows_per_feature\":\"50\"," - "\"testing_row_masks\":{\"d\":\"2\",\"a\":\"50:1:1:5:1:1:5:3:3:3:1:1:1:1:4:1:4:3:6:1:1:2:1:2\"," - "\"a\":\"50:0:1:5:1:1:5:3:3:3:1:1:1:1:4:1:4:3:6:1:1:2:1:2\"},\"maximum_number_trees\":\"2\"," - "\"training_row_masks\":{\"d\":\"2\",\"a\":\"50:0:1:5:1:1:5:3:3:3:1:1:1:1:4:1:4:3:6:1:1:2:1:2\"," - "\"a\":\"50:1:1:5:1:1:5:3:3:3:1:1:1:1:4:1:4:3:6:1:1:2:1:2\"},\"best_forest\":{\"d\":\"0\"}," - "\"best_hyperparameters\":{\"hyperparam_lambda\":\"0\",\"hyperparam_gamma\":\"0\"," - "\"hyperparam_eta\":\"0\",\"hyperparam_eta_growth_rate_per_tree\":\"0\"," - "\"hyperparam_feature_bag_fraction\":\"0\",\"hyperparam_feature_sample_probabilities\":\"\"}," - "\"eta_override\":\"false;0\",\"feature_bag_fraction_override\":\"false;0\",\"gamma_override\":\"false;0\"," - "\"lambda_override\":\"false;0\",\"maximum_number_trees_override\":\"true;2\",\"loss\":\"mse\"}"; + readFileToStream("testfiles/error_bayesian_optimisation_state.json", + errorInBayesianOptimisationState); errorInBayesianOptimisationState.flush(); bool throwsExceptions{false}; + std::stringstream buffer; + 
clearFile(logFile);
     try {
+
         auto boostedTree = maths::CBoostedTreeFactory::constructFromString(errorInBayesianOptimisationState)
                                .buildFor(*frame, 2);
     } catch (const std::exception& e) {
@@ -1051,36 +1056,41 @@ void CBoostedTreeTest::testRestoreErrorHandling() {
         core::CRegex re;
         re.init("Input error:.*");
         CPPUNIT_ASSERT(re.matches(e.what()));
+        readFileToStream(logFile, buffer);
+        CPPUNIT_ASSERT(buffer.str().find("Failed to restore MAX_BOUNDARY_TAG") !=
+                       std::string::npos);
     }
     CPPUNIT_ASSERT(throwsExceptions);
 
     std::stringstream errorInBoostedTreeImplState;
-    errorInBoostedTreeImplState
-        << "{\"bayesian_optimization\":"
-           "{\"min_boundary\":{\"dense_vector\":\"-9.191737e-1:-2.041179:-3.506558:1.025:2e-1\"},"
-           "\"max_boundary\":{\"dense_vector\":\"3.685997:2.563991:-1.203973:0.1:8e-1\"},"
-           "\"error_variances\":\"\",\"kernel_parameters\":{\"dense_vector\":\"1:1:1:1:1:1\"},"
-           "\"min_kernel_coordinate_distance_scales\":{\"dense_vector\":\"1e-3:1e-3:1e-3:1e-3:1e-3\"},"
-           "\"function_mean_values\":{\"d\":\"0\"}},\"best_forest_test_loss\":\"1.797693e308\","
-           "\"current_round\":\"0\",\"dependent_variable\":\"2\",\"eta_growth_rate_per_tree\":\"1.05\","
-           "\"eta\":\"1e-1\",\"feature_bag_fraction\":\"5e-1\",\"feature_sample_probabilities\":\"1:0:0\","
-           "\"gamma\":\"1.298755\",\"lambda\":\"3.988485\",\"maximum_attempts_to_add_tree\":\"3\","
-           "\"maximum_optimisation_rounds_per_hyperparameter\":\"3\",\"maximum_tree_size_fraction\":\"10\","
-           "\"missing_feature_row_masks\":{\"d\":\"3\",\"a\":\"50:0:1:50\",\"a\":\"50:0:1:50\",\"a\":\"50:0:1:50\"},"
-           "\"number_folds\":\"\",\"number_rounds\":\"15\",\"number_splits_per_feature\":\"40\","
-           "\"number_threads\":\"1\",\"rows_per_feature\":\"50\","
-           "\"testing_row_masks\":{\"d\":\"2\",\"a\":\"50:1:1:5:1:1:5:3:3:3:1:1:1:1:4:1:4:3:6:1:1:2:1:2\","
-           "\"a\":\"50:0:1:5:1:1:5:3:3:3:1:1:1:1:4:1:4:3:6:1:1:2:1:2\"},\"maximum_number_trees\":\"2\","
-           "\"training_row_masks\":{\"d\":\"2\",\"a\":\"50:0:1:5:1:1:5:3:3:3:1:1:1:1:4:1:4:3:6:1:1:2:1:2\","
-           "\"a\":\"50:1:1:5:1:1:5:3:3:3:1:1:1:1:4:1:4:3:6:1:1:2:1:2\"},\"best_forest\":{\"d\":\"0\"},"
-           "\"best_hyperparameters\":{\"hyperparam_lambda\":\"0\",\"hyperparam_gamma\":\"0\","
-           "\"hyperparam_eta\":\"0\",\"hyperparam_eta_growth_rate_per_tree\":\"0\","
-           "\"hyperparam_feature_bag_fraction\":\"0\",\"hyperparam_feature_sample_probabilities\":\"\"},"
-           "\"eta_override\":\"false;0\",\"feature_bag_fraction_override\":\"false;0\",\"gamma_override\":\"false;0\","
-           "\"lambda_override\":\"false;0\",\"maximum_number_trees_override\":\"true;2\",\"loss\":\"mse\"}";
+    readFileToStream("testfiles/error_boosted_tree_impl_state.json", errorInBoostedTreeImplState);
     errorInBoostedTreeImplState.flush();
 
     throwsExceptions = false;
+    buffer.str(""); // empty the captured log text; clear() would only reset the stream state flags
+    clearFile(logFile);
+    try {
+        auto boostedTree = maths::CBoostedTreeFactory::constructFromString(errorInBoostedTreeImplState)
+                               .buildFor(*frame, 2);
+    } catch (const std::exception& e) {
+        LOG_DEBUG(<< "got = " << e.what());
+        throwsExceptions = true;
+        core::CRegex re;
+        re.init("Input error:.*");
+        CPPUNIT_ASSERT(re.matches(e.what()));
+        readFileToStream(logFile, buffer);
+        CPPUNIT_ASSERT(buffer.str().find("Failed to restore NUMBER_FOLDS_TAG") !=
+                       std::string::npos);
+    }
+    CPPUNIT_ASSERT(throwsExceptions);
+
+    std::stringstream errorInStateVersion;
+    readFileToStream("testfiles/error_no_version_state.json", errorInStateVersion);
+    errorInStateVersion.flush();
+
+    throwsExceptions = false;
+    buffer.str(""); // empty the captured log text; clear() would only reset the stream state flags
+    clearFile(logFile);
     try {
-        auto boostedTree = maths::CBoostedTreeFactory::constructFromString(errorInBoostedTreeImplState)
+        auto boostedTree = maths::CBoostedTreeFactory::constructFromString(errorInStateVersion)
                                .buildFor(*frame, 2);
@@ -1090,8 +1100,12 @@ void CBoostedTreeTest::testRestoreErrorHandling() {
         core::CRegex re;
         re.init("Input error:.*");
         CPPUNIT_ASSERT(re.matches(e.what()));
+        readFileToStream(logFile, buffer);
+        CPPUNIT_ASSERT(buffer.str().find("unsupported state serialization version.") !=
+                       std::string::npos);
     }
     CPPUNIT_ASSERT(throwsExceptions);
+    ml::core::CLogger::instance().reset();
 }
 
 CppUnit::Test* CBoostedTreeTest::suite() {
diff --git a/lib/maths/unittest/testfiles/error_bayesian_optimisation_state.json b/lib/maths/unittest/testfiles/error_bayesian_optimisation_state.json
new file mode 100644
index 0000000000..98459ef5d0
--- /dev/null
+++ b/lib/maths/unittest/testfiles/error_bayesian_optimisation_state.json
@@ -0,0 +1,116 @@
+{
+    "7.5": "",
+    "bayesian_optimization": {
+        "7.5": "",
+        "rng": "16294208416658607535:7960286522194355700",
+        "min_boundary": {
+            "dense_vector": "-6.18966:-2.047204:-4.574167:2:5e-2:-3.506558:1.025:2e-1"
+        },
+        "max_boundary": {
+            "dense_vector": "-1.584489:2.557966:-1.589118e-2:7.321928:2.5e-1:a:1.075:8e-1"
+        },
+        "error_variances": "",
+        "kernel_parameters": {
+            "dense_vector": "1:1:1:1:1:1:1:1:1"
+        },
+        "min_kernel_coordinate_distance_scales": {
+            "dense_vector": "1e-3:1e-3:1e-3:1e-3:1e-3:1e-3:1e-3:1e-3"
+        },
+        "function_mean_values": {
+            "d": "0"
+        },
+        "range_scale": "1",
+        "range_shift": "0",
+        "restarts": "10"
+    },
+    "best_forest_test_loss": "1.797693e308",
+    "current_round": "0",
+    "dependent_variable": "2",
+    "encoder_tag": {
+        "7.5": "",
+        "encoding_vector": {
+            "identity_encoding": {
+                "encoding_input_column_index": "0",
+                "encoding_mic": "3.556275e-1"
+            },
+            "identity_encoding": {
+                "encoding_input_column_index": "2",
+                "encoding_mic": "0"
+            }
+        }
+    },
+    "eta_growth_rate_per_tree": "1.05",
+    "eta": "1e-1",
+    "feature_bag_fraction": "5e-1",
+    "feature_data_types": {
+        "d": "2",
+        "a": "0:2.10813656449318e-1:9.96675395965576:",
+        "a": "0:2.90942788124084e-1:9.86151218414307:"
+    },
+    "feature_sample_probabilities": "1:0",
+    "maximum_attempts_to_add_tree": "3",
+    "maximum_optimisation_rounds_per_hyperparameter": "3",
+    "maximum_tree_size_multiplier": "10",
+    "missing_feature_row_masks": {
+        "d": "3",
+        "a": "50:0:1:50",
+        "a": "50:0:1:50",
+        "a": "50:0:1:50"
+    },
+    "number_folds": "2",
+    "number_rounds": "24",
+
"number_splits_per_feature": "75", + "number_threads": "1", + "random_number_generator": "6348936557884334503:6746432788814635579", + "regularization_override": { + "regularization_depth_penalty_multiplier": "false;0", + "regularization_tree_size_penalty_multiplier": "false;0", + "regularization_leaf_weight_penalty_multiplier": "false;0", + "regularization_soft_tree_depth_limit": "false;0", + "regularization_soft_tree_depth_tolerance": "false;0" + }, + "regularization": { + "regularization_depth_penalty_multiplier": "2.050525e-2", + "regularization_tree_size_penalty_multiplier": "1.031488e-1", + "regularization_leaf_weight_penalty_multiplier": "1.290953", + "regularization_soft_tree_depth_limit": "3", + "regularization_soft_tree_depth_tolerance": "1.5e-1" + }, + "rows_per_feature": "50", + "testing_row_masks": { + "d": "2", + "a": "50:1:1:5:1:1:5:3:3:3:1:1:1:1:4:1:4:3:6:1:1:2:1:2", + "a": "50:0:1:5:1:1:5:3:3:3:1:1:1:1:4:1:4:3:6:1:1:2:1:2" + }, + "maximum_number_trees": "2", + "training_row_masks": { + "d": "2", + "a": "50:0:1:5:1:1:5:3:3:3:1:1:1:1:4:1:4:3:6:1:1:2:1:2", + "a": "50:1:1:5:1:1:5:3:3:3:1:1:1:1:4:1:4:3:6:1:1:2:1:2" + }, + "training_progress": { + "loop_size_tag": "76", + "progress_steps_tag": "32", + "current_step_progress_tag": "3.125e-2", + "loop_pos_tag": "26" + }, + "best_forest": { + "d": "0" + }, + "best_hyperparameters": { + "hyperparam_eta": "1e-1", + "hyperparam_eta_growth_rate_per_tree": "1.05", + "hyperparam_feature_bag_fraction": "5e-1", + "hyperparam_regularization": { + "regularization_depth_penalty_multiplier": "0", + "regularization_tree_size_penalty_multiplier": "0", + "regularization_leaf_weight_penalty_multiplier": "0", + "regularization_soft_tree_depth_limit": "0", + "regularization_soft_tree_depth_tolerance": "0" + } + }, + "eta_override": "false;0", + "feature_bag_fraction_override": "false;0", + "maximum_number_trees_override": "true;2", + "loss": "mse" +} diff --git a/lib/maths/unittest/testfiles/error_boosted_tree_impl_state.json 
b/lib/maths/unittest/testfiles/error_boosted_tree_impl_state.json new file mode 100644 index 0000000000..3d46515f81 --- /dev/null +++ b/lib/maths/unittest/testfiles/error_boosted_tree_impl_state.json @@ -0,0 +1,116 @@ +{ + "7.5": "", + "bayesian_optimization": { + "7.5": "", + "rng": "16294208416658607535:7960286522194355700", + "min_boundary": { + "dense_vector": "-6.18966:-2.047204:-4.574167:2:5e-2:-3.506558:1.025:2e-1" + }, + "max_boundary": { + "dense_vector": "-1.584489:2.557966:-1.589118e-2:7.321928:2.5e-1:-1.203973:1.075:8e-1" + }, + "error_variances": "", + "kernel_parameters": { + "dense_vector": "1:1:1:1:1:1:1:1:1" + }, + "min_kernel_coordinate_distance_scales": { + "dense_vector": "1e-3:1e-3:1e-3:1e-3:1e-3:1e-3:1e-3:1e-3" + }, + "function_mean_values": { + "d": "0" + }, + "range_scale": "1", + "range_shift": "0", + "restarts": "10" + }, + "best_forest_test_loss": "1.797693e308", + "current_round": "0", + "dependent_variable": "2", + "encoder_tag": { + "7.5": "", + "encoding_vector": { + "identity_encoding": { + "encoding_input_column_index": "0", + "encoding_mic": "3.556275e-1" + }, + "identity_encoding": { + "encoding_input_column_index": "2", + "encoding_mic": "0" + } + } + }, + "eta_growth_rate_per_tree": "1.05", + "eta": "1e-1", + "feature_bag_fraction": "5e-1", + "feature_data_types": { + "d": "2", + "a": "0:2.10813656449318e-1:9.96675395965576:", + "a": "0:2.90942788124084e-1:9.86151218414307:" + }, + "feature_sample_probabilities": "1:0", + "maximum_attempts_to_add_tree": "3", + "maximum_optimisation_rounds_per_hyperparameter": "3", + "maximum_tree_size_multiplier": "10", + "missing_feature_row_masks": { + "d": "3", + "a": "50:0:1:50", + "a": "50:0:1:50", + "a": "50:0:1:50" + }, + "number_folds": "", + "number_rounds": "24", + "number_splits_per_feature": "75", + "number_threads": "1", + "random_number_generator": "6348936557884334503:6746432788814635579", + "regularization_override": { + "regularization_depth_penalty_multiplier": "false;0", + 
"regularization_tree_size_penalty_multiplier": "false;0", + "regularization_leaf_weight_penalty_multiplier": "false;0", + "regularization_soft_tree_depth_limit": "false;0", + "regularization_soft_tree_depth_tolerance": "false;0" + }, + "regularization": { + "regularization_depth_penalty_multiplier": "2.050525e-2", + "regularization_tree_size_penalty_multiplier": "1.031488e-1", + "regularization_leaf_weight_penalty_multiplier": "1.290953", + "regularization_soft_tree_depth_limit": "3", + "regularization_soft_tree_depth_tolerance": "1.5e-1" + }, + "rows_per_feature": "50", + "testing_row_masks": { + "d": "2", + "a": "50:1:1:5:1:1:5:3:3:3:1:1:1:1:4:1:4:3:6:1:1:2:1:2", + "a": "50:0:1:5:1:1:5:3:3:3:1:1:1:1:4:1:4:3:6:1:1:2:1:2" + }, + "maximum_number_trees": "2", + "training_row_masks": { + "d": "2", + "a": "50:0:1:5:1:1:5:3:3:3:1:1:1:1:4:1:4:3:6:1:1:2:1:2", + "a": "50:1:1:5:1:1:5:3:3:3:1:1:1:1:4:1:4:3:6:1:1:2:1:2" + }, + "training_progress": { + "loop_size_tag": "76", + "progress_steps_tag": "32", + "current_step_progress_tag": "3.125e-2", + "loop_pos_tag": "26" + }, + "best_forest": { + "d": "0" + }, + "best_hyperparameters": { + "hyperparam_eta": "1e-1", + "hyperparam_eta_growth_rate_per_tree": "1.05", + "hyperparam_feature_bag_fraction": "5e-1", + "hyperparam_regularization": { + "regularization_depth_penalty_multiplier": "0", + "regularization_tree_size_penalty_multiplier": "0", + "regularization_leaf_weight_penalty_multiplier": "0", + "regularization_soft_tree_depth_limit": "0", + "regularization_soft_tree_depth_tolerance": "0" + } + }, + "eta_override": "false;0", + "feature_bag_fraction_override": "false;0", + "maximum_number_trees_override": "true;2", + "loss": "mse" +} diff --git a/lib/maths/unittest/testfiles/error_no_version_state.json b/lib/maths/unittest/testfiles/error_no_version_state.json new file mode 100644 index 0000000000..367e3fcc3f --- /dev/null +++ b/lib/maths/unittest/testfiles/error_no_version_state.json @@ -0,0 +1,115 @@ +{ + 
"bayesian_optimization": { + "7.5": "", + "rng": "16294208416658607535:7960286522194355700", + "min_boundary": { + "dense_vector": "-6.18966:-2.047204:-4.574167:2:5e-2:-3.506558:1.025:2e-1" + }, + "max_boundary": { + "dense_vector": "-1.584489:2.557966:-1.589118e-2:7.321928:2.5e-1:-1.203973:1.075:8e-1" + }, + "error_variances": "", + "kernel_parameters": { + "dense_vector": "1:1:1:1:1:1:1:1:1" + }, + "min_kernel_coordinate_distance_scales": { + "dense_vector": "1e-3:1e-3:1e-3:1e-3:1e-3:1e-3:1e-3:1e-3" + }, + "function_mean_values": { + "d": "0" + }, + "range_scale": "1", + "range_shift": "0", + "restarts": "10" + }, + "best_forest_test_loss": "1.797693e308", + "current_round": "0", + "dependent_variable": "2", + "encoder_tag": { + "7.5": "", + "encoding_vector": { + "identity_encoding": { + "encoding_input_column_index": "0", + "encoding_mic": "3.556275e-1" + }, + "identity_encoding": { + "encoding_input_column_index": "2", + "encoding_mic": "0" + } + } + }, + "eta_growth_rate_per_tree": "1.05", + "eta": "1e-1", + "feature_bag_fraction": "5e-1", + "feature_data_types": { + "d": "2", + "a": "0:2.10813656449318e-1:9.96675395965576:", + "a": "0:2.90942788124084e-1:9.86151218414307:" + }, + "feature_sample_probabilities": "1:0", + "maximum_attempts_to_add_tree": "3", + "maximum_optimisation_rounds_per_hyperparameter": "3", + "maximum_tree_size_multiplier": "10", + "missing_feature_row_masks": { + "d": "3", + "a": "50:0:1:50", + "a": "50:0:1:50", + "a": "50:0:1:50" + }, + "number_folds": "2", + "number_rounds": "24", + "number_splits_per_feature": "75", + "number_threads": "1", + "random_number_generator": "6348936557884334503:6746432788814635579", + "regularization_override": { + "regularization_depth_penalty_multiplier": "false;0", + "regularization_tree_size_penalty_multiplier": "false;0", + "regularization_leaf_weight_penalty_multiplier": "false;0", + "regularization_soft_tree_depth_limit": "false;0", + "regularization_soft_tree_depth_tolerance": "false;0" + }, + 
"regularization": { + "regularization_depth_penalty_multiplier": "2.050525e-2", + "regularization_tree_size_penalty_multiplier": "1.031488e-1", + "regularization_leaf_weight_penalty_multiplier": "1.290953", + "regularization_soft_tree_depth_limit": "3", + "regularization_soft_tree_depth_tolerance": "1.5e-1" + }, + "rows_per_feature": "50", + "testing_row_masks": { + "d": "2", + "a": "50:1:1:5:1:1:5:3:3:3:1:1:1:1:4:1:4:3:6:1:1:2:1:2", + "a": "50:0:1:5:1:1:5:3:3:3:1:1:1:1:4:1:4:3:6:1:1:2:1:2" + }, + "maximum_number_trees": "2", + "training_row_masks": { + "d": "2", + "a": "50:0:1:5:1:1:5:3:3:3:1:1:1:1:4:1:4:3:6:1:1:2:1:2", + "a": "50:1:1:5:1:1:5:3:3:3:1:1:1:1:4:1:4:3:6:1:1:2:1:2" + }, + "training_progress": { + "loop_size_tag": "76", + "progress_steps_tag": "32", + "current_step_progress_tag": "3.125e-2", + "loop_pos_tag": "26" + }, + "best_forest": { + "d": "0" + }, + "best_hyperparameters": { + "hyperparam_eta": "1e-1", + "hyperparam_eta_growth_rate_per_tree": "1.05", + "hyperparam_feature_bag_fraction": "5e-1", + "hyperparam_regularization": { + "regularization_depth_penalty_multiplier": "0", + "regularization_tree_size_penalty_multiplier": "0", + "regularization_leaf_weight_penalty_multiplier": "0", + "regularization_soft_tree_depth_limit": "0", + "regularization_soft_tree_depth_tolerance": "0" + } + }, + "eta_override": "false;0", + "feature_bag_fraction_override": "false;0", + "maximum_number_trees_override": "true;2", + "loss": "mse" +}