diff --git a/docs/CHANGELOG.asciidoc b/docs/CHANGELOG.asciidoc
index 9ffe4000d1..e760eb5878 100644
--- a/docs/CHANGELOG.asciidoc
+++ b/docs/CHANGELOG.asciidoc
@@ -58,6 +58,8 @@
 (See {ml-pull}1126[#1126], issue: {issue}54506[#54506].)
 * Added a {ml} native code build for Linux on AArch64. (See {ml-pull}1132[#1132] and
 {ml-pull}1135[#1135].)
+* Improve data frame analysis runtime by optimising memory alignment for intrinsic
+  operations. (See {ml-pull}1142[#1142].)
 
 == {es} version 7.7.1
 
@@ -66,7 +68,6 @@
 * Fixed background persistence of categorizer state (See {ml-pull}1137[#1137],
 issue: {ml-issue}1136[#1136].)
 
-
 == {es} version 7.7.0
 
 === New Features

diff --git a/include/api/CDataFrameTrainBoostedTreeRunner.h b/include/api/CDataFrameTrainBoostedTreeRunner.h
index fc20903f74..1dff2b5178 100644
--- a/include/api/CDataFrameTrainBoostedTreeRunner.h
+++ b/include/api/CDataFrameTrainBoostedTreeRunner.h
@@ -52,7 +52,7 @@ class API_EXPORT CDataFrameTrainBoostedTreeRunner : public CDataFrameAnalysisRun
     static const std::string NUM_TOP_FEATURE_IMPORTANCE_VALUES;
     static const std::string TRAINING_PERCENT_FIELD_NAME;
 
-    //Output
+    // Output
     static const std::string IS_TRAINING_FIELD_NAME;
     static const std::string FEATURE_NAME_FIELD_NAME;
     static const std::string IMPORTANCE_FIELD_NAME;
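The changelog entry above credits the speedup to aligning the memory used by vectorised (SIMD/intrinsic) operations. As a standalone illustration of that mechanism, and not code from this patch, the sketch below maps one buffer both with and without Eigen's 16-byte alignment promise; only the aligned map lets Eigen emit aligned vector loads. The buffer and sizes are hypothetical.

#include <Eigen/Core>

#include <iostream>
#include <vector>

int main() {
    // Eigen's aligned_allocator guarantees 16-byte aligned storage.
    std::vector<float, Eigen::aligned_allocator<float>> storage(8, 1.0F);

    // An unaligned map makes no promise about storage.data().
    Eigen::Map<const Eigen::VectorXf> unaligned(storage.data(), 8);

    // An aligned map promises storage.data() is 16-byte aligned, which
    // enables aligned packet loads in the generated code.
    Eigen::Map<const Eigen::VectorXf, Eigen::Aligned16> aligned(storage.data(), 8);

    std::cout << unaligned.sum() << " == " << aligned.sum() << '\n';
    return 0;
}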
diff --git a/include/core/CAlignment.h b/include/core/CAlignment.h
new file mode 100644
index 0000000000..bedde0e994
--- /dev/null
+++ b/include/core/CAlignment.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+
+#ifndef INCLUDED_ml_core_CAlignment_h
+#define INCLUDED_ml_core_CAlignment_h
+
+#include <core/ImportExport.h>
+
+#include <Eigen/Core>
+
+#include <array>
+#include <string>
+#include <vector>
+
+namespace ml {
+namespace core {
+
+class CORE_EXPORT CAlignment {
+public:
+    //! Alignment types.
+    enum EType {
+        E_Unaligned = 1,
+        E_Aligned8 = 8,
+        E_Aligned16 = 16,
+        E_Aligned32 = 32
+    };
+
+    //! This is an ordering by inclusion, i.e. \p lhs < \p rhs if an address
+    //! being \p rhs aligned implies that it is \p lhs aligned but not vice versa.
+    static bool less(EType lhs, EType rhs) { return bytes(lhs) < bytes(rhs); }
+
+    //! Get the alignment of \p address.
+    template<typename T>
+    static EType maxAlignment(const T* address) {
+        // clang-format off
+        return (isAligned(address, E_Aligned32) ? E_Aligned32 :
+               (isAligned(address, E_Aligned16) ? E_Aligned16 :
+               (isAligned(address, E_Aligned8)  ? E_Aligned8  :
+               (E_Unaligned))));
+        // clang-format on
+    }
+
+    //! Check if \p address has \p alignment.
+    template<typename T>
+    static bool isAligned(const T* address, EType alignment) {
+        return offset(address, alignment) == 0;
+    }
+
+    //! Get the next index in \p buffer which is aligned to \p alignment.
+    template<typename T, std::size_t N>
+    static std::size_t
+    nextAligned(const std::array<T, N>& buffer, std::size_t index, EType alignment) {
+        std::size_t offset_{offset(&buffer[index], alignment)};
+        return offset_ == 0 ? index : index + (bytes(alignment) - offset_) / sizeof(T);
+    }
+
+    //! Get the next index in \p buffer which is aligned to \p alignment.
+    template<typename T>
+    static std::size_t
+    nextAligned(const std::vector<T>& buffer, std::size_t index, EType alignment) {
+        std::size_t offset_{offset(&buffer[index], alignment)};
+        return offset_ == 0 ? index : index + (bytes(alignment) - offset_) / sizeof(T);
+    }
+
+    //! Round up \p n items of T so they occupy a multiple of \p alignment bytes.
+    template<typename T>
+    static std::size_t roundup(EType alignment, std::size_t n) {
+        return roundupSizeof<T>(alignment, n) / sizeof(T);
+    }
+
+    //! Round up \p n times sizeof(T) to a multiple of \p alignment bytes.
+    template<typename T>
+    static std::size_t roundupSizeof(EType alignment, std::size_t n = 1) {
+        std::size_t bytes_{bytes(alignment)};
+        return ((n * sizeof(T) + bytes_ - 1) / bytes_) * bytes_;
+    }
+
+    //! Print the type.
+    static std::string print(EType type) {
+        switch (type) {
+        case E_Unaligned:
+            return "unaligned";
+        case E_Aligned8:
+            return "aligned 8";
+        case E_Aligned16:
+            return "aligned 16";
+        case E_Aligned32:
+            return "aligned 32";
+        }
+        return "";
+    }
+
+private:
+    template<typename T>
+    static std::size_t offset(const T* address, EType alignment) {
+        return reinterpret_cast<std::size_t>(address) & mask(alignment);
+    }
+
+    static std::size_t mask(EType alignment) { return bytes(alignment) - 1; }
+
+    static std::size_t bytes(EType alignment) {
+        return static_cast<std::size_t>(alignment);
+    }
+};
+
+template<typename T>
+using CAlignedAllocator = Eigen::aligned_allocator<T>;
+}
+}
+
+#endif // INCLUDED_ml_core_CAlignment_h
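Before moving on to its first consumer, here is a minimal usage sketch for CAlignment (standalone and hypothetical, not part of the patch). It pads a five-column row of floats to a 16-byte boundary and finds the next aligned index in an aligned buffer, which is how CDataFrame uses these helpers below.

#include <core/CAlignment.h>

#include <array>
#include <iostream>

int main() {
    using ml::core::CAlignment;

    // 5 floats = 20 bytes, which rounds up to 32 bytes = 8 floats, so each
    // row carries 3 floats of padding.
    std::size_t rowCapacity{CAlignment::roundup<float>(CAlignment::E_Aligned16, 5)};
    std::cout << "row capacity = " << rowCapacity << '\n'; // prints 8

    // Find the first 16-byte aligned element at or after index 3: element 3
    // sits at byte offset 12, so the next aligned index is 4.
    alignas(16) std::array<float, 16> buffer{};
    std::size_t aligned{
        CAlignment::nextAligned(buffer, 3, CAlignment::E_Aligned16)};
    std::cout << "next aligned index = " << aligned << '\n'; // prints 4
    return 0;
}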
diff --git a/include/core/CDataFrame.h b/include/core/CDataFrame.h
index 44f54aafac..a9effe0135 100644
--- a/include/core/CDataFrame.h
+++ b/include/core/CDataFrame.h
@@ -7,6 +7,7 @@
 #ifndef INCLUDED_ml_core_CDataFrame_h
 #define INCLUDED_ml_core_CDataFrame_h
 
+#include <core/CAlignment.h>
 #include <core/CFloatStorage.h>
 #include <core/CVectorRange.h>
 #include <core/ImportExport.h>
@@ -32,7 +33,7 @@ class CTemporaryDirectory;
 
 namespace data_frame_detail {
 
-using TFloatVec = std::vector<CFloatStorage>;
+using TFloatVec = std::vector<CFloatStorage, CAlignedAllocator<CFloatStorage>>;
 using TFloatVecItr = TFloatVec::iterator;
 using TInt32Vec = std::vector<std::int32_t>;
 using TInt32VecCItr = TInt32Vec::const_iterator;
@@ -178,8 +179,9 @@ class CORE_EXPORT CRowIterator final
 //! parallelized in which case each reader reads a disjoint subset of the data
 //! frame's rows.
 //!
-//! Space can be reserved at any point to hold one or more additional columns.
-//! These are not visible until they are written.
+//! Space can be reserved for additional rows and the data frame can be resized
+//! to hold one or more additional columns. Resizing is a heavyweight operation
+//! and should be minimized.
 //!
 //! IMPLEMENTATION:\n
 //! This is a fairly lightweight container which is essentially responsible
@@ -187,8 +189,9 @@ class CORE_EXPORT CRowIterator final
 //! The store format is determined by the user implementing functionality to
 //! read and write state from the store. For example, these could copy to /
 //! from main memory, "write to" / "read from" disk, etc. A factory function
-//! must be provided to the constructor which effectively that determines the
-//! type of storage used. It is assumed that copying this has no side effects.
+//! for new chunks of storage must be provided to the constructor and this
+//! effectively determines the type of storage used. It is assumed that copying
+//! this function has no side effects.
@@ -196,29 +199,34 @@ class CORE_EXPORT CRowIterator final
 //! The data frame is divided into slices each of which represent a number of
 //! contiguous rows. The idea is that they contain a reasonable amount of memory
 //! so that slice storage which "writes to" / "reads from" disk (a whole slice
 //! being written or read in one go) gets good locality of reference and there
 //! is minimal book keeping overhead (such as state for vector sizes, pointers
 //! to starts of memory blocks, etc).
-//! In addition, it is assumed that access to the individual slices is thread
-//! safe. If they share state the implementation must ensure that access to this
-//! is synchronized.
+//! It is possible to choose an alignment for each row in which case the address
+//! of the start of each row is 8, 16, etc byte aligned. This comes with a memory
+//! overhead as row sizes are then rounded up to the nearest multiple of the
+//! alignment size. Finally, note that it is assumed that access to the individual
+//! slices is thread safe. If they share state the implementation must ensure that
+//! access to this is synchronized.
 //!
-//! Reads and writes of a single row are also done via call backs supplied to the
+//! Reads and writes of a single row are done via call backs supplied to the
 //! readRows and writeRow functions. This is to achieve maximum decoupling from
 //! the calling code for how the underlying values are used or where they come
 //! from. It also means certain operations can be done very efficiently. For example,
 //! a stream can be attached to a row writer function to copy the values directly
-//! into the data frame storage.
+//! into the data frame storage with no marshalling costs.
 //!
-//! Read and writes to storage can optionally happen in a separate thread to the
-//! row reading and writing to deal with the case that these operations can by
-//! time consuming.
+//! Reads from and writes to storage can optionally happen in a separate thread
+//! to the row reading and writing to deal with the case that these operations
+//! can be time consuming.
 class CORE_EXPORT CDataFrame final {
 public:
     using TBoolVec = std::vector<bool>;
+    using TSizeVec = std::vector<std::size_t>;
     using TStrVec = std::vector<std::string>;
     using TStrVecVec = std::vector<TStrVec>;
     using TStrCRng = CVectorRange<const TStrVec>;
-    using TFloatVec = std::vector<CFloatStorage>;
+    using TFloatVec = std::vector<CFloatStorage, CAlignedAllocator<CFloatStorage>>;
     using TFloatVecItr = TFloatVec::iterator;
     using TInt32Vec = std::vector<std::int32_t>;
+    using TSizeAlignmentPrVec = std::vector<std::pair<std::size_t, CAlignment::EType>>;
     using TRowRef = data_frame_detail::CRowRef;
     using TRowItr = data_frame_detail::CRowIterator;
     using TRowFunc = std::function<void(TRowItr, TRowItr)>;
@@ -245,6 +253,7 @@ class CORE_EXPORT CDataFrame final {
 public:
     //! \param[in] inMainMemory True if the data frame is stored in main memory.
     //! \param[in] numberColumns The number of columns in the data frame.
+    //! \param[in] rowAlignment The alignment to use for the start of each row.
     //! \param[in] sliceCapacityInRows The capacity of a slice of the data frame
     //! as a number of rows.
     //! \param[in] readAndWriteToStoreSyncStrategy Controls whether reads and
@@ -256,6 +265,7 @@ class CORE_EXPORT CDataFrame final {
     //! the implementers responsibility to ensure these conditions are satisfied.
     CDataFrame(bool inMainMemory,
                std::size_t numberColumns,
+               CAlignment::EType rowAlignment,
                std::size_t sliceCapacityInRows,
                EReadWriteToStorage readAndWriteToStoreSyncStrategy,
                const TWriteSliceToStoreFunc& writeSliceToStore);
 
@@ -263,6 +273,7 @@ class CORE_EXPORT CDataFrame final {
     //! Overload which manages the setting of slice capacity to a sensible default.
     CDataFrame(bool inMainMemory,
                std::size_t numberColumns,
+               CAlignment::EType rowAlignment,
                EReadWriteToStorage readAndWriteToStoreSyncStrategy,
                const TWriteSliceToStoreFunc& writeSliceToStore);
 
@@ -297,6 +308,18 @@ class CORE_EXPORT CDataFrame final {
     //! \param[in] numberColumns The desired number of columns.
     void resizeColumns(std::size_t numberThreads, std::size_t numberColumns);
 
+    //! Resize to contain \p extraColumns columns.
+    //!
+    //! These are split up into blocks of columns with their required alignment.
+    //! Pads are automatically inserted for alignment and a vector of the start
+    //! position of each block of columns is returned.
+    //!
+    //!
\param[in] numberThreads The target number of threads to use. + //! \param[in] extraColumns The desired additional columns. + //! \return The index of each (block of) columns in \p extraColumns. + //! \warning This only supports alignments less than or equal the row alignment. + TSizeVec resizeColumns(std::size_t numberThreads, const TSizeAlignmentPrVec& extraColumns); + //! This reads rows using one or more readers. //! //! One reader is bound to one thread. Each thread reads a disjoint subset @@ -351,7 +374,7 @@ class CORE_EXPORT CDataFrame final { std::vector readers; readers.reserve(result.first.size()); for (auto& reader_ : result.first) { - readers.push_back(std::move(*reader_.target())); + readers.emplace_back(std::move(*reader_.target())); } return {std::move(readers), result.second}; @@ -412,7 +435,7 @@ class CORE_EXPORT CDataFrame final { std::vector writers; writers.reserve(result.first.size()); for (auto& writer_ : result.first) { - writers.push_back(std::move(*writer_.target())); + writers.emplace_back(std::move(*writer_.target())); } return {std::move(writers), result.second}; @@ -485,7 +508,8 @@ class CORE_EXPORT CDataFrame final { //! \p numberColumns columns. static std::size_t estimateMemoryUsage(bool inMainMemory, std::size_t numberRows, - std::size_t numberColumns); + std::size_t numberColumns, + CAlignment::EType alignment); //! Get the value to use for a missing element in a data frame. static constexpr double valueOfMissing() { @@ -568,6 +592,8 @@ class CORE_EXPORT CDataFrame final { std::size_t m_RowCapacity; //! The capacity of a slice of the data frame as a number of rows. std::size_t m_SliceCapacityInRows; + //! The start of row memory alignment. + core::CAlignment::EType m_RowAlignment; //! If true read and write asynchronously to storage. EReadWriteToStorage m_ReadAndWriteToStoreSyncStrategy; @@ -610,12 +636,14 @@ class CORE_EXPORT CDataFrame final { //! capacity in rows. //! \param[in] readWriteToStoreSyncStrategy Controls whether reads and writes //! from slice storage are synchronous or asynchronous. +//! \param[in] alignment The alignment to use for the start of each row. CORE_EXPORT std::pair, std::shared_ptr> makeMainStorageDataFrame(std::size_t numberColumns, boost::optional sliceCapacity = boost::none, CDataFrame::EReadWriteToStorage readWriteToStoreSyncStrategy = - CDataFrame::EReadWriteToStorage::E_Sync); + CDataFrame::EReadWriteToStorage::E_Sync, + CAlignment::EType alignment = CAlignment::E_Aligned16); //! Make a data frame which uses disk storage for its slices. //! @@ -627,6 +655,7 @@ makeMainStorageDataFrame(std::size_t numberColumns, //! capacity in rows. //! \param[in] readWriteToStoreSyncStrategy Controls whether reads and writes //! from slice storage are synchronous or asynchronous. +//! \param[in] alignment The alignment to use for the start of each row. 
CORE_EXPORT std::pair, std::shared_ptr> makeDiskStorageDataFrame(const std::string& rootDirectory, @@ -634,7 +663,8 @@ makeDiskStorageDataFrame(const std::string& rootDirectory, std::size_t numberRows, boost::optional sliceCapacity = boost::none, CDataFrame::EReadWriteToStorage readWriteToStoreSyncStrategy = - CDataFrame::EReadWriteToStorage::E_Async); + CDataFrame::EReadWriteToStorage::E_Async, + CAlignment::EType alignment = CAlignment::E_Aligned16); } } diff --git a/include/core/CDataFrameRowSlice.h b/include/core/CDataFrameRowSlice.h index c27707f82f..52ba395775 100644 --- a/include/core/CDataFrameRowSlice.h +++ b/include/core/CDataFrameRowSlice.h @@ -7,6 +7,7 @@ #ifndef INCLUDED_ml_core_CDataFrameRowSlice_h #define INCLUDED_ml_core_CDataFrameRowSlice_h +#include #include #include #include @@ -24,7 +25,7 @@ namespace data_frame_row_slice_detail { //! \brief The implementation backing a data frame row slice handle. class CORE_EXPORT CDataFrameRowSliceHandleImpl { public: - using TFloatVec = std::vector; + using TFloatVec = std::vector>; using TInt32Vec = std::vector; using TImplPtr = std::unique_ptr; @@ -42,7 +43,7 @@ class CORE_EXPORT CDataFrameRowSliceHandleImpl { //! CDataFrame storage. class CORE_EXPORT CDataFrameRowSliceHandle { public: - using TFloatVec = std::vector; + using TFloatVec = std::vector>; using TFloatVecItr = TFloatVec::iterator; using TInt32Vec = std::vector; using TInt32VecCItr = TInt32Vec::const_iterator; @@ -83,7 +84,7 @@ class CORE_EXPORT CDataFrameRowSliceHandle { //! \brief CDataFrame slice storage interface. class CORE_EXPORT CDataFrameRowSlice { public: - using TFloatVec = std::vector; + using TFloatVec = std::vector>; using TInt32Vec = std::vector; public: diff --git a/include/maths/CBoostedTreeImpl.h b/include/maths/CBoostedTreeImpl.h index 8969a743c9..276e3b3b9f 100644 --- a/include/maths/CBoostedTreeImpl.h +++ b/include/maths/CBoostedTreeImpl.h @@ -102,8 +102,8 @@ class MATHS_EXPORT CBoostedTreeImpl final { //! Get the column containing the dependent variable. std::size_t columnHoldingDependentVariable() const; - //! Get the number of columns in the original data frame. - std::size_t numberInputColumns() const; + //! Get start indices of the extra columns. + const TSizeVec& extraColumns() const; //! Get the weights to apply to each class's predicted probability when //! assigning classes. 
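To make the switch from numberInputColumns() to extraColumns() concrete, here is a sketch of how aligned start indices for blocks of extra columns can be computed. The block sizes and alignments below are illustrative assumptions (the actual layout is defined by boosted_tree_detail::extraColumns() later in this patch); only the padding arithmetic mirrors the new resizeColumns() overload.

#include <core/CAlignment.h>

#include <cstddef>
#include <iostream>
#include <utility>
#include <vector>

std::vector<std::size_t> extraColumnIndices(std::size_t numberInputColumns,
                                            std::size_t numberLossParameters) {
    using ml::core::CAlignment;
    // Hypothetical (size, alignment) per block: prediction, gradient,
    // curvature and example weight. Only the gradient block demands
    // 16 byte alignment here.
    std::vector<std::pair<std::size_t, CAlignment::EType>> blocks{
        {numberLossParameters, CAlignment::E_Unaligned},
        {numberLossParameters, CAlignment::E_Aligned16},
        {numberLossParameters * numberLossParameters, CAlignment::E_Unaligned},
        {1, CAlignment::E_Unaligned}};

    std::vector<std::size_t> result;
    std::size_t index{numberInputColumns};
    for (const auto& block : blocks) {
        // Pad forward (in float sized columns) until the block's required
        // alignment is satisfied, assuming the row itself starts aligned.
        index = CAlignment::roundup<float>(block.second, index);
        result.push_back(index);
        index += block.first;
    }
    return result;
}

int main() {
    for (auto i : extraColumnIndices(7, 3)) {
        std::cout << i << ' '; // prints "7 12 15 24": index 12 = 48 bytes,
    }                          // so the gradient block is 16 byte aligned
    std::cout << '\n';
    return 0;
}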
@@ -303,7 +303,7 @@ class MATHS_EXPORT CBoostedTreeImpl final { mutable CPRNG::CXorOShiro128Plus m_Rng; std::size_t m_NumberThreads; std::size_t m_DependentVariable = std::numeric_limits::max(); - std::size_t m_NumberInputColumns = 0; + TSizeVec m_ExtraColumns; TLossFunctionUPtr m_Loss; CBoostedTree::EClassAssignmentObjective m_ClassAssignmentObjective = CBoostedTree::E_MinimumRecall; diff --git a/include/maths/CBoostedTreeLeafNodeStatistics.h b/include/maths/CBoostedTreeLeafNodeStatistics.h index 4f5627699f..df887f9612 100644 --- a/include/maths/CBoostedTreeLeafNodeStatistics.h +++ b/include/maths/CBoostedTreeLeafNodeStatistics.h @@ -7,6 +7,7 @@ #ifndef INCLUDED_ml_maths_CBoostedTreeLeafNodeStatistics_h #define INCLUDED_ml_maths_CBoostedTreeLeafNodeStatistics_h +#include #include #include #include @@ -26,6 +27,7 @@ #include #include +#include #include #include #include @@ -56,9 +58,9 @@ class MATHS_EXPORT CBoostedTreeLeafNodeStatistics final { using TImmutableRadixSetVec = std::vector; using TPtr = std::shared_ptr; using TPtrPtrPr = std::pair; - using TMemoryMappedFloatVector = CMemoryMappedDenseVector; - using TMemoryMappedDoubleVector = CMemoryMappedDenseVector; - using TMemoryMappedDoubleMatrix = CMemoryMappedDenseMatrix; + using TMemoryMappedFloatVector = CMemoryMappedDenseVector; + using TMemoryMappedDoubleVector = CMemoryMappedDenseVector; + using TMemoryMappedDoubleMatrix = CMemoryMappedDenseMatrix; //! \brief Accumulates aggregate derivatives. class MATHS_EXPORT CDerivatives { @@ -70,10 +72,9 @@ class MATHS_EXPORT CBoostedTreeLeafNodeStatistics final { static bool dynamicSizeAlwaysZero() { return true; } public: - CDerivatives(std::size_t numberLossParameters, double* storage) - : m_Count{0}, m_Gradient{storage, static_cast(numberLossParameters)}, - m_Curvature{storage + numberLossParameters, - static_cast(numberLossParameters), + CDerivatives(std::size_t numberLossParameters, double* storageGradients, double* storageCurvatures) + : m_Gradient{storageGradients, static_cast(numberLossParameters)}, + m_Curvature{storageCurvatures, static_cast(numberLossParameters), static_cast(numberLossParameters)} {} //! Get the accumulated count. @@ -87,20 +88,16 @@ class MATHS_EXPORT CBoostedTreeLeafNodeStatistics final { return m_Curvature; } - //! Add \p count, \p gradient and \p curvature to the accumulator. - void add(std::size_t count, - const TMemoryMappedFloatVector& gradient, - const TMemoryMappedFloatVector& curvature) { + //! Add \p count and \p derivatives to the accumulator. + void add(std::size_t count, const TMemoryMappedFloatVector& derivatives) { m_Count += count; - m_Gradient += gradient; - this->curvatureTriangleView() += curvature; + this->upperTriangularFlatView() += derivatives; } //! Compute the accumulation of both collections of derivatives. void add(const CDerivatives& other) { m_Count += other.m_Count; - m_Gradient += other.m_Gradient; - m_Curvature += other.m_Curvature; + this->flatView() += const_cast(&other)->flatView(); } //! Set to the difference of \p lhs and \p rhs. @@ -134,15 +131,16 @@ class MATHS_EXPORT CBoostedTreeLeafNodeStatistics final { //! Remap the accumulated curvature to lower triangle row major format. void remapCurvature() { - // For performance, we accumulate curvatures into the first n (n + 1) / 2 - // elements of the array backing m_Curvature. However, the memory mapped - // matrix class expects them to be stored column major in the lower triangle - // of n x n matrix. This copies them backwards to their correct positions. 
- for (std::ptrdiff_t j = m_Curvature.cols() - 1, - k = m_Curvature.rows() * (m_Curvature.rows() + 1) / 2 - 1; + // For performance, we accumulate curvatures into the first n + n (n + 1) / 2 + // elements of the array backing upperTriangularFlatView. However, the memory + // mapped matrix class expects them to be stored column major in the lower + // triangle of an n x n matrix. This copies them backwards to their correct + // positions. + TMemoryMappedDoubleVector derivatives{this->upperTriangularFlatView()}; + for (std::ptrdiff_t j = m_Curvature.cols() - 1, k = derivatives.rows() - 1; j >= 0; --j) { for (std::ptrdiff_t i = m_Curvature.rows() - 1; i >= j; --i, --k) { - m_Curvature(i, j) = m_Curvature.array()(k); + m_Curvature(i, j) = derivatives(k); } } } @@ -155,8 +153,17 @@ class MATHS_EXPORT CBoostedTreeLeafNodeStatistics final { } private: - TMemoryMappedDoubleVector curvatureTriangleView() { - return {m_Curvature.data(), m_Curvature.rows() * (m_Curvature.rows() + 1) / 2}; + TMemoryMappedDoubleVector upperTriangularFlatView() { + // Gradient + upper triangle of the Hessian. + auto n = m_Gradient.rows(); + return {m_Gradient.data(), n * (n + 3) / 2}; + } + + TMemoryMappedDoubleVector flatView() { + // Gradient + pad + full Hessian. + auto n = m_Curvature.data() - m_Gradient.data() + + m_Curvature.rows() * m_Curvature.cols(); + return {m_Gradient.data(), n}; } private: @@ -166,27 +173,26 @@ class MATHS_EXPORT CBoostedTreeLeafNodeStatistics final { }; //! \brief A collection of aggregate derivatives for candidate feature splits. - class MATHS_EXPORT CPerSplitDerivatives { + class MATHS_EXPORT CSplitsDerivatives { public: using TDerivativesVec = std::vector; public: - explicit CPerSplitDerivatives(std::size_t numberLossParameters = 0) + explicit CSplitsDerivatives(std::size_t numberLossParameters = 0) : m_NumberLossParameters{numberLossParameters} {} - CPerSplitDerivatives(const TImmutableRadixSetVec& candidateSplits, - std::size_t numberLossParameters) + CSplitsDerivatives(const TImmutableRadixSetVec& candidateSplits, std::size_t numberLossParameters) : m_NumberLossParameters{numberLossParameters} { this->map(candidateSplits); } - CPerSplitDerivatives(const CPerSplitDerivatives& other) + CSplitsDerivatives(const CSplitsDerivatives& other) : m_NumberLossParameters{other.m_NumberLossParameters} { this->map(other.m_Derivatives); this->add(other); } - CPerSplitDerivatives(CPerSplitDerivatives&&) = default; + CSplitsDerivatives(CSplitsDerivatives&&) = default; - CPerSplitDerivatives& operator=(const CPerSplitDerivatives& other) = delete; - CPerSplitDerivatives& operator=(CPerSplitDerivatives&&) = default; + CSplitsDerivatives& operator=(const CSplitsDerivatives& other) = delete; + CSplitsDerivatives& operator=(CSplitsDerivatives&&) = default; //! \return The aggregate count for \p feature and \p split. std::size_t count(std::size_t feature, std::size_t split) const { @@ -227,21 +233,19 @@ class MATHS_EXPORT CBoostedTreeLeafNodeStatistics final { //! the \p split of \p feature. void addDerivatives(std::size_t feature, std::size_t split, - const TMemoryMappedFloatVector& gradient, - const TMemoryMappedFloatVector& curvature) { - m_Derivatives[feature][split].add(1, gradient, curvature); + const TMemoryMappedFloatVector& derivatives) { + m_Derivatives[feature][split].add(1, derivatives); } //! Add \p gradient and \p curvature to the accumulated derivatives for //! missing values of \p feature. 
void addMissingDerivatives(std::size_t feature, - const TMemoryMappedFloatVector& gradient, - const TMemoryMappedFloatVector& curvature) { - m_MissingDerivatives[feature].add(1, gradient, curvature); + const TMemoryMappedFloatVector& derivatives) { + m_MissingDerivatives[feature].add(1, derivatives); } //! Compute the accumulation of both collections of per split derivatives. - void add(const CPerSplitDerivatives& other) { + void add(const CSplitsDerivatives& other) { for (std::size_t i = 0; i < other.m_Derivatives.size(); ++i) { for (std::size_t j = 0; j < other.m_Derivatives[i].size(); ++j) { m_Derivatives[i][j].add(other.m_Derivatives[i][j]); @@ -251,7 +255,7 @@ class MATHS_EXPORT CBoostedTreeLeafNodeStatistics final { } //! Subtract \p rhs. - void subtract(const CPerSplitDerivatives& rhs) { + void subtract(const CSplitsDerivatives& rhs) { for (std::size_t i = 0; i < m_Derivatives.size(); ++i) { for (std::size_t j = 0; j < m_Derivatives[i].size(); ++j) { m_Derivatives[i][j].subtract(rhs.m_Derivatives[i][j]); @@ -287,7 +291,7 @@ class MATHS_EXPORT CBoostedTreeLeafNodeStatistics final { sizeof(CDerivatives)}; std::size_t storageSize{numberFeatures * (numberSplitsPerFeature + 1) * numberLossParameters * (numberLossParameters + 1) * sizeof(double)}; - return sizeof(CPerSplitDerivatives) + derivativesSize + storageSize; + return sizeof(CSplitsDerivatives) + derivativesSize + storageSize; } //! Get a checksum of this object. @@ -300,6 +304,7 @@ class MATHS_EXPORT CBoostedTreeLeafNodeStatistics final { private: using TDerivativesVecVec = std::vector; + using TAlignedDoubleVec = std::vector>; private: static std::size_t number(const TDerivativesVec& derivatives) { @@ -318,42 +323,62 @@ class MATHS_EXPORT CBoostedTreeLeafNodeStatistics final { // | | | | // V V V V // | n | n^2 | ... | n | n^2 | + // + // Note we ensure 16 byte alignment because we're using aligned memory + // mapped vectors which have much better performance. 
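        // For illustration (editorial example, hypothetical numbers): with
        // numberLossParameters = 3 the gradient needs 3 doubles and the full
        // Hessian 9, which round up to 4 and 10 respectively. Each split's
        // derivatives then occupy | g g g pad | h h h h h h h h h pad | =
        // 14 doubles = 112 bytes, a multiple of 16, so every block in
        // m_Storage starts 16 byte aligned.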
+ std::size_t numberFeatures{splits.size()}; std::size_t totalNumberSplits{ std::accumulate(splits.begin(), splits.end(), std::size_t{0}, [](std::size_t size, const auto& featureSplits) { return size + number(featureSplits); })}; - int numberGradients{static_cast(m_NumberLossParameters)}; - int numberCurvatures{numberGradients * numberGradients}; - int numberDerivatives{numberGradients + numberCurvatures}; + std::size_t numberGradients{this->numberGradients()}; + std::size_t numberDerivatives{this->numberDerivatives()}; - m_Derivatives.resize(splits.size()); - m_MissingDerivatives.reserve(splits.size()); - m_Storage.resize((totalNumberSplits + splits.size()) * numberDerivatives, 0.0); + m_Derivatives.resize(numberFeatures); + m_MissingDerivatives.reserve(numberFeatures); + + m_Storage.resize((totalNumberSplits + numberFeatures) * numberDerivatives, 0.0); double* storage{&m_Storage[0]}; - for (std::size_t i = 0; i < splits.size(); ++i, storage += numberDerivatives) { + for (std::size_t i = 0; i < numberFeatures; ++i, storage += numberDerivatives) { std::size_t size{number(splits[i])}; m_Derivatives[i].reserve(size); for (std::size_t j = 0; j < size; ++j, storage += numberDerivatives) { - m_Derivatives[i].emplace_back(m_NumberLossParameters, storage); + m_Derivatives[i].emplace_back(m_NumberLossParameters, storage, + storage + numberGradients); } - m_MissingDerivatives.emplace_back(m_NumberLossParameters, storage); + m_MissingDerivatives.emplace_back(m_NumberLossParameters, storage, + storage + numberGradients); } } + std::size_t numberDerivatives() const { + return this->numberGradients() + this->numberCurvatures(); + } + + std::size_t numberGradients() const { + return core::CAlignment::roundup(core::CAlignment::E_Aligned16, + m_NumberLossParameters); + } + + std::size_t numberCurvatures() const { + return core::CAlignment::roundup( + core::CAlignment::E_Aligned16, m_NumberLossParameters * m_NumberLossParameters); + } + private: std::size_t m_NumberLossParameters = 0; TDerivativesVecVec m_Derivatives; TDerivativesVec m_MissingDerivatives; - TDoubleVec m_Storage; + TAlignedDoubleVec m_Storage; }; public: CBoostedTreeLeafNodeStatistics(std::size_t id, - std::size_t numberInputColumns, + const TSizeVec& extraColumns, std::size_t numberLossParameters, std::size_t numberThreads, const core::CDataFrame& frame, @@ -366,7 +391,7 @@ class MATHS_EXPORT CBoostedTreeLeafNodeStatistics final { //! Only called by split but is public so it's accessible to std::make_shared. CBoostedTreeLeafNodeStatistics(std::size_t id, - std::size_t numberInputColumns, + const TSizeVec& extraColumns, std::size_t numberLossParameters, std::size_t numberThreads, const core::CDataFrame& frame, @@ -442,6 +467,8 @@ class MATHS_EXPORT CBoostedTreeLeafNodeStatistics final { std::size_t numberLossParameters); private: + using TSizeVecCRef = std::reference_wrapper; + //! \brief Statistics relating to a split of the node. 
    struct MATHS_EXPORT SSplitStatistics
        : private boost::less_than_comparable<SSplitStatistics> {
@@ -490,18 +517,18 @@ class MATHS_EXPORT CBoostedTreeLeafNodeStatistics final {
                        const CBoostedTreeNode& split,
                        const core::CPackedBitVector& parentRowMask);
 
     void addRowDerivatives(const CEncodedDataFrameRowRef& row,
-                           CPerSplitDerivatives& splitDerivatives) const;
+                           CSplitsDerivatives& splitsDerivatives) const;
 
     SSplitStatistics computeBestSplitStatistics(const TRegularization& regularization,
                                                 const TSizeVec& featureBag) const;
 
 private:
     std::size_t m_Id;
     std::size_t m_Depth;
-    std::size_t m_NumberInputColumns;
+    TSizeVecCRef m_ExtraColumns;
     std::size_t m_NumberLossParameters;
     const TImmutableRadixSetVec& m_CandidateSplits;
     core::CPackedBitVector m_RowMask;
-    CPerSplitDerivatives m_Derivatives;
+    CSplitsDerivatives m_Derivatives;
     SSplitStatistics m_BestSplit;
 };
 }

diff --git a/include/maths/CBoostedTreeUtils.h b/include/maths/CBoostedTreeUtils.h
index 700ee07019..d447a0754e 100644
--- a/include/maths/CBoostedTreeUtils.h
+++ b/include/maths/CBoostedTreeUtils.h
@@ -22,83 +22,85 @@ namespace boosted_tree {
 class CLoss;
 }
 
 namespace boosted_tree_detail {
+using TSizeVec = std::vector<std::size_t>;
 using TRowRef = core::CDataFrame::TRowRef;
 using TMemoryMappedFloatVector = CMemoryMappedDenseVector<CFloatStorage>;
+using TSizeAlignmentPrVec = std::vector<std::pair<std::size_t, core::CAlignment::EType>>;
+using TAlignedMemoryMappedFloatVector =
+    CMemoryMappedDenseVector<CFloatStorage, Eigen::Aligned16>;
 
-inline std::size_t lossHessianStoredSize(std::size_t numberLossParameters) {
+//! Get the size of the upper triangle of the loss Hessian.
+inline std::size_t lossHessianUpperTriangleSize(std::size_t numberLossParameters) {
     return numberLossParameters * (numberLossParameters + 1) / 2;
 }
 
-inline std::size_t predictionColumn(std::size_t numberInputColumns) {
-    return numberInputColumns;
-}
-
-inline std::size_t lossGradientColumn(std::size_t numberInputColumns,
-                                      std::size_t numberLossParameters) {
-    return predictionColumn(numberInputColumns) + numberLossParameters;
-}
-
-inline std::size_t lossCurvatureColumn(std::size_t numberInputColumns,
-                                       std::size_t numberLossParameters) {
-    return lossGradientColumn(numberInputColumns, numberLossParameters) + numberLossParameters;
-}
-
-inline std::size_t exampleWeightColumn(std::size_t numberInputColumns,
-                                       std::size_t numberLossParameters) {
-    return lossCurvatureColumn(numberInputColumns, numberLossParameters) +
-           lossHessianStoredSize(numberLossParameters);
-}
+//! Get the extra columns needed by training.
+MATHS_EXPORT
+TSizeAlignmentPrVec extraColumns(std::size_t numberLossParameters);
 
 //! Read the prediction from \p row.
 MATHS_EXPORT
 TMemoryMappedFloatVector readPrediction(const TRowRef& row,
-                                        std::size_t numberInputColumns,
-                                        std::size_t numberLossParamaters);
+                                        const TSizeVec& extraColumns,
+                                        std::size_t numberLossParameters);
 
 //! Zero the prediction of \p row.
 MATHS_EXPORT
-void zeroPrediction(const TRowRef& row, std::size_t numberInputColumns, std::size_t numberLossParamaters);
+void zeroPrediction(const TRowRef& row, const TSizeVec& extraColumns, std::size_t numberLossParameters);
+
+//! Read all the loss derivatives from \p row into an aligned vector.
+MATHS_EXPORT
+TAlignedMemoryMappedFloatVector readLossDerivatives(const TRowRef& row,
+                                                    const TSizeVec& extraColumns,
+                                                    std::size_t numberLossParameters);
 
 //! Read the loss gradient from \p row.
 MATHS_EXPORT
 TMemoryMappedFloatVector readLossGradient(const TRowRef& row,
-                                          std::size_t numberInputColumns,
+                                          const TSizeVec& extraColumns,
                                           std::size_t numberLossParameters);
 
 //! Zero the loss gradient of \p row.
MATHS_EXPORT -void zeroLossGradient(const TRowRef& row, std::size_t numberInputColumns, std::size_t numberLossParameters); +void zeroLossGradient(const TRowRef& row, const TSizeVec& extraColumns, std::size_t numberLossParameters); //! Write the loss gradient to \p row. MATHS_EXPORT void writeLossGradient(const TRowRef& row, - std::size_t numberInputColumns, + const TSizeVec& extraColumns, const boosted_tree::CLoss& loss, const TMemoryMappedFloatVector& prediction, double actual, double weight = 1.0); +//! Read the loss flat column major Hessian from \p row. MATHS_EXPORT TMemoryMappedFloatVector readLossCurvature(const TRowRef& row, - std::size_t numberInputColumns, + const TSizeVec& extraColumns, std::size_t numberLossParameters); +//! Zero the loss Hessian of \p row. MATHS_EXPORT -void zeroLossCurvature(const TRowRef& row, std::size_t numberInputColumns, std::size_t numberLossParameters); +void zeroLossCurvature(const TRowRef& row, const TSizeVec& extraColumns, std::size_t numberLossParameters); +//! Write the loss Hessian to \p row. MATHS_EXPORT void writeLossCurvature(const TRowRef& row, - std::size_t numberInputColumns, + const TSizeVec& extraColumns, const boosted_tree::CLoss& curvature, const TMemoryMappedFloatVector& prediction, double actual, double weight = 1.0); +//! Read the example weight from \p row. +MATHS_EXPORT +double readExampleWeight(const TRowRef& row, const TSizeVec& extraColumns); + +//! Write the example weight to \p row . MATHS_EXPORT -double readExampleWeight(const TRowRef& row, - std::size_t numberInputColumns, - std::size_t numberLossParameters); +void writeExampleWeight(const TRowRef& row, const TSizeVec& extraColumns, double weight); +//! Read the actual value for the target from \p row. MATHS_EXPORT double readActual(const TRowRef& row, std::size_t dependentVariable); diff --git a/include/maths/CDataFrameUtils.h b/include/maths/CDataFrameUtils.h index ab822d34c1..581cd74f30 100644 --- a/include/maths/CDataFrameUtils.h +++ b/include/maths/CDataFrameUtils.h @@ -35,9 +35,10 @@ struct SRowTo { static_assert(sizeof(T) < 0, "Vector type not supported"); }; -template -struct SRowTo> { - static CMemoryMappedDenseVector dispatch(const core::CDataFrame::TRowRef& row) { +template +struct SRowTo> { + static CMemoryMappedDenseVector + dispatch(const core::CDataFrame::TRowRef& row) { return {row.data(), static_cast(row.numberColumns())}; } }; diff --git a/include/maths/CInformationCriteria.h b/include/maths/CInformationCriteria.h index 1881b0adbe..c000eb62f5 100644 --- a/include/maths/CInformationCriteria.h +++ b/include/maths/CInformationCriteria.h @@ -95,7 +95,7 @@ class CSphericalGaussianInfoCriterion { public: using TPointVec = std::vector; using TPointVecVec = std::vector; - using TBarePoint = typename SStripped::Type; + using TBarePoint = typename SUnannotated::Type; using TBarePointPrecise = typename SFloatingPoint::Type; using TCoordinate = typename SCoordinate::Type; using TMeanVarAccumulator = @@ -194,7 +194,7 @@ class CGaussianInfoCriterion { public: using TPointVec = std::vector; using TPointVecVec = std::vector; - using TBarePoint = typename SStripped::Type; + using TBarePoint = typename SUnannotated::Type; using TBarePointPrecise = typename SFloatingPoint::Type; using TCoordinate = typename SCoordinate::Type; using TCovariances = CBasicStatistics::SSampleCovariances; diff --git a/include/maths/CKMeans.h b/include/maths/CKMeans.h index 22a1e38f82..6352295172 100644 --- a/include/maths/CKMeans.h +++ b/include/maths/CKMeans.h @@ -217,7 +217,7 @@ class CKMeans { 
protected: using TCoordinate = typename SCoordinate::Type; - using TBarePoint = typename SStripped::Type; + using TBarePoint = typename SUnannotated::Type; using TBarePointPrecise = typename SFloatingPoint::Type; using TMeanAccumulator = typename CBasicStatistics::SSampleMean::TAccumulator; diff --git a/include/maths/CLinearAlgebraEigen.h b/include/maths/CLinearAlgebraEigen.h index 79732967e3..7a630769bd 100644 --- a/include/maths/CLinearAlgebraEigen.h +++ b/include/maths/CLinearAlgebraEigen.h @@ -362,11 +362,11 @@ struct SConstant> { //! of CMemoryMappedDenseVector. //! //! \sa CMemoryMappedDenseVector for more information. -template +template class CMemoryMappedDenseMatrix - : public Eigen::Map::TBase> { + : public Eigen::Map::TBase, ALIGNMENT> { public: - using TBase = Eigen::Map::TBase>; + using TBase = Eigen::Map::TBase, ALIGNMENT>; //! See core::CMemory. static bool dynamicSizeAlwaysZero() { return true; } @@ -426,15 +426,16 @@ class CMemoryMappedDenseMatrix }; //! Free efficient efficient swap for ADLU. -template -void swap(CMemoryMappedDenseMatrix& lhs, CMemoryMappedDenseMatrix& rhs) { +template +void swap(CMemoryMappedDenseMatrix& lhs, + CMemoryMappedDenseMatrix& rhs) { lhs.swap(rhs); } //! \brief Gets a constant square dense matrix with specified dimension or with //! specified numbers of rows and columns. -template -struct SConstant> { +template +struct SConstant> { static auto get(std::ptrdiff_t dimension, SCALAR constant) -> decltype(SConstant>::get(dimension, 1)) { return SConstant>::get(dimension, constant); @@ -476,12 +477,12 @@ struct SConstant> { //! This better fits our needs with data frames where we want to reference the //! memory stored in the data frame rows, but never modify it directly through //! this vector type. -template +template class CMemoryMappedDenseVector - : public Eigen::Map::TBase> { + : public Eigen::Map::TBase, ALIGNMENT> { public: using TDenseVector = CDenseVector; - using TBase = Eigen::Map; + using TBase = Eigen::Map; //! See core::CMemory. static bool dynamicSizeAlwaysZero() { return true; } @@ -545,14 +546,15 @@ class CMemoryMappedDenseVector }; //! Free efficient efficient swap for ADLU. -template -void swap(CMemoryMappedDenseVector& lhs, CMemoryMappedDenseVector& rhs) { +template +void swap(CMemoryMappedDenseVector& lhs, + CMemoryMappedDenseVector& rhs) { lhs.swap(rhs); } //! \brief Gets a constant dense vector with specified dimension. -template -struct SConstant> { +template +struct SConstant> { static auto get(std::ptrdiff_t dimension, SCALAR constant) -> decltype(SConstant>::get(dimension, constant)) { return SConstant>::get(dimension, constant); diff --git a/include/maths/CLinearAlgebraFwd.h b/include/maths/CLinearAlgebraFwd.h index 805fa84892..594f723125 100644 --- a/include/maths/CLinearAlgebraFwd.h +++ b/include/maths/CLinearAlgebraFwd.h @@ -72,9 +72,9 @@ template class CDenseVectorInitializer; template class CDenseMatrixInitializer; -template +template class CMemoryMappedDenseVector; -template +template class CMemoryMappedDenseMatrix; } } diff --git a/include/maths/CLinearAlgebraShims.h b/include/maths/CLinearAlgebraShims.h index 6a1bd0549d..aef41801a0 100644 --- a/include/maths/CLinearAlgebraShims.h +++ b/include/maths/CLinearAlgebraShims.h @@ -31,8 +31,8 @@ std::size_t dimension(const CDenseVector& x) { } //! Get the dimension of an Eigen memory mapped vector. 
-template -std::size_t dimension(const CMemoryMappedDenseVector& x) { +template +std::size_t dimension(const CMemoryMappedDenseVector& x) { return static_cast(x.size()); } @@ -75,9 +75,10 @@ CDenseMatrix conformableZeroMatrix(const CDenseVector& x) { } //! Get the conformable zero initialized matrix for the Eigen memory mapped vector. -template -CDenseMatrix conformableZeroMatrix(const CMemoryMappedDenseVector& x) { - return SConstant>::get(dimension(x), 0); +template +CDenseMatrix +conformableZeroMatrix(const CMemoryMappedDenseVector& x) { + return SConstant>::get(dimension(x), 0); } //! Get the conformable zero initialized matrix for the underlying vector. @@ -129,41 +130,41 @@ void max(const VECTOR& x, VECTOR& y) { //! Expose componentwise operations for our internal vectors. template -typename SArrayView::Type componentwise(VECTOR& x) { +VECTOR& componentwise(VECTOR& x) { return x; } //! Expose componentwise operations for Eigen dense vectors. template -typename SArrayView>::Type -componentwise(const CDenseVector& x) { +auto componentwise(const CDenseVector& x) -> decltype(x.array()) { return x.array(); } template -typename SArrayView>::Type componentwise(CDenseVector& x) { +auto componentwise(CDenseVector& x) -> decltype(x.array()) { return x.array(); } //! Expose componentwise operations for Eigen memory mapped vectors. -template -typename SArrayView>::Type -componentwise(const CMemoryMappedDenseVector& x) { +template +auto componentwise(const CMemoryMappedDenseVector& x) + -> decltype(x.array()) { return x.array(); } -template -typename SArrayView>::Type -componentwise(CMemoryMappedDenseVector& x) { +template +auto componentwise(CMemoryMappedDenseVector& x) + -> decltype(x.array()) { return x.array(); } //! Expose componentwise operations for our annotated vectors. template -typename SArrayView::Type -componentwise(const CAnnotatedVector& x) { +auto componentwise(const CAnnotatedVector& x) + -> decltype(componentwise(static_cast(x))) { return componentwise(static_cast(x)); } template -typename SArrayView::Type& componentwise(CAnnotatedVector& x) { +auto componentwise(CAnnotatedVector& x) + -> decltype(componentwise(static_cast(x))) { return componentwise(static_cast(x)); } @@ -186,9 +187,9 @@ SCALAR distance(const CDenseVector& x, const CDenseVector& y) { } //! Euclidean distance implementation for an Eigen memory mapped vector. -template -SCALAR distance(const CMemoryMappedDenseVector& x, - const CMemoryMappedDenseVector& y) { +template +SCALAR distance(const CMemoryMappedDenseVector& x, + const CMemoryMappedDenseVector& y) { return (y - x).norm(); } @@ -213,8 +214,8 @@ SCALAR norm(const CDenseVector& x) { } //! Get the Euclidean norm of an Eigen memory mapped vector. -template -SCALAR norm(const CMemoryMappedDenseVector& x) { +template +SCALAR norm(const CMemoryMappedDenseVector& x) { return x.norm(); } @@ -237,8 +238,8 @@ SCALAR L1(const CDenseVector& x) { } //! Get the Manhattan norm of an Eigen memory mapped vector. -template -SCALAR L1(const CMemoryMappedDenseVector& x) { +template +SCALAR L1(const CMemoryMappedDenseVector& x) { return x.template lpNorm<1>(); } @@ -261,8 +262,8 @@ SCALAR frobenius(const CDenseMatrix& x) { } //! Get the Euclidean norm of an Eigen memory mapped matrix. -template -SCALAR frobenius(const CMemoryMappedDenseMatrix& x) { +template +SCALAR frobenius(const CMemoryMappedDenseMatrix& x) { return x.norm(); } @@ -279,19 +280,21 @@ SCALAR inner(const CDenseVector& x, const CDenseVector& y) { } //! Get the inner product of two Eigen memory mapped vectors. 
-template -SCALAR inner(const CMemoryMappedDenseVector& x, - const CMemoryMappedDenseVector& y) { +template +SCALAR inner(const CMemoryMappedDenseVector& x, + const CMemoryMappedDenseVector& y) { return x.dot(y); } //! Get the inner product of Eigen dense and memory mapped vectors. -template -SCALAR inner(const CDenseVector& x, const CMemoryMappedDenseVector& y) { +template +SCALAR inner(const CDenseVector& x, + const CMemoryMappedDenseVector& y) { return x.dot(y); } //! Get the inner product of Eigen dense and memory mapped vectors. -template -SCALAR inner(const CMemoryMappedDenseVector& x, const CDenseVector& y) { +template +SCALAR inner(const CMemoryMappedDenseVector& x, + const CDenseVector& y) { return x.dot(y); } @@ -327,8 +330,8 @@ CDenseMatrix outer(const CDenseVector& x) { } //! Get the outer product of an Eigen memory mapped vector. -template -CDenseMatrix outer(const CMemoryMappedDenseVector& x) { +template +CDenseMatrix outer(const CMemoryMappedDenseVector& x) { return outer(CDenseVector(x)); } diff --git a/include/maths/CTypeTraits.h b/include/maths/CTypeTraits.h index f9be718125..50fa05a2cd 100644 --- a/include/maths/CTypeTraits.h +++ b/include/maths/CTypeTraits.h @@ -70,14 +70,14 @@ struct SPromoted> { }; //! \brief Defines the promoted type for an Eigen memory mapped matrix. -template -struct SPromoted> { +template +struct SPromoted> { using Type = CDenseMatrix::Type>; }; //! \brief Defines the promoted type for an Eigen memory mapped vector. -template -struct SPromoted> { +template +struct SPromoted> { using Type = CDenseVector::Type>; }; @@ -142,14 +142,14 @@ struct SFloatingPoint, U> { }; //! \brief Defines an Eigen dense matrix on a suitable floating point type. -template -struct SFloatingPoint, U> { +template +struct SFloatingPoint, U> { using Type = CDenseMatrix::Type>; }; //! \brief Defines an Eigen dense vector on a suitable floating point type. -template -struct SFloatingPoint, U> { +template +struct SFloatingPoint, U> { using Type = CDenseVector::Type>; }; @@ -214,14 +214,14 @@ struct SCoordinate> { }; //! \brief Extracts the coordinate type for an Eigen memory mapped matrix. -template -struct SCoordinate> { +template +struct SCoordinate> { using Type = SCALAR; }; //! \brief Extracts the coordinate type for an Eigen memory mapped vector. -template -struct SCoordinate> { +template +struct SCoordinate> { using Type = SCALAR; }; @@ -268,9 +268,9 @@ struct SConformableMatrix> { }; //! \brief Extracts the conformable matrix type for an Eigen memory mapped vector. -template -struct SConformableMatrix> { - using Type = CMemoryMappedDenseMatrix; +template +struct SConformableMatrix> { + using Type = CMemoryMappedDenseMatrix; }; //! \brief Extracts the conformable matrix type for an Eigen sparse vector. @@ -285,68 +285,6 @@ struct SConformableMatrix> { using Type = typename SConformableMatrix::Type; }; -//! \brief Defines the array view for componentwise operations on our internal -//! vectors and matrices. -template -struct SArrayView { - using Type = VECTOR&; -}; - -//! \brief Defines the array view for componentwise operations on a Eigen dense matrix. -template -struct SArrayView> { - using Type = - Eigen::ArrayWrapper>; -}; -template -struct SArrayView> { - using Type = - Eigen::ArrayWrapper>; -}; - -//! \brief Defines the array view for componentwise operations on an Eigen dense matrix. -template -struct SArrayView> { - using Type = - Eigen::ArrayWrapper>; -}; -template -struct SArrayView> { - using Type = - Eigen::ArrayWrapper>; -}; - -//! 
\brief Defines the array view for componentwise operations on an Eigen memory mapped matrix. -template -struct SArrayView> { - using Type = Eigen::ArrayWrapper< - const Eigen::Map, 0, Eigen::Stride<0, 0>>>; -}; -template -struct SArrayView> { - using Type = Eigen::ArrayWrapper< - Eigen::Map, 0, Eigen::Stride<0, 0>>>; -}; - -//! \brief Defines the array view for componentwise operations on an Eigen memory mapped vector. -template -struct SArrayView> { - using Type = Eigen::ArrayWrapper< - const Eigen::Map, 0, Eigen::Stride<0, 0>>>; -}; -template -struct SArrayView> { - using Type = Eigen::ArrayWrapper< - Eigen::Map, 0, Eigen::Stride<0, 0>>>; -}; - -//! \brief Defines the array view for componentwise operations on Eigen dense -//! vectors and matrices. -template -struct SArrayView> { - using Type = typename SArrayView::Type; -}; - //! \brief Defines the type of a singular value decomposition of a matrix. template struct SJacobiSvd { @@ -372,14 +310,14 @@ struct SJacobiSvd //! \brief Defines a type which strips off any annotation from a vector. //! This is the raw vector type by default. template -struct SStripped { +struct SUnannotated { using Type = VECTOR; }; //! \brief Specialisation for annotated vectors. This is the underlying //! vector type. template -struct SStripped> { +struct SUnannotated> { using Type = VECTOR; }; } diff --git a/lib/api/CDataFrameAnalysisRunner.cc b/lib/api/CDataFrameAnalysisRunner.cc index c1ba307ca7..dc3d15d0a7 100644 --- a/lib/api/CDataFrameAnalysisRunner.cc +++ b/lib/api/CDataFrameAnalysisRunner.cc @@ -163,7 +163,7 @@ std::size_t CDataFrameAnalysisRunner::estimateMemoryUsage(std::size_t totalNumbe std::size_t numberColumns) const { return core::CDataFrame::estimateMemoryUsage( this->storeDataFrameInMainMemory(), totalNumberRows, - numberColumns + this->numberExtraColumns()) + + numberColumns + this->numberExtraColumns(), core::CAlignment::E_Aligned16) + this->estimateBookkeepingMemoryUsage(m_NumberPartitions, totalNumberRows, partitionNumberRows, numberColumns); } diff --git a/lib/api/CDataFrameTrainBoostedTreeClassifierRunner.cc b/lib/api/CDataFrameTrainBoostedTreeClassifierRunner.cc index c69c2352bc..1b580eadc3 100644 --- a/lib/api/CDataFrameTrainBoostedTreeClassifierRunner.cc +++ b/lib/api/CDataFrameTrainBoostedTreeClassifierRunner.cc @@ -165,17 +165,17 @@ void CDataFrameTrainBoostedTreeClassifierRunner::writeOneRow( featureImportance->shap( row, [&writer, &classValues]( const maths::CTreeShapFeatureImportance::TSizeVec& indices, - const TStrVec& names, + const TStrVec& featureNames, const maths::CTreeShapFeatureImportance::TVectorVec& shap) { - writer.Key(CDataFrameTrainBoostedTreeRunner::FEATURE_IMPORTANCE_FIELD_NAME); + writer.Key(FEATURE_IMPORTANCE_FIELD_NAME); writer.StartArray(); for (auto i : indices) { if (shap[i].norm() != 0.0) { writer.StartObject(); - writer.Key(CDataFrameTrainBoostedTreeRunner::FEATURE_NAME_FIELD_NAME); - writer.String(names[i]); + writer.Key(FEATURE_NAME_FIELD_NAME); + writer.String(featureNames[i]); if (shap[i].size() == 1) { - writer.Key(CDataFrameTrainBoostedTreeRunner::IMPORTANCE_FIELD_NAME); + writer.Key(IMPORTANCE_FIELD_NAME); writer.Double(shap[i](0)); } else { for (int j = 0; j < shap[i].size(); ++j) { diff --git a/lib/api/CDataFrameTrainBoostedTreeRegressionRunner.cc b/lib/api/CDataFrameTrainBoostedTreeRegressionRunner.cc index 2a146fca6e..92fddd3cc3 100644 --- a/lib/api/CDataFrameTrainBoostedTreeRegressionRunner.cc +++ b/lib/api/CDataFrameTrainBoostedTreeRegressionRunner.cc @@ -96,16 +96,16 @@ void 
CDataFrameTrainBoostedTreeRegressionRunner::writeOneRow( if (featureImportance != nullptr) { featureImportance->shap( row, [&writer](const maths::CTreeShapFeatureImportance::TSizeVec& indices, - const TStrVec& names, + const TStrVec& featureNames, const maths::CTreeShapFeatureImportance::TVectorVec& shap) { - writer.Key(CDataFrameTrainBoostedTreeRunner::FEATURE_IMPORTANCE_FIELD_NAME); + writer.Key(FEATURE_IMPORTANCE_FIELD_NAME); writer.StartArray(); for (auto i : indices) { if (shap[i].norm() != 0.0) { writer.StartObject(); - writer.Key(CDataFrameTrainBoostedTreeRunner::FEATURE_NAME_FIELD_NAME); - writer.String(names[i]); - writer.Key(CDataFrameTrainBoostedTreeRunner::IMPORTANCE_FIELD_NAME); + writer.Key(FEATURE_NAME_FIELD_NAME); + writer.String(featureNames[i]); + writer.Key(IMPORTANCE_FIELD_NAME); writer.Double(shap[i](0)); writer.EndObject(); } diff --git a/lib/api/unittest/CDataFrameAnalysisRunnerTest.cc b/lib/api/unittest/CDataFrameAnalysisRunnerTest.cc index 7cfb5ababa..f6ab101fbb 100644 --- a/lib/api/unittest/CDataFrameAnalysisRunnerTest.cc +++ b/lib/api/unittest/CDataFrameAnalysisRunnerTest.cc @@ -197,7 +197,7 @@ BOOST_AUTO_TEST_CASE(testEstimateMemoryUsageFor10000Rows) { } BOOST_AUTO_TEST_CASE(testEstimateMemoryUsageFor100000Rows) { - testEstimateMemoryUsage(100000, "40mb", "9mb", 0); + testEstimateMemoryUsage(100000, "41mb", "10mb", 0); } BOOST_AUTO_TEST_CASE(testEstimateMemoryUsageFor10000000Rows) { diff --git a/lib/core/CDataFrame.cc b/lib/core/CDataFrame.cc index c65baba8ab..20bd435610 100644 --- a/lib/core/CDataFrame.cc +++ b/lib/core/CDataFrame.cc @@ -119,11 +119,13 @@ std::size_t computeSliceCapacity(std::size_t numberColumns) { CDataFrame::CDataFrame(bool inMainMemory, std::size_t numberColumns, + CAlignment::EType rowAlignment, std::size_t sliceCapacityInRows, EReadWriteToStorage readAndWriteToStoreSyncStrategy, const TWriteSliceToStoreFunc& writeSliceToStore) : m_InMainMemory{inMainMemory}, m_NumberColumns{numberColumns}, - m_RowCapacity{numberColumns}, m_SliceCapacityInRows{sliceCapacityInRows}, + m_RowCapacity{CAlignment::roundup(rowAlignment, numberColumns)}, + m_SliceCapacityInRows{sliceCapacityInRows}, m_RowAlignment{rowAlignment}, m_ReadAndWriteToStoreSyncStrategy{readAndWriteToStoreSyncStrategy}, m_WriteSliceToStore{writeSliceToStore}, m_ColumnNames(numberColumns), m_CategoricalColumnValues(numberColumns), m_MissingString{DEFAULT_MISSING_STRING}, @@ -132,10 +134,15 @@ CDataFrame::CDataFrame(bool inMainMemory, CDataFrame::CDataFrame(bool inMainMemory, std::size_t numberColumns, + CAlignment::EType rowAlignment, EReadWriteToStorage readAndWriteToStoreSyncStrategy, const TWriteSliceToStoreFunc& writeSliceToStore) - : CDataFrame{inMainMemory, numberColumns, computeSliceCapacity(numberColumns), - readAndWriteToStoreSyncStrategy, writeSliceToStore} { + : CDataFrame{inMainMemory, + numberColumns, + rowAlignment, + computeSliceCapacity(numberColumns), + readAndWriteToStoreSyncStrategy, + writeSliceToStore} { } CDataFrame::~CDataFrame() = default; @@ -153,15 +160,20 @@ std::size_t CDataFrame::numberColumns() const { } void CDataFrame::reserve(std::size_t numberThreads, std::size_t rowCapacity) { + + rowCapacity = CAlignment::roundup(m_RowAlignment, rowCapacity); + if (m_RowCapacity >= rowCapacity) { return; } + std::size_t oldRowCapacity{m_RowCapacity}; m_RowCapacity = rowCapacity; - parallel_for_each(numberThreads, m_Slices.begin(), m_Slices.end(), [this](TRowSlicePtr& slice) { - slice->reserve(m_NumberColumns, m_RowCapacity - m_NumberColumns); - }); + 
parallel_for_each(numberThreads, m_Slices.begin(), m_Slices.end(), + [oldRowCapacity, this](TRowSlicePtr& slice) { + slice->reserve(oldRowCapacity, m_RowCapacity - oldRowCapacity); + }); } void CDataFrame::resizeColumns(std::size_t numberThreads, std::size_t numberColumns) { @@ -172,6 +184,26 @@ void CDataFrame::resizeColumns(std::size_t numberThreads, std::size_t numberColu m_NumberColumns = numberColumns; } +CDataFrame::TSizeVec CDataFrame::resizeColumns(std::size_t numberThreads, + const TSizeAlignmentPrVec& extraColumns) { + TSizeVec result; + result.reserve(extraColumns.size()); + std::size_t index{m_NumberColumns}; + for (const auto& columns : extraColumns) { + std::size_t count; + CAlignment::EType alignment; + std::tie(count, alignment) = columns; + if (CAlignment::less(m_RowAlignment, alignment)) { + HANDLE_FATAL(<< "Unsupported column alignment " << CAlignment::print(alignment)); + } + index = CAlignment::roundup(alignment, index); + result.push_back(index); + index += count; + } + this->resizeColumns(numberThreads, index); + return result; +} + CDataFrame::TRowFuncVecBoolPr CDataFrame::readRows(std::size_t numberThreads, std::size_t beginRows, std::size_t endRows, @@ -386,8 +418,11 @@ std::uint64_t CDataFrame::checksum() const { std::size_t CDataFrame::estimateMemoryUsage(bool inMainMemory, std::size_t numberRows, - std::size_t numberColumns) { - return inMainMemory ? numberRows * numberColumns * sizeof(float) : 0; + std::size_t numberColumns, + CAlignment::EType alignment) { + return inMainMemory + ? numberRows * CAlignment::roundupSizeof(alignment, numberColumns) + : 0; } CDataFrame::TRowFuncVecBoolPr @@ -450,13 +485,13 @@ CDataFrame::parallelApplyToAllRows(std::size_t numberThreads, }, std::move(func))); - TRowFuncVec functions; - functions.reserve(results.size()); + TRowFuncVec funcs; + funcs.reserve(results.size()); for (auto& result : results) { - functions.emplace_back(std::move(result.s_FunctionState)); + funcs.emplace_back(std::move(result.s_FunctionState)); } - return {std::move(functions), successful.load()}; + return {std::move(funcs), successful.load()}; } CDataFrame::TRowFuncVecBoolPr @@ -562,7 +597,14 @@ CDataFrame::sequentialApplyToAllRows(std::size_t beginRows, break; } - return {{std::move(func)}, true}; + // TRowFuncVec funcs{std::move(func)}; moves func into an std::inializer_list + // but then *copies* from the list because the standard requires its elements + // are treated as constant, see 8.5.4/5. + TRowFuncVec funcs; + funcs.reserve(1); + funcs.emplace_back(std::move(func)); + + return TRowFuncVecBoolPr{std::move(funcs), true}; } void CDataFrame::applyToRowsOfOneSlice(TRowFunc& func, @@ -634,11 +676,10 @@ void CDataFrame::CDataFrameRowSliceWriter::operator()(const TWriteFunc& writeRow // Write the next row at the end of the current slice being written // and if the slice is full pass to the thread storing slices. 
- std::size_t end{m_RowsOfSliceBeingWritten.size()}; - - m_RowsOfSliceBeingWritten.resize(end + m_RowCapacity); + std::size_t start{m_RowsOfSliceBeingWritten.size()}; + m_RowsOfSliceBeingWritten.resize(start + m_RowCapacity); m_DocHashesOfSliceBeingWritten.emplace_back(); - writeRow(m_RowsOfSliceBeingWritten.begin() + end, + writeRow(m_RowsOfSliceBeingWritten.begin() + start, m_DocHashesOfSliceBeingWritten.back()); ++m_NumberRows; @@ -691,19 +732,20 @@ CDataFrame::CDataFrameRowSliceWriter::finishWritingRows() { std::pair, std::shared_ptr> makeMainStorageDataFrame(std::size_t numberColumns, boost::optional sliceCapacity, - CDataFrame::EReadWriteToStorage readWriteToStoreSyncStrategy) { + CDataFrame::EReadWriteToStorage readWriteToStoreSyncStrategy, + CAlignment::EType alignment) { auto writer = [](std::size_t firstRow, TFloatVec rows, TInt32Vec docHashes) { return std::make_unique( firstRow, std::move(rows), std::move(docHashes)); }; if (sliceCapacity != boost::none) { - return {std::make_unique(true, numberColumns, *sliceCapacity, + return {std::make_unique(true, numberColumns, alignment, *sliceCapacity, readWriteToStoreSyncStrategy, writer), nullptr}; } - return {std::make_unique(true, numberColumns, + return {std::make_unique(true, numberColumns, alignment, readWriteToStoreSyncStrategy, writer), nullptr}; } @@ -713,7 +755,8 @@ makeDiskStorageDataFrame(const std::string& rootDirectory, std::size_t numberColumns, std::size_t numberRows, boost::optional sliceCapacity, - CDataFrame::EReadWriteToStorage readWriteToStoreSyncStrategy) { + CDataFrame::EReadWriteToStorage readWriteToStoreSyncStrategy, + CAlignment::EType alignment) { std::size_t minimumSpace{2 * numberRows * numberColumns * sizeof(CFloatStorage)}; auto directory = std::make_shared(rootDirectory, minimumSpace); @@ -728,11 +771,11 @@ makeDiskStorageDataFrame(const std::string& rootDirectory, }; if (sliceCapacity != boost::none) { - return {std::make_unique(false, numberColumns, *sliceCapacity, + return {std::make_unique(false, numberColumns, alignment, *sliceCapacity, readWriteToStoreSyncStrategy, writer), directory}; } - return {std::make_unique(false, numberColumns, + return {std::make_unique(false, numberColumns, alignment, readWriteToStoreSyncStrategy, writer), directory}; } diff --git a/lib/core/CDataFrameRowSlice.cc b/lib/core/CDataFrameRowSlice.cc index f750db6166..ca405d74b9 100644 --- a/lib/core/CDataFrameRowSlice.cc +++ b/lib/core/CDataFrameRowSlice.cc @@ -20,7 +20,7 @@ namespace ml { namespace core { -using TFloatVec = std::vector; +using TFloatVec = std::vector>; using TFloatVecItr = TFloatVec::iterator; using TInt32Vec = std::vector; using TInt32VecCItr = TInt32Vec::const_iterator; @@ -104,8 +104,8 @@ class CBadDataFrameRowSliceHandle final : public CDataFrameRowSliceHandleImpl { }; //! Checksum \p vec. 
-template -std::uint64_t computeChecksum(const std::vector& vec) { +template +std::uint64_t computeChecksum(const std::vector& vec) { return CHashing::murmurHash64(vec.data(), static_cast(sizeof(T) * vec.size()), 0); } diff --git a/lib/core/Makefile b/lib/core/Makefile index 735f5fe320..01e617a2a0 100644 --- a/lib/core/Makefile +++ b/lib/core/Makefile @@ -15,6 +15,7 @@ USE_BOOST_IOSTREAMS_LIBS=1 USE_BOOST_LOGSETUP_LIBS=1 USE_BOOST_THREAD_LIBS=1 USE_RAPIDJSON=1 +USE_EIGEN=1 USE_XML=1 USE_ZLIB=1 USE_STRPTIME=1 diff --git a/lib/core/unittest/CAlignmentTest.cc b/lib/core/unittest/CAlignmentTest.cc new file mode 100644 index 0000000000..fd720de37c --- /dev/null +++ b/lib/core/unittest/CAlignmentTest.cc @@ -0,0 +1,181 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +#include +#include + +#include + +#include +#include + +BOOST_AUTO_TEST_SUITE(CAlignmentTest) + +using namespace ml; + +BOOST_AUTO_TEST_CASE(testMaxAlignment) { + + // Test some known alignments. + + alignas(32) const char addresses[64]{}; + for (std::size_t i = 0; i < 64; ++i) { + if (i % 32 == 0) { + BOOST_TEST_REQUIRE(core::CAlignment::maxAlignment(&addresses[i]) == + core::CAlignment::E_Aligned32); + } else if (i % 16 == 0) { + BOOST_TEST_REQUIRE(core::CAlignment::maxAlignment(&addresses[i]) == + core::CAlignment::E_Aligned16); + } else if (i % 8 == 0) { + BOOST_TEST_REQUIRE(core::CAlignment::maxAlignment(&addresses[i]) == + core::CAlignment::E_Aligned8); + } else { + BOOST_TEST_REQUIRE(core::CAlignment::maxAlignment(&addresses[i]) == + core::CAlignment::E_Unaligned); + } + } +} + +BOOST_AUTO_TEST_CASE(testIsAligned) { + + // Test some known alignments. + + alignas(32) const char addresses[64]{}; + for (std::size_t i = 0; i < 64; ++i) { + if (i % 32 == 0) { + BOOST_TEST_REQUIRE(core::CAlignment::isAligned( + &addresses[i], core::CAlignment::E_Aligned32)); + BOOST_TEST_REQUIRE(core::CAlignment::isAligned( + &addresses[i], core::CAlignment::E_Aligned16)); + BOOST_TEST_REQUIRE(core::CAlignment::isAligned( + &addresses[i], core::CAlignment::E_Aligned8)); + } else if (i % 16 == 0) { + BOOST_TEST_REQUIRE(core::CAlignment::isAligned( + &addresses[i], core::CAlignment::E_Aligned32) == false); + BOOST_TEST_REQUIRE(core::CAlignment::isAligned( + &addresses[i], core::CAlignment::E_Aligned16)); + BOOST_TEST_REQUIRE(core::CAlignment::isAligned( + &addresses[i], core::CAlignment::E_Aligned8)); + } else if (i % 8 == 0) { + BOOST_TEST_REQUIRE(core::CAlignment::isAligned( + &addresses[i], core::CAlignment::E_Aligned32) == false); + BOOST_TEST_REQUIRE(core::CAlignment::isAligned( + &addresses[i], core::CAlignment::E_Aligned16) == false); + BOOST_TEST_REQUIRE(core::CAlignment::isAligned( + &addresses[i], core::CAlignment::E_Aligned8)); + } else { + BOOST_TEST_REQUIRE(core::CAlignment::isAligned( + &addresses[i], core::CAlignment::E_Aligned32) == false); + BOOST_TEST_REQUIRE(core::CAlignment::isAligned( + &addresses[i], core::CAlignment::E_Aligned16) == false); + BOOST_TEST_REQUIRE(core::CAlignment::isAligned( + &addresses[i], core::CAlignment::E_Aligned8) == false); + } + } +} + +BOOST_AUTO_TEST_CASE(testNextAligned) { + + // Test that next aligned is the first position with the required alignment + // after the current index. 
+ + alignas(32) std::array addresses; + + for (std::size_t i = 0; i < 8; ++i) { + std::size_t i32{core::CAlignment::nextAligned(addresses, i, core::CAlignment::E_Aligned32)}; + BOOST_TEST_REQUIRE(core::CAlignment::isAligned( + &addresses[i32], core::CAlignment::E_Aligned32)); + for (std::size_t j = i + 1; j < i32; ++j) { + BOOST_TEST_REQUIRE(core::CAlignment::isAligned( + &addresses[j], core::CAlignment::E_Aligned32) == false); + } + + std::size_t i16{core::CAlignment::nextAligned(addresses, i, core::CAlignment::E_Aligned16)}; + BOOST_TEST_REQUIRE(core::CAlignment::isAligned( + &addresses[i16], core::CAlignment::E_Aligned16)); + for (std::size_t j = i + 1; j < i16; ++j) { + BOOST_TEST_REQUIRE(core::CAlignment::isAligned( + &addresses[j], core::CAlignment::E_Aligned16) == false); + } + + std::size_t i8{core::CAlignment::nextAligned(addresses, i, core::CAlignment::E_Aligned8)}; + BOOST_TEST_REQUIRE(core::CAlignment::isAligned( + &addresses[i8], core::CAlignment::E_Aligned8)); + for (std::size_t j = i + 1; j < i8; ++j) { + BOOST_TEST_REQUIRE(core::CAlignment::isAligned( + &addresses[j], core::CAlignment::E_Aligned8) == false); + } + } +} + +BOOST_AUTO_TEST_CASE(testRoundup) { + + // Test rounding up the size of a block of char objects generates the expected sizes. + + BOOST_TEST_REQUIRE( + core::CAlignment::roundup(core::CAlignment::E_Aligned32, 0) == 0); + BOOST_TEST_REQUIRE( + core::CAlignment::roundup(core::CAlignment::E_Aligned16, 0) == 0); + BOOST_TEST_REQUIRE(core::CAlignment::roundup(core::CAlignment::E_Aligned8, 0) == 0); + BOOST_TEST_REQUIRE( + core::CAlignment::roundup(core::CAlignment::E_Unaligned, 0) == 0); + for (std::size_t i = 1; i < 128; ++i) { + BOOST_TEST_REQUIRE(core::CAlignment::roundup(core::CAlignment::E_Aligned32, + i) == 32 * ((i + 31) / 32)); + BOOST_TEST_REQUIRE(core::CAlignment::roundup(core::CAlignment::E_Aligned16, + i) == 16 * ((i + 15) / 16)); + BOOST_TEST_REQUIRE(core::CAlignment::roundup(core::CAlignment::E_Aligned8, + i) == 8 * ((i + 7) / 8)); + BOOST_TEST_REQUIRE( + core::CAlignment::roundup(core::CAlignment::E_Unaligned, i) == i); + } +} + +BOOST_AUTO_TEST_CASE(testRoundupSizeof) { + + // Test rounding up the size of a block of float objects generates the expected memory. 
+ + BOOST_TEST_REQUIRE(core::CAlignment::roundupSizeof( + core::CAlignment::E_Aligned32, 0) == 0); + BOOST_TEST_REQUIRE(core::CAlignment::roundupSizeof( + core::CAlignment::E_Aligned16, 0) == 0); + BOOST_TEST_REQUIRE(core::CAlignment::roundupSizeof( + core::CAlignment::E_Aligned8, 0) == 0); + BOOST_TEST_REQUIRE(core::CAlignment::roundupSizeof( + core::CAlignment::E_Unaligned, 0) == 0); + for (std::size_t i = 1; i < 32; ++i) { + BOOST_TEST_REQUIRE( + core::CAlignment::roundupSizeof(core::CAlignment::E_Aligned32, i) == + 32 * ((4 * i + 31) / 32)); + BOOST_TEST_REQUIRE( + core::CAlignment::roundupSizeof(core::CAlignment::E_Aligned16, i) == + 16 * ((4 * i + 15) / 16)); + BOOST_TEST_REQUIRE(core::CAlignment::roundupSizeof( + core::CAlignment::E_Aligned8, i) == 8 * ((4 * i + 7) / 8)); + BOOST_TEST_REQUIRE(core::CAlignment::roundupSizeof( + core::CAlignment::E_Unaligned, i) == 4 * i); + } +} + +BOOST_AUTO_TEST_CASE(testAlignedAllocator) { + + core::CAlignedAllocator allocator; + + std::vector addresses; + + bool aligned32{true}; + for (std::size_t i = 0; i < 20; ++i) { + double* address{allocator.allocate(6)}; + addresses.push_back(address); + aligned32 = aligned32 && + core::CAlignment::isAligned(address, core::CAlignment::E_Aligned32); + } + for (auto& address : addresses) { + allocator.deallocate(address, 6); + } + BOOST_TEST_REQUIRE(aligned32); +} + +BOOST_AUTO_TEST_SUITE_END() diff --git a/lib/core/unittest/CDataFrameTest.cc b/lib/core/unittest/CDataFrameTest.cc index 8b87eb6f2a..f38a20f1ce 100644 --- a/lib/core/unittest/CDataFrameTest.cc +++ b/lib/core/unittest/CDataFrameTest.cc @@ -4,9 +4,11 @@ * you may not use this file except in compliance with the Elastic License. */ +#include #include #include #include +#include #include #include @@ -28,7 +30,8 @@ using namespace ml; namespace { using TBoolVec = std::vector; using TDoubleVec = std::vector; -using TFloatVec = std::vector; +using TFloatVec = + std::vector>; using TFloatVecItr = TFloatVec::iterator; using TFloatVecCItr = TFloatVec::const_iterator; using TSizeFloatVecUMap = boost::unordered_map; @@ -831,4 +834,115 @@ BOOST_FIXTURE_TEST_CASE(testRowMask, CTestFixture) { } } +BOOST_FIXTURE_TEST_CASE(testAlignment, CTestFixture) { + + // Test all the rows have the requested alignment. 
+ + using TAlignedFactoryFunc = + std::function(core::CAlignment::EType)>; + + std::size_t rows{5000}; + std::size_t cols{15}; + std::size_t capacity{1000}; + TFloatVec components{testData(rows, cols)}; + + test::CRandomNumbers rng; + + TAlignedFactoryFunc makeOnDisk = [=](core::CAlignment::EType alignment) { + return core::makeDiskStorageDataFrame( + boost::filesystem::current_path().string(), cols, rows, capacity, + core::CDataFrame::EReadWriteToStorage::E_Async, alignment) + .first; + }; + TAlignedFactoryFunc makeMainMemory = [=](core::CAlignment::EType alignment) { + return core::makeMainStorageDataFrame( + cols, capacity, core::CDataFrame::EReadWriteToStorage::E_Sync, alignment) + .first; + }; + + std::string type[]{"on disk", "main memory"}; + std::size_t t{0}; + for (const auto& factory : {makeOnDisk, makeMainMemory}) { + for (auto alignment : {core::CAlignment::E_Aligned8, core::CAlignment::E_Aligned16, + core::CAlignment::E_Aligned32}) { + LOG_DEBUG(<< "Test aligned " << alignment << " " << type[t]); + + auto frame = factory(alignment); + + for (std::size_t i = 0; i < components.size(); i += cols) { + frame->writeRow(makeWriter(components, cols, i)); + } + frame->finishWritingRows(); + + frame->readRows(1, [alignment](TRowItr beginRows, TRowItr endRows) { + for (auto row = beginRows; row != endRows; ++row) { + BOOST_TEST_REQUIRE(core::CAlignment::isAligned(row->data(), alignment)); + } + }); + } + ++t; + } +} + +BOOST_FIXTURE_TEST_CASE(testAlignedExtraColumns, CTestFixture) { + + // Test all the rows have the requested alignment. + + using TAlignedFactoryFunc = + std::function(core::CAlignment::EType)>; + + std::size_t rows{5000}; + std::size_t cols{15}; + std::size_t capacity{1000}; + TFloatVec components{testData(rows, cols)}; + core::CDataFrame::TSizeAlignmentPrVec extraCols{{2, core::CAlignment::E_Unaligned}, + {3, core::CAlignment::E_Aligned16}, + {1, core::CAlignment::E_Unaligned}}; + test::CRandomNumbers rng; + + TAlignedFactoryFunc makeOnDisk = [=](core::CAlignment::EType alignment) { + return core::makeDiskStorageDataFrame( + boost::filesystem::current_path().string(), cols, rows, capacity, + core::CDataFrame::EReadWriteToStorage::E_Async, alignment) + .first; + }; + TAlignedFactoryFunc makeMainMemory = [=](core::CAlignment::EType alignment) { + return core::makeMainStorageDataFrame( + cols, capacity, core::CDataFrame::EReadWriteToStorage::E_Sync, alignment) + .first; + }; + + std::string type[]{"on disk", "main memory"}; + std::size_t t{0}; + for (const auto& factory : {makeOnDisk, makeMainMemory}) { + for (auto alignment : {core::CAlignment::E_Aligned16, core::CAlignment::E_Aligned32}) { + LOG_DEBUG(<< "Test aligned " << alignment << " " << type[t]); + + auto frame = factory(alignment); + + for (std::size_t i = 0; i < components.size(); i += cols) { + frame->writeRow(makeWriter(components, cols, i)); + } + frame->finishWritingRows(); + + auto offsets = frame->resizeColumns(1, extraCols); + for (std::size_t i = 1; i < offsets.size(); ++i) { + BOOST_TEST_REQUIRE(offsets[i] - offsets[i - 1] >= + extraCols[i - 1].first); + } + + BOOST_TEST_REQUIRE(extraCols.size() == offsets.size()); + frame->readRows(1, [&](TRowItr beginRows, TRowItr endRows) { + for (auto row = beginRows; row != endRows; ++row) { + for (std::size_t i = 0; i < extraCols.size(); ++i) { + BOOST_TEST_REQUIRE(core::CAlignment::isAligned( + row->data() + offsets[i], extraCols[i].second)); + } + } + }); + } + ++t; + } +} + BOOST_AUTO_TEST_SUITE_END() diff --git a/lib/core/unittest/Makefile 
b/lib/core/unittest/Makefile index 85cca582fd..535258553c 100644 --- a/lib/core/unittest/Makefile +++ b/lib/core/unittest/Makefile @@ -26,6 +26,7 @@ CProcessPriorityTest.cc \ SRCS=\ $(OS_SRCS) \ Main.cc \ +CAlignmentTest.cc \ CAllocationStrategyTest.cc \ CBase64FilterTest.cc \ CCompressedDictionaryTest.cc \ diff --git a/lib/maths/CBoostedTree.cc b/lib/maths/CBoostedTree.cc index 60f273b78e..e1353531eb 100644 --- a/lib/maths/CBoostedTree.cc +++ b/lib/maths/CBoostedTree.cc @@ -170,8 +170,8 @@ std::size_t CBoostedTree::columnHoldingDependentVariable() const { CBoostedTree::TDouble2Vec CBoostedTree::readPrediction(const TRowRef& row) const { const auto& loss = m_Impl->loss(); return loss - .transform(boosted_tree_detail::readPrediction( - row, m_Impl->numberInputColumns(), loss.numberParameters())) + .transform(boosted_tree_detail::readPrediction(row, m_Impl->extraColumns(), + loss.numberParameters())) .to(); } @@ -180,7 +180,7 @@ CBoostedTree::TDouble2Vec CBoostedTree::readAndAdjustPrediction(const TRowRef& r const auto& loss = m_Impl->loss(); auto prediction = loss.transform(boosted_tree_detail::readPrediction( - row, m_Impl->numberInputColumns(), loss.numberParameters())); + row, m_Impl->extraColumns(), loss.numberParameters())); switch (loss.type()) { case CLoss::E_BinaryClassification: diff --git a/lib/maths/CBoostedTreeFactory.cc b/lib/maths/CBoostedTreeFactory.cc index 04d9c2e655..70f50c6ad6 100644 --- a/lib/maths/CBoostedTreeFactory.cc +++ b/lib/maths/CBoostedTreeFactory.cc @@ -24,7 +24,6 @@ namespace ml { namespace maths { using namespace boosted_tree_detail; using TDoubleVec = std::vector; -using TSizeVec = std::vector; using TRowItr = core::CDataFrame::TRowItr; namespace { @@ -274,9 +273,9 @@ void CBoostedTreeFactory::initializeNumberFolds(core::CDataFrame& frame) const { } void CBoostedTreeFactory::resizeDataFrame(core::CDataFrame& frame) const { - m_TreeImpl->m_NumberInputColumns = frame.numberColumns(); - frame.resizeColumns(m_TreeImpl->m_NumberThreads, - frame.numberColumns() + this->numberExtraColumnsForTrain()); + std::size_t numberLossParameters{m_TreeImpl->m_Loss->numberParameters()}; + m_TreeImpl->m_ExtraColumns = frame.resizeColumns( + m_TreeImpl->m_NumberThreads, extraColumns(numberLossParameters)); m_TreeImpl->m_Instrumentation->updateMemoryUsage(core::CMemory::dynamicSize(frame)); } @@ -293,11 +292,8 @@ void CBoostedTreeFactory::initializeCrossValidation(core::CDataFrame& frame) con frame.writeColumns(m_NumberThreads, 0, frame.numberRows(), [&](TRowItr beginRows, TRowItr endRows) { - std::size_t column{exampleWeightColumn( - m_TreeImpl->m_NumberInputColumns, - m_TreeImpl->m_Loss->numberParameters())}; for (auto row = beginRows; row != endRows; ++row) { - row->writeColumn(column, 1.0); + writeExampleWeight(*row, m_TreeImpl->m_ExtraColumns, 1.0); } }, &allTrainingRowsMask); diff --git a/lib/maths/CBoostedTreeImpl.cc b/lib/maths/CBoostedTreeImpl.cc index 91c0f639df..59cd52942a 100644 --- a/lib/maths/CBoostedTreeImpl.cc +++ b/lib/maths/CBoostedTreeImpl.cc @@ -290,7 +290,7 @@ void CBoostedTreeImpl::predict(core::CDataFrame& frame) const { m_NumberThreads, 0, frame.numberRows(), [&](TRowItr beginRows, TRowItr endRows) { std::size_t numberLossParameters{m_Loss->numberParameters()}; for (auto row = beginRows; row != endRows; ++row) { - auto prediction = readPrediction(*row, m_NumberInputColumns, numberLossParameters); + auto prediction = readPrediction(*row, m_ExtraColumns, numberLossParameters); prediction = predictRow(m_Encoder->encode(*row), m_BestForest); } }); @@ -406,14 
+406,14 @@ void CBoostedTreeImpl::computeClassificationWeights(const core::CDataFrame& fram // We predict the log-odds but this is expected to return // the log of the predicted class probabilities. TMemoryMappedFloatVector result{&storage[0], 2}; - result.array() = m_Loss - ->transform(readPrediction( - row, m_NumberInputColumns, numberClasses)) - .array() - .log(); + result.array() = + m_Loss + ->transform(readPrediction(row, m_ExtraColumns, numberClasses)) + .array() + .log(); return result; } - return readPrediction(row, m_NumberInputColumns, numberClasses); + return readPrediction(row, m_ExtraColumns, numberClasses); }); break; } @@ -507,9 +507,9 @@ CBoostedTreeImpl::TNodeVec CBoostedTreeImpl::initializePredictionsAndLossDerivat [this](TRowItr beginRows, TRowItr endRows) { std::size_t numberLossParameters{m_Loss->numberParameters()}; for (auto row = beginRows; row != endRows; ++row) { - zeroPrediction(*row, m_NumberInputColumns, numberLossParameters); - zeroLossGradient(*row, m_NumberInputColumns, numberLossParameters); - zeroLossCurvature(*row, m_NumberInputColumns, numberLossParameters); + zeroPrediction(*row, m_ExtraColumns, numberLossParameters); + zeroLossGradient(*row, m_ExtraColumns, numberLossParameters); + zeroLossCurvature(*row, m_ExtraColumns, numberLossParameters); } }, &updateRowMask); @@ -665,7 +665,7 @@ CBoostedTreeImpl::candidateSplits(const core::CDataFrame& frame, [this](const TRowRef& row) { std::size_t numberLossParameters{m_Loss->numberParameters()}; return trace(numberLossParameters, - readLossCurvature(row, m_NumberInputColumns, numberLossParameters)); + readLossCurvature(row, m_ExtraColumns, numberLossParameters)); }) .first; @@ -736,8 +736,8 @@ CBoostedTreeImpl::trainTree(core::CDataFrame& frame, TLeafNodeStatisticsPtrQueue leaves(maximumTreeSize / 2 + 3); leaves.push_back(std::make_shared( - 0 /*root*/, m_NumberInputColumns, m_Loss->numberParameters(), - m_NumberThreads, frame, *m_Encoder, m_Regularization, candidateSplits, + 0 /*root*/, m_ExtraColumns, m_Loss->numberParameters(), m_NumberThreads, + frame, *m_Encoder, m_Regularization, candidateSplits, this->featureBag(), 0 /*depth*/, trainingRowMask)); // We update local variables because the callback can be expensive if it @@ -1006,11 +1006,10 @@ void CBoostedTreeImpl::refreshPredictionsAndLossDerivatives(core::CDataFrame& fr [&](TArgMinLossVec& leafValues_, TRowItr beginRows, TRowItr endRows) { std::size_t numberLossParameters{m_Loss->numberParameters()}; for (auto row = beginRows; row != endRows; ++row) { - auto prediction = readPrediction(*row, m_NumberInputColumns, + auto prediction = readPrediction(*row, m_ExtraColumns, numberLossParameters); double actual{readActual(*row, m_DependentVariable)}; - double weight{readExampleWeight(*row, m_NumberInputColumns, - numberLossParameters)}; + double weight{readExampleWeight(*row, m_ExtraColumns)}; leafValues_[root(tree).leafIndex(m_Encoder->encode(*row), tree)] .add(prediction, actual, weight); } @@ -1040,14 +1039,12 @@ void CBoostedTreeImpl::refreshPredictionsAndLossDerivatives(core::CDataFrame& fr [&](TRowItr beginRows, TRowItr endRows) { std::size_t numberLossParameters{m_Loss->numberParameters()}; for (auto row = beginRows; row != endRows; ++row) { - auto prediction = readPrediction(*row, m_NumberInputColumns, numberLossParameters); + auto prediction = readPrediction(*row, m_ExtraColumns, numberLossParameters); double actual{readActual(*row, m_DependentVariable)}; - double weight{readExampleWeight(*row, m_NumberInputColumns, numberLossParameters)}; + double 
weight{readExampleWeight(*row, m_ExtraColumns)}; prediction += root(tree).value(m_Encoder->encode(*row), tree); - writeLossGradient(*row, m_NumberInputColumns, *m_Loss, - prediction, actual, weight); - writeLossCurvature(*row, m_NumberInputColumns, *m_Loss, - prediction, actual, weight); + writeLossGradient(*row, m_ExtraColumns, *m_Loss, prediction, actual, weight); + writeLossCurvature(*row, m_ExtraColumns, *m_Loss, prediction, actual, weight); } }, &updateRowMask); @@ -1062,8 +1059,7 @@ double CBoostedTreeImpl::meanLoss(const core::CDataFrame& frame, [&](TMeanAccumulator& loss, TRowItr beginRows, TRowItr endRows) { std::size_t numberLossParameters{m_Loss->numberParameters()}; for (auto row = beginRows; row != endRows; ++row) { - auto prediction = readPrediction(*row, m_NumberInputColumns, - numberLossParameters); + auto prediction = readPrediction(*row, m_ExtraColumns, numberLossParameters); double actual{readActual(*row, m_DependentVariable)}; loss.add(m_Loss->value(prediction, actual)); } @@ -1559,8 +1555,8 @@ std::size_t CBoostedTreeImpl::columnHoldingDependentVariable() const { return m_DependentVariable; } -std::size_t CBoostedTreeImpl::numberInputColumns() const { - return m_NumberInputColumns; +const CBoostedTreeImpl::TSizeVec& CBoostedTreeImpl::extraColumns() const { + return m_ExtraColumns; } CBoostedTreeImpl::TVector CBoostedTreeImpl::classificationWeights() const { diff --git a/lib/maths/CBoostedTreeLeafNodeStatistics.cc b/lib/maths/CBoostedTreeLeafNodeStatistics.cc index abd7becb35..79c26a8a15 100644 --- a/lib/maths/CBoostedTreeLeafNodeStatistics.cc +++ b/lib/maths/CBoostedTreeLeafNodeStatistics.cc @@ -28,7 +28,7 @@ const std::size_t ASSIGN_MISSING_TO_RIGHT{1}; CBoostedTreeLeafNodeStatistics::CBoostedTreeLeafNodeStatistics( std::size_t id, - std::size_t numberInputColumns, + const TSizeVec& extraColumns, std::size_t numberLossParameters, std::size_t numberThreads, const core::CDataFrame& frame, @@ -38,8 +38,8 @@ CBoostedTreeLeafNodeStatistics::CBoostedTreeLeafNodeStatistics( const TSizeVec& featureBag, std::size_t depth, const core::CPackedBitVector& rowMask) - : m_Id{id}, m_Depth{depth}, m_NumberInputColumns{numberInputColumns}, - m_NumberLossParameters{numberLossParameters}, m_CandidateSplits{candidateSplits}, m_RowMask{rowMask} { + : m_Id{id}, m_Depth{depth}, m_ExtraColumns{extraColumns}, m_NumberLossParameters{numberLossParameters}, + m_CandidateSplits{candidateSplits}, m_RowMask{rowMask} { this->computeAggregateLossDerivatives(numberThreads, frame, encoder); m_BestSplit = this->computeBestSplitStatistics(regularization, featureBag); @@ -47,7 +47,7 @@ CBoostedTreeLeafNodeStatistics::CBoostedTreeLeafNodeStatistics( CBoostedTreeLeafNodeStatistics::CBoostedTreeLeafNodeStatistics( std::size_t id, - std::size_t numberInputColumns, + const TSizeVec& extraColumns, std::size_t numberLossParameters, std::size_t numberThreads, const core::CDataFrame& frame, @@ -59,7 +59,7 @@ CBoostedTreeLeafNodeStatistics::CBoostedTreeLeafNodeStatistics( std::size_t depth, const CBoostedTreeNode& split, const core::CPackedBitVector& parentRowMask) - : m_Id{id}, m_Depth{depth}, m_NumberInputColumns{numberInputColumns}, + : m_Id{id}, m_Depth{depth}, m_ExtraColumns{extraColumns}, m_NumberLossParameters{numberLossParameters}, m_CandidateSplits{candidateSplits} { this->computeRowMaskAndAggregateLossDerivatives( @@ -74,7 +74,7 @@ CBoostedTreeLeafNodeStatistics::CBoostedTreeLeafNodeStatistics( const TRegularization& regularization, const TSizeVec& featureBag, core::CPackedBitVector rowMask) - : m_Id{id}, 
m_Depth{sibling.m_Depth}, m_NumberInputColumns{sibling.m_NumberInputColumns}, + : m_Id{id}, m_Depth{sibling.m_Depth}, m_ExtraColumns{sibling.m_ExtraColumns}, m_NumberLossParameters{sibling.m_NumberLossParameters}, m_CandidateSplits{sibling.m_CandidateSplits}, m_RowMask{std::move(rowMask)}, m_Derivatives{std::move(parent.m_Derivatives)} { @@ -96,9 +96,9 @@ CBoostedTreeLeafNodeStatistics::split(std::size_t leftChildId, if (this->leftChildHasFewerRows()) { auto leftChild = std::make_shared( - leftChildId, m_NumberInputColumns, m_NumberLossParameters, - numberThreads, frame, encoder, regularization, candidateSplits, - featureBag, true /*is left child*/, m_Depth + 1, split, m_RowMask); + leftChildId, m_ExtraColumns, m_NumberLossParameters, numberThreads, + frame, encoder, regularization, candidateSplits, featureBag, + true /*is left child*/, m_Depth + 1, split, m_RowMask); core::CPackedBitVector rightChildRowMask{std::move(m_RowMask)}; rightChildRowMask ^= leftChild->rowMask(); auto rightChild = std::make_shared( @@ -111,9 +111,9 @@ CBoostedTreeLeafNodeStatistics::split(std::size_t leftChildId, } auto rightChild = std::make_shared( - rightChildId, m_NumberInputColumns, m_NumberLossParameters, - numberThreads, frame, encoder, regularization, candidateSplits, - featureBag, false /*is left child*/, m_Depth + 1, split, m_RowMask); + rightChildId, m_ExtraColumns, m_NumberLossParameters, numberThreads, + frame, encoder, regularization, candidateSplits, featureBag, + false /*is left child*/, m_Depth + 1, split, m_RowMask); core::CPackedBitVector leftChildRowMask{std::move(m_RowMask)}; leftChildRowMask ^= rightChild->rowMask(); auto leftChild = std::make_shared( @@ -171,15 +171,15 @@ CBoostedTreeLeafNodeStatistics::estimateMemoryUsage(std::size_t numberRows, // case for memory usage. This is because the rows will be spread over many // rows so the masks will mainly contain 0 bits in this case. 
std::size_t rowMaskSize{numberRows / PACKED_BIT_VECTOR_MAXIMUM_ROWS_PER_BYTE}; - std::size_t perSplitDerivativesSize{CPerSplitDerivatives::estimateMemoryUsage( + std::size_t splitsDerivativesSize{CSplitsDerivatives::estimateMemoryUsage( numberFeatures, numberSplitsPerFeature, numberLossParameters)}; - return sizeof(CBoostedTreeLeafNodeStatistics) + rowMaskSize + perSplitDerivativesSize; + return sizeof(CBoostedTreeLeafNodeStatistics) + rowMaskSize + splitsDerivativesSize; } void CBoostedTreeLeafNodeStatistics::maybeRecoverMemory() { if (this->gain() <= 0.0) { m_RowMask = core::CPackedBitVector{}; - m_Derivatives = CPerSplitDerivatives{}; + m_Derivatives = CSplitsDerivatives{}; } } @@ -191,12 +191,12 @@ void CBoostedTreeLeafNodeStatistics::computeAggregateLossDerivatives( auto result = frame.readRows( numberThreads, 0, frame.numberRows(), core::bindRetrievableState( - [&](CPerSplitDerivatives& perSplitDerivatives, TRowItr beginRows, TRowItr endRows) { + [&](CSplitsDerivatives& splitsDerivatives, TRowItr beginRows, TRowItr endRows) { for (auto row = beginRows; row != endRows; ++row) { - this->addRowDerivatives(encoder.encode(*row), perSplitDerivatives); + this->addRowDerivatives(encoder.encode(*row), splitsDerivatives); } }, - CPerSplitDerivatives{m_CandidateSplits, m_NumberLossParameters}), + CSplitsDerivatives{m_CandidateSplits, m_NumberLossParameters}), &m_RowMask); m_Derivatives = std::move(result.first[0].s_FunctionState); @@ -217,22 +217,22 @@ void CBoostedTreeLeafNodeStatistics::computeRowMaskAndAggregateLossDerivatives( auto result = frame.readRows( numberThreads, 0, frame.numberRows(), core::bindRetrievableState( - [&](std::pair& state, + [&](std::pair& state, TRowItr beginRows, TRowItr endRows) { auto& mask = state.first; - auto& perSplitDerivatives = state.second; + auto& splitsDerivatives = state.second; for (auto row = beginRows; row != endRows; ++row) { auto encodedRow = encoder.encode(*row); if (split.assignToLeft(encodedRow) == isLeftChild) { std::size_t index{row->index()}; mask.extend(false, index - mask.size()); mask.extend(true); - this->addRowDerivatives(encodedRow, perSplitDerivatives); + this->addRowDerivatives(encodedRow, splitsDerivatives); } } }, std::make_pair(core::CPackedBitVector{}, - CPerSplitDerivatives{m_CandidateSplits, m_NumberLossParameters})), + CSplitsDerivatives{m_CandidateSplits, m_NumberLossParameters})), &parentRowMask); for (auto& mask_ : result.first) { @@ -250,18 +250,17 @@ void CBoostedTreeLeafNodeStatistics::computeRowMaskAndAggregateLossDerivatives( } void CBoostedTreeLeafNodeStatistics::addRowDerivatives(const CEncodedDataFrameRowRef& row, - CPerSplitDerivatives& perSplitDerivatives) const { + CSplitsDerivatives& splitsDerivatives) const { const TRowRef& unencodedRow{row.unencodedRow()}; - auto gradient = readLossGradient(unencodedRow, m_NumberInputColumns, m_NumberLossParameters); - auto curvature = readLossCurvature(unencodedRow, m_NumberInputColumns, m_NumberLossParameters); + auto derivatives = readLossDerivatives(unencodedRow, m_ExtraColumns, m_NumberLossParameters); for (std::size_t feature = 0; feature < m_CandidateSplits.size(); ++feature) { double featureValue{row[feature]}; if (CDataFrameUtils::isMissing(featureValue)) { - perSplitDerivatives.addMissingDerivatives(feature, gradient, curvature); + splitsDerivatives.addMissingDerivatives(feature, derivatives); } else { std::ptrdiff_t split{m_CandidateSplits[feature].upperBound(featureValue)}; - perSplitDerivatives.addDerivatives(feature, split, gradient, curvature); + 
splitsDerivatives.addDerivatives(feature, split, derivatives); } } } diff --git a/lib/maths/CBoostedTreeUtils.cc b/lib/maths/CBoostedTreeUtils.cc index 934aa4610c..efaa8d9967 100644 --- a/lib/maths/CBoostedTreeUtils.cc +++ b/lib/maths/CBoostedTreeUtils.cc @@ -13,44 +13,57 @@ namespace ml { namespace maths { namespace boosted_tree_detail { using namespace boosted_tree; +namespace { +enum EExtraColumn { E_Prediction = 0, E_Gradient, E_Curvature, E_Weight }; +} + +TSizeAlignmentPrVec extraColumns(std::size_t numberLossParameters) { + return {{numberLossParameters, core::CAlignment::E_Unaligned}, + {numberLossParameters, core::CAlignment::E_Aligned16}, + {numberLossParameters * numberLossParameters, core::CAlignment::E_Unaligned}, + {1, core::CAlignment::E_Unaligned}}; +} TMemoryMappedFloatVector readPrediction(const TRowRef& row, - std::size_t numberInputColumns, - std::size_t numberLossParamaters) { - return {row.data() + predictionColumn(numberInputColumns), - static_cast(numberLossParamaters)}; + const TSizeVec& extraColumns, + std::size_t numberLossParameters) { + return {row.data() + extraColumns[E_Prediction], static_cast(numberLossParameters)}; } -void zeroPrediction(const TRowRef& row, std::size_t numberInputColumns, std::size_t numberLossParamaters) { - std::size_t offset{predictionColumn(numberInputColumns)}; - for (std::size_t i = 0; i < numberLossParamaters; ++i) { - row.writeColumn(offset + i, 0.0); +void zeroPrediction(const TRowRef& row, const TSizeVec& extraColumns, std::size_t numberLossParameters) { + for (std::size_t i = 0; i < numberLossParameters; ++i) { + row.writeColumn(extraColumns[E_Prediction] + i, 0.0); } } +TAlignedMemoryMappedFloatVector readLossDerivatives(const TRowRef& row, + const TSizeVec& extraColumns, + std::size_t numberLossParameters) { + return {row.data() + extraColumns[E_Gradient], + static_cast(numberLossParameters + + lossHessianUpperTriangleSize(numberLossParameters))}; +} + TMemoryMappedFloatVector readLossGradient(const TRowRef& row, - std::size_t numberInputColumns, + const TSizeVec& extraColumns, std::size_t numberLossParameters) { - return {row.data() + lossGradientColumn(numberInputColumns, numberLossParameters), - static_cast(numberLossParameters)}; + return {row.data() + extraColumns[E_Gradient], static_cast(numberLossParameters)}; } -void zeroLossGradient(const TRowRef& row, std::size_t numberInputColumns, std::size_t numberLossParameters) { - std::size_t offset{lossGradientColumn(numberInputColumns, numberLossParameters)}; +void zeroLossGradient(const TRowRef& row, const TSizeVec& extraColumns, std::size_t numberLossParameters) { for (std::size_t i = 0; i < numberLossParameters; ++i) { - row.writeColumn(offset + i, 0.0); + row.writeColumn(extraColumns[E_Gradient] + i, 0.0); } } void writeLossGradient(const TRowRef& row, - std::size_t numberInputColumns, + const TSizeVec& extraColumns, const CLoss& loss, const TMemoryMappedFloatVector& prediction, double actual, double weight) { - std::size_t offset{lossGradientColumn(numberInputColumns, prediction.size())}; - auto writer = [&row, offset](std::size_t i, double value) { - row.writeColumn(offset + i, value); + auto writer = [&row, &extraColumns](std::size_t i, double value) { + row.writeColumn(extraColumns[E_Gradient] + i, value); }; // We wrap the writer in another lambda which we know takes advantage // of std::function small size optimization to avoid heap allocations. 
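The comment above about wrapping the writer relies on a detail worth spelling out: std::function stores small callables inline (the small buffer optimisation) and heap allocates larger ones, so a thin lambda capturing a single reference to the "fat" writer typically stays within the inline buffer. A hedged sketch of the pattern, with illustrative names standing in for the loss interface:

    #include <cstddef>
    #include <functional>
    #include <vector>

    using TWriter = std::function<void(std::size_t, double)>;

    // Stands in for CLoss::gradient, which takes its writer as a
    // std::function parameter.
    void callWithWriter(const TWriter& writer) {
        writer(0, 42.0);
    }

    int main() {
        std::vector<double> row(4, 0.0);
        std::vector<std::size_t> extraColumns{0, 1, 2, 3};

        auto writer = [&row, &extraColumns](std::size_t i, double value) {
            row[extraColumns[1] + i] = value;
        };

        // The wrapper captures one reference, which on common standard
        // library implementations fits std::function's small buffer and so
        // avoids a heap allocation when the parameter is constructed.
        callWithWriter([&writer](std::size_t i, double value) { writer(i, value); });

        return 0;
    }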
@@ -59,29 +72,27 @@ void writeLossGradient(const TRowRef& row, } TMemoryMappedFloatVector readLossCurvature(const TRowRef& row, - std::size_t numberInputColumns, + const TSizeVec& extraColumns, std::size_t numberLossParameters) { - return {row.data() + lossCurvatureColumn(numberInputColumns, numberLossParameters), - static_cast(lossHessianStoredSize(numberLossParameters))}; + return {row.data() + extraColumns[E_Curvature], + static_cast(lossHessianUpperTriangleSize(numberLossParameters))}; } -void zeroLossCurvature(const TRowRef& row, std::size_t numberInputColumns, std::size_t numberLossParameters) { - std::size_t offset{lossCurvatureColumn(numberInputColumns, numberLossParameters)}; - for (std::size_t i = 0, size = lossHessianStoredSize(numberLossParameters); +void zeroLossCurvature(const TRowRef& row, const TSizeVec& extraColumns, std::size_t numberLossParameters) { + for (std::size_t i = 0, size = lossHessianUpperTriangleSize(numberLossParameters); i < size; ++i) { - row.writeColumn(offset + i, 0.0); + row.writeColumn(extraColumns[E_Curvature] + i, 0.0); } } void writeLossCurvature(const TRowRef& row, - std::size_t numberInputColumns, + const TSizeVec& extraColumns, const CLoss& loss, const TMemoryMappedFloatVector& prediction, double actual, double weight) { - std::size_t offset{lossCurvatureColumn(numberInputColumns, prediction.size())}; - auto writer = [&row, offset](std::size_t i, double value) { - row.writeColumn(offset + i, value); + auto writer = [&row, &extraColumns](std::size_t i, double value) { + row.writeColumn(extraColumns[E_Curvature] + i, value); }; // We wrap the writer in another lambda which we know takes advantage // of std::function small size optimization to avoid heap allocations. @@ -89,10 +100,12 @@ void writeLossCurvature(const TRowRef& row, [&writer](std::size_t i, double value) { writer(i, value); }, weight); } -double readExampleWeight(const TRowRef& row, - std::size_t numberInputColumns, - std::size_t numberLossParameters) { - return row[exampleWeightColumn(numberInputColumns, numberLossParameters)]; +double readExampleWeight(const TRowRef& row, const TSizeVec& extraColumns) { + return row[extraColumns[E_Weight]]; +} + +void writeExampleWeight(const TRowRef& row, const TSizeVec& extraColumns, double weight) { + row.writeColumn(extraColumns[E_Weight], weight); } double readActual(const TRowRef& row, std::size_t dependentVariable) { diff --git a/lib/maths/COutliers.cc b/lib/maths/COutliers.cc index e7eae3b466..a004f75a32 100644 --- a/lib/maths/COutliers.cc +++ b/lib/maths/COutliers.cc @@ -6,6 +6,7 @@ #include +#include #include #include #include @@ -34,6 +35,8 @@ const std::string COMPUTE_OUTLIER_SCORES{"compute_outlier_scores"}; using TRowItr = core::CDataFrame::TRowItr; using TStepCallback = std::function; +using TMemoryMappedFloatVector = CMemoryMappedDenseVector; +using TDenseFloatVector = CDenseVector; double shift(double score) { return std::exp(-2.0) + score; @@ -878,7 +881,7 @@ bool computeOutliersNoPartitions(const COutliers::SComputeParameters& params, core::CDataFrame& frame, CDataFrameAnalysisInstrumentationInterface& instrumentation) { - using TPoint = CMemoryMappedDenseVector; + using TPoint = TMemoryMappedFloatVector; using TPointVec = std::vector; std::int64_t frameMemory{signedMemoryUsage(frame)}; @@ -964,7 +967,7 @@ bool computeOutliersPartitioned(const COutliers::SComputeParameters& params, core::CDataFrame& frame, CDataFrameAnalysisInstrumentationInterface& instrumentation) { - using TPoint = CDenseVector; + using TPoint = 
TDenseFloatVector; using TPointVec = std::vector; core::CStopWatch watch{true}; @@ -1079,9 +1082,9 @@ std::size_t COutliers::estimateMemoryUsedByCompute(const SComputeParameters& par std::size_t partitionNumberPoints, std::size_t dimension) { return params.s_NumberPartitions == 1 - ? COutliers::estimateMemoryUsedByCompute>( + ? COutliers::estimateMemoryUsedByCompute( params, totalNumberPoints, partitionNumberPoints, dimension) - : COutliers::estimateMemoryUsedByCompute>( + : COutliers::estimateMemoryUsedByCompute( params, totalNumberPoints, partitionNumberPoints, dimension); } diff --git a/lib/maths/CTreeShapFeatureImportance.cc b/lib/maths/CTreeShapFeatureImportance.cc index 9a1f6e8f27..240ea1eee8 100644 --- a/lib/maths/CTreeShapFeatureImportance.cc +++ b/lib/maths/CTreeShapFeatureImportance.cc @@ -4,7 +4,6 @@ * you may not use this file except in compliance with the Elastic License. */ -#include "core/Concurrency.h" #include #include @@ -24,12 +23,8 @@ CTreeShapFeatureImportance::CTreeShapFeatureImportance(const core::CDataFrame& f const CDataFrameCategoryEncoder& encoder, TTreeVec& forest, std::size_t numberTopShapValues) - : m_NumberTopShapValues{numberTopShapValues}, m_Encoder{&encoder}, m_Forest{&forest} { - - m_ColumnNames.reserve(frame.columnNames().size()); - for (const auto& name : frame.columnNames()) { - m_ColumnNames.push_back(name); - } + : m_NumberTopShapValues{numberTopShapValues}, m_Encoder{&encoder}, m_Forest{&forest}, + m_ColumnNames{frame.columnNames()} { // When traversing a tree, we successively copy the parent path and add one // new element to it. This means that if a tree has maxDepth depth, we store diff --git a/lib/maths/unittest/CBoostedTreeLeafNodeStatisticsTest.cc b/lib/maths/unittest/CBoostedTreeLeafNodeStatisticsTest.cc index a0a2c28584..f27423baef 100644 --- a/lib/maths/unittest/CBoostedTreeLeafNodeStatisticsTest.cc +++ b/lib/maths/unittest/CBoostedTreeLeafNodeStatisticsTest.cc @@ -22,7 +22,9 @@ using TDoubleVec = std::vector; using TDoubleVecVec = std::vector; using TSizeVec = std::vector; using TSizeVecVec = std::vector; -using TFloatVec = std::vector; +using TAlignedFloatVec = + std::vector>; +using TAlignedDoubleVec = std::vector>; using TVector = maths::CDenseVector; using TVectorVec = std::vector; using TVectorVecVec = std::vector; @@ -32,22 +34,22 @@ using TMatrixVecVec = std::vector; using TImmutableRadixSet = maths::CBoostedTreeLeafNodeStatistics::TImmutableRadixSet; using TImmutableRadixSetVec = maths::CBoostedTreeLeafNodeStatistics::TImmutableRadixSetVec; using TDerivatives = maths::CBoostedTreeLeafNodeStatistics::CDerivatives; -using TPerSplitDerivatives = maths::CBoostedTreeLeafNodeStatistics::CPerSplitDerivatives; +using TSplitsDerivatives = maths::CBoostedTreeLeafNodeStatistics::CSplitsDerivatives; namespace { template -maths::CMemoryMappedDenseVector makeGradient(T* storage, std::size_t n) { +maths::CMemoryMappedDenseVector makeVector(T* storage, std::size_t n) { return maths::CMemoryMappedDenseVector{storage, static_cast(n)}; } -template -maths::CMemoryMappedDenseVector makeCurvature(T* storage, std::size_t n) { - return maths::CMemoryMappedDenseVector(storage, static_cast(n)); +template +maths::CMemoryMappedDenseVector makeAlignedVector(T* storage, std::size_t n) { + return maths::CMemoryMappedDenseVector{storage, static_cast(n)}; } template -TMatrix rowMajorHessian(std::size_t n, const maths::CMemoryMappedDenseVector& curvatures) { +TMatrix columnMajorHessian(std::size_t n, const maths::CMemoryMappedDenseVector& curvatures) { TMatrix 
result{n, n}; for (std::size_t i = 0, k = 0; i < n; ++i) { for (std::size_t j = i; j < n; ++j, ++k) { @@ -77,20 +79,24 @@ void testDerivativesFor(std::size_t numberParameters) { LOG_DEBUG(<< "Accumulate"); - TDoubleVec storage1(numberGradients * (numberGradients + 1), 0.0); - TDerivatives derivatives1{numberParameters, &storage1[0]}; + std::size_t paddedNumberGradients{core::CAlignment::roundup( + core::CAlignment::E_Aligned16, numberGradients)}; + + TAlignedDoubleVec storage1(paddedNumberGradients + numberGradients * numberGradients, 0.0); + TDerivatives derivatives1{numberParameters, &storage1[0], + &storage1[paddedNumberGradients]}; for (std::size_t j = 0; j < 10; ++j) { - TFloatVec storage; + TAlignedFloatVec rowStorage; for (std::size_t i = 0; i < numberGradients; ++i) { - storage.push_back(gradients[i][j]); + rowStorage.push_back(gradients[i][j]); } for (std::size_t i = 0; i < numberCurvatures; ++i) { - storage.push_back(curvatures[i][j]); + rowStorage.push_back(curvatures[i][j]); } - auto gradient = makeGradient(&storage[0], numberGradients); - auto curvature = makeCurvature(&storage[numberGradients], numberCurvatures); - derivatives1.add(1, gradient, curvature); + auto derivatives_ = makeAlignedVector( + &rowStorage[0], numberGradients + numberCurvatures); + derivatives1.add(1, derivatives_); } derivatives1.remapCurvature(); @@ -110,20 +116,21 @@ void testDerivativesFor(std::size_t numberParameters) { LOG_DEBUG(<< "Merge"); - TDoubleVec storage2(numberGradients * (numberGradients + 1), 0.0); - TDerivatives derivatives2{numberParameters, &storage2[0]}; + TAlignedDoubleVec storage2(paddedNumberGradients + numberGradients * numberGradients, 0.0); + TDerivatives derivatives2{numberParameters, &storage2[0], + &storage2[paddedNumberGradients]}; for (std::size_t j = 10; j < 20; ++j) { - TFloatVec storage; + TAlignedFloatVec storage; for (std::size_t i = 0; i < numberGradients; ++i) { storage.push_back(gradients[i][j]); } for (std::size_t i = 0; i < numberCurvatures; ++i) { storage.push_back(curvatures[i][j]); } - auto gradient = makeGradient(&storage[0], numberGradients); - auto curvature = makeCurvature(&storage[numberGradients], numberCurvatures); - derivatives2.add(1, gradient, curvature); + auto derivatives = makeAlignedVector( + &storage[0], numberGradients + numberCurvatures); + derivatives2.add(1, derivatives); } derivatives2.remapCurvature(); @@ -204,36 +211,37 @@ void testPerSplitDerivativesFor(std::size_t numberParameters) { TMatrix::Zero(numberParameters, numberParameters)); } - auto addDerivatives = [&](TPerSplitDerivatives& derivatives) { + auto addDerivatives = [&](TSplitsDerivatives& derivatives) { for (std::size_t i = 0, j = 0, k = 0; i < numberSamples; ++i, j += numberGradients, k += numberCurvatures) { - TFloatVec storage; + TAlignedFloatVec storage; storage.insert(storage.end(), &gradients[j], &gradients[j + numberGradients]); storage.insert(storage.end(), &curvatures[j], &curvatures[k + numberCurvatures]); - auto gradient = makeGradient(&storage[0], numberGradients); - auto curvature = makeCurvature(&storage[numberGradients], numberCurvatures); + auto derivatives_ = makeAlignedVector( + &storage[0], numberGradients + numberCurvatures); + auto gradient = makeVector(&storage[0], numberGradients); + auto curvature = makeVector(&storage[numberGradients], numberCurvatures); if (uniform01[i] < 0.1) { - derivatives.addMissingDerivatives(features[i], gradient, curvature); + derivatives.addMissingDerivatives(features[i], derivatives_); ++expectedMissingCounts[features[i]]; 
expectedMissingGradients[features[i]] += gradient; expectedMissingCurvatures[features[i]] += - rowMajorHessian(numberParameters, curvature); + columnMajorHessian(numberParameters, curvature); } else { - derivatives.addDerivatives(features[i], splits[features[i]][i], - gradient, curvature); + derivatives.addDerivatives(features[i], splits[features[i]][i], derivatives_); ++expectedCounts[features[i]][splits[features[i]][i]]; expectedGradients[features[i]][splits[features[i]][i]] += gradient; expectedCurvatures[features[i]][splits[features[i]][i]] += - rowMajorHessian(numberParameters, curvature); + columnMajorHessian(numberParameters, curvature); } } derivatives.remapCurvature(); }; - auto validate = [&](const TPerSplitDerivatives& derivatives) { + auto validate = [&](const TSplitsDerivatives& derivatives) { for (std::size_t i = 0; i < expectedCounts.size(); ++i) { for (std::size_t j = 0; j < expectedGradients[i].size(); ++j) { TMatrix curvature{ @@ -256,7 +264,7 @@ void testPerSplitDerivativesFor(std::size_t numberParameters) { LOG_TRACE(<< "Test accumulation"); - TPerSplitDerivatives derivatives1{featureSplits, numberParameters}; + TSplitsDerivatives derivatives1{featureSplits, numberParameters}; addDerivatives(derivatives1); validate(derivatives1); @@ -267,7 +275,7 @@ void testPerSplitDerivativesFor(std::size_t numberParameters) { rng.generateUniformSamples(-1.5, 1.0, numberSamples * numberGradients, gradients); rng.generateUniformSamples(0.1, 0.5, numberSamples * numberCurvatures, curvatures); - TPerSplitDerivatives derivatives2{featureSplits, numberParameters}; + TSplitsDerivatives derivatives2{featureSplits, numberParameters}; addDerivatives(derivatives2); derivatives1.add(derivatives2); @@ -275,7 +283,7 @@ void testPerSplitDerivativesFor(std::size_t numberParameters) { LOG_TRACE(<< "Test copy"); - TPerSplitDerivatives derivatives3{derivatives1}; + TSplitsDerivatives derivatives3{derivatives1}; BOOST_REQUIRE_EQUAL(derivatives1.checksum(), derivatives3.checksum()); } } diff --git a/lib/maths/unittest/CDataFrameCategoryEncoderTest.cc b/lib/maths/unittest/CDataFrameCategoryEncoderTest.cc index 2936d350dd..4530f1e06b 100644 --- a/lib/maths/unittest/CDataFrameCategoryEncoderTest.cc +++ b/lib/maths/unittest/CDataFrameCategoryEncoderTest.cc @@ -31,7 +31,8 @@ using TDoubleVec = std::vector; using TDoubleVecVec = std::vector; using TSizeVec = std::vector; using TSizeVecVec = std::vector; -using TFloatVec = std::vector; +using TFloatVec = + std::vector>; using TMeanAccumulator = maths::CBasicStatistics::SSampleMean::TAccumulator; using TMeanAccumulatorVec = std::vector; using TMeanAccumulatorVecVec = std::vector; @@ -604,7 +605,7 @@ BOOST_AUTO_TEST_CASE(testUnseenCategoryEncoding) { maths::CDataFrameCategoryEncoder encoder{{1, *frame, 3}}; - TFloatVec unseen{3.0, 5.0, 4.0, 1.5}; + TFloatVec unseen{3.0f, 5.0f, 4.0f, 1.5f}; core::CDataFrame::TRowRef row{rows, unseen.begin(), unseen.end(), 0}; auto encodedRow = encoder.encode(row); diff --git a/lib/maths/unittest/COutliersTest.cc b/lib/maths/unittest/COutliersTest.cc index 776af37c07..dfb96853a7 100644 --- a/lib/maths/unittest/COutliersTest.cc +++ b/lib/maths/unittest/COutliersTest.cc @@ -4,6 +4,7 @@ * you may not use this file except in compliance with the Elastic License. 
*/ +#include #include #include #include @@ -595,7 +596,7 @@ BOOST_AUTO_TEST_CASE(testEstimateMemoryUsedByCompute) { 0.05}; // Outlier fraction std::int64_t estimatedMemoryUsage( - core::CDataFrame::estimateMemoryUsage(i == 0, 40500, 6) + + core::CDataFrame::estimateMemoryUsage(i == 0, 40500, 6, core::CAlignment::E_Aligned16) + maths::COutliers::estimateMemoryUsedByCompute( params, numberPoints, (numberPoints + numberPartitions[i] - 1) / numberPartitions[i], @@ -624,7 +625,7 @@ BOOST_AUTO_TEST_CASE(testEstimateMemoryUsedByCompute) { LOG_DEBUG(<< "estimated peak memory = " << estimatedMemoryUsage); LOG_DEBUG(<< "high water mark = " << maxMemoryUsage); BOOST_TEST_REQUIRE(std::abs(maxMemoryUsage - estimatedMemoryUsage) < - std::max(maxMemoryUsage.load(), estimatedMemoryUsage) / 10); + std::max(maxMemoryUsage.load(), estimatedMemoryUsage) / 6); } } diff --git a/mk/linux.mk b/mk/linux.mk index f11e436f28..2c6d845536 100644 --- a/mk/linux.mk +++ b/mk/linux.mk @@ -75,7 +75,7 @@ else RAPIDJSONCPPFLAGS=-DRAPIDJSON_HAS_STDSTRING -DRAPIDJSON_SSE42 endif EIGENINCLUDES=-isystem $(CPP_SRC_HOME)/3rd_party/eigen -EIGENCPPFLAGS=-DEIGEN_MPL2_ONLY +EIGENCPPFLAGS=-DEIGEN_MPL2_ONLY -DEIGEN_MAX_ALIGN_BYTES=32 XMLINCLUDES=`/usr/local/gcc75/bin/xml2-config --cflags` XMLLIBS=`/usr/local/gcc75/bin/xml2-config --libs` DYNAMICLIBLDFLAGS=$(PLATPICFLAGS) -shared -Wl,--as-needed -L$(CPP_PLATFORM_HOME)/$(DYNAMIC_LIB_DIR) $(COVERAGE) -Wl,-z,relro -Wl,-z,now -Wl,-rpath,'$$ORIGIN/.' diff --git a/mk/linux_crosscompile_linux.mk b/mk/linux_crosscompile_linux.mk index a32eef7d72..4d34bbaa6a 100644 --- a/mk/linux_crosscompile_linux.mk +++ b/mk/linux_crosscompile_linux.mk @@ -76,7 +76,7 @@ else RAPIDJSONCPPFLAGS=-DRAPIDJSON_HAS_STDSTRING endif EIGENINCLUDES=-isystem $(CPP_SRC_HOME)/3rd_party/eigen -EIGENCPPFLAGS=-DEIGEN_MPL2_ONLY +EIGENCPPFLAGS=-DEIGEN_MPL2_ONLY -DEIGEN_MAX_ALIGN_BYTES=32 XMLINCLUDES=-I$(SYSROOT)/usr/local/gcc75/include/libxml2 XMLLIBS=-L$(SYSROOT)/usr/local/gcc75/lib -lxml2 -lz -lm -ldl DYNAMICLIBLDFLAGS=$(PLATPICFLAGS) -shared -Wl,--as-needed -L$(CPP_PLATFORM_HOME)/$(DYNAMIC_LIB_DIR) $(COVERAGE) -Wl,-z,relro -Wl,-z,now -Wl,-rpath,'$$ORIGIN/.' 
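For context on the -DEIGEN_MAX_ALIGN_BYTES=32 definition being added to every platform makefile: it tells Eigen to align its allocations to 32 bytes, the alignment 256-bit (AVX) loads and stores want, and it is what makes the CAlignedAllocator typedef above hand back 32 byte aligned memory. A small sketch that checks the effect, assuming it is compiled with the same flag:

    #include <Eigen/Core>

    #include <cstdint>
    #include <iostream>

    int main() {
        // Compile-time check that the build flag took effect.
        static_assert(EIGEN_MAX_ALIGN_BYTES == 32, "expected 32 byte max alignment");

        // Eigen::aligned_allocator aligns to Eigen's maximum alignment.
        Eigen::aligned_allocator<float> allocator;
        float* memory{allocator.allocate(100)};
        bool aligned32{(reinterpret_cast<std::uintptr_t>(memory) & 0x1F) == 0};
        std::cout << "32 byte aligned: " << (aligned32 ? "yes" : "no") << '\n';
        allocator.deallocate(memory, 100);

        return 0;
    }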
diff --git a/mk/linux_crosscompile_macosx.mk b/mk/linux_crosscompile_macosx.mk index 1734e8be23..eb79720ea6 100644 --- a/mk/linux_crosscompile_macosx.mk +++ b/mk/linux_crosscompile_macosx.mk @@ -72,7 +72,7 @@ BOOSTTESTLIBS=-lboost_unit_test_framework-clang-darwin$(BOOSTCLANGVER)-mt-x64-$( RAPIDJSONINCLUDES=-isystem $(CPP_SRC_HOME)/3rd_party/rapidjson/include RAPIDJSONCPPFLAGS=-DRAPIDJSON_HAS_STDSTRING -DRAPIDJSON_SSE42 EIGENINCLUDES=-isystem $(CPP_SRC_HOME)/3rd_party/eigen -EIGENCPPFLAGS=-DEIGEN_MPL2_ONLY +EIGENCPPFLAGS=-DEIGEN_MPL2_ONLY -DEIGEN_MAX_ALIGN_BYTES=32 XMLINCLUDES=-isystem $(SYSROOT)/usr/include/libxml2 XMLLIBLDFLAGS=-L$(SYSROOT)/usr/lib XMLLIBS=-lxml2 diff --git a/mk/macosx.mk b/mk/macosx.mk index c39b0de3f7..61933f6116 100644 --- a/mk/macosx.mk +++ b/mk/macosx.mk @@ -62,7 +62,7 @@ BOOSTTESTLIBS=-lboost_unit_test_framework-clang-darwin$(BOOSTCLANGVER)-mt-x64-$( RAPIDJSONINCLUDES=-isystem $(CPP_SRC_HOME)/3rd_party/rapidjson/include RAPIDJSONCPPFLAGS=-DRAPIDJSON_HAS_STDSTRING -DRAPIDJSON_SSE42 EIGENINCLUDES=-isystem $(CPP_SRC_HOME)/3rd_party/eigen -EIGENCPPFLAGS=-DEIGEN_MPL2_ONLY +EIGENCPPFLAGS=-DEIGEN_MPL2_ONLY -DEIGEN_MAX_ALIGN_BYTES=32 XMLINCLUDES=-isystem $(SDK_PATH)/usr/include/libxml2 XMLLIBLDFLAGS=-L/usr/lib XMLLIBS=-lxml2 diff --git a/mk/windows.mk b/mk/windows.mk index d9d55d020d..a1517a2d98 100644 --- a/mk/windows.mk +++ b/mk/windows.mk @@ -94,7 +94,7 @@ RAPIDJSONCPPFLAGS=-DRAPIDJSON_HAS_STDSTRING -DRAPIDJSON_SSE42 # Eigen automatically uses SSE and SSE2 on 64 bit Windows - only the higher # versions need to be explicitly enabled EIGENINCLUDES=-I$(CPP_SRC_HOME)/3rd_party/eigen -EIGENCPPFLAGS=-DEIGEN_MPL2_ONLY -DEIGEN_VECTORIZE_SSE3 -DEIGEN_VECTORIZE_SSE4_1 -DEIGEN_VECTORIZE_SSE4_2 +EIGENCPPFLAGS=-DEIGEN_MPL2_ONLY -DEIGEN_VECTORIZE_SSE3 -DEIGEN_VECTORIZE_SSE4_1 -DEIGEN_VECTORIZE_SSE4_2 -DEIGEN_MAX_ALIGN_BYTES=32 XMLINCLUDES=-I$(LOCAL_DRIVE):/usr/local/include/libxml2 XMLLIBLDFLAGS=-LIBPATH:$(LOCAL_DRIVE):/usr/local/lib XMLLIBS=libxml2.lib
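Tying the pieces together: the extra column layout defined in CBoostedTreeUtils.cc above puts only the gradient block on a 16 byte boundary, because that is the block memory mapped into an Eigen vector for vectorised arithmetic. The offset computation performed by CDataFrame::resizeColumns can be sketched standalone as follows; this is a simplified re-implementation for illustration only, with alignments expressed in units of 4 byte floats and assuming each row starts on a sufficiently aligned address, which the frame's row alignment check guarantees.

    #include <cstddef>
    #include <iostream>
    #include <utility>
    #include <vector>

    int main() {
        // (column count, alignment in floats) mirroring extraColumns(...) for
        // a loss with 2 parameters: prediction, gradient, curvature, weight.
        // E_Aligned16 is 4 floats; E_Unaligned is 1.
        std::vector<std::pair<std::size_t, std::size_t>> extraColumns{
            {2, 1}, {2, 4}, {4, 1}, {1, 1}};

        std::size_t index{15}; // the number of input columns in the frame
        for (const auto& block : extraColumns) {
            // Round the start of each block up to its requested alignment,
            // as CAlignment::roundup does.
            index = (index + block.second - 1) / block.second * block.second;
            std::cout << block.first << " column(s) at offset " << index << '\n';
            index += block.first;
        }
        std::cout << "row resized to " << index << " columns\n";

        return 0;
    }

With two loss parameters this places the prediction at column 15, the gradient at 20, the curvature at 22 and the weight at 26, so a row takes 27 rather than 24 columns: a small padding cost in exchange for reading the gradient with aligned SIMD loads.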