Skip to content

Commit 80a8e99

Browse files
authored
[7.8][ML] Ensure the performance critical data are 16 byte aligned for data frame analyses (#1152)
Backport #1142.
1 parent c63ddef commit 80a8e99

40 files changed

+913
-442
lines changed

docs/CHANGELOG.asciidoc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,8 @@
5252
(See {ml-pull}1126[#1126], issue: {issue}54506[#54506].)
5353
* Added a {ml} native code build for Linux on AArch64. (See {ml-pull}1132[#1132] and
5454
{ml-pull}1135[#1135].)
55+
* Improve data frame analysis runtime by optimising memory alignment for intrinsic
56+
operations. (See {ml-pull}1142[#1142].)
5557

5658
== {es} version 7.7.1
5759

@@ -60,7 +62,6 @@
6062
* Fixed background persistence of categorizer state (See {ml-pull}1137[#1137],
6163
issue: {ml-issue}1136[#1136].)
6264

63-
6465
== {es} version 7.7.0
6566

6667
=== New Features

include/api/CDataFrameTrainBoostedTreeRunner.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ class API_EXPORT CDataFrameTrainBoostedTreeRunner : public CDataFrameAnalysisRun
5252
static const std::string NUM_TOP_FEATURE_IMPORTANCE_VALUES;
5353
static const std::string TRAINING_PERCENT_FIELD_NAME;
5454

55-
//Output
55+
// Output
5656
static const std::string IS_TRAINING_FIELD_NAME;
5757
static const std::string FEATURE_NAME_FIELD_NAME;
5858
static const std::string IMPORTANCE_FIELD_NAME;

include/core/CAlignment.h

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the Elastic License;
4+
* you may not use this file except in compliance with the Elastic License.
5+
*/
6+
7+
#ifndef INCLUDED_ml_core_CAlignment_h
8+
#define INCLUDED_ml_core_CAlignment_h
9+
10+
#include <core/ImportExport.h>
11+
12+
#include <Eigen/Core>
13+
14+
#include <array>
15+
#include <cstddef>
16+
#include <vector>
17+
18+
namespace ml {
19+
namespace core {
20+
21+
class CORE_EXPORT CAlignment {
22+
public:
23+
//! Alignment types.
24+
enum EType {
25+
E_Unaligned = 1,
26+
E_Aligned8 = 8,
27+
E_Aligned16 = 16,
28+
E_Aligned32 = 32
29+
};
30+
31+
//! This is an ordering by inclusion, i.e. \p lhs < \p rhs if an address is
32+
//! \p rhs aligned implies it is lhs aligned but not vice versa.
33+
static bool less(EType lhs, EType rhs) { return bytes(lhs) < bytes(rhs); }
34+
35+
//! Get the alignment of \p address.
36+
template<typename T>
37+
static EType maxAlignment(const T* address) {
38+
// clang-format off
39+
return (isAligned(address, E_Aligned32) ? E_Aligned32 :
40+
(isAligned(address, E_Aligned16) ? E_Aligned16 :
41+
(isAligned(address, E_Aligned8) ? E_Aligned8 :
42+
(E_Unaligned))));
43+
// clang-format on
44+
}
45+
46+
//! Check if \p address has \p alignment.
47+
template<typename T>
48+
static bool isAligned(const T* address, EType alignment) {
49+
return offset(address, alignment) == 0;
50+
}
51+
52+
//! Get the next index in \p buffer which is aligned to \p alignment.
53+
template<typename T, std::size_t N>
54+
static std::size_t
55+
nextAligned(const std::array<T, N>& buffer, std::size_t index, EType alignment) {
56+
std::size_t offset_{offset(&buffer[index], alignment)};
57+
return offset_ == 0 ? index : index + (bytes(alignment) - offset_) / sizeof(T);
58+
}
59+
60+
//! Get the next index in \p buffer which is aligned to \p alignment.
61+
template<typename T>
62+
static std::size_t
63+
nextAligned(const std::vector<T>& buffer, std::size_t index, EType alignment) {
64+
std::size_t offset_{offset(&buffer[index], alignment)};
65+
return offset_ == 0 ? index : index + (bytes(alignment) - offset_) / sizeof(T);
66+
}
67+
68+
//! Round up n items of T so they use a multiple of \p alignment size memory.
69+
template<typename T>
70+
static std::size_t roundup(EType alignment, std::size_t n) {
71+
return roundupSizeof<T>(alignment, n) / sizeof(T);
72+
}
73+
74+
//! Round up sizeof(T) up to multiple of \p alignment bytes.
75+
template<typename T>
76+
static std::size_t roundupSizeof(EType alignment, std::size_t n = 1) {
77+
std::size_t bytes_{bytes(alignment)};
78+
return ((n * sizeof(T) + bytes_ - 1) / bytes_) * bytes_;
79+
}
80+
81+
//! Print the type.
82+
static std::string print(EType type) {
83+
switch (type) {
84+
case E_Unaligned:
85+
return "unaligned";
86+
case E_Aligned8:
87+
return "aligned 8";
88+
case E_Aligned16:
89+
return "aligned 16";
90+
case E_Aligned32:
91+
return "aligned 32";
92+
}
93+
return "";
94+
}
95+
96+
private:
97+
template<typename T>
98+
static std::size_t offset(const T* address, EType alignment) {
99+
return reinterpret_cast<std::size_t>(address) & mask(alignment);
100+
}
101+
102+
static std::size_t mask(EType alignment) { return bytes(alignment) - 1; }
103+
104+
static std::size_t bytes(EType alignment) {
105+
return static_cast<std::size_t>(alignment);
106+
}
107+
};
108+
109+
template<typename T>
110+
using CAlignedAllocator = Eigen::aligned_allocator<T>;
111+
}
112+
}
113+
114+
#endif // INCLUDED_ml_core_CAlignment_h

include/core/CDataFrame.h

Lines changed: 49 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#ifndef INCLUDED_ml_core_CDataFrame_h
88
#define INCLUDED_ml_core_CDataFrame_h
99

10+
#include <core/CAlignment.h>
1011
#include <core/CFloatStorage.h>
1112
#include <core/CPackedBitVector.h>
1213
#include <core/CVectorRange.h>
@@ -32,7 +33,7 @@ class CTemporaryDirectory;
3233

3334
namespace data_frame_detail {
3435

35-
using TFloatVec = std::vector<CFloatStorage>;
36+
using TFloatVec = std::vector<CFloatStorage, CAlignedAllocator<CFloatStorage>>;
3637
using TFloatVecItr = TFloatVec::iterator;
3738
using TInt32Vec = std::vector<std::int32_t>;
3839
using TInt32VecCItr = TInt32Vec::const_iterator;
@@ -178,47 +179,54 @@ class CORE_EXPORT CRowIterator final
178179
//! parallelized in which case each reader reads a disjoint subset of the data
179180
//! frame's rows.
180181
//!
181-
//! Space can be reserved at any point to hold one or more additional columns.
182-
//! These are not visible until they are written.
182+
//! Space can be reserved for additional rows and the data frame can be resized
183+
//! to hold one or more additional columns. Resizing is a heavyweight operation
184+
//! and should be minimized.
183185
//!
184186
//! IMPLEMENTATION:\n
185187
//! This is a fairly lightweight container which is essentially responsible
186188
//! for managing the read and write process to some underlying store format.
187189
//! The store format is determined by the user implementing functionality to
188190
//! read and write state from the store. For example, these could copy to /
189191
//! from main memory, "write to" / "read from" disk, etc. A factory function
190-
//! must be provided to the constructor which effectively that determines the
191-
//! type of storage used. It is assumed that copying this has no side effects.
192+
//! for new chunks of storage must be provided to the constructor and this
193+
//! effectively determines the type of storage used. It is assumed that copying
194+
//! this function has no side effects.
192195
//!
193196
//! The data frame is divided into slices each of which represent a number of
194197
//! contiguous rows. The idea is that they contain a reasonable amount of memory
195198
//! so that, for example, they significantly reduce the number of "writes to" /
196199
//! "reads from" disk (a whole slice being written or read in one go), mean we'll
197200
//! get good locality of reference and mean there is minimal book keeping overhead
198201
//! (such as state for vector sizes, pointers to starts of memory blocks, etc).
199-
//! In addition, it is assumed that access to the individual slices is thread
200-
//! safe. If they share state the implementation must ensure that access to this
201-
//! is synchronized.
202+
//! It is possible to choose an alignment for each row in which case the address
203+
//! of the start of each row is 8, 16, etc byte aligned. This comes with a memory
204+
//! overhead as row sizes are then rounded up to the nearest multiple of the
205+
//! alignment size. Finally, note that it is assumed that access to the individual
206+
//! slices is thread safe. If they share state the implementation must ensure that
207+
//! access to this is synchronized.
202208
//!
203-
//! Reads and writes of a single row are also done via call backs supplied to the
209+
//! Reads and writes of a single row are done via call backs supplied to the
204210
//! readRows and writeRow functions. This is to achieve maximum decoupling from
205211
//! the calling code for how the underlying values are used or where they come
206212
//! from. It also means certain operations can be done very efficiently. For example,
207213
//! a stream can be attached to a row writer function to copy the values directly
208-
//! into the data frame storage.
214+
//! into the data frame storage with no marshalling costs.
209215
//!
210-
//! Read and writes to storage can optionally happen in a separate thread to the
211-
//! row reading and writing to deal with the case that these operations can by
212-
//! time consuming.
216+
//! Read from and writes to storage can optionally happen in a separate thread
217+
//! to the row reading and writing to deal with the case that these operations
218+
//! can by time consuming.
213219
class CORE_EXPORT CDataFrame final {
214220
public:
215221
using TBoolVec = std::vector<bool>;
222+
using TSizeVec = std::vector<std::size_t>;
216223
using TStrVec = std::vector<std::string>;
217224
using TStrVecVec = std::vector<TStrVec>;
218225
using TStrCRng = CVectorRange<const TStrVec>;
219-
using TFloatVec = std::vector<CFloatStorage>;
226+
using TFloatVec = std::vector<CFloatStorage, CAlignedAllocator<CFloatStorage>>;
220227
using TFloatVecItr = TFloatVec::iterator;
221228
using TInt32Vec = std::vector<std::int32_t>;
229+
using TSizeAlignmentPrVec = std::vector<std::pair<std::size_t, CAlignment::EType>>;
222230
using TRowRef = data_frame_detail::CRowRef;
223231
using TRowItr = data_frame_detail::CRowIterator;
224232
using TRowFunc = std::function<void(TRowItr, TRowItr)>;
@@ -245,6 +253,7 @@ class CORE_EXPORT CDataFrame final {
245253
public:
246254
//! \param[in] inMainMemory True if the data frame is stored in main memory.
247255
//! \param[in] numberColumns The number of columns in the data frame.
256+
//! \param[in] rowAlignment The alignment to use for the start of each row.
248257
//! \param[in] sliceCapacityInRows The capacity of a slice of the data frame
249258
//! as a number of rows.
250259
//! \param[in] readAndWriteToStoreSyncStrategy Controls whether reads and
@@ -256,13 +265,15 @@ class CORE_EXPORT CDataFrame final {
256265
//! the implementers responsibility to ensure these conditions are satisfied.
257266
CDataFrame(bool inMainMemory,
258267
std::size_t numberColumns,
268+
CAlignment::EType rowAlignment,
259269
std::size_t sliceCapacityInRows,
260270
EReadWriteToStorage readAndWriteToStoreSyncStrategy,
261271
const TWriteSliceToStoreFunc& writeSliceToStore);
262272

263273
//! Overload which manages the setting of slice capacity to a sensible default.
264274
CDataFrame(bool inMainMemory,
265275
std::size_t numberColumns,
276+
CAlignment::EType rowAlignment,
266277
EReadWriteToStorage readAndWriteToStoreSyncStrategy,
267278
const TWriteSliceToStoreFunc& writeSliceToStore);
268279

@@ -297,6 +308,18 @@ class CORE_EXPORT CDataFrame final {
297308
//! \param[in] numberColumns The desired number of columns.
298309
void resizeColumns(std::size_t numberThreads, std::size_t numberColumns);
299310

311+
//! Resize to contain \p extraColumns columns.
312+
//!
313+
//! These are split up into blocks of columns with their required alignment.
314+
//! Pads are automatically inserted for alignment and a vector of the start
315+
//! position of each block of columns is returned.
316+
//!
317+
//! \param[in] numberThreads The target number of threads to use.
318+
//! \param[in] extraColumns The desired additional columns.
319+
//! \return The index of each (block of) columns in \p extraColumns.
320+
//! \warning This only supports alignments less than or equal the row alignment.
321+
TSizeVec resizeColumns(std::size_t numberThreads, const TSizeAlignmentPrVec& extraColumns);
322+
300323
//! This reads rows using one or more readers.
301324
//!
302325
//! One reader is bound to one thread. Each thread reads a disjoint subset
@@ -351,7 +374,7 @@ class CORE_EXPORT CDataFrame final {
351374
std::vector<READER> readers;
352375
readers.reserve(result.first.size());
353376
for (auto& reader_ : result.first) {
354-
readers.push_back(std::move(*reader_.target<READER>()));
377+
readers.emplace_back(std::move(*reader_.target<READER>()));
355378
}
356379

357380
return {std::move(readers), result.second};
@@ -412,7 +435,7 @@ class CORE_EXPORT CDataFrame final {
412435
std::vector<WRITER> writers;
413436
writers.reserve(result.first.size());
414437
for (auto& writer_ : result.first) {
415-
writers.push_back(std::move(*writer_.target<WRITER>()));
438+
writers.emplace_back(std::move(*writer_.target<WRITER>()));
416439
}
417440

418441
return {std::move(writers), result.second};
@@ -485,7 +508,8 @@ class CORE_EXPORT CDataFrame final {
485508
//! \p numberColumns columns.
486509
static std::size_t estimateMemoryUsage(bool inMainMemory,
487510
std::size_t numberRows,
488-
std::size_t numberColumns);
511+
std::size_t numberColumns,
512+
CAlignment::EType alignment);
489513

490514
//! Get the value to use for a missing element in a data frame.
491515
static constexpr double valueOfMissing() {
@@ -568,6 +592,8 @@ class CORE_EXPORT CDataFrame final {
568592
std::size_t m_RowCapacity;
569593
//! The capacity of a slice of the data frame as a number of rows.
570594
std::size_t m_SliceCapacityInRows;
595+
//! The start of row memory alignment.
596+
core::CAlignment::EType m_RowAlignment;
571597

572598
//! If true read and write asynchronously to storage.
573599
EReadWriteToStorage m_ReadAndWriteToStoreSyncStrategy;
@@ -610,12 +636,14 @@ class CORE_EXPORT CDataFrame final {
610636
//! capacity in rows.
611637
//! \param[in] readWriteToStoreSyncStrategy Controls whether reads and writes
612638
//! from slice storage are synchronous or asynchronous.
639+
//! \param[in] alignment The alignment to use for the start of each row.
613640
CORE_EXPORT
614641
std::pair<std::unique_ptr<CDataFrame>, std::shared_ptr<CTemporaryDirectory>>
615642
makeMainStorageDataFrame(std::size_t numberColumns,
616643
boost::optional<std::size_t> sliceCapacity = boost::none,
617644
CDataFrame::EReadWriteToStorage readWriteToStoreSyncStrategy =
618-
CDataFrame::EReadWriteToStorage::E_Sync);
645+
CDataFrame::EReadWriteToStorage::E_Sync,
646+
CAlignment::EType alignment = CAlignment::E_Aligned16);
619647

620648
//! Make a data frame which uses disk storage for its slices.
621649
//!
@@ -627,14 +655,16 @@ makeMainStorageDataFrame(std::size_t numberColumns,
627655
//! capacity in rows.
628656
//! \param[in] readWriteToStoreSyncStrategy Controls whether reads and writes
629657
//! from slice storage are synchronous or asynchronous.
658+
//! \param[in] alignment The alignment to use for the start of each row.
630659
CORE_EXPORT
631660
std::pair<std::unique_ptr<CDataFrame>, std::shared_ptr<CTemporaryDirectory>>
632661
makeDiskStorageDataFrame(const std::string& rootDirectory,
633662
std::size_t numberColumns,
634663
std::size_t numberRows,
635664
boost::optional<std::size_t> sliceCapacity = boost::none,
636665
CDataFrame::EReadWriteToStorage readWriteToStoreSyncStrategy =
637-
CDataFrame::EReadWriteToStorage::E_Async);
666+
CDataFrame::EReadWriteToStorage::E_Async,
667+
CAlignment::EType alignment = CAlignment::E_Aligned16);
638668
}
639669
}
640670

include/core/CDataFrameRowSlice.h

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#ifndef INCLUDED_ml_core_CDataFrameRowSlice_h
88
#define INCLUDED_ml_core_CDataFrameRowSlice_h
99

10+
#include <core/CAlignment.h>
1011
#include <core/CFloatStorage.h>
1112
#include <core/CompressUtils.h>
1213
#include <core/ImportExport.h>
@@ -24,7 +25,7 @@ namespace data_frame_row_slice_detail {
2425
//! \brief The implementation backing a data frame row slice handle.
2526
class CORE_EXPORT CDataFrameRowSliceHandleImpl {
2627
public:
27-
using TFloatVec = std::vector<CFloatStorage>;
28+
using TFloatVec = std::vector<CFloatStorage, CAlignedAllocator<CFloatStorage>>;
2829
using TInt32Vec = std::vector<std::int32_t>;
2930
using TImplPtr = std::unique_ptr<CDataFrameRowSliceHandleImpl>;
3031

@@ -42,7 +43,7 @@ class CORE_EXPORT CDataFrameRowSliceHandleImpl {
4243
//! CDataFrame storage.
4344
class CORE_EXPORT CDataFrameRowSliceHandle {
4445
public:
45-
using TFloatVec = std::vector<CFloatStorage>;
46+
using TFloatVec = std::vector<CFloatStorage, CAlignedAllocator<CFloatStorage>>;
4647
using TFloatVecItr = TFloatVec::iterator;
4748
using TInt32Vec = std::vector<std::int32_t>;
4849
using TInt32VecCItr = TInt32Vec::const_iterator;
@@ -83,7 +84,7 @@ class CORE_EXPORT CDataFrameRowSliceHandle {
8384
//! \brief CDataFrame slice storage interface.
8485
class CORE_EXPORT CDataFrameRowSlice {
8586
public:
86-
using TFloatVec = std::vector<CFloatStorage>;
87+
using TFloatVec = std::vector<CFloatStorage, CAlignedAllocator<CFloatStorage>>;
8788
using TInt32Vec = std::vector<std::int32_t>;
8889

8990
public:

include/maths/CBoostedTreeImpl.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -102,8 +102,8 @@ class MATHS_EXPORT CBoostedTreeImpl final {
102102
//! Get the column containing the dependent variable.
103103
std::size_t columnHoldingDependentVariable() const;
104104

105-
//! Get the number of columns in the original data frame.
106-
std::size_t numberInputColumns() const;
105+
//! Get start indices of the extra columns.
106+
const TSizeVec& extraColumns() const;
107107

108108
//! Get the weights to apply to each class's predicted probability when
109109
//! assigning classes.
@@ -303,7 +303,7 @@ class MATHS_EXPORT CBoostedTreeImpl final {
303303
mutable CPRNG::CXorOShiro128Plus m_Rng;
304304
std::size_t m_NumberThreads;
305305
std::size_t m_DependentVariable = std::numeric_limits<std::size_t>::max();
306-
std::size_t m_NumberInputColumns = 0;
306+
TSizeVec m_ExtraColumns;
307307
TLossFunctionUPtr m_Loss;
308308
CBoostedTree::EClassAssignmentObjective m_ClassAssignmentObjective =
309309
CBoostedTree::E_MinimumRecall;

0 commit comments

Comments
 (0)