7
7
#ifndef INCLUDED_ml_core_CDataFrame_h
8
8
#define INCLUDED_ml_core_CDataFrame_h
9
9
10
+ #include < core/CAlignment.h>
10
11
#include < core/CFloatStorage.h>
11
12
#include < core/CPackedBitVector.h>
12
13
#include < core/CVectorRange.h>
@@ -32,7 +33,7 @@ class CTemporaryDirectory;
32
33
33
34
namespace data_frame_detail {
34
35
35
- using TFloatVec = std::vector<CFloatStorage>;
36
+ using TFloatVec = std::vector<CFloatStorage, CAlignedAllocator<CFloatStorage> >;
36
37
using TFloatVecItr = TFloatVec::iterator;
37
38
using TInt32Vec = std::vector<std::int32_t >;
38
39
using TInt32VecCItr = TInt32Vec::const_iterator;
@@ -178,47 +179,54 @@ class CORE_EXPORT CRowIterator final
178
179
// ! parallelized in which case each reader reads a disjoint subset of the data
179
180
// ! frame's rows.
180
181
// !
181
- // ! Space can be reserved at any point to hold one or more additional columns.
182
- // ! These are not visible until they are written.
182
+ // ! Space can be reserved for additional rows and the data frame can be resized
183
+ // ! to hold one or more additional columns. Resizing is a heavyweight operation
184
+ // ! and should be minimized.
183
185
// !
184
186
// ! IMPLEMENTATION:\n
185
187
// ! This is a fairly lightweight container which is essentially responsible
186
188
// ! for managing the read and write process to some underlying store format.
187
189
// ! The store format is determined by the user implementing functionality to
188
190
// ! read and write state from the store. For example, these could copy to /
189
191
// ! from main memory, "write to" / "read from" disk, etc. A factory function
190
- // ! must be provided to the constructor which effectively that determines the
191
- // ! type of storage used. It is assumed that copying this has no side effects.
192
+ // ! for new chunks of storage must be provided to the constructor and this
193
+ // ! effectively determines the type of storage used. It is assumed that copying
194
+ // ! this function has no side effects.
192
195
// !
193
196
// ! The data frame is divided into slices each of which represents a number of
194
197
// ! contiguous rows. The idea is that they contain a reasonable amount of memory
195
198
// ! so that, for example, they significantly reduce the number of "writes to" /
196
199
// ! "reads from" disk (a whole slice being written or read in one go), mean we'll
197
200
// ! get good locality of reference and mean there is minimal book keeping overhead
198
201
// ! (such as state for vector sizes, pointers to starts of memory blocks, etc).
199
- // ! In addition, it is assumed that access to the individual slices is thread
200
- // ! safe. If they share state the implementation must ensure that access to this
201
- // ! is synchronized.
202
+ // ! It is possible to choose an alignment for each row in which case the address
203
+ // ! of the start of each row is 8, 16, etc byte aligned. This comes with a memory
204
+ // ! overhead as row sizes are then rounded up to the nearest multiple of the
205
+ // ! alignment size. Finally, note that it is assumed that access to the individual
206
+ // ! slices is thread safe. If they share state the implementation must ensure that
207
+ // ! access to this is synchronized.
202
208
// !
203
- // ! Reads and writes of a single row are also done via call backs supplied to the
209
+ // ! Reads and writes of a single row are done via call backs supplied to the
204
210
// ! readRows and writeRow functions. This is to achieve maximum decoupling from
205
211
// ! the calling code for how the underlying values are used or where they come
206
212
// ! from. It also means certain operations can be done very efficiently. For example,
207
213
// ! a stream can be attached to a row writer function to copy the values directly
208
- // ! into the data frame storage.
214
+ // ! into the data frame storage with no marshalling costs.
209
215
// !
210
- // ! Read and writes to storage can optionally happen in a separate thread to the
211
- // ! row reading and writing to deal with the case that these operations can by
212
- // ! time consuming.
216
+ // ! Reads from and writes to storage can optionally happen in a separate thread
217
+ // ! to the row reading and writing to deal with the case that these operations
218
+ // ! can be time consuming.
213
219
class CORE_EXPORT CDataFrame final {
214
220
public:
215
221
using TBoolVec = std::vector<bool >;
222
+ using TSizeVec = std::vector<std::size_t >;
216
223
using TStrVec = std::vector<std::string>;
217
224
using TStrVecVec = std::vector<TStrVec>;
218
225
using TStrCRng = CVectorRange<const TStrVec>;
219
- using TFloatVec = std::vector<CFloatStorage>;
226
+ using TFloatVec = std::vector<CFloatStorage, CAlignedAllocator<CFloatStorage> >;
220
227
using TFloatVecItr = TFloatVec::iterator;
221
228
using TInt32Vec = std::vector<std::int32_t >;
229
+ using TSizeAlignmentPrVec = std::vector<std::pair<std::size_t , CAlignment::EType>>;
222
230
using TRowRef = data_frame_detail::CRowRef;
223
231
using TRowItr = data_frame_detail::CRowIterator;
224
232
using TRowFunc = std::function<void (TRowItr, TRowItr)>;
@@ -245,6 +253,7 @@ class CORE_EXPORT CDataFrame final {
245
253
public:
246
254
// ! \param[in] inMainMemory True if the data frame is stored in main memory.
247
255
// ! \param[in] numberColumns The number of columns in the data frame.
256
+ // ! \param[in] rowAlignment The alignment to use for the start of each row.
248
257
// ! \param[in] sliceCapacityInRows The capacity of a slice of the data frame
249
258
// ! as a number of rows.
250
259
// ! \param[in] readAndWriteToStoreSyncStrategy Controls whether reads and
@@ -256,13 +265,15 @@ class CORE_EXPORT CDataFrame final {
256
265
// ! the implementer's responsibility to ensure these conditions are satisfied.
257
266
CDataFrame (bool inMainMemory,
258
267
std::size_t numberColumns,
268
+ CAlignment::EType rowAlignment,
259
269
std::size_t sliceCapacityInRows,
260
270
EReadWriteToStorage readAndWriteToStoreSyncStrategy,
261
271
const TWriteSliceToStoreFunc& writeSliceToStore);
262
272
263
273
// ! Overload which manages the setting of slice capacity to a sensible default.
264
274
CDataFrame (bool inMainMemory,
265
275
std::size_t numberColumns,
276
+ CAlignment::EType rowAlignment,
266
277
EReadWriteToStorage readAndWriteToStoreSyncStrategy,
267
278
const TWriteSliceToStoreFunc& writeSliceToStore);
268
279
@@ -297,6 +308,18 @@ class CORE_EXPORT CDataFrame final {
297
308
// ! \param[in] numberColumns The desired number of columns.
298
309
void resizeColumns (std::size_t numberThreads, std::size_t numberColumns);
299
310
311
+ // ! Resize to contain the additional columns described by \p extraColumns.
312
+ // !
313
+ // ! These are split up into blocks of columns with their required alignment.
314
+ // ! Pads are automatically inserted for alignment and a vector of the start
315
+ // ! position of each block of columns is returned.
316
+ // !
317
+ // ! \param[in] numberThreads The target number of threads to use.
318
+ // ! \param[in] extraColumns The desired additional columns.
319
+ // ! \return The index of each (block of) columns in \p extraColumns.
320
+ // ! \warning This only supports alignments less than or equal to the row alignment.
321
+ TSizeVec resizeColumns (std::size_t numberThreads, const TSizeAlignmentPrVec& extraColumns);
322
+
300
323
// ! This reads rows using one or more readers.
301
324
// !
302
325
// ! One reader is bound to one thread. Each thread reads a disjoint subset
@@ -351,7 +374,7 @@ class CORE_EXPORT CDataFrame final {
351
374
std::vector<READER> readers;
352
375
readers.reserve (result.first .size ());
353
376
for (auto & reader_ : result.first ) {
354
- readers.push_back (std::move (*reader_.target <READER>()));
377
+ readers.emplace_back (std::move (*reader_.target <READER>()));
355
378
}
356
379
357
380
return {std::move (readers), result.second };
@@ -412,7 +435,7 @@ class CORE_EXPORT CDataFrame final {
412
435
std::vector<WRITER> writers;
413
436
writers.reserve (result.first .size ());
414
437
for (auto & writer_ : result.first ) {
415
- writers.push_back (std::move (*writer_.target <WRITER>()));
438
+ writers.emplace_back (std::move (*writer_.target <WRITER>()));
416
439
}
417
440
418
441
return {std::move (writers), result.second };
@@ -485,7 +508,8 @@ class CORE_EXPORT CDataFrame final {
485
508
// ! \p numberColumns columns.
486
509
static std::size_t estimateMemoryUsage (bool inMainMemory,
487
510
std::size_t numberRows,
488
- std::size_t numberColumns);
511
+ std::size_t numberColumns,
512
+ CAlignment::EType alignment);
489
513
490
514
// ! Get the value to use for a missing element in a data frame.
491
515
static constexpr double valueOfMissing () {
@@ -568,6 +592,8 @@ class CORE_EXPORT CDataFrame final {
568
592
std::size_t m_RowCapacity;
569
593
// ! The capacity of a slice of the data frame as a number of rows.
570
594
std::size_t m_SliceCapacityInRows;
595
+ // ! The memory alignment of the start of each row.
596
+ core::CAlignment::EType m_RowAlignment;
571
597
572
598
// ! If true read and write asynchronously to storage.
573
599
EReadWriteToStorage m_ReadAndWriteToStoreSyncStrategy;
@@ -610,12 +636,14 @@ class CORE_EXPORT CDataFrame final {
610
636
// ! capacity in rows.
611
637
// ! \param[in] readWriteToStoreSyncStrategy Controls whether reads and writes
612
638
// ! from slice storage are synchronous or asynchronous.
639
+ // ! \param[in] alignment The alignment to use for the start of each row.
613
640
CORE_EXPORT
614
641
std::pair<std::unique_ptr<CDataFrame>, std::shared_ptr<CTemporaryDirectory>>
615
642
makeMainStorageDataFrame (std::size_t numberColumns,
616
643
boost::optional<std::size_t > sliceCapacity = boost::none,
617
644
CDataFrame::EReadWriteToStorage readWriteToStoreSyncStrategy =
618
- CDataFrame::EReadWriteToStorage::E_Sync);
645
+ CDataFrame::EReadWriteToStorage::E_Sync,
646
+ CAlignment::EType alignment = CAlignment::E_Aligned16);
619
647
620
648
// ! Make a data frame which uses disk storage for its slices.
621
649
// !
@@ -627,14 +655,16 @@ makeMainStorageDataFrame(std::size_t numberColumns,
627
655
// ! capacity in rows.
628
656
// ! \param[in] readWriteToStoreSyncStrategy Controls whether reads and writes
629
657
// ! from slice storage are synchronous or asynchronous.
658
+ // ! \param[in] alignment The alignment to use for the start of each row.
630
659
CORE_EXPORT
631
660
std::pair<std::unique_ptr<CDataFrame>, std::shared_ptr<CTemporaryDirectory>>
632
661
makeDiskStorageDataFrame (const std::string& rootDirectory,
633
662
std::size_t numberColumns,
634
663
std::size_t numberRows,
635
664
boost::optional<std::size_t > sliceCapacity = boost::none,
636
665
CDataFrame::EReadWriteToStorage readWriteToStoreSyncStrategy =
637
- CDataFrame::EReadWriteToStorage::E_Async);
666
+ CDataFrame::EReadWriteToStorage::E_Async,
667
+ CAlignment::EType alignment = CAlignment::E_Aligned16);
638
668
}
639
669
}
640
670
0 commit comments