diff --git a/docs/CHANGELOG.asciidoc b/docs/CHANGELOG.asciidoc
index 9ffe4000d1..e760eb5878 100644
--- a/docs/CHANGELOG.asciidoc
+++ b/docs/CHANGELOG.asciidoc
@@ -58,6 +58,8 @@
 (See {ml-pull}1126[#1126], issue: {issue}54506[#54506].)
 * Added a {ml} native code build for Linux on AArch64. (See {ml-pull}1132[#1132] and
 {ml-pull}1135[#1135].)
+* Improve data frame analysis runtime by optimising memory alignment for intrinsic
+  operations. (See {ml-pull}1142[#1142].)
 
 == {es} version 7.7.1
 
@@ -66,7 +68,6 @@
 * Fixed background persistence of categorizer state (See {ml-pull}1137[#1137],
 issue: {ml-issue}1136[#1136].)
 
-
 == {es} version 7.7.0
 
 === New Features

diff --git a/include/api/CDataFrameTrainBoostedTreeRunner.h b/include/api/CDataFrameTrainBoostedTreeRunner.h
index fc20903f74..1dff2b5178 100644
--- a/include/api/CDataFrameTrainBoostedTreeRunner.h
+++ b/include/api/CDataFrameTrainBoostedTreeRunner.h
@@ -52,7 +52,7 @@ class API_EXPORT CDataFrameTrainBoostedTreeRunner : public CDataFrameAnalysisRun
     static const std::string NUM_TOP_FEATURE_IMPORTANCE_VALUES;
     static const std::string TRAINING_PERCENT_FIELD_NAME;
 
-    //Output
+    // Output
     static const std::string IS_TRAINING_FIELD_NAME;
     static const std::string FEATURE_NAME_FIELD_NAME;
     static const std::string IMPORTANCE_FIELD_NAME;
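The changelog entry above credits the speedup to aligning the memory used by vectorised (SIMD/intrinsic) operations. As a standalone illustration of that mechanism, and not code from this patch, the sketch below maps one buffer both with and without Eigen's 16-byte alignment promise; only the aligned map lets Eigen emit aligned vector loads. The buffer and sizes are hypothetical.

#include <Eigen/Core>

#include <iostream>
#include <vector>

int main() {
    // Eigen's aligned_allocator guarantees 16-byte aligned storage.
    std::vector<float, Eigen::aligned_allocator<float>> storage(8, 1.0F);

    // An unaligned map makes no promise about storage.data().
    Eigen::Map<const Eigen::VectorXf> unaligned(storage.data(), 8);

    // An aligned map promises storage.data() is 16-byte aligned, which
    // enables aligned packet loads in the generated code.
    Eigen::Map<const Eigen::VectorXf, Eigen::Aligned16> aligned(storage.data(), 8);

    std::cout << unaligned.sum() << " == " << aligned.sum() << '\n';
    return 0;
}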
diff --git a/include/core/CAlignment.h b/include/core/CAlignment.h
new file mode 100644
index 0000000000..bedde0e994
--- /dev/null
+++ b/include/core/CAlignment.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+
+#ifndef INCLUDED_ml_core_CAlignment_h
+#define INCLUDED_ml_core_CAlignment_h
+
+#include <core/ImportExport.h>
+
+#include <Eigen/Core>
+
+#include <array>
+#include <string>
+#include <vector>
+
+namespace ml {
+namespace core {
+
+class CORE_EXPORT CAlignment {
+public:
+    //! Alignment types.
+    enum EType {
+        E_Unaligned = 1,
+        E_Aligned8 = 8,
+        E_Aligned16 = 16,
+        E_Aligned32 = 32
+    };
+
+    //! This is an ordering by inclusion, i.e. \p lhs < \p rhs if an address
+    //! being \p rhs aligned implies that it is \p lhs aligned but not vice versa.
+    static bool less(EType lhs, EType rhs) { return bytes(lhs) < bytes(rhs); }
+
+    //! Get the alignment of \p address.
+    template<typename T>
+    static EType maxAlignment(const T* address) {
+        // clang-format off
+        return (isAligned(address, E_Aligned32) ? E_Aligned32 :
+               (isAligned(address, E_Aligned16) ? E_Aligned16 :
+               (isAligned(address, E_Aligned8)  ? E_Aligned8  :
+               (E_Unaligned))));
+        // clang-format on
+    }
+
+    //! Check if \p address has \p alignment.
+    template<typename T>
+    static bool isAligned(const T* address, EType alignment) {
+        return offset(address, alignment) == 0;
+    }
+
+    //! Get the next index in \p buffer which is aligned to \p alignment.
+    template<typename T, std::size_t N>
+    static std::size_t
+    nextAligned(const std::array<T, N>& buffer, std::size_t index, EType alignment) {
+        std::size_t offset_{offset(&buffer[index], alignment)};
+        return offset_ == 0 ? index : index + (bytes(alignment) - offset_) / sizeof(T);
+    }
+
+    //! Get the next index in \p buffer which is aligned to \p alignment.
+    template<typename T>
+    static std::size_t
+    nextAligned(const std::vector<T>& buffer, std::size_t index, EType alignment) {
+        std::size_t offset_{offset(&buffer[index], alignment)};
+        return offset_ == 0 ? index : index + (bytes(alignment) - offset_) / sizeof(T);
+    }
+
+    //! Round up \p n items of T so they occupy a multiple of \p alignment bytes.
+    template<typename T>
+    static std::size_t roundup(EType alignment, std::size_t n) {
+        return roundupSizeof<T>(alignment, n) / sizeof(T);
+    }
+
+    //! Round up \p n times sizeof(T) to a multiple of \p alignment bytes.
+    template<typename T>
+    static std::size_t roundupSizeof(EType alignment, std::size_t n = 1) {
+        std::size_t bytes_{bytes(alignment)};
+        return ((n * sizeof(T) + bytes_ - 1) / bytes_) * bytes_;
+    }
+
+    //! Print the type.
+    static std::string print(EType type) {
+        switch (type) {
+        case E_Unaligned:
+            return "unaligned";
+        case E_Aligned8:
+            return "aligned 8";
+        case E_Aligned16:
+            return "aligned 16";
+        case E_Aligned32:
+            return "aligned 32";
+        }
+        return "";
+    }
+
+private:
+    template<typename T>
+    static std::size_t offset(const T* address, EType alignment) {
+        return reinterpret_cast<std::size_t>(address) & mask(alignment);
+    }
+
+    static std::size_t mask(EType alignment) { return bytes(alignment) - 1; }
+
+    static std::size_t bytes(EType alignment) {
+        return static_cast<std::size_t>(alignment);
+    }
+};
+
+template<typename T>
+using CAlignedAllocator = Eigen::aligned_allocator<T>;
+}
+}
+
+#endif // INCLUDED_ml_core_CAlignment_h
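Before moving on to its first consumer, here is a minimal usage sketch for CAlignment (standalone and hypothetical, not part of the patch). It pads a five-column row of floats to a 16-byte boundary and finds the next aligned index in an aligned buffer, which is how CDataFrame uses these helpers below.

#include <core/CAlignment.h>

#include <array>
#include <iostream>

int main() {
    using ml::core::CAlignment;

    // 5 floats = 20 bytes, which rounds up to 32 bytes = 8 floats, so each
    // row carries 3 floats of padding.
    std::size_t rowCapacity{CAlignment::roundup<float>(CAlignment::E_Aligned16, 5)};
    std::cout << "row capacity = " << rowCapacity << '\n'; // prints 8

    // Find the first 16-byte aligned element at or after index 3: element 3
    // sits at byte offset 12, so the next aligned index is 4.
    alignas(16) std::array<float, 16> buffer{};
    std::size_t aligned{
        CAlignment::nextAligned(buffer, 3, CAlignment::E_Aligned16)};
    std::cout << "next aligned index = " << aligned << '\n'; // prints 4
    return 0;
}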
diff --git a/include/core/CDataFrame.h b/include/core/CDataFrame.h
index 44f54aafac..a9effe0135 100644
--- a/include/core/CDataFrame.h
+++ b/include/core/CDataFrame.h
@@ -7,6 +7,7 @@
 #ifndef INCLUDED_ml_core_CDataFrame_h
 #define INCLUDED_ml_core_CDataFrame_h
 
+#include <core/CAlignment.h>
 #include <core/CFloatStorage.h>
 #include <core/CVectorRange.h>
 #include <core/ImportExport.h>
@@ -32,7 +33,7 @@ class CTemporaryDirectory;
 
 namespace data_frame_detail {
 
-using TFloatVec = std::vector<CFloatStorage>;
+using TFloatVec = std::vector<CFloatStorage, CAlignedAllocator<CFloatStorage>>;
 using TFloatVecItr = TFloatVec::iterator;
 using TInt32Vec = std::vector<std::int32_t>;
 using TInt32VecCItr = TInt32Vec::const_iterator;
@@ -178,8 +179,9 @@ class CORE_EXPORT CRowIterator final
 //! parallelized in which case each reader reads a disjoint subset of the data
 //! frame's rows.
 //!
-//! Space can be reserved at any point to hold one or more additional columns.
-//! These are not visible until they are written.
+//! Space can be reserved for additional rows and the data frame can be resized
+//! to hold one or more additional columns. Resizing is a heavyweight operation
+//! and should be minimized.
 //!
 //! IMPLEMENTATION:\n
 //! This is a fairly lightweight container which is essentially responsible
@@ -187,8 +189,9 @@ class CORE_EXPORT CRowIterator final
 //! The store format is determined by the user implementing functionality to
 //! read and write state from the store. For example, these could copy to /
 //! from main memory, "write to" / "read from" disk, etc. A factory function
-//! must be provided to the constructor which effectively that determines the
-//! type of storage used. It is assumed that copying this has no side effects.
+//! for new chunks of storage must be provided to the constructor and this
+//! effectively determines the type of storage used. It is assumed that copying
+//! this function has no side effects.
@@ -196,29 +199,34 @@ class CORE_EXPORT CRowIterator final
 //! The data frame is divided into slices each of which represent a number of
 //! contiguous rows. The idea is that they contain a reasonable amount of memory
 //! so that slice storage which "writes to" / "reads from" disk (a whole slice
 //! being written or read in one go) gets good locality of reference and there
 //! is minimal book keeping overhead (such as state for vector sizes, pointers
 //! to starts of memory blocks, etc).
-//! In addition, it is assumed that access to the individual slices is thread
-//! safe. If they share state the implementation must ensure that access to this
-//! is synchronized.
+//! It is possible to choose an alignment for each row in which case the address
+//! of the start of each row is 8, 16, etc byte aligned. This comes with a memory
+//! overhead as row sizes are then rounded up to the nearest multiple of the
+//! alignment size. Finally, note that it is assumed that access to the individual
+//! slices is thread safe. If they share state the implementation must ensure that
+//! access to this is synchronized.
 //!
-//! Reads and writes of a single row are also done via call backs supplied to the
+//! Reads and writes of a single row are done via call backs supplied to the
 //! readRows and writeRow functions. This is to achieve maximum decoupling from
 //! the calling code for how the underlying values are used or where they come
 //! from. It also means certain operations can be done very efficiently. For example,
 //! a stream can be attached to a row writer function to copy the values directly
-//! into the data frame storage.
+//! into the data frame storage with no marshalling costs.
 //!
-//! Read and writes to storage can optionally happen in a separate thread to the
-//! row reading and writing to deal with the case that these operations can by
-//! time consuming.
+//! Reads from and writes to storage can optionally happen in a separate thread
+//! to the row reading and writing to deal with the case that these operations
+//! can be time consuming.
 class CORE_EXPORT CDataFrame final {
 public:
     using TBoolVec = std::vector<bool>;
+    using TSizeVec = std::vector<std::size_t>;
     using TStrVec = std::vector<std::string>;
     using TStrVecVec = std::vector<TStrVec>;
     using TStrCRng = CVectorRange<const TStrVec>;
-    using TFloatVec = std::vector<CFloatStorage>;
+    using TFloatVec = std::vector<CFloatStorage, CAlignedAllocator<CFloatStorage>>;
     using TFloatVecItr = TFloatVec::iterator;
     using TInt32Vec = std::vector<std::int32_t>;
+    using TSizeAlignmentPrVec = std::vector<std::pair<std::size_t, CAlignment::EType>>;
     using TRowRef = data_frame_detail::CRowRef;
     using TRowItr = data_frame_detail::CRowIterator;
     using TRowFunc = std::function<void(TRowItr, TRowItr)>;
@@ -245,6 +253,7 @@ class CORE_EXPORT CDataFrame final {
 public:
     //! \param[in] inMainMemory True if the data frame is stored in main memory.
     //! \param[in] numberColumns The number of columns in the data frame.
+    //! \param[in] rowAlignment The alignment to use for the start of each row.
     //! \param[in] sliceCapacityInRows The capacity of a slice of the data frame
     //! as a number of rows.
     //! \param[in] readAndWriteToStoreSyncStrategy Controls whether reads and
@@ -256,6 +265,7 @@ class CORE_EXPORT CDataFrame final {
     //! the implementers responsibility to ensure these conditions are satisfied.
     CDataFrame(bool inMainMemory,
                std::size_t numberColumns,
+               CAlignment::EType rowAlignment,
                std::size_t sliceCapacityInRows,
                EReadWriteToStorage readAndWriteToStoreSyncStrategy,
                const TWriteSliceToStoreFunc& writeSliceToStore);
 
@@ -263,6 +273,7 @@ class CORE_EXPORT CDataFrame final {
     //! Overload which manages the setting of slice capacity to a sensible default.
     CDataFrame(bool inMainMemory,
                std::size_t numberColumns,
+               CAlignment::EType rowAlignment,
                EReadWriteToStorage readAndWriteToStoreSyncStrategy,
                const TWriteSliceToStoreFunc& writeSliceToStore);
 
@@ -297,6 +308,18 @@ class CORE_EXPORT CDataFrame final {
     //! \param[in] numberColumns The desired number of columns.
     void resizeColumns(std::size_t numberThreads, std::size_t numberColumns);
 
+    //! Resize to contain \p extraColumns columns.
+    //!
+    //! These are split up into blocks of columns with their required alignment.
+    //! Pads are automatically inserted for alignment and a vector of the start
+    //! position of each block of columns is returned.
+    //!
+    //!
\param[in] numberThreads The target number of threads to use. + //! \param[in] extraColumns The desired additional columns. + //! \return The index of each (block of) columns in \p extraColumns. + //! \warning This only supports alignments less than or equal the row alignment. + TSizeVec resizeColumns(std::size_t numberThreads, const TSizeAlignmentPrVec& extraColumns); + //! This reads rows using one or more readers. //! //! One reader is bound to one thread. Each thread reads a disjoint subset @@ -351,7 +374,7 @@ class CORE_EXPORT CDataFrame final { std::vector readers; readers.reserve(result.first.size()); for (auto& reader_ : result.first) { - readers.push_back(std::move(*reader_.target())); + readers.emplace_back(std::move(*reader_.target())); } return {std::move(readers), result.second}; @@ -412,7 +435,7 @@ class CORE_EXPORT CDataFrame final { std::vector writers; writers.reserve(result.first.size()); for (auto& writer_ : result.first) { - writers.push_back(std::move(*writer_.target())); + writers.emplace_back(std::move(*writer_.target())); } return {std::move(writers), result.second}; @@ -485,7 +508,8 @@ class CORE_EXPORT CDataFrame final { //! \p numberColumns columns. static std::size_t estimateMemoryUsage(bool inMainMemory, std::size_t numberRows, - std::size_t numberColumns); + std::size_t numberColumns, + CAlignment::EType alignment); //! Get the value to use for a missing element in a data frame. static constexpr double valueOfMissing() { @@ -568,6 +592,8 @@ class CORE_EXPORT CDataFrame final { std::size_t m_RowCapacity; //! The capacity of a slice of the data frame as a number of rows. std::size_t m_SliceCapacityInRows; + //! The start of row memory alignment. + core::CAlignment::EType m_RowAlignment; //! If true read and write asynchronously to storage. EReadWriteToStorage m_ReadAndWriteToStoreSyncStrategy; @@ -610,12 +636,14 @@ class CORE_EXPORT CDataFrame final { //! capacity in rows. //! \param[in] readWriteToStoreSyncStrategy Controls whether reads and writes //! from slice storage are synchronous or asynchronous. +//! \param[in] alignment The alignment to use for the start of each row. CORE_EXPORT std::pair, std::shared_ptr> makeMainStorageDataFrame(std::size_t numberColumns, boost::optional sliceCapacity = boost::none, CDataFrame::EReadWriteToStorage readWriteToStoreSyncStrategy = - CDataFrame::EReadWriteToStorage::E_Sync); + CDataFrame::EReadWriteToStorage::E_Sync, + CAlignment::EType alignment = CAlignment::E_Aligned16); //! Make a data frame which uses disk storage for its slices. //! @@ -627,6 +655,7 @@ makeMainStorageDataFrame(std::size_t numberColumns, //! capacity in rows. //! \param[in] readWriteToStoreSyncStrategy Controls whether reads and writes //! from slice storage are synchronous or asynchronous. +//! \param[in] alignment The alignment to use for the start of each row. 
CORE_EXPORT std::pair, std::shared_ptr> makeDiskStorageDataFrame(const std::string& rootDirectory, @@ -634,7 +663,8 @@ makeDiskStorageDataFrame(const std::string& rootDirectory, std::size_t numberRows, boost::optional sliceCapacity = boost::none, CDataFrame::EReadWriteToStorage readWriteToStoreSyncStrategy = - CDataFrame::EReadWriteToStorage::E_Async); + CDataFrame::EReadWriteToStorage::E_Async, + CAlignment::EType alignment = CAlignment::E_Aligned16); } } diff --git a/include/core/CDataFrameRowSlice.h b/include/core/CDataFrameRowSlice.h index c27707f82f..52ba395775 100644 --- a/include/core/CDataFrameRowSlice.h +++ b/include/core/CDataFrameRowSlice.h @@ -7,6 +7,7 @@ #ifndef INCLUDED_ml_core_CDataFrameRowSlice_h #define INCLUDED_ml_core_CDataFrameRowSlice_h +#include #include #include #include @@ -24,7 +25,7 @@ namespace data_frame_row_slice_detail { //! \brief The implementation backing a data frame row slice handle. class CORE_EXPORT CDataFrameRowSliceHandleImpl { public: - using TFloatVec = std::vector; + using TFloatVec = std::vector>; using TInt32Vec = std::vector; using TImplPtr = std::unique_ptr; @@ -42,7 +43,7 @@ class CORE_EXPORT CDataFrameRowSliceHandleImpl { //! CDataFrame storage. class CORE_EXPORT CDataFrameRowSliceHandle { public: - using TFloatVec = std::vector; + using TFloatVec = std::vector>; using TFloatVecItr = TFloatVec::iterator; using TInt32Vec = std::vector; using TInt32VecCItr = TInt32Vec::const_iterator; @@ -83,7 +84,7 @@ class CORE_EXPORT CDataFrameRowSliceHandle { //! \brief CDataFrame slice storage interface. class CORE_EXPORT CDataFrameRowSlice { public: - using TFloatVec = std::vector; + using TFloatVec = std::vector>; using TInt32Vec = std::vector; public: diff --git a/include/maths/CBoostedTreeImpl.h b/include/maths/CBoostedTreeImpl.h index 8969a743c9..276e3b3b9f 100644 --- a/include/maths/CBoostedTreeImpl.h +++ b/include/maths/CBoostedTreeImpl.h @@ -102,8 +102,8 @@ class MATHS_EXPORT CBoostedTreeImpl final { //! Get the column containing the dependent variable. std::size_t columnHoldingDependentVariable() const; - //! Get the number of columns in the original data frame. - std::size_t numberInputColumns() const; + //! Get start indices of the extra columns. + const TSizeVec& extraColumns() const; //! Get the weights to apply to each class's predicted probability when //! assigning classes. 
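To make the switch from numberInputColumns() to extraColumns() concrete, here is a sketch of how aligned start indices for blocks of extra columns can be computed. The block sizes and alignments below are illustrative assumptions (the actual layout is defined by boosted_tree_detail::extraColumns() later in this patch); only the padding arithmetic mirrors the new resizeColumns() overload.

#include <core/CAlignment.h>

#include <cstddef>
#include <iostream>
#include <utility>
#include <vector>

std::vector<std::size_t> extraColumnIndices(std::size_t numberInputColumns,
                                            std::size_t numberLossParameters) {
    using ml::core::CAlignment;
    // Hypothetical (size, alignment) per block: prediction, gradient,
    // curvature and example weight. Only the gradient block demands
    // 16 byte alignment here.
    std::vector<std::pair<std::size_t, CAlignment::EType>> blocks{
        {numberLossParameters, CAlignment::E_Unaligned},
        {numberLossParameters, CAlignment::E_Aligned16},
        {numberLossParameters * numberLossParameters, CAlignment::E_Unaligned},
        {1, CAlignment::E_Unaligned}};

    std::vector<std::size_t> result;
    std::size_t index{numberInputColumns};
    for (const auto& block : blocks) {
        // Pad forward (in float sized columns) until the block's required
        // alignment is satisfied, assuming the row itself starts aligned.
        index = CAlignment::roundup<float>(block.second, index);
        result.push_back(index);
        index += block.first;
    }
    return result;
}

int main() {
    for (auto i : extraColumnIndices(7, 3)) {
        std::cout << i << ' '; // prints "7 12 15 24": index 12 = 48 bytes,
    }                          // so the gradient block is 16 byte aligned
    std::cout << '\n';
    return 0;
}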
@@ -303,7 +303,7 @@ class MATHS_EXPORT CBoostedTreeImpl final { mutable CPRNG::CXorOShiro128Plus m_Rng; std::size_t m_NumberThreads; std::size_t m_DependentVariable = std::numeric_limits::max(); - std::size_t m_NumberInputColumns = 0; + TSizeVec m_ExtraColumns; TLossFunctionUPtr m_Loss; CBoostedTree::EClassAssignmentObjective m_ClassAssignmentObjective = CBoostedTree::E_MinimumRecall; diff --git a/include/maths/CBoostedTreeLeafNodeStatistics.h b/include/maths/CBoostedTreeLeafNodeStatistics.h index 4f5627699f..df887f9612 100644 --- a/include/maths/CBoostedTreeLeafNodeStatistics.h +++ b/include/maths/CBoostedTreeLeafNodeStatistics.h @@ -7,6 +7,7 @@ #ifndef INCLUDED_ml_maths_CBoostedTreeLeafNodeStatistics_h #define INCLUDED_ml_maths_CBoostedTreeLeafNodeStatistics_h +#include #include #include #include @@ -26,6 +27,7 @@ #include #include +#include #include #include #include @@ -56,9 +58,9 @@ class MATHS_EXPORT CBoostedTreeLeafNodeStatistics final { using TImmutableRadixSetVec = std::vector; using TPtr = std::shared_ptr; using TPtrPtrPr = std::pair; - using TMemoryMappedFloatVector = CMemoryMappedDenseVector; - using TMemoryMappedDoubleVector = CMemoryMappedDenseVector; - using TMemoryMappedDoubleMatrix = CMemoryMappedDenseMatrix; + using TMemoryMappedFloatVector = CMemoryMappedDenseVector; + using TMemoryMappedDoubleVector = CMemoryMappedDenseVector; + using TMemoryMappedDoubleMatrix = CMemoryMappedDenseMatrix; //! \brief Accumulates aggregate derivatives. class MATHS_EXPORT CDerivatives { @@ -70,10 +72,9 @@ class MATHS_EXPORT CBoostedTreeLeafNodeStatistics final { static bool dynamicSizeAlwaysZero() { return true; } public: - CDerivatives(std::size_t numberLossParameters, double* storage) - : m_Count{0}, m_Gradient{storage, static_cast(numberLossParameters)}, - m_Curvature{storage + numberLossParameters, - static_cast(numberLossParameters), + CDerivatives(std::size_t numberLossParameters, double* storageGradients, double* storageCurvatures) + : m_Gradient{storageGradients, static_cast(numberLossParameters)}, + m_Curvature{storageCurvatures, static_cast(numberLossParameters), static_cast(numberLossParameters)} {} //! Get the accumulated count. @@ -87,20 +88,16 @@ class MATHS_EXPORT CBoostedTreeLeafNodeStatistics final { return m_Curvature; } - //! Add \p count, \p gradient and \p curvature to the accumulator. - void add(std::size_t count, - const TMemoryMappedFloatVector& gradient, - const TMemoryMappedFloatVector& curvature) { + //! Add \p count and \p derivatives to the accumulator. + void add(std::size_t count, const TMemoryMappedFloatVector& derivatives) { m_Count += count; - m_Gradient += gradient; - this->curvatureTriangleView() += curvature; + this->upperTriangularFlatView() += derivatives; } //! Compute the accumulation of both collections of derivatives. void add(const CDerivatives& other) { m_Count += other.m_Count; - m_Gradient += other.m_Gradient; - m_Curvature += other.m_Curvature; + this->flatView() += const_cast(&other)->flatView(); } //! Set to the difference of \p lhs and \p rhs. @@ -134,15 +131,16 @@ class MATHS_EXPORT CBoostedTreeLeafNodeStatistics final { //! Remap the accumulated curvature to lower triangle row major format. void remapCurvature() { - // For performance, we accumulate curvatures into the first n (n + 1) / 2 - // elements of the array backing m_Curvature. However, the memory mapped - // matrix class expects them to be stored column major in the lower triangle - // of n x n matrix. This copies them backwards to their correct positions. 
- for (std::ptrdiff_t j = m_Curvature.cols() - 1, - k = m_Curvature.rows() * (m_Curvature.rows() + 1) / 2 - 1; + // For performance, we accumulate curvatures into the first n + n (n + 1) / 2 + // elements of the array backing upperTriangularFlatView. However, the memory + // mapped matrix class expects them to be stored column major in the lower + // triangle of an n x n matrix. This copies them backwards to their correct + // positions. + TMemoryMappedDoubleVector derivatives{this->upperTriangularFlatView()}; + for (std::ptrdiff_t j = m_Curvature.cols() - 1, k = derivatives.rows() - 1; j >= 0; --j) { for (std::ptrdiff_t i = m_Curvature.rows() - 1; i >= j; --i, --k) { - m_Curvature(i, j) = m_Curvature.array()(k); + m_Curvature(i, j) = derivatives(k); } } } @@ -155,8 +153,17 @@ class MATHS_EXPORT CBoostedTreeLeafNodeStatistics final { } private: - TMemoryMappedDoubleVector curvatureTriangleView() { - return {m_Curvature.data(), m_Curvature.rows() * (m_Curvature.rows() + 1) / 2}; + TMemoryMappedDoubleVector upperTriangularFlatView() { + // Gradient + upper triangle of the Hessian. + auto n = m_Gradient.rows(); + return {m_Gradient.data(), n * (n + 3) / 2}; + } + + TMemoryMappedDoubleVector flatView() { + // Gradient + pad + full Hessian. + auto n = m_Curvature.data() - m_Gradient.data() + + m_Curvature.rows() * m_Curvature.cols(); + return {m_Gradient.data(), n}; } private: @@ -166,27 +173,26 @@ class MATHS_EXPORT CBoostedTreeLeafNodeStatistics final { }; //! \brief A collection of aggregate derivatives for candidate feature splits. - class MATHS_EXPORT CPerSplitDerivatives { + class MATHS_EXPORT CSplitsDerivatives { public: using TDerivativesVec = std::vector; public: - explicit CPerSplitDerivatives(std::size_t numberLossParameters = 0) + explicit CSplitsDerivatives(std::size_t numberLossParameters = 0) : m_NumberLossParameters{numberLossParameters} {} - CPerSplitDerivatives(const TImmutableRadixSetVec& candidateSplits, - std::size_t numberLossParameters) + CSplitsDerivatives(const TImmutableRadixSetVec& candidateSplits, std::size_t numberLossParameters) : m_NumberLossParameters{numberLossParameters} { this->map(candidateSplits); } - CPerSplitDerivatives(const CPerSplitDerivatives& other) + CSplitsDerivatives(const CSplitsDerivatives& other) : m_NumberLossParameters{other.m_NumberLossParameters} { this->map(other.m_Derivatives); this->add(other); } - CPerSplitDerivatives(CPerSplitDerivatives&&) = default; + CSplitsDerivatives(CSplitsDerivatives&&) = default; - CPerSplitDerivatives& operator=(const CPerSplitDerivatives& other) = delete; - CPerSplitDerivatives& operator=(CPerSplitDerivatives&&) = default; + CSplitsDerivatives& operator=(const CSplitsDerivatives& other) = delete; + CSplitsDerivatives& operator=(CSplitsDerivatives&&) = default; //! \return The aggregate count for \p feature and \p split. std::size_t count(std::size_t feature, std::size_t split) const { @@ -227,21 +233,19 @@ class MATHS_EXPORT CBoostedTreeLeafNodeStatistics final { //! the \p split of \p feature. void addDerivatives(std::size_t feature, std::size_t split, - const TMemoryMappedFloatVector& gradient, - const TMemoryMappedFloatVector& curvature) { - m_Derivatives[feature][split].add(1, gradient, curvature); + const TMemoryMappedFloatVector& derivatives) { + m_Derivatives[feature][split].add(1, derivatives); } //! Add \p gradient and \p curvature to the accumulated derivatives for //! missing values of \p feature. 
void addMissingDerivatives(std::size_t feature, - const TMemoryMappedFloatVector& gradient, - const TMemoryMappedFloatVector& curvature) { - m_MissingDerivatives[feature].add(1, gradient, curvature); + const TMemoryMappedFloatVector& derivatives) { + m_MissingDerivatives[feature].add(1, derivatives); } //! Compute the accumulation of both collections of per split derivatives. - void add(const CPerSplitDerivatives& other) { + void add(const CSplitsDerivatives& other) { for (std::size_t i = 0; i < other.m_Derivatives.size(); ++i) { for (std::size_t j = 0; j < other.m_Derivatives[i].size(); ++j) { m_Derivatives[i][j].add(other.m_Derivatives[i][j]); @@ -251,7 +255,7 @@ class MATHS_EXPORT CBoostedTreeLeafNodeStatistics final { } //! Subtract \p rhs. - void subtract(const CPerSplitDerivatives& rhs) { + void subtract(const CSplitsDerivatives& rhs) { for (std::size_t i = 0; i < m_Derivatives.size(); ++i) { for (std::size_t j = 0; j < m_Derivatives[i].size(); ++j) { m_Derivatives[i][j].subtract(rhs.m_Derivatives[i][j]); @@ -287,7 +291,7 @@ class MATHS_EXPORT CBoostedTreeLeafNodeStatistics final { sizeof(CDerivatives)}; std::size_t storageSize{numberFeatures * (numberSplitsPerFeature + 1) * numberLossParameters * (numberLossParameters + 1) * sizeof(double)}; - return sizeof(CPerSplitDerivatives) + derivativesSize + storageSize; + return sizeof(CSplitsDerivatives) + derivativesSize + storageSize; } //! Get a checksum of this object. @@ -300,6 +304,7 @@ class MATHS_EXPORT CBoostedTreeLeafNodeStatistics final { private: using TDerivativesVecVec = std::vector; + using TAlignedDoubleVec = std::vector>; private: static std::size_t number(const TDerivativesVec& derivatives) { @@ -318,42 +323,62 @@ class MATHS_EXPORT CBoostedTreeLeafNodeStatistics final { // | | | | // V V V V // | n | n^2 | ... | n | n^2 | + // + // Note we ensure 16 byte alignment because we're using aligned memory + // mapped vectors which have much better performance. 
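        // For illustration (editorial example, hypothetical numbers): with
        // numberLossParameters = 3 the gradient needs 3 doubles and the full
        // Hessian 9, which round up to 4 and 10 respectively. Each split's
        // derivatives then occupy | g g g pad | h h h h h h h h h pad | =
        // 14 doubles = 112 bytes, a multiple of 16, so every block in
        // m_Storage starts 16 byte aligned.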
+ std::size_t numberFeatures{splits.size()}; std::size_t totalNumberSplits{ std::accumulate(splits.begin(), splits.end(), std::size_t{0}, [](std::size_t size, const auto& featureSplits) { return size + number(featureSplits); })}; - int numberGradients{static_cast(m_NumberLossParameters)}; - int numberCurvatures{numberGradients * numberGradients}; - int numberDerivatives{numberGradients + numberCurvatures}; + std::size_t numberGradients{this->numberGradients()}; + std::size_t numberDerivatives{this->numberDerivatives()}; - m_Derivatives.resize(splits.size()); - m_MissingDerivatives.reserve(splits.size()); - m_Storage.resize((totalNumberSplits + splits.size()) * numberDerivatives, 0.0); + m_Derivatives.resize(numberFeatures); + m_MissingDerivatives.reserve(numberFeatures); + + m_Storage.resize((totalNumberSplits + numberFeatures) * numberDerivatives, 0.0); double* storage{&m_Storage[0]}; - for (std::size_t i = 0; i < splits.size(); ++i, storage += numberDerivatives) { + for (std::size_t i = 0; i < numberFeatures; ++i, storage += numberDerivatives) { std::size_t size{number(splits[i])}; m_Derivatives[i].reserve(size); for (std::size_t j = 0; j < size; ++j, storage += numberDerivatives) { - m_Derivatives[i].emplace_back(m_NumberLossParameters, storage); + m_Derivatives[i].emplace_back(m_NumberLossParameters, storage, + storage + numberGradients); } - m_MissingDerivatives.emplace_back(m_NumberLossParameters, storage); + m_MissingDerivatives.emplace_back(m_NumberLossParameters, storage, + storage + numberGradients); } } + std::size_t numberDerivatives() const { + return this->numberGradients() + this->numberCurvatures(); + } + + std::size_t numberGradients() const { + return core::CAlignment::roundup(core::CAlignment::E_Aligned16, + m_NumberLossParameters); + } + + std::size_t numberCurvatures() const { + return core::CAlignment::roundup( + core::CAlignment::E_Aligned16, m_NumberLossParameters * m_NumberLossParameters); + } + private: std::size_t m_NumberLossParameters = 0; TDerivativesVecVec m_Derivatives; TDerivativesVec m_MissingDerivatives; - TDoubleVec m_Storage; + TAlignedDoubleVec m_Storage; }; public: CBoostedTreeLeafNodeStatistics(std::size_t id, - std::size_t numberInputColumns, + const TSizeVec& extraColumns, std::size_t numberLossParameters, std::size_t numberThreads, const core::CDataFrame& frame, @@ -366,7 +391,7 @@ class MATHS_EXPORT CBoostedTreeLeafNodeStatistics final { //! Only called by split but is public so it's accessible to std::make_shared. CBoostedTreeLeafNodeStatistics(std::size_t id, - std::size_t numberInputColumns, + const TSizeVec& extraColumns, std::size_t numberLossParameters, std::size_t numberThreads, const core::CDataFrame& frame, @@ -442,6 +467,8 @@ class MATHS_EXPORT CBoostedTreeLeafNodeStatistics final { std::size_t numberLossParameters); private: + using TSizeVecCRef = std::reference_wrapper; + //! \brief Statistics relating to a split of the node. 
    struct MATHS_EXPORT SSplitStatistics
        : private boost::less_than_comparable<SSplitStatistics> {
@@ -490,18 +517,18 @@ class MATHS_EXPORT CBoostedTreeLeafNodeStatistics final {
                        const CBoostedTreeNode& split,
                        const core::CPackedBitVector& parentRowMask);
 
     void addRowDerivatives(const CEncodedDataFrameRowRef& row,
-                           CPerSplitDerivatives& splitDerivatives) const;
+                           CSplitsDerivatives& splitsDerivatives) const;
 
     SSplitStatistics computeBestSplitStatistics(const TRegularization& regularization,
                                                 const TSizeVec& featureBag) const;
 
 private:
     std::size_t m_Id;
     std::size_t m_Depth;
-    std::size_t m_NumberInputColumns;
+    TSizeVecCRef m_ExtraColumns;
     std::size_t m_NumberLossParameters;
     const TImmutableRadixSetVec& m_CandidateSplits;
     core::CPackedBitVector m_RowMask;
-    CPerSplitDerivatives m_Derivatives;
+    CSplitsDerivatives m_Derivatives;
     SSplitStatistics m_BestSplit;
 };
 }

diff --git a/include/maths/CBoostedTreeUtils.h b/include/maths/CBoostedTreeUtils.h
index 700ee07019..d447a0754e 100644
--- a/include/maths/CBoostedTreeUtils.h
+++ b/include/maths/CBoostedTreeUtils.h
@@ -22,83 +22,85 @@ namespace boosted_tree {
 class CLoss;
 }
 
 namespace boosted_tree_detail {
+using TSizeVec = std::vector<std::size_t>;
 using TRowRef = core::CDataFrame::TRowRef;
 using TMemoryMappedFloatVector = CMemoryMappedDenseVector<CFloatStorage>;
+using TSizeAlignmentPrVec = std::vector<std::pair<std::size_t, core::CAlignment::EType>>;
+using TAlignedMemoryMappedFloatVector =
+    CMemoryMappedDenseVector<CFloatStorage, Eigen::Aligned16>;
 
-inline std::size_t lossHessianStoredSize(std::size_t numberLossParameters) {
+//! Get the size of the upper triangle of the loss Hessian.
+inline std::size_t lossHessianUpperTriangleSize(std::size_t numberLossParameters) {
     return numberLossParameters * (numberLossParameters + 1) / 2;
 }
 
-inline std::size_t predictionColumn(std::size_t numberInputColumns) {
-    return numberInputColumns;
-}
-
-inline std::size_t lossGradientColumn(std::size_t numberInputColumns,
-                                      std::size_t numberLossParameters) {
-    return predictionColumn(numberInputColumns) + numberLossParameters;
-}
-
-inline std::size_t lossCurvatureColumn(std::size_t numberInputColumns,
-                                       std::size_t numberLossParameters) {
-    return lossGradientColumn(numberInputColumns, numberLossParameters) + numberLossParameters;
-}
-
-inline std::size_t exampleWeightColumn(std::size_t numberInputColumns,
-                                       std::size_t numberLossParameters) {
-    return lossCurvatureColumn(numberInputColumns, numberLossParameters) +
-           lossHessianStoredSize(numberLossParameters);
-}
+//! Get the extra columns needed by training.
+MATHS_EXPORT
+TSizeAlignmentPrVec extraColumns(std::size_t numberLossParameters);
 
 //! Read the prediction from \p row.
 MATHS_EXPORT
 TMemoryMappedFloatVector readPrediction(const TRowRef& row,
-                                        std::size_t numberInputColumns,
-                                        std::size_t numberLossParamaters);
+                                        const TSizeVec& extraColumns,
+                                        std::size_t numberLossParameters);
 
 //! Zero the prediction of \p row.
 MATHS_EXPORT
-void zeroPrediction(const TRowRef& row, std::size_t numberInputColumns, std::size_t numberLossParamaters);
+void zeroPrediction(const TRowRef& row, const TSizeVec& extraColumns, std::size_t numberLossParameters);
+
+//! Read all the loss derivatives from \p row into an aligned vector.
+MATHS_EXPORT
+TAlignedMemoryMappedFloatVector readLossDerivatives(const TRowRef& row,
+                                                    const TSizeVec& extraColumns,
+                                                    std::size_t numberLossParameters);
 
 //! Read the loss gradient from \p row.
 MATHS_EXPORT
 TMemoryMappedFloatVector readLossGradient(const TRowRef& row,
-                                          std::size_t numberInputColumns,
+                                          const TSizeVec& extraColumns,
                                           std::size_t numberLossParameters);
 
 //! Zero the loss gradient of \p row.
MATHS_EXPORT -void zeroLossGradient(const TRowRef& row, std::size_t numberInputColumns, std::size_t numberLossParameters); +void zeroLossGradient(const TRowRef& row, const TSizeVec& extraColumns, std::size_t numberLossParameters); //! Write the loss gradient to \p row. MATHS_EXPORT void writeLossGradient(const TRowRef& row, - std::size_t numberInputColumns, + const TSizeVec& extraColumns, const boosted_tree::CLoss& loss, const TMemoryMappedFloatVector& prediction, double actual, double weight = 1.0); +//! Read the loss flat column major Hessian from \p row. MATHS_EXPORT TMemoryMappedFloatVector readLossCurvature(const TRowRef& row, - std::size_t numberInputColumns, + const TSizeVec& extraColumns, std::size_t numberLossParameters); +//! Zero the loss Hessian of \p row. MATHS_EXPORT -void zeroLossCurvature(const TRowRef& row, std::size_t numberInputColumns, std::size_t numberLossParameters); +void zeroLossCurvature(const TRowRef& row, const TSizeVec& extraColumns, std::size_t numberLossParameters); +//! Write the loss Hessian to \p row. MATHS_EXPORT void writeLossCurvature(const TRowRef& row, - std::size_t numberInputColumns, + const TSizeVec& extraColumns, const boosted_tree::CLoss& curvature, const TMemoryMappedFloatVector& prediction, double actual, double weight = 1.0); +//! Read the example weight from \p row. +MATHS_EXPORT +double readExampleWeight(const TRowRef& row, const TSizeVec& extraColumns); + +//! Write the example weight to \p row . MATHS_EXPORT -double readExampleWeight(const TRowRef& row, - std::size_t numberInputColumns, - std::size_t numberLossParameters); +void writeExampleWeight(const TRowRef& row, const TSizeVec& extraColumns, double weight); +//! Read the actual value for the target from \p row. MATHS_EXPORT double readActual(const TRowRef& row, std::size_t dependentVariable); diff --git a/include/maths/CDataFrameUtils.h b/include/maths/CDataFrameUtils.h index ab822d34c1..581cd74f30 100644 --- a/include/maths/CDataFrameUtils.h +++ b/include/maths/CDataFrameUtils.h @@ -35,9 +35,10 @@ struct SRowTo { static_assert(sizeof(T) < 0, "Vector type not supported"); }; -template -struct SRowTo> { - static CMemoryMappedDenseVector dispatch(const core::CDataFrame::TRowRef& row) { +template +struct SRowTo> { + static CMemoryMappedDenseVector + dispatch(const core::CDataFrame::TRowRef& row) { return {row.data(), static_cast(row.numberColumns())}; } }; diff --git a/include/maths/CInformationCriteria.h b/include/maths/CInformationCriteria.h index 1881b0adbe..c000eb62f5 100644 --- a/include/maths/CInformationCriteria.h +++ b/include/maths/CInformationCriteria.h @@ -95,7 +95,7 @@ class CSphericalGaussianInfoCriterion { public: using TPointVec = std::vector; using TPointVecVec = std::vector; - using TBarePoint = typename SStripped::Type; + using TBarePoint = typename SUnannotated::Type; using TBarePointPrecise = typename SFloatingPoint::Type; using TCoordinate = typename SCoordinate::Type; using TMeanVarAccumulator = @@ -194,7 +194,7 @@ class CGaussianInfoCriterion { public: using TPointVec = std::vector; using TPointVecVec = std::vector; - using TBarePoint = typename SStripped::Type; + using TBarePoint = typename SUnannotated::Type; using TBarePointPrecise = typename SFloatingPoint::Type; using TCoordinate = typename SCoordinate::Type; using TCovariances = CBasicStatistics::SSampleCovariances; diff --git a/include/maths/CKMeans.h b/include/maths/CKMeans.h index 22a1e38f82..6352295172 100644 --- a/include/maths/CKMeans.h +++ b/include/maths/CKMeans.h @@ -217,7 +217,7 @@ class CKMeans { 
protected: using TCoordinate = typename SCoordinate::Type; - using TBarePoint = typename SStripped::Type; + using TBarePoint = typename SUnannotated::Type; using TBarePointPrecise = typename SFloatingPoint::Type; using TMeanAccumulator = typename CBasicStatistics::SSampleMean::TAccumulator; diff --git a/include/maths/CLinearAlgebraEigen.h b/include/maths/CLinearAlgebraEigen.h index 79732967e3..7a630769bd 100644 --- a/include/maths/CLinearAlgebraEigen.h +++ b/include/maths/CLinearAlgebraEigen.h @@ -362,11 +362,11 @@ struct SConstant> { //! of CMemoryMappedDenseVector. //! //! \sa CMemoryMappedDenseVector for more information. -template +template class CMemoryMappedDenseMatrix - : public Eigen::Map::TBase> { + : public Eigen::Map::TBase, ALIGNMENT> { public: - using TBase = Eigen::Map::TBase>; + using TBase = Eigen::Map::TBase, ALIGNMENT>; //! See core::CMemory. static bool dynamicSizeAlwaysZero() { return true; } @@ -426,15 +426,16 @@ class CMemoryMappedDenseMatrix }; //! Free efficient efficient swap for ADLU. -template -void swap(CMemoryMappedDenseMatrix& lhs, CMemoryMappedDenseMatrix& rhs) { +template +void swap(CMemoryMappedDenseMatrix& lhs, + CMemoryMappedDenseMatrix& rhs) { lhs.swap(rhs); } //! \brief Gets a constant square dense matrix with specified dimension or with //! specified numbers of rows and columns. -template -struct SConstant> { +template +struct SConstant> { static auto get(std::ptrdiff_t dimension, SCALAR constant) -> decltype(SConstant>::get(dimension, 1)) { return SConstant>::get(dimension, constant); @@ -476,12 +477,12 @@ struct SConstant> { //! This better fits our needs with data frames where we want to reference the //! memory stored in the data frame rows, but never modify it directly through //! this vector type. -template +template class CMemoryMappedDenseVector - : public Eigen::Map::TBase> { + : public Eigen::Map::TBase, ALIGNMENT> { public: using TDenseVector = CDenseVector; - using TBase = Eigen::Map; + using TBase = Eigen::Map; //! See core::CMemory. static bool dynamicSizeAlwaysZero() { return true; } @@ -545,14 +546,15 @@ class CMemoryMappedDenseVector }; //! Free efficient efficient swap for ADLU. -template -void swap(CMemoryMappedDenseVector& lhs, CMemoryMappedDenseVector& rhs) { +template +void swap(CMemoryMappedDenseVector& lhs, + CMemoryMappedDenseVector& rhs) { lhs.swap(rhs); } //! \brief Gets a constant dense vector with specified dimension. -template -struct SConstant> { +template +struct SConstant> { static auto get(std::ptrdiff_t dimension, SCALAR constant) -> decltype(SConstant>::get(dimension, constant)) { return SConstant>::get(dimension, constant); diff --git a/include/maths/CLinearAlgebraFwd.h b/include/maths/CLinearAlgebraFwd.h index 805fa84892..594f723125 100644 --- a/include/maths/CLinearAlgebraFwd.h +++ b/include/maths/CLinearAlgebraFwd.h @@ -72,9 +72,9 @@ template class CDenseVectorInitializer; template class CDenseMatrixInitializer; -template +template class CMemoryMappedDenseVector; -template +template class CMemoryMappedDenseMatrix; } } diff --git a/include/maths/CLinearAlgebraShims.h b/include/maths/CLinearAlgebraShims.h index 6a1bd0549d..aef41801a0 100644 --- a/include/maths/CLinearAlgebraShims.h +++ b/include/maths/CLinearAlgebraShims.h @@ -31,8 +31,8 @@ std::size_t dimension(const CDenseVector& x) { } //! Get the dimension of an Eigen memory mapped vector. 
-template -std::size_t dimension(const CMemoryMappedDenseVector& x) { +template +std::size_t dimension(const CMemoryMappedDenseVector& x) { return static_cast(x.size()); } @@ -75,9 +75,10 @@ CDenseMatrix conformableZeroMatrix(const CDenseVector& x) { } //! Get the conformable zero initialized matrix for the Eigen memory mapped vector. -template -CDenseMatrix conformableZeroMatrix(const CMemoryMappedDenseVector& x) { - return SConstant>::get(dimension(x), 0); +template +CDenseMatrix +conformableZeroMatrix(const CMemoryMappedDenseVector& x) { + return SConstant>::get(dimension(x), 0); } //! Get the conformable zero initialized matrix for the underlying vector. @@ -129,41 +130,41 @@ void max(const VECTOR& x, VECTOR& y) { //! Expose componentwise operations for our internal vectors. template -typename SArrayView::Type componentwise(VECTOR& x) { +VECTOR& componentwise(VECTOR& x) { return x; } //! Expose componentwise operations for Eigen dense vectors. template -typename SArrayView>::Type -componentwise(const CDenseVector& x) { +auto componentwise(const CDenseVector& x) -> decltype(x.array()) { return x.array(); } template -typename SArrayView>::Type componentwise(CDenseVector& x) { +auto componentwise(CDenseVector& x) -> decltype(x.array()) { return x.array(); } //! Expose componentwise operations for Eigen memory mapped vectors. -template -typename SArrayView>::Type -componentwise(const CMemoryMappedDenseVector& x) { +template +auto componentwise(const CMemoryMappedDenseVector& x) + -> decltype(x.array()) { return x.array(); } -template -typename SArrayView>::Type -componentwise(CMemoryMappedDenseVector& x) { +template +auto componentwise(CMemoryMappedDenseVector& x) + -> decltype(x.array()) { return x.array(); } //! Expose componentwise operations for our annotated vectors. template -typename SArrayView::Type -componentwise(const CAnnotatedVector& x) { +auto componentwise(const CAnnotatedVector& x) + -> decltype(componentwise(static_cast(x))) { return componentwise(static_cast(x)); } template -typename SArrayView::Type& componentwise(CAnnotatedVector& x) { +auto componentwise(CAnnotatedVector& x) + -> decltype(componentwise(static_cast(x))) { return componentwise(static_cast(x)); } @@ -186,9 +187,9 @@ SCALAR distance(const CDenseVector& x, const CDenseVector& y) { } //! Euclidean distance implementation for an Eigen memory mapped vector. -template -SCALAR distance(const CMemoryMappedDenseVector& x, - const CMemoryMappedDenseVector& y) { +template +SCALAR distance(const CMemoryMappedDenseVector& x, + const CMemoryMappedDenseVector& y) { return (y - x).norm(); } @@ -213,8 +214,8 @@ SCALAR norm(const CDenseVector& x) { } //! Get the Euclidean norm of an Eigen memory mapped vector. -template -SCALAR norm(const CMemoryMappedDenseVector& x) { +template +SCALAR norm(const CMemoryMappedDenseVector& x) { return x.norm(); } @@ -237,8 +238,8 @@ SCALAR L1(const CDenseVector& x) { } //! Get the Manhattan norm of an Eigen memory mapped vector. -template -SCALAR L1(const CMemoryMappedDenseVector& x) { +template +SCALAR L1(const CMemoryMappedDenseVector& x) { return x.template lpNorm<1>(); } @@ -261,8 +262,8 @@ SCALAR frobenius(const CDenseMatrix& x) { } //! Get the Euclidean norm of an Eigen memory mapped matrix. -template -SCALAR frobenius(const CMemoryMappedDenseMatrix& x) { +template +SCALAR frobenius(const CMemoryMappedDenseMatrix& x) { return x.norm(); } @@ -279,19 +280,21 @@ SCALAR inner(const CDenseVector& x, const CDenseVector& y) { } //! Get the inner product of two Eigen memory mapped vectors. 
-template -SCALAR inner(const CMemoryMappedDenseVector& x, - const CMemoryMappedDenseVector& y) { +template +SCALAR inner(const CMemoryMappedDenseVector& x, + const CMemoryMappedDenseVector& y) { return x.dot(y); } //! Get the inner product of Eigen dense and memory mapped vectors. -template -SCALAR inner(const CDenseVector& x, const CMemoryMappedDenseVector& y) { +template +SCALAR inner(const CDenseVector& x, + const CMemoryMappedDenseVector& y) { return x.dot(y); } //! Get the inner product of Eigen dense and memory mapped vectors. -template -SCALAR inner(const CMemoryMappedDenseVector& x, const CDenseVector& y) { +template +SCALAR inner(const CMemoryMappedDenseVector& x, + const CDenseVector& y) { return x.dot(y); } @@ -327,8 +330,8 @@ CDenseMatrix outer(const CDenseVector& x) { } //! Get the outer product of an Eigen memory mapped vector. -template -CDenseMatrix outer(const CMemoryMappedDenseVector& x) { +template +CDenseMatrix outer(const CMemoryMappedDenseVector& x) { return outer(CDenseVector(x)); } diff --git a/include/maths/CTypeTraits.h b/include/maths/CTypeTraits.h index f9be718125..50fa05a2cd 100644 --- a/include/maths/CTypeTraits.h +++ b/include/maths/CTypeTraits.h @@ -70,14 +70,14 @@ struct SPromoted> { }; //! \brief Defines the promoted type for an Eigen memory mapped matrix. -template -struct SPromoted> { +template +struct SPromoted> { using Type = CDenseMatrix::Type>; }; //! \brief Defines the promoted type for an Eigen memory mapped vector. -template -struct SPromoted> { +template +struct SPromoted> { using Type = CDenseVector::Type>; }; @@ -142,14 +142,14 @@ struct SFloatingPoint, U> { }; //! \brief Defines an Eigen dense matrix on a suitable floating point type. -template -struct SFloatingPoint, U> { +template +struct SFloatingPoint, U> { using Type = CDenseMatrix::Type>; }; //! \brief Defines an Eigen dense vector on a suitable floating point type. -template -struct SFloatingPoint, U> { +template +struct SFloatingPoint, U> { using Type = CDenseVector::Type>; }; @@ -214,14 +214,14 @@ struct SCoordinate> { }; //! \brief Extracts the coordinate type for an Eigen memory mapped matrix. -template -struct SCoordinate> { +template +struct SCoordinate> { using Type = SCALAR; }; //! \brief Extracts the coordinate type for an Eigen memory mapped vector. -template -struct SCoordinate> { +template +struct SCoordinate> { using Type = SCALAR; }; @@ -268,9 +268,9 @@ struct SConformableMatrix> { }; //! \brief Extracts the conformable matrix type for an Eigen memory mapped vector. -template -struct SConformableMatrix> { - using Type = CMemoryMappedDenseMatrix; +template +struct SConformableMatrix> { + using Type = CMemoryMappedDenseMatrix; }; //! \brief Extracts the conformable matrix type for an Eigen sparse vector. @@ -285,68 +285,6 @@ struct SConformableMatrix> { using Type = typename SConformableMatrix::Type; }; -//! \brief Defines the array view for componentwise operations on our internal -//! vectors and matrices. -template -struct SArrayView { - using Type = VECTOR&; -}; - -//! \brief Defines the array view for componentwise operations on a Eigen dense matrix. -template -struct SArrayView> { - using Type = - Eigen::ArrayWrapper>; -}; -template -struct SArrayView> { - using Type = - Eigen::ArrayWrapper>; -}; - -//! \brief Defines the array view for componentwise operations on an Eigen dense matrix. -template -struct SArrayView> { - using Type = - Eigen::ArrayWrapper>; -}; -template -struct SArrayView> { - using Type = - Eigen::ArrayWrapper>; -}; - -//! 
\brief Defines the array view for componentwise operations on an Eigen memory mapped matrix. -template -struct SArrayView> { - using Type = Eigen::ArrayWrapper< - const Eigen::Map, 0, Eigen::Stride<0, 0>>>; -}; -template -struct SArrayView> { - using Type = Eigen::ArrayWrapper< - Eigen::Map, 0, Eigen::Stride<0, 0>>>; -}; - -//! \brief Defines the array view for componentwise operations on an Eigen memory mapped vector. -template -struct SArrayView> { - using Type = Eigen::ArrayWrapper< - const Eigen::Map, 0, Eigen::Stride<0, 0>>>; -}; -template -struct SArrayView> { - using Type = Eigen::ArrayWrapper< - Eigen::Map, 0, Eigen::Stride<0, 0>>>; -}; - -//! \brief Defines the array view for componentwise operations on Eigen dense -//! vectors and matrices. -template -struct SArrayView> { - using Type = typename SArrayView::Type; -}; - //! \brief Defines the type of a singular value decomposition of a matrix. template struct SJacobiSvd { @@ -372,14 +310,14 @@ struct SJacobiSvd //! \brief Defines a type which strips off any annotation from a vector. //! This is the raw vector type by default. template -struct SStripped { +struct SUnannotated { using Type = VECTOR; }; //! \brief Specialisation for annotated vectors. This is the underlying //! vector type. template -struct SStripped> { +struct SUnannotated> { using Type = VECTOR; }; } diff --git a/lib/api/CDataFrameAnalysisRunner.cc b/lib/api/CDataFrameAnalysisRunner.cc index c1ba307ca7..dc3d15d0a7 100644 --- a/lib/api/CDataFrameAnalysisRunner.cc +++ b/lib/api/CDataFrameAnalysisRunner.cc @@ -163,7 +163,7 @@ std::size_t CDataFrameAnalysisRunner::estimateMemoryUsage(std::size_t totalNumbe std::size_t numberColumns) const { return core::CDataFrame::estimateMemoryUsage( this->storeDataFrameInMainMemory(), totalNumberRows, - numberColumns + this->numberExtraColumns()) + + numberColumns + this->numberExtraColumns(), core::CAlignment::E_Aligned16) + this->estimateBookkeepingMemoryUsage(m_NumberPartitions, totalNumberRows, partitionNumberRows, numberColumns); } diff --git a/lib/api/CDataFrameTrainBoostedTreeClassifierRunner.cc b/lib/api/CDataFrameTrainBoostedTreeClassifierRunner.cc index c69c2352bc..1b580eadc3 100644 --- a/lib/api/CDataFrameTrainBoostedTreeClassifierRunner.cc +++ b/lib/api/CDataFrameTrainBoostedTreeClassifierRunner.cc @@ -165,17 +165,17 @@ void CDataFrameTrainBoostedTreeClassifierRunner::writeOneRow( featureImportance->shap( row, [&writer, &classValues]( const maths::CTreeShapFeatureImportance::TSizeVec& indices, - const TStrVec& names, + const TStrVec& featureNames, const maths::CTreeShapFeatureImportance::TVectorVec& shap) { - writer.Key(CDataFrameTrainBoostedTreeRunner::FEATURE_IMPORTANCE_FIELD_NAME); + writer.Key(FEATURE_IMPORTANCE_FIELD_NAME); writer.StartArray(); for (auto i : indices) { if (shap[i].norm() != 0.0) { writer.StartObject(); - writer.Key(CDataFrameTrainBoostedTreeRunner::FEATURE_NAME_FIELD_NAME); - writer.String(names[i]); + writer.Key(FEATURE_NAME_FIELD_NAME); + writer.String(featureNames[i]); if (shap[i].size() == 1) { - writer.Key(CDataFrameTrainBoostedTreeRunner::IMPORTANCE_FIELD_NAME); + writer.Key(IMPORTANCE_FIELD_NAME); writer.Double(shap[i](0)); } else { for (int j = 0; j < shap[i].size(); ++j) { diff --git a/lib/api/CDataFrameTrainBoostedTreeRegressionRunner.cc b/lib/api/CDataFrameTrainBoostedTreeRegressionRunner.cc index 2a146fca6e..92fddd3cc3 100644 --- a/lib/api/CDataFrameTrainBoostedTreeRegressionRunner.cc +++ b/lib/api/CDataFrameTrainBoostedTreeRegressionRunner.cc @@ -96,16 +96,16 @@ void 
CDataFrameTrainBoostedTreeRegressionRunner::writeOneRow( if (featureImportance != nullptr) { featureImportance->shap( row, [&writer](const maths::CTreeShapFeatureImportance::TSizeVec& indices, - const TStrVec& names, + const TStrVec& featureNames, const maths::CTreeShapFeatureImportance::TVectorVec& shap) { - writer.Key(CDataFrameTrainBoostedTreeRunner::FEATURE_IMPORTANCE_FIELD_NAME); + writer.Key(FEATURE_IMPORTANCE_FIELD_NAME); writer.StartArray(); for (auto i : indices) { if (shap[i].norm() != 0.0) { writer.StartObject(); - writer.Key(CDataFrameTrainBoostedTreeRunner::FEATURE_NAME_FIELD_NAME); - writer.String(names[i]); - writer.Key(CDataFrameTrainBoostedTreeRunner::IMPORTANCE_FIELD_NAME); + writer.Key(FEATURE_NAME_FIELD_NAME); + writer.String(featureNames[i]); + writer.Key(IMPORTANCE_FIELD_NAME); writer.Double(shap[i](0)); writer.EndObject(); } diff --git a/lib/api/unittest/CDataFrameAnalysisRunnerTest.cc b/lib/api/unittest/CDataFrameAnalysisRunnerTest.cc index 7cfb5ababa..f6ab101fbb 100644 --- a/lib/api/unittest/CDataFrameAnalysisRunnerTest.cc +++ b/lib/api/unittest/CDataFrameAnalysisRunnerTest.cc @@ -197,7 +197,7 @@ BOOST_AUTO_TEST_CASE(testEstimateMemoryUsageFor10000Rows) { } BOOST_AUTO_TEST_CASE(testEstimateMemoryUsageFor100000Rows) { - testEstimateMemoryUsage(100000, "40mb", "9mb", 0); + testEstimateMemoryUsage(100000, "41mb", "10mb", 0); } BOOST_AUTO_TEST_CASE(testEstimateMemoryUsageFor10000000Rows) { diff --git a/lib/core/CDataFrame.cc b/lib/core/CDataFrame.cc index c65baba8ab..20bd435610 100644 --- a/lib/core/CDataFrame.cc +++ b/lib/core/CDataFrame.cc @@ -119,11 +119,13 @@ std::size_t computeSliceCapacity(std::size_t numberColumns) { CDataFrame::CDataFrame(bool inMainMemory, std::size_t numberColumns, + CAlignment::EType rowAlignment, std::size_t sliceCapacityInRows, EReadWriteToStorage readAndWriteToStoreSyncStrategy, const TWriteSliceToStoreFunc& writeSliceToStore) : m_InMainMemory{inMainMemory}, m_NumberColumns{numberColumns}, - m_RowCapacity{numberColumns}, m_SliceCapacityInRows{sliceCapacityInRows}, + m_RowCapacity{CAlignment::roundup(rowAlignment, numberColumns)}, + m_SliceCapacityInRows{sliceCapacityInRows}, m_RowAlignment{rowAlignment}, m_ReadAndWriteToStoreSyncStrategy{readAndWriteToStoreSyncStrategy}, m_WriteSliceToStore{writeSliceToStore}, m_ColumnNames(numberColumns), m_CategoricalColumnValues(numberColumns), m_MissingString{DEFAULT_MISSING_STRING}, @@ -132,10 +134,15 @@ CDataFrame::CDataFrame(bool inMainMemory, CDataFrame::CDataFrame(bool inMainMemory, std::size_t numberColumns, + CAlignment::EType rowAlignment, EReadWriteToStorage readAndWriteToStoreSyncStrategy, const TWriteSliceToStoreFunc& writeSliceToStore) - : CDataFrame{inMainMemory, numberColumns, computeSliceCapacity(numberColumns), - readAndWriteToStoreSyncStrategy, writeSliceToStore} { + : CDataFrame{inMainMemory, + numberColumns, + rowAlignment, + computeSliceCapacity(numberColumns), + readAndWriteToStoreSyncStrategy, + writeSliceToStore} { } CDataFrame::~CDataFrame() = default; @@ -153,15 +160,20 @@ std::size_t CDataFrame::numberColumns() const { } void CDataFrame::reserve(std::size_t numberThreads, std::size_t rowCapacity) { + + rowCapacity = CAlignment::roundup(m_RowAlignment, rowCapacity); + if (m_RowCapacity >= rowCapacity) { return; } + std::size_t oldRowCapacity{m_RowCapacity}; m_RowCapacity = rowCapacity; - parallel_for_each(numberThreads, m_Slices.begin(), m_Slices.end(), [this](TRowSlicePtr& slice) { - slice->reserve(m_NumberColumns, m_RowCapacity - m_NumberColumns); - }); + 
parallel_for_each(numberThreads, m_Slices.begin(), m_Slices.end(), + [oldRowCapacity, this](TRowSlicePtr& slice) { + slice->reserve(oldRowCapacity, m_RowCapacity - oldRowCapacity); + }); } void CDataFrame::resizeColumns(std::size_t numberThreads, std::size_t numberColumns) { @@ -172,6 +184,26 @@ void CDataFrame::resizeColumns(std::size_t numberThreads, std::size_t numberColu m_NumberColumns = numberColumns; } +CDataFrame::TSizeVec CDataFrame::resizeColumns(std::size_t numberThreads, + const TSizeAlignmentPrVec& extraColumns) { + TSizeVec result; + result.reserve(extraColumns.size()); + std::size_t index{m_NumberColumns}; + for (const auto& columns : extraColumns) { + std::size_t count; + CAlignment::EType alignment; + std::tie(count, alignment) = columns; + if (CAlignment::less(m_RowAlignment, alignment)) { + HANDLE_FATAL(<< "Unsupported column alignment " << CAlignment::print(alignment)); + } + index = CAlignment::roundup(alignment, index); + result.push_back(index); + index += count; + } + this->resizeColumns(numberThreads, index); + return result; +} + CDataFrame::TRowFuncVecBoolPr CDataFrame::readRows(std::size_t numberThreads, std::size_t beginRows, std::size_t endRows, @@ -386,8 +418,11 @@ std::uint64_t CDataFrame::checksum() const { std::size_t CDataFrame::estimateMemoryUsage(bool inMainMemory, std::size_t numberRows, - std::size_t numberColumns) { - return inMainMemory ? numberRows * numberColumns * sizeof(float) : 0; + std::size_t numberColumns, + CAlignment::EType alignment) { + return inMainMemory + ? numberRows * CAlignment::roundupSizeof(alignment, numberColumns) + : 0; } CDataFrame::TRowFuncVecBoolPr @@ -450,13 +485,13 @@ CDataFrame::parallelApplyToAllRows(std::size_t numberThreads, }, std::move(func))); - TRowFuncVec functions; - functions.reserve(results.size()); + TRowFuncVec funcs; + funcs.reserve(results.size()); for (auto& result : results) { - functions.emplace_back(std::move(result.s_FunctionState)); + funcs.emplace_back(std::move(result.s_FunctionState)); } - return {std::move(functions), successful.load()}; + return {std::move(funcs), successful.load()}; } CDataFrame::TRowFuncVecBoolPr @@ -562,7 +597,14 @@ CDataFrame::sequentialApplyToAllRows(std::size_t beginRows, break; } - return {{std::move(func)}, true}; + // TRowFuncVec funcs{std::move(func)}; moves func into an std::inializer_list + // but then *copies* from the list because the standard requires its elements + // are treated as constant, see 8.5.4/5. + TRowFuncVec funcs; + funcs.reserve(1); + funcs.emplace_back(std::move(func)); + + return TRowFuncVecBoolPr{std::move(funcs), true}; } void CDataFrame::applyToRowsOfOneSlice(TRowFunc& func, @@ -634,11 +676,10 @@ void CDataFrame::CDataFrameRowSliceWriter::operator()(const TWriteFunc& writeRow // Write the next row at the end of the current slice being written // and if the slice is full pass to the thread storing slices. 
- std::size_t end{m_RowsOfSliceBeingWritten.size()}; - - m_RowsOfSliceBeingWritten.resize(end + m_RowCapacity); + std::size_t start{m_RowsOfSliceBeingWritten.size()}; + m_RowsOfSliceBeingWritten.resize(start + m_RowCapacity); m_DocHashesOfSliceBeingWritten.emplace_back(); - writeRow(m_RowsOfSliceBeingWritten.begin() + end, + writeRow(m_RowsOfSliceBeingWritten.begin() + start, m_DocHashesOfSliceBeingWritten.back()); ++m_NumberRows; @@ -691,19 +732,20 @@ CDataFrame::CDataFrameRowSliceWriter::finishWritingRows() { std::pair, std::shared_ptr> makeMainStorageDataFrame(std::size_t numberColumns, boost::optional sliceCapacity, - CDataFrame::EReadWriteToStorage readWriteToStoreSyncStrategy) { + CDataFrame::EReadWriteToStorage readWriteToStoreSyncStrategy, + CAlignment::EType alignment) { auto writer = [](std::size_t firstRow, TFloatVec rows, TInt32Vec docHashes) { return std::make_unique( firstRow, std::move(rows), std::move(docHashes)); }; if (sliceCapacity != boost::none) { - return {std::make_unique(true, numberColumns, *sliceCapacity, + return {std::make_unique(true, numberColumns, alignment, *sliceCapacity, readWriteToStoreSyncStrategy, writer), nullptr}; } - return {std::make_unique(true, numberColumns, + return {std::make_unique(true, numberColumns, alignment, readWriteToStoreSyncStrategy, writer), nullptr}; } @@ -713,7 +755,8 @@ makeDiskStorageDataFrame(const std::string& rootDirectory, std::size_t numberColumns, std::size_t numberRows, boost::optional sliceCapacity, - CDataFrame::EReadWriteToStorage readWriteToStoreSyncStrategy) { + CDataFrame::EReadWriteToStorage readWriteToStoreSyncStrategy, + CAlignment::EType alignment) { std::size_t minimumSpace{2 * numberRows * numberColumns * sizeof(CFloatStorage)}; auto directory = std::make_shared(rootDirectory, minimumSpace); @@ -728,11 +771,11 @@ makeDiskStorageDataFrame(const std::string& rootDirectory, }; if (sliceCapacity != boost::none) { - return {std::make_unique(false, numberColumns, *sliceCapacity, + return {std::make_unique(false, numberColumns, alignment, *sliceCapacity, readWriteToStoreSyncStrategy, writer), directory}; } - return {std::make_unique(false, numberColumns, + return {std::make_unique(false, numberColumns, alignment, readWriteToStoreSyncStrategy, writer), directory}; } diff --git a/lib/core/CDataFrameRowSlice.cc b/lib/core/CDataFrameRowSlice.cc index f750db6166..ca405d74b9 100644 --- a/lib/core/CDataFrameRowSlice.cc +++ b/lib/core/CDataFrameRowSlice.cc @@ -20,7 +20,7 @@ namespace ml { namespace core { -using TFloatVec = std::vector; +using TFloatVec = std::vector>; using TFloatVecItr = TFloatVec::iterator; using TInt32Vec = std::vector; using TInt32VecCItr = TInt32Vec::const_iterator; @@ -104,8 +104,8 @@ class CBadDataFrameRowSliceHandle final : public CDataFrameRowSliceHandleImpl { }; //! Checksum \p vec. 
-template -std::uint64_t computeChecksum(const std::vector& vec) { +template +std::uint64_t computeChecksum(const std::vector& vec) { return CHashing::murmurHash64(vec.data(), static_cast(sizeof(T) * vec.size()), 0); } diff --git a/lib/core/Makefile b/lib/core/Makefile index 735f5fe320..01e617a2a0 100644 --- a/lib/core/Makefile +++ b/lib/core/Makefile @@ -15,6 +15,7 @@ USE_BOOST_IOSTREAMS_LIBS=1 USE_BOOST_LOGSETUP_LIBS=1 USE_BOOST_THREAD_LIBS=1 USE_RAPIDJSON=1 +USE_EIGEN=1 USE_XML=1 USE_ZLIB=1 USE_STRPTIME=1 diff --git a/lib/core/unittest/CAlignmentTest.cc b/lib/core/unittest/CAlignmentTest.cc new file mode 100644 index 0000000000..fd720de37c --- /dev/null +++ b/lib/core/unittest/CAlignmentTest.cc @@ -0,0 +1,181 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +#include +#include + +#include + +#include +#include + +BOOST_AUTO_TEST_SUITE(CAlignmentTest) + +using namespace ml; + +BOOST_AUTO_TEST_CASE(testMaxAlignment) { + + // Test some known alignments. + + alignas(32) const char addresses[64]{}; + for (std::size_t i = 0; i < 64; ++i) { + if (i % 32 == 0) { + BOOST_TEST_REQUIRE(core::CAlignment::maxAlignment(&addresses[i]) == + core::CAlignment::E_Aligned32); + } else if (i % 16 == 0) { + BOOST_TEST_REQUIRE(core::CAlignment::maxAlignment(&addresses[i]) == + core::CAlignment::E_Aligned16); + } else if (i % 8 == 0) { + BOOST_TEST_REQUIRE(core::CAlignment::maxAlignment(&addresses[i]) == + core::CAlignment::E_Aligned8); + } else { + BOOST_TEST_REQUIRE(core::CAlignment::maxAlignment(&addresses[i]) == + core::CAlignment::E_Unaligned); + } + } +} + +BOOST_AUTO_TEST_CASE(testIsAligned) { + + // Test some known alignments. + + alignas(32) const char addresses[64]{}; + for (std::size_t i = 0; i < 64; ++i) { + if (i % 32 == 0) { + BOOST_TEST_REQUIRE(core::CAlignment::isAligned( + &addresses[i], core::CAlignment::E_Aligned32)); + BOOST_TEST_REQUIRE(core::CAlignment::isAligned( + &addresses[i], core::CAlignment::E_Aligned16)); + BOOST_TEST_REQUIRE(core::CAlignment::isAligned( + &addresses[i], core::CAlignment::E_Aligned8)); + } else if (i % 16 == 0) { + BOOST_TEST_REQUIRE(core::CAlignment::isAligned( + &addresses[i], core::CAlignment::E_Aligned32) == false); + BOOST_TEST_REQUIRE(core::CAlignment::isAligned( + &addresses[i], core::CAlignment::E_Aligned16)); + BOOST_TEST_REQUIRE(core::CAlignment::isAligned( + &addresses[i], core::CAlignment::E_Aligned8)); + } else if (i % 8 == 0) { + BOOST_TEST_REQUIRE(core::CAlignment::isAligned( + &addresses[i], core::CAlignment::E_Aligned32) == false); + BOOST_TEST_REQUIRE(core::CAlignment::isAligned( + &addresses[i], core::CAlignment::E_Aligned16) == false); + BOOST_TEST_REQUIRE(core::CAlignment::isAligned( + &addresses[i], core::CAlignment::E_Aligned8)); + } else { + BOOST_TEST_REQUIRE(core::CAlignment::isAligned( + &addresses[i], core::CAlignment::E_Aligned32) == false); + BOOST_TEST_REQUIRE(core::CAlignment::isAligned( + &addresses[i], core::CAlignment::E_Aligned16) == false); + BOOST_TEST_REQUIRE(core::CAlignment::isAligned( + &addresses[i], core::CAlignment::E_Aligned8) == false); + } + } +} + +BOOST_AUTO_TEST_CASE(testNextAligned) { + + // Test that next aligned is the first position with the required alignment + // after the current index. 
+ + alignas(32) std::array addresses; + + for (std::size_t i = 0; i < 8; ++i) { + std::size_t i32{core::CAlignment::nextAligned(addresses, i, core::CAlignment::E_Aligned32)}; + BOOST_TEST_REQUIRE(core::CAlignment::isAligned( + &addresses[i32], core::CAlignment::E_Aligned32)); + for (std::size_t j = i + 1; j < i32; ++j) { + BOOST_TEST_REQUIRE(core::CAlignment::isAligned( + &addresses[j], core::CAlignment::E_Aligned32) == false); + } + + std::size_t i16{core::CAlignment::nextAligned(addresses, i, core::CAlignment::E_Aligned16)}; + BOOST_TEST_REQUIRE(core::CAlignment::isAligned( + &addresses[i16], core::CAlignment::E_Aligned16)); + for (std::size_t j = i + 1; j < i16; ++j) { + BOOST_TEST_REQUIRE(core::CAlignment::isAligned( + &addresses[j], core::CAlignment::E_Aligned16) == false); + } + + std::size_t i8{core::CAlignment::nextAligned(addresses, i, core::CAlignment::E_Aligned8)}; + BOOST_TEST_REQUIRE(core::CAlignment::isAligned( + &addresses[i8], core::CAlignment::E_Aligned8)); + for (std::size_t j = i + 1; j < i8; ++j) { + BOOST_TEST_REQUIRE(core::CAlignment::isAligned( + &addresses[j], core::CAlignment::E_Aligned8) == false); + } + } +} + +BOOST_AUTO_TEST_CASE(testRoundup) { + + // Test rounding up the size of a block of char objects generates the expected sizes. + + BOOST_TEST_REQUIRE( + core::CAlignment::roundup(core::CAlignment::E_Aligned32, 0) == 0); + BOOST_TEST_REQUIRE( + core::CAlignment::roundup(core::CAlignment::E_Aligned16, 0) == 0); + BOOST_TEST_REQUIRE(core::CAlignment::roundup(core::CAlignment::E_Aligned8, 0) == 0); + BOOST_TEST_REQUIRE( + core::CAlignment::roundup(core::CAlignment::E_Unaligned, 0) == 0); + for (std::size_t i = 1; i < 128; ++i) { + BOOST_TEST_REQUIRE(core::CAlignment::roundup(core::CAlignment::E_Aligned32, + i) == 32 * ((i + 31) / 32)); + BOOST_TEST_REQUIRE(core::CAlignment::roundup(core::CAlignment::E_Aligned16, + i) == 16 * ((i + 15) / 16)); + BOOST_TEST_REQUIRE(core::CAlignment::roundup(core::CAlignment::E_Aligned8, + i) == 8 * ((i + 7) / 8)); + BOOST_TEST_REQUIRE( + core::CAlignment::roundup(core::CAlignment::E_Unaligned, i) == i); + } +} + +BOOST_AUTO_TEST_CASE(testRoundupSizeof) { + + // Test rounding up the size of a block of float objects generates the expected memory. 
+ + BOOST_TEST_REQUIRE(core::CAlignment::roundupSizeof( + core::CAlignment::E_Aligned32, 0) == 0); + BOOST_TEST_REQUIRE(core::CAlignment::roundupSizeof( + core::CAlignment::E_Aligned16, 0) == 0); + BOOST_TEST_REQUIRE(core::CAlignment::roundupSizeof( + core::CAlignment::E_Aligned8, 0) == 0); + BOOST_TEST_REQUIRE(core::CAlignment::roundupSizeof( + core::CAlignment::E_Unaligned, 0) == 0); + for (std::size_t i = 1; i < 32; ++i) { + BOOST_TEST_REQUIRE( + core::CAlignment::roundupSizeof(core::CAlignment::E_Aligned32, i) == + 32 * ((4 * i + 31) / 32)); + BOOST_TEST_REQUIRE( + core::CAlignment::roundupSizeof(core::CAlignment::E_Aligned16, i) == + 16 * ((4 * i + 15) / 16)); + BOOST_TEST_REQUIRE(core::CAlignment::roundupSizeof( + core::CAlignment::E_Aligned8, i) == 8 * ((4 * i + 7) / 8)); + BOOST_TEST_REQUIRE(core::CAlignment::roundupSizeof( + core::CAlignment::E_Unaligned, i) == 4 * i); + } +} + +BOOST_AUTO_TEST_CASE(testAlignedAllocator) { + + core::CAlignedAllocator allocator; + + std::vector addresses; + + bool aligned32{true}; + for (std::size_t i = 0; i < 20; ++i) { + double* address{allocator.allocate(6)}; + addresses.push_back(address); + aligned32 = aligned32 && + core::CAlignment::isAligned(address, core::CAlignment::E_Aligned32); + } + for (auto& address : addresses) { + allocator.deallocate(address, 6); + } + BOOST_TEST_REQUIRE(aligned32); +} + +BOOST_AUTO_TEST_SUITE_END() diff --git a/lib/core/unittest/CDataFrameTest.cc b/lib/core/unittest/CDataFrameTest.cc index 8b87eb6f2a..f38a20f1ce 100644 --- a/lib/core/unittest/CDataFrameTest.cc +++ b/lib/core/unittest/CDataFrameTest.cc @@ -4,9 +4,11 @@ * you may not use this file except in compliance with the Elastic License. */ +#include #include #include #include +#include #include #include @@ -28,7 +30,8 @@ using namespace ml; namespace { using TBoolVec = std::vector; using TDoubleVec = std::vector; -using TFloatVec = std::vector; +using TFloatVec = + std::vector>; using TFloatVecItr = TFloatVec::iterator; using TFloatVecCItr = TFloatVec::const_iterator; using TSizeFloatVecUMap = boost::unordered_map; @@ -831,4 +834,115 @@ BOOST_FIXTURE_TEST_CASE(testRowMask, CTestFixture) { } } +BOOST_FIXTURE_TEST_CASE(testAlignment, CTestFixture) { + + // Test all the rows have the requested alignment. 
+ + using TAlignedFactoryFunc = + std::function(core::CAlignment::EType)>; + + std::size_t rows{5000}; + std::size_t cols{15}; + std::size_t capacity{1000}; + TFloatVec components{testData(rows, cols)}; + + test::CRandomNumbers rng; + + TAlignedFactoryFunc makeOnDisk = [=](core::CAlignment::EType alignment) { + return core::makeDiskStorageDataFrame( + boost::filesystem::current_path().string(), cols, rows, capacity, + core::CDataFrame::EReadWriteToStorage::E_Async, alignment) + .first; + }; + TAlignedFactoryFunc makeMainMemory = [=](core::CAlignment::EType alignment) { + return core::makeMainStorageDataFrame( + cols, capacity, core::CDataFrame::EReadWriteToStorage::E_Sync, alignment) + .first; + }; + + std::string type[]{"on disk", "main memory"}; + std::size_t t{0}; + for (const auto& factory : {makeOnDisk, makeMainMemory}) { + for (auto alignment : {core::CAlignment::E_Aligned8, core::CAlignment::E_Aligned16, + core::CAlignment::E_Aligned32}) { + LOG_DEBUG(<< "Test aligned " << alignment << " " << type[t]); + + auto frame = factory(alignment); + + for (std::size_t i = 0; i < components.size(); i += cols) { + frame->writeRow(makeWriter(components, cols, i)); + } + frame->finishWritingRows(); + + frame->readRows(1, [alignment](TRowItr beginRows, TRowItr endRows) { + for (auto row = beginRows; row != endRows; ++row) { + BOOST_TEST_REQUIRE(core::CAlignment::isAligned(row->data(), alignment)); + } + }); + } + ++t; + } +} + +BOOST_FIXTURE_TEST_CASE(testAlignedExtraColumns, CTestFixture) { + + // Test all the rows have the requested alignment. + + using TAlignedFactoryFunc = + std::function(core::CAlignment::EType)>; + + std::size_t rows{5000}; + std::size_t cols{15}; + std::size_t capacity{1000}; + TFloatVec components{testData(rows, cols)}; + core::CDataFrame::TSizeAlignmentPrVec extraCols{{2, core::CAlignment::E_Unaligned}, + {3, core::CAlignment::E_Aligned16}, + {1, core::CAlignment::E_Unaligned}}; + test::CRandomNumbers rng; + + TAlignedFactoryFunc makeOnDisk = [=](core::CAlignment::EType alignment) { + return core::makeDiskStorageDataFrame( + boost::filesystem::current_path().string(), cols, rows, capacity, + core::CDataFrame::EReadWriteToStorage::E_Async, alignment) + .first; + }; + TAlignedFactoryFunc makeMainMemory = [=](core::CAlignment::EType alignment) { + return core::makeMainStorageDataFrame( + cols, capacity, core::CDataFrame::EReadWriteToStorage::E_Sync, alignment) + .first; + }; + + std::string type[]{"on disk", "main memory"}; + std::size_t t{0}; + for (const auto& factory : {makeOnDisk, makeMainMemory}) { + for (auto alignment : {core::CAlignment::E_Aligned16, core::CAlignment::E_Aligned32}) { + LOG_DEBUG(<< "Test aligned " << alignment << " " << type[t]); + + auto frame = factory(alignment); + + for (std::size_t i = 0; i < components.size(); i += cols) { + frame->writeRow(makeWriter(components, cols, i)); + } + frame->finishWritingRows(); + + auto offsets = frame->resizeColumns(1, extraCols); + for (std::size_t i = 1; i < offsets.size(); ++i) { + BOOST_TEST_REQUIRE(offsets[i] - offsets[i - 1] >= + extraCols[i - 1].first); + } + + BOOST_TEST_REQUIRE(extraCols.size() == offsets.size()); + frame->readRows(1, [&](TRowItr beginRows, TRowItr endRows) { + for (auto row = beginRows; row != endRows; ++row) { + for (std::size_t i = 0; i < extraCols.size(); ++i) { + BOOST_TEST_REQUIRE(core::CAlignment::isAligned( + row->data() + offsets[i], extraCols[i].second)); + } + } + }); + } + ++t; + } +} + BOOST_AUTO_TEST_SUITE_END() diff --git a/lib/core/unittest/Makefile 
b/lib/core/unittest/Makefile index 85cca582fd..535258553c 100644 --- a/lib/core/unittest/Makefile +++ b/lib/core/unittest/Makefile @@ -26,6 +26,7 @@ CProcessPriorityTest.cc \ SRCS=\ $(OS_SRCS) \ Main.cc \ +CAlignmentTest.cc \ CAllocationStrategyTest.cc \ CBase64FilterTest.cc \ CCompressedDictionaryTest.cc \ diff --git a/lib/maths/CBoostedTree.cc b/lib/maths/CBoostedTree.cc index 60f273b78e..e1353531eb 100644 --- a/lib/maths/CBoostedTree.cc +++ b/lib/maths/CBoostedTree.cc @@ -170,8 +170,8 @@ std::size_t CBoostedTree::columnHoldingDependentVariable() const { CBoostedTree::TDouble2Vec CBoostedTree::readPrediction(const TRowRef& row) const { const auto& loss = m_Impl->loss(); return loss - .transform(boosted_tree_detail::readPrediction( - row, m_Impl->numberInputColumns(), loss.numberParameters())) + .transform(boosted_tree_detail::readPrediction(row, m_Impl->extraColumns(), + loss.numberParameters())) .to(); } @@ -180,7 +180,7 @@ CBoostedTree::TDouble2Vec CBoostedTree::readAndAdjustPrediction(const TRowRef& r const auto& loss = m_Impl->loss(); auto prediction = loss.transform(boosted_tree_detail::readPrediction( - row, m_Impl->numberInputColumns(), loss.numberParameters())); + row, m_Impl->extraColumns(), loss.numberParameters())); switch (loss.type()) { case CLoss::E_BinaryClassification: diff --git a/lib/maths/CBoostedTreeFactory.cc b/lib/maths/CBoostedTreeFactory.cc index 04d9c2e655..70f50c6ad6 100644 --- a/lib/maths/CBoostedTreeFactory.cc +++ b/lib/maths/CBoostedTreeFactory.cc @@ -24,7 +24,6 @@ namespace ml { namespace maths { using namespace boosted_tree_detail; using TDoubleVec = std::vector; -using TSizeVec = std::vector; using TRowItr = core::CDataFrame::TRowItr; namespace { @@ -274,9 +273,9 @@ void CBoostedTreeFactory::initializeNumberFolds(core::CDataFrame& frame) const { } void CBoostedTreeFactory::resizeDataFrame(core::CDataFrame& frame) const { - m_TreeImpl->m_NumberInputColumns = frame.numberColumns(); - frame.resizeColumns(m_TreeImpl->m_NumberThreads, - frame.numberColumns() + this->numberExtraColumnsForTrain()); + std::size_t numberLossParameters{m_TreeImpl->m_Loss->numberParameters()}; + m_TreeImpl->m_ExtraColumns = frame.resizeColumns( + m_TreeImpl->m_NumberThreads, extraColumns(numberLossParameters)); m_TreeImpl->m_Instrumentation->updateMemoryUsage(core::CMemory::dynamicSize(frame)); } @@ -293,11 +292,8 @@ void CBoostedTreeFactory::initializeCrossValidation(core::CDataFrame& frame) con frame.writeColumns(m_NumberThreads, 0, frame.numberRows(), [&](TRowItr beginRows, TRowItr endRows) { - std::size_t column{exampleWeightColumn( - m_TreeImpl->m_NumberInputColumns, - m_TreeImpl->m_Loss->numberParameters())}; for (auto row = beginRows; row != endRows; ++row) { - row->writeColumn(column, 1.0); + writeExampleWeight(*row, m_TreeImpl->m_ExtraColumns, 1.0); } }, &allTrainingRowsMask); diff --git a/lib/maths/CBoostedTreeImpl.cc b/lib/maths/CBoostedTreeImpl.cc index 91c0f639df..59cd52942a 100644 --- a/lib/maths/CBoostedTreeImpl.cc +++ b/lib/maths/CBoostedTreeImpl.cc @@ -290,7 +290,7 @@ void CBoostedTreeImpl::predict(core::CDataFrame& frame) const { m_NumberThreads, 0, frame.numberRows(), [&](TRowItr beginRows, TRowItr endRows) { std::size_t numberLossParameters{m_Loss->numberParameters()}; for (auto row = beginRows; row != endRows; ++row) { - auto prediction = readPrediction(*row, m_NumberInputColumns, numberLossParameters); + auto prediction = readPrediction(*row, m_ExtraColumns, numberLossParameters); prediction = predictRow(m_Encoder->encode(*row), m_BestForest); } }); @@ -406,14 
+406,14 @@ void CBoostedTreeImpl::computeClassificationWeights(const core::CDataFrame& fram // We predict the log-odds but this is expected to return // the log of the predicted class probabilities. TMemoryMappedFloatVector result{&storage[0], 2}; - result.array() = m_Loss - ->transform(readPrediction( - row, m_NumberInputColumns, numberClasses)) - .array() - .log(); + result.array() = + m_Loss + ->transform(readPrediction(row, m_ExtraColumns, numberClasses)) + .array() + .log(); return result; } - return readPrediction(row, m_NumberInputColumns, numberClasses); + return readPrediction(row, m_ExtraColumns, numberClasses); }); break; } @@ -507,9 +507,9 @@ CBoostedTreeImpl::TNodeVec CBoostedTreeImpl::initializePredictionsAndLossDerivat [this](TRowItr beginRows, TRowItr endRows) { std::size_t numberLossParameters{m_Loss->numberParameters()}; for (auto row = beginRows; row != endRows; ++row) { - zeroPrediction(*row, m_NumberInputColumns, numberLossParameters); - zeroLossGradient(*row, m_NumberInputColumns, numberLossParameters); - zeroLossCurvature(*row, m_NumberInputColumns, numberLossParameters); + zeroPrediction(*row, m_ExtraColumns, numberLossParameters); + zeroLossGradient(*row, m_ExtraColumns, numberLossParameters); + zeroLossCurvature(*row, m_ExtraColumns, numberLossParameters); } }, &updateRowMask); @@ -665,7 +665,7 @@ CBoostedTreeImpl::candidateSplits(const core::CDataFrame& frame, [this](const TRowRef& row) { std::size_t numberLossParameters{m_Loss->numberParameters()}; return trace(numberLossParameters, - readLossCurvature(row, m_NumberInputColumns, numberLossParameters)); + readLossCurvature(row, m_ExtraColumns, numberLossParameters)); }) .first; @@ -736,8 +736,8 @@ CBoostedTreeImpl::trainTree(core::CDataFrame& frame, TLeafNodeStatisticsPtrQueue leaves(maximumTreeSize / 2 + 3); leaves.push_back(std::make_shared( - 0 /*root*/, m_NumberInputColumns, m_Loss->numberParameters(), - m_NumberThreads, frame, *m_Encoder, m_Regularization, candidateSplits, + 0 /*root*/, m_ExtraColumns, m_Loss->numberParameters(), m_NumberThreads, + frame, *m_Encoder, m_Regularization, candidateSplits, this->featureBag(), 0 /*depth*/, trainingRowMask)); // We update local variables because the callback can be expensive if it @@ -1006,11 +1006,10 @@ void CBoostedTreeImpl::refreshPredictionsAndLossDerivatives(core::CDataFrame& fr [&](TArgMinLossVec& leafValues_, TRowItr beginRows, TRowItr endRows) { std::size_t numberLossParameters{m_Loss->numberParameters()}; for (auto row = beginRows; row != endRows; ++row) { - auto prediction = readPrediction(*row, m_NumberInputColumns, + auto prediction = readPrediction(*row, m_ExtraColumns, numberLossParameters); double actual{readActual(*row, m_DependentVariable)}; - double weight{readExampleWeight(*row, m_NumberInputColumns, - numberLossParameters)}; + double weight{readExampleWeight(*row, m_ExtraColumns)}; leafValues_[root(tree).leafIndex(m_Encoder->encode(*row), tree)] .add(prediction, actual, weight); } @@ -1040,14 +1039,12 @@ void CBoostedTreeImpl::refreshPredictionsAndLossDerivatives(core::CDataFrame& fr [&](TRowItr beginRows, TRowItr endRows) { std::size_t numberLossParameters{m_Loss->numberParameters()}; for (auto row = beginRows; row != endRows; ++row) { - auto prediction = readPrediction(*row, m_NumberInputColumns, numberLossParameters); + auto prediction = readPrediction(*row, m_ExtraColumns, numberLossParameters); double actual{readActual(*row, m_DependentVariable)}; - double weight{readExampleWeight(*row, m_NumberInputColumns, numberLossParameters)}; + double 
weight{readExampleWeight(*row, m_ExtraColumns)}; prediction += root(tree).value(m_Encoder->encode(*row), tree); - writeLossGradient(*row, m_NumberInputColumns, *m_Loss, - prediction, actual, weight); - writeLossCurvature(*row, m_NumberInputColumns, *m_Loss, - prediction, actual, weight); + writeLossGradient(*row, m_ExtraColumns, *m_Loss, prediction, actual, weight); + writeLossCurvature(*row, m_ExtraColumns, *m_Loss, prediction, actual, weight); } }, &updateRowMask); @@ -1062,8 +1059,7 @@ double CBoostedTreeImpl::meanLoss(const core::CDataFrame& frame, [&](TMeanAccumulator& loss, TRowItr beginRows, TRowItr endRows) { std::size_t numberLossParameters{m_Loss->numberParameters()}; for (auto row = beginRows; row != endRows; ++row) { - auto prediction = readPrediction(*row, m_NumberInputColumns, - numberLossParameters); + auto prediction = readPrediction(*row, m_ExtraColumns, numberLossParameters); double actual{readActual(*row, m_DependentVariable)}; loss.add(m_Loss->value(prediction, actual)); } @@ -1559,8 +1555,8 @@ std::size_t CBoostedTreeImpl::columnHoldingDependentVariable() const { return m_DependentVariable; } -std::size_t CBoostedTreeImpl::numberInputColumns() const { - return m_NumberInputColumns; +const CBoostedTreeImpl::TSizeVec& CBoostedTreeImpl::extraColumns() const { + return m_ExtraColumns; } CBoostedTreeImpl::TVector CBoostedTreeImpl::classificationWeights() const { diff --git a/lib/maths/CBoostedTreeLeafNodeStatistics.cc b/lib/maths/CBoostedTreeLeafNodeStatistics.cc index abd7becb35..79c26a8a15 100644 --- a/lib/maths/CBoostedTreeLeafNodeStatistics.cc +++ b/lib/maths/CBoostedTreeLeafNodeStatistics.cc @@ -28,7 +28,7 @@ const std::size_t ASSIGN_MISSING_TO_RIGHT{1}; CBoostedTreeLeafNodeStatistics::CBoostedTreeLeafNodeStatistics( std::size_t id, - std::size_t numberInputColumns, + const TSizeVec& extraColumns, std::size_t numberLossParameters, std::size_t numberThreads, const core::CDataFrame& frame, @@ -38,8 +38,8 @@ CBoostedTreeLeafNodeStatistics::CBoostedTreeLeafNodeStatistics( const TSizeVec& featureBag, std::size_t depth, const core::CPackedBitVector& rowMask) - : m_Id{id}, m_Depth{depth}, m_NumberInputColumns{numberInputColumns}, - m_NumberLossParameters{numberLossParameters}, m_CandidateSplits{candidateSplits}, m_RowMask{rowMask} { + : m_Id{id}, m_Depth{depth}, m_ExtraColumns{extraColumns}, m_NumberLossParameters{numberLossParameters}, + m_CandidateSplits{candidateSplits}, m_RowMask{rowMask} { this->computeAggregateLossDerivatives(numberThreads, frame, encoder); m_BestSplit = this->computeBestSplitStatistics(regularization, featureBag); @@ -47,7 +47,7 @@ CBoostedTreeLeafNodeStatistics::CBoostedTreeLeafNodeStatistics( CBoostedTreeLeafNodeStatistics::CBoostedTreeLeafNodeStatistics( std::size_t id, - std::size_t numberInputColumns, + const TSizeVec& extraColumns, std::size_t numberLossParameters, std::size_t numberThreads, const core::CDataFrame& frame, @@ -59,7 +59,7 @@ CBoostedTreeLeafNodeStatistics::CBoostedTreeLeafNodeStatistics( std::size_t depth, const CBoostedTreeNode& split, const core::CPackedBitVector& parentRowMask) - : m_Id{id}, m_Depth{depth}, m_NumberInputColumns{numberInputColumns}, + : m_Id{id}, m_Depth{depth}, m_ExtraColumns{extraColumns}, m_NumberLossParameters{numberLossParameters}, m_CandidateSplits{candidateSplits} { this->computeRowMaskAndAggregateLossDerivatives( @@ -74,7 +74,7 @@ CBoostedTreeLeafNodeStatistics::CBoostedTreeLeafNodeStatistics( const TRegularization& regularization, const TSizeVec& featureBag, core::CPackedBitVector rowMask) - : m_Id{id}, 
m_Depth{sibling.m_Depth}, m_NumberInputColumns{sibling.m_NumberInputColumns}, + : m_Id{id}, m_Depth{sibling.m_Depth}, m_ExtraColumns{sibling.m_ExtraColumns}, m_NumberLossParameters{sibling.m_NumberLossParameters}, m_CandidateSplits{sibling.m_CandidateSplits}, m_RowMask{std::move(rowMask)}, m_Derivatives{std::move(parent.m_Derivatives)} { @@ -96,9 +96,9 @@ CBoostedTreeLeafNodeStatistics::split(std::size_t leftChildId, if (this->leftChildHasFewerRows()) { auto leftChild = std::make_shared( - leftChildId, m_NumberInputColumns, m_NumberLossParameters, - numberThreads, frame, encoder, regularization, candidateSplits, - featureBag, true /*is left child*/, m_Depth + 1, split, m_RowMask); + leftChildId, m_ExtraColumns, m_NumberLossParameters, numberThreads, + frame, encoder, regularization, candidateSplits, featureBag, + true /*is left child*/, m_Depth + 1, split, m_RowMask); core::CPackedBitVector rightChildRowMask{std::move(m_RowMask)}; rightChildRowMask ^= leftChild->rowMask(); auto rightChild = std::make_shared( @@ -111,9 +111,9 @@ CBoostedTreeLeafNodeStatistics::split(std::size_t leftChildId, } auto rightChild = std::make_shared( - rightChildId, m_NumberInputColumns, m_NumberLossParameters, - numberThreads, frame, encoder, regularization, candidateSplits, - featureBag, false /*is left child*/, m_Depth + 1, split, m_RowMask); + rightChildId, m_ExtraColumns, m_NumberLossParameters, numberThreads, + frame, encoder, regularization, candidateSplits, featureBag, + false /*is left child*/, m_Depth + 1, split, m_RowMask); core::CPackedBitVector leftChildRowMask{std::move(m_RowMask)}; leftChildRowMask ^= rightChild->rowMask(); auto leftChild = std::make_shared( @@ -171,15 +171,15 @@ CBoostedTreeLeafNodeStatistics::estimateMemoryUsage(std::size_t numberRows, // case for memory usage. This is because the rows will be spread over many // rows so the masks will mainly contain 0 bits in this case. 
std::size_t rowMaskSize{numberRows / PACKED_BIT_VECTOR_MAXIMUM_ROWS_PER_BYTE}; - std::size_t perSplitDerivativesSize{CPerSplitDerivatives::estimateMemoryUsage( + std::size_t splitsDerivativesSize{CSplitsDerivatives::estimateMemoryUsage( numberFeatures, numberSplitsPerFeature, numberLossParameters)}; - return sizeof(CBoostedTreeLeafNodeStatistics) + rowMaskSize + perSplitDerivativesSize; + return sizeof(CBoostedTreeLeafNodeStatistics) + rowMaskSize + splitsDerivativesSize; } void CBoostedTreeLeafNodeStatistics::maybeRecoverMemory() { if (this->gain() <= 0.0) { m_RowMask = core::CPackedBitVector{}; - m_Derivatives = CPerSplitDerivatives{}; + m_Derivatives = CSplitsDerivatives{}; } } @@ -191,12 +191,12 @@ void CBoostedTreeLeafNodeStatistics::computeAggregateLossDerivatives( auto result = frame.readRows( numberThreads, 0, frame.numberRows(), core::bindRetrievableState( - [&](CPerSplitDerivatives& perSplitDerivatives, TRowItr beginRows, TRowItr endRows) { + [&](CSplitsDerivatives& splitsDerivatives, TRowItr beginRows, TRowItr endRows) { for (auto row = beginRows; row != endRows; ++row) { - this->addRowDerivatives(encoder.encode(*row), perSplitDerivatives); + this->addRowDerivatives(encoder.encode(*row), splitsDerivatives); } }, - CPerSplitDerivatives{m_CandidateSplits, m_NumberLossParameters}), + CSplitsDerivatives{m_CandidateSplits, m_NumberLossParameters}), &m_RowMask); m_Derivatives = std::move(result.first[0].s_FunctionState); @@ -217,22 +217,22 @@ void CBoostedTreeLeafNodeStatistics::computeRowMaskAndAggregateLossDerivatives( auto result = frame.readRows( numberThreads, 0, frame.numberRows(), core::bindRetrievableState( - [&](std::pair& state, + [&](std::pair& state, TRowItr beginRows, TRowItr endRows) { auto& mask = state.first; - auto& perSplitDerivatives = state.second; + auto& splitsDerivatives = state.second; for (auto row = beginRows; row != endRows; ++row) { auto encodedRow = encoder.encode(*row); if (split.assignToLeft(encodedRow) == isLeftChild) { std::size_t index{row->index()}; mask.extend(false, index - mask.size()); mask.extend(true); - this->addRowDerivatives(encodedRow, perSplitDerivatives); + this->addRowDerivatives(encodedRow, splitsDerivatives); } } }, std::make_pair(core::CPackedBitVector{}, - CPerSplitDerivatives{m_CandidateSplits, m_NumberLossParameters})), + CSplitsDerivatives{m_CandidateSplits, m_NumberLossParameters})), &parentRowMask); for (auto& mask_ : result.first) { @@ -250,18 +250,17 @@ void CBoostedTreeLeafNodeStatistics::computeRowMaskAndAggregateLossDerivatives( } void CBoostedTreeLeafNodeStatistics::addRowDerivatives(const CEncodedDataFrameRowRef& row, - CPerSplitDerivatives& perSplitDerivatives) const { + CSplitsDerivatives& splitsDerivatives) const { const TRowRef& unencodedRow{row.unencodedRow()}; - auto gradient = readLossGradient(unencodedRow, m_NumberInputColumns, m_NumberLossParameters); - auto curvature = readLossCurvature(unencodedRow, m_NumberInputColumns, m_NumberLossParameters); + auto derivatives = readLossDerivatives(unencodedRow, m_ExtraColumns, m_NumberLossParameters); for (std::size_t feature = 0; feature < m_CandidateSplits.size(); ++feature) { double featureValue{row[feature]}; if (CDataFrameUtils::isMissing(featureValue)) { - perSplitDerivatives.addMissingDerivatives(feature, gradient, curvature); + splitsDerivatives.addMissingDerivatives(feature, derivatives); } else { std::ptrdiff_t split{m_CandidateSplits[feature].upperBound(featureValue)}; - perSplitDerivatives.addDerivatives(feature, split, gradient, curvature); + 
splitsDerivatives.addDerivatives(feature, split, derivatives); } } } diff --git a/lib/maths/CBoostedTreeUtils.cc b/lib/maths/CBoostedTreeUtils.cc index 934aa4610c..efaa8d9967 100644 --- a/lib/maths/CBoostedTreeUtils.cc +++ b/lib/maths/CBoostedTreeUtils.cc @@ -13,44 +13,57 @@ namespace ml { namespace maths { namespace boosted_tree_detail { using namespace boosted_tree; +namespace { +enum EExtraColumn { E_Prediction = 0, E_Gradient, E_Curvature, E_Weight }; +} + +TSizeAlignmentPrVec extraColumns(std::size_t numberLossParameters) { + return {{numberLossParameters, core::CAlignment::E_Unaligned}, + {numberLossParameters, core::CAlignment::E_Aligned16}, + {numberLossParameters * numberLossParameters, core::CAlignment::E_Unaligned}, + {1, core::CAlignment::E_Unaligned}}; +} TMemoryMappedFloatVector readPrediction(const TRowRef& row, - std::size_t numberInputColumns, - std::size_t numberLossParamaters) { - return {row.data() + predictionColumn(numberInputColumns), - static_cast(numberLossParamaters)}; + const TSizeVec& extraColumns, + std::size_t numberLossParameters) { + return {row.data() + extraColumns[E_Prediction], static_cast(numberLossParameters)}; } -void zeroPrediction(const TRowRef& row, std::size_t numberInputColumns, std::size_t numberLossParamaters) { - std::size_t offset{predictionColumn(numberInputColumns)}; - for (std::size_t i = 0; i < numberLossParamaters; ++i) { - row.writeColumn(offset + i, 0.0); +void zeroPrediction(const TRowRef& row, const TSizeVec& extraColumns, std::size_t numberLossParameters) { + for (std::size_t i = 0; i < numberLossParameters; ++i) { + row.writeColumn(extraColumns[E_Prediction] + i, 0.0); } } +TAlignedMemoryMappedFloatVector readLossDerivatives(const TRowRef& row, + const TSizeVec& extraColumns, + std::size_t numberLossParameters) { + return {row.data() + extraColumns[E_Gradient], + static_cast(numberLossParameters + + lossHessianUpperTriangleSize(numberLossParameters))}; +} + TMemoryMappedFloatVector readLossGradient(const TRowRef& row, - std::size_t numberInputColumns, + const TSizeVec& extraColumns, std::size_t numberLossParameters) { - return {row.data() + lossGradientColumn(numberInputColumns, numberLossParameters), - static_cast(numberLossParameters)}; + return {row.data() + extraColumns[E_Gradient], static_cast(numberLossParameters)}; } -void zeroLossGradient(const TRowRef& row, std::size_t numberInputColumns, std::size_t numberLossParameters) { - std::size_t offset{lossGradientColumn(numberInputColumns, numberLossParameters)}; +void zeroLossGradient(const TRowRef& row, const TSizeVec& extraColumns, std::size_t numberLossParameters) { for (std::size_t i = 0; i < numberLossParameters; ++i) { - row.writeColumn(offset + i, 0.0); + row.writeColumn(extraColumns[E_Gradient] + i, 0.0); } } void writeLossGradient(const TRowRef& row, - std::size_t numberInputColumns, + const TSizeVec& extraColumns, const CLoss& loss, const TMemoryMappedFloatVector& prediction, double actual, double weight) { - std::size_t offset{lossGradientColumn(numberInputColumns, prediction.size())}; - auto writer = [&row, offset](std::size_t i, double value) { - row.writeColumn(offset + i, value); + auto writer = [&row, &extraColumns](std::size_t i, double value) { + row.writeColumn(extraColumns[E_Gradient] + i, value); }; // We wrap the writer in another lambda which we know takes advantage // of std::function small size optimization to avoid heap allocations. 
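The comment above about wrapping the writer relies on a detail worth spelling out: std::function stores small callables inline (the small buffer optimisation) and heap allocates larger ones, so a thin lambda capturing a single reference to the "fat" writer typically stays within the inline buffer. A hedged sketch of the pattern, with illustrative names standing in for the loss interface:

    #include <cstddef>
    #include <functional>
    #include <vector>

    using TWriter = std::function<void(std::size_t, double)>;

    // Stands in for CLoss::gradient, which takes its writer as a
    // std::function parameter.
    void callWithWriter(const TWriter& writer) {
        writer(0, 42.0);
    }

    int main() {
        std::vector<double> row(4, 0.0);
        std::vector<std::size_t> extraColumns{0, 1, 2, 3};

        auto writer = [&row, &extraColumns](std::size_t i, double value) {
            row[extraColumns[1] + i] = value;
        };

        // The wrapper captures one reference, which on common standard
        // library implementations fits std::function's small buffer and so
        // avoids a heap allocation when the parameter is constructed.
        callWithWriter([&writer](std::size_t i, double value) { writer(i, value); });

        return 0;
    }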
@@ -59,29 +72,27 @@ void writeLossGradient(const TRowRef& row, } TMemoryMappedFloatVector readLossCurvature(const TRowRef& row, - std::size_t numberInputColumns, + const TSizeVec& extraColumns, std::size_t numberLossParameters) { - return {row.data() + lossCurvatureColumn(numberInputColumns, numberLossParameters), - static_cast(lossHessianStoredSize(numberLossParameters))}; + return {row.data() + extraColumns[E_Curvature], + static_cast(lossHessianUpperTriangleSize(numberLossParameters))}; } -void zeroLossCurvature(const TRowRef& row, std::size_t numberInputColumns, std::size_t numberLossParameters) { - std::size_t offset{lossCurvatureColumn(numberInputColumns, numberLossParameters)}; - for (std::size_t i = 0, size = lossHessianStoredSize(numberLossParameters); +void zeroLossCurvature(const TRowRef& row, const TSizeVec& extraColumns, std::size_t numberLossParameters) { + for (std::size_t i = 0, size = lossHessianUpperTriangleSize(numberLossParameters); i < size; ++i) { - row.writeColumn(offset + i, 0.0); + row.writeColumn(extraColumns[E_Curvature] + i, 0.0); } } void writeLossCurvature(const TRowRef& row, - std::size_t numberInputColumns, + const TSizeVec& extraColumns, const CLoss& loss, const TMemoryMappedFloatVector& prediction, double actual, double weight) { - std::size_t offset{lossCurvatureColumn(numberInputColumns, prediction.size())}; - auto writer = [&row, offset](std::size_t i, double value) { - row.writeColumn(offset + i, value); + auto writer = [&row, &extraColumns](std::size_t i, double value) { + row.writeColumn(extraColumns[E_Curvature] + i, value); }; // We wrap the writer in another lambda which we know takes advantage // of std::function small size optimization to avoid heap allocations. @@ -89,10 +100,12 @@ void writeLossCurvature(const TRowRef& row, [&writer](std::size_t i, double value) { writer(i, value); }, weight); } -double readExampleWeight(const TRowRef& row, - std::size_t numberInputColumns, - std::size_t numberLossParameters) { - return row[exampleWeightColumn(numberInputColumns, numberLossParameters)]; +double readExampleWeight(const TRowRef& row, const TSizeVec& extraColumns) { + return row[extraColumns[E_Weight]]; +} + +void writeExampleWeight(const TRowRef& row, const TSizeVec& extraColumns, double weight) { + row.writeColumn(extraColumns[E_Weight], weight); } double readActual(const TRowRef& row, std::size_t dependentVariable) { diff --git a/lib/maths/COutliers.cc b/lib/maths/COutliers.cc index e7eae3b466..a004f75a32 100644 --- a/lib/maths/COutliers.cc +++ b/lib/maths/COutliers.cc @@ -6,6 +6,7 @@ #include +#include #include #include #include @@ -34,6 +35,8 @@ const std::string COMPUTE_OUTLIER_SCORES{"compute_outlier_scores"}; using TRowItr = core::CDataFrame::TRowItr; using TStepCallback = std::function; +using TMemoryMappedFloatVector = CMemoryMappedDenseVector; +using TDenseFloatVector = CDenseVector; double shift(double score) { return std::exp(-2.0) + score; @@ -878,7 +881,7 @@ bool computeOutliersNoPartitions(const COutliers::SComputeParameters& params, core::CDataFrame& frame, CDataFrameAnalysisInstrumentationInterface& instrumentation) { - using TPoint = CMemoryMappedDenseVector; + using TPoint = TMemoryMappedFloatVector; using TPointVec = std::vector; std::int64_t frameMemory{signedMemoryUsage(frame)}; @@ -964,7 +967,7 @@ bool computeOutliersPartitioned(const COutliers::SComputeParameters& params, core::CDataFrame& frame, CDataFrameAnalysisInstrumentationInterface& instrumentation) { - using TPoint = CDenseVector; + using TPoint = 
TDenseFloatVector; using TPointVec = std::vector; core::CStopWatch watch{true}; @@ -1079,9 +1082,9 @@ std::size_t COutliers::estimateMemoryUsedByCompute(const SComputeParameters& par std::size_t partitionNumberPoints, std::size_t dimension) { return params.s_NumberPartitions == 1 - ? COutliers::estimateMemoryUsedByCompute>( + ? COutliers::estimateMemoryUsedByCompute( params, totalNumberPoints, partitionNumberPoints, dimension) - : COutliers::estimateMemoryUsedByCompute>( + : COutliers::estimateMemoryUsedByCompute( params, totalNumberPoints, partitionNumberPoints, dimension); } diff --git a/lib/maths/CTreeShapFeatureImportance.cc b/lib/maths/CTreeShapFeatureImportance.cc index 9a1f6e8f27..240ea1eee8 100644 --- a/lib/maths/CTreeShapFeatureImportance.cc +++ b/lib/maths/CTreeShapFeatureImportance.cc @@ -4,7 +4,6 @@ * you may not use this file except in compliance with the Elastic License. */ -#include "core/Concurrency.h" #include #include @@ -24,12 +23,8 @@ CTreeShapFeatureImportance::CTreeShapFeatureImportance(const core::CDataFrame& f const CDataFrameCategoryEncoder& encoder, TTreeVec& forest, std::size_t numberTopShapValues) - : m_NumberTopShapValues{numberTopShapValues}, m_Encoder{&encoder}, m_Forest{&forest} { - - m_ColumnNames.reserve(frame.columnNames().size()); - for (const auto& name : frame.columnNames()) { - m_ColumnNames.push_back(name); - } + : m_NumberTopShapValues{numberTopShapValues}, m_Encoder{&encoder}, m_Forest{&forest}, + m_ColumnNames{frame.columnNames()} { // When traversing a tree, we successively copy the parent path and add one // new element to it. This means that if a tree has maxDepth depth, we store diff --git a/lib/maths/unittest/CBoostedTreeLeafNodeStatisticsTest.cc b/lib/maths/unittest/CBoostedTreeLeafNodeStatisticsTest.cc index a0a2c28584..f27423baef 100644 --- a/lib/maths/unittest/CBoostedTreeLeafNodeStatisticsTest.cc +++ b/lib/maths/unittest/CBoostedTreeLeafNodeStatisticsTest.cc @@ -22,7 +22,9 @@ using TDoubleVec = std::vector; using TDoubleVecVec = std::vector; using TSizeVec = std::vector; using TSizeVecVec = std::vector; -using TFloatVec = std::vector; +using TAlignedFloatVec = + std::vector>; +using TAlignedDoubleVec = std::vector>; using TVector = maths::CDenseVector; using TVectorVec = std::vector; using TVectorVecVec = std::vector; @@ -32,22 +34,22 @@ using TMatrixVecVec = std::vector; using TImmutableRadixSet = maths::CBoostedTreeLeafNodeStatistics::TImmutableRadixSet; using TImmutableRadixSetVec = maths::CBoostedTreeLeafNodeStatistics::TImmutableRadixSetVec; using TDerivatives = maths::CBoostedTreeLeafNodeStatistics::CDerivatives; -using TPerSplitDerivatives = maths::CBoostedTreeLeafNodeStatistics::CPerSplitDerivatives; +using TSplitsDerivatives = maths::CBoostedTreeLeafNodeStatistics::CSplitsDerivatives; namespace { template -maths::CMemoryMappedDenseVector makeGradient(T* storage, std::size_t n) { +maths::CMemoryMappedDenseVector makeVector(T* storage, std::size_t n) { return maths::CMemoryMappedDenseVector{storage, static_cast(n)}; } -template -maths::CMemoryMappedDenseVector makeCurvature(T* storage, std::size_t n) { - return maths::CMemoryMappedDenseVector(storage, static_cast(n)); +template +maths::CMemoryMappedDenseVector makeAlignedVector(T* storage, std::size_t n) { + return maths::CMemoryMappedDenseVector{storage, static_cast(n)}; } template -TMatrix rowMajorHessian(std::size_t n, const maths::CMemoryMappedDenseVector& curvatures) { +TMatrix columnMajorHessian(std::size_t n, const maths::CMemoryMappedDenseVector& curvatures) { TMatrix 
result{n, n}; for (std::size_t i = 0, k = 0; i < n; ++i) { for (std::size_t j = i; j < n; ++j, ++k) { @@ -77,20 +79,24 @@ void testDerivativesFor(std::size_t numberParameters) { LOG_DEBUG(<< "Accumulate"); - TDoubleVec storage1(numberGradients * (numberGradients + 1), 0.0); - TDerivatives derivatives1{numberParameters, &storage1[0]}; + std::size_t paddedNumberGradients{core::CAlignment::roundup( + core::CAlignment::E_Aligned16, numberGradients)}; + + TAlignedDoubleVec storage1(paddedNumberGradients + numberGradients * numberGradients, 0.0); + TDerivatives derivatives1{numberParameters, &storage1[0], + &storage1[paddedNumberGradients]}; for (std::size_t j = 0; j < 10; ++j) { - TFloatVec storage; + TAlignedFloatVec rowStorage; for (std::size_t i = 0; i < numberGradients; ++i) { - storage.push_back(gradients[i][j]); + rowStorage.push_back(gradients[i][j]); } for (std::size_t i = 0; i < numberCurvatures; ++i) { - storage.push_back(curvatures[i][j]); + rowStorage.push_back(curvatures[i][j]); } - auto gradient = makeGradient(&storage[0], numberGradients); - auto curvature = makeCurvature(&storage[numberGradients], numberCurvatures); - derivatives1.add(1, gradient, curvature); + auto derivatives_ = makeAlignedVector( + &rowStorage[0], numberGradients + numberCurvatures); + derivatives1.add(1, derivatives_); } derivatives1.remapCurvature(); @@ -110,20 +116,21 @@ void testDerivativesFor(std::size_t numberParameters) { LOG_DEBUG(<< "Merge"); - TDoubleVec storage2(numberGradients * (numberGradients + 1), 0.0); - TDerivatives derivatives2{numberParameters, &storage2[0]}; + TAlignedDoubleVec storage2(paddedNumberGradients + numberGradients * numberGradients, 0.0); + TDerivatives derivatives2{numberParameters, &storage2[0], + &storage2[paddedNumberGradients]}; for (std::size_t j = 10; j < 20; ++j) { - TFloatVec storage; + TAlignedFloatVec storage; for (std::size_t i = 0; i < numberGradients; ++i) { storage.push_back(gradients[i][j]); } for (std::size_t i = 0; i < numberCurvatures; ++i) { storage.push_back(curvatures[i][j]); } - auto gradient = makeGradient(&storage[0], numberGradients); - auto curvature = makeCurvature(&storage[numberGradients], numberCurvatures); - derivatives2.add(1, gradient, curvature); + auto derivatives = makeAlignedVector( + &storage[0], numberGradients + numberCurvatures); + derivatives2.add(1, derivatives); } derivatives2.remapCurvature(); @@ -204,36 +211,37 @@ void testPerSplitDerivativesFor(std::size_t numberParameters) { TMatrix::Zero(numberParameters, numberParameters)); } - auto addDerivatives = [&](TPerSplitDerivatives& derivatives) { + auto addDerivatives = [&](TSplitsDerivatives& derivatives) { for (std::size_t i = 0, j = 0, k = 0; i < numberSamples; ++i, j += numberGradients, k += numberCurvatures) { - TFloatVec storage; + TAlignedFloatVec storage; storage.insert(storage.end(), &gradients[j], &gradients[j + numberGradients]); storage.insert(storage.end(), &curvatures[j], &curvatures[k + numberCurvatures]); - auto gradient = makeGradient(&storage[0], numberGradients); - auto curvature = makeCurvature(&storage[numberGradients], numberCurvatures); + auto derivatives_ = makeAlignedVector( + &storage[0], numberGradients + numberCurvatures); + auto gradient = makeVector(&storage[0], numberGradients); + auto curvature = makeVector(&storage[numberGradients], numberCurvatures); if (uniform01[i] < 0.1) { - derivatives.addMissingDerivatives(features[i], gradient, curvature); + derivatives.addMissingDerivatives(features[i], derivatives_); ++expectedMissingCounts[features[i]]; 
expectedMissingGradients[features[i]] += gradient; expectedMissingCurvatures[features[i]] += - rowMajorHessian(numberParameters, curvature); + columnMajorHessian(numberParameters, curvature); } else { - derivatives.addDerivatives(features[i], splits[features[i]][i], - gradient, curvature); + derivatives.addDerivatives(features[i], splits[features[i]][i], derivatives_); ++expectedCounts[features[i]][splits[features[i]][i]]; expectedGradients[features[i]][splits[features[i]][i]] += gradient; expectedCurvatures[features[i]][splits[features[i]][i]] += - rowMajorHessian(numberParameters, curvature); + columnMajorHessian(numberParameters, curvature); } } derivatives.remapCurvature(); }; - auto validate = [&](const TPerSplitDerivatives& derivatives) { + auto validate = [&](const TSplitsDerivatives& derivatives) { for (std::size_t i = 0; i < expectedCounts.size(); ++i) { for (std::size_t j = 0; j < expectedGradients[i].size(); ++j) { TMatrix curvature{ @@ -256,7 +264,7 @@ void testPerSplitDerivativesFor(std::size_t numberParameters) { LOG_TRACE(<< "Test accumulation"); - TPerSplitDerivatives derivatives1{featureSplits, numberParameters}; + TSplitsDerivatives derivatives1{featureSplits, numberParameters}; addDerivatives(derivatives1); validate(derivatives1); @@ -267,7 +275,7 @@ void testPerSplitDerivativesFor(std::size_t numberParameters) { rng.generateUniformSamples(-1.5, 1.0, numberSamples * numberGradients, gradients); rng.generateUniformSamples(0.1, 0.5, numberSamples * numberCurvatures, curvatures); - TPerSplitDerivatives derivatives2{featureSplits, numberParameters}; + TSplitsDerivatives derivatives2{featureSplits, numberParameters}; addDerivatives(derivatives2); derivatives1.add(derivatives2); @@ -275,7 +283,7 @@ void testPerSplitDerivativesFor(std::size_t numberParameters) { LOG_TRACE(<< "Test copy"); - TPerSplitDerivatives derivatives3{derivatives1}; + TSplitsDerivatives derivatives3{derivatives1}; BOOST_REQUIRE_EQUAL(derivatives1.checksum(), derivatives3.checksum()); } } diff --git a/lib/maths/unittest/CDataFrameCategoryEncoderTest.cc b/lib/maths/unittest/CDataFrameCategoryEncoderTest.cc index 2936d350dd..4530f1e06b 100644 --- a/lib/maths/unittest/CDataFrameCategoryEncoderTest.cc +++ b/lib/maths/unittest/CDataFrameCategoryEncoderTest.cc @@ -31,7 +31,8 @@ using TDoubleVec = std::vector; using TDoubleVecVec = std::vector; using TSizeVec = std::vector; using TSizeVecVec = std::vector; -using TFloatVec = std::vector; +using TFloatVec = + std::vector>; using TMeanAccumulator = maths::CBasicStatistics::SSampleMean::TAccumulator; using TMeanAccumulatorVec = std::vector; using TMeanAccumulatorVecVec = std::vector; @@ -604,7 +605,7 @@ BOOST_AUTO_TEST_CASE(testUnseenCategoryEncoding) { maths::CDataFrameCategoryEncoder encoder{{1, *frame, 3}}; - TFloatVec unseen{3.0, 5.0, 4.0, 1.5}; + TFloatVec unseen{3.0f, 5.0f, 4.0f, 1.5f}; core::CDataFrame::TRowRef row{rows, unseen.begin(), unseen.end(), 0}; auto encodedRow = encoder.encode(row); diff --git a/lib/maths/unittest/COutliersTest.cc b/lib/maths/unittest/COutliersTest.cc index 776af37c07..dfb96853a7 100644 --- a/lib/maths/unittest/COutliersTest.cc +++ b/lib/maths/unittest/COutliersTest.cc @@ -4,6 +4,7 @@ * you may not use this file except in compliance with the Elastic License. 
*/ +#include #include #include #include @@ -595,7 +596,7 @@ BOOST_AUTO_TEST_CASE(testEstimateMemoryUsedByCompute) { 0.05}; // Outlier fraction std::int64_t estimatedMemoryUsage( - core::CDataFrame::estimateMemoryUsage(i == 0, 40500, 6) + + core::CDataFrame::estimateMemoryUsage(i == 0, 40500, 6, core::CAlignment::E_Aligned16) + maths::COutliers::estimateMemoryUsedByCompute( params, numberPoints, (numberPoints + numberPartitions[i] - 1) / numberPartitions[i], @@ -624,7 +625,7 @@ BOOST_AUTO_TEST_CASE(testEstimateMemoryUsedByCompute) { LOG_DEBUG(<< "estimated peak memory = " << estimatedMemoryUsage); LOG_DEBUG(<< "high water mark = " << maxMemoryUsage); BOOST_TEST_REQUIRE(std::abs(maxMemoryUsage - estimatedMemoryUsage) < - std::max(maxMemoryUsage.load(), estimatedMemoryUsage) / 10); + std::max(maxMemoryUsage.load(), estimatedMemoryUsage) / 6); } } diff --git a/mk/linux.mk b/mk/linux.mk index f11e436f28..2c6d845536 100644 --- a/mk/linux.mk +++ b/mk/linux.mk @@ -75,7 +75,7 @@ else RAPIDJSONCPPFLAGS=-DRAPIDJSON_HAS_STDSTRING -DRAPIDJSON_SSE42 endif EIGENINCLUDES=-isystem $(CPP_SRC_HOME)/3rd_party/eigen -EIGENCPPFLAGS=-DEIGEN_MPL2_ONLY +EIGENCPPFLAGS=-DEIGEN_MPL2_ONLY -DEIGEN_MAX_ALIGN_BYTES=32 XMLINCLUDES=`/usr/local/gcc75/bin/xml2-config --cflags` XMLLIBS=`/usr/local/gcc75/bin/xml2-config --libs` DYNAMICLIBLDFLAGS=$(PLATPICFLAGS) -shared -Wl,--as-needed -L$(CPP_PLATFORM_HOME)/$(DYNAMIC_LIB_DIR) $(COVERAGE) -Wl,-z,relro -Wl,-z,now -Wl,-rpath,'$$ORIGIN/.' diff --git a/mk/linux_crosscompile_linux.mk b/mk/linux_crosscompile_linux.mk index a32eef7d72..4d34bbaa6a 100644 --- a/mk/linux_crosscompile_linux.mk +++ b/mk/linux_crosscompile_linux.mk @@ -76,7 +76,7 @@ else RAPIDJSONCPPFLAGS=-DRAPIDJSON_HAS_STDSTRING endif EIGENINCLUDES=-isystem $(CPP_SRC_HOME)/3rd_party/eigen -EIGENCPPFLAGS=-DEIGEN_MPL2_ONLY +EIGENCPPFLAGS=-DEIGEN_MPL2_ONLY -DEIGEN_MAX_ALIGN_BYTES=32 XMLINCLUDES=-I$(SYSROOT)/usr/local/gcc75/include/libxml2 XMLLIBS=-L$(SYSROOT)/usr/local/gcc75/lib -lxml2 -lz -lm -ldl DYNAMICLIBLDFLAGS=$(PLATPICFLAGS) -shared -Wl,--as-needed -L$(CPP_PLATFORM_HOME)/$(DYNAMIC_LIB_DIR) $(COVERAGE) -Wl,-z,relro -Wl,-z,now -Wl,-rpath,'$$ORIGIN/.' 
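For context on the -DEIGEN_MAX_ALIGN_BYTES=32 definition being added to every platform makefile: it tells Eigen to align its allocations to 32 bytes, the alignment 256-bit (AVX) loads and stores want, and it is what makes the CAlignedAllocator typedef above hand back 32 byte aligned memory. A small sketch that checks the effect, assuming it is compiled with the same flag:

    #include <Eigen/Core>

    #include <cstdint>
    #include <iostream>

    int main() {
        // Compile-time check that the build flag took effect.
        static_assert(EIGEN_MAX_ALIGN_BYTES == 32, "expected 32 byte max alignment");

        // Eigen::aligned_allocator aligns to Eigen's maximum alignment.
        Eigen::aligned_allocator<float> allocator;
        float* memory{allocator.allocate(100)};
        bool aligned32{(reinterpret_cast<std::uintptr_t>(memory) & 0x1F) == 0};
        std::cout << "32 byte aligned: " << (aligned32 ? "yes" : "no") << '\n';
        allocator.deallocate(memory, 100);

        return 0;
    }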
diff --git a/mk/linux_crosscompile_macosx.mk b/mk/linux_crosscompile_macosx.mk index 1734e8be23..eb79720ea6 100644 --- a/mk/linux_crosscompile_macosx.mk +++ b/mk/linux_crosscompile_macosx.mk @@ -72,7 +72,7 @@ BOOSTTESTLIBS=-lboost_unit_test_framework-clang-darwin$(BOOSTCLANGVER)-mt-x64-$( RAPIDJSONINCLUDES=-isystem $(CPP_SRC_HOME)/3rd_party/rapidjson/include RAPIDJSONCPPFLAGS=-DRAPIDJSON_HAS_STDSTRING -DRAPIDJSON_SSE42 EIGENINCLUDES=-isystem $(CPP_SRC_HOME)/3rd_party/eigen -EIGENCPPFLAGS=-DEIGEN_MPL2_ONLY +EIGENCPPFLAGS=-DEIGEN_MPL2_ONLY -DEIGEN_MAX_ALIGN_BYTES=32 XMLINCLUDES=-isystem $(SYSROOT)/usr/include/libxml2 XMLLIBLDFLAGS=-L$(SYSROOT)/usr/lib XMLLIBS=-lxml2 diff --git a/mk/macosx.mk b/mk/macosx.mk index c39b0de3f7..61933f6116 100644 --- a/mk/macosx.mk +++ b/mk/macosx.mk @@ -62,7 +62,7 @@ BOOSTTESTLIBS=-lboost_unit_test_framework-clang-darwin$(BOOSTCLANGVER)-mt-x64-$( RAPIDJSONINCLUDES=-isystem $(CPP_SRC_HOME)/3rd_party/rapidjson/include RAPIDJSONCPPFLAGS=-DRAPIDJSON_HAS_STDSTRING -DRAPIDJSON_SSE42 EIGENINCLUDES=-isystem $(CPP_SRC_HOME)/3rd_party/eigen -EIGENCPPFLAGS=-DEIGEN_MPL2_ONLY +EIGENCPPFLAGS=-DEIGEN_MPL2_ONLY -DEIGEN_MAX_ALIGN_BYTES=32 XMLINCLUDES=-isystem $(SDK_PATH)/usr/include/libxml2 XMLLIBLDFLAGS=-L/usr/lib XMLLIBS=-lxml2 diff --git a/mk/windows.mk b/mk/windows.mk index d9d55d020d..a1517a2d98 100644 --- a/mk/windows.mk +++ b/mk/windows.mk @@ -94,7 +94,7 @@ RAPIDJSONCPPFLAGS=-DRAPIDJSON_HAS_STDSTRING -DRAPIDJSON_SSE42 # Eigen automatically uses SSE and SSE2 on 64 bit Windows - only the higher # versions need to be explicitly enabled EIGENINCLUDES=-I$(CPP_SRC_HOME)/3rd_party/eigen -EIGENCPPFLAGS=-DEIGEN_MPL2_ONLY -DEIGEN_VECTORIZE_SSE3 -DEIGEN_VECTORIZE_SSE4_1 -DEIGEN_VECTORIZE_SSE4_2 +EIGENCPPFLAGS=-DEIGEN_MPL2_ONLY -DEIGEN_VECTORIZE_SSE3 -DEIGEN_VECTORIZE_SSE4_1 -DEIGEN_VECTORIZE_SSE4_2 -DEIGEN_MAX_ALIGN_BYTES=32 XMLINCLUDES=-I$(LOCAL_DRIVE):/usr/local/include/libxml2 XMLLIBLDFLAGS=-LIBPATH:$(LOCAL_DRIVE):/usr/local/lib XMLLIBS=libxml2.lib
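Tying the pieces together: the extra column layout defined in CBoostedTreeUtils.cc above puts only the gradient block on a 16 byte boundary, because that is the block memory mapped into an Eigen vector for vectorised arithmetic. The offset computation performed by CDataFrame::resizeColumns can be sketched standalone as follows; this is a simplified re-implementation for illustration only, with alignments expressed in units of 4 byte floats and assuming each row starts on a sufficiently aligned address, which the frame's row alignment check guarantees.

    #include <cstddef>
    #include <iostream>
    #include <utility>
    #include <vector>

    int main() {
        // (column count, alignment in floats) mirroring extraColumns(...) for
        // a loss with 2 parameters: prediction, gradient, curvature, weight.
        // E_Aligned16 is 4 floats; E_Unaligned is 1.
        std::vector<std::pair<std::size_t, std::size_t>> extraColumns{
            {2, 1}, {2, 4}, {4, 1}, {1, 1}};

        std::size_t index{15}; // the number of input columns in the frame
        for (const auto& block : extraColumns) {
            // Round the start of each block up to its requested alignment,
            // as CAlignment::roundup does.
            index = (index + block.second - 1) / block.second * block.second;
            std::cout << block.first << " column(s) at offset " << index << '\n';
            index += block.first;
        }
        std::cout << "row resized to " << index << " columns\n";

        return 0;
    }

With two loss parameters this places the prediction at column 15, the gradient at 20, the curvature at 22 and the weight at 26, so a row takes 27 rather than 24 columns: a small padding cost in exchange for reading the gradient with aligned SIMD loads.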