Skip to content

Commit 6565f4f

Browse files
authored
[7.6][ML] Compute SHAP values for supervised learning (elastic#857) (elastic#888)
This PR introduces the computation of SHAP (SHapley Additive exPlanation) values for feature importance. Refer to "Consistent Individualized Feature Attribution for Tree Ensembles" by Lundberg et al. for details of the original algorithm.
1 parent 2b81bce commit 6565f4f

24 files changed

+1360
-14
lines changed

include/api/CDataFrameTrainBoostedTreeRunner.h

+13
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
#ifndef INCLUDED_ml_api_CDataFrameTrainBoostedTreeRunner_h
88
#define INCLUDED_ml_api_CDataFrameTrainBoostedTreeRunner_h
99

10+
#include <maths/CBasicStatistics.h>
11+
1012
#include <api/CDataFrameAnalysisRunner.h>
1113
#include <api/CDataFrameAnalysisSpecification.h>
1214
#include <api/ImportExport.h>
@@ -44,6 +46,7 @@ class API_EXPORT CDataFrameTrainBoostedTreeRunner : public CDataFrameAnalysisRun
4446
static const std::string NUMBER_FOLDS;
4547
static const std::string NUMBER_ROUNDS_PER_HYPERPARAMETER;
4648
static const std::string BAYESIAN_OPTIMISATION_RESTARTS;
49+
static const std::string TOP_SHAP_VALUES;
4750

4851
public:
4952
~CDataFrameTrainBoostedTreeRunner() override;
@@ -57,6 +60,8 @@ class API_EXPORT CDataFrameTrainBoostedTreeRunner : public CDataFrameAnalysisRun
5760
//! The boosted tree factory.
5861
const maths::CBoostedTreeFactory& boostedTreeFactory() const;
5962

63+
std::size_t topShapValues() const;
64+
6065
protected:
6166
using TBoostedTreeUPtr = std::unique_ptr<maths::CBoostedTree>;
6267
using TLossFunctionUPtr = std::unique_ptr<maths::boosted_tree::CLoss>;
@@ -76,6 +81,14 @@ class API_EXPORT CDataFrameTrainBoostedTreeRunner : public CDataFrameAnalysisRun
7681
//! The boosted tree factory.
7782
maths::CBoostedTreeFactory& boostedTreeFactory();
7883

84+
//! Factory for the largest SHAP value accumulator.
85+
template<typename LESS>
86+
maths::CBasicStatistics::COrderStatisticsHeap<std::size_t, LESS>
87+
makeLargestShapAccumulator(std::size_t n, LESS less) const {
88+
return maths::CBasicStatistics::COrderStatisticsHeap<std::size_t, LESS>{
89+
n, std::size_t{}, less};
90+
};
91+
7992
private:
8093
using TBoostedTreeFactoryUPtr = std::unique_ptr<maths::CBoostedTreeFactory>;
8194
using TDataSearcherUPtr = CDataFrameAnalysisSpecification::TDataSearcherUPtr;

include/maths/CBoostedTree.h

+14
Original file line numberDiff line numberDiff line change
@@ -300,6 +300,9 @@ class MATHS_EXPORT CBoostedTreeNode final {
300300
double curvature,
301301
TNodeVec& tree);
302302

303+
//! Get the feature index of the split.
304+
std::size_t splitFeature() const { return m_SplitFeature; };
305+
303306
//! Persist by passing information to \p inserter.
304307
void acceptPersistInserter(core::CStatePersistInserter& inserter) const;
305308

@@ -382,6 +385,11 @@ class MATHS_EXPORT CBoostedTree final : public CDataFrameRegressionModel {
382385
//! \warning This can only be called after train.
383386
void predict() const override;
384387

388+
//! Write SHAP values to the data frame supplied to the constructor.
389+
//!
390+
//! \warning This can only be called after train.
391+
void computeShapValues() override;
392+
385393
//! Get the feature weights the model has chosen.
386394
const TDoubleVec& featureWeights() const override;
387395

@@ -391,6 +399,12 @@ class MATHS_EXPORT CBoostedTree final : public CDataFrameRegressionModel {
391399
//! Get the column containing the model's prediction for the dependent variable.
392400
std::size_t columnHoldingPrediction(std::size_t numberColumns) const override;
393401

402+
//! Get the optional vector of column indices with SHAP values
403+
TSizeVec columnsHoldingShapValues() const override;
404+
405+
//! Get the number of largest SHAP values that will be returned for every row.
406+
std::size_t topShapValues() const override;
407+
394408
//! Get the model produced by training if it has been run.
395409
const TNodeVecVec& trainedModel() const;
396410

include/maths/CBoostedTreeFactory.h

+5
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,10 @@ class MATHS_EXPORT CBoostedTreeFactory final {
8787
CBoostedTreeFactory& bayesianOptimisationRestarts(std::size_t restarts);
8888
//! Set the number of training examples we need per feature we'll include.
8989
CBoostedTreeFactory& rowsPerFeature(std::size_t rowsPerFeature);
90+
91+
//! Set the number of largest SHAP values to compute for each row.
92+
CBoostedTreeFactory& topShapValues(std::size_t topShapValues);
93+
9094
//! Set whether to try and balance within class accuracy. For classification
9195
//! this reweights examples so approximately the same total loss is assigned
9296
//! to every class.
@@ -205,6 +209,7 @@ class MATHS_EXPORT CBoostedTreeFactory final {
205209
TProgressCallback m_RecordProgress = noopRecordProgress;
206210
TMemoryUsageCallback m_RecordMemoryUsage = noopRecordMemoryUsage;
207211
TTrainingStateCallback m_RecordTrainingState = noopRecordTrainingState;
212+
std::size_t m_TopShapValues = 0;
208213
};
209214
}
210215
}

include/maths/CBoostedTreeImpl.h

+21-4
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ class MATHS_EXPORT CBoostedTreeImpl final {
6060
using TTrainingStateCallback = CBoostedTree::TTrainingStateCallback;
6161
using TOptionalDouble = boost::optional<double>;
6262
using TRegularization = CBoostedTreeRegularization<double>;
63+
using TSizeVec = std::vector<std::size_t>;
6364

6465
public:
6566
static const double MINIMUM_RELATIVE_GAIN_PER_SPLIT;
@@ -83,6 +84,11 @@ class MATHS_EXPORT CBoostedTreeImpl final {
8384
//! \note Must be called only if a trained model is available.
8485
void predict(core::CDataFrame& frame, const TProgressCallback& /*recordProgress*/) const;
8586

87+
//! Compute SHAP values using the best trained model to \p frame.
88+
//!
89+
//! \note Must be called only if a trained model is available.
90+
void computeShapValues(core::CDataFrame& frame, const TProgressCallback&);
91+
8692
//! Get the feature sample probabilities.
8793
const TDoubleVec& featureWeights() const;
8894

@@ -132,12 +138,20 @@ class MATHS_EXPORT CBoostedTreeImpl final {
132138
//! \return The best hyperparameters for validation error found so far.
133139
const CBoostedTreeHyperparameters& bestHyperparameters() const;
134140

141+
//! Get the indices of the columns containing SHAP values.
142+
TSizeVec columnsHoldingShapValues() const;
143+
144+
//! Get the number of largest SHAP values that will be returned for every row.
145+
std::size_t topShapValues() const;
146+
147+
//! Get the number of input columns.
148+
std::size_t numberInputColumns() const;
149+
135150
private:
136151
using TSizeDoublePr = std::pair<std::size_t, double>;
137152
using TDoubleDoublePr = std::pair<double, double>;
138153
using TOptionalSize = boost::optional<std::size_t>;
139154
using TImmutableRadixSetVec = std::vector<core::CImmutableRadixSet<double>>;
140-
using TSizeVec = std::vector<std::size_t>;
141155
using TVector = CDenseVector<double>;
142156
using TRowItr = core::CDataFrame::TRowItr;
143157
using TPackedBitVectorVec = std::vector<core::CPackedBitVector>;
@@ -383,6 +397,7 @@ class MATHS_EXPORT CBoostedTreeImpl final {
383397
// The maximum number of rows encoded by a single byte in the packed bit
384398
// vector assuming best compression.
385399
static const std::size_t PACKED_BIT_VECTOR_MAXIMUM_ROWS_PER_BYTE;
400+
static const double INF;
386401

387402
private:
388403
CBoostedTreeImpl();
@@ -492,9 +507,6 @@ class MATHS_EXPORT CBoostedTreeImpl final {
492507
//! Record the training state using the \p recordTrainState callback function
493508
void recordState(const TTrainingStateCallback& recordTrainState) const;
494509

495-
private:
496-
static const double INF;
497-
498510
private:
499511
mutable CPRNG::CXorOShiro128Plus m_Rng;
500512
std::size_t m_NumberThreads;
@@ -529,7 +541,12 @@ class MATHS_EXPORT CBoostedTreeImpl final {
529541
std::size_t m_NumberRounds = 1;
530542
std::size_t m_CurrentRound = 0;
531543
core::CLoopProgress m_TrainingProgress;
544+
std::size_t m_TopShapValues = 0;
545+
std::size_t m_FirstShapColumnIndex = 0;
546+
std::size_t m_LastShapColumnIndex = 0;
547+
std::size_t m_NumberInputColumns = 0;
532548

549+
private:
533550
friend class CBoostedTreeFactory;
534551
};
535552

include/maths/CDataFrameCategoryEncoder.h

+5-2
Original file line numberDiff line numberDiff line change
@@ -211,7 +211,8 @@ class MATHS_EXPORT CDataFrameCategoryEncoder final {
211211
};
212212

213213
public:
214-
CDataFrameCategoryEncoder(CMakeDataFrameCategoryEncoder parameters);
214+
CDataFrameCategoryEncoder(CMakeDataFrameCategoryEncoder& builder);
215+
CDataFrameCategoryEncoder(CMakeDataFrameCategoryEncoder&& builder);
215216

216217
//! Initialize from serialized data.
217218
CDataFrameCategoryEncoder(core::CStateRestoreTraverser& traverser);
@@ -288,6 +289,8 @@ class MATHS_EXPORT CMakeDataFrameCategoryEncoder {
288289
const core::CDataFrame& frame,
289290
std::size_t targetColumn);
290291

292+
virtual ~CMakeDataFrameCategoryEncoder() = default;
293+
291294
//! Set the minimum number of training rows needed per feature used.
292295
CMakeDataFrameCategoryEncoder& minimumRowsPerFeature(std::size_t minimumRowsPerFeature);
293296

@@ -313,7 +316,7 @@ class MATHS_EXPORT CMakeDataFrameCategoryEncoder {
313316
CMakeDataFrameCategoryEncoder& columnMask(TSizeVec columnMask);
314317

315318
//! Make the encoding.
316-
TEncodingUPtrVec makeEncodings();
319+
virtual TEncodingUPtrVec makeEncodings();
317320

318321
//! \name Test Methods
319322
//@{

include/maths/CDataFrameRegressionModel.h

+17
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@
1111

1212
#include <maths/ImportExport.h>
1313

14+
#include <boost/optional.hpp>
15+
1416
#include <functional>
1517
#include <utility>
1618
#include <vector>
@@ -27,6 +29,7 @@ namespace maths {
2729
class MATHS_EXPORT CDataFrameRegressionModel {
2830
public:
2931
using TDoubleVec = std::vector<double>;
32+
using TSizeVec = std::vector<std::size_t>;
3033
using TProgressCallback = std::function<void(double)>;
3134
using TMemoryUsageCallback = std::function<void(std::uint64_t)>;
3235
using TPersistFunc = std::function<void(core::CStatePersistInserter&)>;
@@ -44,6 +47,11 @@ class MATHS_EXPORT CDataFrameRegressionModel {
4447
//! \warning This can only be called after train.
4548
virtual void predict() const = 0;
4649

50+
//! Write SHAP values to the data frame supplied to the constructor.
51+
//!
52+
//! \warning This can only be called after train.
53+
virtual void computeShapValues() = 0;
54+
4755
//! Get the feature weights the model has chosen.
4856
virtual const TDoubleVec& featureWeights() const = 0;
4957

@@ -53,6 +61,15 @@ class MATHS_EXPORT CDataFrameRegressionModel {
5361
//! Get the column containing the model's prediction for the dependent variable.
5462
virtual std::size_t columnHoldingPrediction(std::size_t numberColumns) const = 0;
5563

64+
//! Get the number of largest SHAP values that will be returned for every row.
65+
virtual std::size_t topShapValues() const = 0;
66+
67+
//! Get the optional vector of column indices with SHAP values
68+
virtual TSizeVec columnsHoldingShapValues() const = 0;
69+
70+
public:
71+
static const std::string SHAP_PREFIX;
72+
5673
protected:
5774
CDataFrameRegressionModel(core::CDataFrame& frame,
5875
TProgressCallback recordProgress,
+149
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the Elastic License;
4+
* you may not use this file except in compliance with the Elastic License.
5+
*/
6+
7+
#ifndef INCLUDED_ml_maths_CTreeShapFeatureImportance_h
8+
#define INCLUDED_ml_maths_CTreeShapFeatureImportance_h
9+
10+
#include <maths/CBoostedTree.h>
11+
#include <maths/ImportExport.h>
12+
13+
#include <vector>
14+
15+
namespace ml {
16+
namespace maths {
17+
18+
//! \brief Computes SHAP (SHapley Additive exPlanation) values for feature importance estimation for gradient boosting
19+
//! trees.
20+
//!
21+
//! DESCRIPTION:\n
22+
//! SHAP values is a unique consistent and locally accurate attribution value. This mean that the sum of the SHAP
23+
//! feature importance values approximates the model prediction up to a constant bias. This implementation follows the
24+
//! algorithm "Consistent Individualized Feature Attribution for Tree Ensembles" by Lundberg, Erion, and Lee.
25+
//! The algorithm has the complexity O(TLD^2) where T is the number of trees, L is the maximum number of leaves in the
26+
//! tree, and D is the maximum depth of a tree in the ensemble.
27+
class MATHS_EXPORT CTreeShapFeatureImportance {
28+
public:
29+
using TTree = std::vector<CBoostedTreeNode>;
30+
using TTreeVec = std::vector<TTree>;
31+
using TIntVec = std::vector<int>;
32+
using TDoubleVec = std::vector<double>;
33+
using TDoubleVecVec = std::vector<TDoubleVec>;
34+
35+
public:
36+
explicit CTreeShapFeatureImportance(TTreeVec trees, std::size_t threads = 1);
37+
38+
//! Compute SHAP values for the data in \p frame using the specified \p encoder.
39+
//! The results are written directly back into the \p frame, the index of the first result column is controller
40+
//! by \p offset.
41+
void shap(core::CDataFrame& frame, const CDataFrameCategoryEncoder& encoder, std::size_t offset);
42+
43+
//! Compute number of training samples from \p frame that pass every node in the \p tree.
44+
static TDoubleVec samplesPerNode(const TTree& tree,
45+
const core::CDataFrame& frame,
46+
const CDataFrameCategoryEncoder& encoder,
47+
std::size_t numThreads);
48+
49+
//! Recursively computes inner node values as weighted average of the children (leaf) values
50+
//! \returns The maximum depth the the tree.
51+
static std::size_t updateNodeValues(TTree& tree,
52+
std::size_t nodeIndex,
53+
const TDoubleVec& samplesPerNode,
54+
std::size_t depth);
55+
56+
//! Get the reference to the trees.
57+
TTreeVec& trees() { return m_Trees; };
58+
59+
private:
60+
using TSizeVec = std::vector<std::size_t>;
61+
62+
//! Manages variables for the current path through the tree as the main algorithm proceeds.
63+
struct SPath {
64+
explicit SPath(std::size_t length)
65+
: s_FractionOnes(length), s_FractionZeros(length),
66+
s_FeatureIndex(length, -1), s_Scale(length), s_NextIndex(0),
67+
s_MaxLength(length) {}
68+
69+
void extend(int featureIndex, double fractionZero, double fractionOne) {
70+
if (s_NextIndex < s_MaxLength) {
71+
s_FeatureIndex[s_NextIndex] = featureIndex;
72+
s_FractionZeros[s_NextIndex] = fractionZero;
73+
s_FractionOnes[s_NextIndex] = fractionOne;
74+
if (s_NextIndex == 0) {
75+
s_Scale[s_NextIndex] = 1.0;
76+
} else {
77+
s_Scale[s_NextIndex] = 0.0;
78+
}
79+
++s_NextIndex;
80+
}
81+
}
82+
83+
void reduce(std::size_t pathIndex) {
84+
for (std::size_t i = pathIndex; i < this->depth(); ++i) {
85+
s_FeatureIndex[i] = s_FeatureIndex[i + 1];
86+
s_FractionZeros[i] = s_FractionZeros[i + 1];
87+
s_FractionOnes[i] = s_FractionOnes[i + 1];
88+
}
89+
--s_NextIndex;
90+
}
91+
92+
//! Indicator whether or not the feature \p pathIndex is decicive for the path.
93+
double fractionOnes(std::size_t pathIndex) const {
94+
return s_FractionOnes[pathIndex];
95+
}
96+
97+
//! Fraction of all training data that reached the \pathIndex in the path.
98+
double fractionZeros(std::size_t pathIndex) const {
99+
return s_FractionZeros[pathIndex];
100+
}
101+
102+
int featureIndex(std::size_t pathIndex) const {
103+
return s_FeatureIndex[pathIndex];
104+
}
105+
106+
//! Scaling coefficients (factorials), see. Equation (2) in the paper by Lundberg et al.
107+
double scale(std::size_t pathIndex) const { return s_Scale[pathIndex]; }
108+
109+
//! Current depth in the tree
110+
std::size_t depth() const { return s_NextIndex - 1; };
111+
112+
TDoubleVec s_FractionOnes;
113+
TDoubleVec s_FractionZeros;
114+
TIntVec s_FeatureIndex;
115+
TDoubleVec s_Scale;
116+
std::size_t s_NextIndex;
117+
std::size_t s_MaxLength;
118+
};
119+
120+
private:
121+
//! Recursively traverses all pathes in the \p tree and updated SHAP values once it hits a leaf.
122+
//! Ref. Algorithm 2 in the paper by Lundberg et al.
123+
void shapRecursive(const TTree& tree,
124+
const TDoubleVec& samplesPerNode,
125+
const CDataFrameCategoryEncoder& encoder,
126+
const CEncodedDataFrameRowRef& encodedRow,
127+
SPath splitPath,
128+
std::size_t nodeIndex,
129+
double parentFractionZero,
130+
double parentFractionOne,
131+
int parentFeatureIndex,
132+
std::size_t offset,
133+
core::CDataFrame::TRowItr& row) const;
134+
//! Extend the \p path object, update the variables and factorial scaling coefficients.
135+
static void extendPath(SPath& path, double fractionZero, double fractionOne, int featureIndex);
136+
//! Sum the scaling coefficients for the \p path without the feature defined in \p pathIndex.
137+
static double sumUnwoundPath(const SPath& path, std::size_t pathIndex);
138+
//! Updated the scaling coefficients in the \p path if the feature defined in \p pathIndex was seen again.
139+
static void unwindPath(SPath& path, std::size_t pathIndex);
140+
141+
private:
142+
TTreeVec m_Trees;
143+
std::size_t m_NumberThreads;
144+
TDoubleVecVec m_SamplesPerNode;
145+
};
146+
}
147+
}
148+
149+
#endif // INCLUDED_ml_maths_CTreeShapFeatureImportance_h

include/test/CDataFrameAnalysisSpecificationFactory.h

+1
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ class TEST_EXPORT CDataFrameAnalysisSpecificationFactory {
5757
double eta = -1.0,
5858
std::size_t maximumNumberTrees = 0,
5959
double featureBagFraction = -1.0,
60+
size_t topShapValues = 0,
6061
TPersisterSupplier* persisterSupplier = nullptr,
6162
TRestoreSearcherSupplier* restoreSearcherSupplier = nullptr);
6263
};

0 commit comments

Comments
 (0)