diff --git a/modules/optflow/CMakeLists.txt b/modules/optflow/CMakeLists.txt
index 39a37f143dd..e4232749ea2 100644
--- a/modules/optflow/CMakeLists.txt
+++ b/modules/optflow/CMakeLists.txt
@@ -1,2 +1,2 @@
 set(the_description "Optical Flow Algorithms")
-ocv_define_module(optflow opencv_core opencv_imgproc opencv_video opencv_ximgproc opencv_imgcodecs opencv_flann WRAP python)
+ocv_define_module(optflow opencv_core opencv_imgproc opencv_calib3d opencv_video opencv_ximgproc opencv_imgcodecs opencv_flann WRAP python)
diff --git a/modules/optflow/doc/optflow.bib b/modules/optflow/doc/optflow.bib
index c36504b3e3c..5b4869564bd 100644
--- a/modules/optflow/doc/optflow.bib
+++ b/modules/optflow/doc/optflow.bib
@@ -61,3 +61,52 @@ @inproceedings{Wang_2016_CVPR
   month = {June},
   year = {2016}
 }
+
+@inproceedings{Geistert2016,
+  author = {Jonas Geistert and Tobias Senst and Thomas Sikora},
+  title = {Robust Local Optical Flow: Dense Motion Vector Field Interpolation},
+  booktitle = {Picture Coding Symposium},
+  year = {2016},
+  pages = {1--5},
+}
+
+@inproceedings{Senst2016,
+  author = {Tobias Senst and Jonas Geistert and Thomas Sikora},
+  title = {Robust local optical flow: Long-range motions and varying illuminations},
+  booktitle = {IEEE International Conference on Image Processing},
+  year = {2016},
+  pages = {4478--4482},
+}
+
+@inproceedings{Senst2014,
+  author = {Tobias Senst and Thilo Borgmann and Ivo Keller and Thomas Sikora},
+  title = {Cross based Robust Local Optical Flow},
+  booktitle = {21st IEEE International Conference on Image Processing},
+  year = {2014},
+  pages = {1967--1971},
+}
+
+@inproceedings{Senst2013,
+  author = {Tobias Senst and Jonas Geistert and Ivo Keller and Thomas Sikora},
+  title = {Robust Local Optical Flow Estimation using Bilinear Equations for Sparse Motion Estimation},
+  booktitle = {20th IEEE International Conference on Image Processing},
+  year = {2013},
+  pages = {2499--2503},
+}
+
+@article{Senst2012,
+  author = {Tobias Senst and Volker Eiselein and Thomas Sikora},
+  title = {Robust Local Optical Flow for Feature Tracking},
+  journal = {IEEE Transactions on Circuits and Systems for Video Technology},
+  year = {2012},
+  volume = {22},
+  number = {9},
+  pages = {1377--1387},
+}
+
+@article{Tibshirani2008,
+  author = {Tibshirani, Ryan J.},
+  title = {Fast computation of the median by successive binning},
+  journal = {arXiv preprint arXiv:0806.3301},
+  year = {2008}
+}
diff --git a/modules/optflow/include/opencv2/optflow.hpp b/modules/optflow/include/opencv2/optflow.hpp
index 65802750d1a..093b5fe2c7f 100644
--- a/modules/optflow/include/opencv2/optflow.hpp
+++ b/modules/optflow/include/opencv2/optflow.hpp
@@ -68,7 +68,7 @@ Functions reading and writing .flo files in "Middlebury" format, see: <http://vision.middlebury.edu/flow/code/flow-code/README.txt>
+    float normSigma0;
+    /**< &sigma parameter of the shrunken Hampel norm introduced in @cite Senst2012. If
+     * &sigma = std::numeric_limits<float>::max() the least-squares estimator will be used
+     * instead of the M-estimator. Although the M-estimator is more robust against outliers in the
+     * support region, the least-squares estimator is faster to compute.
+     */
+    float normSigma1;
+    /**< &sigma parameter of the shrunken Hampel norm introduced in @cite Senst2012. If
+     * &sigma = std::numeric_limits<float>::max() the least-squares estimator will be used
+     * instead of the M-estimator. Although the M-estimator is more robust against outliers in the
+     * support region, the least-squares estimator is faster to compute.
+     */
+    int smallWinSize;
+    /**< Minimal window size of the support region. This parameter is only used if supportRegionType is SR_CROSS.
+     */
+    int largeWinSize;
+    /**< Maximal window size of the support region.
+     * If supportRegionType is SR_FIXED this gives the exact support region size.
+     * The speed of the RLOF is related to the applied window sizes: the smaller the window size,
+     * the lower the runtime, but the more sensitive to noise the method becomes.
+     */
+    int crossSegmentationThreshold;
+    /**< Color similarity threshold used by the cross-based segmentation following @cite Senst2014 .
+     * (Only used if supportRegionType is SR_CROSS.) With the cross-based segmentation,
+     * motion boundaries can be computed more accurately.
+     */
+    int maxLevel;
+    /**< Maximal number of pyramid levels used. The larger this value is, the more likely it is
+     * to obtain accurate solutions for long-range motions. The runtime is linearly related to
+     * this parameter.
+     */
+    bool useInitialFlow;
+    /**< Use the next point list as initial values. A good initialization can improve the algorithm's
+     * accuracy and reduce the runtime by a faster convergence of the iterative refinement.
+     */
+    bool useIlluminationModel;
+    /**< Use the Gennert and Negahdaripour illumination model instead of the intensity brightness
+     * constraint (proposed in @cite Senst2016 ). This model is defined as follows:
+     * \f[ I(\mathbf{x},t) + m \cdot I(\mathbf{x},t) + c = I(\mathbf{x},t+1) \f]
+     * and contains, with m and c, a multiplicative and an additive term which make the estimate
+     * more robust against illumination changes. The computational complexity is increased by
+     * enabling the illumination model.
+     */
+    bool useGlobalMotionPrior;
+    /**< Use the global motion prior initialisation introduced in @cite Senst2016 . It
+     * allows the method to be more accurate for long-range motion. The computational complexity is
+     * slightly increased by enabling the global motion prior initialisation.
+     */
+    int maxIteration;
+    /**< Maximal number of iterations used for the iterative refinement. Lower values can
+     * reduce the runtime but also the accuracy.
+     */
+    float minEigenValue;
+    /**< Threshold for the minimal eigenvalue of the gradient matrix, defining when to abort the
+     * iterative refinement.
+     */
+    float globalMotionRansacThreshold;
+    /**< To apply the global motion prior, motion vectors are computed on a regularly sampled grid,
+     * which forms the basis for homography estimation using RANSAC. The reprojection threshold is
+     * based on the n-th percentile (given by this value [0 ... 100]) of the motion vector magnitudes.
+     * See @cite Senst2016 for more details.
+     */
+
+    CV_WRAP void setSolverType(SolverType val);
+    CV_WRAP SolverType getSolverType() const;
+
+    CV_WRAP void setSupportRegionType(SupportRegionType val);
+    CV_WRAP SupportRegionType getSupportRegionType() const;
+
+    CV_WRAP void setNormSigma0(float val);
+    CV_WRAP float getNormSigma0() const;
+
+    CV_WRAP void setNormSigma1(float val);
+    CV_WRAP float getNormSigma1() const;
+
+    CV_WRAP void setSmallWinSize(int val);
+    CV_WRAP int getSmallWinSize() const;
+
+    CV_WRAP void setLargeWinSize(int val);
+    CV_WRAP int getLargeWinSize() const;
+
+    CV_WRAP void setCrossSegmentationThreshold(int val);
+    CV_WRAP int getCrossSegmentationThreshold() const;
+
+    CV_WRAP void setMaxLevel(int val);
+    CV_WRAP int getMaxLevel() const;
+
+    CV_WRAP void setUseInitialFlow(bool val);
+    CV_WRAP bool getUseInitialFlow() const;
+
+    CV_WRAP void setUseIlluminationModel(bool val);
+    CV_WRAP bool getUseIlluminationModel() const;
+
+    CV_WRAP void setUseGlobalMotionPrior(bool val);
+    CV_WRAP bool getUseGlobalMotionPrior() const;
+
+    CV_WRAP void setMaxIteration(int val);
+    CV_WRAP int getMaxIteration() const;
+
+    CV_WRAP void setMinEigenValue(float val);
+    CV_WRAP float getMinEigenValue() const;
+
+    CV_WRAP void setGlobalMotionRansacThreshold(float val);
+    CV_WRAP float getGlobalMotionRansacThreshold() const;
+
+    //! @brief Creates an instance of optflow::RLOFOpticalFlowParameter
+    CV_WRAP static Ptr<RLOFOpticalFlowParameter> create();
+};
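For orientation, a minimal usage sketch of the parameter block declared above; it is not part of the patch, and `makeRlofParam` plus the chosen values are purely illustrative:

```cpp
#include <opencv2/optflow.hpp>

using namespace cv;
using namespace cv::optflow;

// Sketch: configure the RLOF parameters documented above.
Ptr<RLOFOpticalFlowParameter> makeRlofParam()  // hypothetical helper
{
    Ptr<RLOFOpticalFlowParameter> param = RLOFOpticalFlowParameter::create();
    param->setSupportRegionType(SR_CROSS);  // adaptive cross-based support region (needs 3-channel input)
    param->setSolverType(ST_BILINEAR);      // bilinear equation solver (see Senst2013)
    param->setLargeWinSize(21);             // larger window: more robust to noise, but slower
    param->setUseIlluminationModel(true);   // Gennert/Negahdaripour model (see Senst2016)
    param->setMaxLevel(4);                  // more pyramid levels help long-range motion
    return param;
}
```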
+
+/** @brief Fast dense optical flow computation based on robust local optical flow (RLOF) algorithms and a sparse-to-dense
+ * interpolation scheme.
+ *
+ * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as
+ * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK().
+ *
+ * The sparse-to-dense interpolation scheme allows for fast computation of dense optical flow using RLOF (see @cite Geistert2016).
+ * For this scheme the following steps are applied:
+ * -# motion vectors seeded at a regularly sampled grid are computed. The sparsity of this grid can be configured with setGridStep
+ * -# (optionally) erroneous motion vectors are filtered based on the forward backward confidence. The threshold can be configured
+ * with setForwardBackward. The filter is only applied if the threshold > 0, but then the runtime is doubled due to the estimation
+ * of the backward flow.
+ * -# vector field interpolation is applied to the motion vector set to obtain a dense vector field.
+ *
+ * For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details.
+ * Parameters have been described in @cite Senst2012 @cite Senst2013 @cite Senst2014 and @cite Senst2016.
+ *
+ * @note SIMD parallelization is only available when compiling with SSE4.1. If the grid size is set to (1,1) and the
+ * forward backward threshold <= 0, the dense optical flow field is computed purely with the RLOF.
+ *
+ * @see optflow::calcOpticalFlowDenseRLOF(), optflow::RLOFOpticalFlowParameter
+*/
+class CV_EXPORTS_W DenseRLOFOpticalFlow : public DenseOpticalFlow
+{
+public:
+    //! @brief Configuration of the RLOF algorithm.
+    /**
+    @see optflow::RLOFOpticalFlowParameter, getRLOFOpticalFlowParameter
+    */
+    CV_WRAP virtual void setRLOFOpticalFlowParameter(Ptr<RLOFOpticalFlowParameter> val) = 0;
+    /** @copybrief setRLOFOpticalFlowParameter
+    @see optflow::RLOFOpticalFlowParameter, setRLOFOpticalFlowParameter
+    */
+    CV_WRAP virtual Ptr<RLOFOpticalFlowParameter> getRLOFOpticalFlowParameter() const = 0;
+    //! @brief Threshold for the forward backward confidence check
+    /** For each grid point \f$ \mathbf{x} \f$ a motion vector \f$ d_{I0,I1}(\mathbf{x}) \f$ is computed.
+     * If the forward backward error \f[ EP_{FB} = || d_{I0,I1} + d_{I1,I0} || \f]
+     * is larger than the threshold given by this function, then the motion vector will not be used by the following
+     * vector field interpolation. \f$ d_{I1,I0} \f$ denotes the backward flow. Note that the forward backward test
+     * will only be applied if the threshold > 0. This may result in a doubled runtime for the motion estimation.
+     * @see getForwardBackward, setGridStep
+     */
+    CV_WRAP virtual void setForwardBackward(float val) = 0;
+    /** @copybrief setForwardBackward
+    @see setForwardBackward
+    */
+    CV_WRAP virtual float getForwardBackward() const = 0;
+    //! @brief Size of the grid to spawn the motion vectors.
+    /** For each grid point a motion vector is computed. Some motion vectors will be removed due to the forward backward
+     * threshold (if set > 0). The remaining ones are the base of the vector field interpolation.
+     * @see getForwardBackward, setGridStep
+     */
+    CV_WRAP virtual Size getGridStep() const = 0;
+    /** @copybrief getGridStep
+     * @see getGridStep
+     */
+    CV_WRAP virtual void setGridStep(Size val) = 0;
+
+    //! @brief Interpolation used to compute the dense optical flow.
+    /** Two interpolation algorithms are supported:
+     * - **INTERP_GEO** applies the fast geodesic interpolation, see @cite Geistert2016.
+     * - **INTERP_EPIC_RESIDUAL** applies the edge-preserving interpolation, see @cite Revaud2015,Geistert2016.
+     * @see ximgproc::EdgeAwareInterpolator, getInterpolation
+     */
+    CV_WRAP virtual void setInterpolation(InterpolationType val) = 0;
+    /** @copybrief setInterpolation
+     * @see ximgproc::EdgeAwareInterpolator, setInterpolation
+     */
+    CV_WRAP virtual InterpolationType getInterpolation() const = 0;
+    //! @brief see ximgproc::EdgeAwareInterpolator() K value.
+    /** K is the number of nearest-neighbor matches considered when fitting a locally affine
+     * model. Usually it should be around 128. However, lower values would make the interpolation noticeably faster.
+     * @see ximgproc::EdgeAwareInterpolator, setEPICK
+     */
+    CV_WRAP virtual int getEPICK() const = 0;
+    /** @copybrief getEPICK
+     * @see ximgproc::EdgeAwareInterpolator, getEPICK
+     */
+    CV_WRAP virtual void setEPICK(int val) = 0;
+    //! @brief see ximgproc::EdgeAwareInterpolator() sigma value.
+    /** Sigma is a parameter defining how fast the weights decrease in the locally-weighted affine
+     * fitting. Higher values can help preserve fine details, lower values can help to get rid of noise in the
+     * output flow.
+     * @see ximgproc::EdgeAwareInterpolator, setEPICSigma
+     */
+    CV_WRAP virtual float getEPICSigma() const = 0;
+    /** @copybrief getEPICSigma
+     * @see ximgproc::EdgeAwareInterpolator, getEPICSigma
+     */
+    CV_WRAP virtual void setEPICSigma(float val) = 0;
+    //! @brief see ximgproc::EdgeAwareInterpolator() lambda value.
+    /** Lambda is a parameter defining the weight of the edge-aware term in geodesic distance;
+     * it should be in the range of 0 to 1000.
+     * @see ximgproc::EdgeAwareInterpolator, setEPICLambda
+     */
+    CV_WRAP virtual float getEPICLambda() const = 0;
+    /** @copybrief getEPICLambda
+     * @see ximgproc::EdgeAwareInterpolator, getEPICLambda
+     */
+    CV_WRAP virtual void setEPICLambda(float val) = 0;
+    //! @brief see ximgproc::EdgeAwareInterpolator().
+    /** Sets the respective fastGlobalSmootherFilter() parameter.
+     * @see ximgproc::EdgeAwareInterpolator, setFgsLambda
+     */
+    CV_WRAP virtual float getFgsLambda() const = 0;
+    /** @copybrief getFgsLambda
+     * @see ximgproc::EdgeAwareInterpolator, ximgproc::fastGlobalSmootherFilter, getFgsLambda
+     */
+    CV_WRAP virtual void setFgsLambda(float val) = 0;
+    //! @brief see ximgproc::EdgeAwareInterpolator().
+    /** Sets the respective fastGlobalSmootherFilter() parameter.
+     * @see ximgproc::EdgeAwareInterpolator, ximgproc::fastGlobalSmootherFilter, setFgsSigma
+     */
+    CV_WRAP virtual float getFgsSigma() const = 0;
+    /** @copybrief getFgsSigma
+     * @see ximgproc::EdgeAwareInterpolator, ximgproc::fastGlobalSmootherFilter, getFgsSigma
+     */
+    CV_WRAP virtual void setFgsSigma(float val) = 0;
+    //! @brief enables ximgproc::fastGlobalSmootherFilter
+    /**
+     * @see getUsePostProc
+     */
+    CV_WRAP virtual void setUsePostProc(bool val) = 0;
+    /** @copybrief setUsePostProc
+     * @see ximgproc::fastGlobalSmootherFilter, setUsePostProc
+     */
+    CV_WRAP virtual bool getUsePostProc() const = 0;
+    //! @brief Creates an instance of optflow::DenseRLOFOpticalFlow
+    /**
+     * @param rlofParam see optflow::RLOFOpticalFlowParameter
+     * @param forwardBackwardThreshold see setForwardBackward
+     * @param gridStep see setGridStep
+     * @param interp_type see setInterpolation
+     * @param epicK see setEPICK
+     * @param epicSigma see setEPICSigma
+     * @param epicLambda see setEPICLambda
+     * @param use_post_proc see setUsePostProc
+     * @param fgsLambda see setFgsLambda
+     * @param fgsSigma see setFgsSigma
+     */
+    CV_WRAP static Ptr<DenseRLOFOpticalFlow> create(
+        Ptr<RLOFOpticalFlowParameter> rlofParam = Ptr<RLOFOpticalFlowParameter>(),
+        float forwardBackwardThreshold = 1.f,
+        Size gridStep = Size(6, 6),
+        InterpolationType interp_type = InterpolationType::INTERP_EPIC,
+        int epicK = 128,
+        float epicSigma = 0.05f,
+        float epicLambda = 999.0f,
+        bool use_post_proc = true,
+        float fgsLambda = 500.0f,
+        float fgsSigma = 1.5f);
+};
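A hedged usage sketch of the class interface just declared; `denseRlofDemo` and the file names are illustrative, not part of the patch:

```cpp
#include <opencv2/imgcodecs.hpp>
#include <opencv2/optflow.hpp>

using namespace cv;
using namespace cv::optflow;

void denseRlofDemo()  // hypothetical
{
    Mat I0 = imread("frame0.png");  // 8-bit, 3-channel (required by the default SR_CROSS)
    Mat I1 = imread("frame1.png");
    Mat flow;                       // output: CV_32FC2, same size as I0

    Ptr<DenseRLOFOpticalFlow> algo = DenseRLOFOpticalFlow::create();
    algo->setGridStep(Size(6, 6));        // seed a motion vector at every 6th pixel
    algo->setForwardBackward(1.f);        // > 0 enables the FB check (doubles the runtime)
    algo->setInterpolation(INTERP_EPIC);  // edge-preserving sparse-to-dense interpolation
    algo->calc(I0, I1, flow);
}
```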
+
+/** @brief Class used for calculating a sparse optical flow and feature tracking with robust local optical flow (RLOF) algorithms.
+*
+* The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as
+* proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK().
+*
+* For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details.
+* Parameters have been described in @cite Senst2012, @cite Senst2013, @cite Senst2014 and @cite Senst2016.
+*
+* @note SIMD parallelization is only available when compiling with SSE4.1.
+* @see optflow::calcOpticalFlowSparseRLOF(), optflow::RLOFOpticalFlowParameter
+*/
+class CV_EXPORTS_W SparseRLOFOpticalFlow : public SparseOpticalFlow
+{
+public:
+    /** @copydoc DenseRLOFOpticalFlow::setRLOFOpticalFlowParameter
+     */
+    CV_WRAP virtual void setRLOFOpticalFlowParameter(Ptr<RLOFOpticalFlowParameter> val) = 0;
+    /** @copybrief setRLOFOpticalFlowParameter
+     * @see setRLOFOpticalFlowParameter
+     */
+    CV_WRAP virtual Ptr<RLOFOpticalFlowParameter> getRLOFOpticalFlowParameter() const = 0;
+    //! @brief Threshold for the forward backward confidence check
+    /** For each feature point a motion vector \f$ d_{I0,I1}(\mathbf{x}) \f$ is computed.
+     * If the forward backward error \f[ EP_{FB} = || d_{I0,I1} + d_{I1,I0} || \f]
+     * is larger than the threshold given by this function, then the corresponding feature is marked as not
+     * successfully tracked. \f$ d_{I1,I0} \f$ denotes the backward flow. Note that the forward backward test
+     * will only be applied if the threshold > 0. This may result in a doubled runtime for the motion estimation.
+     * @see setForwardBackward
+     */
+    CV_WRAP virtual void setForwardBackward(float val) = 0;
+    /** @copybrief setForwardBackward
+     * @see setForwardBackward
+     */
+    CV_WRAP virtual float getForwardBackward() const = 0;
+
+    //! @brief Creates an instance of SparseRLOFOpticalFlow
+    /**
+     * @param rlofParam see setRLOFOpticalFlowParameter
+     * @param forwardBackwardThreshold see setForwardBackward
+     */
+    CV_WRAP static Ptr<SparseRLOFOpticalFlow> create(
+        Ptr<RLOFOpticalFlowParameter> rlofParam = Ptr<RLOFOpticalFlowParameter>(),
+        float forwardBackwardThreshold = 1.f);
+
+};
+
+/** @brief Fast dense optical flow computation based on robust local optical flow (RLOF) algorithms and a sparse-to-dense interpolation scheme.
+ *
+ * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as
+ * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK().
+ *
+ * The sparse-to-dense interpolation scheme allows for fast computation of dense optical flow using RLOF (see @cite Geistert2016).
+ * For this scheme the following steps are applied:
+ * -# motion vectors seeded at a regularly sampled grid are computed. The sparsity of this grid can be configured with setGridStep
+ * -# (optionally) erroneous motion vectors are filtered based on the forward backward confidence. The threshold can be configured
+ * with setForwardBackward. The filter is only applied if the threshold > 0, but then the runtime is doubled due to the estimation
+ * of the backward flow.
+ * -# vector field interpolation is applied to the motion vector set to obtain a dense vector field.
+ *
+ * @param I0 first 8-bit input image. If the cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType
+ * = SupportRegionType::SR_CROSS) the image has to be an 8-bit 3-channel image.
+ * @param I1 second 8-bit input image. If the cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType
+ * = SupportRegionType::SR_CROSS) the image has to be an 8-bit 3-channel image.
+ * @param flow computed flow image that has the same size as I0 and type CV_32FC2.
+ * @param rlofParam see optflow::RLOFOpticalFlowParameter
+ * @param forwardBackwardThreshold Threshold for the forward backward confidence check.
+ * For each grid point \f$ \mathbf{x} \f$ a motion vector \f$ d_{I0,I1}(\mathbf{x}) \f$ is computed.
+ * If the forward backward error \f[ EP_{FB} = || d_{I0,I1} + d_{I1,I0} || \f]
+ * is larger than the threshold given by this function, then the motion vector will not be used by the following
+ * vector field interpolation. \f$ d_{I1,I0} \f$ denotes the backward flow. Note that the forward backward test
+ * will only be applied if the threshold > 0. This may result in a doubled runtime for the motion estimation.
+ * @param gridStep Size of the grid to spawn the motion vectors. For each grid point a motion vector is computed.
+ * Some motion vectors will be removed due to the forward backward threshold (if set > 0). The remaining ones are
+ * the base of the vector field interpolation.
+ * @param interp_type interpolation method used to compute the dense optical flow. Two interpolation algorithms are
+ * supported:
+ * - **INTERP_GEO** applies the fast geodesic interpolation, see @cite Geistert2016.
+ * - **INTERP_EPIC_RESIDUAL** applies the edge-preserving interpolation, see @cite Revaud2015,Geistert2016.
+ * @param epicK see ximgproc::EdgeAwareInterpolator(); sets the respective parameter.
+ * @param epicSigma see ximgproc::EdgeAwareInterpolator(); sets the respective parameter.
+ * @param epicLambda see ximgproc::EdgeAwareInterpolator(); sets the respective parameter.
+ * @param use_post_proc enables ximgproc::fastGlobalSmootherFilter() post-processing.
+ * @param fgsLambda sets the respective ximgproc::fastGlobalSmootherFilter() parameter.
+ * @param fgsSigma sets the respective ximgproc::fastGlobalSmootherFilter() parameter.
+ *
+ * Parameters have been described in @cite Senst2012, @cite Senst2013, @cite Senst2014, @cite Senst2016.
+ * For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details.
+ * @note If the grid size is set to (1,1) and the forward backward threshold <= 0, the dense optical flow field is
+ * computed purely with the RLOF.
+ *
+ * @note SIMD parallelization is only available when compiling with SSE4.1.
+ *
+ * @sa optflow::DenseRLOFOpticalFlow, optflow::RLOFOpticalFlowParameter
+*/
+CV_EXPORTS_W void calcOpticalFlowDenseRLOF(InputArray I0, InputArray I1, InputOutputArray flow,
+    Ptr<RLOFOpticalFlowParameter> rlofParam = Ptr<RLOFOpticalFlowParameter>(),
+    float forwardBackwardThreshold = 0, Size gridStep = Size(6, 6),
+    InterpolationType interp_type = InterpolationType::INTERP_EPIC,
+    int epicK = 128, float epicSigma = 0.05f, float epicLambda = 100.f,
+    bool use_post_proc = true, float fgsLambda = 500.0f, float fgsSigma = 1.5f);
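The same dense computation through the convenience function just declared, continuing the earlier sketch (`I0`/`I1` as before):

```cpp
// Sketch: one-call dense RLOF; trailing arguments keep their defaults.
Mat flow;
Ptr<RLOFOpticalFlowParameter> param = RLOFOpticalFlowParameter::create();
calcOpticalFlowDenseRLOF(I0, I1, flow, param,
                         1.f,                              // forwardBackwardThreshold
                         Size(6, 6),                       // gridStep
                         InterpolationType::INTERP_EPIC);  // interp_type
```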
+
+/** @brief Calculates fast optical flow for a sparse feature set using the robust local optical flow (RLOF) similar
+* to optflow::calcOpticalFlowPyrLK().
+*
+* The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as
+* proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK().
+*
+* @param prevImg first 8-bit input image. If the cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType
+* = SupportRegionType::SR_CROSS) the image has to be an 8-bit 3-channel image.
+* @param nextImg second 8-bit input image. If the cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType
+* = SupportRegionType::SR_CROSS) the image has to be an 8-bit 3-channel image.
+* @param prevPts vector of 2D points for which the flow needs to be found; point coordinates must be single-precision
+* floating-point numbers.
+* @param nextPts output vector of 2D points (with single-precision floating-point coordinates) containing the calculated
+* new positions of the input features in the second image; when optflow::RLOFOpticalFlowParameter::useInitialFlow is true the vector must
+* have the same size as the input and contain the initialization point correspondences.
+* @param status output status vector (of unsigned chars); each element of the vector is set to 1 if the flow for the
+* corresponding feature has passed the forward backward check.
+* @param err output vector of errors; each element of the vector is set to the forward backward error for the corresponding feature.
+* @param rlofParam see optflow::RLOFOpticalFlowParameter
+* @param forwardBackwardThreshold Threshold for the forward backward confidence check. If forwardBackwardThreshold <= 0 the forward
+* backward check is not applied.
+*
+* @note SIMD parallelization is only available when compiling with SSE4.1.
+*
+* Parameters have been described in @cite Senst2012, @cite Senst2013, @cite Senst2014 and @cite Senst2016.
+* For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details.
+*/
+CV_EXPORTS_W void calcOpticalFlowSparseRLOF(InputArray prevImg, InputArray nextImg,
+    InputArray prevPts, InputOutputArray nextPts,
+    OutputArray status, OutputArray err,
+    Ptr<RLOFOpticalFlowParameter> rlofParam = Ptr<RLOFOpticalFlowParameter>(),
+    float forwardBackwardThreshold = 0);
+
+//! Additional interface to the Dense RLOF algorithm - optflow::calcOpticalFlowDenseRLOF()
+CV_EXPORTS_W Ptr<DenseOpticalFlow> createOptFlow_DenseRLOF();
+
+//! Additional interface to the Sparse RLOF algorithm - optflow::calcOpticalFlowSparseRLOF()
+CV_EXPORTS_W Ptr<SparseOpticalFlow> createOptFlow_SparseRLOF();
+//! @}
+
+} // namespace
+} // namespace
+#endif
diff --git a/modules/optflow/perf/perf_rlof.cpp b/modules/optflow/perf/perf_rlof.cpp
new file mode 100644
index 00000000000..1295c526a6e
--- /dev/null
+++ b/modules/optflow/perf/perf_rlof.cpp
@@ -0,0 +1,72 @@
+#include "perf_precomp.hpp"
+namespace opencv_test { namespace {
+
+typedef tuple<std::string, std::string, bool> ST_SR_IM_Sparse_t;
+typedef TestBaseWithParam<ST_SR_IM_Sparse_t> ST_SR_IM_Sparse;
+PERF_TEST_P(ST_SR_IM_Sparse, OpticalFlow_SparseRLOF,
+    testing::Combine(
+        testing::Values("ST_BILINEAR", "ST_STANDART"),
+        testing::Values("SR_CROSS", "SR_FIXED"),
+        testing::Values(true, false))
+)
+{
+    Mat frame1 = imread(getDataPath("cv/optflow/RubberWhale1.png"));
+    Mat frame2 = imread(getDataPath("cv/optflow/RubberWhale2.png"));
+    ASSERT_FALSE(frame1.empty());
+    ASSERT_FALSE(frame2.empty());
+    vector<Point2f> prevPts, currPts;
+    for (int r = 0; r < frame1.rows; r += 10)
+    {
+        for (int c = 0; c < frame1.cols; c += 10)
+        {
+            prevPts.push_back(Point2f(static_cast<float>(c), static_cast<float>(r)));
+        }
+    }
+    vector<uchar> status(prevPts.size());
+    vector<float> err(prevPts.size());
+
+    Ptr<RLOFOpticalFlowParameter> param = Ptr<RLOFOpticalFlowParameter>(new RLOFOpticalFlowParameter);
+    if (get<0>(GetParam()) == "ST_BILINEAR")
+        param->solverType = ST_BILINEAR;
+    if (get<0>(GetParam()) == "ST_STANDART")
+        param->solverType = ST_STANDART;
+    if (get<1>(GetParam()) == "SR_CROSS")
+        param->supportRegionType = SR_CROSS;
+    if (get<1>(GetParam()) == "SR_FIXED")
+        param->supportRegionType = SR_FIXED;
+    param->useIlluminationModel = get<2>(GetParam());
+
+    PERF_SAMPLE_BEGIN()
+        calcOpticalFlowSparseRLOF(frame1, frame2, prevPts, currPts, status, err, param, 1.f);
+    PERF_SAMPLE_END()
+
+    SANITY_CHECK_NOTHING();
+}
+
+typedef tuple<std::string, int> INTERP_GRID_Dense_t;
+typedef TestBaseWithParam<INTERP_GRID_Dense_t> INTERP_GRID_Dense;
+PERF_TEST_P(INTERP_GRID_Dense, OpticalFlow_DenseRLOF,
+    testing::Combine(
+        testing::Values("INTERP_EPIC", "INTERP_GEO"),
+        testing::Values(4, 10))
+)
+{
+    Mat flow;
+    Mat frame1 = imread(getDataPath("cv/optflow/RubberWhale1.png"));
+    Mat frame2 = imread(getDataPath("cv/optflow/RubberWhale2.png"));
+    ASSERT_FALSE(frame1.empty());
+    ASSERT_FALSE(frame2.empty());
+    Ptr<RLOFOpticalFlowParameter> param = Ptr<RLOFOpticalFlowParameter>(new RLOFOpticalFlowParameter);
+    Ptr<DenseRLOFOpticalFlow> algo = DenseRLOFOpticalFlow::create();
+    InterpolationType interp_type = INTERP_EPIC;
+    if (get<0>(GetParam()) == "INTERP_EPIC")
+        interp_type = INTERP_EPIC;
+    if (get<0>(GetParam()) == "INTERP_GEO")
+        interp_type = INTERP_GEO;
+    PERF_SAMPLE_BEGIN()
+        calcOpticalFlowDenseRLOF(frame1, frame2, flow, param, 1.0f, Size(get<1>(GetParam()), get<1>(GetParam())), interp_type);
+    PERF_SAMPLE_END()
+    SANITY_CHECK_NOTHING();
+}
+
+}} // namespace
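A sparse-tracking sketch mirroring the perf test above (`frame1`/`frame2` as loaded there; `nextPts` starts empty, as in the test):

```cpp
// Sketch: grid-sampled sparse RLOF tracking.
std::vector<cv::Point2f> prevPts, nextPts;
for (int r = 0; r < frame1.rows; r += 10)      // one point every 10 pixels
    for (int c = 0; c < frame1.cols; c += 10)
        prevPts.push_back(cv::Point2f((float)c, (float)r));

std::vector<uchar> status(prevPts.size());
std::vector<float> err(prevPts.size());
cv::Ptr<cv::optflow::RLOFOpticalFlowParameter> param =
    cv::optflow::RLOFOpticalFlowParameter::create();
cv::optflow::calcOpticalFlowSparseRLOF(frame1, frame2, prevPts, nextPts,
                                       status, err, param, 1.f /* FB threshold */);
// status[i] == 1 means point i passed the forward backward check;
// err[i] then holds its forward backward error.
```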
diff --git a/modules/optflow/src/rlof/berlof_invoker.hpp b/modules/optflow/src/rlof/berlof_invoker.hpp
new file mode 100644
index 00000000000..604305250a5
--- /dev/null
+++ b/modules/optflow/src/rlof/berlof_invoker.hpp
@@ -0,0 +1,2107 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+#ifndef _BERLOF_INVOKER_HPP_
+#define _BERLOF_INVOKER_HPP_
+#include "rlof_invokerbase.hpp"
+
+
+namespace cv {
+namespace optflow {
+
+
+static inline bool checkSolution(float x, float y, float* c)
+{
+    float _a = x - 0.002f;
+    float _b = y - 0.002f;
+    cv::Point2f tl(c[0] * _a * _b + c[1] * _a + c[2] * _b + c[3], c[4] * _a * _b + c[5] * _a + c[6] * _b + c[7]);
+    _a = x + 0.002f;
+    cv::Point2f tr(c[0] * _a * _b + c[1] * _a + c[2] * _b + c[3], c[4] * _a * _b + c[5] * _a + c[6] * _b + c[7]);
+    _b = y + 0.002f;
+    cv::Point2f br(c[0] * _a * _b + c[1] * _a + c[2] * _b + c[3], c[4] * _a * _b + c[5] * _a + c[6] * _b + c[7]);
+    _a = x - 0.002f;
+    cv::Point2f bl(c[0] * _a * _b + c[1] * _a + c[2] * _b + c[3], c[4] * _a * _b + c[5] * _a + c[6] * _b + c[7]);
+    return (tl.x >= 0 && tl.y >= 0) && (tr.x <= 0 && tr.y >= 0)
+        && (bl.x >= 0 && bl.y <= 0) && (br.x <= 0 && br.y <= 0);
+}
+
+static inline cv::Point2f est_DeltaGain(const cv::Matx44f& src, const cv::Vec4f& val)
+{
+    return cv::Point2f(
+        src(2,0) * val[0] + src(2,1) * val[1] + src(2,2) * val[2] + src(2,3) * val[3],
+        src(3,0) * val[0] + src(3,1) * val[1] + src(3,2) * val[2] + src(3,3) * val[3]);
+}
+static inline void est_Result(const cv::Matx44f& src, const cv::Vec4f& val, cv::Point2f& delta, cv::Point2f& gain)
+{
+    delta = cv::Point2f(
+        -(src(0,0) * val[0] + src(0,1) * val[1] + src(0,2) * val[2] + src(0,3) * val[3]),
+        -(src(1,0) * val[0] + src(1,1) * val[1] + src(1,2) * val[2] + src(1,3) * val[3]));
+
+    gain = cv::Point2f(
+        src(2,0) * val[0] + src(2,1) * val[1] + src(2,2) * val[2] + src(2,3) * val[3],
+        src(3,0) * val[0] + src(3,1) * val[1] + src(3,2) * val[2] + src(3,3) * val[3]);
+}
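+
+// Note: est_DeltaGain and est_Result read the solution of the 4x4 system out of the
+// inverted tensor: rows 0-1 give the (negated) displacement update, rows 2-3 the
+// illumination gain/offset update. checkSolution samples the bilinear residual c at
+// four points offset by +/-0.002 around (x, y) and accepts (x, y) only if the sign
+// pattern shows the residual field pointing inward, i.e. (x, y) is a sink of the
+// bilinear system.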
+
+namespace berlof {
+
+namespace ica {
+
+class TrackerInvoker : public cv::ParallelLoopBody
+{
+public:
+    TrackerInvoker(
+        const Mat& _prevImg,
+        const Mat& _prevDeriv,
+        const Mat& _nextImg,
+        const Mat& _rgbPrevImg,
+        const Mat& _rgbNextImg,
+        const Point2f* _prevPts,
+        Point2f* _nextPts,
+        uchar* _status,
+        float* _err,
+        int _level,
+        int _maxLevel,
+        int _winSize[2],
+        int _maxIteration,
+        bool _useInitialFlow,
+        int _supportRegionType,
+        int _crossSegmentationThreshold,
+        const std::vector<float>& _normSigmas,
+        float _minEigenValue
+    ) :
+        normSigma0(_normSigmas[0]),
+        normSigma1(_normSigmas[1]),
+        normSigma2(_normSigmas[2])
+    {
+        prevImg = &_prevImg;
+        prevDeriv = &_prevDeriv;
+        nextImg = &_nextImg;
+        rgbPrevImg = &_rgbPrevImg;
+        rgbNextImg = &_rgbNextImg;
+        prevPts = _prevPts;
+        nextPts = _nextPts;
+        status = _status;
+        err = _err;
+        minWinSize = _winSize[0];
+        maxWinSize = _winSize[1];
+        criteria.maxCount = _maxIteration;
+        criteria.epsilon = 0.01;
+        level = _level;
+        maxLevel = _maxLevel;
+        windowType = _supportRegionType;
+        minEigThreshold = _minEigenValue;
+        useInitialFlow = _useInitialFlow;
+        crossSegmentationThreshold = _crossSegmentationThreshold;
+    }
+
+    void operator()(const cv::Range& range) const CV_OVERRIDE
+    {
+        Point2f halfWin;
+        cv::Size winSize;
+        const Mat& I = *prevImg;
+        const Mat& J = *nextImg;
+        const Mat& derivI = *prevDeriv;
+        const Mat& BI = *rgbPrevImg;
+
+        const float FLT_SCALE = (1.f/(1 << 16));
+
+        winSize = cv::Size(maxWinSize, maxWinSize);
+        int winMaskwidth = roundUp(winSize.width, 16);
+        cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType);
+        winMaskMatBuf.setTo(1);
+
+        int j, cn = I.channels(), cn2 = cn*2;
+        int winbufwidth = roundUp(winSize.width, 16);
+        cv::Size winBufSize(winbufwidth, winbufwidth);
+
+        cv::AutoBuffer<deriv_type> _buf(winBufSize.area()*(cn + cn2));
+        int derivDepth = DataType<deriv_type>::depth;
+
+        Mat IWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn), (deriv_type*)_buf.data());
+        Mat derivIWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn2), (deriv_type*)_buf.data() + winBufSize.area()*cn);
+
+        for( int ptidx = range.start; ptidx < range.end; ptidx++ )
+        {
+            Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level));
+            Point2f nextPt;
+            if( level == maxLevel )
+            {
+                if( useInitialFlow )
+                    nextPt = nextPts[ptidx]*(float)(1./(1 << level));
+                else
+                    nextPt = prevPt;
+            }
+            else
+                nextPt = nextPts[ptidx]*2.f;
+            nextPts[ptidx] = nextPt;
+
+            Point2i iprevPt, inextPt;
+            iprevPt.x = cvFloor(prevPt.x);
+            iprevPt.y = cvFloor(prevPt.y);
+            int winArea = maxWinSize * maxWinSize;
+            cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0, 0, maxWinSize, maxWinSize));
+            winMaskMatBuf.setTo(0);
+            if( calcWinMaskMat(BI, windowType, iprevPt,
+                               winMaskMat, winSize, halfWin, winArea,
+                               minWinSize, maxWinSize) == false)
+            {
+                continue;
+            }
+            halfWin = Point2f(static_cast<float>(maxWinSize), static_cast<float>(maxWinSize)) - halfWin;
+            prevPt += halfWin;
+            iprevPt.x = cvFloor(prevPt.x);
+            iprevPt.y = cvFloor(prevPt.y);
+            if( iprevPt.x < 0 || iprevPt.x >= derivI.cols - winSize.width ||
+                iprevPt.y < 0 || iprevPt.y >= derivI.rows - winSize.height - 1)
+            {
+                if( level == 0 )
+                {
+                    if( status )
+                        status[ptidx] = 3;
+                    if( err )
+                        err[ptidx] = 0;
+                }
+                continue;
+            }
+
+            float a = prevPt.x - iprevPt.x;
+            float b = prevPt.y - iprevPt.y;
+            const int W_BITS = 14, W_BITS1 = 14;
+
+            int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS));
+            int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS));
+            int iw10 = cvRound((1.f - a)*b*(1 << W_BITS));
+            int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
+
+            float A11 = 0, A12 = 0, A22 = 0;
+            float D = 0;
+
+            // extract the patch from the first image, compute covariance matrix of derivatives
+            int x, y;
+            for( y = 0; y < winSize.height; y++ )
+            {
+                const uchar* src = I.ptr<uchar>(y + iprevPt.y, 0) + iprevPt.x*cn;
+                const uchar* src1 = I.ptr<uchar>(y + iprevPt.y + 1, 0) + iprevPt.x*cn;
+                const short* dsrc = derivI.ptr<short>(y + iprevPt.y, 0) + iprevPt.x*cn2;
+                const short* dsrc1 = derivI.ptr<short>(y + iprevPt.y + 1, 0) + iprevPt.x*cn2;
+                short* Iptr = IWinBuf.ptr<short>(y, 0);
+                short* dIptr = derivIWinBuf.ptr<short>(y, 0);
+                x = 0;
+                for( ; x < winSize.width*cn; x++, dsrc += 2, dsrc1 += 2, dIptr += 2 )
+                {
+                    if( winMaskMat.at<tMaskType>(y, x) == 0)
+                    {
+                        dIptr[0] = 0;
+                        dIptr[1] = 0;
+                        continue;
+                    }
+                    int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 +
+                                          src1[x]*iw10 + src1[x+cn]*iw11, W_BITS1-5);
+                    int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 +
+                                           dsrc1[0]*iw10 + dsrc1[cn2]*iw11, W_BITS1);
+                    int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 +
+                                           dsrc1[1]*iw10 + dsrc1[cn2+1]*iw11, W_BITS1);
+                    Iptr[x] = (short)ival;
+                    dIptr[0] = (short)ixval;
+                    dIptr[1] = (short)iyval;
+                }
+            }
+
+            cv::Mat residualMat = cv::Mat::zeros(winSize.height * (winSize.width + 8) * cn, 1, CV_16SC1);
+            cv::Point2f backUpNextPt = nextPt;
+            nextPt += halfWin;
+            Point2f prevDelta(0, 0);    // denotes h(t-1)
+            cv::Size _winSize = winSize;
+#ifdef RLOF_SSE
+            __m128i mmMask0, mmMask1, mmMask;
+            getWBitMask(_winSize.width, mmMask0, mmMask1, mmMask);
+#endif
+            float MEstimatorScale = 1;
+            int buffIdx = 0;
+            float c[8];
+            cv::Mat GMc0, GMc1, GMc2, GMc3;
+            cv::Vec2f Mc0, Mc1, Mc2, Mc3;
+            int noIteration = 0;
+            int noReusedIteration = 0;
+            int noSolvedIteration = 0;
+            for( j = 0; j < criteria.maxCount; j++ )
+            {
+                cv::Point2f delta(0, 0);
+                cv::Point2f
deltaGain(0,0); + bool hasSolved = false; + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + float ab = a * b; + if( j == 0 + || ( inextPt.x != cvFloor(nextPt.x) || inextPt.y != cvFloor(nextPt.y) || j % 2 != 0 )) + { + inextPt.x = cvFloor(nextPt.x); + inextPt.y = cvFloor(nextPt.y); + if( inextPt.x < 0 || inextPt.x >= J.cols - winSize.width || + inextPt.y < 0 || inextPt.y >= J.rows - winSize.height - 1) + { + if( level == 0 && status ) + status[ptidx] = 3; + break; + } + + + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + ab = a * b; + iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + // mismatch + if( j == 0 ) + { + A11 = 0; + A12 = 0; + A22 = 0; + } + + if ( j == 0 ) + { + buffIdx = 0; + for( y = 0; y < winSize.height; y++ ) + { + const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const uchar* Jptr1 = J.ptr(y + inextPt.y + 1, inextPt.x*cn); + const short* Iptr = IWinBuf.ptr(y, 0); + const short* dIptr = derivIWinBuf.ptr(y, 0); + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); + x = 0; + for( ; x < winSize.width*cn; x++, dIptr += 2) + { + if( maskPtr[x] == 0) + continue; + int diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + Jptr1[x]*iw10 + Jptr1[x+cn]*iw11, W_BITS1-5) + - Iptr[x]; + residualMat.at(buffIdx++) = static_cast(diff); + } + } + /*! Estimation for the residual */ + cv::Mat residualMatRoi(residualMat, cv::Rect(0,0,1, buffIdx)); + MEstimatorScale = (buffIdx == 0) ? 1.f : estimateScale(residualMatRoi); + } + + float eta = 1.f / winArea; + float fParam0 = normSigma0 * 32.f; + float fParam1 = normSigma1 * 32.f; + fParam0 = normSigma0 * MEstimatorScale; + fParam1 = normSigma1 * MEstimatorScale; + + + buffIdx = 0; + float _b0[4] = {0,0,0,0}; + float _b1[4] = {0,0,0,0}; + + /* + */ + for( y = 0; y < _winSize.height; y++ ) + { + const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const uchar* Jptr1 = J.ptr(y + inextPt.y + 1, inextPt.x*cn); + const short* Iptr = IWinBuf.ptr(y, 0); + const short* dIptr = derivIWinBuf.ptr(y, 0); + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); + x = 0; + for( ; x < _winSize.width*cn; x++, dIptr += 2 ) + { + if( maskPtr[x] == 0) + continue; + int illValue = - Iptr[x]; + + float It[4] = {static_cast((Jptr1[x+cn]<< 5) + illValue), + static_cast((Jptr[x+cn]<< 5) + illValue), + static_cast((Jptr1[x]<< 5) + illValue), + static_cast((Jptr[x] << 5) + illValue)}; + + + + int J_val = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + + Jptr1[x]*iw10 + Jptr1[x+cn]*iw11, + W_BITS1-5); + + + int diff = J_val + illValue; + + + MEstimatorScale += (diff < MEstimatorScale) ? -eta : eta; + + int abss = (diff < 0) ? 
-diff : diff; + + // compute the missmatch vector + if( j >= 0) + { + if( abss > fParam1) + { + It[0] = 0; + It[1] = 0; + It[2] = 0; + It[3] = 0; + } + else if( abss > fParam0 && diff >= 0 ) + { + It[0] = normSigma2 * (It[0] - fParam1); + It[1] = normSigma2 * (It[1] - fParam1); + It[2] = normSigma2 * (It[2] - fParam1); + It[3] = normSigma2 * (It[3] - fParam1); + } + else if( abss > fParam0 && diff < 0 ) + { + It[0] = normSigma2 * (It[0] + fParam1); + It[1] = normSigma2 * (It[1] + fParam1); + It[2] = normSigma2 * (It[2] + fParam1); + It[3] = normSigma2 * (It[3] + fParam1); + } + } + + float It0 = It[0]; + float It1 = It[1]; + float It2 = It[2]; + float It3 = It[3]; + + float ixval = static_cast(dIptr[0]); + float iyval = static_cast(dIptr[1]); + _b0[0] += It0 * ixval; + _b0[1] += It1 * ixval; + _b0[2] += It2 * ixval; + _b0[3] += It3 * ixval; + + + _b1[0] += It0*iyval; + _b1[1] += It1*iyval; + _b1[2] += It2*iyval; + _b1[3] += It3*iyval; + + + // compute the Gradient Matrice + if( j == 0) + { + float tale = normSigma2 * FLT_RESCALE; + if( abss < fParam0 || j < 0) + { + tale = FLT_RESCALE; + } + else if( abss > fParam1) + { + tale *= 0.01f; + } + else + { + tale *= normSigma2; + } + + A11 += (float)(ixval*ixval)*tale; + A12 += (float)(ixval*iyval)*tale; + A22 += (float)(iyval*iyval)*tale; + + } + } + } + + if( j == 0 ) + { + + A11 *= FLT_SCALE; // 54866744. + A12 *= FLT_SCALE; // -628764.00 + A22 *= FLT_SCALE; // 19730.000 + + D = A11 * A22 - A12 * A12; + float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + + 4.f*A12*A12))/(2*winArea); + + if( minEig < minEigThreshold || std::abs(D) < FLT_EPSILON) + { + if( level == 0 && status ) + status[ptidx] = 0; + if( level > 0) + { + nextPts[ptidx] = backUpNextPt; + } + noIteration++; + break; + } + + D = (1.f / D); + + } + + _b0[0] *= FLT_SCALE;_b0[1] *= FLT_SCALE;_b0[2] *= FLT_SCALE;_b0[3] *= FLT_SCALE; + _b1[0] *= FLT_SCALE;_b1[1] *= FLT_SCALE;_b1[2] *= FLT_SCALE;_b1[3] *= FLT_SCALE; + + + Mc0[0] = _b0[0] - _b0[1] - _b0[2] + _b0[3]; + Mc0[1] = _b1[0] - _b1[1] - _b1[2] + _b1[3]; + + Mc1[0] = _b0[1] - _b0[3]; + Mc1[1] = _b1[1] - _b1[3]; + + + Mc2[0] = _b0[2] - _b0[3]; + Mc2[1] = _b1[2] - _b1[3]; + + + Mc3[0] = _b0[3]; + Mc3[1] = _b1[3]; + + c[0] = -Mc0[0]; + c[1] = -Mc1[0]; + c[2] = -Mc2[0]; + c[3] = -Mc3[0]; + c[4] = -Mc0[1]; + c[5] = -Mc1[1]; + c[6] = -Mc2[1]; + c[7] = -Mc3[1]; + + float e0 = 1.f / (c[6] * c[0] - c[4] * c[2]); + float e1 = e0 * 0.5f * (c[6] * c[1] + c[7] * c[0] - c[5] * c[2] - c[4] * c[3]); + float e2 = e0 * (c[1] * c[7] -c[3] * c[5]); + e0 = e1 * e1 - e2; + hasSolved = false; + if ( e0 > 0) + { + e0 = sqrt(e0); + float _y[2] = {-e1 - e0, e0 - e1}; + float c0yc1[2] = {c[0] * _y[0] + c[1], + c[0] * _y[1] + c[1]}; + float _x[2] = {-(c[2] * _y[0] + c[3]) / c0yc1[0], + -(c[2] * _y[1] + c[3]) / c0yc1[1]}; + bool isIn1 = (_x[0] >=0 && _x[0] <=1 && _y[0] >= 0 && _y[0] <= 1); + bool isIn2 = (_x[1] >=0 && _x[1] <=1 && _y[1] >= 0 && _y[1] <= 1); + + bool isSolution1 = checkSolution(_x[0], _y[0], c ); + bool isSolution2 = checkSolution(_x[1], _y[1], c ); + bool isSink1 = isIn1 && isSolution1; + bool isSink2 = isIn2 && isSolution2; + + if ( isSink1 != isSink2) + { + a = isSink1 ? _x[0] : _x[1]; + b = isSink1 ? 
_y[0] : _y[1]; + ab = a * b; + hasSolved = true; + delta.x = inextPt.x + a - nextPt.x; + delta.y = inextPt.y + b - nextPt.y; + } // isIn1 != isIn2 + } + if( hasSolved == false) + noIteration++; + } + else + { + hasSolved = false; + noReusedIteration++; + } + if( hasSolved == false ) + { + + cv::Vec2f mismatchVec = ab * Mc0 + Mc1 *a + Mc2 * b + Mc3; + delta.x = (A12 * mismatchVec.val[1] - A22 * mismatchVec.val[0]) * D; + delta.y = (A12 * mismatchVec.val[0] - A11 * mismatchVec.val[1]) * D; + delta.x = MAX(-1.f, MIN(1.f, delta.x)); + delta.y = MAX(-1.f, MIN(1.f, delta.y)); + nextPt += delta; + nextPts[ptidx] = nextPt - halfWin; + } + else + { + nextPt += delta; + nextPts[ptidx] = nextPt - halfWin; + noSolvedIteration++; + break; + } + + delta.x = ( delta.x != delta.x) ? 0 : delta.x; + delta.y = ( delta.y != delta.y) ? 0 : delta.y; + + if(j > 0 && ( + (std::abs(delta.x - prevDelta.x) < 0.01 && std::abs(delta.y - prevDelta.y) < 0.01))) + { + nextPts[ptidx] -= delta*0.5f; + break; + } + + prevDelta = delta; + } + + } + + } + + const Mat* prevImg; + const Mat* nextImg; + const Mat* prevDeriv; + const Mat* rgbPrevImg; + const Mat* rgbNextImg; + const Point2f* prevPts; + Point2f* nextPts; + uchar* status; + float* err; + int maxWinSize; + int minWinSize; + TermCriteria criteria; + int level; + int maxLevel; + int windowType; + float minEigThreshold; + bool useInitialFlow; + const float normSigma0, normSigma1, normSigma2; + int crossSegmentationThreshold; + +}; + +} // namespace +namespace radial { +class TrackerInvoker : public cv::ParallelLoopBody +{ +public: + TrackerInvoker( + const Mat& _prevImg, + const Mat& _prevDeriv, + const Mat& _nextImg, + const Mat& _rgbPrevImg, + const Mat& _rgbNextImg, + const Point2f* _prevPts, + Point2f* _nextPts, + uchar* _status, + float* _err, + Point2f* _gainVecs, + int _level, + int _maxLevel, + int _winSize[2], + int _maxIteration, + bool _useInitialFlow, + int _supportRegionType, + int _crossSegmentationThreshold, + const std::vector& _normSigmas, + float _minEigenValue + ) : + normSigma0(_normSigmas[0]), + normSigma1(_normSigmas[1]), + normSigma2(_normSigmas[2]) + { + prevImg = &_prevImg; + prevDeriv = &_prevDeriv; + nextImg = &_nextImg; + rgbPrevImg = &_rgbPrevImg; + rgbNextImg = &_rgbNextImg; + prevPts = _prevPts; + nextPts = _nextPts; + status = _status; + err = _err; + gainVecs = _gainVecs; + minWinSize = _winSize[0]; + maxWinSize = _winSize[1]; + criteria.maxCount = _maxIteration; + criteria.epsilon = 0.01; + level = _level; + maxLevel = _maxLevel; + windowType = _supportRegionType; + minEigThreshold = _minEigenValue; + useInitialFlow = _useInitialFlow; + crossSegmentationThreshold = _crossSegmentationThreshold; + } + + void operator()(const cv::Range& range) const CV_OVERRIDE + { + Point2f halfWin; + cv::Size winSize; + const Mat& I = *prevImg; + const Mat& J = *nextImg; + const Mat& derivI = *prevDeriv; + const Mat& BI = *rgbPrevImg; + const float FLT_SCALE = (1.f/(1 << 16));//(1.f/(1 << 20)); // 20 + winSize = cv::Size(maxWinSize,maxWinSize); + int winMaskwidth = roundUp(winSize.width, 16); + cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); + winMaskMatBuf.setTo(1); + + int cn = I.channels(), cn2 = cn*2; + int winbufwidth = roundUp(winSize.width, 16); + cv::Size winBufSize(winbufwidth,winbufwidth); + + + cv::Matx44f invTensorMat; + + cv::AutoBuffer _buf(winBufSize.area()*(cn + cn2)); + int derivDepth = DataType::depth; + + Mat IWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn), (deriv_type*)_buf.data()); + Mat derivIWinBuf(winBufSize, 
CV_MAKETYPE(derivDepth, cn2), (deriv_type*)_buf.data() + winBufSize.area()*cn); + + for( int ptidx = range.start; ptidx < range.end; ptidx++ ) + { + Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level)); + Point2f nextPt; + if( level == maxLevel ) + { + if( useInitialFlow ) + nextPt = nextPts[ptidx]*(float)(1./(1 << level)); + else + nextPt = prevPt; + } + else + nextPt = nextPts[ptidx]*2.f; + nextPts[ptidx] = nextPt; + + Point2i iprevPt, inextPt; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + int winArea = maxWinSize * maxWinSize; + cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0,0, maxWinSize,maxWinSize)); + winMaskMatBuf.setTo(0); + if( calcWinMaskMat(BI, windowType, iprevPt, + winMaskMat,winSize,halfWin,winArea, + minWinSize,maxWinSize) == false) + continue; + halfWin = Point2f(static_cast(maxWinSize), static_cast(maxWinSize) ) - halfWin; + prevPt += halfWin; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + if( iprevPt.x < 0 || iprevPt.x >= derivI.cols - winSize.width || + iprevPt.y < 0 || iprevPt.y >= derivI.rows - winSize.height - 1) + { + if( level == 0 ) + { + if( status ) + status[ptidx] = 3; + if( err ) + err[ptidx] = 0; + } + continue; + } + + float a = prevPt.x - iprevPt.x; + float b = prevPt.y - iprevPt.y; + const int W_BITS = 14, W_BITS1 = 14; + + int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + float A11 = 0, A12 = 0, A22 = 0; + + // tensor + float sumIx = 0; + float sumIy = 0; + float sumI = 0; + float sumW = 0; + float w1 = 0, w2 = 0; // -IyI + float dI = 0; // I^2 + float D = 0; + +#ifdef RLOF_SSE + + __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128i z = _mm_setzero_si128(); + __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1-1)); + __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1-5-1)); + __m128i mmMask4_epi32; + __m128i mmMaskSet_epi16 = _mm_set1_epi16(std::numeric_limits::max()); + get4BitMask(winSize.width, mmMask4_epi32); +#endif + + // extract the patch from the first image, compute covariation matrix of derivatives + int x, y; + for( y = 0; y < winSize.height; y++ ) + { + x = 0; + const uchar* src = I.ptr(y + iprevPt.y, 0) + iprevPt.x*cn; + const uchar* src1 = I.ptr(y + iprevPt.y + 1, 0) + iprevPt.x*cn; + const short* dsrc = derivI.ptr(y + iprevPt.y, 0) + iprevPt.x*cn2; + const short* dsrc1 = derivI.ptr(y + iprevPt.y + 1, 0) + iprevPt.x*cn2; + short* Iptr = IWinBuf.ptr(y, 0); + short* dIptr = derivIWinBuf.ptr(y, 0); +#ifdef RLOF_SSE + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); + for( ; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4*2, dsrc1 += 8, dIptr += 4*2 ) + { + __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16); + __m128i mask_0_3_epi16 = _mm_unpacklo_epi16(mask_0_7_epi16, mask_0_7_epi16); + + + __m128i v00, v01, v10, v11, t0, t1; + v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); + v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); + v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x)), z); + v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x + cn)), z); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); + if( x + 4 > 
winSize.width) + { + t0 = _mm_and_si128(t0, mmMask4_epi32); + } + t0 = _mm_and_si128(t0, mask_0_3_epi16); + _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0,t0)); + + + v00 = _mm_loadu_si128((const __m128i*)(dsrc)); + v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); + v10 = _mm_loadu_si128((const __m128i*)(dsrc1)); + v11 = _mm_loadu_si128((const __m128i*)(dsrc1 + cn2)); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); + v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... + if( x + 4 > winSize.width) + { + v00 = _mm_and_si128(v00, mmMask4_epi32); + } + v00 = _mm_and_si128(v00, mask_0_3_epi16); + _mm_storeu_si128((__m128i*)dIptr, v00); + } +#else + + for( ; x < winSize.width*cn; x++, dsrc += 2, dsrc1 += 2, dIptr += 2 ) + { + if( winMaskMat.at(y,x) == 0) + { + dIptr[0] = 0; + dIptr[1] = 0; + continue; + } + int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + + src1[x]*iw10 + src1[x+cn]*iw11, W_BITS1-5); + int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + + dsrc1[0]*iw10 + dsrc1[cn2]*iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + + dsrc1[1]*iw10 + dsrc1[cn2+1]*iw11, W_BITS1); + + Iptr[x] = (short)ival; + dIptr[0] = (short)ixval; + dIptr[1] = (short)iyval; + + } +#endif + } + + cv::Mat residualMat = cv::Mat::zeros(winSize.height * (winSize.width + 8) * cn, 1, CV_16SC1); + cv::Point2f backUpNextPt = nextPt; + nextPt += halfWin; + Point2f prevDelta(0,0); //relates to h(t-1) + Point2f prevGain(1,0); + cv::Point2f gainVec = gainVecs[ptidx]; + cv::Point2f backUpGain = gainVec; + cv::Size _winSize = winSize; + int j; +#ifdef RLOF_SSE + __m128i mmMask0, mmMask1, mmMask; + getWBitMask(_winSize.width, mmMask0, mmMask1, mmMask); + __m128 mmOnes = _mm_set1_ps(1.f ); +#endif + float MEstimatorScale = 1; + int buffIdx = 0; + float c[8]; + cv::Mat GMc0, GMc1, GMc2, GMc3; + cv::Vec4f Mc0, Mc1, Mc2, Mc3; + int noIteration = 0; + int noReusedIteration = 0; + int noSolvedIteration = 0; + for( j = 0; j < criteria.maxCount; j++ ) + { + cv::Point2f delta(0,0); + cv::Point2f deltaGain(0,0); + bool hasSolved = false; + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + float ab = a * b; + if (j == 0 + || (inextPt.x != cvFloor(nextPt.x) || inextPt.y != cvFloor(nextPt.y) || j % 2 != 0) ) + { + inextPt.x = cvFloor(nextPt.x); + inextPt.y = cvFloor(nextPt.y); + + if( inextPt.x < 0 || inextPt.x >= J.cols - winSize.width || + inextPt.y < 0 || inextPt.y >= J.rows - winSize.height - 1) + { + if( level == 0 && status ) + status[ptidx] = 3; + if (level > 0) + { + nextPts[ptidx] = backUpNextPt; + gainVecs[ptidx] = backUpGain; + } + noIteration++; + break; + } + + + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + ab = a * b; + iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + // mismatch + + if( j == 0) + { + // tensor + w1 = 0; // -IxI + w2 = 0; // -IyI + dI = 0; // I^2 + sumIx = 0; + sumIy = 0; + sumI = 0; + sumW = 0; + A11 = 0; + A12 = 0; + A22 = 0; + } + + if ( j == 0 ) + { + buffIdx = 0; + for( y = 0; y < winSize.height; y++ ) + { + const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const uchar* Jptr1 
= J.ptr(y + inextPt.y + 1, inextPt.x*cn); + const short* Iptr = IWinBuf.ptr(y, 0); + const short* dIptr = derivIWinBuf.ptr(y, 0); + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); + x = 0; + for( ; x < winSize.width*cn; x++, dIptr += 2) + { + if( maskPtr[x] == 0) + continue; + int diff = static_cast(CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + Jptr1[x]*iw10 + Jptr1[x+cn]*iw11, W_BITS1-5) + - Iptr[x] + Iptr[x] * gainVec.x +gainVec.y); + residualMat.at(buffIdx++) = static_cast(diff); + } + } + /*! Estimation for the residual */ + cv::Mat residualMatRoi(residualMat, cv::Rect(0,0,1, buffIdx)); + MEstimatorScale = (buffIdx == 0) ? 1.f : estimateScale(residualMatRoi); + } + + float eta = 1.f / winArea; + float fParam0 = normSigma0 * 32.f; + float fParam1 = normSigma1 * 32.f; + fParam0 = normSigma0 * MEstimatorScale; + fParam1 = normSigma1 * MEstimatorScale; + + #ifdef RLOF_SSE + + qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128 qb0[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; + __m128 qb1[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; + __m128 qb2[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; + __m128 qb3[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; + __m128 mmSumW1 = _mm_setzero_ps(), mmSumW2 = _mm_setzero_ps(); + __m128 mmSumI = _mm_setzero_ps(), mmSumW = _mm_setzero_ps(), mmSumDI = _mm_setzero_ps(); + __m128 mmSumIy = _mm_setzero_ps(), mmSumIx = _mm_setzero_ps(); + __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps(); + __m128i mmParam0 = _mm_set1_epi16(MIN(std::numeric_limits::max() -1, static_cast(fParam0))); + __m128i mmParam1 = _mm_set1_epi16(MIN(std::numeric_limits::max()- 1, static_cast(fParam1))); + + + float s2Val = std::fabs(normSigma2); + int s2bitShift = normSigma2 == 0 ? 1 : cvCeil(log(200.f / s2Val) / log(2.f)); + __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast(normSigma2 * (float)(1 << s2bitShift))); + __m128i mmOness_epi16 = _mm_set1_epi16(1 << s2bitShift); + __m128 mmParam2s = _mm_set1_ps(0.01f * normSigma2); + __m128 mmParam2s2 = _mm_set1_ps(normSigma2 * normSigma2); + float gainVal = gainVec.x > 0 ? gainVec.x : -gainVec.x; + int bitShift = gainVec.x == 0 ? 
1 : cvCeil(log(200.f / gainVal) / log(2.f)); + __m128i mmGainValue_epi16 = _mm_set1_epi16(static_cast(gainVec.x * (float)(1 << bitShift))); + __m128i mmConstValue_epi16 = _mm_set1_epi16(static_cast(gainVec.y)); + __m128i mmEta = _mm_setzero_si128(); + __m128i mmScale = _mm_set1_epi16(static_cast(MEstimatorScale)); + + #endif + + buffIdx = 0; + float _b0[4] = {0,0,0,0}; + float _b1[4] = {0,0,0,0}; + float _b2[4] = {0,0,0,0}; + float _b3[4] = {0,0,0,0}; + /* + */ + for( y = 0; y < _winSize.height; y++ ) + { + const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const uchar* Jptr1 = J.ptr(y + inextPt.y + 1, inextPt.x*cn); + const short* Iptr = IWinBuf.ptr(y, 0); + const short* dIptr = derivIWinBuf.ptr(y, 0); + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); + x = 0; + #ifdef RLOF_SSE + for( ; x <= _winSize.width*cn; x += 8, dIptr += 8*2 ) + { + __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16); + __m128i I_0_7_epi16 = _mm_loadu_si128((const __m128i*)(Iptr + x)); + + __m128i v00 = _mm_unpacklo_epi8( + _mm_loadl_epi64((const __m128i*)(Jptr + x)), z); + __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); + __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x)), z); + __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x + cn)), z); + + __m128i t0 = _mm_add_epi32 + (_mm_madd_epi16( + _mm_unpacklo_epi16(v00, v01), + qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1-5); + + __m128i lo = _mm_mullo_epi16(mmGainValue_epi16, I_0_7_epi16); + __m128i hi = _mm_mulhi_epi16(mmGainValue_epi16, I_0_7_epi16); + + __m128i Igain_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), bitShift); + __m128i Igain_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), bitShift); + __m128i Igain_epi16 = _mm_packs_epi32(Igain_0_3_epi32, Igain_4_7_epi32); + + + __m128i diffValue = _mm_subs_epi16(_mm_add_epi16(Igain_epi16, mmConstValue_epi16), I_0_7_epi16); + __m128i mmDiffc_epi16[4] = + { + _mm_add_epi16(_mm_slli_epi16(v11, 5), diffValue), + _mm_add_epi16(_mm_slli_epi16(v01, 5), diffValue), + _mm_add_epi16(_mm_slli_epi16(v10, 5), diffValue), + _mm_add_epi16(_mm_slli_epi16(v00, 5), diffValue) + }; + + __m128i mmDiff_epi16 = _mm_add_epi16( _mm_packs_epi32(t0, t1), diffValue); + + + mmDiff_epi16 = _mm_and_si128(mmDiff_epi16, mask_0_7_epi16); + + __m128i scalediffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, mmScale); + mmEta = _mm_add_epi16(mmEta, _mm_add_epi16(_mm_and_si128(scalediffIsPos_epi16, _mm_set1_epi16(2)), _mm_set1_epi16(-1))); + + + __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); + __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); + + + __m128i abs_epi16 = _mm_abs_epi16(mmDiff_epi16); + __m128i bSet2_epi16, bSet1_epi16; + // |It| < sigma1 ? + bSet2_epi16 = _mm_cmplt_epi16(abs_epi16, mmParam1); + // It > 0 ? + __m128i diffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, _mm_setzero_si128()); + // sigma0 < |It| < sigma1 ? 
+ bSet1_epi16 = _mm_and_si128(bSet2_epi16, _mm_cmpgt_epi16(abs_epi16, mmParam0)); + // val = |It| -/+ sigma1 + __m128i tmpParam1_epi16 = _mm_add_epi16(_mm_and_si128(diffIsPos_epi16, _mm_sub_epi16(mmDiff_epi16, mmParam1)), + _mm_andnot_si128(diffIsPos_epi16, _mm_add_epi16(mmDiff_epi16, mmParam1))); + // It == 0 ? |It| > sigma13 + mmDiff_epi16 = _mm_and_si128(bSet2_epi16, mmDiff_epi16); + + for( unsigned int mmi = 0; mmi < 4; mmi++) + { + __m128i mmDiffc_epi16_t = _mm_and_si128(mmDiffc_epi16[mmi], mask_0_7_epi16); + mmDiffc_epi16_t = _mm_and_si128(bSet2_epi16, mmDiffc_epi16_t); + + // It == val ? sigma0 < |It| < sigma1 + mmDiffc_epi16_t = _mm_blendv_epi8(mmDiffc_epi16_t, tmpParam1_epi16, bSet1_epi16); + __m128i tale_epi16_ = _mm_blendv_epi8(mmOness_epi16, mmParam2_epi16, bSet1_epi16); // mask for 0 - 3 + // diff = diff * sigma2 + lo = _mm_mullo_epi16(tale_epi16_, mmDiffc_epi16_t); + hi = _mm_mulhi_epi16(tale_epi16_, mmDiffc_epi16_t); + __m128i diff_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), s2bitShift); + __m128i diff_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), s2bitShift); + + mmDiffc_epi16_t = _mm_packs_epi32(diff_0_3_epi32, diff_4_7_epi32); + __m128i diff1 = _mm_unpackhi_epi16(mmDiffc_epi16_t, mmDiffc_epi16_t); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13... + __m128i diff0 = _mm_unpacklo_epi16(mmDiffc_epi16_t, mmDiffc_epi16_t); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9... + + // Ix * diff / Iy * diff + v10 = _mm_mullo_epi16(Ixy_0, diff0); + v11 = _mm_mulhi_epi16(Ixy_0, diff0); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + + qb0[mmi] = _mm_add_ps(qb0[mmi], _mm_cvtepi32_ps(v00)); + qb1[mmi] = _mm_add_ps(qb1[mmi], _mm_cvtepi32_ps(v10)); + // It * Ix It * Iy [4 ... 7] + // for set 1 hi sigma 1 + v10 = _mm_mullo_epi16(Ixy_1, diff1); + v11 = _mm_mulhi_epi16(Ixy_1, diff1); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb0[mmi] = _mm_add_ps(qb0[mmi], _mm_cvtepi32_ps(v00)); + qb1[mmi] = _mm_add_ps(qb1[mmi], _mm_cvtepi32_ps(v10)); + // diff * J [0 ... 
7] + // for set 1 sigma 1 + // b3 += diff * Iptr[x] + v10 = _mm_mullo_epi16(mmDiffc_epi16_t, I_0_7_epi16); + v11 = _mm_mulhi_epi16(mmDiffc_epi16_t, I_0_7_epi16); + v00 = _mm_unpacklo_epi16(v10, v11); + + v10 = _mm_unpackhi_epi16(v10, v11); + qb2[mmi] = _mm_add_ps(qb2[mmi], _mm_cvtepi32_ps(v00)); + qb2[mmi] = _mm_add_ps(qb2[mmi], _mm_cvtepi32_ps(v10)); + qb3[mmi] = _mm_add_ps(qb3[mmi], _mm_cvtepi32_ps(diff_0_3_epi32)); + qb3[mmi] = _mm_add_ps(qb3[mmi], _mm_cvtepi32_ps(diff_4_7_epi32)); + } + + if( j == 0 ) + { + __m128 bSet1_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet1_epi16)); + __m128 bSet1_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet1_epi16,bSet1_epi16), 16)); + __m128 mask_0_4_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mask_0_7_epi16)); + __m128 mask_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mask_0_7_epi16, mask_0_7_epi16),16))); + + __m128 bSet2_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet2_epi16)); + __m128 bSet2_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet2_epi16, bSet2_epi16),16)); + + __m128 tale_0_3_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_0_3_ps); + __m128 tale_4_7_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_4_7_ps); + tale_0_3_ps = _mm_blendv_ps(mmParam2s, tale_0_3_ps, bSet2_0_3_ps); + tale_4_7_ps = _mm_blendv_ps(mmParam2s, tale_4_7_ps, bSet2_4_7_ps); + + tale_0_3_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_0_3_ps, mask_0_4_ps); + tale_4_7_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_4_7_ps, mask_4_7_ps); + + t0 = _mm_srai_epi32(Ixy_0, 16); // Iy0 Iy1 Iy2 Iy3 + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16); // Ix0 Ix1 Ix2 Ix3 + + __m128 fy = _mm_cvtepi32_ps(t0); + __m128 fx = _mm_cvtepi32_ps(t1); + + // 0 ... 3 + __m128 I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(I_0_7_epi16, I_0_7_epi16), 16)); + + // A11 - A22 + __m128 fxtale = _mm_mul_ps(fx, tale_0_3_ps); + __m128 fytale = _mm_mul_ps(fy, tale_0_3_ps); + + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); + mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); + mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); + + // sumIx und sumIy + mmSumIx = _mm_add_ps(mmSumIx, fxtale); + mmSumIy = _mm_add_ps(mmSumIy, fytale); + + mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fxtale)); + mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fytale)); + + // sumI + __m128 I_tale_ps = _mm_mul_ps(I_ps, tale_0_3_ps); + mmSumI = _mm_add_ps(mmSumI,I_tale_ps); + + // sumW + mmSumW = _mm_add_ps(mmSumW, tale_0_3_ps); + + // sumDI + mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps( I_ps, I_tale_ps)); + + + t0 = _mm_srai_epi32(Ixy_1, 16); // Iy8 Iy9 Iy10 Iy11 + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_1, 16), 16); // Ix0 Ix1 Ix2 Ix3 + + fy = _mm_cvtepi32_ps(t0); + fx = _mm_cvtepi32_ps(t1); + + // 4 ... 
7 + I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(I_0_7_epi16, I_0_7_epi16), 16)); + + // A11 - A22 + fxtale = _mm_mul_ps(fx, tale_4_7_ps); + fytale = _mm_mul_ps(fy, tale_4_7_ps); + + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); + mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); + mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); + + // sumIx und sumIy + mmSumIx = _mm_add_ps(mmSumIx, fxtale); + mmSumIy = _mm_add_ps(mmSumIy, fytale); + + mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fxtale)); + mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fytale)); + + // sumI + I_tale_ps = _mm_mul_ps(I_ps, tale_4_7_ps); + mmSumI = _mm_add_ps(mmSumI, I_tale_ps); + + // sumW + mmSumW = _mm_add_ps(mmSumW, tale_4_7_ps); + + // sumDI + mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps( I_ps, I_tale_ps)); + } + + } + #else + for( ; x < _winSize.width*cn; x++, dIptr += 2 ) + { + if( maskPtr[x] == 0) + continue; + + short ixval = dIptr[0]; + short iyval = dIptr[1]; + int illValue = static_cast(Iptr[x] * gainVec.x + gainVec.y - Iptr[x]); + + int It[4] = {(Jptr1[x+cn]<< 5) + illValue, + (Jptr[x+cn]<< 5) + illValue, + (Jptr1[x]<< 5) + illValue, + (Jptr[x] << 5) + illValue}; + + + int J_val = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + + Jptr1[x]*iw10 + Jptr1[x+cn]*iw11, + W_BITS1-5); + + int diff = J_val + illValue; + + + + MEstimatorScale += (diff < MEstimatorScale) ? -eta : eta; + + int abss = (diff < 0) ? -diff : diff; + + + // compute the missmatch vector + if( j >= 0) + { + if( abss > fParam1) + { + It[0] = 0; + It[1] = 0; + It[2] = 0; + It[3] = 0; + } + else if( abss > static_cast(fParam0) && diff >= 0 ) + { + It[0] = static_cast(normSigma2 * (It[0] - fParam1)); + It[1] = static_cast(normSigma2 * (It[1] - fParam1)); + It[2] = static_cast(normSigma2 * (It[2] - fParam1)); + It[3] = static_cast(normSigma2 * (It[3] - fParam1)); + } + else if( abss > static_cast(fParam0) && diff < 0 ) + { + It[0] = static_cast(normSigma2 * (It[0] + fParam1)); + It[1] = static_cast(normSigma2 * (It[1] + fParam1)); + It[2] = static_cast(normSigma2 * (It[2] + fParam1)); + It[3] = static_cast(normSigma2 * (It[3] + fParam1)); + } + } + _b0[0] += (float)(It[0]*dIptr[0]) ; + _b0[1] += (float)(It[1]*dIptr[0]) ; + _b0[2] += (float)(It[2]*dIptr[0]) ; + _b0[3] += (float)(It[3]*dIptr[0]) ; + + + _b1[0] += (float)(It[0]*dIptr[1]) ; + _b1[1] += (float)(It[1]*dIptr[1]) ; + _b1[2] += (float)(It[2]*dIptr[1]) ; + _b1[3] += (float)(It[3]*dIptr[1]) ; + + _b2[0] += (float)(It[0])*Iptr[x] ; + _b2[1] += (float)(It[1])*Iptr[x] ; + _b2[2] += (float)(It[2])*Iptr[x] ; + _b2[3] += (float)(It[3])*Iptr[x] ; + + _b3[0] += (float)(It[0]); + _b3[1] += (float)(It[1]); + _b3[2] += (float)(It[2]); + _b3[3] += (float)(It[3]); + + // compute the Gradient Matrice + if( j == 0) + { + float tale = normSigma2 * FLT_RESCALE; + if( abss < fParam0 || j < 0) + { + tale = FLT_RESCALE; + } + else if( abss > fParam1) + { + tale *= 0.01f; + } + else + { + tale *= normSigma2; + } + if( j == 0 ) + { + A11 += (float)(ixval*ixval)*tale; + A12 += (float)(ixval*iyval)*tale; + A22 += (float)(iyval*iyval)*tale; + } + + dI += Iptr[x] * Iptr[x] * tale; + float dx = static_cast(dIptr[0]) * tale; + float dy = static_cast(dIptr[1]) * tale; + sumIx += dx; + sumIy += dy; + w1 += dx * Iptr[x]; + w2 += dy * Iptr[x]; + sumI += Iptr[x] * tale; + sumW += tale; + } + + } + #endif + } + + #ifdef RLOF_SSE + short etaValues[8]; + _mm_storeu_si128((__m128i*)(etaValues), mmEta); + MEstimatorScale += eta * (etaValues[0] + etaValues[1] + etaValues[2] + etaValues[3] + + etaValues[4] + 
etaValues[5] + etaValues[6] + etaValues[7]); + float CV_DECL_ALIGNED(32) wbuf[4]; + #endif + if( j == 0 ) + { + #ifdef RLOF_SSE + _mm_store_ps(wbuf, mmSumW1); + w1 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumW2); + w2 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumDI); + dI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumI); + sumI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumIx); + sumIx = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumIy); + sumIy = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumW); + sumW = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + #endif + sumIx *= -FLT_SCALE; + sumIy *= -FLT_SCALE; + sumI *=FLT_SCALE; + sumW *= FLT_SCALE; + w1 *= -FLT_SCALE; + w2 *= -FLT_SCALE; + dI *= FLT_SCALE; + + + #ifdef RLOF_SSE + float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];// + + _mm_store_ps(A11buf, mmAxx); + _mm_store_ps(A12buf, mmAxy); + _mm_store_ps(A22buf, mmAyy); + + + A11 = A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; + A12 = A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; + A22 = A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; + #endif + A11 *= FLT_SCALE; // 54866744. + A12 *= FLT_SCALE; // -628764.00 + A22 *= FLT_SCALE; // 19730.000 + + D = - A12*A12*sumI*sumI + dI*sumW*A12*A12 + 2*A12*sumI*sumIx*w2 + 2*A12*sumI*sumIy*w1 + - 2*dI*A12*sumIx*sumIy - 2*sumW*A12*w1*w2 + A11*A22*sumI*sumI - 2*A22*sumI*sumIx*w1 + - 2*A11*sumI*sumIy*w2 - sumIx*sumIx*w2*w2 + A22*dI*sumIx*sumIx + 2*sumIx*sumIy*w1*w2 + - sumIy*sumIy*w1*w1 + A11*dI*sumIy*sumIy + A22*sumW*w1*w1 + A11*sumW*w2*w2 - A11*A22*dI*sumW; + + float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + + 4.f*A12*A12))/(2*winArea); + if( minEig < minEigThreshold || std::abs(D) < FLT_EPSILON ) + { + if( level == 0 && status ) + status[ptidx] = 0; + if( level > 0) + { + nextPts[ptidx] = backUpNextPt; + gainVecs[ptidx] = backUpGain; + } + noIteration++; + break; + } + + + D = (1.f / D); + + invTensorMat(0,0) = (A22*sumI*sumI - 2*sumI*sumIy*w2 + dI*sumIy*sumIy + sumW*w2*w2 - A22*dI*sumW)* D; + invTensorMat(0,1) = (A12*dI*sumW - A12*sumI * sumI - dI*sumIx*sumIy + sumI*sumIx*w2 + sumI*sumIy*w1 - sumW*w1*w2)* D; + invTensorMat(0,2) = (A12*sumI*sumIy - sumIy*sumIy*w1 - A22*sumI*sumIx - A12*sumW*w2 + A22*sumW*w1 + sumIx*sumIy*w2)* D; + invTensorMat(0,3) = (A22*dI*sumIx - A12*dI*sumIy - sumIx*w2*w2 + A12*sumI*w2 - A22*sumI*w1 + sumIy*w1*w2) * D; + invTensorMat(1,0) = invTensorMat(0,1); + invTensorMat(1,1) = (A11*sumI * sumI - 2*sumI*sumIx*w1 + dI*sumIx * sumIx + sumW*w1*w1 - A11*dI*sumW) * D; + invTensorMat(1,2) = (A12*sumI*sumIx - A11*sumI*sumIy - sumIx * sumIx*w2 + A11*sumW*w2 - A12*sumW*w1 + sumIx*sumIy*w1) * D; + invTensorMat(1,3) = (A11*dI*sumIy - sumIy*w1*w1 - A12*dI*sumIx - A11*sumI*w2 + A12*sumI*w1 + sumIx*w1*w2)* D; + invTensorMat(2,0) = invTensorMat(0,2); + invTensorMat(2,1) = invTensorMat(1,2); + invTensorMat(2,2) = (sumW*A12*A12 - 2*A12*sumIx*sumIy + A22*sumIx*sumIx + A11*sumIy*sumIy - A11*A22*sumW)* D; + invTensorMat(2,3) = (A11*A22*sumI - A12*A12*sumI - A11*sumIy*w2 + A12*sumIx*w2 + A12*sumIy*w1 - A22*sumIx*w1)* D; + invTensorMat(3,0) = invTensorMat(0,3); + invTensorMat(3,1) = invTensorMat(1,3); + invTensorMat(3,2) = invTensorMat(2,3); + invTensorMat(3,3) = (dI*A12*A12 - 2*A12*w1*w2 + A22*w1*w1 + A11*w2*w2 - A11*A22*dI)* D; + } + + + #ifdef RLOF_SSE + float CV_DECL_ALIGNED(16) bbuf[4]; + for(int mmi = 0; mmi < 4; mmi++) + { + + _mm_store_ps(bbuf, _mm_add_ps(qb0[mmi], qb1[mmi])); + _b0[mmi] = bbuf[0] + 
bbuf[2]; + _b1[mmi] = bbuf[1] + bbuf[3]; + _mm_store_ps(bbuf, qb2[mmi]); + _b2[mmi] = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; + _mm_store_ps(bbuf, qb3[mmi]); + _b3[mmi] = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; + + } + #endif + + _b0[0] *= FLT_SCALE;_b0[1] *= FLT_SCALE;_b0[2] *= FLT_SCALE;_b0[3] *= FLT_SCALE; + _b1[0] *= FLT_SCALE;_b1[1] *= FLT_SCALE;_b1[2] *= FLT_SCALE;_b1[3] *= FLT_SCALE; + _b2[0] *= FLT_SCALE;_b2[1] *= FLT_SCALE;_b2[2] *= FLT_SCALE;_b2[3] *= FLT_SCALE; + _b3[0] *= FLT_SCALE;_b3[1] *= FLT_SCALE;_b3[2] *= FLT_SCALE;_b3[3] *= FLT_SCALE; + + + Mc0[0] = _b0[0] - _b0[1] - _b0[2] + _b0[3]; + Mc0[1] = _b1[0] - _b1[1] - _b1[2] + _b1[3]; + Mc0[2] = -(_b2[0] - _b2[1] - _b2[2] + _b2[3]); + Mc0[3] = -(_b3[0] - _b3[1] - _b3[2] + _b3[3]); + + Mc1[0] = _b0[1] - _b0[3]; + Mc1[1] = _b1[1] - _b1[3]; + Mc1[2] = -(_b2[1] - _b2[3]); + Mc1[3] = -(_b3[1] - _b3[3]); + + + Mc2[0] = _b0[2] - _b0[3]; + Mc2[1] = _b1[2] - _b1[3]; + Mc2[2] = -(_b2[2] - _b2[3]); + Mc2[3] = -(_b3[2] - _b3[3]); + + + Mc3[0] = _b0[3]; + Mc3[1] = _b1[3]; + Mc3[2] = -_b2[3]; + Mc3[3] = -_b3[3]; + + // + c[0] = -Mc0[0]; + c[1] = -Mc1[0]; + c[2] = -Mc2[0]; + c[3] = -Mc3[0]; + c[4] = -Mc0[1]; + c[5] = -Mc1[1]; + c[6] = -Mc2[1]; + c[7] = -Mc3[1]; + + float e0 = 1.f / (c[6] * c[0] - c[4] * c[2]); + float e1 = e0 * 0.5f * (c[6] * c[1] + c[7] * c[0] - c[5] * c[2] - c[4] * c[3]); + float e2 = e0 * (c[1] * c[7] -c[3] * c[5]); + e0 = e1 * e1 - e2; + hasSolved = false; + if ( e0 > 0) + { + e0 = sqrt(e0); + float _y[2] = {-e1 - e0, e0 - e1}; + float c0yc1[2] = {c[0] * _y[0] + c[1], + c[0] * _y[1] + c[1]}; + float _x[2] = {-(c[2] * _y[0] + c[3]) / c0yc1[0], + -(c[2] * _y[1] + c[3]) / c0yc1[1]}; + bool isIn1 = (_x[0] >=0 && _x[0] <=1 && _y[0] >= 0 && _y[0] <= 1); + bool isIn2 = (_x[1] >=0 && _x[1] <=1 && _y[1] >= 0 && _y[1] <= 1); + + bool isSolution1 = checkSolution(_x[0], _y[0], c ); + bool isSolution2 = checkSolution(_x[1], _y[1], c ); + bool isSink1 = isIn1 && isSolution1; + bool isSink2 = isIn2 && isSolution2; + + if ( isSink1 != isSink2) + { + a = isSink1 ? _x[0] : _x[1]; + b = isSink1 ? 
_y[0] : _y[1]; + ab = a * b; + hasSolved = true; + delta.x = inextPt.x + a - nextPt.x; + delta.y = inextPt.y + b - nextPt.y; + + cv::Vec4f mismatchVec = ab * Mc0 + Mc1 *a + Mc2 * b + Mc3; + deltaGain = est_DeltaGain(invTensorMat, mismatchVec); + + } // isIn1 != isIn2 + } + if( hasSolved == false) + noIteration++; + } + else + { + hasSolved = false; + noReusedIteration++; + } + if( hasSolved == false ) + { + + cv::Vec4f mismatchVec = ab * Mc0 + Mc1 *a + Mc2 * b + Mc3; + est_Result(invTensorMat, mismatchVec, delta, deltaGain); + + delta.x = MAX(-1.f, MIN( 1.f , delta.x)); + delta.y = MAX(-1.f, MIN( 1.f , delta.y)); + + + if( j == 0) + prevGain = deltaGain; + gainVec += deltaGain; + nextPt += delta ; + nextPts[ptidx] = nextPt - halfWin; + gainVecs[ptidx]= gainVec; + + } + else + { + nextPt += delta; + nextPts[ptidx] = nextPt - halfWin; + gainVecs[ptidx]= gainVec + deltaGain; + noSolvedIteration++; + break; + } + + if(j > 0 && ( + (std::abs(delta.x - prevDelta.x) < 0.01 && std::abs(delta.y - prevDelta.y) < 0.01) + || ((delta.ddot(delta) <= 0.001) && std::abs(prevGain.x - deltaGain.x) < 0.01))) + { + nextPts[ptidx] += delta*0.5f; + gainVecs[ptidx] -= deltaGain* 0.5f; + break; + } + + + prevDelta = delta; + prevGain = deltaGain; + } + + } + + } + + const Mat* prevImg; + const Mat* nextImg; + const Mat* prevDeriv; + const Mat* rgbPrevImg; + const Mat* rgbNextImg; + const Point2f* prevPts; + Point2f* nextPts; + uchar* status; + cv::Point2f* gainVecs; // gain vector x -> multiplier y -> offset + float* err; + int maxWinSize; + int minWinSize; + TermCriteria criteria; + int level; + int maxLevel; + int windowType; + float minEigThreshold; + bool useInitialFlow; + const float normSigma0, normSigma1, normSigma2; + int crossSegmentationThreshold; +}; + +}} // namespace +namespace beplk { +namespace ica { + +class TrackerInvoker : public cv::ParallelLoopBody +{ +public: + TrackerInvoker( + const Mat& _prevImg, + const Mat& _prevDeriv, + const Mat& _nextImg, + const Mat& _rgbPrevImg, + const Mat& _rgbNextImg, + const Point2f* _prevPts, + Point2f* _nextPts, + uchar* _status, + float* _err, + int _level, + int _maxLevel, + int _winSize[2], + int _maxIteration, + bool _useInitialFlow, + int _supportRegionType, + int _crossSegmentationThreshold, + float _minEigenValue) + { + prevImg = &_prevImg; + prevDeriv = &_prevDeriv; + nextImg = &_nextImg; + rgbPrevImg = &_rgbPrevImg; + rgbNextImg = &_rgbNextImg; + prevPts = _prevPts; + nextPts = _nextPts; + status = _status; + err = _err; + minWinSize = _winSize[0]; + maxWinSize = _winSize[1]; + criteria.maxCount = _maxIteration; + criteria.epsilon = 0.01; + level = _level; + maxLevel = _maxLevel; + windowType = _supportRegionType; + minEigThreshold = _minEigenValue; + useInitialFlow = _useInitialFlow; + crossSegmentationThreshold = _crossSegmentationThreshold; + } + + void operator()(const cv::Range& range) const CV_OVERRIDE + { + cv::Size winSize; + cv::Point2f halfWin; + + const Mat& I = *prevImg; + const Mat& J = *nextImg; + const Mat& derivI = *prevDeriv; + const Mat& BI = *rgbPrevImg; + + winSize = cv::Size(maxWinSize,maxWinSize); + int winMaskwidth = roundUp(winSize.width, 8) * 2; + cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); + winMaskMatBuf.setTo(1); + + const float FLT_SCALE = (1.f/(1 << 20)); // 20 + + int j, cn = I.channels(), cn2 = cn*2; + int winbufwidth = roundUp(winSize.width, 8); + cv::Size winBufSize(winbufwidth,winbufwidth); + + std::vector _buf(winBufSize.area()*(cn + cn2)); + Mat IWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn), 
&_buf[0]); + Mat derivIWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn2), &_buf[winBufSize.area()*cn]); + + for( int ptidx = range.start; ptidx < range.end; ptidx++ ) + { + Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level)); + Point2f nextPt; + if( level == maxLevel ) + { + if( useInitialFlow ) + nextPt = nextPts[ptidx]*(float)(1./(1 << level)); + else + nextPt = prevPt; + } + else + nextPt = nextPts[ptidx]*2.f; + nextPts[ptidx] = nextPt; + + Point2i iprevPt, inextPt; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + int winArea = maxWinSize * maxWinSize; + cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0,0, maxWinSize,maxWinSize)); + + if( calcWinMaskMat(BI, windowType, iprevPt, + winMaskMat,winSize,halfWin,winArea, + minWinSize,maxWinSize) == false) + continue; + + halfWin = Point2f(static_cast(maxWinSize), static_cast(maxWinSize)) - halfWin; + prevPt += halfWin; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + + if( iprevPt.x < 0 || iprevPt.x >= derivI.cols - winSize.width || + iprevPt.y < 0 || iprevPt.y >= derivI.rows - winSize.height - 1) + { + if( level == 0 ) + { + if( status ) + status[ptidx] = 3; + if( err ) + err[ptidx] = 0; + } + continue; + } + + float a = prevPt.x - iprevPt.x; + float b = prevPt.y - iprevPt.y; + const int W_BITS = 14, W_BITS1 = 14; + + int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + float A11 = 0, A12 = 0, A22 = 0; + +#ifdef RLOF_SSE + __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128i z = _mm_setzero_si128(); + __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1-1)); + __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1-5-1)); + __m128 qA11 = _mm_setzero_ps(), qA12 = _mm_setzero_ps(), qA22 = _mm_setzero_ps(); + __m128i mmMask4_epi32; + get4BitMask(winSize.width, mmMask4_epi32); +#endif + + + int x, y; + for( y = 0; y < winSize.height; y++ ) + { + const uchar* src = I.ptr(y + iprevPt.y, 0) + iprevPt.x*cn; + const uchar* src1 = I.ptr(y + iprevPt.y + 1, 0) + iprevPt.x*cn; + const short* dsrc = derivI.ptr(y + iprevPt.y, 0) + iprevPt.x*cn2; + const short* dsrc1 = derivI.ptr(y + iprevPt.y + 1, 0) + iprevPt.x*cn2; + short* Iptr = IWinBuf.ptr(y, 0); + short* dIptr = derivIWinBuf.ptr(y, 0); + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); + x = 0; +#ifdef RLOF_SSE + for( ; x < winSize.width*cn; x += 4, dsrc += 4*2, dsrc1 += 8,dIptr += 4*2 ) + { + __m128i wMask = _mm_set_epi32(MaskSet * maskPtr[x+3], + MaskSet * maskPtr[x+2], + MaskSet * maskPtr[x+1], + MaskSet * maskPtr[x]); + __m128i v00, v01, v10, v11, t0, t1; + v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); + v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); + v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x)), z); + v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x + cn)), z); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); + _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0,t0)); + + v00 = _mm_loadu_si128((const __m128i*)(dsrc)); + v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); + v10 = _mm_loadu_si128((const __m128i*)(dsrc1)); + v11 = _mm_loadu_si128((const __m128i*)(dsrc1 + cn2)); + + t0 = 
_mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); + v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... + + if( x + 4 > winSize.width) + { + v00 = _mm_and_si128(v00, mmMask4_epi32); + } + v00 = _mm_and_si128(v00, wMask); + + _mm_storeu_si128((__m128i*)dIptr, v00); + t0 = _mm_srai_epi32(v00, 16); // Iy0 Iy1 Iy2 Iy3 + t1 = _mm_srai_epi32(_mm_slli_epi32(v00, 16), 16); // Ix0 Ix1 Ix2 Ix3 + + __m128 fy = _mm_cvtepi32_ps(t0); + __m128 fx = _mm_cvtepi32_ps(t1); + + qA22 = _mm_add_ps(qA22, _mm_mul_ps(fy, fy)); + qA12 = _mm_add_ps(qA12, _mm_mul_ps(fx, fy)); + qA11 = _mm_add_ps(qA11, _mm_mul_ps(fx, fx)); + } +#else + + for( ; x < winSize.width*cn; x++, dsrc += 2, dsrc1 += 2, dIptr += 2) + { + if( maskPtr[x] == 0) + { + dIptr[0] = (short)0; + dIptr[1] = (short)0; + + continue; + } + int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + + src1[x]*iw10 + src1[x+cn]*iw11, W_BITS1-5); + int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + + dsrc1[0]*iw10 + dsrc1[cn2]*iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc1[1]*iw10 + + dsrc1[cn2+1]*iw11, W_BITS1); + + Iptr[x] = (short)ival; + dIptr[0] = (short)ixval; + dIptr[1] = (short)iyval; + A11 += (float)(ixval*ixval); + A12 += (float)(ixval*iyval); + A22 += (float)(iyval*iyval); + + } + + +#endif + + } + +#ifdef RLOF_SSE + float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4]; + _mm_store_ps(A11buf, qA11); + _mm_store_ps(A12buf, qA12); + _mm_store_ps(A22buf, qA22); + A11 += A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; + A12 += A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; + A22 += A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; +#endif + + A11 *= FLT_SCALE; + A12 *= FLT_SCALE; + A22 *= FLT_SCALE; + + + float D = A11*A22 - A12*A12; + float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + + 4.f*A12*A12))/(2 * winArea); + + if( err ) + err[ptidx] = (float)minEig; + + if( D < FLT_EPSILON ) + { + if( level == 0 && status ) + status[ptidx] = 0; + continue; + } + + D = 1.f/D; + + cv::Point2f backUpNextPt = nextPt; + nextPt += halfWin; + Point2f prevDelta(0,0); + + float c[8]; +#ifdef RLOF_SSE + __m128i mmMask0, mmMask1, mmMask; + getWBitMask(winSize.width, mmMask0, mmMask1, mmMask); +#endif + for( j = 0; j < criteria.maxCount; j++ ) + { + cv::Point2f delta; + bool hasSolved = false; + a = nextPt.x - cvFloor(nextPt.x); + b = nextPt.y - cvFloor(nextPt.y); + float ab = a * b; + + + if( (inextPt.x != cvFloor(nextPt.x) || inextPt.y != cvFloor(nextPt.y) || j == 0)) + { + inextPt.x = cvFloor(nextPt.x); + inextPt.y = cvFloor(nextPt.y); + if( inextPt.x < 0 || inextPt.x >= J.cols - winSize.width || + inextPt.y < 0 || inextPt.y >= J.rows - winSize.height - 1) + { + if( level == 0 && status ) + status[ptidx] = 3; + if (level > 0) + nextPts[ptidx] = backUpNextPt; + break; + } + + + iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + + float _b1[4] = {0,0,0,0}; + float _b2[4] = {0,0,0,0}; + #ifdef RLOF_SSE + __m128 qbc0[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; + __m128 qbc1[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; + qw0 
= _mm_set1_epi32(iw00 + (iw01 << 16)); + qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + #endif + for( y = 0; y < winSize.height; y++ ) + { + const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const uchar* Jptr1 = J.ptr(y + inextPt.y + 1, inextPt.x*cn); + const short* Iptr = IWinBuf.ptr(y, 0); + const short* dIptr = derivIWinBuf.ptr(y, 0); + x = 0; + #ifdef RLOF_SSE + + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); + for( ; x <= winSize.width*cn; x += 8, dIptr += 8*2 ) + { + if( maskPtr[x ] == 0 && maskPtr[x+1] == 0 && maskPtr[x+2] == 0 && maskPtr[x+3] == 0 + && maskPtr[x+4] == 0 && maskPtr[x+5] == 0 && maskPtr[x+6] == 0 && maskPtr[x+7] == 0) + continue; + __m128i diff0 = _mm_loadu_si128((const __m128i*)(Iptr + x)); + __m128i v00 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x)), z); + __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); + __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x)), z); + __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x + cn)), z); + + __m128i mmDiffc_epi16[4] = + { _mm_subs_epi16(_mm_slli_epi16(v00, 5), diff0), + _mm_subs_epi16(_mm_slli_epi16(v01, 5), diff0), + _mm_subs_epi16(_mm_slli_epi16(v10, 5), diff0), + _mm_subs_epi16(_mm_slli_epi16(v11, 5), diff0) + }; + + __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... + __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); + + if( x > winSize.width*cn - 8) + { + Ixy_0 = _mm_and_si128(Ixy_0, mmMask0); + Ixy_1 = _mm_and_si128(Ixy_1, mmMask1); + } + + __m128i diffc1[4] = + {_mm_unpackhi_epi16(mmDiffc_epi16[0],mmDiffc_epi16[0]), + _mm_unpackhi_epi16(mmDiffc_epi16[1],mmDiffc_epi16[1]), + _mm_unpackhi_epi16(mmDiffc_epi16[2],mmDiffc_epi16[2]), + _mm_unpackhi_epi16(mmDiffc_epi16[3],mmDiffc_epi16[3]) + }; + + __m128i diffc0[4] = + {_mm_unpacklo_epi16(mmDiffc_epi16[0],mmDiffc_epi16[0]), + _mm_unpacklo_epi16(mmDiffc_epi16[1],mmDiffc_epi16[1]), + _mm_unpacklo_epi16(mmDiffc_epi16[2],mmDiffc_epi16[2]), + _mm_unpacklo_epi16(mmDiffc_epi16[3],mmDiffc_epi16[3]) + }; + + + // It * Ix It * Iy [0 ... 3] + //mask split for multiplication + // for set 1 lo sigma 1 + + + v10 = _mm_mullo_epi16(Ixy_0, diffc0[0]); + v11 = _mm_mulhi_epi16(Ixy_0, diffc0[0]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[0] = _mm_add_ps(qbc0[0], _mm_cvtepi32_ps(v00)); + qbc1[0] = _mm_add_ps(qbc1[0], _mm_cvtepi32_ps(v10)); + + v10 = _mm_mullo_epi16(Ixy_0, diffc0[1]); + v11 = _mm_mulhi_epi16(Ixy_0, diffc0[1]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[1] = _mm_add_ps(qbc0[1], _mm_cvtepi32_ps(v00)); + qbc1[1] = _mm_add_ps(qbc1[1], _mm_cvtepi32_ps(v10)); + + v10 = _mm_mullo_epi16(Ixy_0, diffc0[2]); + v11 = _mm_mulhi_epi16(Ixy_0, diffc0[2]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[2] = _mm_add_ps(qbc0[2], _mm_cvtepi32_ps(v00)); + qbc1[2] = _mm_add_ps(qbc1[2], _mm_cvtepi32_ps(v10)); + + v10 = _mm_mullo_epi16(Ixy_0, diffc0[3]); + v11 = _mm_mulhi_epi16(Ixy_0, diffc0[3]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[3] = _mm_add_ps(qbc0[3], _mm_cvtepi32_ps(v00)); + qbc1[3] = _mm_add_ps(qbc1[3], _mm_cvtepi32_ps(v10)); + // It * Ix It * Iy [4 ... 
7] + // for set 1 hi sigma 1 + + v10 = _mm_mullo_epi16(Ixy_1, diffc1[0]); + v11 = _mm_mulhi_epi16(Ixy_1, diffc1[0]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[0] = _mm_add_ps(qbc0[0], _mm_cvtepi32_ps(v00)); + qbc1[0] = _mm_add_ps(qbc1[0], _mm_cvtepi32_ps(v10)); + + v10 = _mm_mullo_epi16(Ixy_1, diffc1[1]); + v11 = _mm_mulhi_epi16(Ixy_1, diffc1[1]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[1] = _mm_add_ps(qbc0[1], _mm_cvtepi32_ps(v00)); + qbc1[1] = _mm_add_ps(qbc1[1], _mm_cvtepi32_ps(v10)); + + v10 = _mm_mullo_epi16(Ixy_1, diffc1[2]); + v11 = _mm_mulhi_epi16(Ixy_1, diffc1[2]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[2] = _mm_add_ps(qbc0[2], _mm_cvtepi32_ps(v00)); + qbc1[2] = _mm_add_ps(qbc1[2], _mm_cvtepi32_ps(v10)); + + v10 = _mm_mullo_epi16(Ixy_1, diffc1[3]); + v11 = _mm_mulhi_epi16(Ixy_1, diffc1[3]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[3] = _mm_add_ps(qbc0[3], _mm_cvtepi32_ps(v00)); + qbc1[3] = _mm_add_ps(qbc1[3], _mm_cvtepi32_ps(v10)); + + } + #else + for( ; x < winSize.width*cn; x++, dIptr += 2 ) + { + if( dIptr[0] == 0 && dIptr[1] == 0) + continue; + short It[4] = {(Jptr[x] << 5) - Iptr[x], + (Jptr[x+cn]<< 5) - Iptr[x], + (Jptr1[x]<< 5) - Iptr[x], + (Jptr1[x+cn]<< 5) - Iptr[x]}; + + _b1[0] += (float)(It[0]*dIptr[0]); + _b1[1] += (float)(It[1]*dIptr[0]); + _b1[2] += (float)(It[2]*dIptr[0]); + _b1[3] += (float)(It[3]*dIptr[0]); + + _b2[0] += (float)(It[0]*dIptr[1]); + _b2[1] += (float)(It[1]*dIptr[1]); + _b2[2] += (float)(It[2]*dIptr[1]); + _b2[3] += (float)(It[3]*dIptr[1]); + } + #endif + + } + + #ifdef RLOF_SSE + float CV_DECL_ALIGNED(16) bbuf[4]; + _mm_store_ps(bbuf, _mm_add_ps(qbc0[0], qbc1[0])); + _b1[0] += bbuf[0] + bbuf[2]; + _b2[0] += bbuf[1] + bbuf[3]; + + _mm_store_ps(bbuf, _mm_add_ps(qbc0[1], qbc1[1])); + _b1[1] += bbuf[0] + bbuf[2]; + _b2[1] += bbuf[1] + bbuf[3]; + + _mm_store_ps(bbuf, _mm_add_ps(qbc0[2], qbc1[2])); + _b1[2] += bbuf[0] + bbuf[2]; + _b2[2] += bbuf[1] + bbuf[3]; + + _mm_store_ps(bbuf, _mm_add_ps(qbc0[3], qbc1[3])); + _b1[3] += bbuf[0] + bbuf[2]; + _b2[3] += bbuf[1] + bbuf[3]; + + #endif + _b1[0] *= FLT_SCALE; + _b1[1] *= FLT_SCALE; + _b1[2] *= FLT_SCALE; + _b1[3] *= FLT_SCALE; + _b2[0] *= FLT_SCALE; + _b2[1] *= FLT_SCALE; + _b2[2] *= FLT_SCALE; + _b2[3] *= FLT_SCALE; + + float c0[4] = { _b1[3] + _b1[0] - _b1[2] - _b1[1], + _b1[1] - _b1[0], + _b1[2] - _b1[0], + _b1[0]}; + float c1[4] = { _b2[3] + _b2[0] - _b2[2] - _b2[1], + _b2[1] - _b2[0], + _b2[2] - _b2[0], + _b2[0]}; + + float DA12 = A12 * D ; + float DA22 = A22 * D ; + float DA11 = A11 * D ; + c[0] = DA12 * c1[0] - DA22 * c0[0]; + c[1] = DA12 * c1[1] - DA22 * c0[1]; + c[2] = DA12 * c1[2] - DA22 * c0[2]; + c[3] = DA12 * c1[3] - DA22 * c0[3]; + c[4] = DA12 * c0[0] - DA11 * c1[0]; + c[5] = DA12 * c0[1] - DA11 * c1[1]; + c[6] = DA12 * c0[2] - DA11 * c1[2]; + c[7] = DA12 * c0[3] - DA11 * c1[3]; + + float e0 = 1.f / (c[6] * c[0] - c[4] * c[2]); + float e1 = e0 * 0.5f * (c[6] * c[1] + c[7] * c[0] - c[5] * c[2] - c[4] * c[3]); + float e2 = e0 * (c[1] * c[7] -c[3] * c[5]); + e0 = e1 * e1 - e2; + if ( e0 >= 0) + { + e0 = sqrt(e0); + float _y[2] = {-e1 - e0, e0 - e1}; + float c0yc1[2] = {c[0] * _y[0] + c[1], + c[0] * _y[1] + c[1]}; + + float _x[2] = {-(c[2] * _y[0] + c[3]) / c0yc1[0], + -(c[2] * _y[1] + c[3]) / c0yc1[1]}; + + bool isIn1 = (_x[0] >=0 && _x[0] <=1 && _y[0] >= 0 && _y[0] <= 1); + bool isIn2 = (_x[1] >=0 && _x[1] <=1 && _y[1] >= 0 && 
_y[1] <= 1); + + + bool isSink1 = isIn1 && checkSolution(_x[0], _y[0], c ); + bool isSink2 = isIn2 && checkSolution(_x[1], _y[1], c ); + + + //if( isSink1 != isSink2 ) + { + if( isSink1 ) + { + hasSolved = true; + delta.x = inextPt.x + _x[0] - nextPt.x; + delta.y = inextPt.y + _y[0] - nextPt.y; + } + if (isSink2 ) + { + hasSolved = true; + delta.x = inextPt.x + _x[1] - nextPt.x; + delta.y = inextPt.y + _y[1] - nextPt.y; + } + } // isIn1 != isIn2 + } // e0 >= 0 + } + else + hasSolved = false; + if(hasSolved == false) + { + delta = Point2f( c[0] * ab + c[1] * a + c[2] * b + c[3], + c[4] * ab + c[5] * a + c[6] * b + c[7]); + nextPt += 0.7 * delta; + nextPts[ptidx] = nextPt - halfWin; + } + else + { + nextPt += delta; + nextPts[ptidx] = nextPt - halfWin; + break; + } + + + if( delta.ddot(delta) <= criteria.epsilon) + break; + + if(j > 0 && std::abs(delta.x - prevDelta.x) < 0.01 && + std::abs(delta.y - prevDelta.y) < 0.01) + { + nextPts[ptidx] -= delta*0.35f; + break; + } + + prevDelta = delta; + + } + + } + } + + + const Mat* prevImg; + const Mat* nextImg; + const Mat* prevDeriv; + const Mat* rgbPrevImg; + const Mat* rgbNextImg; + const Point2f* prevPts; + Point2f* nextPts; + uchar* status; + float* err; + int maxWinSize; + int minWinSize; + TermCriteria criteria; + int level; + int maxLevel; + int windowType; + float minEigThreshold; + bool useInitialFlow; + int crossSegmentationThreshold; +}; + +}}}} // namespace +#endif diff --git a/modules/optflow/src/rlof/geo_interpolation.cpp b/modules/optflow/src/rlof/geo_interpolation.cpp new file mode 100644 index 00000000000..0ac2e1715da --- /dev/null +++ b/modules/optflow/src/rlof/geo_interpolation.cpp @@ -0,0 +1,453 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
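+//
+// Overview (summarizing the code below): this file provides the geodesic
+// sparse-to-dense interpolation helpers of the RLOF module. getGraph() builds an
+// 8-connected pixel graph whose edge weights combine spatial distance and color
+// difference; sgeo_dist() runs a Dijkstra-style sweep over that graph to obtain
+// geodesic distance maps; and the interpolate_irregular_* functions propagate the
+// flow vector of the geodesically nearest (or k-nearest) seed point to every pixel.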
+// These functions have been contributed by Jonas Geistert
+
+#include "../precomp.hpp"
+
+#include "geo_interpolation.hpp"
+#include <map>
+#include <vector>
+namespace cv {
+namespace optflow {
+
+struct Graph_helper {
+    std::vector<int> mem;
+    int e_size;
+    Graph_helper(int k, int num_nodes) {
+        e_size = (2 * k + 1);
+        mem.resize(e_size * num_nodes, 0);
+    }
+    inline int size(int id) {
+        int r_addr = id * (e_size);
+        return mem[r_addr];
+    }
+    inline int * data(int id) {
+        int r_addr = id * (e_size) + 1;
+        return &mem[r_addr];
+    }
+    inline void add(int id, std::pair<float, int> data) {
+        int r_addr = id * (e_size);
+        int size = ++mem[r_addr];
+        r_addr += 2 * size - 1; //== 1 + 2*(size-1);
+        *(float*)&mem[r_addr] = data.first;
+        mem[r_addr + 1] = data.second;
+    }
+    inline bool color_in_target(int id, int color) {
+        int r_addr = id * (e_size);
+        int size = mem[r_addr];
+        r_addr += 2;
+        for (int i = 0; i < size; i++) {
+            if (mem[r_addr] == color) {
+                return true;
+            }
+            r_addr += 2;
+        }
+        return false;
+    }
+
+};
+
+Mat sgeo_dist(const Mat& gra, int y, int x, float max, Mat &prev)
+{
+    std::vector<Point2f> points;
+    points.push_back(Point2f(static_cast<float>(x), static_cast<float>(y)));
+    return sgeo_dist(gra, points, max, prev);
+}
+Mat sgeo_dist(const Mat& gra, const std::vector<Point2f> & points, float max, Mat &prev)
+{
+    int Dx[] = { -1,0,1,-1,1,-1,0,1 };
+    int Dy[] = { -1,-1,-1,0,0,1,1,1 };
+    Mat dm(gra.rows, gra.cols, CV_32F, Scalar(max));
+    prev = Mat(gra.rows, gra.cols, CV_8U, Scalar(255));
+
+    std::multimap<float, Vec2i> not_visited_with_value;
+
+    for (auto i = points.begin(); i != points.end(); i++)
+    {
+        int y = static_cast<int>(i->y);
+        int x = static_cast<int>(i->x);
+        not_visited_with_value.insert(std::pair<float, Vec2i>(0.f, Vec2i(y, x)));
+        dm.at<float>(y, x) = 0;
+    }
+
+    bool done = false;
+    while (!done)
+    {
+        if (not_visited_with_value.begin() == not_visited_with_value.end()) {
+            done = true;
+            break;
+        }
+        std::multimap<float, Vec2i>::iterator current_it = not_visited_with_value.begin();
+        std::pair<float, Vec2i> current_p = *current_it;
+        not_visited_with_value.erase(current_it);
+
+        int y = current_p.second[0];
+        int x = current_p.second[1];
+        float cur_d = current_p.first;
+
+        if (dm.at<float>(y, x) != cur_d) {
+            continue;
+        }
+
+        Vec8f gra_e = gra.at<Vec8f>(y, x);
+
+        for (int i = 0; i < 8; i++) {
+            if (gra_e[i] < 0) {
+                continue;
+            }
+            int dx = Dx[i];
+            int dy = Dy[i];
+
+            if (dm.at<float>(y + dy, x + dx) > cur_d + gra_e[i]) {
+                dm.at<float>(y + dy, x + dx) = cur_d + gra_e[i];
+                prev.at<uchar>(y + dy, x + dx) = static_cast<uchar>(7 - i);
+                not_visited_with_value.insert(std::pair<float, Vec2i>(cur_d + gra_e[i], Vec2i(y + dy, x + dx)));
+            }
+
+        }
+    }
+
+
+
+
+    return dm;
+}
+
+Mat interpolate_irregular_nn_raster(const std::vector<Point2f> & prevPoints,
+    const std::vector<Point2f> & nextPoints,
+    const std::vector<uchar> & status,
+    const Mat & i1)
+{
+    Mat gra = getGraph(i1, 0.1f);
+    int Dx[] = { -1,0,1,-1,1,-1,0,1 };
+    int Dy[] = { -1,-1,-1,0,0,1,1,1 };
+    int max_rounds = 10;
+    Mat dirt = Mat(gra.rows, gra.cols, CV_8U, Scalar(0));
+    Mat quellknoten = Mat(gra.rows, gra.cols, CV_32S, Scalar(-1));
+    Mat dist = Mat(gra.rows, gra.cols, CV_32F, Scalar(std::numeric_limits<float>::max()));
+    /*
+     * assign quellknoten (source node) ids.
+     */
+    for (int i = 0; i < static_cast<int>(prevPoints.size()); i++)
+    {
+        int x = (int)prevPoints[i].x;
+        int y = (int)prevPoints[i].y;
+        if (status[i] == 0)
+            continue;
+        dirt.at<uchar>(y, x) = 1;
+        dist.at<float>(y, x) = 0;
+        quellknoten.at<int>(y, x) = i;
+    }
+
+    bool clean = true;
+    bool done = false;
+    int x = 0;
+    int y = 0;
+    int rounds = 0;
+    while (!done) {
+        /*
+         * Update x and y:
+         * on even rounds traverse in raster-scan order, on odd rounds in inverse raster-scan order
+         */
+        if (rounds % 2 == 0) {
+            x++;
+            if (x >= gra.cols) {
+                x = 0;
+                y++;
+                if (y >= gra.rows) {
+                    y = 0;
+                    rounds++;
+                    y = gra.rows - 1;
+                    x = gra.cols - 1;
+                    if (rounds >= max_rounds || clean) {
+                        done = true;
+                        break;
+                    }
+                }
+            }
+        }
+        else {
+            x--;
+            if (x < 0) {
+                x = gra.cols - 1;
+                y--;
+                if (y < 0) {
+                    y = gra.rows - 1;
+                    rounds++;
+                    y = 0;
+                    x = 0;
+                    if (rounds >= max_rounds || clean) {
+                        done = true;
+                        break;
+                    }
+                }
+            }
+        }
+        if (dirt.at<uchar>(y, x) == 0) {
+            continue;
+        }
+        dirt.at<uchar>(y, x) = 0;
+
+        float c_dist = dist.at<float>(y, x);
+        Vec8f gra_e = gra.at<Vec8f>(y, x);
+
+        for (int i = 0; i < 8; i++) {
+            int tx = Dx[i];
+            int ty = Dy[i];
+            if (ty == 0 && tx == 0) {
+                continue;
+            }
+            if (x + tx < 0 || x + tx >= gra.cols) {
+                continue;
+            }
+            if (y + ty < 0 || y + ty >= gra.rows) {
+                continue;
+            }
+            if (c_dist > dist.at<float>(y + ty, x + tx)) {
+                if (c_dist > dist.at<float>(y + ty, x + tx) + gra_e[i]) {
+                    quellknoten.at<int>(y, x) = quellknoten.at<int>(y + ty, x + tx);
+                    dist.at<float>(y, x) = dist.at<float>(y + ty, x + tx) + gra_e[i];
+                    dirt.at<uchar>(y, x) = 1;
+                    clean = false;
+                }
+            }
+            else {
+                if (c_dist + gra_e[i] < dist.at<float>(y + ty, x + tx)) {
+                    quellknoten.at<int>(y + ty, x + tx) = quellknoten.at<int>(y, x);
+                    dist.at<float>(y + ty, x + tx) = dist.at<float>(y, x) + gra_e[i];
+                    dirt.at<uchar>(y + ty, x + tx) = 1;
+                    clean = false;
+                }
+            }
+        }
+
+
+
+    }
+
+    Mat nnFlow(i1.rows, i1.cols, CV_32FC2, Scalar(0));
+    for (y = 0; y < i1.rows; y++) {
+        for (x = 0; x < i1.cols; x++) {
+
+            int id = quellknoten.at<int>(y, x);
+            if (id != -1)
+            {
+                nnFlow.at<Point2f>(y, x) = nextPoints[id] - prevPoints[id];
+            }
+        }
+    }
+    return nnFlow;
+}
+
+Mat interpolate_irregular_knn(
+    const std::vector<Point2f> & _prevPoints,
+    const std::vector<Point2f> & _nextPoints,
+    const std::vector<uchar> & status,
+    const Mat &color_img,
+    int k,
+    float pixeldistance)
+{
+    Mat in(color_img.rows, color_img.cols, CV_32FC2);
+    Mat mask = Mat::zeros(color_img.rows, color_img.cols, CV_8UC1);
+
+    for (unsigned n = 0; n < _prevPoints.size(); n++)
+    {
+        if (_prevPoints[n].x >= 0 && _prevPoints[n].y >= 0 && _prevPoints[n].x < color_img.cols && _prevPoints[n].y < color_img.rows)
+        {
+            in.at<Point2f>(_prevPoints[n]) = _nextPoints[n] - _prevPoints[n];
+            mask.at<uchar>(_prevPoints[n]) = status[n];
+        }
+
+    }
+    int Dx[] = { -1,0,1,-1,1,-1,0,1 };
+    int Dy[] = { -1,-1,-1,0,0,1,1,1 };
+    Mat gra = getGraph(color_img, pixeldistance);
+    Mat nnFlow(in.rows, in.cols, CV_32FC2, Scalar(0));
+
+    std::multimap<float, Vec2i> my_agents; // <arrivaltime, <target, color>>
+    Graph_helper graph_helper(k, in.rows*in.cols); //< arrivaltime, color>
+
+
+    int color = 0;
+    std::vector<Vec2i> flow_point_list;
+    for (int y = 0; y < in.rows; y++) {
+        for (int x = 0; x < in.cols; x++) {
+            if (mask.at<uchar>(y, x) > 0) {
+                flow_point_list.push_back(Vec2i(y, x));
+                nnFlow.at<Point2f>(y, x) = in.at<Point2f>(y, x);
+
+                int v_id = (y * in.cols + x);
+                graph_helper.add(v_id, std::pair<float, int>(0.f, color));
+
+
+                Vec8f gra_e = gra.at<Vec8f>(y, x);
+                for (int i = 0; i < 8; i++) {
+                    if (gra_e[i] < 0)
+                        continue;
+                    int dx = Dx[i];
+                    int dy = Dy[i];
+                    int target = (y + dy) * in.cols + (x + dx);
+                    Vec2i agent(target, color);
+                    my_agents.insert(std::pair<float, Vec2i>(gra_e[i], agent));
+
+                }
+                color++;
+            }
+        }
+    }
+
+    int global_time = 0;
+
+    bool done = false;
+    while (!done) {
+        if (my_agents.size() == 0) {
+            done = true;
+            break;
+        }
+        global_time++;
+
+        std::multimap<float, Vec2i>::iterator current_it = my_agents.begin();
+        std::pair<float, Vec2i> current_p = *current_it;
+        my_agents.erase(current_it);
+
+        int target = current_p.second[0];
+        color = current_p.second[1];
+        float arriv_time = current_p.first;
+
+        Vec8f gra_e = gra.at<Vec8f>(target); // (y*cols+x)
+        if (graph_helper.size(target) >= k) {
+            continue;
+        }
+
+        bool color_found_in_target = graph_helper.color_in_target(target, color);
+        if (color_found_in_target) {
+            continue;
+        }
+        graph_helper.add(target, std::pair<float, int>(arriv_time, color));
+
+
+        for (int i = 0; i < 8; i++) {
+            if (gra_e[i] < 0)
+                continue;
+
+            int dx = Dx[i];
+            int dy = Dy[i];
+            int new_target = target + dx + (dy*in.cols);
+            if (graph_helper.size(new_target) >= k) {
+                continue;
+            }
+            color_found_in_target = graph_helper.color_in_target(new_target, color);
+            if (color_found_in_target) {
+                continue;
+            }
+            Vec2i new_agent(new_target, color);
+            my_agents.insert(std::pair<float, Vec2i>(arriv_time + gra_e[i], new_agent));
+
+        }
+    }
+
+    Mat ret(in.rows, in.cols*k, CV_32FC2);
+    for (int y = 0; y < in.rows; y++) {
+        for (int x = 0; x < in.cols; x++) {
+            for (int i = 0; i < k; i++) {
+                float dist = *((float*)(graph_helper.data(y*in.cols + x) + 2 * i));
+                float id = *((float*)(graph_helper.data(y*in.cols + x) + 2 * i + 1));
+                ret.at<Vec2f>(y, k*x + i) = Vec2f(dist, id);
+            }
+        }
+    }
+    return ret;
+}
+
+Mat getGraph(const Mat &image, float edge_length)
+{
+
+    int Dx[] = { -1,0,1,-1,1,-1,0,1 };
+    int Dy[] = { -1,-1,-1,0,0,1,1,1 };
+    Mat gra(image.rows, image.cols, CV_32FC(8));
+
+    for (int y = 0; y < gra.rows; y++) {
+        for (int x = 0; x < gra.cols; x++) {
+
+            for (int i = 0; i < 8; i++) {
+                int dx = Dx[i];
+                int dy = Dy[i];
+                gra.at<Vec8f>(y, x)[i] = -1;
+
+                if (x + dx < 0 || y + dy < 0 || x + dx >= gra.cols || y + dy >= gra.rows) {
+                    continue;
+                }
+
+                if (i < 4) {
+                    int si = 7 - i;
+                    gra.at<Vec8f>(y, x)[i] = gra.at<Vec8f>(y + dy, x + dx)[si];
+                }
+                else {
+                    float p1 = dx * dx*edge_length*edge_length + dy * dy*edge_length*edge_length;
+                    float p2 = static_cast<float>(image.at<Vec3b>(y, x)[0] - image.at<Vec3b>(y + dy, x + dx)[0]);
+                    float p3 = static_cast<float>(image.at<Vec3b>(y, x)[1] - image.at<Vec3b>(y + dy, x + dx)[1]);
+                    float p4 = static_cast<float>(image.at<Vec3b>(y, x)[2] - image.at<Vec3b>(y + dy, x + dx)[2]);
+                    gra.at<Vec8f>(y, x)[i] = sqrt(p1 + p2 * p2 + p3 * p3 + p4 * p4);
+                }
+
+            }
+
+        }
+    }
+
+    return gra;
+}
+
+
+Mat interpolate_irregular_nn(
+    const std::vector<Point2f> & _prevPoints,
+    const std::vector<Point2f> & _nextPoints,
+    const std::vector<uchar> & status,
+    const Mat &color_img,
+    float pixeldistance)
+{
+    int Dx[] = { -1,0,1,-1,1,-1,0,1 };
+    int Dy[] = { -1,-1,-1,0,0,1,1,1 };
+    std::vector<Point2f> prevPoints, nextPoints;
+    std::map<std::pair<float, float>, std::pair<float, float> > flowMap;
+    for (unsigned n = 0; n < _prevPoints.size(); n++)
+    {
+        if (status[n] != 0)
+        {
+            flowMap.insert(std::make_pair(
+                std::make_pair(_prevPoints[n].x, _prevPoints[n].y),
+                std::make_pair(_nextPoints[n].x, _nextPoints[n].y)));
+            prevPoints.push_back(_prevPoints[n]);
+            nextPoints.push_back(_nextPoints[n]);
+        }
+
+    }
+
+    Mat gra = getGraph(color_img, pixeldistance);
+
+    Mat prev;
+    Mat geo_dist = sgeo_dist(gra, prevPoints, std::numeric_limits<float>::max(), prev);
+
+
+    Mat nnFlow = Mat::zeros(color_img.size(), CV_32FC2);
+    for (int y = 0; y < nnFlow.rows; y++)
+    {
+        for (int x = 0; x < nnFlow.cols; x++)
+        {
+            int cx = x;
+            int cy = y;
+            while (prev.at<uchar>(cy, cx) != 255)
+            {
+                int i = prev.at<uchar>(cy, cx);
+                cx += Dx[i];
+                cy += Dy[i];
+            }
+            auto val = flowMap[std::make_pair(static_cast<float>(cx), static_cast<float>(cy))];
+            nnFlow.at<Vec2f>(y, x) = Vec2f(val.first - cx, val.second - cy);
+        }
+    }
+    return nnFlow;
+}
+
+}} // namespace
diff --git a/modules/optflow/src/rlof/geo_interpolation.hpp b/modules/optflow/src/rlof/geo_interpolation.hpp
new file mode 100644
index 00000000000..526768903b5
--- /dev/null
+++ b/modules/optflow/src/rlof/geo_interpolation.hpp
@@ -0,0 +1,35 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+#ifndef _GEO_INTERPOLATION_HPP_
+#define _GEO_INTERPOLATION_HPP_
+
+namespace cv {
+namespace optflow {
+
+typedef Vec<float, 8> Vec8f;
+Mat getGraph(const Mat & image, float edge_length);
+Mat sgeo_dist(const Mat& gra, int y, int x, float max, Mat &prev);
+Mat sgeo_dist(const Mat& gra, const std::vector<Point2f> & points, float max, Mat &prev);
+Mat interpolate_irregular_nw(const Mat &in, const Mat &mask, const Mat &color_img, float max_d, float bandwidth, float pixeldistance);
+Mat interpolate_irregular_nn(
+    const std::vector<Point2f> & prevPoints,
+    const std::vector<Point2f> & nextPoints,
+    const std::vector<uchar> & status,
+    const Mat &color_img,
+    float pixeldistance);
+Mat interpolate_irregular_knn(
+    const std::vector<Point2f> & _prevPoints,
+    const std::vector<Point2f> & _nextPoints,
+    const std::vector<uchar> & status,
+    const Mat &color_img,
+    int k,
+    float pixeldistance);
+
+Mat interpolate_irregular_nn_raster(const std::vector<Point2f> & prevPoints,
+    const std::vector<Point2f> & nextPoints,
+    const std::vector<uchar> & status,
+    const Mat & i1);
+
+}} // namespace
+#endif
diff --git a/modules/optflow/src/rlof/plk_invoker.hpp b/modules/optflow/src/rlof/plk_invoker.hpp
new file mode 100644
index 00000000000..3c0bad6f963
--- /dev/null
+++ b/modules/optflow/src/rlof/plk_invoker.hpp
@@ -0,0 +1,1087 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
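+//
+// Overview (summarizing the code below): the TrackerInvoker classes in this header
+// implement one pyramid level of the gain-adaptive pyramidal Lucas-Kanade refinement.
+// The per-pixel residual follows the linear illumination model of @cite Senst2016,
+//     diff = J(x + d) - I(x) + I(x) * gain.x + gain.y,
+// so each iteration estimates four unknowns (the displacement d and the gain/bias
+// pair) by inverting the 4x4 extended structure tensor (invTensorMat) accumulated
+// from Ix, Iy, I and the support-region sums.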
+#ifndef _PLK_INVOKER_HPP_ +#define _PLK_INVOKER_HPP_ +#include "rlof_invokerbase.hpp" + + +namespace cv { +namespace optflow { +namespace plk { + +// implementierung ohne SSE +namespace radial { + +class TrackerInvoker : public cv::ParallelLoopBody +{ +public: + TrackerInvoker( + + const Mat& _prevImg, + const Mat& _prevDeriv, + const Mat& _nextImg, + const Mat& _rgbPrevImg, + const Mat& _rgbNextImg, + const Point2f* _prevPts, + Point2f* _nextPts, + uchar* _status, + float* _err, + Point2f* _gainVecs, + int _level, + int _maxLevel, + int _winSize[2], + int _maxIteration, + bool _useInitialFlow, + int _supportRegionType, + float _minEigenValue, + int _crossSegmentationThreshold) + { + prevImg = &_prevImg; + prevDeriv = &_prevDeriv; + nextImg = &_nextImg; + rgbPrevImg = &_rgbPrevImg; + rgbNextImg = &_rgbNextImg; + prevPts = _prevPts; + nextPts = _nextPts; + status = _status; + err = _err; + gainVecs = _gainVecs; + minWinSize = _winSize[0]; + maxWinSize = _winSize[1]; + criteria.maxCount = _maxIteration; + criteria.epsilon = 0.01; + level = _level; + maxLevel = _maxLevel; + windowType = _supportRegionType; + minEigThreshold = _minEigenValue; + useInitialFlow = _useInitialFlow; + crossSegmentationThreshold = _crossSegmentationThreshold; + } + + void operator()(const cv::Range& range) const CV_OVERRIDE + { + cv::Size winSize; + cv::Point2f halfWin; + + const Mat& I = *prevImg; + const Mat& J = *nextImg; + const Mat& derivI = *prevDeriv; + const Mat& BI = *rgbPrevImg; + + winSize = cv::Size(maxWinSize,maxWinSize); + int winMaskwidth = roundUp(winSize.width, 16); + cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); + winMaskMatBuf.setTo(1); + const float FLT_SCALE = (1.f/(1 << 16));//(1.f/(1 << 20)); // 20 + + int cn = I.channels(), cn2 = cn*2; + int winbufwidth = roundUp(winSize.width, 16); + cv::Size winBufSize(winbufwidth,winbufwidth); + + cv::Matx44f invTensorMat; + Vec4f mismatchMat; + Vec4f resultMat; + + cv::AutoBuffer _buf(winBufSize.area()*(cn + cn2)); + int derivDepth = DataType::depth; + + Mat IWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn), (deriv_type*)_buf.data()); + Mat derivIWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn2), (deriv_type*)_buf.data() + winBufSize.area()*cn); + + for( int ptidx = range.start; ptidx < range.end; ptidx++ ) + { + Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level)); + Point2f nextPt; + if( level == maxLevel ) + { + if( useInitialFlow ) + { + nextPt = nextPts[ptidx]*(float)(1./(1 << level)); + } + else + nextPt = prevPt; + } + else + nextPt = nextPts[ptidx]*2.f; + nextPts[ptidx] = nextPt; + + Point2i iprevPt, inextPt; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + int winArea = maxWinSize * maxWinSize; + cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0,0, maxWinSize,maxWinSize)); + winMaskMatBuf.setTo(0); + if( calcWinMaskMat(BI, windowType, iprevPt, + winMaskMat,winSize,halfWin,winArea, + minWinSize,maxWinSize) == false) + continue; + halfWin = Point2f(static_cast(maxWinSize) ,static_cast(maxWinSize) ) - halfWin; + prevPt += halfWin; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + + + if( iprevPt.x < 0 || iprevPt.x >= derivI.cols - winSize.width || + iprevPt.y < 0 || iprevPt.y >= derivI.rows - winSize.height - 1) + { + if( level == 0 ) + { + if( status ) + status[ptidx] = 3; + if( err ) + err[ptidx] = 0; + } + continue; + } + + float a = prevPt.x - iprevPt.x; + float b = prevPt.y - iprevPt.y; + const int W_BITS = 14, W_BITS1 = 14; + int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + int iw01 = 
cvRound(a*(1.f - b)*(1 << W_BITS)); + int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + float A11 = 0, A12 = 0, A22 = 0; + // tensor + float sumIx = 0; + float sumIy = 0; + float sumI = 0; + float sumW = 0; + float w1 = 0, w2 = 0; // -IyI + float dI = 0; // I^2 + float D = 0; + #ifdef RLOF_SSE + + __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128i z = _mm_setzero_si128(); + __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1-1)); + __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1-5-1)); + __m128i mmMask4_epi32; + __m128i mmMaskSet_epi16 = _mm_set1_epi16(std::numeric_limits::max()); + get4BitMask(winSize.width, mmMask4_epi32); +#endif + // extract the patch from the first image, compute covariation matrix of derivatives + int x, y; + for( y = 0; y < winSize.height; y++ ) + { + const uchar* src = I.ptr(y + iprevPt.y, 0) + iprevPt.x*cn; + const uchar* src1 = I.ptr(y + iprevPt.y + 1, 0) + iprevPt.x*cn; + const short* dsrc = derivI.ptr(y + iprevPt.y, 0) + iprevPt.x*cn2; + const short* dsrc1 = derivI.ptr(y + iprevPt.y + 1, 0) + iprevPt.x*cn2; + short* Iptr = IWinBuf.ptr(y, 0); + short* dIptr = derivIWinBuf.ptr(y, 0); + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); + x = 0; +#ifdef RLOF_SSE + for( ; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4*2, dsrc1 += 8, dIptr += 4*2 ) + { + __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16); + __m128i mask_0_3_epi16 = _mm_unpacklo_epi16(mask_0_7_epi16, mask_0_7_epi16); + __m128i v00, v01, v10, v11, t0, t1; + v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); + v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); + v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x)), z); + v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x + cn)), z); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); + if( x + 4 > winSize.width) + { + t0 = _mm_and_si128(t0, mmMask4_epi32); + } + t0 = _mm_and_si128(t0, mask_0_3_epi16); + _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0,t0)); + + + v00 = _mm_loadu_si128((const __m128i*)(dsrc)); + v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); + v10 = _mm_loadu_si128((const __m128i*)(dsrc1)); + v11 = _mm_loadu_si128((const __m128i*)(dsrc1 + cn2)); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); + v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... 
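+                // v00 now holds the interleaved Ix/Iy pairs of four pixels. The two
+                // mask steps below zero the lanes beyond winSize.width when the row
+                // length is not a multiple of 4 and then apply the per-pixel
+                // support-region mask, so masked pixels contribute neither to the
+                // gradient matrix nor to the mismatch vector.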
+ if( x + 4 > winSize.width) + { + v00 = _mm_and_si128(v00, mmMask4_epi32); + } + v00 = _mm_and_si128(v00, mask_0_3_epi16); + _mm_storeu_si128((__m128i*)dIptr, v00); + } +#else + for( ; x < winSize.width*cn; x++, dsrc += 2, dsrc1 += 2, dIptr += 2 ) + { + if( maskPtr[x] == 0) + { + dIptr[0] = 0; + dIptr[1] = 0; + continue; + } + int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + + src1[x]*iw10 + src1[x+cn]*iw11, W_BITS1-5); + int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + + dsrc1[0]*iw10 + dsrc1[cn2]*iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc1[1]*iw10 + + dsrc1[cn2+1]*iw11, W_BITS1); + + Iptr[x] = (short)ival; + dIptr[0] = (short)ixval; + dIptr[1] = (short)iyval; + } +#endif + + } + + cv::Point2f backUpNextPt = nextPt; + nextPt += halfWin; + Point2f prevDelta(0,0); //relates to h(t-1) + Point2f prevGain(1,0); + cv::Point2f gainVec = gainVecs[ptidx]; + cv::Point2f backUpGain = gainVec; + int j; + for( j = 0; j < criteria.maxCount; j++ ) + { + status[ptidx] = static_cast(j); + inextPt.x = cvFloor(nextPt.x); + inextPt.y = cvFloor(nextPt.y); + + if( inextPt.x < 0 || inextPt.x >= J.cols - winSize.width || + inextPt.y < 0 || inextPt.y >= J.rows - winSize.height - 1) + { + if( level == 0 && status ) + status[ptidx] = 3; + break; + } + + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + float b1,b2,b3,b4; + b1 = 0; + b2 = 0; + b3 = 0; + b4 = 0; + + if( j == 0 ) + { + // tensor + w1 = 0; // -IxI + w2 = 0; // -IyI + dI = 0; // I^2 + sumIx = 0; + sumIy = 0; + sumI = 0; + sumW = 0; + A11 = 0; + A12 = 0; + A22 = 0; + } +#ifdef RLOF_SSE + qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps(), qb2 = _mm_setzero_ps(); + __m128 qb3 = _mm_setzero_ps(); + __m128 mmSumW1 = _mm_setzero_ps(), mmSumW2 = _mm_setzero_ps(); + __m128 mmSumI = _mm_setzero_ps(), mmSumW = _mm_setzero_ps(), mmSumDI = _mm_setzero_ps(); + __m128 mmSumIy = _mm_setzero_ps(), mmSumIx = _mm_setzero_ps(); + __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps(); + float gainVal = gainVec.x > 0 ? gainVec.x : -gainVec.x; + int bitShift = gainVec.x == 0 ? 1 : cvCeil(log(200.f / gainVal) / log(2.f)); + __m128i mmGainValue_epi16 = _mm_set1_epi16(static_cast(gainVec.x * (float)(1 << bitShift))); + __m128i mmConstValue_epi16 = _mm_set1_epi16(static_cast(gainVec.y)); +#endif + for( y = 0; y < winSize.height; y++ ) + { + const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const uchar* Jptr1 = J.ptr(y + inextPt.y + 1, inextPt.x*cn); + const short* Iptr = IWinBuf.ptr(y, 0); + const short* dIptr = derivIWinBuf.ptr(y, 0); + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); + x = 0; +#ifdef RLOF_SSE + + for( ; x <= winSize.width*cn; x += 8, dIptr += 8*2 ) + { + // iw = iw << 14 (16384 short / 2) + // iq0 = iw01 |iw00, iw01 |iw00, iw01 |iw00, iw01 |iw00 16 bit + __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16); + + // I [0...8] + __m128i diff0, diff1; + __m128i I_0_7_epi16 = _mm_loadu_si128((const __m128i*)(Iptr + x)); // element 0 to 7 + + __m128i v00 = _mm_unpacklo_epi8( + _mm_loadl_epi64((const __m128i*)(Jptr + x)) // J0, J1, J2, ..., J7, 64 bit nullen + , z); //J0 , 0, J1, 0, J2, 0 ... 
J7,0 + __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); + __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x)), z); + __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x + cn)), z); + + + __m128i t0 = _mm_add_epi32 + (_mm_madd_epi16( + _mm_unpacklo_epi16(v00, v01), //J0, , J1, J1, J2, J2, ... J3 , J4 + qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + + __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1-5); + + + __m128i Ilo = _mm_mullo_epi16(mmGainValue_epi16, I_0_7_epi16); + __m128i Ihi = _mm_mulhi_epi16(mmGainValue_epi16, I_0_7_epi16); + + __m128i Igain_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(Ilo, Ihi), bitShift); + __m128i Igain_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(Ilo, Ihi), bitShift); + __m128i Igain_epi16 = _mm_packs_epi32(Igain_0_3_epi32, Igain_4_7_epi32); + + // diff = J - I + I*gain.x + gain.y + __m128i mmDiff_epi16 = _mm_add_epi16( + _mm_subs_epi16(_mm_packs_epi32(t0, t1), I_0_7_epi16), // J - I + _mm_add_epi16(Igain_epi16, mmConstValue_epi16)); + + + mmDiff_epi16 = _mm_and_si128(mmDiff_epi16, mask_0_7_epi16); + + + __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... + __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); + + diff1 = _mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13... + diff0 = _mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9... + + v10 = _mm_mullo_epi16(Ixy_0, diff0); + v11 = _mm_mulhi_epi16(Ixy_0, diff0); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); + qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); + + v10 = _mm_mullo_epi16(Ixy_1, diff1); + v11 = _mm_mulhi_epi16(Ixy_1, diff1); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); + qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); + // for set 1 sigma 1 + // b3 += diff * Iptr[x] + Ilo = _mm_mullo_epi16(mmDiff_epi16, I_0_7_epi16); + Ihi = _mm_mulhi_epi16(mmDiff_epi16, I_0_7_epi16); + v00 = _mm_unpacklo_epi16(Ilo, Ihi); + v10 = _mm_unpackhi_epi16(Ilo, Ihi); + qb2 = _mm_add_ps(qb2, _mm_cvtepi32_ps(v00)); + qb2 = _mm_add_ps(qb2, _mm_cvtepi32_ps(v10)); + + // b4+= diff + __m128 mmDiff_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mmDiff_epi16)); // (_mm_srai_epi32(_mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16),16)); + __m128 mmDiff_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16),16))); + qb3 = _mm_add_ps(qb3, mmDiff_0_3_ps); + qb3 = _mm_add_ps(qb3, mmDiff_4_7_ps); + if( j == 0 ) + { + __m128 mask_0_4_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mask_0_7_epi16)); + __m128 mask_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mask_0_7_epi16, mask_0_7_epi16), 16))); + + t0 = _mm_srai_epi32(Ixy_0, 16); // Iy0 Iy1 Iy2 Iy3 + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16); // Ix0 Ix1 Ix2 Ix3 + + __m128 fy = _mm_blendv_ps(_mm_set1_ps(0), _mm_cvtepi32_ps(t0), mask_0_4_ps); + __m128 fx = _mm_blendv_ps(_mm_set1_ps(0), _mm_cvtepi32_ps(t1), mask_0_4_ps); + + // 0 ... 
3 + __m128 I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(I_0_7_epi16, I_0_7_epi16), 16)); + + // A11 - A22 + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fy)); + mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fy)); + mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fx)); + + // sumIx und sumIy + mmSumIx = _mm_add_ps(mmSumIx, fx); + mmSumIy = _mm_add_ps(mmSumIy, fy); + + mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fx)); + mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fy)); + + // sumI + I_ps = _mm_blendv_ps(_mm_set1_ps(0), I_ps, mask_0_4_ps); + mmSumI = _mm_add_ps(mmSumI,I_ps); + + // sumDI + mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps( I_ps, I_ps)); + + t0 = _mm_srai_epi32(Ixy_1, 16); // Iy8 Iy9 Iy10 Iy11 + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_1, 16), 16); // Ix0 Ix1 Ix2 Ix3 + + fy = _mm_blendv_ps(_mm_set1_ps(0), _mm_cvtepi32_ps(t0), mask_4_7_ps); + fx = _mm_blendv_ps(_mm_set1_ps(0), _mm_cvtepi32_ps(t1), mask_4_7_ps); + + // 4 ... 7 + I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(I_0_7_epi16, I_0_7_epi16), 16)); + + + + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fy)); + mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fy)); + mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fx)); + + // sumIx und sumIy + mmSumIx = _mm_add_ps(mmSumIx, fx); + mmSumIy = _mm_add_ps(mmSumIy, fy); + + mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fx)); + mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fy)); + + // sumI + I_ps = _mm_blendv_ps(_mm_set1_ps(0), I_ps, mask_4_7_ps); + mmSumI = _mm_add_ps(mmSumI, I_ps); + + // sumW + // sumDI + mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps( I_ps, I_ps)); + } + + } +#else + for( ; x < winSize.width*cn; x++, dIptr += 2 ) + { + if( maskPtr[x] == 0) + continue; + int J_val = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + + Jptr1[x]*iw10 + Jptr1[x+cn]*iw11, + W_BITS1-5); + int diff = static_cast(J_val - Iptr[x] + Iptr[x] * gainVec.x + gainVec.y); + + b1 += (float)(diff*dIptr[0]) * FLT_RESCALE; + b2 += (float)(diff*dIptr[1]) * FLT_RESCALE; + b3 += (float)(diff) * Iptr[x] * FLT_RESCALE; + b4 += (float)(diff); + + // compute the Gradient Matrice + if( j == 0 ) + { + A11 += (float)(dIptr[0]*dIptr[0]); + A12 += (float)(dIptr[0]*dIptr[1]); + A22 += (float)(dIptr[1]*dIptr[1]); + + + dI += Iptr[x] * Iptr[x] * FLT_RESCALE; + float dx = static_cast(dIptr[0]) * FLT_RESCALE; + float dy = static_cast(dIptr[1]) * FLT_RESCALE; + sumIx += dx; + sumIy += dy; + w1 += dx * Iptr[x]; + w2 += dy * Iptr[x]; + sumI += Iptr[x] * FLT_RESCALE; + //sumW += FLT_RESCALE; + + } + } +#endif + } +#ifdef RLOF_SSE + float CV_DECL_ALIGNED(16) wbuf[4]; +#endif + if( j == 0 ) + { +#ifdef RLOF_SSE + _mm_store_ps(wbuf, mmSumW1); + w1 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumW2); + w2 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumDI); + dI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumI); + sumI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumIx); + sumIx = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumIy); + sumIy = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumW); + sumW = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; +#endif + sumIx *= -FLT_SCALE; + sumIy *= -FLT_SCALE; + sumI *=FLT_SCALE; + sumW = winArea * FLT_SCALE; + w1 *= -FLT_SCALE; + w2 *= -FLT_SCALE; + dI *= FLT_SCALE; +#ifdef RLOF_SSE + float CV_DECL_ALIGNED(32) A11buf[4], A12buf[4], A22buf[4];// + + _mm_store_ps(A11buf, mmAxx); + _mm_store_ps(A12buf, mmAxy); + _mm_store_ps(A22buf, mmAyy); + + A11 = A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; + A12 = 
A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; + A22 = A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; +#endif + A11 *= FLT_SCALE; + A12 *= FLT_SCALE; + A22 *= FLT_SCALE; + } + +#ifdef RLOF_SSE + float CV_DECL_ALIGNED(16) bbuf[4]; + _mm_store_ps(bbuf, _mm_add_ps(qb0, qb1)); + b1 = bbuf[0] + bbuf[2]; + b2 = bbuf[1] + bbuf[3]; + _mm_store_ps(bbuf, qb2); + b3 = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; + _mm_store_ps(bbuf, qb3); + b4 = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; +#endif + + mismatchMat(0) = b1 * FLT_SCALE; + mismatchMat(1) = b2 * FLT_SCALE; + mismatchMat(2) = -b3 * FLT_SCALE; + mismatchMat(3) = -b4 * FLT_SCALE; + + D = - A12*A12*sumI*sumI + dI*sumW*A12*A12 + 2*A12*sumI*sumIx*w2 + 2*A12*sumI*sumIy*w1 + - 2*dI*A12*sumIx*sumIy - 2*sumW*A12*w1*w2 + A11*A22*sumI*sumI - 2*A22*sumI*sumIx*w1 + - 2*A11*sumI*sumIy*w2 - sumIx*sumIx*w2*w2 + A22*dI*sumIx*sumIx + 2*sumIx*sumIy*w1*w2 + - sumIy*sumIy*w1*w1 + A11*dI*sumIy*sumIy + A22*sumW*w1*w1 + A11*sumW*w2*w2 - A11*A22*dI*sumW; + + + float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + + 4.f*A12*A12))/(2*winArea); + if( minEig < minEigThreshold ) + { + if( (level == 0 && status) || std::abs(D) < FLT_EPSILON) + status[ptidx] = 0; + if( level > 0) + { + nextPts[ptidx] = backUpNextPt; + gainVecs[ptidx] = backUpGain; + } + break; + } + + + + D = (1.f / D); + invTensorMat(0,0) = (A22*sumI*sumI - 2*sumI*sumIy*w2 + dI*sumIy*sumIy + sumW*w2*w2 - A22*dI*sumW)* D; + invTensorMat(0,1) = (A12*dI*sumW - A12*sumI * sumI - dI*sumIx*sumIy + sumI*sumIx*w2 + sumI*sumIy*w1 - sumW*w1*w2)* D; + invTensorMat(0,2) = (A12*sumI*sumIy - sumIy*sumIy*w1 - A22*sumI*sumIx - A12*sumW*w2 + A22*sumW*w1 + sumIx*sumIy*w2)* D; + invTensorMat(0,3) = (A22*dI*sumIx - A12*dI*sumIy - sumIx*w2*w2 + A12*sumI*w2 - A22*sumI*w1 + sumIy*w1*w2) * D; + invTensorMat(1,0) = invTensorMat(0,1); + invTensorMat(1,1) = (A11*sumI * sumI - 2*sumI*sumIx*w1 + dI*sumIx * sumIx + sumW*w1*w1 - A11*dI*sumW) * D; + invTensorMat(1,2) = (A12*sumI*sumIx - A11*sumI*sumIy - sumIx * sumIx*w2 + A11*sumW*w2 - A12*sumW*w1 + sumIx*sumIy*w1) * D; + invTensorMat(1,3) = (A11*dI*sumIy - sumIy*w1*w1 - A12*dI*sumIx - A11*sumI*w2 + A12*sumI*w1 + sumIx*w1*w2)* D; + invTensorMat(2,0) = invTensorMat(0,2); + invTensorMat(2,1) = invTensorMat(1,2); + invTensorMat(2,2) = (sumW*A12*A12 - 2*A12*sumIx*sumIy + A22*sumIx*sumIx + A11*sumIy*sumIy - A11*A22*sumW)* D; + invTensorMat(2,3) = (A11*A22*sumI - A12*A12*sumI - A11*sumIy*w2 + A12*sumIx*w2 + A12*sumIy*w1 - A22*sumIx*w1)* D; + invTensorMat(3,0) = invTensorMat(0,3); + invTensorMat(3,1) = invTensorMat(1,3); + invTensorMat(3,2) = invTensorMat(2,3); + invTensorMat(3,3) = (dI*A12*A12 - 2*A12*w1*w2 + A22*w1*w1 + A11*w2*w2 - A11*A22*dI)* D; + + resultMat = invTensorMat * mismatchMat; + + // 0.08 -12.10 + Point2f delta(-resultMat(0), -resultMat(1)); + Point2f deltaGain(-resultMat(2), -resultMat(3)); + + + if( j == 0) + prevGain = deltaGain; + nextPt += delta; + nextPts[ptidx] = nextPt - halfWin; + gainVecs[ptidx]= gainVec; + if( delta.ddot(delta) <= criteria.epsilon) + break; + if (( + std::abs(delta.x - prevDelta.x) < 0.01 && + std::abs(delta.y - prevDelta.y) < 0.01 + ) || ( + delta.ddot(delta) <= 0.001 && + std::abs(prevGain.x - deltaGain.x) < 0.01 + )) + { + nextPts[ptidx] -= delta*0.5f; + gainVecs[ptidx] -= deltaGain* 0.5f; + break; + } + + if(j > 0 && std::abs(delta.x - prevDelta.x) < 0.01 && + std::abs(delta.y - prevDelta.y) < 0.01) + { + nextPts[ptidx] -= delta*0.5f; + break; + } + + prevDelta = delta; + prevGain = deltaGain; + + } + + + } + + } + + const Mat* prevImg; + const Mat* 
nextImg; + const Mat* prevDeriv; + const Mat* rgbPrevImg; + const Mat* rgbNextImg; + const Point2f* prevPts; + Point2f* nextPts; + uchar* status; + cv::Point2f* gainVecs; // gain vector x -> multiplier y -> offset + float* err; + int maxWinSize; + int minWinSize; + TermCriteria criteria; + int level; + int maxLevel; + int windowType; + float minEigThreshold; + bool useInitialFlow; + int crossSegmentationThreshold; +}; + +} // namespace +namespace ica { + +class TrackerInvoker : public cv::ParallelLoopBody +{ +public: + TrackerInvoker( + const Mat& _prevImg, + const Mat& _prevDeriv, + const Mat& _nextImg, + const Mat& _rgbPrevImg, + const Mat& _rgbNextImg, + const Point2f* _prevPts, + Point2f* _nextPts, + uchar* _status, + float* _err, + int _level, + int _maxLevel, + int _winSize[2], + int _maxIteration, + bool _useInitialFlow, + int _supportRegionType, + int _crossSegmentationThreshold, + float _minEigenValue) + { + prevImg = &_prevImg; + prevDeriv = &_prevDeriv; + nextImg = &_nextImg; + rgbPrevImg = &_rgbPrevImg; + rgbNextImg = &_rgbNextImg; + prevPts = _prevPts; + nextPts = _nextPts; + status = _status; + err = _err; + minWinSize = _winSize[0]; + maxWinSize = _winSize[1]; + criteria.maxCount = _maxIteration; + criteria.epsilon = 0.01; + level = _level; + maxLevel = _maxLevel; + windowType = _supportRegionType; + minEigThreshold = _minEigenValue; + useInitialFlow = _useInitialFlow; + crossSegmentationThreshold = _crossSegmentationThreshold; + } + + void operator()(const cv::Range& range) const CV_OVERRIDE + { + cv::Size winSize; + cv::Point2f halfWin; + + const Mat& I = *prevImg; + const Mat& J = *nextImg; + const Mat& derivI = *prevDeriv; + const Mat& BI = *rgbPrevImg; + + winSize = cv::Size(maxWinSize,maxWinSize); + int winMaskwidth = roundUp(winSize.width, 8) * 2; + cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); + winMaskMatBuf.setTo(1); + + + + const float FLT_SCALE = (1.f/(1 << 20)); // 20 + + int j, cn = I.channels(), cn2 = cn*2; + int winbufwidth = roundUp(winSize.width, 8); + cv::Size winBufSize(winbufwidth,winbufwidth); + + std::vector _buf(winBufSize.area()*(cn + cn2)); + Mat IWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn), &_buf[0]); + Mat derivIWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn2), &_buf[winBufSize.area()*cn]); + + for( int ptidx = range.start; ptidx < range.end; ptidx++ ) + { + Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level)); + Point2f nextPt; + if( level == maxLevel ) + { + if( useInitialFlow ) + nextPt = nextPts[ptidx]*(float)(1./(1 << level)); + else + nextPt = prevPt; + } + else + nextPt = nextPts[ptidx]*2.f; + nextPts[ptidx] = nextPt; + + Point2i iprevPt, inextPt; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + int winArea = maxWinSize * maxWinSize; + cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0,0, maxWinSize,maxWinSize)); + + if( calcWinMaskMat(BI, windowType, iprevPt, + winMaskMat,winSize,halfWin,winArea, + minWinSize,maxWinSize) == false) + continue; + halfWin = Point2f(static_cast(maxWinSize), static_cast(maxWinSize)) - halfWin; + prevPt += halfWin; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + + if( iprevPt.x < 0 || iprevPt.x >= derivI.cols - winSize.width || + iprevPt.y < 0 || iprevPt.y >= derivI.rows - winSize.height - 1) + { + if( level == 0 ) + { + if( status ) + status[ptidx] = 3; + if( err ) + err[ptidx] = 0; + } + continue; + } + + float a = prevPt.x - iprevPt.x; + float b = prevPt.y - iprevPt.y; + const int W_BITS = 14, W_BITS1 = 14; + + int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << 
W_BITS)); + int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + float A11 = 0, A12 = 0, A22 = 0; + +#ifdef RLOF_SSE + __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128i z = _mm_setzero_si128(); + __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1-1)); + __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1-5-1)); + __m128 qA11 = _mm_setzero_ps(), qA12 = _mm_setzero_ps(), qA22 = _mm_setzero_ps(); + __m128i mmMask4_epi32; + get4BitMask(winSize.width, mmMask4_epi32); +#endif + + // extract the patch from the first image, compute covariation matrix of derivatives + int x, y; + for( y = 0; y < winSize.height; y++ ) + { + const uchar* src = I.ptr(y + iprevPt.y, 0) + iprevPt.x*cn; + const uchar* src1 = I.ptr(y + iprevPt.y + 1, 0) + iprevPt.x*cn; + const short* dsrc = derivI.ptr(y + iprevPt.y, 0) + iprevPt.x*cn2; + const short* dsrc1 = derivI.ptr(y + iprevPt.y + 1, 0) + iprevPt.x*cn2; + short* Iptr = IWinBuf.ptr(y, 0); + short* dIptr = derivIWinBuf.ptr(y, 0); + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); + x = 0; +#ifdef RLOF_SSE + for( ; x <= winSize.width*cn; x += 4, dsrc += 4*2, dsrc1 += 8, dIptr += 4*2 ) + { + __m128i wMask = _mm_set_epi32(MaskSet * maskPtr[x+3], + MaskSet * maskPtr[x+2], + MaskSet * maskPtr[x+1], + MaskSet * maskPtr[x]); + __m128i v00, v01, v10, v11, t0, t1; + v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); + v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); + v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x)), z); + v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x + cn)), z); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + + + + + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); + _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0,t0)); + + + + v00 = _mm_loadu_si128((const __m128i*)(dsrc)); + v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); + v10 = _mm_loadu_si128((const __m128i*)(dsrc1)); + v11 = _mm_loadu_si128((const __m128i*)(dsrc1 + cn2)); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); + v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... 
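+                    // The packed result holds the interpolated derivatives interleaved as
+                    // (Ix0, Iy0, Ix1, Iy1, ...). A scalar sketch of the de-interleave used
+                    // just below (arithmetic shifts preserve the sign of the 16 bit words):
+                    //   int iy = v00 >> 16;            // high word: Iy
+                    //   int ix = (v00 << 16) >> 16;    // sign-extended low word: Ix
+                    // The masking that follows zeroes the lanes past the window border and
+                    // outside the support region, so they do not contribute to the Gram
+                    // matrix entries A11, A12 and A22.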
+ + if( x + 4 > winSize.width) + { + v00 = _mm_and_si128(v00, mmMask4_epi32); + } + v00 = _mm_and_si128(v00, wMask); + + _mm_storeu_si128((__m128i*)dIptr, v00); + t0 = _mm_srai_epi32(v00, 16); // Iy0 Iy1 Iy2 Iy3 + t1 = _mm_srai_epi32(_mm_slli_epi32(v00, 16), 16); // Ix0 Ix1 Ix2 Ix3 + + __m128 fy = _mm_cvtepi32_ps(t0); + __m128 fx = _mm_cvtepi32_ps(t1); + + + qA22 = _mm_add_ps(qA22, _mm_mul_ps(fy, fy)); + qA12 = _mm_add_ps(qA12, _mm_mul_ps(fx, fy)); + qA11 = _mm_add_ps(qA11, _mm_mul_ps(fx, fx)); + } +#else + + for( ; x < winSize.width*cn; x++, dsrc += 2, dsrc1 += 2, dIptr += 2 ) + { + if( maskPtr[x] == 0) + { + dIptr[0] = 0; + dIptr[1] = 0; + continue; + } + int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + + src1[x]*iw10 + src1[x+cn]*iw11, W_BITS1-5); + int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + + dsrc1[0]*iw10 + dsrc1[cn2]*iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc1[1]*iw10 + + dsrc1[cn2+1]*iw11, W_BITS1); + + Iptr[x] = (short)ival; + dIptr[0] = (short)ixval; + dIptr[1] = (short)iyval; + A11 += (float)(ixval*ixval); + A12 += (float)(ixval*iyval); + A22 += (float)(iyval*iyval); + } +#endif + } + +#ifdef RLOF_SSE + float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4]; + _mm_store_ps(A11buf, qA11); + _mm_store_ps(A12buf, qA12); + _mm_store_ps(A22buf, qA22); + A11 += A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; + A12 += A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; + A22 += A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; +#endif + + A11 *= FLT_SCALE; + A12 *= FLT_SCALE; + A22 *= FLT_SCALE; + + float D = A11*A22 - A12*A12; + float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + + 4.f*A12*A12))/(2 * winArea); + + if( err ) + err[ptidx] = (float)minEig; + + if( minEig < minEigThreshold || D < FLT_EPSILON ) + { + if( level == 0 && status ) + status[ptidx] = 0; + continue; + } + + D = 1.f/D; + + nextPt += halfWin; + Point2f prevDelta(0,0); //relates to h(t-1) +#ifdef RLOF_SSE + __m128i mmMask0, mmMask1, mmMask; + getWBitMask(winSize.width, mmMask0, mmMask1, mmMask); +#endif + for( j = 0; j < criteria.maxCount; j++ ) + { + status[ptidx] = static_cast(j); + inextPt.x = cvFloor(nextPt.x); + inextPt.y = cvFloor(nextPt.y); + + if( inextPt.x < 0 || inextPt.x >= J.cols - winSize.width || + inextPt.y < 0 || inextPt.y >= J.rows - winSize.height - 1) + { + if( level == 0 && status ) + status[ptidx] = 3; + break; + } + + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + float b1 = 0, b2 = 0; +#ifdef RLOF_SSE + qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps(); + + +#endif + for( y = 0; y < winSize.height; y++ ) + { + const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const uchar* Jptr1 = J.ptr(y + inextPt.y + 1, inextPt.x*cn); + const short* Iptr = IWinBuf.ptr(y, 0); + const short* dIptr = derivIWinBuf.ptr(y, 0); + + x = 0; +#ifdef RLOF_SSE + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); + for( ; x <= winSize.width*cn; x += 8, dIptr += 8*2 ) + { + if( maskPtr[x ] == 0 && maskPtr[x+1] == 0 && maskPtr[x+2] == 0 && maskPtr[x+3] == 0 + && maskPtr[x+4] == 0 && maskPtr[x+5] == 0 && maskPtr[x+6] == 0 && maskPtr[x+7] == 0) + continue; + __m128i diff0 = _mm_loadu_si128((const __m128i*)(Iptr + x)), diff1; + __m128i v00 = _mm_unpacklo_epi8(_mm_loadl_epi64((const 
__m128i*)(Jptr + x)), z); + __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); + __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x)), z); + __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x + cn)), z); + + __m128i t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1-5); + + __m128i mmDiff_epi16 = _mm_subs_epi16(_mm_packs_epi32(t0, t1), diff0); + + __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... + __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); + + if( x > winSize.width*cn - 8) + { + Ixy_0 = _mm_and_si128(Ixy_0, mmMask0); + Ixy_1 = _mm_and_si128(Ixy_1, mmMask1); + } + + + diff1 = _mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13... + diff0 = _mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9... + + + v10 = _mm_mullo_epi16(Ixy_0, diff0); + v11 = _mm_mulhi_epi16(Ixy_0, diff0); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); + qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); + + // It * Ix It * Iy [4 ... 7] + // for set 1 hi sigma 1 + + v10 = _mm_mullo_epi16(Ixy_1, diff1); + v11 = _mm_mulhi_epi16(Ixy_1, diff1); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); + qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); + } +#else + for( ; x < winSize.width*cn; x++, dIptr += 2 ) + { + if( dIptr[0] == 0 && dIptr[1] == 0) + continue; + int diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + + Jptr1[x]*iw10 + Jptr1[x+cn]*iw11, + W_BITS1-5) - Iptr[x]; + + b1 += (float)(diff*dIptr[0]) * FLT_RESCALE; + b2 += (float)(diff*dIptr[1]) * FLT_RESCALE; + } +#endif + } + +#ifdef RLOF_SSE + float CV_DECL_ALIGNED(16) bbuf[4]; + _mm_store_ps(bbuf, _mm_add_ps(qb0, qb1)); + b1 += bbuf[0] + bbuf[2]; + b2 += bbuf[1] + bbuf[3]; +#endif + + b1 *= FLT_SCALE; + b2 *= FLT_SCALE; + + Point2f delta( (float)((A12*b2 - A22*b1) * D), + (float)((A12*b1 - A11*b2) * D)); + + + + nextPt += delta; + nextPts[ptidx] = nextPt - halfWin; + + if( delta.ddot(delta) <= criteria.epsilon) + break; + if(j > 0 && std::abs(delta.x - prevDelta.x) < 0.01 && + std::abs(delta.y - prevDelta.y) < 0.01) + { + nextPts[ptidx] -= delta*0.5f; + break; + } + + prevDelta = delta; + + } + + } + + } + + const Mat* prevImg; + const Mat* nextImg; + const Mat* prevDeriv; + const Mat* rgbPrevImg; + const Mat* rgbNextImg; + const Point2f* prevPts; + Point2f* nextPts; + uchar* status; + float* err; + int maxWinSize; + int minWinSize; + TermCriteria criteria; + int level; + int maxLevel; + int windowType; + float minEigThreshold; + bool useInitialFlow; + int crossSegmentationThreshold; +}; + +}}}} // namespace +#endif diff --git a/modules/optflow/src/rlof/rlof_invoker.hpp b/modules/optflow/src/rlof/rlof_invoker.hpp new file mode 100644 index 00000000000..8dd0ecccd75 --- /dev/null +++ b/modules/optflow/src/rlof/rlof_invoker.hpp @@ -0,0 +1,1423 @@ +// This file is part of OpenCV project. 
+// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +#ifndef _RLOF_INVOKER_HPP_ +#define _RLOF_INVOKER_HPP_ +#include "rlof_invokerbase.hpp" +namespace cv { +namespace optflow { +namespace rlof { +namespace ica { + +class TrackerInvoker : public cv::ParallelLoopBody +{ +public: + TrackerInvoker( + const Mat& _prevImg, + const Mat& _prevDeriv, + const Mat& _nextImg, + const Mat& _rgbPrevImg, + const Mat& _rgbNextImg, + const Point2f* _prevPts, + Point2f* _nextPts, + uchar* _status, + float* _err, + int _level, + int _maxLevel, + int _winSize[2], + int _maxIteration, + bool _useInitialFlow, + int _supportRegionType, + const std::vector& _normSigmas, + float _minEigenValue, + int _crossSegmentationThreshold + ) : + normSigma0(_normSigmas[0]), + normSigma1(_normSigmas[1]), + normSigma2(_normSigmas[2]) + { + prevImg = &_prevImg; + prevDeriv = &_prevDeriv; + nextImg = &_nextImg; + rgbPrevImg = &_rgbPrevImg; + rgbNextImg = &_rgbNextImg; + prevPts = _prevPts; + nextPts = _nextPts; + status = _status; + err = _err; + minWinSize = _winSize[0]; + maxWinSize = _winSize[1]; + criteria.maxCount = _maxIteration; + criteria.epsilon = 0.01; + level = _level; + maxLevel = _maxLevel; + windowType = _supportRegionType; + minEigThreshold = _minEigenValue; + useInitialFlow = _useInitialFlow; + crossSegmentationThreshold = _crossSegmentationThreshold; + } + + void operator()(const cv::Range& range) const CV_OVERRIDE + { + cv::Size winSize; + cv::Point2f halfWin; + + const Mat& I = *prevImg; + const Mat& J = *nextImg; + const Mat& derivI = *prevDeriv; + const Mat& BI = *rgbPrevImg; + + winSize = cv::Size(maxWinSize, maxWinSize); + int winMaskwidth = roundUp(winSize.width, 8) * 2; + cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); + winMaskMatBuf.setTo(1); + + const float FLT_SCALE = (1.f / (1 << 20)); // 20 + + int cn = I.channels(), cn2 = cn * 2; + int winbufwidth = roundUp(winSize.width, 8); + cv::Size winBufSize(winbufwidth, winbufwidth); + + std::vector _buf(winBufSize.area()*(cn + cn2)); + Mat IWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn), &_buf[0]); + Mat derivIWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn2), &_buf[winBufSize.area()*cn]); + + for (int ptidx = range.start; ptidx < range.end; ptidx++) + { + Point2f prevPt = prevPts[ptidx] * (float)(1. / (1 << level)); + Point2f nextPt; + if (level == maxLevel) + { + if ( useInitialFlow) + nextPt = nextPts[ptidx] * (float)(1. 
/ (1 << level)); + else + nextPt = prevPt; + } + else + nextPt = nextPts[ptidx] * 2.f; + nextPts[ptidx] = nextPt; + + Point2i iprevPt, inextPt; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + int winArea = maxWinSize * maxWinSize; + cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0, 0, maxWinSize, maxWinSize)); + + if (calcWinMaskMat(BI, windowType, iprevPt, + winMaskMat, winSize, halfWin, winArea, + minWinSize, maxWinSize) == false) + continue; + halfWin = Point2f(static_cast(maxWinSize) ,static_cast(maxWinSize) ) - halfWin; + prevPt += halfWin; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + if( iprevPt.x < 0 || iprevPt.x >= derivI.cols - winSize.width || + iprevPt.y < 0 || iprevPt.y >= derivI.rows - winSize.height - 1) + { + if (level == 0) + { + if (status) + status[ptidx] = 3; + if (err) + err[ptidx] = 0; + } + continue; + } + + float a = prevPt.x - iprevPt.x; + float b = prevPt.y - iprevPt.y; + const int W_BITS = 14, W_BITS1 = 14; + + int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + float A11 = 0, A12 = 0, A22 = 0; + float b1 = 0, b2 = 0; + float D = 0; + float minEig; + +#ifdef RLOF_SSE + __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128i z = _mm_setzero_si128(); + __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1 - 1)); + __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1 - 5 - 1)); + __m128i mmMask4_epi32; + __m128i mmMaskSet_epi16 = _mm_set1_epi16(std::numeric_limits::max()); + get4BitMask(winSize.width, mmMask4_epi32); +#endif + + // extract the patch from the first image, compute covariation matrix of derivatives + int x, y; + for (y = 0; y < winSize.height; y++) + { + const uchar* src = I.ptr(y + iprevPt.y, 0) + iprevPt.x*cn; + const uchar* src1 = I.ptr(y + iprevPt.y + 1, 0) + iprevPt.x*cn; + const short* dsrc = derivI.ptr(y + iprevPt.y, 0) + iprevPt.x*cn2; + const short* dsrc1 = derivI.ptr(y + iprevPt.y + 1, 0) + iprevPt.x*cn2; + short* Iptr = IWinBuf.ptr(y, 0); + short* dIptr = derivIWinBuf.ptr(y, 0); + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); + x = 0; +#ifdef RLOF_SSE + + for (; x <= winSize.width*cn; x += 4, dsrc += 4 * 2, dsrc1 += 8, dIptr += 4 * 2) + { + __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16); + __m128i mask_0_3_epi16 = _mm_unpacklo_epi16(mask_0_7_epi16, mask_0_7_epi16); + + __m128i v00, v01, v10, v11, t0, t1; + v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); + v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); + v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x)), z); + v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x + cn)), z); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1 - 5); + if (x + 4 > winSize.width) + { + t0 = _mm_and_si128(t0, mmMask4_epi32); + } + t0 = _mm_and_si128(t0, mask_0_3_epi16); + _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0, t0)); + + v00 = _mm_loadu_si128((const __m128i*)(dsrc)); + v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); + v10 = _mm_loadu_si128((const __m128i*)(dsrc1)); + v11 = _mm_loadu_si128((const __m128i*)(dsrc1 + cn2)); + + t0 = 
_mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); + v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... + if (x + 4 > winSize.width) + { + v00 = _mm_and_si128(v00, mmMask4_epi32); + } + v00 = _mm_and_si128(v00, mask_0_3_epi16); + _mm_storeu_si128((__m128i*)dIptr, v00); + } +#else + + for (; x < winSize.width*cn; x++, dsrc += 2, dsrc1 += 2, dIptr += 2) + { + if (maskPtr[x] == 0) + { + dIptr[0] = 0; + dIptr[1] = 0; + continue; + } + int ival = CV_DESCALE(src[x] * iw00 + src[x + cn] * iw01 + + src1[x] * iw10 + src1[x + cn] * iw11, W_BITS1 - 5); + int ixval = CV_DESCALE(dsrc[0] * iw00 + dsrc[cn2] * iw01 + + dsrc1[0] * iw10 + dsrc1[cn2] * iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1] * iw00 + dsrc[cn2 + 1] * iw01 + dsrc1[1] * iw10 + + dsrc1[cn2 + 1] * iw11, W_BITS1); + + Iptr[x] = (short)ival; + dIptr[0] = (short)ixval; + dIptr[1] = (short)iyval; + } +#endif + } + + cv::Mat residualMat = cv::Mat::zeros(winSize.height * (winSize.width + 8) * cn, 1, CV_16SC1); + + cv::Point2f backUpNextPt = nextPt; + nextPt += halfWin; + int j; +#ifdef RLOF_SSE + __m128i mmMask0, mmMask1, mmMask; + getWBitMask(winSize.width, mmMask0, mmMask1, mmMask); +#endif + float MEstimatorScale = 1; + int buffIdx = 0; + cv::Point2f prevDelta(0, 0); + + for (j = 0; j < criteria.maxCount; j++) + { + inextPt.x = cvFloor(nextPt.x); + inextPt.y = cvFloor(nextPt.y); + + if( inextPt.x < 0 || inextPt.x >= J.cols - winSize.width || + inextPt.y < 0 || inextPt.y >= J.rows - winSize.height - 1) + { + if (level == 0 && status) + status[ptidx] = 3; + if (level > 0) + nextPts[ptidx] = backUpNextPt; + break; + } + + + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + b1 = 0; + b2 = 0; + + if (j == 0 ) + { + A11 = 0; + A12 = 0; + A22 = 0; + } + + if (j == 0 ) + { + buffIdx = 0; + for (y = 0; y < winSize.height; y++) + { + const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const uchar* Jptr1 = J.ptr(y + inextPt.y + 1, inextPt.x*cn); + const short* Iptr = IWinBuf.ptr(y, 0); + const short* dIptr = derivIWinBuf.ptr(y, 0); + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); + x = 0; + for (; x < winSize.width*cn; x++, dIptr += 2) + { + if (maskPtr[x] == 0) + continue; + int diff = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + Jptr1[x] * iw10 + Jptr1[x + cn] * iw11, W_BITS1 - 5) - Iptr[x]; + residualMat.at(buffIdx++) = static_cast(diff); + } + } + /*! Estimation for the residual */ + cv::Mat residualMatRoi(residualMat, cv::Rect(0, 0, 1, buffIdx)); + MEstimatorScale = (buffIdx == 0) ? 
1.f : estimateScale(residualMatRoi);
+                }
+
+                float eta = 1.f / winArea;
+                float fParam0 = normSigma0 * MEstimatorScale;
+                float fParam1 = normSigma1 * MEstimatorScale;
+#ifdef RLOF_SSE
+                qw0 = _mm_set1_epi32(iw00 + (iw01 << 16));
+                qw1 = _mm_set1_epi32(iw10 + (iw11 << 16));
+                __m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps();
+                __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps();
+                __m128i mmParam0 = _mm_set1_epi16(MIN(std::numeric_limits<short>::max() - 1, static_cast<short>(fParam0)));
+                __m128i mmParam1 = _mm_set1_epi16(MIN(std::numeric_limits<short>::max() - 1, static_cast<short>(fParam1)));
+                float s2Val = std::fabs(normSigma2);
+                int s2bitShift = normSigma2 == 0 ? 1 : cvCeil(log(200.f / s2Val) / log(2.f));
+                __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast<short>(normSigma2 * (float)(1 << s2bitShift)));
+                __m128i mmOness_epi16 = _mm_set1_epi16(1 << s2bitShift);
+                __m128 mmParam2s = _mm_set1_ps(0.01f * normSigma2);
+                __m128 mmParam2s2 = _mm_set1_ps(normSigma2 * normSigma2);
+                __m128 mmOnes = _mm_set1_ps(1.f);
+                __m128i mmEta = _mm_setzero_si128();
+                __m128i mmScale = _mm_set1_epi16(static_cast<short>(MEstimatorScale));
+
+#endif
+
+                buffIdx = 0;
+                for (y = 0; y < winSize.height; y++)
+                {
+                    const uchar* Jptr = J.ptr<uchar>(y + inextPt.y, inextPt.x*cn);
+                    const uchar* Jptr1 = J.ptr<uchar>(y + inextPt.y + 1, inextPt.x*cn);
+                    const short* Iptr = IWinBuf.ptr<short>(y, 0);
+                    const short* dIptr = derivIWinBuf.ptr<short>(y, 0);
+                    const tMaskType* maskPtr = winMaskMat.ptr<tMaskType>(y, 0);
+                    x = 0;
+#ifdef RLOF_SSE
+                    for (; x <= winSize.width*cn; x += 8, dIptr += 8 * 2)
+                    {
+                        __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16);
+                        __m128i diff0, diff1;
+                        __m128i I_0_7_epi16 = _mm_loadu_si128((const __m128i*)(Iptr + x)); // elements 0 to 7
+
+                        __m128i v00 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x)), z);
+                        __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z);
+                        __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x)), z);
+                        __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x + cn)), z);
+
+                        __m128i t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0),
+                            _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1));
+                        __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0),
+                            _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1));
+                        t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1 - 5);
+                        t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1 - 5);
+
+                        __m128i mmDiff_epi16 = _mm_subs_epi16(_mm_packs_epi32(t0, t1), I_0_7_epi16);
+
+                        mmDiff_epi16 = _mm_and_si128(mmDiff_epi16, mask_0_7_epi16);
+
+                        __m128i scalediffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, mmScale);
+                        mmEta = _mm_add_epi16(mmEta, _mm_add_epi16(_mm_and_si128(scalediffIsPos_epi16, _mm_set1_epi16(2)), _mm_set1_epi16(-1)));
+
+                        __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ...
+                        __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8));
+
+                        __m128i abs_epi16 = _mm_abs_epi16(mmDiff_epi16);
+                        __m128i bSet2_epi16, bSet1_epi16;
+                        // |It| < sigma1 ?
+                        bSet2_epi16 = _mm_cmplt_epi16(abs_epi16, mmParam1);
+                        // It > 0 ?
+                        __m128i diffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, _mm_setzero_si128());
+                        // sigma0 < |It| < sigma1 ?
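+                        // (bSet1 below marks exactly this middle band.) Together the two
+                        // masks realize the shrunk Hampel norm of @cite Senst2012 on the
+                        // residual It, with thresholds rescaled by the current M-estimator
+                        // scale: fParam0 = normSigma0 * MEstimatorScale and fParam1 =
+                        // normSigma1 * MEstimatorScale. A scalar sketch mirroring the
+                        // non-SSE branch of this file:
+                        //   if      (|It| > fParam1)             It = 0;                           // reject outliers
+                        //   else if (|It| > fParam0 && It >= 0)  It = normSigma2 * (It - fParam1); // redescending part
+                        //   else if (|It| > fParam0 && It <  0)  It = normSigma2 * (It + fParam1);
+                        //   else                                 It is kept unchanged;            // least-squares region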
+ bSet1_epi16 = _mm_and_si128(bSet2_epi16, _mm_cmpgt_epi16(abs_epi16, mmParam0)); + // val = |It| -/+ sigma1 + __m128i tmpParam1_epi16 = _mm_add_epi16(_mm_and_si128(diffIsPos_epi16, _mm_sub_epi16(mmDiff_epi16, mmParam1)), + _mm_andnot_si128(diffIsPos_epi16, _mm_add_epi16(mmDiff_epi16, mmParam1))); + // It == 0 ? |It| > sigma13 + mmDiff_epi16 = _mm_and_si128(bSet2_epi16, mmDiff_epi16); + // It == val ? sigma0 < |It| < sigma1 + mmDiff_epi16 = _mm_blendv_epi8(mmDiff_epi16, tmpParam1_epi16, bSet1_epi16); + + + __m128i tale_epi16_ = _mm_blendv_epi8(mmOness_epi16, mmParam2_epi16, bSet1_epi16); // mask for 0 - 3 + // diff = diff * sigma2 + __m128i lo = _mm_mullo_epi16(tale_epi16_, mmDiff_epi16); + __m128i hi = _mm_mulhi_epi16(tale_epi16_, mmDiff_epi16); + __m128i diff_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), s2bitShift); //diff 0_3_epi32 + __m128i diff_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), s2bitShift); // diff 4_7_epi32 + + mmDiff_epi16 = _mm_packs_epi32(diff_0_3_epi32, diff_4_7_epi32); + diff1 = _mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13... + diff0 = _mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9... + + // Ix * diff / Iy * diff + v10 = _mm_mullo_epi16(Ixy_0, diff0); + v11 = _mm_mulhi_epi16(Ixy_0, diff0); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); + qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); + + // It * Ix It * Iy [4 ... 7] + // for set 1 hi sigma 1 + + v10 = _mm_mullo_epi16(Ixy_1, diff1); + v11 = _mm_mulhi_epi16(Ixy_1, diff1); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); + qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); + + + if (j == 0) + { + __m128 bSet1_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet1_epi16)); + __m128 bSet1_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet1_epi16, bSet1_epi16), 16)); + __m128 mask_0_4_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mask_0_7_epi16)); + __m128 mask_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mask_0_7_epi16, mask_0_7_epi16), 16))); + + __m128 bSet2_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet2_epi16)); + __m128 bSet2_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet2_epi16, bSet2_epi16), 16)); + + __m128 tale_0_3_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_0_3_ps); + __m128 tale_4_7_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_4_7_ps); + tale_0_3_ps = _mm_blendv_ps(mmParam2s, tale_0_3_ps, bSet2_0_3_ps); + tale_4_7_ps = _mm_blendv_ps(mmParam2s, tale_4_7_ps, bSet2_4_7_ps); + + tale_0_3_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_0_3_ps, mask_0_4_ps); + tale_4_7_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_4_7_ps, mask_4_7_ps); + + t0 = _mm_srai_epi32(Ixy_0, 16); // Iy0 Iy1 Iy2 Iy3 + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16); // Ix0 Ix1 Ix2 Ix3 + + __m128 fy = _mm_cvtepi32_ps(t0); + __m128 fx = _mm_cvtepi32_ps(t1); + + // A11 - A22 + __m128 fxtale = _mm_mul_ps(fx, tale_0_3_ps); + __m128 fytale = _mm_mul_ps(fy, tale_0_3_ps); + + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); + mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); + mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); + + t0 = _mm_srai_epi32(Ixy_1, 16); // Iy8 Iy9 Iy10 Iy11 + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_1, 16), 16); // Ix0 Ix1 Ix2 Ix3 + + fy = _mm_cvtepi32_ps(t0); + fx = _mm_cvtepi32_ps(t1); + + // A11 - A22 + fxtale = _mm_mul_ps(fx, 
tale_4_7_ps); + fytale = _mm_mul_ps(fy, tale_4_7_ps); + + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); + mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); + mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); + } + } +#else + for (; x < winSize.width*cn; x++, dIptr += 2) + { + if (maskPtr[x] == 0) + continue; + int diff = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + + Jptr1[x] * iw10 + Jptr1[x + cn] * iw11, + W_BITS1 - 5) - Iptr[x]; + + + + + + if (diff > MEstimatorScale) + MEstimatorScale += eta; + if (diff < MEstimatorScale) + MEstimatorScale -= eta; + + int abss = (diff < 0) ? -diff : diff; + if (abss > fParam1) + { + diff = 0; + } + else if (abss > fParam0 && diff >= 0) + { + //diff = fParam1 * (diff - fParam1); + diff = static_cast(normSigma2 * (diff - fParam1)); + } + else if (abss > fParam0 && diff < 0) + { + //diff = fParam1 * (diff + fParam1); + diff = static_cast(normSigma2 * (diff + fParam1)); + + } + + float ixval = (float)(dIptr[0]); + float iyval = (float)(dIptr[1]); + b1 += (float)(diff*ixval); + b2 += (float)(diff*iyval); + + if ( j == 0) + { + float tale = normSigma2 * FLT_RESCALE; + if (abss < fParam0) + { + tale = FLT_RESCALE; + } + else if (abss > fParam1) + { + tale *= 0.01f; + } + else + { + tale *= normSigma2; + } + A11 += (float)(ixval*ixval*tale); + A12 += (float)(ixval*iyval*tale); + A22 += (float)(iyval*iyval*tale); + } + + + } +#endif + } + +#ifdef RLOF_SSE + short etaValues[8]; + _mm_storeu_si128((__m128i*)(etaValues), mmEta); + MEstimatorScale += eta * (etaValues[0] + etaValues[1] + etaValues[2] + etaValues[3] + + etaValues[4] + etaValues[5] + etaValues[6] + etaValues[7]); + + +#endif + if (j == 0) + { +#ifdef RLOF_SSE + float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];// + + _mm_store_ps(A11buf, mmAxx); + _mm_store_ps(A12buf, mmAxy); + _mm_store_ps(A22buf, mmAyy); + + + A11 = A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; + A12 = A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; + A22 = A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; +#endif + A11 *= FLT_SCALE; + A12 *= FLT_SCALE; + A22 *= FLT_SCALE; + + + D = A11 * A22 - A12 * A12; + minEig = (A22 + A11 - std::sqrt((A11 - A22)*(A11 - A22) + + 4.f*A12*A12)) / (2 * winArea); + D = 1.f / D; + + if (minEig < minEigThreshold || std::abs(D) < FLT_EPSILON) + { + if (level == 0 && status) + status[ptidx] = 0; + if (level > 0) + { + nextPts[ptidx] = backUpNextPt; + } + break; + } + } + +#ifdef RLOF_SSE + float CV_DECL_ALIGNED(16) bbuf[4]; + _mm_store_ps(bbuf, _mm_add_ps(qb0, qb1)); + b1 += bbuf[0] + bbuf[2]; + b2 += bbuf[1] + bbuf[3]; +#endif + + b1 *= FLT_SCALE; + b2 *= FLT_SCALE; + + + Point2f delta((float)((A12*b2 - A22 * b1) * D), (float)((A12*b1 - A11 * b2) * D)); + + delta.x = (delta.x != delta.x) ? 0 : delta.x; + delta.y = (delta.y != delta.y) ? 
0 : delta.y; + + nextPt += delta * 0.7; + nextPts[ptidx] = nextPt - halfWin; + + if (j > 0 && std::abs(delta.x - prevDelta.x) < 0.01 && + std::abs(delta.y - prevDelta.y) < 0.01) + { + nextPts[ptidx] -= delta * 0.5f; + break; + } + + prevDelta = delta; + + } + + } + + } + + const Mat* prevImg; + const Mat* nextImg; + const Mat* prevDeriv; + const Mat* rgbPrevImg; + const Mat* rgbNextImg; + const Point2f* prevPts; + Point2f* nextPts; + uchar* status; + float* err; + int maxWinSize; + int minWinSize; + TermCriteria criteria; + int level; + int maxLevel; + int windowType; + float minEigThreshold; + bool useInitialFlow; + const float normSigma0, normSigma1, normSigma2; + int crossSegmentationThreshold; +}; + +} // namespace +namespace radial { + +class TrackerInvoker : public cv::ParallelLoopBody +{ +public: + TrackerInvoker( + const Mat& _prevImg, + const Mat& _prevDeriv, + const Mat& _nextImg, + const Mat& _rgbPrevImg, + const Mat& _rgbNextImg, + const Point2f* _prevPts, + Point2f* _nextPts, + uchar* _status, + float* _err, + Point2f* _gainVecs, + int _level, + int _maxLevel, + int _winSize[2], + int _maxIteration, + bool _useInitialFlow, + int _supportRegionType, + const std::vector& _normSigmas, + float _minEigenValue, + int _crossSegmentationThreshold + ) : + normSigma0(_normSigmas[0]), + normSigma1(_normSigmas[1]), + normSigma2(_normSigmas[2]) + { + prevImg = &_prevImg; + prevDeriv = &_prevDeriv; + nextImg = &_nextImg; + rgbPrevImg = &_rgbPrevImg; + rgbNextImg = &_rgbNextImg; + prevPts = _prevPts; + nextPts = _nextPts; + status = _status; + err = _err; + gainVecs = _gainVecs; + minWinSize = _winSize[0]; + maxWinSize = _winSize[1]; + criteria.maxCount = _maxIteration; + criteria.epsilon = 0.01; + level = _level; + maxLevel = _maxLevel; + windowType = _supportRegionType; + minEigThreshold = _minEigenValue; + useInitialFlow = _useInitialFlow; + crossSegmentationThreshold = _crossSegmentationThreshold; + } + + void operator()(const cv::Range& range) const CV_OVERRIDE + { +#ifdef DEBUG_INVOKER + printf("rlof::radial");fflush(stdout); +#endif + Point2f halfWin; + cv::Size winSize; + const Mat& I = *prevImg; + const Mat& J = *nextImg; + const Mat& derivI = *prevDeriv; + const Mat& BI = *rgbPrevImg; + + + const float FLT_SCALE = (1.f / (1 << 16)); + + winSize = cv::Size(maxWinSize, maxWinSize); + int winMaskwidth = roundUp(winSize.width, 16); + cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); + winMaskMatBuf.setTo(1); + + int cn = I.channels(), cn2 = cn * 2; + int winbufwidth = roundUp(winSize.width, 16); + cv::Size winBufSize(winbufwidth, winbufwidth); + + cv::Matx44f invTensorMat; + cv::Vec4f mismatchMat; + cv::Vec4f resultMat; + + + std::vector _buf(winBufSize.area()*(cn + cn2)); + Mat IWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn), &_buf[0]); + Mat derivIWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn2), &_buf[winBufSize.area()*cn]); + + for (int ptidx = range.start; ptidx < range.end; ptidx++) + { + + Point2f prevPt = prevPts[ptidx] * (float)(1. / (1 << level)); + Point2f nextPt; + if (level == maxLevel) + { + if (useInitialFlow) + { + nextPt = nextPts[ptidx] * (float)(1. 
/ (1 << level)); + } + else + nextPt = prevPt; + } + else + { + nextPt = nextPts[ptidx] * 2.f; + + } + nextPts[ptidx] = nextPt; + + Point2i iprevPt, inextPt; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + int winArea = maxWinSize * maxWinSize; + cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0, 0, maxWinSize, maxWinSize)); + winMaskMatBuf.setTo(0); + if (calcWinMaskMat(BI, windowType, iprevPt, + winMaskMat, winSize, halfWin, winArea, + minWinSize, maxWinSize) == false) + continue; + halfWin = Point2f(static_cast(maxWinSize) ,static_cast(maxWinSize) ) - halfWin; + prevPt += halfWin; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + if( iprevPt.x < 0 || iprevPt.x >= derivI.cols - winSize.width || + iprevPt.y < 0 || iprevPt.y >= derivI.rows - winSize.height - 1) + { + if (level == 0) + { + if (status) + status[ptidx] = 3; + if (err) + err[ptidx] = 0; + } + continue; + } + + float a = prevPt.x - iprevPt.x; + float b = prevPt.y - iprevPt.y; + const int W_BITS = 14, W_BITS1 = 14; + + int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + + float A11 = 0, A12 = 0, A22 = 0; + float b1 = 0, b2 = 0, b3 = 0, b4 = 0; + // tensor + float sumIx = 0; + float sumIy = 0; + float sumI = 0; + float sumW = 0; + float w1 = 0, w2 = 0; // -IyI + float dI = 0; // I^2 + float D = 0; + +#ifdef RLOF_SSE + __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128i z = _mm_setzero_si128(); + __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1 - 1)); + __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1 - 5 - 1)); + __m128i mmMask4_epi32; + __m128i mmMaskSet_epi16 = _mm_set1_epi16(std::numeric_limits::max()); + get4BitMask(winSize.width, mmMask4_epi32); +#endif + + // extract the patch from the first image, compute covariation matrix of derivatives + int x, y; + for (y = 0; y < winSize.height; y++) + { + const uchar* src = I.ptr(y + iprevPt.y, 0) + iprevPt.x*cn; + const uchar* src1 = I.ptr(y + iprevPt.y + 1, 0) + iprevPt.x*cn; + const short* dsrc = derivI.ptr(y + iprevPt.y, 0) + iprevPt.x*cn2; + const short* dsrc1 = derivI.ptr(y + iprevPt.y + 1, 0) + iprevPt.x*cn2; + short* Iptr = IWinBuf.ptr(y, 0); + short* dIptr = derivIWinBuf.ptr(y, 0); + x = 0; + +#ifdef RLOF_SSE + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); + for (; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4 * 2, dsrc1 += 8, dIptr += 4 * 2) + { + __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16); + __m128i mask_0_3_epi16 = _mm_unpacklo_epi16(mask_0_7_epi16, mask_0_7_epi16); + __m128i v00, v01, v10, v11, t0, t1; + v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); + v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); + v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x)), z); + v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x + cn)), z); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1 - 5); + if (x + 4 > winSize.width) + { + t0 = _mm_and_si128(t0, mmMask4_epi32); + } + t0 = _mm_and_si128(t0, mask_0_3_epi16); + _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0, t0)); + + + v00 = _mm_loadu_si128((const __m128i*)(dsrc)); + v01 = 
_mm_loadu_si128((const __m128i*)(dsrc + cn2)); + v10 = _mm_loadu_si128((const __m128i*)(dsrc1)); + v11 = _mm_loadu_si128((const __m128i*)(dsrc1 + cn2)); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); + v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... + if (x + 4 > winSize.width) + { + v00 = _mm_and_si128(v00, mmMask4_epi32); + } + v00 = _mm_and_si128(v00, mask_0_3_epi16); + _mm_storeu_si128((__m128i*)dIptr, v00); + } +#else + + for (; x < winSize.width*cn; x++, dsrc += 2, dsrc1 += 2, dIptr += 2) + { + if (winMaskMat.at(y, x) == 0) + { + dIptr[0] = 0; + dIptr[1] = 0; + continue; + } + int ival = CV_DESCALE(src[x] * iw00 + src[x + cn] * iw01 + + src1[x] * iw10 + src1[x + cn] * iw11, W_BITS1 - 5); + int ixval = CV_DESCALE(dsrc[0] * iw00 + dsrc[cn2] * iw01 + + dsrc1[0] * iw10 + dsrc1[ cn2] * iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1] * iw00 + dsrc[cn2 + 1] * iw01 + dsrc1[1] * iw10 + + dsrc1[cn2 + 1] * iw11, W_BITS1); + + Iptr[x] = (short)ival; + dIptr[0] = (short)ixval; + dIptr[1] = (short)iyval; + + } +#endif + } + + cv::Mat residualMat = cv::Mat::zeros(winSize.height * (winSize.width + 8) * cn, 1, CV_16SC1); + + cv::Point2f backUpNextPt = nextPt; + nextPt += halfWin; + Point2f prevDelta(0, 0); //related to h(t-1) + Point2f prevGain(0, 0); + cv::Point2f gainVec = gainVecs[ptidx]; + cv::Point2f backUpGain = gainVec; + cv::Size _winSize = winSize; + int j; +#ifdef RLOF_SSE + __m128i mmMask0, mmMask1, mmMask; + getWBitMask(_winSize.width, mmMask0, mmMask1, mmMask); + __m128 mmOnes = _mm_set1_ps(1.f); +#endif + float MEstimatorScale = 1; + int buffIdx = 0; + float minEigValue; + + for (j = 0; j < criteria.maxCount; j++) + { + + inextPt.x = cvFloor(nextPt.x); + inextPt.y = cvFloor(nextPt.y); + + if( inextPt.x < 0 || inextPt.x >= J.cols - winSize.width || + inextPt.y < 0 || inextPt.y >= J.rows - winSize.height - 1) + { + if (level == 0 && status) + status[ptidx] = 3; + if (level > 0) + { + nextPts[ptidx] = backUpNextPt; + gainVecs[ptidx] = backUpGain; + } + break; + } + + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + + // mismatch + b1 = 0; + b2 = 0; + b3 = 0; + b4 = 0; + if (j == 0) + { + // tensor + w1 = 0; // -IxI + w2 = 0; // -IyI + dI = 0; // I^2 + sumIx = 0; + sumIy = 0; + sumI = 0; + sumW = 0; + A11 = 0; + A12 = 0; + A22 = 0; + } + + if (j == 0 ) + { + buffIdx = 0; + for (y = 0; y < winSize.height; y++) + { + const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const uchar* Jptr1 = J.ptr(y + inextPt.y + 1, inextPt.x*cn); + const short* Iptr = IWinBuf.ptr(y, 0); + const short* dIptr = derivIWinBuf.ptr(y, 0); + x = 0; + for (; x < winSize.width*cn; x++, dIptr += 2) + { + if (dIptr[0] == 0 && dIptr[1] == 0) + continue; + int diff = static_cast(CV_DESCALE( Jptr[x] * iw00 + + Jptr[x + cn] * iw01 + + Jptr1[x] * iw10 + + Jptr1[x + cn] * iw11, W_BITS1 - 5) + - Iptr[x] + Iptr[x] * gainVec.x + gainVec.y); + residualMat.at(buffIdx++) = static_cast(diff); + } + } + /*! 
Estimation for the residual */
+                    cv::Mat residualMatRoi(residualMat, cv::Rect(0, 0, 1, buffIdx));
+                    MEstimatorScale = (buffIdx == 0) ? 1.f : estimateScale(residualMatRoi);
+                }
+
+                float eta = 1.f / winArea;
+                float fParam0 = normSigma0 * MEstimatorScale;
+                float fParam1 = normSigma1 * MEstimatorScale;
+
+#ifdef RLOF_SSE
+
+                qw0 = _mm_set1_epi32(iw00 + (iw01 << 16));
+                qw1 = _mm_set1_epi32(iw10 + (iw11 << 16));
+                __m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps(), qb2 = _mm_setzero_ps();
+                __m128 qb3 = _mm_setzero_ps();
+                __m128 mmSumW1 = _mm_setzero_ps(), mmSumW2 = _mm_setzero_ps();
+                __m128 mmSumI = _mm_setzero_ps(), mmSumW = _mm_setzero_ps(), mmSumDI = _mm_setzero_ps();
+                __m128 mmSumIy = _mm_setzero_ps(), mmSumIx = _mm_setzero_ps();
+                __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps();
+                __m128i mmParam0 = _mm_set1_epi16(MIN(std::numeric_limits<short>::max() - 1, static_cast<short>(fParam0)));
+                __m128i mmParam1 = _mm_set1_epi16(MIN(std::numeric_limits<short>::max() - 1, static_cast<short>(fParam1)));
+
+                float s2Val = std::fabs(normSigma2);
+                int s2bitShift = normSigma2 == 0 ? 1 : cvCeil(log(200.f / s2Val) / log(2.f));
+                __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast<short>(normSigma2 * (float)(1 << s2bitShift)));
+                __m128i mmOness_epi16 = _mm_set1_epi16(1 << s2bitShift);
+                __m128 mmParam2s = _mm_set1_ps(0.01f * normSigma2);
+                __m128 mmParam2s2 = _mm_set1_ps(normSigma2 * normSigma2);
+
+                float gainVal = gainVec.x > 0 ? gainVec.x : -gainVec.x;
+                int bitShift = gainVec.x == 0 ? 1 : cvCeil(log(200.f / gainVal) / log(2.f));
+
+                __m128i mmGainValue_epi16 = _mm_set1_epi16(static_cast<short>(gainVec.x * (float)(1 << bitShift)));
+                __m128i mmConstValue_epi16 = _mm_set1_epi16(static_cast<short>(gainVec.y));
+                __m128i mmEta = _mm_setzero_si128();
+                __m128i mmScale = _mm_set1_epi16(static_cast<short>(MEstimatorScale));
+
+#endif
+                buffIdx = 0;
+
+                for (y = 0; y < _winSize.height; y++)
+                {
+                    const uchar* Jptr = J.ptr<uchar>(y + inextPt.y, inextPt.x*cn);
+                    const uchar* Jptr1 = J.ptr<uchar>(y + inextPt.y + 1, inextPt.x*cn);
+                    const short* Iptr = IWinBuf.ptr<short>(y, 0);
+                    const short* dIptr = derivIWinBuf.ptr<short>(y, 0);
+                    const tMaskType* maskPtr = winMaskMat.ptr<tMaskType>(y, 0);
+                    x = 0;
+#ifdef RLOF_SSE
+
+                    for (; x <= _winSize.width*cn; x += 8, dIptr += 8 * 2)
+                    {
+                        __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16);
+                        __m128i diff0, diff1;
+                        __m128i I_0_7_epi16 = _mm_loadu_si128((const __m128i*)(Iptr + x)); // elements 0 to 7
+                        __m128i v00 = _mm_unpacklo_epi8(
+                            _mm_loadl_epi64((const __m128i*)(Jptr + x)), z); // J0, 0, J1, 0, J2, 0, ..., J7, 0
+                        __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z);
+                        __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x)), z);
+                        __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x + cn)), z);
+
+                        __m128i t0 = _mm_add_epi32(
+                            _mm_madd_epi16(
+                                _mm_unpacklo_epi16(v00, v01), // J0, J0', J1, J1', J2, J2', J3, J3' (J' = pixel at x+cn)
+                                qw0),
+                            _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1));
+                        __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0),
+                            _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1));
+
+                        t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1 - 5);
+                        t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1 - 5);
+
+                        __m128i lo = _mm_mullo_epi16(mmGainValue_epi16, I_0_7_epi16);
+                        __m128i hi = _mm_mulhi_epi16(mmGainValue_epi16, I_0_7_epi16);
+
+                        __m128i Igain_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), bitShift);
+                        __m128i Igain_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), bitShift);
+                        __m128i Igain_epi16 = _mm_packs_epi32(Igain_0_3_epi32, Igain_4_7_epi32);
+
+                        // diff = J - I + I*gain.x + gain.y
+                        __m128i mmDiff_epi16 = _mm_add_epi16(
+                            _mm_subs_epi16(_mm_packs_epi32(t0, t1), I_0_7_epi16), // J - I
+                            _mm_add_epi16(Igain_epi16, mmConstValue_epi16));
+
+                        mmDiff_epi16 = _mm_and_si128(mmDiff_epi16, mask_0_7_epi16);
+
+                        __m128i scalediffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, mmScale);
+                        mmEta = _mm_add_epi16(mmEta, _mm_add_epi16(_mm_and_si128(scalediffIsPos_epi16, _mm_set1_epi16(2)), _mm_set1_epi16(-1)));
+
+                        __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ...
+                        __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8));
+
+                        __m128i abs_epi16 = _mm_abs_epi16(mmDiff_epi16);
+                        __m128i bSet2_epi16, bSet1_epi16;
+                        // |It| < sigma1 ?
+                        bSet2_epi16 = _mm_cmplt_epi16(abs_epi16, mmParam1);
+                        // It > 0 ?
+                        __m128i diffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, _mm_setzero_si128());
+                        // sigma0 < |It| < sigma1 ?
+                        bSet1_epi16 = _mm_and_si128(bSet2_epi16, _mm_cmpgt_epi16(abs_epi16, mmParam0));
+                        // val = |It| -/+ sigma1
+                        __m128i tmpParam1_epi16 = _mm_add_epi16(_mm_and_si128(diffIsPos_epi16, _mm_sub_epi16(mmDiff_epi16, mmParam1)),
+                            _mm_andnot_si128(diffIsPos_epi16, _mm_add_epi16(mmDiff_epi16, mmParam1)));
+                        // It = 0 where |It| > sigma1
+                        mmDiff_epi16 = _mm_and_si128(bSet2_epi16, mmDiff_epi16);
+                        // It = val where sigma0 < |It| < sigma1
+                        mmDiff_epi16 = _mm_blendv_epi8(mmDiff_epi16, tmpParam1_epi16, bSet1_epi16);
+
+                        __m128i tale_epi16_ = _mm_blendv_epi8(mmOness_epi16, mmParam2_epi16, bSet1_epi16); // mask for 0 - 3
+                        // diff = diff * sigma2
+                        lo = _mm_mullo_epi16(tale_epi16_, mmDiff_epi16);
+                        hi = _mm_mulhi_epi16(tale_epi16_, mmDiff_epi16);
+                        __m128i diff_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), s2bitShift); // diff 0_3_epi32
+                        __m128i diff_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), s2bitShift); // diff 4_7_epi32
+
+                        mmDiff_epi16 = _mm_packs_epi32(diff_0_3_epi32, diff_4_7_epi32); // the result should fit into 16 bit
+
+                        diff1 = _mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13...
+                        diff0 = _mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9...
+
+                        // Ix * diff / Iy * diff
+                        v10 = _mm_mullo_epi16(Ixy_0, diff0);
+                        v11 = _mm_mulhi_epi16(Ixy_0, diff0);
+                        v00 = _mm_unpacklo_epi16(v10, v11);
+                        v10 = _mm_unpackhi_epi16(v10, v11);
+                        qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00));
+                        qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10));
+
+                        // It * Ix, It * Iy [4 ... 7]
+                        // for set 1 hi sigma 1
+
+                        v10 = _mm_mullo_epi16(Ixy_1, diff1);
+                        v11 = _mm_mulhi_epi16(Ixy_1, diff1);
+                        v00 = _mm_unpacklo_epi16(v10, v11);
+                        v10 = _mm_unpackhi_epi16(v10, v11);
+                        qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00));
+                        qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10));
+                        // diff * I [0 ... 7]
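+                        // The two extra right-hand-side terms below come from the linear
+                        // illumination model of @cite Senst2016: besides b1/b2 (It*Ix and
+                        // It*Iy), the gain update needs
+                        //   b3 += It * I   // drives the multiplicative gain gainVec.x
+                        //   b4 += It       // drives the additive offset gainVec.y
+                        // qb2 and qb3 accumulate these two sums in packed floats.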
+                        // for set 1 sigma 1
+                        // b3 += diff * Iptr[x]
+
+                        v10 = _mm_mullo_epi16(mmDiff_epi16, I_0_7_epi16);
+                        v11 = _mm_mulhi_epi16(mmDiff_epi16, I_0_7_epi16);
+                        v00 = _mm_unpacklo_epi16(v10, v11);
+                        v10 = _mm_unpackhi_epi16(v10, v11);
+                        qb2 = _mm_add_ps(qb2, _mm_cvtepi32_ps(v00));
+                        qb2 = _mm_add_ps(qb2, _mm_cvtepi32_ps(v10));
+
+                        qb3 = _mm_add_ps(qb3, _mm_cvtepi32_ps(diff_0_3_epi32));
+                        qb3 = _mm_add_ps(qb3, _mm_cvtepi32_ps(diff_4_7_epi32));
+
+                        if (j == 0)
+                        {
+                            __m128 bSet1_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet1_epi16));
+                            __m128 bSet1_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet1_epi16, bSet1_epi16), 16));
+                            __m128 mask_0_4_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mask_0_7_epi16));
+                            __m128 mask_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mask_0_7_epi16, mask_0_7_epi16), 16)));
+
+                            __m128 bSet2_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet2_epi16));
+                            __m128 bSet2_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet2_epi16, bSet2_epi16), 16));
+
+                            __m128 tale_0_3_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_0_3_ps);
+                            __m128 tale_4_7_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_4_7_ps);
+                            tale_0_3_ps = _mm_blendv_ps(mmParam2s, tale_0_3_ps, bSet2_0_3_ps);
+                            tale_4_7_ps = _mm_blendv_ps(mmParam2s, tale_4_7_ps, bSet2_4_7_ps);
+
+                            tale_0_3_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_0_3_ps, mask_0_4_ps);
+                            tale_4_7_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_4_7_ps, mask_4_7_ps);
+                            t0 = _mm_srai_epi32(Ixy_0, 16);
+                            t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16);
+
+                            __m128 fy = _mm_cvtepi32_ps(t0);
+                            __m128 fx = _mm_cvtepi32_ps(t1);
+
+                            // 0 ... 3
+                            __m128 I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(I_0_7_epi16, I_0_7_epi16), 16));
+
+                            // A11 - A22
+                            __m128 fxtale = _mm_mul_ps(fx, tale_0_3_ps);
+                            __m128 fytale = _mm_mul_ps(fy, tale_0_3_ps);
+
+                            mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale));
+                            mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale));
+                            mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale));
+
+                            // sumIx and sumIy
+                            mmSumIx = _mm_add_ps(mmSumIx, fxtale);
+                            mmSumIy = _mm_add_ps(mmSumIy, fytale);
+
+                            mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fxtale));
+                            mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fytale));
+
+                            // sumI
+                            __m128 I_tale_ps = _mm_mul_ps(I_ps, tale_0_3_ps);
+                            mmSumI = _mm_add_ps(mmSumI, I_tale_ps);
+
+                            // sumW
+                            mmSumW = _mm_add_ps(mmSumW, tale_0_3_ps);
+
+                            // sumDI
+                            mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps(I_ps, I_tale_ps));
+
+                            t0 = _mm_srai_epi32(Ixy_1, 16); // Iy4 Iy5 Iy6 Iy7
+                            t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_1, 16), 16); // Ix4 Ix5 Ix6 Ix7
+
+                            fy = _mm_cvtepi32_ps(t0);
+                            fx = _mm_cvtepi32_ps(t1);
+
+                            // 4 ... 7
+                            I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(I_0_7_epi16, I_0_7_epi16), 16));
+
+                            // A11 - A22
+                            fxtale = _mm_mul_ps(fx, tale_4_7_ps);
+                            fytale = _mm_mul_ps(fy, tale_4_7_ps);
+
+                            mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale));
+                            mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale));
+                            mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale));
+
+                            // sumIx and sumIy
+                            mmSumIx = _mm_add_ps(mmSumIx, fxtale);
+                            mmSumIy = _mm_add_ps(mmSumIy, fytale);
+
+                            mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fxtale));
+                            mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fytale));
+
+                            // sumI
+                            I_tale_ps = _mm_mul_ps(I_ps, tale_4_7_ps);
+                            mmSumI = _mm_add_ps(mmSumI, I_tale_ps);
+
+                            mmSumW = _mm_add_ps(mmSumW, tale_4_7_ps);
+
+                            mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps(I_ps, I_tale_ps));
+                        }
+
+                    }
+#else
+                    for (; x < _winSize.width*cn; x++, dIptr += 2)
+                    {
+                        if (maskPtr[x] == 0)
+                            continue;
+                        int J_val = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + Jptr1[x] * iw10 + Jptr1[x + cn] * iw11,
+                            W_BITS1 - 5);
+                        short ixval = static_cast<short>(dIptr[0]);
+                        short iyval = static_cast<short>(dIptr[1]);
+                        int diff = static_cast<int>(J_val - Iptr[x] + Iptr[x] * gainVec.x + gainVec.y);
+                        int abss = (diff < 0) ? -diff : diff;
+                        if (diff > MEstimatorScale)
+                            MEstimatorScale += eta;
+                        if (diff < MEstimatorScale)
+                            MEstimatorScale -= eta;
+                        if (abss > static_cast<int>(fParam1))
+                        {
+                            diff = 0;
+                        }
+                        else if (abss > static_cast<int>(fParam0) && diff >= 0)
+                        {
+                            diff = static_cast<int>(normSigma2 * (diff - fParam1));
+                        }
+                        else if (abss > static_cast<int>(fParam0) && diff < 0)
+                        {
+                            diff = static_cast<int>(normSigma2 * (diff + fParam1));
+                        }
+                        b1 += (float)(diff*ixval);
+                        b2 += (float)(diff*iyval);
+                        b3 += (float)(diff) * Iptr[x];
+                        b4 += (float)(diff);
+
+                        // compute the gradient matrix
+                        if (j == 0)
+                        {
+                            float tale = normSigma2 * FLT_RESCALE;
+                            if (abss < fParam0)
+                            {
+                                tale = FLT_RESCALE;
+                            }
+                            else if (abss > fParam1)
+                            {
+                                tale *= 0.01f;
+                            }
+                            else
+                            {
+                                tale *= normSigma2;
+                            }
+                            A11 += (float)(ixval*ixval)*tale;
+                            A12 += (float)(ixval*iyval)*tale;
+                            A22 += (float)(iyval*iyval)*tale;
+
+                            dI += Iptr[x] * Iptr[x] * tale;
+                            float dx = static_cast<float>(dIptr[0]) * tale;
+                            float dy = static_cast<float>(dIptr[1]) * tale;
+                            sumIx += dx;
+                            sumIy += dy;
+                            w1 += dx * Iptr[x];
+                            w2 += dy * Iptr[x];
+                            sumI += Iptr[x] * tale;
+                            sumW += tale;
+                        }
+
+                    }
+#endif
+                }
+#ifdef RLOF_SSE
+                short etaValues[8];
+                _mm_storeu_si128((__m128i*)(etaValues), mmEta);
+                MEstimatorScale += eta * (etaValues[0] + etaValues[1] + etaValues[2] + etaValues[3]
+                    + etaValues[4] + etaValues[5] + etaValues[6] + etaValues[7]);
+                float CV_DECL_ALIGNED(32) wbuf[4];
+#endif
+                if (j == 0)
+                {
+#ifdef RLOF_SSE
+                    _mm_store_ps(wbuf, mmSumW1);
+                    w1 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3];
+                    _mm_store_ps(wbuf, mmSumW2);
+                    w2 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3];
+                    _mm_store_ps(wbuf, mmSumDI);
+                    dI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3];
+                    _mm_store_ps(wbuf, mmSumI);
+                    sumI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3];
+                    _mm_store_ps(wbuf, mmSumIx);
+                    sumIx = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3];
+                    _mm_store_ps(wbuf, mmSumIy);
+                    sumIy = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3];
+                    _mm_store_ps(wbuf, mmSumW);
+                    sumW = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3];
+#endif
+                    sumIx *= -FLT_SCALE;
+                    sumIy *= -FLT_SCALE;
+                    sumI *= FLT_SCALE;
+                    sumW *= FLT_SCALE;
+                    w1 *= -FLT_SCALE;
+                    w2 *= -FLT_SCALE;
+                    dI *= FLT_SCALE;
+#ifdef RLOF_SSE
+                    float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];
+                    _mm_store_ps(A11buf, mmAxx);
+                    _mm_store_ps(A12buf, mmAxy);
+                    _mm_store_ps(A22buf, mmAyy);
+
+                    A11
= A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; + A12 = A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; + A22 = A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; +#endif + A11 *= FLT_SCALE; + A12 *= FLT_SCALE; + A22 *= FLT_SCALE; + } +#ifdef RLOF_SSE + float CV_DECL_ALIGNED(16) bbuf[4]; + _mm_store_ps(bbuf, _mm_add_ps(qb0, qb1)); + b1 = bbuf[0] + bbuf[2]; + b2 = bbuf[1] + bbuf[3]; + _mm_store_ps(bbuf, qb2); + b3 = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; + _mm_store_ps(bbuf, qb3); + b4 = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; +#endif + mismatchMat(0) = b1 * FLT_SCALE; + mismatchMat(1) = b2 * FLT_SCALE; + mismatchMat(2) = -b3 * FLT_SCALE; + mismatchMat(3) = -b4 * FLT_SCALE; + + D = -A12 * A12*sumI*sumI + dI * sumW*A12*A12 + 2 * A12*sumI*sumIx*w2 + 2 * A12*sumI*sumIy*w1 + - 2 * dI*A12*sumIx*sumIy - 2 * sumW*A12*w1*w2 + A11 * A22*sumI*sumI - 2 * A22*sumI*sumIx*w1 + - 2 * A11*sumI*sumIy*w2 - sumIx * sumIx*w2*w2 + A22 * dI*sumIx*sumIx + 2 * sumIx*sumIy*w1*w2 + - sumIy * sumIy*w1*w1 + A11 * dI*sumIy*sumIy + A22 * sumW*w1*w1 + A11 * sumW*w2*w2 - A11 * A22*dI*sumW; + + float sqrtVal = std::sqrt((A11 - A22)*(A11 - A22) + 4.f*A12*A12); + minEigValue = (A22 + A11 - sqrtVal) / (2.0f*winArea); + if (minEigValue < minEigThreshold || std::abs(D) < FLT_EPSILON) + { + if (level == 0 && status) + status[ptidx] = 0; + if (level > 0) + { + nextPts[ptidx] = backUpNextPt; + gainVecs[ptidx] = backUpGain; + } + break; + } + + D = (1.f / D); + + invTensorMat(0, 0) = (A22*sumI*sumI - 2 * sumI*sumIy*w2 + dI * sumIy*sumIy + sumW * w2*w2 - A22 * dI*sumW)* D; + invTensorMat(0, 1) = (A12*dI*sumW - A12 * sumI * sumI - dI * sumIx*sumIy + sumI * sumIx*w2 + sumI * sumIy*w1 - sumW * w1*w2)* D; + invTensorMat(0, 2) = (A12*sumI*sumIy - sumIy * sumIy*w1 - A22 * sumI*sumIx - A12 * sumW*w2 + A22 * sumW*w1 + sumIx * sumIy*w2)* D; + invTensorMat(0, 3) = (A22*dI*sumIx - A12 * dI*sumIy - sumIx * w2*w2 + A12 * sumI*w2 - A22 * sumI*w1 + sumIy * w1*w2) * D; + invTensorMat(1, 0) = invTensorMat(0, 1); + invTensorMat(1, 1) = (A11*sumI * sumI - 2 * sumI*sumIx*w1 + dI * sumIx * sumIx + sumW * w1*w1 - A11 * dI*sumW) * D; + invTensorMat(1, 2) = (A12*sumI*sumIx - A11 * sumI*sumIy - sumIx * sumIx*w2 + A11 * sumW*w2 - A12 * sumW*w1 + sumIx * sumIy*w1) * D; + invTensorMat(1, 3) = (A11*dI*sumIy - sumIy * w1*w1 - A12 * dI*sumIx - A11 * sumI*w2 + A12 * sumI*w1 + sumIx * w1*w2)* D; + invTensorMat(2, 0) = invTensorMat(0, 2); + invTensorMat(2, 1) = invTensorMat(1, 2); + invTensorMat(2, 2) = (sumW*A12*A12 - 2 * A12*sumIx*sumIy + A22 * sumIx*sumIx + A11 * sumIy*sumIy - A11 * A22*sumW)* D; + invTensorMat(2, 3) = (A11*A22*sumI - A12 * A12*sumI - A11 * sumIy*w2 + A12 * sumIx*w2 + A12 * sumIy*w1 - A22 * sumIx*w1)* D; + invTensorMat(3, 0) = invTensorMat(0, 3); + invTensorMat(3, 1) = invTensorMat(1, 3); + invTensorMat(3, 2) = invTensorMat(2, 3); + invTensorMat(3, 3) = (dI*A12*A12 - 2 * A12*w1*w2 + A22 * w1*w1 + A11 * w2*w2 - A11 * A22*dI)* D; + + resultMat = invTensorMat * mismatchMat; + + Point2f delta(-resultMat(0), -resultMat(1)); + Point2f deltaGain(resultMat(2), resultMat(3)); + + + + + if (j == 0) + prevGain = deltaGain; + gainVec += deltaGain * 0.8; + nextPt += delta * 0.8; + nextPts[ptidx] = nextPt - halfWin; + gainVecs[ptidx] = gainVec; + + if ( + (std::abs(delta.x - prevDelta.x) < 0.01 && std::abs(delta.y - prevDelta.y) < 0.01) + || ((delta.ddot(delta) <= 0.001) && std::abs(prevGain.x - deltaGain.x) < 0.01) + ) + { + nextPts[ptidx] -= delta * 0.5f; + gainVecs[ptidx] -= deltaGain * 0.5f; + break; + } + + prevDelta = delta; + prevGain = deltaGain; + } + + } 
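// ---------------------------------------------------------------------------
// Editorial sketch (not part of the patch): the scalar fallback above weights
// each residual with the shrinked Hampel norm before it enters the normal
// equations. Written as a standalone function, where fParam0/fParam1 are the
// scaled sigma0/sigma1 of the norm and normSigma2 is the precomputed factor
// sigma0 / (sigma0 - sigma1) returned by get_norm() in rlof_localflow.cpp:
static inline float residualWeight(int absResidual, float fParam0, float fParam1,
                                   float normSigma2, float rescale)
{
    float w = normSigma2 * rescale;
    if (absResidual < fParam0)       // quadratic (inlier) region: full weight
        w = rescale;
    else if (absResidual > fParam1)  // saturated region: outliers barely count
        w *= 0.01f;
    else                             // transition region of the shrinked norm
        w *= normSigma2;
    return w;
}
// ---------------------------------------------------------------------------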
+
+    }
+
+    const Mat* prevImg;
+    const Mat* nextImg;
+    const Mat* prevDeriv;
+    const Mat* rgbPrevImg;
+    const Mat* rgbNextImg;
+    const Point2f* prevPts;
+    Point2f* nextPts;
+    uchar* status;
+    cv::Point2f* gainVecs;
+    float* err;
+    int maxWinSize;
+    int minWinSize;
+    TermCriteria criteria;
+    int level;
+    int maxLevel;
+    int windowType;
+    float minEigThreshold;
+    bool useInitialFlow;
+    const float normSigma0, normSigma1, normSigma2;
+    int crossSegmentationThreshold;
+};
+
+}}}} // namespace
+#endif
diff --git a/modules/optflow/src/rlof/rlof_invokerbase.hpp b/modules/optflow/src/rlof/rlof_invokerbase.hpp
new file mode 100644
index 00000000000..a7d09ceaff3
--- /dev/null
+++ b/modules/optflow/src/rlof/rlof_invokerbase.hpp
@@ -0,0 +1,178 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+#ifndef _RLOF_INVOKERBASE_HPP_
+#define _RLOF_INVOKERBASE_HPP_
+
+
+#if CV_SSE4_1
+#define RLOF_SSE
+#endif
+
+//#define DEBUG_INVOKER
+
+#ifndef CV_DESCALE
+#define CV_DESCALE(x, n) (((x) + (1 << ((n)-1))) >> (n))
+#endif
+
+#define FLT_RESCALE 1
+
+
+#include "rlof_localflow.h"
+#include <limits>
+#include "opencv2/core/hal/intrin.hpp"
+using namespace std;
+using namespace cv;
+
+
+namespace cv {
+namespace optflow {
+
+typedef short deriv_type;
+#ifdef RLOF_SSE
+static inline void get4BitMask(const int & width, __m128i & mask)
+{
+    int noBits = width - static_cast<int>(floor(width / 4.f) * 4.f);
+    unsigned int val[4];
+    for (int n = 0; n < 4; n++)
+    {
+        val[n] = (noBits > n) ? (std::numeric_limits<unsigned int>::max()) : 0;
+    }
+    mask = _mm_set_epi32(val[3], val[2], val[1], val[0]);
+
+}
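// ---------------------------------------------------------------------------
// Editorial sketch (not part of the patch): lane n of the mask produced by
// get4BitMask() above is all-ones iff n < width % 4, so a masked operation
// touches only the leftover elements of the final, partial 4-float block.
// A scalar equivalent:
#include <array>
#include <cstdint>
static std::array<uint32_t, 4> tailMask4(int width)
{
    std::array<uint32_t, 4> mask = { { 0, 0, 0, 0 } };
    for (int n = 0; n < 4; ++n)
        mask[n] = ((width % 4) > n) ? 0xffffffffu : 0u;
    return mask; // e.g. width == 6 -> { ~0u, ~0u, 0, 0 }
}
// ---------------------------------------------------------------------------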
+static inline void getWBitMask(const int & width, __m128i & t0, __m128i & t1, __m128i & t2)
+{
+    int noBits = width - static_cast<int>(floor(width / 8.f) * 8.f);
+    unsigned short val[8];
+    for (int n = 0; n < 8; n++)
+    {
+        val[n] = (noBits > n) ? (0xffff) : 0;
+    }
+    t1 = _mm_set_epi16(val[7], val[7], val[6], val[6], val[5], val[5], val[4], val[4]);
+    t0 = _mm_set_epi16(val[3], val[3], val[2], val[2], val[1], val[1], val[0], val[0]);
+    t2 = _mm_set_epi16(val[7], val[6], val[5], val[4], val[3], val[2], val[1], val[0]);
+}
+#endif
+typedef uchar tMaskType;
+#define tCVMaskType CV_8UC1
+#define MaskSet 0xffffffff
+
+static
+void getLocalPatch(
+    const cv::Mat & src,
+    const cv::Point2i & prevPoint, // feature point
+    cv::Mat & winPointMask,
+    int & noPoints,
+    cv::Rect & winRoi,
+    int minWinSize)
+{
+    int maxWinSizeH = (winPointMask.cols - 1) / 2;
+    winRoi.x = prevPoint.x;// - maxWinSizeH;
+    winRoi.y = prevPoint.y;// - maxWinSizeH;
+    winRoi.width  = winPointMask.cols;
+    winRoi.height = winPointMask.rows;
+
+    if( minWinSize == winPointMask.cols || prevPoint.x < 0 || prevPoint.y < 0
+        || prevPoint.x + 2*maxWinSizeH >= src.cols || prevPoint.y + 2*maxWinSizeH >= src.rows)
+    {
+        winRoi.x = prevPoint.x - maxWinSizeH;
+        winRoi.y = prevPoint.y - maxWinSizeH;
+        winPointMask.setTo(1);
+        noPoints = winPointMask.size().area();
+        return;
+    }
+    winPointMask.setTo(0);
+    noPoints = 0;
+    int c = prevPoint.x + maxWinSizeH;
+    int r = prevPoint.y + maxWinSizeH;
+    int min_c = c;
+    int max_c = c;
+    int border_left = c - maxWinSizeH;
+    int border_top  = r - maxWinSizeH;
+    cv::Vec4i bounds = src.at<cv::Vec4i>(r,c);
+    int min_r = bounds.val[2];
+    int max_r = bounds.val[3];
+
+    for( int _r = min_r; _r <= max_r; _r++)
+    {
+        cv::Rect roi(maxWinSizeH, _r - border_top, winPointMask.cols, 1);
+        if( _r >= 0 && _r < src.rows)
+        {
+            bounds = src.at<cv::Vec4i>(_r,c);
+            roi.x = bounds.val[0] - border_left;
+            roi.width = bounds.val[1] - bounds.val[0];
+        }
+        else
+        {
+            bounds.val[0] = border_left;
+            bounds.val[1] = border_left + roi.width;
+        }
+        cv::Mat(winPointMask, roi).setTo(1);
+        min_c = MIN(min_c, bounds.val[0]);
+        max_c = MAX(max_c, bounds.val[1]);
+        noPoints += roi.width;
+    }
+
+    if( noPoints < minWinSize * minWinSize)
+    {
+        cv::Rect roi( winPointMask.cols / 2 - (minWinSize-1)/2,
+                      winPointMask.rows / 2 - (minWinSize-1)/2,
+                      minWinSize, minWinSize);
+        cv::Mat(winPointMask, roi).setTo(1);
+        roi.x += border_left;
+        roi.y += border_top;
+        min_c = MIN(MIN(min_c, roi.tl().x),roi.br().x);
+        max_c = MAX(MAX(max_c, roi.tl().x),roi.br().x);
+        min_r = MIN(MIN(min_r, roi.tl().y),roi.br().y);
+        max_r = MAX(MAX(max_r, roi.tl().y),roi.br().y);
+        noPoints += minWinSize * minWinSize;
+    }
+    winRoi.x = min_c - maxWinSizeH;
+    winRoi.y = min_r - maxWinSizeH;
+    winRoi.width  = max_c - min_c;
+    winRoi.height = max_r - min_r;
+    winPointMask = winPointMask(cv::Rect(min_c - border_left, min_r - border_top, winRoi.width, winRoi.height));
+}
+
+static inline
+bool calcWinMaskMat(
+    const cv::Mat & BI,
+    const int windowType,
+    const cv::Point2i & iprevPt,
+    cv::Mat & winMaskMat,
+    cv::Size & winSize,
+    cv::Point2f & halfWin,
+    int & winArea,
+    const int minWinSize,
+    const int maxWinSize)
+{
+    if (windowType == SR_CROSS && maxWinSize != minWinSize)
+    {
+        // patch generation
+        cv::Rect winRoi;
+        getLocalPatch(BI, iprevPt, winMaskMat, winArea, winRoi, minWinSize);
+        if (winArea == 0)
+            return false;
+        winSize = winRoi.size();
+        halfWin = Point2f(static_cast<float>(iprevPt.x - winRoi.tl().x),
+                          static_cast<float>(iprevPt.y - winRoi.tl().y));
+    }
+    else
+    {
+        winSize = cv::Size(maxWinSize, maxWinSize);
+        halfWin = Point2f((winSize.width - 1) / 2.f, (winSize.height - 1) / 2.f);
+        winMaskMat.setTo(1);
+    }
+    return true;
+}
+
+static inline
+short estimateScale(cv::Mat & residuals)
+{
+    cv::Mat absMat = cv::abs(residuals);
+    return quickselect<short>(absMat, absMat.rows / 2);
+}
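// ---------------------------------------------------------------------------
// Editorial sketch (not part of the patch): estimateScale() seeds the
// M-estimator scale with the median absolute residual, computed by the
// quickselect() routine from rlof_localflow.h. Inside the invoker loops the
// scale is then tracked incrementally: each residual above the current
// estimate raises it by a small step eta, each residual below lowers it, so
// the estimate drifts toward the running median without re-sorting:
static inline float updateScale(float scale, float residual, float eta)
{
    if (residual > scale) scale += eta; // estimate too low  -> raise it
    if (residual < scale) scale -= eta; // estimate too high -> lower it
    return scale;
}
// ---------------------------------------------------------------------------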
+
+}} // namespace
+#endif
diff --git a/modules/optflow/src/rlof/rlof_localflow.cpp b/modules/optflow/src/rlof/rlof_localflow.cpp
new file mode 100644
index 00000000000..e1e59860926
--- /dev/null
+++ b/modules/optflow/src/rlof/rlof_localflow.cpp
@@ -0,0 +1,691 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+#include "../precomp.hpp"
+
+#include "opencv2/calib3d.hpp" // findHomography
+#include "opencv2/highgui.hpp"
+#include "rlof_localflow.h"
+#include "berlof_invoker.hpp"
+#include "rlof_invoker.hpp"
+#include "plk_invoker.hpp"
+using namespace std;
+using namespace cv;
+
+namespace cv {
+namespace detail {
+typedef short deriv_type;
+} // namespace
+
+namespace {
+static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst)
+{
+    using namespace cv;
+    using cv::detail::deriv_type;
+    int rows = src.rows, cols = src.cols, cn = src.channels(), colsn = cols * cn, depth = src.depth();
+    CV_Assert(depth == CV_8U);
+    dst.create(rows, cols, CV_MAKETYPE(DataType<deriv_type>::depth, cn * 2));
+
+    int x, y, delta = (int)alignSize((cols + 2)*cn, 16);
+    AutoBuffer<deriv_type> _tempBuf(delta * 2 + 64);
+    deriv_type *trow0 = alignPtr(_tempBuf.data() + cn, 16), *trow1 = alignPtr(trow0 + delta, 16);
+
+#if CV_SIMD128
+    v_int16x8 c3 = v_setall_s16(3), c10 = v_setall_s16(10);
+    bool haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON);
+#endif
+
+    for (y = 0; y < rows; y++)
+    {
+        const uchar* srow0 = src.ptr(y > 0 ? y - 1 : rows > 1 ? 1 : 0);
+        const uchar* srow1 = src.ptr(y);
+        const uchar* srow2 = src.ptr(y < rows - 1 ? y + 1 : rows > 1 ? rows - 2 : 0);
+        deriv_type* drow = dst.ptr<deriv_type>(y);
+
+        // do vertical convolution
+        x = 0;
+#if CV_SIMD128
+        if (haveSIMD)
+        {
+            for (; x <= colsn - 8; x += 8)
+            {
+                v_int16x8 s0 = v_reinterpret_as_s16(v_load_expand(srow0 + x));
+                v_int16x8 s1 = v_reinterpret_as_s16(v_load_expand(srow1 + x));
+                v_int16x8 s2 = v_reinterpret_as_s16(v_load_expand(srow2 + x));
+
+                v_int16x8 t1 = s2 - s0;
+                v_int16x8 t0 = v_mul_wrap(s0 + s2, c3) + v_mul_wrap(s1, c10);
+
+                v_store(trow0 + x, t0);
+                v_store(trow1 + x, t1);
+            }
+        }
+#endif
+
+        for (; x < colsn; x++)
+        {
+            int t0 = (srow0[x] + srow2[x]) * 3 + srow1[x] * 10;
+            int t1 = srow2[x] - srow0[x];
+            trow0[x] = (deriv_type)t0;
+            trow1[x] = (deriv_type)t1;
+        }
+
+        // make border
+        int x0 = (cols > 1 ? 1 : 0)*cn, x1 = (cols > 1 ? cols - 2 : 0)*cn;
+        for (int k = 0; k < cn; k++)
+        {
+            trow0[-cn + k] = trow0[x0 + k]; trow0[colsn + k] = trow0[x1 + k];
+            trow1[-cn + k] = trow1[x0 + k]; trow1[colsn + k] = trow1[x1 + k];
+        }
+
+        // do horizontal convolution, interleave the results and store them to dst
+        x = 0;
+#if CV_SIMD128
+        if (haveSIMD)
+        {
+            for (; x <= colsn - 8; x += 8)
+            {
+                v_int16x8 s0 = v_load(trow0 + x - cn);
+                v_int16x8 s1 = v_load(trow0 + x + cn);
+                v_int16x8 s2 = v_load(trow1 + x - cn);
+                v_int16x8 s3 = v_load(trow1 + x);
+                v_int16x8 s4 = v_load(trow1 + x + cn);
+
+                v_int16x8 t0 = s1 - s0;
+                v_int16x8 t1 = v_mul_wrap(s2 + s4, c3) + v_mul_wrap(s3, c10);
+
+                v_store_interleave((drow + x * 2), t0, t1);
+            }
+        }
+#endif
+        for (; x < colsn; x++)
+        {
+            deriv_type t0 = (deriv_type)(trow0[x + cn] - trow0[x - cn]);
+            deriv_type t1 = (deriv_type)((trow1[x + cn] + trow1[x - cn]) * 3 + trow1[x] * 10);
+            drow[x * 2] = t0; drow[x * 2 + 1] = t1;
+        }
+    }
+}
+
+} // namespace
+namespace optflow {
+
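// ---------------------------------------------------------------------------
// Editorial sketch (not part of the patch): the CV_32SC4 bounding map built
// below stores, per pixel, the borders of the colour-similar cross segment;
// channels 0/1 come from the horizontal pass and channels 2/3 from the pass
// over the transposed image. getLocalPatch() in rlof_invokerbase.hpp reads it
// essentially like this:
#include <opencv2/core.hpp>
static cv::Rect crossBounds(const cv::Mat& boundingMap, int r, int c)
{
    const cv::Vec4i b = boundingMap.at<cv::Vec4i>(r, c);
    // b[0]/b[1]: left/right column, b[2]/b[3]: top/bottom row of the segment
    return cv::Rect(b[0], b[2], b[1] - b[0], b[3] - b[2]);
}
// ---------------------------------------------------------------------------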
+/*! Helper class for preCalcCrossSegmentation. Everything is computed on the
+* image extended by winSize.
+*\param data CV_8UC3 image (extended by winSize)
+*\param winSize support region window size
+*\param dst CV_32SC4 bounding map
+*\param threshold color similarity threshold
+*\param stride if true, write the interval borders into channels 2 and 3 of
+*       the bounding map (used for the pass over the transposed image)
+*/
+class HorizontalCrossSegmentation : public cv::ParallelLoopBody
+{
+public:
+    HorizontalCrossSegmentation(
+        const cv::Point2f * ptList,
+        int npoints,
+        float pointScale,
+        const cv::Mat * data,
+        const int winSize,
+        cv::Mat * dst,
+        int threshold,
+        bool stride,
+        const cv::Mat * mask
+    )
+    {
+        m_ptList = ptList;
+        m_npoints = npoints;
+        m_pointScale = pointScale;
+        m_data = data;
+        m_winSize = winSize;
+        m_dst = dst;
+        m_threshold = threshold;
+        m_stride = stride;
+        m_mask = mask;
+    }
+
+    void operator()(const cv::Range& range) const CV_OVERRIDE
+    {
+        uchar channel[2];
+        channel[0] = m_stride ? 2 : 0;
+        channel[1] = m_stride ? 3 : 1;
+        int hWinSize = (m_winSize - 1) / 2;
+        std::vector<int> differenz(m_winSize);
+        for( int r = range.start; r < range.end; r++ )
+        {
+            for(int c = hWinSize; c < m_data->cols - hWinSize; c++)
+            {
+                if( m_mask->at<uchar>(r,c) == 0)
+                    continue;
+                const Point3_<uchar> & ucval = m_data->at<Point3_<uchar> >(r,c);
+                Point3i val(static_cast<int>(ucval.x), static_cast<int>(ucval.y), static_cast<int>(ucval.z));
+                int x = c - hWinSize;
+                Point dstPos = m_stride ? Point(r,c) : Point(c,r);
+                for(int ix = 0; ix < m_winSize; ix++, x++)
+                {
+                    const Point3_<uchar> & valref = m_data->at<Point3_<uchar> >(r,x);
+                    differenz[ix] = MAX(std::abs(static_cast<int>(valref.x) - val.x),
+                                    MAX(std::abs(static_cast<int>(valref.y) - val.y),
+                                       (std::abs(static_cast<int>(valref.z) - val.z))));
+
+                }
+                cv::Vec4i & bounds = m_dst->at<cv::Vec4i>(dstPos);
+                bounds.val[channel[0]] = c - hWinSize;
+                bounds.val[channel[1]] = c + hWinSize;
+                int * diffPtr = &differenz[hWinSize];
+                bool useUpperBound = false;
+                bool useLowerBound = false;
+                for(int ix = 1; ix <= hWinSize; ix++)
+                {
+                    if( !useUpperBound && diffPtr[-ix] > m_threshold)
+                    {
+                        useUpperBound = true;
+                        bounds.val[channel[0]] = c - ix;
+                    }
+                    if( !useLowerBound && diffPtr[ix-1] > m_threshold)
+                    {
+                        useLowerBound = true;
+                        bounds.val[channel[1]] = c + ix - 1;
+                    }
+                    if( useUpperBound && useLowerBound)
+                        break;
+                }
+            }
+        }
+    }
+
+    const cv::Point2f * m_ptList;
+    int m_npoints;
+    float m_pointScale;
+    const cv::Mat * m_data;
+    int m_winSize;
+    cv::Mat * m_dst;
+    int m_threshold;
+    bool m_stride;
+    const cv::Mat * m_mask;
+};
+
+static
+void preCalcCrossSegmentation(
+    const cv::Point2f * ptList,
+    int npoints,
+    float pointScale,
+    const cv::Mat & img,
+    const int winSize,
+    cv::Mat & dst,
+    int threshold
+)
+{
+    int hWinSize = (winSize - 1) / 2;
+    cv::Mat data = img;
+    data.adjustROI(hWinSize, hWinSize, hWinSize, hWinSize);
+    if( dst.size() != data.size() || dst.type() != CV_32SC4)
+    {
+        dst.release();
+        dst.create(data.size(), CV_32SC4);
+    }
+    cv::Mat mask(data.cols, data.rows, CV_8UC1);
+    mask.setTo(0);
+    for( unsigned int n = 0; n < static_cast<unsigned int>(npoints); n++)
+    {
+        cv::Point ipos( static_cast<int>(floor(ptList[n].y * pointScale)),
+                        static_cast<int>(floor(ptList[n].x * pointScale) + hWinSize));
+        ipos.x = MAX( MIN(ipos.x, mask.cols - 1), 0);
+        int to   = MIN( mask.cols - 1, ipos.x + winSize );
+        int ypos = MAX( MIN(ipos.y, mask.rows - 1), 0);
+        for(int x = ipos.x; x <= to ; x++)
+        {
+            mask.at<uchar>(ypos, x) = 255;
+        }
+    }
+    cv::Mat datat = data.t();
+    cv::Mat maskt = mask.t();
+    parallel_for_(cv::Range(0, datat.rows), HorizontalCrossSegmentation(ptList, npoints, pointScale, &datat, winSize, &dst, threshold, true, &mask));
+    parallel_for_(cv::Range(0, data.rows), HorizontalCrossSegmentation(ptList, npoints, pointScale, &data,
winSize, &dst, threshold, false, &maskt)); + +} + + +static inline +bool isrobust(const RLOFOpticalFlowParameter & param) +{ + return (param.normSigma0 < 255 && param.normSigma1 < 255); +} +static inline +std::vector get_norm(float sigma0, float sigma1) +{ + std::vector result = { sigma0, sigma1, sigma0 / (sigma0 - sigma1), sigma0 * sigma1 }; + return result; +} + +static +int buildOpticalFlowPyramidScale(InputArray _img, OutputArrayOfArrays pyramid, Size winSize, int maxLevel, bool withDerivatives, + int pyrBorder, int derivBorder, bool tryReuseInputImage, float levelScale[2]) +{ + Mat img = _img.getMat(); + CV_Assert(img.depth() == CV_8U && winSize.width > 2 && winSize.height > 2); + int pyrstep = withDerivatives ? 2 : 1; + + pyramid.create(1, (maxLevel + 1) * pyrstep, 0 /*type*/, -1, true); + + int derivType = CV_MAKETYPE(DataType::depth, img.channels() * 2); + + //level 0 + bool lvl0IsSet = false; + if (tryReuseInputImage && img.isSubmatrix() && (pyrBorder & BORDER_ISOLATED) == 0) + { + Size wholeSize; + Point ofs; + img.locateROI(wholeSize, ofs); + if (ofs.x >= winSize.width && ofs.y >= winSize.height + && ofs.x + img.cols + winSize.width <= wholeSize.width + && ofs.y + img.rows + winSize.height <= wholeSize.height) + { + pyramid.getMatRef(0) = img; + lvl0IsSet = true; + } + } + + if (!lvl0IsSet) + { + Mat& temp = pyramid.getMatRef(0); + + if (!temp.empty()) + temp.adjustROI(winSize.height, winSize.height, winSize.width, winSize.width); + if (temp.type() != img.type() || temp.cols != winSize.width * 2 + img.cols || temp.rows != winSize.height * 2 + img.rows) + temp.create(img.rows + winSize.height * 2, img.cols + winSize.width * 2, img.type()); + + if (pyrBorder == BORDER_TRANSPARENT) + img.copyTo(temp(Rect(winSize.width, winSize.height, img.cols, img.rows))); + else + copyMakeBorder(img, temp, winSize.height, winSize.height, winSize.width, winSize.width, pyrBorder); + temp.adjustROI(-winSize.height, -winSize.height, -winSize.width, -winSize.width); + } + + Size sz = img.size(); + Mat prevLevel = pyramid.getMatRef(0); + Mat thisLevel = prevLevel; + + for (int level = 0; level <= maxLevel; ++level) + { + if (level != 0) + { + Mat& temp = pyramid.getMatRef(level * pyrstep); + + if (!temp.empty()) + temp.adjustROI(winSize.height, winSize.height, winSize.width, winSize.width); + if (temp.type() != img.type() || temp.cols != winSize.width * 2 + sz.width || temp.rows != winSize.height * 2 + sz.height) + temp.create(sz.height + winSize.height * 2, sz.width + winSize.width * 2, img.type()); + + thisLevel = temp(Rect(winSize.width, winSize.height, sz.width, sz.height)); + pyrDown(prevLevel, thisLevel, sz); + + if (pyrBorder != BORDER_TRANSPARENT) + copyMakeBorder(thisLevel, temp, winSize.height, winSize.height, winSize.width, winSize.width, pyrBorder | BORDER_ISOLATED); + temp.adjustROI(-winSize.height, -winSize.height, -winSize.width, -winSize.width); + } + + if (withDerivatives) + { + Mat& deriv = pyramid.getMatRef(level * pyrstep + 1); + + if (!deriv.empty()) + deriv.adjustROI(winSize.height, winSize.height, winSize.width, winSize.width); + if (deriv.type() != derivType || deriv.cols != winSize.width * 2 + sz.width || deriv.rows != winSize.height * 2 + sz.height) + deriv.create(sz.height + winSize.height * 2, sz.width + winSize.width * 2, derivType); + + Mat derivI = deriv(Rect(winSize.width, winSize.height, sz.width, sz.height)); + calcSharrDeriv(thisLevel, derivI); + + if (derivBorder != BORDER_TRANSPARENT) + copyMakeBorder(derivI, deriv, winSize.height, winSize.height, winSize.width, 
winSize.width, derivBorder | BORDER_ISOLATED); + deriv.adjustROI(-winSize.height, -winSize.height, -winSize.width, -winSize.width); + } + + sz = Size(static_cast((sz.width + 1) / levelScale[0]), + static_cast((sz.height + 1) / levelScale[1])); + if (sz.width <= winSize.width || sz.height <= winSize.height) + { + pyramid.create(1, (level + 1) * pyrstep, 0 /*type*/, -1, true);//check this + return level; + } + + prevLevel = thisLevel; + } + + return maxLevel; +} +int CImageBuffer::buildPyramid(cv::Size winSize, int maxLevel, float levelScale[2]) +{ + if (m_Overwrite == false) + return m_maxLevel; + m_maxLevel = buildOpticalFlowPyramidScale(m_Image, m_ImagePyramid, winSize, maxLevel, false, 4, 0, true, levelScale); + return m_maxLevel; +} + +static +void calcLocalOpticalFlowCore( + Ptr prevPyramids[2], + Ptr currPyramids[2], + InputArray _prevPts, + InputOutputArray _nextPts, + const RLOFOpticalFlowParameter & param) +{ + + bool useAdditionalRGB = param.supportRegionType == SR_CROSS; + int iWinSize = param.largeWinSize; + int winSizes[2] = { iWinSize, iWinSize }; + if (param.supportRegionType != SR_FIXED) + { + winSizes[0] = param.smallWinSize; + } + //cv::Size winSize(iWinSize, iWinSize); + cv::TermCriteria criteria(cv::TermCriteria::COUNT | cv::TermCriteria::EPS, param.maxIteration, 0.01); + std::vector rlofNorm = get_norm(param.normSigma0, param.normSigma1); + CV_Assert(winSizes[0] <= winSizes[1]); + + bool usePreComputedCross = winSizes[0] != winSizes[1]; + Mat prevPtsMat = _prevPts.getMat(); + const int derivDepth = DataType::depth; + + CV_Assert(param.maxLevel >= 0 && iWinSize > 2); + + int level = 0, npoints; + CV_Assert((npoints = prevPtsMat.checkVector(2, CV_32F, true)) >= 0); + + if (!(param.useInitialFlow)) + _nextPts.create(prevPtsMat.size(), prevPtsMat.type(), -1, true); + + Mat nextPtsMat = _nextPts.getMat(); + CV_Assert(nextPtsMat.checkVector(2, CV_32F, true) == npoints); + + const Point2f* prevPts = (const Point2f*)prevPtsMat.data; + Point2f* nextPts = (Point2f*)nextPtsMat.data; + std::vector status(npoints); + std::vector err(npoints); + std::vector gainPts(npoints); + + float levelScale[2] = { 2.f,2.f }; + + int maxLevel = prevPyramids[0]->buildPyramid(cv::Size(iWinSize, iWinSize), param.maxLevel, levelScale); + + maxLevel = currPyramids[0]->buildPyramid(cv::Size(iWinSize, iWinSize), maxLevel, levelScale); + if (useAdditionalRGB) + { + prevPyramids[1]->buildPyramid(cv::Size(iWinSize, iWinSize), maxLevel, levelScale); + currPyramids[1]->buildPyramid(cv::Size(iWinSize, iWinSize), maxLevel, levelScale); + } + + if ((criteria.type & TermCriteria::COUNT) == 0) + criteria.maxCount = 30; + else + criteria.maxCount = std::min(std::max(criteria.maxCount, 0), 100); + if ((criteria.type & TermCriteria::EPS) == 0) + criteria.epsilon = 0.001; + else + criteria.epsilon = std::min(std::max(criteria.epsilon, 0.), 10.); + criteria.epsilon *= criteria.epsilon; + + // dI/dx ~ Ix, dI/dy ~ Iy + Mat derivIBuf; + derivIBuf.create(prevPyramids[0]->m_ImagePyramid[0].rows + iWinSize * 2, prevPyramids[0]->m_ImagePyramid[0].cols + iWinSize * 2, CV_MAKETYPE(derivDepth, prevPyramids[0]->m_ImagePyramid[0].channels() * 2)); + + for (level = maxLevel; level >= 0; level--) + { + Mat derivI; + Size imgSize = prevPyramids[0]->getImage(level).size(); + Mat _derivI(imgSize.height + iWinSize * 2, imgSize.width + iWinSize * 2, derivIBuf.type(), derivIBuf.data); + derivI = _derivI(Rect(iWinSize, iWinSize, imgSize.width, imgSize.height)); + calcSharrDeriv(prevPyramids[0]->getImage(level), derivI); + 
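        // Editorial note: derivI is a view into derivIBuf with an iWinSize margin
+        // on every side; the constant border added below fills that margin so the
+        // trackers can read gradients just outside the image without bounds checks.
+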
copyMakeBorder(derivI, _derivI, iWinSize, iWinSize, iWinSize, iWinSize, BORDER_CONSTANT | BORDER_ISOLATED); + + cv::Mat tRGBPrevPyr; + cv::Mat tRGBNextPyr; + if (useAdditionalRGB) + { + tRGBPrevPyr = prevPyramids[1]->getImage(level); + tRGBNextPyr = prevPyramids[1]->getImage(level); + + prevPyramids[1]->m_Overwrite = false; + currPyramids[1]->m_Overwrite = false; + } + + cv::Mat prevImage = prevPyramids[0]->getImage(level); + cv::Mat currImage = currPyramids[0]->getImage(level); + + cv::Mat preCrossMap; + if( usePreComputedCross ) + { + preCalcCrossSegmentation(prevPts, npoints, (float)(1./(1 << level)), tRGBPrevPyr, winSizes[1], preCrossMap, param.crossSegmentationThreshold); + tRGBNextPyr = cv::Mat(); + tRGBPrevPyr = preCrossMap; + } + // apply plk like tracker + prevImage.adjustROI(iWinSize, iWinSize, iWinSize, iWinSize); + currImage.adjustROI(iWinSize, iWinSize, iWinSize, iWinSize); + derivI.adjustROI(iWinSize, iWinSize, iWinSize, iWinSize); + if (isrobust(param) == false) + { + if (param.useIlluminationModel) + { + cv::parallel_for_(cv::Range(0, npoints), + plk::radial::TrackerInvoker( + prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, + prevPts, nextPts, &status[0], &err[0], &gainPts[0], + level, maxLevel, winSizes, + param.maxIteration, + param.useInitialFlow, + param.supportRegionType, + param.minEigenValue, + param.crossSegmentationThreshold)); + } + else + { + if (param.solverType == SolverType::ST_STANDART) + { + cv::parallel_for_(cv::Range(0, npoints), + plk::ica::TrackerInvoker( + prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, + prevPts, nextPts, &status[0], &err[0], + level, maxLevel, winSizes, + param.maxIteration, + param.useInitialFlow, + param.supportRegionType, + param.crossSegmentationThreshold, + param.minEigenValue)); + } + else + { + cv::parallel_for_(cv::Range(0, npoints), + beplk::ica::TrackerInvoker(prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, + prevPts, nextPts, &status[0], &err[0], + level, maxLevel, winSizes, + param.maxIteration, + param.useInitialFlow, + param.supportRegionType, + param.crossSegmentationThreshold, + param.minEigenValue)); + } + } + } + // for robust models + else + { + if (param.useIlluminationModel) + { + if (param.solverType == SolverType::ST_STANDART) + { + cv::parallel_for_(cv::Range(0, npoints), + rlof::radial::TrackerInvoker( + prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, + prevPts, nextPts, &status[0], &err[0], &gainPts[0], + level, maxLevel, winSizes, + param.maxIteration, + param.useInitialFlow, + param.supportRegionType, + rlofNorm, + param.minEigenValue, + param.crossSegmentationThreshold)); + } + else + { + cv::parallel_for_(cv::Range(0, npoints), + berlof::radial::TrackerInvoker(prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, + prevPts, nextPts, &status[0], &err[0], &gainPts[0], + level, maxLevel, winSizes, + param.maxIteration, + param.useInitialFlow, + param.supportRegionType, + param.crossSegmentationThreshold, + rlofNorm, + param.minEigenValue)); + } + } + else + { + + if (param.solverType == SolverType::ST_STANDART) + { + cv::parallel_for_(cv::Range(0, npoints), + rlof::ica::TrackerInvoker( + prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, + prevPts, nextPts, &status[0], &err[0], + level, maxLevel, winSizes, + param.maxIteration, + param.useInitialFlow, + param.supportRegionType, + rlofNorm, + param.minEigenValue, + param.crossSegmentationThreshold)); + } + else + { + cv::parallel_for_(cv::Range(0, npoints), + berlof::ica::TrackerInvoker(prevImage, derivI, currImage, 
tRGBPrevPyr, tRGBNextPyr, + prevPts, nextPts, &status[0], &err[0], + level, maxLevel, winSizes, + param.maxIteration, + param.useInitialFlow, + param.supportRegionType, + param.crossSegmentationThreshold, + rlofNorm, + param.minEigenValue)); + } + + } + } + + prevPyramids[0]->m_Overwrite = true; + currPyramids[0]->m_Overwrite = true; + } +} + +static +void preprocess(Ptr prevPyramids[2], + Ptr currPyramids[2], + const std::vector & prevPoints, + std::vector & currPoints, + const RLOFOpticalFlowParameter & param) +{ + cv::Mat mask, homography; + if (param.useGlobalMotionPrior == false) + return; + + currPoints.resize(prevPoints.size()); + + RLOFOpticalFlowParameter gmeTrackerParam = param; + gmeTrackerParam.useGlobalMotionPrior = false; + gmeTrackerParam.largeWinSize = 17; + // use none robust tracker for global motion estimation since it is faster + gmeTrackerParam.normSigma0 = std::numeric_limits::max(); + gmeTrackerParam.maxIteration = MAX(15, param.maxIteration); + gmeTrackerParam.minEigenValue = 0.000001f; + + std::vector gmPrevPoints, gmCurrPoints; + + // Initialize point grid + int stepr = prevPyramids[0]->m_Image.rows / 30; + int stepc = prevPyramids[0]->m_Image.cols / 40; + for (int r = stepr / 2; r < prevPyramids[0]->m_Image.rows; r += stepr) + { + for (int c = stepc / 2; c < prevPyramids[0]->m_Image.cols; c += stepc) + { + gmPrevPoints.push_back(cv::Point2f(static_cast(c), static_cast(r))); + } + } + + // perform motion estimation + calcLocalOpticalFlowCore(prevPyramids, currPyramids, gmPrevPoints, gmCurrPoints, gmeTrackerParam); + + cv::Mat prevPointsMat(static_cast(gmPrevPoints.size()), 1, CV_32FC2); + cv::Mat currPointsMat(static_cast(gmPrevPoints.size()), 1, CV_32FC2); + cv::Mat distMat(static_cast(gmPrevPoints.size()), 1, CV_32FC1); + + // Forward backward confidence to estimate optimal ransac reprojection error + int noPoints = 0; + for (unsigned int n = 0; n < gmPrevPoints.size(); n++) + { + cv::Point2f flow = gmCurrPoints[n] - gmPrevPoints[n]; + prevPointsMat.at(noPoints) = gmPrevPoints[n]; + currPointsMat.at(noPoints) = gmCurrPoints[n]; + distMat.at(noPoints) = flow.x * flow.x + flow.y* flow.y; + if (isnan(distMat.at(noPoints)) == false) + noPoints++; + } + + float medianDist = (param.globalMotionRansacThreshold == 0) ? 
1.f : + quickselect(distMat, static_cast(noPoints * static_cast(param.globalMotionRansacThreshold) / 100.f)); + medianDist = sqrt(medianDist); + + if (noPoints < 8) + return; + + cv::findHomography(prevPointsMat(cv::Rect(0, 0, 1, noPoints)), currPointsMat(cv::Rect(0, 0, 1, noPoints)), cv::RANSAC, medianDist, mask).convertTo(homography, CV_32FC1); + + if (homography.empty()) + return; + + cv::perspectiveTransform(prevPoints, currPoints, homography); +} + +void calcLocalOpticalFlow( + const Mat prevImage, + const Mat currImage, + Ptr prevPyramids[2], + Ptr currPyramids[2], + const std::vector & prevPoints, + std::vector & currPoints, + const RLOFOpticalFlowParameter & param) +{ + if (prevImage.empty() == false && currImage.empty()== false) + { + prevPyramids[0]->m_Overwrite = true; + currPyramids[0]->m_Overwrite = true; + prevPyramids[1]->m_Overwrite = true; + currPyramids[1]->m_Overwrite = true; + if (prevImage.type() == CV_8UC3) + { + prevPyramids[0]->setGrayFromRGB(prevImage); + currPyramids[0]->setGrayFromRGB(currImage); + prevPyramids[1]->setImage(prevImage); + currPyramids[1]->setImage(currImage); + + if (param.supportRegionType == SR_CROSS) + { + prevPyramids[1]->setBlurFromRGB(prevImage); + currPyramids[1]->setBlurFromRGB(currImage); + } + } + else + { + prevPyramids[0]->setImage(prevImage); + currPyramids[0]->setImage(currImage); + } + } + preprocess(prevPyramids, currPyramids, prevPoints, currPoints, param); + RLOFOpticalFlowParameter internParam = param; + if (param.useGlobalMotionPrior == true) + internParam.useInitialFlow = true; + calcLocalOpticalFlowCore(prevPyramids, currPyramids, prevPoints, currPoints, internParam); +} + +}} // namespace diff --git a/modules/optflow/src/rlof/rlof_localflow.h b/modules/optflow/src/rlof/rlof_localflow.h new file mode 100644 index 00000000000..9ffe42cea3e --- /dev/null +++ b/modules/optflow/src/rlof/rlof_localflow.h @@ -0,0 +1,117 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +#ifndef _RLOF_LOCALFLOW_H_ +#define _RLOF_LOCALFLOW_H_ +#include +#include +#include +#include +#include "opencv2/imgproc.hpp" +#include "opencv2/optflow/rlofflow.hpp" +//! Fast median estimation method based on @cite Tibshirani2008. 
This implementation relates to http://www.stat.cmu.edu/~ryantibs/median/
+using namespace cv;
+template<typename T>
+T quickselect(const Mat & inp, int k)
+{
+    unsigned long i;
+    unsigned long ir;
+    unsigned long j;
+    unsigned long l;
+    unsigned long mid;
+    Mat values = inp.clone();
+    T a;
+
+    l = 0;
+    ir = MAX(values.rows, values.cols) - 1;
+    while(true)
+    {
+        if (ir <= l + 1)
+        {
+            if (ir == l + 1 && values.at<T>(ir) < values.at<T>(l))
+                std::swap(values.at<T>(l), values.at<T>(ir));
+            return values.at<T>(k);
+        }
+        else
+        {
+            mid = (l + ir) >> 1;
+            std::swap(values.at<T>(mid), values.at<T>(l+1));
+            if (values.at<T>(l) > values.at<T>(ir))
+                std::swap(values.at<T>(l), values.at<T>(ir));
+            if (values.at<T>(l+1) > values.at<T>(ir))
+                std::swap(values.at<T>(l+1), values.at<T>(ir));
+            if (values.at<T>(l) > values.at<T>(l+1))
+                std::swap(values.at<T>(l), values.at<T>(l+1));
+            i = l + 1;
+            j = ir;
+            a = values.at<T>(l+1);
+            while (true)
+            {
+                do
+                {
+                    i++;
+                }
+                while (values.at<T>(i) < a);
+                do
+                {
+                    j--;
+                }
+                while (values.at<T>(j) > a);
+                if (j < i) break;
+                std::swap(values.at<T>(i), values.at<T>(j));
+            }
+            values.at<T>(l+1) = values.at<T>(j);
+            values.at<T>(j) = a;
+            if (j >= static_cast<unsigned long>(k)) ir = j - 1;
+            if (j <= static_cast<unsigned long>(k)) l = i;
+        }
+    }
+}
+
+namespace cv {
+namespace optflow {
+
+class CImageBuffer
+{
+public:
+    CImageBuffer()
+        : m_Overwrite(true)
+    {}
+    void setGrayFromRGB(const cv::Mat & inp)
+    {
+        if(m_Overwrite)
+            cv::cvtColor(inp, m_Image, cv::COLOR_BGR2GRAY);
+    }
+    void setImage(const cv::Mat & inp)
+    {
+        if(m_Overwrite)
+            inp.copyTo(m_Image);
+    }
+    void setBlurFromRGB(const cv::Mat & inp)
+    {
+        if(m_Overwrite)
+            cv::GaussianBlur(inp, m_BlurredImage, cv::Size(7,7), -1);
+    }
+
+    int buildPyramid(cv::Size winSize, int maxLevel, float levelScale[2]);
+    cv::Mat & getImage(int level) {return m_ImagePyramid[level];}
+
+    std::vector<cv::Mat> m_ImagePyramid;
+    cv::Mat m_BlurredImage;
+    cv::Mat m_Image;
+    std::vector<cv::Mat> m_CrossPyramid;
+    int m_maxLevel;
+    bool m_Overwrite;
+};
+
+void calcLocalOpticalFlow(
+    const Mat prevImage,
+    const Mat currImage,
+    Ptr<CImageBuffer> prevPyramids[2],
+    Ptr<CImageBuffer> currPyramids[2],
+    const std::vector<Point2f> & prevPoints,
+    std::vector<Point2f> & currPoints,
+    const RLOFOpticalFlowParameter & param);
+
+}} // namespace
+#endif
diff --git a/modules/optflow/src/rlofflow.cpp b/modules/optflow/src/rlofflow.cpp
new file mode 100644
index 00000000000..5ee07005eb7
--- /dev/null
+++ b/modules/optflow/src/rlofflow.cpp
@@ -0,0 +1,399 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
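// ---------------------------------------------------------------------------
// Editorial sketch (not part of the patch): the quickselect() template above
// returns the k-th smallest element of a row or column vector; with
// k = n / 2 it yields the median used for the residual scale and the RANSAC
// threshold. It clones the input and partitions in place, so the source Mat
// is untouched. Typical use, assuming rlof_localflow.h is included:
#include <algorithm>
#include <opencv2/core.hpp>
static float medianOf(const cv::Mat& values) // expects an n x 1 CV_32FC1 Mat
{
    const int n = std::max(values.rows, values.cols);
    return quickselect<float>(values, n / 2);
}
// ---------------------------------------------------------------------------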
+#include "precomp.hpp"
+#include "rlof/rlof_localflow.h"
+#include "rlof/geo_interpolation.hpp"
+#include "opencv2/ximgproc.hpp"
+
+namespace cv {
+namespace optflow {
+
+Ptr<RLOFOpticalFlowParameter> RLOFOpticalFlowParameter::create()
+{
+    return Ptr<RLOFOpticalFlowParameter>(new RLOFOpticalFlowParameter);
+}
+
+void RLOFOpticalFlowParameter::setSolverType(SolverType val){ solverType = val;}
+SolverType RLOFOpticalFlowParameter::getSolverType() const { return solverType;}
+
+void RLOFOpticalFlowParameter::setSupportRegionType(SupportRegionType val){ supportRegionType = val;}
+SupportRegionType RLOFOpticalFlowParameter::getSupportRegionType() const { return supportRegionType;}
+
+void RLOFOpticalFlowParameter::setNormSigma0(float val){ normSigma0 = val;}
+float RLOFOpticalFlowParameter::getNormSigma0() const { return normSigma0;}
+
+void RLOFOpticalFlowParameter::setNormSigma1(float val){ normSigma1 = val;}
+float RLOFOpticalFlowParameter::getNormSigma1() const { return normSigma1;}
+
+void RLOFOpticalFlowParameter::setSmallWinSize(int val){ smallWinSize = val;}
+int RLOFOpticalFlowParameter::getSmallWinSize() const { return smallWinSize;}
+
+void RLOFOpticalFlowParameter::setLargeWinSize(int val){ largeWinSize = val;}
+int RLOFOpticalFlowParameter::getLargeWinSize() const { return largeWinSize;}
+
+void RLOFOpticalFlowParameter::setCrossSegmentationThreshold(int val){ crossSegmentationThreshold = val;}
+int RLOFOpticalFlowParameter::getCrossSegmentationThreshold() const { return crossSegmentationThreshold;}
+
+void RLOFOpticalFlowParameter::setMaxLevel(int val){ maxLevel = val;}
+int RLOFOpticalFlowParameter::getMaxLevel() const { return maxLevel;}
+
+void RLOFOpticalFlowParameter::setUseInitialFlow(bool val){ useInitialFlow = val;}
+bool RLOFOpticalFlowParameter::getUseInitialFlow() const { return useInitialFlow;}
+
+void RLOFOpticalFlowParameter::setUseIlluminationModel(bool val){ useIlluminationModel = val;}
+bool RLOFOpticalFlowParameter::getUseIlluminationModel() const { return useIlluminationModel;}
+
+void RLOFOpticalFlowParameter::setUseGlobalMotionPrior(bool val){ useGlobalMotionPrior = val;}
+bool RLOFOpticalFlowParameter::getUseGlobalMotionPrior() const { return useGlobalMotionPrior;}
+
+void RLOFOpticalFlowParameter::setMaxIteration(int val){ maxIteration = val;}
+int RLOFOpticalFlowParameter::getMaxIteration() const { return maxIteration;}
+
+void RLOFOpticalFlowParameter::setMinEigenValue(float val){ minEigenValue = val;}
+float RLOFOpticalFlowParameter::getMinEigenValue() const { return minEigenValue;}
+
+void RLOFOpticalFlowParameter::setGlobalMotionRansacThreshold(float val){ globalMotionRansacThreshold = val;}
+float RLOFOpticalFlowParameter::getGlobalMotionRansacThreshold() const { return globalMotionRansacThreshold;}
+
+class DenseOpticalFlowRLOFImpl : public DenseRLOFOpticalFlow
+{
+public:
+    DenseOpticalFlowRLOFImpl()
+        : param(Ptr<RLOFOpticalFlowParameter>(new RLOFOpticalFlowParameter))
+        , forwardBackwardThreshold(1.f)
+        , gridStep(6, 6)
+        , interp_type(InterpolationType::INTERP_GEO)
+        , k(128)
+        , sigma(0.05f)
+        , lambda(999.f)
+        , fgs_lambda(500.0f)
+        , fgs_sigma(1.5f)
+        , use_post_proc(true)
+
+    {
+        prevPyramid[0] = cv::Ptr<CImageBuffer>(new CImageBuffer);
+        prevPyramid[1] = cv::Ptr<CImageBuffer>(new CImageBuffer);
+        currPyramid[0] = cv::Ptr<CImageBuffer>(new CImageBuffer);
+        currPyramid[1] = cv::Ptr<CImageBuffer>(new CImageBuffer);
+    }
+    virtual void setRLOFOpticalFlowParameter(Ptr<RLOFOpticalFlowParameter> val) CV_OVERRIDE { param = val; }
+    virtual Ptr<RLOFOpticalFlowParameter> getRLOFOpticalFlowParameter() const CV_OVERRIDE { return param; }
+
+    virtual float getForwardBackward() const CV_OVERRIDE { return forwardBackwardThreshold; }
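+    // Editorial note: points whose forward-backward distance exceeds this
+    // threshold are rejected before densification; a value <= 0 disables the
+    // consistency check (see calc() below).
+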
virtual void setForwardBackward(float val) CV_OVERRIDE { forwardBackwardThreshold = val; } + + virtual void setInterpolation(InterpolationType val) CV_OVERRIDE { interp_type = val; } + virtual InterpolationType getInterpolation() const CV_OVERRIDE { return interp_type; } + + virtual Size getGridStep() const CV_OVERRIDE { return gridStep; } + virtual void setGridStep(Size val) CV_OVERRIDE { gridStep = val; } + + virtual int getEPICK() const CV_OVERRIDE { return k; } + virtual void setEPICK(int val) CV_OVERRIDE { k = val; } + + virtual float getEPICSigma() const CV_OVERRIDE { return sigma; } + virtual void setEPICSigma(float val) CV_OVERRIDE { sigma = val; } + + virtual float getEPICLambda() const CV_OVERRIDE { return lambda; } + virtual void setEPICLambda(float val) CV_OVERRIDE { lambda = val; } + + virtual float getFgsLambda() const CV_OVERRIDE { return fgs_lambda; } + virtual void setFgsLambda(float val) CV_OVERRIDE { fgs_lambda = val; } + + virtual float getFgsSigma() const CV_OVERRIDE { return fgs_sigma; } + virtual void setFgsSigma(float val) CV_OVERRIDE { fgs_sigma = val; } + + virtual bool getUsePostProc() const CV_OVERRIDE { return use_post_proc; } + virtual void setUsePostProc(bool val) CV_OVERRIDE { use_post_proc = val; } + + virtual void calc(InputArray I0, InputArray I1, InputOutputArray flow) CV_OVERRIDE + { + CV_Assert(!I0.empty() && I0.depth() == CV_8U && (I0.channels() == 3 || I0.channels() == 1)); + CV_Assert(!I1.empty() && I1.depth() == CV_8U && (I1.channels() == 3 || I1.channels() == 1)); + CV_Assert(I0.sameSize(I1)); + if (param.empty()) + param = Ptr(new RLOFOpticalFlowParameter()); + if (param->supportRegionType == SR_CROSS) + CV_Assert( I0.channels() == 3 && I1.channels() == 3); + CV_Assert(interp_type == InterpolationType::INTERP_EPIC || interp_type == InterpolationType::INTERP_GEO); + // if no parameter is used use the default parameter + + Mat prevImage = I0.getMat(); + Mat currImage = I1.getMat(); + int noPoints = prevImage.cols * prevImage.rows; + std::vector prevPoints(noPoints); + std::vector currPoints, refPoints; + noPoints = 0; + cv::Size grid_h = gridStep / 2; + for (int r = grid_h.height; r < prevImage.rows - grid_h.height; r += gridStep.height) + { + for (int c = grid_h.width; c < prevImage.cols - grid_h.width; c += gridStep.width) + { + prevPoints[noPoints++] = cv::Point2f(static_cast(c), static_cast(r)); + } + } + prevPoints.erase(prevPoints.begin() + noPoints, prevPoints.end()); + currPoints.resize(prevPoints.size()); + calcLocalOpticalFlow(prevImage, currImage, prevPyramid, currPyramid, prevPoints, currPoints, *(param.get())); + flow.create(prevImage.size(), CV_32FC2); + Mat dense_flow = flow.getMat(); + + std::vector filtered_prevPoints; + std::vector filtered_currPoints; + if (gridStep == cv::Size(1, 1) && forwardBackwardThreshold <= 0) + { + for (unsigned int n = 0; n < prevPoints.size(); n++) + { + dense_flow.at(prevPoints[n]) = currPoints[n] - prevPoints[n]; + } + return; + } + if (forwardBackwardThreshold > 0) + { + // reuse image pyramids + calcLocalOpticalFlow(currImage, prevImage, currPyramid, prevPyramid, currPoints, refPoints, *(param.get())); + + filtered_prevPoints.resize(prevPoints.size()); + filtered_currPoints.resize(prevPoints.size()); + float sqrForwardBackwardThreshold = forwardBackwardThreshold * forwardBackwardThreshold; + noPoints = 0; + for (unsigned int r = 0; r < refPoints.size(); r++) + { + Point2f diff = refPoints[r] - prevPoints[r]; + if (diff.x * diff.x + diff.y * diff.y < sqrForwardBackwardThreshold) + { + 
filtered_prevPoints[noPoints] = prevPoints[r]; + filtered_currPoints[noPoints++] = currPoints[r]; + } + } + + filtered_prevPoints.erase(filtered_prevPoints.begin() + noPoints, filtered_prevPoints.end()); + filtered_currPoints.erase(filtered_currPoints.begin() + noPoints, filtered_currPoints.end()); + + } + else + { + filtered_prevPoints = prevPoints; + filtered_currPoints = currPoints; + } + + if (interp_type == InterpolationType::INTERP_EPIC) + { + Ptr gd = ximgproc::createEdgeAwareInterpolator(); + gd->setK(k); + gd->setSigma(sigma); + gd->setLambda(lambda); + gd->setUsePostProcessing(false); + gd->interpolate(prevImage, filtered_prevPoints, currImage, filtered_currPoints, dense_flow); + } + else + { + Mat blurredPrevImage, blurredCurrImage; + GaussianBlur(prevImage, blurredPrevImage, cv::Size(5, 5), -1); + std::vector status(filtered_currPoints.size(), 1); + interpolate_irregular_nn_raster(filtered_prevPoints, filtered_currPoints, status, blurredPrevImage).copyTo(dense_flow); + std::vector vecMats; + std::vector vecMats2(2); + cv::split(dense_flow, vecMats); + cv::bilateralFilter(vecMats[0], vecMats2[0], 5, 2, 20); + cv::bilateralFilter(vecMats[1], vecMats2[1], 5, 2, 20); + cv::merge(vecMats2, dense_flow); + } + if (use_post_proc) + { + ximgproc::fastGlobalSmootherFilter(prevImage, flow, flow, fgs_lambda, fgs_sigma); + } + } + + virtual void collectGarbage() CV_OVERRIDE + { + prevPyramid[0].release(); + prevPyramid[1].release(); + currPyramid[0].release(); + currPyramid[1].release(); + } + +protected: + Ptr param; + float forwardBackwardThreshold; + Ptr prevPyramid[2]; + Ptr currPyramid[2]; + cv::Size gridStep; + InterpolationType interp_type; + int k; + float sigma; + float lambda; + float fgs_lambda; + float fgs_sigma; + bool use_post_proc; +}; + +Ptr DenseRLOFOpticalFlow::create( + Ptr rlofParam, + float forwardBackwardThreshold, + cv::Size gridStep, + InterpolationType interp_type, + int epicK, + float epicSigma, + float epicLambda, + bool use_post_proc, + float fgs_lambda, + float fgs_sigma) +{ + Ptr algo = makePtr(); + algo->setRLOFOpticalFlowParameter(rlofParam); + algo->setForwardBackward(forwardBackwardThreshold); + algo->setGridStep(gridStep); + algo->setInterpolation(interp_type); + algo->setEPICK(epicK); + algo->setEPICSigma(epicSigma); + algo->setEPICLambda(epicLambda); + algo->setUsePostProc(use_post_proc); + algo->setFgsLambda(fgs_lambda); + algo->setFgsSigma(fgs_sigma); + return algo; +} + +class SparseRLOFOpticalFlowImpl : public SparseRLOFOpticalFlow +{ + public: + SparseRLOFOpticalFlowImpl() + : param(Ptr(new RLOFOpticalFlowParameter)) + , forwardBackwardThreshold(1.f) + { + prevPyramid[0] = cv::Ptr< CImageBuffer>(new CImageBuffer); + prevPyramid[1] = cv::Ptr< CImageBuffer>(new CImageBuffer); + currPyramid[0] = cv::Ptr< CImageBuffer>(new CImageBuffer); + currPyramid[1] = cv::Ptr< CImageBuffer>(new CImageBuffer); + } + virtual void setRLOFOpticalFlowParameter(Ptr val) CV_OVERRIDE { param = val; } + virtual Ptr getRLOFOpticalFlowParameter() const CV_OVERRIDE { return param; } + + virtual float getForwardBackward() const CV_OVERRIDE { return forwardBackwardThreshold; } + virtual void setForwardBackward(float val) CV_OVERRIDE { forwardBackwardThreshold = val; } + + virtual void calc(InputArray prevImg, InputArray nextImg, + InputArray prevPts, InputOutputArray nextPts, + OutputArray status, + OutputArray err) CV_OVERRIDE + { + CV_Assert(!prevImg.empty() && prevImg.depth() == CV_8U && (prevImg.channels() == 3 || prevImg.channels() == 1)); + CV_Assert(!nextImg.empty() && 
nextImg.depth() == CV_8U && (nextImg.channels() == 3 || nextImg.channels() == 1));
+        CV_Assert(prevImg.sameSize(nextImg));
+
+        Mat prevImage = prevImg.getMat();
+        Mat nextImage = nextImg.getMat();
+        Mat prevPtsMat = prevPts.getMat();
+
+        if (param.empty())
+        {
+            param = Ptr<RLOFOpticalFlowParameter>(new RLOFOpticalFlowParameter);
+        }
+
+        if (param->useInitialFlow == false)
+            nextPts.create(prevPtsMat.size(), prevPtsMat.type(), -1, true);
+
+        int npoints = 0;
+        CV_Assert((npoints = prevPtsMat.checkVector(2, CV_32F, true)) >= 0);
+
+        if (npoints == 0)
+        {
+            nextPts.release();
+            status.release();
+            err.release();
+            return;
+        }
+
+        Mat nextPtsMat = nextPts.getMat();
+        CV_Assert(nextPtsMat.checkVector(2, CV_32F, true) == npoints);
+        std::vector<cv::Point2f> prevPoints(npoints), nextPoints(npoints), refPoints;
+        prevPtsMat.copyTo(cv::Mat(1, npoints, CV_32FC2, &prevPoints[0]));
+        if (param->useInitialFlow )
+            nextPtsMat.copyTo(cv::Mat(1, nextPtsMat.cols, CV_32FC2, &nextPoints[0]));
+
+        cv::Mat statusMat;
+        cv::Mat errorMat;
+        if (status.needed() || forwardBackwardThreshold > 0)
+        {
+            status.create((int)npoints, 1, CV_8U, -1, true);
+            statusMat = status.getMat();
+            statusMat.setTo(1);
+        }
+
+        if (err.needed() || forwardBackwardThreshold > 0)
+        {
+            err.create((int)npoints, 1, CV_32F, -1, true);
+            errorMat = err.getMat();
+            errorMat.setTo(0);
+        }
+
+        calcLocalOpticalFlow(prevImage, nextImage, prevPyramid, currPyramid, prevPoints, nextPoints, *(param.get()));
+        cv::Mat(1, npoints, CV_32FC2, &nextPoints[0]).copyTo(nextPtsMat);
+        if (forwardBackwardThreshold > 0)
+        {
+            // reuse image pyramids
+            calcLocalOpticalFlow(nextImage, prevImage, currPyramid, prevPyramid, nextPoints, refPoints, *(param.get()));
+        }
+        for (unsigned int r = 0; r < refPoints.size(); r++)
+        {
+            Point2f diff = refPoints[r] - prevPoints[r];
+            errorMat.at<float>(r) = sqrt(diff.x * diff.x + diff.y * diff.y);
+            if (errorMat.at<float>(r) > forwardBackwardThreshold)
+                statusMat.at<uchar>(r) = 0;
+        }
+
+    }
+
+protected:
+    Ptr<RLOFOpticalFlowParameter> param;
+    float forwardBackwardThreshold;
+    Ptr<CImageBuffer> prevPyramid[2];
+    Ptr<CImageBuffer> currPyramid[2];
+};
+
+Ptr<SparseRLOFOpticalFlow> SparseRLOFOpticalFlow::create(
+    Ptr<RLOFOpticalFlowParameter> rlofParam,
+    float forwardBackwardThreshold)
+{
+    Ptr<SparseRLOFOpticalFlow> algo = makePtr<SparseRLOFOpticalFlowImpl>();
+    algo->setRLOFOpticalFlowParameter(rlofParam);
+    algo->setForwardBackward(forwardBackwardThreshold);
+    return algo;
+}
+
+void calcOpticalFlowDenseRLOF(InputArray I0, InputArray I1, InputOutputArray flow,
+    Ptr<RLOFOpticalFlowParameter> rlofParam,
+    float forwardBackwardThreshold, Size gridStep,
+    InterpolationType interp_type,
+    int epicK, float epicSigma, float epicLambda,
+    bool use_post_proc, float fgsLambda, float fgsSigma)
+{
+    Ptr<DenseRLOFOpticalFlow> algo = DenseRLOFOpticalFlow::create(
+        rlofParam, forwardBackwardThreshold, gridStep, interp_type,
+        epicK, epicSigma, epicLambda, use_post_proc, fgsLambda, fgsSigma);
+    algo->calc(I0, I1, flow);
+    algo->collectGarbage();
+}
+
+void calcOpticalFlowSparseRLOF(InputArray prevImg, InputArray nextImg,
+    InputArray prevPts, InputOutputArray nextPts,
+    OutputArray status, OutputArray err,
+    Ptr<RLOFOpticalFlowParameter> rlofParam,
+    float forwardBackwardThreshold)
+{
+    Ptr<SparseRLOFOpticalFlow> algo = SparseRLOFOpticalFlow::create(
+        rlofParam, forwardBackwardThreshold);
+    algo->calc(prevImg, nextImg, prevPts, nextPts, status, err);
+}
+Ptr<DenseOpticalFlow> createOptFlow_DenseRLOF()
+{
+    return DenseRLOFOpticalFlow::create();
+}
+
+Ptr<SparseOpticalFlow> createOptFlow_SparseRLOF()
+{
+    return SparseRLOFOpticalFlow::create();
+}
+
+}} // namespace
diff --git a/modules/optflow/test/test_OF_accuracy.cpp b/modules/optflow/test/test_OF_accuracy.cpp
index 6a5ed702e48..5d093cd49c3 100644
--- a/modules/optflow/test/test_OF_accuracy.cpp
+++
b/modules/optflow/test/test_OF_accuracy.cpp @@ -83,7 +83,20 @@ static float calcRMSE(Mat flow1, Mat flow2) } return (float)sqrt(sum / (1e-9 + counter)); } - +static float calcRMSE(vector prevPts, vector currPts, Mat flow) +{ + vector ee; + for (unsigned int n = 0; n < prevPts.size(); n++) + { + Point2f gtFlow = flow.at(prevPts[n]); + if (isFlowCorrect(gtFlow.x) && isFlowCorrect(gtFlow.y)) + { + Point2f diffFlow = (currPts[n] - prevPts[n]) - gtFlow; + ee.push_back(sqrt(diffFlow.x * diffFlow.x + diffFlow.y * diffFlow.y)); + } + } + return static_cast(mean(ee).val[0]); +} static float calcAvgEPE(vector< pair > corr, Mat flow) { double sum = 0; @@ -111,9 +124,13 @@ static float calcAvgEPE(vector< pair > corr, Mat flow) bool readRubberWhale(Mat &dst_frame_1, Mat &dst_frame_2, Mat &dst_GT) { - const string frame1_path = getRubberWhaleFrame1(); - const string frame2_path = getRubberWhaleFrame2(); - const string gt_flow_path = getRubberWhaleGroundTruth(); + string frame1_path = getRubberWhaleFrame1(); + string frame2_path = getRubberWhaleFrame2(); + string gt_flow_path = getRubberWhaleGroundTruth(); + // removing space may be an issue on windows machines + frame1_path.erase(std::remove_if(frame1_path.begin(), frame1_path.end(), isspace), frame1_path.end()); + frame2_path.erase(std::remove_if(frame2_path.begin(), frame2_path.end(), isspace), frame2_path.end()); + gt_flow_path.erase(std::remove_if(gt_flow_path.begin(), gt_flow_path.end(), isspace), gt_flow_path.end()); dst_frame_1 = imread(frame1_path); dst_frame_2 = imread(frame2_path); @@ -125,6 +142,7 @@ bool readRubberWhale(Mat &dst_frame_1, Mat &dst_frame_2, Mat &dst_GT) return true; } + TEST(DenseOpticalFlow_SimpleFlow, ReferenceAccuracy) { Mat frame1, frame2, GT; @@ -157,6 +175,102 @@ TEST(DenseOpticalFlow_DeepFlow, ReferenceAccuracy) EXPECT_LE(calcRMSE(GT, flow), target_RMSE); } +TEST(SparseOpticalFlow, ReferenceAccuracy) +{ + // with the following test each invoker class should be tested once + Mat frame1, frame2, GT; + ASSERT_TRUE(readRubberWhale(frame1, frame2, GT)); + vector prevPts, currPts; + for (int r = 0; r < frame1.rows; r+=10) + { + for (int c = 0; c < frame1.cols; c+=10) + { + prevPts.push_back(Point2f(static_cast(c), static_cast(r))); + } + } + vector status(prevPts.size()); + vector err(prevPts.size()); + Ptr algo = SparseRLOFOpticalFlow::create(); + algo->setForwardBackward(0.0f); + Ptr param = Ptr(new RLOFOpticalFlowParameter); + param->supportRegionType = SR_CROSS; + param->useIlluminationModel = true; + param->solverType = ST_BILINEAR; + algo->setRLOFOpticalFlowParameter(param); + algo->calc(frame1, frame2, prevPts, currPts, status, err); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.3f); + + param->solverType = ST_STANDART; + algo->setRLOFOpticalFlowParameter(param); + algo->calc(frame1, frame2, prevPts, currPts, status, err); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.34f); + + param->useIlluminationModel = false; + param->solverType = ST_BILINEAR; + algo->setRLOFOpticalFlowParameter(param); + algo->calc(frame1, frame2, prevPts, currPts, status, err); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.27f); + + param->solverType = ST_STANDART; + algo->setRLOFOpticalFlowParameter(param); + algo->calc(frame1, frame2, prevPts, currPts, status, err); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.27f); + + param->normSigma0 = numeric_limits::max(); + param->normSigma1 = numeric_limits::max(); + param->useIlluminationModel = true; + + param->solverType = ST_BILINEAR; + algo->setRLOFOpticalFlowParameter(param); + 
algo->calc(frame1, frame2, prevPts, currPts, status, err); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.28f); + + param->solverType = ST_STANDART; + algo->setRLOFOpticalFlowParameter(param); + algo->calc(frame1, frame2, prevPts, currPts, status, err); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.28f); + + param->useIlluminationModel = false; + + param->solverType = ST_BILINEAR; + algo->setRLOFOpticalFlowParameter(param); + algo->calc(frame1, frame2, prevPts, currPts, status, err); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.80f); + + param->solverType = ST_STANDART; + algo->setRLOFOpticalFlowParameter(param); + algo->calc(frame1, frame2, prevPts, currPts, status, err); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.28f); +} + +TEST(DenseOpticalFlow_RLOF, ReferenceAccuracy) +{ + Mat frame1, frame2, GT; + ASSERT_TRUE(readRubberWhale(frame1, frame2, GT)); + Mat flow; + Ptr algo = DenseRLOFOpticalFlow::create(); + Ptr param = Ptr(new RLOFOpticalFlowParameter); + param->supportRegionType = SR_CROSS; + param->solverType = ST_BILINEAR; + algo->setRLOFOpticalFlowParameter(param); + algo->setForwardBackward(1.0f); + algo->setGridStep(cv::Size(4, 4)); + algo->setInterpolation(INTERP_EPIC); + algo->calc(frame1, frame2, flow); + + ASSERT_EQ(GT.rows, flow.rows); + ASSERT_EQ(GT.cols, flow.cols); + EXPECT_LE(calcRMSE(GT, flow), 0.44f); + + algo->setInterpolation(INTERP_GEO); + algo->calc(frame1, frame2, flow); + + ASSERT_EQ(GT.rows, flow.rows); + ASSERT_EQ(GT.cols, flow.cols); + EXPECT_LE(calcRMSE(GT, flow), 0.55f); + +} + TEST(DenseOpticalFlow_SparseToDenseFlow, ReferenceAccuracy) { Mat frame1, frame2, GT;