From 04830a7018db749253f98fddaed7c964a43b654a Mon Sep 17 00:00:00 2001 From: tsenst Date: Sun, 9 Dec 2018 17:38:41 +0100 Subject: [PATCH 01/25] Add robust local optical flow (RLOF) implementations which is an improved pyramidal iterative Lucas-Kanade approach. This implementations contains interfaces for sparse optical flow for feature tracking and dense optical flow based on sparse-to-dense interpolation schemes. Add performance and accuracy tests have been implementation as well as documentation with the related publications --- modules/optflow/doc/optflow.bib | 49 + modules/optflow/include/opencv2/optflow.hpp | 3 +- .../include/opencv2/optflow/rlofflow.hpp | 482 ++++ modules/optflow/include/opencv2/optflow_o.hpp | 309 +++ modules/optflow/perf/perf_rlof.cpp | 89 + modules/optflow/src/rlof/berlof_invoker.hpp | 2231 +++++++++++++++++ .../optflow/src/rlof/geo_interpolation.cpp | 525 ++++ .../optflow/src/rlof/geo_interpolation.hpp | 75 + modules/optflow/src/rlof/plk_invoker.hpp | 1141 +++++++++ modules/optflow/src/rlof/rlof_invoker.hpp | 1513 +++++++++++ modules/optflow/src/rlof/rlof_invokerbase.hpp | 309 +++ modules/optflow/src/rlof/rlof_localflow.cpp | 586 +++++ modules/optflow/src/rlof/rlof_localflow.h | 158 ++ modules/optflow/src/rlofflow.cpp | 385 +++ modules/optflow/test/test_OF_accuracy.cpp | 128 +- 15 files changed, 7978 insertions(+), 5 deletions(-) create mode 100644 modules/optflow/include/opencv2/optflow/rlofflow.hpp create mode 100644 modules/optflow/include/opencv2/optflow_o.hpp create mode 100644 modules/optflow/perf/perf_rlof.cpp create mode 100644 modules/optflow/src/rlof/berlof_invoker.hpp create mode 100644 modules/optflow/src/rlof/geo_interpolation.cpp create mode 100644 modules/optflow/src/rlof/geo_interpolation.hpp create mode 100644 modules/optflow/src/rlof/plk_invoker.hpp create mode 100644 modules/optflow/src/rlof/rlof_invoker.hpp create mode 100644 modules/optflow/src/rlof/rlof_invokerbase.hpp create mode 100644 
modules/optflow/src/rlof/rlof_localflow.cpp create mode 100644 modules/optflow/src/rlof/rlof_localflow.h create mode 100644 modules/optflow/src/rlofflow.cpp diff --git a/modules/optflow/doc/optflow.bib b/modules/optflow/doc/optflow.bib index c36504b3e3c..616686e2698 100644 --- a/modules/optflow/doc/optflow.bib +++ b/modules/optflow/doc/optflow.bib @@ -61,3 +61,52 @@ @inproceedings{Wang_2016_CVPR month = {June}, year = {2016} } + +@inproceedings{Geistert2016, + author = {Jonas Geistert and Tobias Senst and Thomas Sikora}, + title = {Robust Local Optical Flow: Dense Motion Vector Field Interpolation}, + booktitle = {Picture Coding Symposium}, + year = {2016}, + pages = {1--5}, +} + +@inproceedings{Senst2016, + author = {Tobias Senst and Jonas Geistert and Thomas Sikora}, + title = {Robust local optical flow: Long-range motions and varying illuminations}, + booktitle = {IEEE International Conference on Image Processing}, + year = {2016}, + pages = {4478--4482}, +} + +@inproceedings{Senst2014, + author = {Tobias Senst and Thilo Borgmann and Ivo Keller and Thomas Sikora}, + title = {Cross based Robust Local Optical Flow}, + booktitle = {21th IEEE International Conference on Image Processing}, + year = {2014}, + pages = {1967--1971}, +} + +@inproceedings{Senst2013, + author = {Tobias Senst and Jonas Geistert and Ivo Keller and Thomas Sikora}, + title = {Robust Local Optical Flow Estimation using Bilinear Equations for Sparse Motion Estimation}, + booktitle = {20th IEEE International Conference on Image Processing}, + year = {2013}, + pages = {2499--2503}, +} + +@article{Senst2012, + author = {Tobias Senst and Volker Eiselein and Thomas Sikora}, + title = {Robust Local Optical Flow for Feature Tracking}, + journal = {IEEE Transactions on Circuits and Systems for Video Technology}, + year = {2012}, + pages = {1377--1387}, + volume = {22}, + number = {9}, +} + +@article{Tibshirani2008, + title={Fast computation of the median by successive binning}, + author={Tibshirani, 
Ryan J}, + journal={arXiv preprint arXiv:0806.3301}, + year={2008} +} diff --git a/modules/optflow/include/opencv2/optflow.hpp b/modules/optflow/include/opencv2/optflow.hpp index 65802750d1a..a4bd3a737e6 100644 --- a/modules/optflow/include/opencv2/optflow.hpp +++ b/modules/optflow/include/opencv2/optflow.hpp @@ -68,7 +68,7 @@ Functions reading and writing .flo files in "Middlebury" format, see: createOptFlow_Farneback(); //! Additional interface to the SparseToDenseFlow algorithm - calcOpticalFlowSparseToDense() CV_EXPORTS_W Ptr createOptFlow_SparseToDense(); + /** @brief "Dual TV L1" Optical Flow Algorithm. The class implements the "Dual TV L1" optical flow algorithm described in @cite Zach2007 and diff --git a/modules/optflow/include/opencv2/optflow/rlofflow.hpp b/modules/optflow/include/opencv2/optflow/rlofflow.hpp new file mode 100644 index 00000000000..e9accdab5c4 --- /dev/null +++ b/modules/optflow/include/opencv2/optflow/rlofflow.hpp @@ -0,0 +1,482 @@ +/* +By downloading, copying, installing or using the software you agree to this +license. If you do not agree to this license, do not download, install, +copy or use the software. + + + License Agreement + For Open Source Computer Vision Library + (3-clause BSD License) + +Copyright (C) 2018, OpenCV Foundation, all rights reserved. +Third party copyrights are property of their respective owners. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. 
+ + * Neither the names of the copyright holders nor the names of the contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +This software is provided by the copyright holders and contributors "as is" and +any express or implied warranties, including, but not limited to, the implied +warranties of merchantability and fitness for a particular purpose are +disclaimed. In no event shall copyright holders or contributors be liable for +any direct, indirect, incidental, special, exemplary, or consequential damages +(including, but not limited to, procurement of substitute goods or services; +loss of use, data, or profits; or business interruption) however caused +and on any theory of liability, whether in contract, strict liability, +or tort (including negligence or otherwise) arising in any way out of +the use of this software, even if advised of the possibility of such damage. +*/ + +/** + * @file rloflow.hpp + * @author Tobias Senst + * @brief Implementation of the robust local optical flow algorithm for sparse and dense motion + * estimation from the following papers: + * http://elvera.nue.tu-berlin.de/files/1349Senst2012.pdf + * http://elvera.nue.tu-berlin.de/files/1422Senst2013.pdf + * http://elvera.nue.tu-berlin.de/files/1448Senst2014.pdf + * http://elvera.nue.tu-berlin.de/files/1498Geistert2016.pdf + * http://elvera.nue.tu-berlin.de/files/1496Senst2016.pdf + * + * + * @cite Senst2012, @cite Senst2013, @cite Senst2014, @cite Senst2016 or @cite Geistert2016 + * +*/ +#ifndef __OPENCV_OPTFLOW_RLOFFLOW_HPP__ +#define __OPENCV_OPTFLOW_RLOFFLOW_HPP__ + +#include "opencv2/core.hpp" +#include "opencv2/video.hpp" + +namespace cv +{ + namespace optflow + { + //! @addtogroup optflow + //! @{ + + /** @brief This is used store and set up the parameters of the robust local optical flow (RLOF) algoritm. 
+ * + * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as + * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). + * This RLOF implementation can be seen as an improved pyramidal iterative Lucas-Kanade and includes + * a set of improving modules. The main improvements in respect to the pyramidal iterative Lucas-Kanade + * are: + * - A more robust redecending M-estimator framework (see @cite Senst2012) to improve the accuracy at + * motion boundaries and appearing and disappearing pixels. + * - an adaptive support region strategies to improve the accuracy at motion boundaries to reduce the + * corona effect, i.e oversmoothing of the PLK at motion/object boundaries. The cross-based segementation + * strategy (SR_CROSS) proposed in @cite Senst2014 uses a simple segmenation approach to obtain the optimal + * shape of the support region. + * - To deal with illumination changes (outdoor sequences and shadow) the intensity constancy assumption + * based optical flow equation has been adopt with the Gennert and Negahdaripour illumination model + * (see @cite Senst2016). This model can be switched on/off with the useIlluminationModel variable. + * - By using a global motion prior initialization (see @cite Senst2016) of the iterative refinement + * the accuracy could be significantly improved for large displacements. This initialization can be + * switched on and of with useGlobalMotionPrior variable. + * + * The RLOF can be computed with the SparseOpticalFlow class or function interface to track a set of features + * or with the DenseOpticalFlow class or function interface to compute dense optical flow. 
+ * + * @see optflow::DenseRLOFOpticalFlow, optflow::calcOpticalFlowDenseRLOF(), optflow::SparseRLOFOpticalFlow, optflow::calcOpticalFlowSparseRLOF() + */ + class RLOFOpticalFlowParameter { + public: + RLOFOpticalFlowParameter() + :solverType(ST_BILINEAR) + ,supportRegionType(SR_CROSS) + ,normSigma0(3.2f) + ,normSigma1(7.f) + ,smallWinSize(9) + ,largeWinSize(21) + ,crossSegmentationThreshold(25) + ,useInitialFlow(false) + ,useIlluminationModel(true) + ,useGlobalMotionPrior(true) + ,maxLevel(3) + ,maxIteration(30) + ,minEigenValue(0.0001f) + ,globalMotionRansacThreshold(10) + {} + enum SupportRegionType { + SR_FIXED = 0, /**< Apply a constant support region */ + SR_CROSS /**< Apply a adaptive support region obtained by cross-based segmentation + * as described in @cite Senst2014 + */ + }; + enum SolverType { + ST_STANDART = 0, /**< Apply standard iterative refinement */ + ST_BILINEAR /**< Apply optimized iterative refinement based bilinear equation solutions + * as described in @cite Senst2013 + */ + }; + SolverType solverType; + /**< Variable specifies the iterative refinement strategy. Please consider citing @cite Senst2013 when + * using ST_BILINEAR. + */ + + SupportRegionType supportRegionType; + /**< Variable specifies the support region shape extraction or shrinking strategy. + */ + + float normSigma0; + /**< \$ \sigma_0 \$ paramter of the shrinked Hampel norm introduced in @cite Senst2012. If + * \$ \sigma_0 \$ = std::numeric_limist::max() the least-square estimator will be used + * instead of the M-estimator. Althoug M-estimator is more robust against outlier in the support + * region the least-square can be fast in computation. + */ + float normSigma1; + /**< \$ \sigma_1 \$ paramter of the shrinked Hampel norm introduced in @cite Senst2012. If + * \$ \sigma_1 \$ = std::numeric_limist::max() the least-square estimator will be used + * instead of the M-estimator. 
Although the M-estimator is more robust against outliers in the support + * region the least-square can be fast in computation. + */ + int smallWinSize; + /**< Minimal window size of the support region. This parameter is only used if supportRegionType is SR_CROSS. + */ + int largeWinSize; + /**< Maximal window size of the support region. If supportRegionType is SR_FIXED this gives the exact support + * region size. The speed of the RLOF is related to the applied win sizes. The smaller the window size the lower is the runtime, + * but the more sensitive to noise is the method. + */ + int crossSegmentationThreshold; + /**< Color similarity threshold used by cross-based segmentation following @cite Senst2014 . + * (Only used if supportRegionType is SR_CROSS). With the cross-based segmentation + * motion boundaries can be computed more accurately. + */ + int maxLevel; + /**< Maximal number of pyramid level used. The larger this value is the more likely it is + * to obtain accurate solutions for long-range motions. The runtime is linearly related to + * this parameter. + */ + bool useInitialFlow; + /**< Use next point list as initial values. A good initialization can improve the algorithm + * accuracy and reduce the runtime by a faster convergence of the iteration refinement. + */ + bool useIlluminationModel; + /**< Use the Gennert and Negahdaripour illumination model instead of the intensity brightness + * constraint. (Proposed in @cite Senst2016 ) This model is defined as follows: + * \f[ I(\mathbf{x},t) + m \cdot I(\mathbf{x},t) + c = I(\mathbf{x},t+1) \f] + * and contains with m and c a multiplicative and additive term which makes the estimate + * more robust against illumination changes. The computational complexity is increased by + * enabling the illumination model. + */ + bool useGlobalMotionPrior; + /**< Use global motion prior initialisation has been introduced in @cite Senst2016 . It + * allows to be more accurate for long-range motion.
The computational complexity is + * slightly increased by enabling the global motion prior initialisation. + */ + int maxIteration; + /**< Number of maximal iterations used for the iterative refinement. Lower values can + * reduce the runtime but also the accuracy. + */ + float minEigenValue; + /**< Threshold for the minimal eigenvalue of the gradient matrix defines when to abort the + * iterative refinement. + */ + float globalMotionRansacThreshold; + /**< To apply the global motion prior motion vectors will be computed on a regulary sampled which + * are the basis for Homography estimation using RANSAC. The reprojection threshold is based on + * n-th percentil (given by this value [0 ... 100]) of the forward backward errors relating to + * the motion vectors. See @cite Senst2016 for more details. + */ + }; + + /** @brief Fast dense optical flow computation based on robust local optical flow (RLOF) algorithms and sparse-to-dense interpolation + * scheme. + * + * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as + * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). + * + * The sparse-to-dense interpolation scheme allows for fast computation of dense optical flow using RLOF (see @cite Geistert2016). + * For this scheme the following steps are applied: + * -# motion vector seeded at a regular sampled grid are computed. The sparsity of this grid can be configured with setGridStep + * -# (optinally) errornous motion vectors are filter based on the forward backward confidence. The threshold can be configured + * with setForwardBackward. The filter is only applied if the threshold >0 but than the runtime is doubled due to the estimation + * of the backward flow. + * -# Vector field interpolation is applied to the motion vector set to obtain a dense vector field. + * + * For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details. 
+ * Parameters have been described in @cite Senst2012 @cite Senst2013 @cite Senst2014 and @cite Senst2016. + * + * @note If the grid size is set to (1,1) and the forward backward threshold <= 0 then the dense optical flow field is purely + * computed with the RLOF. + * + * @see optflow::calcOpticalFlowDenseRLOF(), optflow::RLOFOpticalFlowParameter + */ + class CV_EXPORTS_W DenseRLOFOpticalFlow : public DenseOpticalFlow + { + public: + enum InterpolationType + { + INTERP_GEO = 0, /**< Fast geodesic interpolation, see @cite Geistert2016 */ + INTERP_EPIC, /**< Edge-preserving interpolation, see @cite Revaud2015,Geistert2016. */ + }; + //! @brief Configuration of the RLOF algorithm. + /** + @see optflow::RLOFOpticalFlowParameter, getRLOFOpticalFlowParameter + */ + CV_WRAP virtual void setRLOFOpticalFlowParameter(RLOFOpticalFlowParameter val) = 0; + /** @copybrief setRLOFOpticalFlowParameter + @see optflow::RLOFOpticalFlowParameter, setRLOFOpticalFlowParameter + */ + CV_WRAP virtual RLOFOpticalFlowParameter getRLOFOpticalFlowParameter() const = 0; + //! @brief Threshold for the forward backward confidence check + /**For each grid point \f$ \mathbf{x} \f$ a motion vector \f$ d_{I0,I1}(\mathbf{x}) \f$ is computed. + * If the forward backward error \f[ EP_{FB} = || d_{I0,I1} + d_{I1,I0} || \f] + * is larger than threshold given by this function then the motion vector will not be used by the following + * vector field interpolation. \f$ d_{I1,I0} \f$ denotes the backward flow. Note, the forward backward test + * will only be applied if the threshold > 0. This may result in a doubled runtime for the motion estimation. + * @see getForwardBackward, setGridStep + */ + CV_WRAP virtual void setForwardBackward(float val) = 0; + /** @copybrief setForwardBackward + @see setForwardBackward + */ + CV_WRAP virtual float getForwardBackward() const = 0; + //! @brief Size of the grid to spawn the motion vectors. + /** For each grid point a motion vector is computed.
Some motion vectors will be removed due to the forward backward + * threshold (if set >0). The rest will be the base of the vector field interpolation. + * @see getForwardBackward, setGridStep + */ + CV_WRAP virtual Size getGridStep() const = 0; + /** @copybrief getGridStep + * @see getGridStep + */ + CV_WRAP virtual void setGridStep(Size val) = 0; + + //! @brief Interpolation used to compute the dense optical flow. + /** Two interpolation algorithms are supported + * - **INTERP_GEO** applies the fast geodesic interpolation, see @cite Geistert2016. + * - **INTERP_EPIC_RESIDUAL** applies the edge-preserving interpolation, see @cite Revaud2015,Geistert2016. + * @see ximgproc::EdgeAwareInterpolator, getInterpolation + */ + CV_WRAP virtual void setInterpolation(InterpolationType val) = 0; + /** @copybrief setInterpolation + * @see ximgproc::EdgeAwareInterpolator, setInterpolation + */ + CV_WRAP virtual InterpolationType getInterpolation() const = 0; + //! @brief see ximgproc::EdgeAwareInterpolator() K value. + /** K is a number of nearest-neighbor matches considered, when fitting a locally affine + * model. Usually it should be around 128. However, lower values would make the interpolation noticeably faster. + * @see ximgproc::EdgeAwareInterpolator, setEPICK + */ + CV_WRAP virtual int getEPICK() const = 0; + /** @copybrief getEPICK + * @see ximgproc::EdgeAwareInterpolator, getEPICK + */ + CV_WRAP virtual void setEPICK(int val) = 0; + //! @brief see ximgproc::EdgeAwareInterpolator() sigma value. + /** Sigma is a parameter defining how fast the weights decrease in the locally-weighted affine + * fitting. Higher values can help preserve fine details, lower values can help to get rid of noise in the + * output flow.
+ * @see ximgproc::EdgeAwareInterpolator, setEPICSigma + */ + CV_WRAP virtual float getEPICSigma() const = 0; + /** @copybrief getEPICSigma + * @see ximgproc::EdgeAwareInterpolator, getEPICSigma + */ + CV_WRAP virtual void setEPICSigma(float val) = 0; + //! @brief see ximgproc::EdgeAwareInterpolator() lambda value. + /** Lambda is a parameter defining the weight of the edge-aware term in geodesic distance, + * should be in the range of 0 to 1000. + * @see ximgproc::EdgeAwareInterpolator, setEPICSigma + */ + CV_WRAP virtual float getEPICLambda() const = 0; + /** @copybrief getEPICLambda + * @see ximgproc::EdgeAwareInterpolator, getEPICLambda + */ + CV_WRAP virtual void setEPICLambda(float val) = 0; + //! @brief see ximgproc::EdgeAwareInterpolator(). + /** Sets the respective fastGlobalSmootherFilter() parameter. + * @see ximgproc::EdgeAwareInterpolator, setFgsLambda + */ + CV_WRAP virtual float getFgsLambda() const = 0; + /** @copybrief getFgsLambda + * @see ximgproc::EdgeAwareInterpolator, ximgproc::fastGlobalSmootherFilter, getFgsLambda + */ + CV_WRAP virtual void setFgsLambda(float val) = 0; + //! @brief see ximgproc::EdgeAwareInterpolator(). + /** Sets the respective fastGlobalSmootherFilter() parameter. + * @see ximgproc::EdgeAwareInterpolator, ximgproc::fastGlobalSmootherFilter, setFgsSigma + */ + CV_WRAP virtual float getFgsSigma() const = 0; + /** @copybrief getFgsSigma + * @see ximgproc::EdgeAwareInterpolator, ximgproc::fastGlobalSmootherFilter, getFgsSigma + */ + CV_WRAP virtual void setFgsSigma(float val) = 0; + //! @brief enables ximgproc::fastGlobalSmootherFilter + /** + * @see getUsePostProc + */ + CV_WRAP virtual void setUsePostProc(bool val) = 0; + /** @copybrief setUsePostProc + * @see ximgproc::fastGlobalSmootherFilter, setUsePostProc + */ + CV_WRAP virtual bool getUsePostProc() const = 0; + //! 
@brief Creates instance of optflow::DenseRLOFOpticalFlow + /** + * @param rlofParam see optflow::RLOFOpticalFlowParameter + * @param forwardBackwardThreshold see setForwardBackward + * @param gridStep see setGridStep + * @param epicK see setEPICK + * @param epicSigma see setEPICSigma + * @param epicLambda see setEPICLambda + * @param fgsLambda see setFgsLambda + * @param fgsSigma see setFgsSigma + */ + CV_WRAP static Ptr create( + const RLOFOpticalFlowParameter & rlofParam = RLOFOpticalFlowParameter(), + float forwardBackwardThreshold = 1.f, + Size gridStep = Size(6, 6), + InterpolationType interp_type = InterpolationType::INTERP_EPIC, + int epicK = 128, + float epicSigma = 0.05f, + float epicLambda = 999.0f, + float fgsLambda = 500.0f, + float fgsSigma = 1.5f); + }; + + /** @brief Class used for calculation sparse optical flow and feature tracking with robust local optical flow (RLOF) algorithms. + * + * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as + * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). + * + * For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details. + * Parameters have been described in @cite Senst2012, @cite Senst2013, @cite Senst2014 and @cite Senst2016. + @sa optflow::calcOpticalFlowSparseRLOF(), optflow::RLOFOpticalFlowParameter + + */ + class CV_EXPORTS_W SparseRLOFOpticalFlow : public SparseOpticalFlow + { + public: + /** @copydoc DenseRLOFOpticalFlow::setRLOFOpticalFlowParameter + */ + CV_WRAP virtual void setRLOFOpticalFlowParameter(RLOFOpticalFlowParameter val) = 0; + /** @copybrief setRLOFOpticalFlowParameter + * @see setRLOFOpticalFlowParameter + */ + CV_WRAP virtual RLOFOpticalFlowParameter getRLOFOpticalFlowParameter() const = 0; + //! @brief Threshold for the forward backward confidence check + /** For each feature point a motion vector \f$ d_{I0,I1}(\mathbf{x}) \f$ is computed. 
+ * If the forward backward error \f[ EP_{FB} = || d_{I0,I1} + d_{I1,I0} || \f] + * is larger than threshold given by this function then the status will not be used by the following + * vector field interpolation. \f$ d_{I1,I0} \f$ denotes the backward flow. Note, the forward backward test + * will only be applied if the threshold > 0. This may results into a doubled runtime for the motion estimation. + * @see setForwardBackward + */ + CV_WRAP virtual void setForwardBackward(float val) = 0; + /** @copybrief setForwardBackward + * @see setForwardBackward + */ + CV_WRAP virtual float getForwardBackward() const = 0; + + //! @brief Creates instance of SparseRLOFOpticalFlow + /** + * @param rlofParam see setRLOFOpticalFlowParameter + * @param forwardBackwardThreshold see setForwardBackward + */ + CV_WRAP static Ptr create( + const RLOFOpticalFlowParameter & rlofParam = RLOFOpticalFlowParameter(), + float forwardBackwardThreshold = 1.f); + + }; + + /** @brief Fast dense optical flow computation based on robust local optical flow (RLOF) algorithms and sparse-to-dense interpolation scheme. + * + * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as + * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). + * + * The sparse-to-dense interpolation scheme allows for fast computation of dense optical flow using RLOF (see @cite Geistert2016). + * For this scheme the following steps are applied: + * -# motion vector seeded at a regular sampled grid are computed. The sparsity of this grid can be configured with setGridStep + * -# (optinally) errornous motion vectors are filter based on the forward backward confidence. The threshold can be configured + * with setForwardBackward. The filter is only applied if the threshold >0 but than the runtime is doubled due to the estimation + * of the backward flow. 
+ * -# Vector field interpolation is applied to the motion vector set to obtain a dense vector field. + * + * @param I0 first 8-bit input image. If The cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType + * = SupportRegionType::SR_CROSS) image has to be a 8-bit 3 channel image. + * @param I1 second 8-bit input image. If The cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType + * = SupportRegionType::SR_CROSS) image has to be a 8-bit 3 channel image. + * @param flow computed flow image that has the same size as I0 and type CV_32FC2. + * @param rlofParam see optflow::RLOFOpticalFlowParameter + * @param forwardBackwardThreshold Threshold for the forward backward confidence check. + * For each grid point \f$ \mathbf{x} \f$ a motion vector \f$ d_{I0,I1}(\mathbf{x}) \f$ is computed. + * If the forward backward error \f[ EP_{FB} = || d_{I0,I1} + d_{I1,I0} || \f] + * is larger than threshold given by this function then the motion vector will not be used by the following + * vector field interpolation. \f$ d_{I1,I0} \f$ denotes the backward flow. Note, the forward backward test + * will only be applied if the threshold > 0. This may results into a doubled runtime for the motion estimation. + * @param gridStep Size of the grid to spawn the motion vectors. For each grid point a motion vector is computed. + * Some motion vectors will be removed due to the forwatd backward threshold (if set >0). The rest will be the + * base of the vector field interpolation. + * @param epicK see ximgproc::EdgeAwareInterpolator() sets the respective parameter. + * @param epicSigma see ximgproc::EdgeAwareInterpolator() sets the respective parameter. + * @param epicLambda see ximgproc::EdgeAwareInterpolator() sets the respective parameter. + * @param use_post_proc enables ximgproc::fastGlobalSmootherFilter() parameter. + * @param fgsLambda sets the respective ximgproc::fastGlobalSmootherFilter() parameter. 
+ * @param fgsSigma sets the respective ximgproc::fastGlobalSmootherFilter() parameter. + * + * Parameters have been described in @cite Senst2012, @cite Senst2013, @cite Senst2014, @cite Senst2016. + * For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details. + * @note If the grid size is set to (1,1) and the forward backward threshold <= 0 that the dense optical flow field is purely + * computed with the RLOF. + * + * @sa optflow::DenseRLOFOpticalFlow, optflow::RLOFOpticalFlowParameter + */ + CV_EXPORTS_W void calcOpticalFlowDenseRLOF(InputArray I0, InputArray I1, InputOutputArray flow, + RLOFOpticalFlowParameter rlofParam = RLOFOpticalFlowParameter(), + float forewardBackwardThreshold = 0, Size gridStep = Size(6, 6), + DenseRLOFOpticalFlow::InterpolationType interp_type = DenseRLOFOpticalFlow::InterpolationType::INTERP_EPIC, + int epicK = 128, float epicSigma = 0.05f, + bool use_post_proc = true, float fgsLambda = 500.0f, float fgsSigma = 1.5f); + + /** @brief Calculates fast optical flow for a sparse feature set using the robust local optical flow (RLOF) similar + * to optflow::calcOpticalFlowPyrLK(). + * + * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as + * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). + * + * @param I0 first 8-bit input image. If The cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType + * = SupportRegionType::SR_CROSS) image has to be a 8-bit 3 channel image. + * @param I1 second 8-bit input image. If The cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType + * = SupportRegionType::SR_CROSS) image has to be a 8-bit 3 channel image. + * @param prevPts vector of 2D points for which the flow needs to be found; point coordinates must be single-precision + * floating-point numbers. 
+ * @param nextPts output vector of 2D points (with single-precision floating-point coordinates) containing the calculated + * new positions of input features in the second image; when optflow::RLOFOpticalFlowParameter::useInitialFlow variable is true the vector must + * have the same size as in the input and contain the initialization point correspondences. + * @param status output status vector (of unsigned chars); each element of the vector is set to 1 if the flow for the + * corresponding features has passed the forward backward check. + * @param err output vector of errors; each element of the vector is set to the forward backward error for the corresponding feature. + * @param rlofParam see optflow::RLOFOpticalFlowParameter + * @param forwardBackwardThreshold Threshold for the forward backward confidence check. If forewardBackwardThreshold <=0 the forward + * + * Parameters have been described in @cite Senst2012, @cite Senst2013, @cite Senst2014 and @cite Senst2016. + * For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details. + */ + CV_EXPORTS_W void calcOpticalFlowSparseRLOF(InputArray prevImg, InputArray nextImg, + InputArray prevPts, InputOutputArray nextPts, + OutputArray status, OutputArray err, + RLOFOpticalFlowParameter rlofParam = RLOFOpticalFlowParameter(), + float forewardBackwardThreshold = 0); + + //! Additional interface to the Dense RLOF algorithm - optflow::calcOpticalFlowDenseRLOF() + CV_EXPORTS_W Ptr createOptFlow_DenseRLOF(); + + //! Additional interface to the Sparse RLOF algorithm - optflow::calcOpticalFlowSparseRLOF() + CV_EXPORTS_W Ptr createOptFlow_SparseRLOF(); + //! 
@} + } +} +#endif diff --git a/modules/optflow/include/opencv2/optflow_o.hpp b/modules/optflow/include/opencv2/optflow_o.hpp new file mode 100644 index 00000000000..65802750d1a --- /dev/null +++ b/modules/optflow/include/opencv2/optflow_o.hpp @@ -0,0 +1,309 @@ +/* +By downloading, copying, installing or using the software you agree to this +license. If you do not agree to this license, do not download, install, +copy or use the software. + + + License Agreement + For Open Source Computer Vision Library + (3-clause BSD License) + +Copyright (C) 2013, OpenCV Foundation, all rights reserved. +Third party copyrights are property of their respective owners. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + * Neither the names of the copyright holders nor the names of the contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +This software is provided by the copyright holders and contributors "as is" and +any express or implied warranties, including, but not limited to, the implied +warranties of merchantability and fitness for a particular purpose are +disclaimed. 
In no event shall copyright holders or contributors be liable for +any direct, indirect, incidental, special, exemplary, or consequential damages +(including, but not limited to, procurement of substitute goods or services; +loss of use, data, or profits; or business interruption) however caused +and on any theory of liability, whether in contract, strict liability, +or tort (including negligence or otherwise) arising in any way out of +the use of this software, even if advised of the possibility of such damage. +*/ + +#ifndef __OPENCV_OPTFLOW_HPP__ +#define __OPENCV_OPTFLOW_HPP__ + +#include "opencv2/core.hpp" +#include "opencv2/video.hpp" + +/** +@defgroup optflow Optical Flow Algorithms + +Dense optical flow algorithms compute motion for each point: + +- cv::optflow::calcOpticalFlowSF +- cv::optflow::createOptFlow_DeepFlow + +Motion templates is alternative technique for detecting motion and computing its direction. +See samples/motempl.py. + +- cv::motempl::updateMotionHistory +- cv::motempl::calcMotionGradient +- cv::motempl::calcGlobalOrientation +- cv::motempl::segmentMotion + +Functions reading and writing .flo files in "Middlebury" format, see: + +- cv::optflow::readOpticalFlow +- cv::optflow::writeOpticalFlow + + */ + +#include "opencv2/optflow/pcaflow.hpp" +#include "opencv2/optflow/sparse_matching_gpc.hpp" + +namespace cv +{ +namespace optflow +{ + +//! @addtogroup optflow +//! @{ + +/** @overload */ +CV_EXPORTS_W void calcOpticalFlowSF( InputArray from, InputArray to, OutputArray flow, + int layers, int averaging_block_size, int max_flow); + +/** @brief Calculate an optical flow using "SimpleFlow" algorithm. + +@param from First 8-bit 3-channel image. 
+@param to Second 8-bit 3-channel image of the same size as prev +@param flow computed flow image that has the same size as prev and type CV_32FC2 +@param layers Number of layers +@param averaging_block_size Size of block through which we sum up when calculate cost function +for pixel +@param max_flow maximal flow that we search at each level +@param sigma_dist vector smooth spatial sigma parameter +@param sigma_color vector smooth color sigma parameter +@param postprocess_window window size for postprocess cross bilateral filter +@param sigma_dist_fix spatial sigma for postprocess cross bilateralf filter +@param sigma_color_fix color sigma for postprocess cross bilateral filter +@param occ_thr threshold for detecting occlusions +@param upscale_averaging_radius window size for bilateral upscale operation +@param upscale_sigma_dist spatial sigma for bilateral upscale operation +@param upscale_sigma_color color sigma for bilateral upscale operation +@param speed_up_thr threshold to detect point with irregular flow - where flow should be +recalculated after upscale + +See @cite Tao2012 . And site of project - . + +@note + - An example using the simpleFlow algorithm can be found at samples/simpleflow_demo.cpp + */ +CV_EXPORTS_W void calcOpticalFlowSF( InputArray from, InputArray to, OutputArray flow, int layers, + int averaging_block_size, int max_flow, + double sigma_dist, double sigma_color, int postprocess_window, + double sigma_dist_fix, double sigma_color_fix, double occ_thr, + int upscale_averaging_radius, double upscale_sigma_dist, + double upscale_sigma_color, double speed_up_thr ); + +/** @brief Fast dense optical flow based on PyrLK sparse matches interpolation. + +@param from first 8-bit 3-channel or 1-channel image. +@param to second 8-bit 3-channel or 1-channel image of the same size as from +@param flow computed flow image that has the same size as from and CV_32FC2 type +@param grid_step stride used in sparse match computation. 
Lower values usually + result in higher quality but slow down the algorithm. +@param k number of nearest-neighbor matches considered, when fitting a locally affine + model. Lower values can make the algorithm noticeably faster at the cost of + some quality degradation. +@param sigma parameter defining how fast the weights decrease in the locally-weighted affine + fitting. Higher values can help preserve fine details, lower values can help to get rid + of the noise in the output flow. +@param use_post_proc defines whether the ximgproc::fastGlobalSmootherFilter() is used + for post-processing after interpolation +@param fgs_lambda see the respective parameter of the ximgproc::fastGlobalSmootherFilter() +@param fgs_sigma see the respective parameter of the ximgproc::fastGlobalSmootherFilter() + */ +CV_EXPORTS_W void calcOpticalFlowSparseToDense ( InputArray from, InputArray to, OutputArray flow, + int grid_step = 8, int k = 128, float sigma = 0.05f, + bool use_post_proc = true, float fgs_lambda = 500.0f, + float fgs_sigma = 1.5f ); + +/** @brief DeepFlow optical flow algorithm implementation. + +The class implements the DeepFlow optical flow algorithm described in @cite Weinzaepfel2013 . See +also . 
+Parameters - class fields - that may be modified after creating a class instance: +- member float alpha +Smoothness assumption weight +- member float delta +Color constancy assumption weight +- member float gamma +Gradient constancy weight +- member float sigma +Gaussian smoothing parameter +- member int minSize +Minimal dimension of an image in the pyramid (next, smaller images in the pyramid are generated +until one of the dimensions reaches this size) +- member float downscaleFactor +Scaling factor in the image pyramid (must be \< 1) +- member int fixedPointIterations +How many iterations on each level of the pyramid +- member int sorIterations +Iterations of Succesive Over-Relaxation (solver) +- member float omega +Relaxation factor in SOR + */ +CV_EXPORTS_W Ptr createOptFlow_DeepFlow(); + +//! Additional interface to the SimpleFlow algorithm - calcOpticalFlowSF() +CV_EXPORTS_W Ptr createOptFlow_SimpleFlow(); + +//! Additional interface to the Farneback's algorithm - calcOpticalFlowFarneback() +CV_EXPORTS_W Ptr createOptFlow_Farneback(); + +//! Additional interface to the SparseToDenseFlow algorithm - calcOpticalFlowSparseToDense() +CV_EXPORTS_W Ptr createOptFlow_SparseToDense(); + +/** @brief "Dual TV L1" Optical Flow Algorithm. + +The class implements the "Dual TV L1" optical flow algorithm described in @cite Zach2007 and +@cite Javier2012 . +Here are important members of the class that control the algorithm, which you can set after +constructing the class instance: + +- member double tau + Time step of the numerical scheme. + +- member double lambda + Weight parameter for the data term, attachment parameter. This is the most relevant + parameter, which determines the smoothness of the output. The smaller this parameter is, + the smoother the solutions we obtain. It depends on the range of motions of the images, so + its value should be adapted to each image sequence. + +- member double theta + Weight parameter for (u - v)\^2, tightness parameter. 
It serves as a link between the + attachment and the regularization terms. In theory, it should have a small value in order + to maintain both parts in correspondence. The method is stable for a large range of values + of this parameter. + +- member int nscales + Number of scales used to create the pyramid of images. + +- member int warps + Number of warpings per scale. Represents the number of times that I1(x+u0) and grad( + I1(x+u0) ) are computed per scale. This is a parameter that assures the stability of the + method. It also affects the running time, so it is a compromise between speed and + accuracy. + +- member double epsilon + Stopping criterion threshold used in the numerical scheme, which is a trade-off between + precision and running time. A small value will yield more accurate solutions at the + expense of a slower convergence. + +- member int iterations + Stopping criterion iterations number used in the numerical scheme. + +C. Zach, T. Pock and H. Bischof, "A Duality Based Approach for Realtime TV-L1 Optical Flow". +Javier Sanchez, Enric Meinhardt-Llopis and Gabriele Facciolo. "TV-L1 Optical Flow Estimation". +*/ +class CV_EXPORTS_W DualTVL1OpticalFlow : public DenseOpticalFlow +{ +public: + //! @brief Time step of the numerical scheme + /** @see setTau */ + CV_WRAP virtual double getTau() const = 0; + /** @copybrief getTau @see getTau */ + CV_WRAP virtual void setTau(double val) = 0; + //! @brief Weight parameter for the data term, attachment parameter + /** @see setLambda */ + CV_WRAP virtual double getLambda() const = 0; + /** @copybrief getLambda @see getLambda */ + CV_WRAP virtual void setLambda(double val) = 0; + //! @brief Weight parameter for (u - v)^2, tightness parameter + /** @see setTheta */ + CV_WRAP virtual double getTheta() const = 0; + /** @copybrief getTheta @see getTheta */ + CV_WRAP virtual void setTheta(double val) = 0; + //! 
@brief coefficient for additional illumination variation term + /** @see setGamma */ + CV_WRAP virtual double getGamma() const = 0; + /** @copybrief getGamma @see getGamma */ + CV_WRAP virtual void setGamma(double val) = 0; + //! @brief Number of scales used to create the pyramid of images + /** @see setScalesNumber */ + CV_WRAP virtual int getScalesNumber() const = 0; + /** @copybrief getScalesNumber @see getScalesNumber */ + CV_WRAP virtual void setScalesNumber(int val) = 0; + //! @brief Number of warpings per scale + /** @see setWarpingsNumber */ + CV_WRAP virtual int getWarpingsNumber() const = 0; + /** @copybrief getWarpingsNumber @see getWarpingsNumber */ + CV_WRAP virtual void setWarpingsNumber(int val) = 0; + //! @brief Stopping criterion threshold used in the numerical scheme, which is a trade-off between precision and running time + /** @see setEpsilon */ + CV_WRAP virtual double getEpsilon() const = 0; + /** @copybrief getEpsilon @see getEpsilon */ + CV_WRAP virtual void setEpsilon(double val) = 0; + //! @brief Inner iterations (between outlier filtering) used in the numerical scheme + /** @see setInnerIterations */ + CV_WRAP virtual int getInnerIterations() const = 0; + /** @copybrief getInnerIterations @see getInnerIterations */ + CV_WRAP virtual void setInnerIterations(int val) = 0; + //! @brief Outer iterations (number of inner loops) used in the numerical scheme + /** @see setOuterIterations */ + CV_WRAP virtual int getOuterIterations() const = 0; + /** @copybrief getOuterIterations @see getOuterIterations */ + CV_WRAP virtual void setOuterIterations(int val) = 0; + //! @brief Use initial flow + /** @see setUseInitialFlow */ + CV_WRAP virtual bool getUseInitialFlow() const = 0; + /** @copybrief getUseInitialFlow @see getUseInitialFlow */ + CV_WRAP virtual void setUseInitialFlow(bool val) = 0; + //! 
@brief Step between scales (<1) + /** @see setScaleStep */ + CV_WRAP virtual double getScaleStep() const = 0; + /** @copybrief getScaleStep @see getScaleStep */ + CV_WRAP virtual void setScaleStep(double val) = 0; + //! @brief Median filter kernel size (1 = no filter) (3 or 5) + /** @see setMedianFiltering */ + CV_WRAP virtual int getMedianFiltering() const = 0; + /** @copybrief getMedianFiltering @see getMedianFiltering */ + CV_WRAP virtual void setMedianFiltering(int val) = 0; + + /** @brief Creates instance of cv::DualTVL1OpticalFlow*/ + CV_WRAP static Ptr create( + double tau = 0.25, + double lambda = 0.15, + double theta = 0.3, + int nscales = 5, + int warps = 5, + double epsilon = 0.01, + int innnerIterations = 30, + int outerIterations = 10, + double scaleStep = 0.8, + double gamma = 0.0, + int medianFiltering = 5, + bool useInitialFlow = false); +}; + +/** @brief Creates instance of cv::DenseOpticalFlow +*/ +CV_EXPORTS_W Ptr createOptFlow_DualTVL1(); + +//! @} + +} //optflow +} + +#include "opencv2/optflow/motempl.hpp" + +#endif diff --git a/modules/optflow/perf/perf_rlof.cpp b/modules/optflow/perf/perf_rlof.cpp new file mode 100644 index 00000000000..ea48a62be59 --- /dev/null +++ b/modules/optflow/perf/perf_rlof.cpp @@ -0,0 +1,89 @@ +#include "perf_precomp.hpp" +namespace opencv_test { + namespace { + + typedef tuple ST_SR_IM_Sparse_t; + typedef TestBaseWithParam ST_SR_IM_Sparse; + PERF_TEST_P(ST_SR_IM_Sparse, OpticalFlow_SparseRLOF, + testing::Combine( + testing::Values("ST_BILINEAR", "ST_STANDART"), + testing::Values("SR_CROSS", "SR_FIXED"), + testing::Values(true, false)) + ) + { + Size sz(640,480); + string frame1_path = "B:/workspace/opencv_extra/testdata/cv/optflow/RubberWhale1.png"; + string frame2_path = "B:/workspace/opencv_extra/testdata/cv/optflow/RubberWhale2.png"; + frame1_path.erase(std::remove_if(frame1_path.begin(), frame1_path.end(), isspace), frame1_path.end()); + frame2_path.erase(std::remove_if(frame2_path.begin(), frame2_path.end(), 
isspace), frame2_path.end()); + Mat frame1 = imread(frame1_path); + Mat frame2 = imread(frame2_path); + ASSERT_FALSE(frame1.empty()); + ASSERT_FALSE(frame2.empty()); + vector prevPts, currPts; + for (int r = 0; r < frame1.rows; r += 10) + { + for (int c = 0; c < frame1.cols; c += 10) + { + prevPts.push_back(Point2f(static_cast(c), static_cast(r))); + } + } + vector status(prevPts.size()); + vector err(prevPts.size()); + + RLOFOpticalFlowParameter param; + if (get<0>(GetParam()) == "ST_BILINEAR") + param.solverType = RLOFOpticalFlowParameter::ST_BILINEAR; + if (get<0>(GetParam()) == "ST_STANDART") + param.solverType = RLOFOpticalFlowParameter::ST_STANDART; + if (get<1>(GetParam()) == "SR_CROSS") + param.supportRegionType = RLOFOpticalFlowParameter::SR_CROSS; + if (get<1>(GetParam()) == "SR_FIXED") + param.supportRegionType = RLOFOpticalFlowParameter::SR_FIXED; + param.useIlluminationModel = get<2>(GetParam()); + + TEST_CYCLE_N(1) + { + calcOpticalFlowSparseRLOF(frame1, frame2, prevPts, currPts, status, err, param, 1.f); + } + + SANITY_CHECK_NOTHING(); + } + + typedef tuple INTERP_GRID_Dense_t; + typedef TestBaseWithParam INTERP_GRID_Dense; + PERF_TEST_P(INTERP_GRID_Dense, OpticalFlow_DenseRLOF, + testing::Combine( + testing::Values("INTERP_EPIC", "INTERP_GEO"), + testing::Values(4,10,50)) + ) + { + Mat flow; + string frame1_path = TS::ptr()->get_data_path() + "/cv/optflow/RubberWhale1.png"; + string frame2_path = TS::ptr()->get_data_path() + "/cv/optflow/RubberWhale2.png"; + // removing space may be an issue on windows machines + frame1_path.erase(std::remove_if(frame1_path.begin(), frame1_path.end(), isspace), frame1_path.end()); + frame2_path.erase(std::remove_if(frame2_path.begin(), frame2_path.end(), isspace), frame2_path.end()); + Mat frame1 = imread(frame1_path); + Mat frame2 = imread(frame2_path); + ASSERT_FALSE(frame1.empty()); + ASSERT_FALSE(frame2.empty()); + + RLOFOpticalFlowParameter param; + Ptr< DenseRLOFOpticalFlow> algo = 
DenseRLOFOpticalFlow::create(); + DenseRLOFOpticalFlow::InterpolationType interp_type; + if (get<0>(GetParam()) == "INTERP_EPIC") + interp_type = DenseRLOFOpticalFlow::InterpolationType::INTERP_EPIC; + if (get<0>(GetParam()) == "INTERP_GEO") + interp_type = DenseRLOFOpticalFlow::InterpolationType::INTERP_GEO; + + + TEST_CYCLE_N(5) + { + calcOpticalFlowDenseRLOF(frame1, frame2,flow, param, 1.0f, Size(get<1>(GetParam()), get<1>(GetParam())), interp_type); + } + + SANITY_CHECK_NOTHING(); + } + } +} // namespace diff --git a/modules/optflow/src/rlof/berlof_invoker.hpp b/modules/optflow/src/rlof/berlof_invoker.hpp new file mode 100644 index 00000000000..d903d1733db --- /dev/null +++ b/modules/optflow/src/rlof/berlof_invoker.hpp @@ -0,0 +1,2231 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// + // + // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. + // + // By downloading, copying, installing or using the software you agree to this license. + // If you do not agree to this license, do not download, install, + // copy or use the software. + // + // + // License Agreement + // For Open Source Computer Vision Library + // + // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. + // Copyright (C) 2009, Willow Garage Inc., all rights reserved. + // Third party copyrights are property of their respective owners. + // + // Redistribution and use in source and binary forms, with or without modification, + // are permitted provided that the following conditions are met: + // + // * Redistribution's of source code must retain the above copyright notice, + // this list of conditions and the following disclaimer. + // + // * Redistribution's in binary form must reproduce the above copyright notice, + // this list of conditions and the following disclaimer in the documentation + // and/or other materials provided with the distribution. 
+ // + // * The name of the copyright holders may not be used to endorse or promote products + // derived from this software without specific prior written permission. + // + // This software is provided by the copyright holders and contributors "as is" and + // any express or implied warranties, including, but not limited to, the implied + // warranties of merchantability and fitness for a particular purpose are disclaimed. + // In no event shall the Intel Corporation or contributors be liable for any direct, + // indirect, incidental, special, exemplary, or consequential damages + // (including, but not limited to, procurement of substitute goods or services; + // loss of use, data, or profits; or business interruption) however caused + // and on any theory of liability, whether in contract, strict liability, + // or tort (including negligence or otherwise) arising in any way out of + // the use of this software, even if advised of the possibility of such damage. + // + //M*/ +#ifndef _BERLOF_INVOKER_HPP_ +#define _BERLOF_INVOKER_HPP_ +#include "rlof_invokerbase.hpp" + + +namespace cv{ +namespace optflow{ + + bool checkSolution(float x, float y, float * c ) + { + float _a = x - 0.002f; + float _b = y - 0.002f; + cv::Point2f tl( c[0] * _a * _b + c[1] * _a + c[2] * _b + c[3], c[4] * _a * _b + c[5] * _a + c[6] * _b + c[7]); + _a = x + 0.002f; + cv::Point2f tr( c[0] * _a * _b + c[1] * _a + c[2] * _b + c[3], c[4] * _a * _b + c[5] * _a + c[6] * _b + c[7]); + _b = y + 0.002f; + cv::Point2f br( c[0] * _a * _b + c[1] * _a + c[2] * _b + c[3], c[4] * _a * _b + c[5] * _a + c[6] * _b + c[7]); + _a = x - 0.002f; + cv::Point2f bl( c[0] * _a * _b + c[1] * _a + c[2] * _b + c[3], c[4] * _a * _b + c[5] * _a + c[6] * _b + c[7]); + return (tl.x >= 0 && tl.y >= 0) && (tr.x <= 0 && tr.y >= 0) + && (bl.x >= 0 && bl.y <= 0) && (br.x <= 0 && br.y <= 0); + } + + inline cv::Point2f est_DeltaGain( const cv::Mat & src, const cv::Vec4f & val) + { + return cv::Point2f( + src.at(2,0) * val[0] + 
src.at(2,1) * val[1] + src.at(2,2) * val[2] + src.at(2,3) * val[3], + src.at(3,0) * val[0] + src.at(3,1) * val[1] + src.at(3,2) * val[2] + src.at(3,3) * val[3]); + } + inline void est_Result( const cv::Mat & src, const cv::Vec4f & val, cv::Point2f & delta, cv::Point2f & gain) + { + + delta = cv::Point2f( + -(src.at(0,0) * val[0] + src.at(0,1) * val[1] + src.at(0,2) * val[2] + src.at(0,3) * val[3]), + -(src.at(1,0) * val[0] + src.at(1,1) * val[1] + src.at(1,2) * val[2] + src.at(1,3) * val[3])); + + gain = cv::Point2f( + src.at(2,0) * val[0] + src.at(2,1) * val[1] + src.at(2,2) * val[2] + src.at(2,3) * val[3], + src.at(3,0) * val[0] + src.at(3,1) * val[1] + src.at(3,2) * val[2] + src.at(3,3) * val[3]); + } + + namespace berlof + { + + namespace ica + { + + class TrackerInvoker : public cv::ParallelLoopBody + { + public: + TrackerInvoker( + const Mat& _prevImg, + const Mat& _prevDeriv, + const Mat& _nextImg, + const Mat& _rgbPrevImg, + const Mat& _rgbNextImg, + const Point2f* _prevPts, + Point2f* _nextPts, + uchar* _status, + float* _err, + int _level, + int _maxLevel, + int _winSize[2], + int _maxIteration, + bool _useInitialFlow, + int _supportRegionType, + int _crossSegmentationThreshold, + std::vector _normSigmas, + float _minEigenValue) + { + prevImg = &_prevImg; + prevDeriv = &_prevDeriv; + nextImg = &_nextImg; + rgbPrevImg = &_rgbPrevImg; + rgbNextImg = &_rgbNextImg; + prevPts = _prevPts; + nextPts = _nextPts; + status = _status; + err = _err; + minWinSize = _winSize[0]; + maxWinSize = _winSize[1]; + criteria.maxCount = _maxIteration; + criteria.epsilon = 0.01; + level = _level; + maxLevel = _maxLevel; + windowType = _supportRegionType; + minEigThreshold = _minEigenValue; + paramReset = _normSigmas; + useInitialFlow = _useInitialFlow; + crossSegmentationThreshold = _crossSegmentationThreshold; + } + + TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; + void operator()(const cv::Range& range) const + { + + Point2f halfWin; + cv::Size winSize; + 
const Mat& I = *prevImg; + const Mat& J = *nextImg; + const Mat& derivI = *prevDeriv; + const Mat& BI = *rgbPrevImg; + + + const float FLT_SCALE = (1.f/(1 << 16)); + + winSize = cv::Size(maxWinSize,maxWinSize); + int winMaskwidth = static_cast(ceil(winSize.width / 16.f)) * 16; + cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); + winMaskMatBuf.setTo(1); + + std::vector param = paramReset; + int j, cn = I.channels(), cn2 = cn*2; + int winbufwidth = static_cast(ceil(winSize.width / 16.f)) * 16; + cv::Size winBufSize(winbufwidth,winbufwidth); + + cv::AutoBuffer _buf(winBufSize.area()*(cn + cn2)); + int derivDepth = DataType::depth; + + Mat IWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn), (deriv_type*)_buf.data()); + Mat derivIWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn2), (deriv_type*)_buf.data() + winBufSize.area()*cn); + + + for( int ptidx = range.start; ptidx < range.end; ptidx++ ) + { + Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level)); + Point2f nextPt; + if( level == maxLevel ) + { + if( useInitialFlow ) + nextPt = nextPts[ptidx]*(float)(1./(1 << level)); + else + nextPt = prevPt; + } + else + nextPt = nextPts[ptidx]*2.f; + nextPts[ptidx] = nextPt; + + Point2i iprevPt, inextPt; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + int winArea = maxWinSize * maxWinSize; + cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0,0, maxWinSize,maxWinSize)); + winMaskMatBuf.setTo(0); + if( calcWinMaskMat(BI, windowType, iprevPt, + winMaskMat,winSize,halfWin,winArea, + minWinSize,maxWinSize, crossSegmentationThreshold) == false) + continue; + + prevPt -= halfWin; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + + if( iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols || + iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows ) + { + if( level == 0 ) + { + if( status ) + status[ptidx] = 3; + if( err ) + err[ptidx] = 0; + } + continue; + } + + float a = prevPt.x - iprevPt.x; + float b = prevPt.y - iprevPt.y; + const int 
W_BITS = 14, W_BITS1 = 14; + + int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + + int dstep = (int)(derivI.step/derivI.elemSize1()); + int step = (int)(I.step/I.elemSize1()); + int mStep = (int)(winMaskMat.step/winMaskMat.elemSize1()); + CV_Assert( step == (int)(J.step/J.elemSize1()) ); + + float A11 = 0, A12 = 0, A22 = 0; + float b1 = 0, b2 = 0; + float D = 0; + + // extract the patch from the first image, compute covariation matrix of derivatives + int x, y; + for( y = 0; y < winSize.height; y++ ) + { + const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; + const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; + + short* Iptr = (short*)(IWinBuf.data + y*IWinBuf.step); + short* dIptr = (short*)(derivIWinBuf.data + y*derivIWinBuf.step); + x = 0; + for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2 ) + { + if( winMaskMat.at(y,x) == 0) + { + dIptr[0] = 0; + dIptr[1] = 0; + continue; + } + int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + + src[x+step]*iw10 + src[x+step+cn]*iw11, W_BITS1-5); + int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + + dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 + + dsrc[dstep+cn2+1]*iw11, W_BITS1); + Iptr[x] = (short)ival; + dIptr[0] = (short)ixval; + dIptr[1] = (short)iyval; + } + } + + cv::Mat residualMat = cv::Mat::zeros(winSize.height * (winSize.width + 8) * cn, 1, CV_16SC1); + cv::Point2f backUpNextPt = nextPt; + nextPt -= halfWin; + Point2f prevDelta(0,0); //denotes h(t-1) + cv::Size _winSize = winSize; + __m128i mmMask0, mmMask1, mmMask; + getWBitMask(_winSize.width, mmMask0, mmMask1, mmMask); + __m128 mmOnes = _mm_set1_ps(1.f ); + __m128i mmOnes_epi16 = _mm_set1_epi16(1 ); + __m128i mmZeros_epi16 = _mm_set1_epi16(0); + float 
MEstimatorScale = 1; + int buffIdx = 0; + float c[8]; + cv::Mat GMc0, GMc1, GMc2, GMc3; + cv::Vec2f Mc0, Mc1, Mc2, Mc3; + int noIteration = 0; + int noReusedIteration = 0; + int noSolvedIteration = 0; + for( j = 0; j < criteria.maxCount; j++ ) + { + cv::Point2f delta(0,0); + cv::Point2f deltaGain(0,0); + bool hasSolved = false; + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + float ab = a * b; + if( j == 0 + || ( inextPt.x != cvFloor(nextPt.x) || inextPt.y != cvFloor(nextPt.y) || j % 2 != 0 )) + { + inextPt.x = cvFloor(nextPt.x); + inextPt.y = cvFloor(nextPt.y); + + if( inextPt.x < -winSize.width || inextPt.x >= J.cols || + inextPt.y < -winSize.height || inextPt.y >= J.rows ) + { + if( level == 0 && status ) + status[ptidx] = 3; + break; + } + + + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + ab = a * b; + iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + // mismatch + b1 = 0; + b2 = 0; + if( j == 0 ) + { + A11 = 0; + A12 = 0; + A22 = 0; + } + + if ( j == 0 ) + { + buffIdx = 0; + for( y = 0; y < winSize.height; y++ ) + { + const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y )*step + (inextPt.x )*cn; + const short* Iptr = (const short*)(IWinBuf.data + (y)*IWinBuf.step); + const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + x = 0; + for( ; x < winSize.width*cn; x++, dIptr += 2) + { + if( maskPtr[x] == 0) + continue; + int diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, W_BITS1-5) + - Iptr[x]; + residualMat.at(buffIdx++) = static_cast(diff); + } + } + /*! Estimation for the residual */ + cv::Mat residualMatRoi(residualMat, cv::Rect(0,0,1, buffIdx)); + MEstimatorScale = (buffIdx == 0) ? 
1.f : estimateScale(residualMatRoi); + } + + float eta = 1.f / winArea; + float fParam0 = param[0] * 32.f; + float fParam1 = param[1] * 32.f; + fParam0 = param[0] * MEstimatorScale; + fParam1 = param[1] * MEstimatorScale; + + + buffIdx = 0; + float _b0[4] = {0,0,0,0}; + float _b1[4] = {0,0,0,0}; + + /* + */ + for( y = 0; y < _winSize.height; y++ ) + { + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x )*cn; + const short* Iptr = (const short*)(IWinBuf.data + y*IWinBuf.step); + const short* dIptr = (const short*)(derivIWinBuf.data + y*derivIWinBuf.step); + + x = 0; + for( ; x < _winSize.width*cn; x++, dIptr += 2 ) + { + if( maskPtr[x] == 0) + continue; + int illValue = - Iptr[x]; + + float It[4] = {static_cast((Jptr[x+step+cn]<< 5) + illValue), + static_cast((Jptr[x+cn]<< 5) + illValue), + static_cast((Jptr[x+step]<< 5) + illValue), + static_cast((Jptr[x] << 5) + illValue)}; + + + + int J_val = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + + Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, + W_BITS1-5); + + + int diff = J_val + illValue; + + + MEstimatorScale += (diff < MEstimatorScale) ? -eta : eta; + + int abss = (diff < 0) ? 
-diff : diff; + + // compute the missmatch vector + if( j >= 0) + { + if( abss > fParam1) + { + It[0] = 0; + It[1] = 0; + It[2] = 0; + It[3] = 0; + } + else if( abss > fParam0 && diff >= 0 ) + { + It[0] = param[2] * (It[0] - fParam1); + It[1] = param[2] * (It[1] - fParam1); + It[2] = param[2] * (It[2] - fParam1); + It[3] = param[2] * (It[3] - fParam1); + } + else if( abss > fParam0 && diff < 0 ) + { + It[0] = param[2] * (It[0] + fParam1); + It[1] = param[2] * (It[1] + fParam1); + It[2] = param[2] * (It[2] + fParam1); + It[3] = param[2] * (It[3] + fParam1); + } + } + + float It0 = It[0]; + float It1 = It[1]; + float It2 = It[2]; + float It3 = It[3]; + + float ixval = static_cast(dIptr[0]); + float iyval = static_cast(dIptr[1]); + _b0[0] += It0 * ixval; + _b0[1] += It1 * ixval; + _b0[2] += It2 * ixval; + _b0[3] += It3 * ixval; + + + _b1[0] += It0*iyval; + _b1[1] += It1*iyval; + _b1[2] += It2*iyval; + _b1[3] += It3*iyval; + + + // compute the Gradient Matrice + if( j == 0) + { + float tale = param[2] * FLT_RESCALE; + if( abss < fParam0 || j < 0) + { + tale = FLT_RESCALE; + } + else if( abss > fParam1) + { + tale *= 0.01f; + } + else + { + tale *= param[2]; + } + + A11 += (float)(ixval*ixval)*tale; + A12 += (float)(ixval*iyval)*tale; + A22 += (float)(iyval*iyval)*tale; + + } + } + } + + if( j == 0 ) + { + + A11 *= FLT_SCALE; // 54866744. 
+ A12 *= FLT_SCALE; // -628764.00 + A22 *= FLT_SCALE; // 19730.000 + + D = A11 * A22 - A12 * A12; + float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + + 4.f*A12*A12))/(2*winArea); + + if( minEig < minEigThreshold || std::abs(D) < FLT_EPSILON) + { + if( level == 0 && status ) + status[ptidx] = 0; + if( level > 0) + { + nextPts[ptidx] = backUpNextPt; + } + noIteration++; + break; + } + + D = (1.f / D); + + } + + _b0[0] *= FLT_SCALE;_b0[1] *= FLT_SCALE;_b0[2] *= FLT_SCALE;_b0[3] *= FLT_SCALE; + _b1[0] *= FLT_SCALE;_b1[1] *= FLT_SCALE;_b1[2] *= FLT_SCALE;_b1[3] *= FLT_SCALE; + + + Mc0[0] = _b0[0] - _b0[1] - _b0[2] + _b0[3]; + Mc0[1] = _b1[0] - _b1[1] - _b1[2] + _b1[3]; + + Mc1[0] = _b0[1] - _b0[3]; + Mc1[1] = _b1[1] - _b1[3]; + + + Mc2[0] = _b0[2] - _b0[3]; + Mc2[1] = _b1[2] - _b1[3]; + + + Mc3[0] = _b0[3]; + Mc3[1] = _b1[3]; + + c[0] = -Mc0[0]; + c[1] = -Mc1[0]; + c[2] = -Mc2[0]; + c[3] = -Mc3[0]; + c[4] = -Mc0[1]; + c[5] = -Mc1[1]; + c[6] = -Mc2[1]; + c[7] = -Mc3[1]; + + float e0 = 1.f / (c[6] * c[0] - c[4] * c[2]); + float e1 = e0 * 0.5f * (c[6] * c[1] + c[7] * c[0] - c[5] * c[2] - c[4] * c[3]); + float e2 = e0 * (c[1] * c[7] -c[3] * c[5]); + e0 = e1 * e1 - e2; + hasSolved = false; + if ( e0 > 0) + { + e0 = sqrt(e0); + float _y[2] = {-e1 - e0, e0 - e1}; + float c0yc1[2] = {c[0] * _y[0] + c[1], + c[0] * _y[1] + c[1]}; + float _x[2] = {-(c[2] * _y[0] + c[3]) / c0yc1[0], + -(c[2] * _y[1] + c[3]) / c0yc1[1]}; + float c0yc2[2] = {c[0] * _y[0] + c[2], + c[0] * _y[1] + c[2]}; + + float c4yc6[2] = {c[4] * _y[0] + c[6], + c[4] * _y[1] + c[6]}; + + float c0xc1[2] = {c[0] * _x[0] + c[1], + c[0] * _x[1] + c[1]}; + + float c4xc5[2] = {c[4] * _x[0] + c[5], + c[4] * _x[1] + c[5]}; + bool isIn1 = (_x[0] >=0 && _x[0] <=1 && _y[0] >= 0 && _y[0] <= 1); + bool isIn2 = (_x[1] >=0 && _x[1] <=1 && _y[1] >= 0 && _y[1] <= 1); + + bool isSolution1 = checkSolution(_x[0], _y[0], c ); + bool isSolution2 = checkSolution(_x[1], _y[1], c ); + bool isSink1 = isIn1 && isSolution1; + bool 
isSink2 = isIn2 && isSolution2; + + if ( isSink1 != isSink2) + { + a = isSink1 ? _x[0] : _x[1]; + b = isSink1 ? _y[0] : _y[1]; + ab = a * b; + hasSolved = true; + delta.x = inextPt.x + a - nextPt.x; + delta.y = inextPt.y + b - nextPt.y; + } // isIn1 != isIn2 + } + if( hasSolved == false) + noIteration++; + } + else + { + hasSolved = false; + noReusedIteration++; + } + if( hasSolved == false ) + { + + cv::Vec2f mismatchVec = ab * Mc0 + Mc1 *a + Mc2 * b + Mc3; + delta.x = (A12 * mismatchVec.val[1] - A22 * mismatchVec.val[0]) * D; + delta.y = (A12 * mismatchVec.val[0] - A11 * mismatchVec.val[1]) * D; + delta.x = MAX(-1.f, MIN(1.f, delta.x)); + delta.y = MAX(-1.f, MIN(1.f, delta.y)); + nextPt += delta; + nextPts[ptidx] = nextPt + halfWin; + } + else + { + nextPt += delta; + nextPts[ptidx] = nextPt + halfWin; + noSolvedIteration++; + break; + } + + delta.x = ( delta.x != delta.x) ? 0 : delta.x; + delta.y = ( delta.y != delta.y) ? 0 : delta.y; + + + cv::Point2f totalDistance = backUpNextPt - nextPts[ptidx]; + + if(j > 0 && ( + (std::abs(delta.x - prevDelta.x) < 0.01 && std::abs(delta.y - prevDelta.y) < 0.01))) + { + nextPts[ptidx] -= delta*0.5f; + break; + } + + + prevDelta = delta; + } + + } + + } + + const Mat* prevImg; + const Mat* nextImg; + const Mat* prevDeriv; + const Mat* rgbPrevImg; + const Mat* rgbNextImg; + const Point2f* prevPts; + Point2f* nextPts; + uchar* status; + float* err; + int maxWinSize; + int minWinSize; + TermCriteria criteria; + int level; + int maxLevel; + int windowType; + float minEigThreshold; + bool useInitialFlow; + std::vector paramReset; + int crossSegmentationThreshold; + + }; + } + namespace radial + { + class TrackerInvoker : public cv::ParallelLoopBody + { + public: + TrackerInvoker( + const Mat& _prevImg, + const Mat& _prevDeriv, + const Mat& _nextImg, + const Mat& _rgbPrevImg, + const Mat& _rgbNextImg, + const Point2f* _prevPts, + Point2f* _nextPts, + uchar* _status, + float* _err, + Point2f* _gainVecs, + int _level, + int 
_maxLevel, + int _winSize[2], + int _maxIteration, + bool _useInitialFlow, + int _supportRegionType, + int _crossSegmentationThreshold, + std::vector _normSigmas, + float _minEigenValue) + { + prevImg = &_prevImg; + prevDeriv = &_prevDeriv; + nextImg = &_nextImg; + rgbPrevImg = &_rgbPrevImg; + rgbNextImg = &_rgbNextImg; + prevPts = _prevPts; + nextPts = _nextPts; + status = _status; + err = _err; + gainVecs = _gainVecs; + minWinSize = _winSize[0]; + maxWinSize = _winSize[1]; + criteria.maxCount = _maxIteration; + criteria.epsilon = 0.01; + level = _level; + maxLevel = _maxLevel; + windowType = _supportRegionType; + minEigThreshold = _minEigenValue; + paramReset = _normSigmas; + useInitialFlow = _useInitialFlow; + crossSegmentationThreshold = _crossSegmentationThreshold; + } + + TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; + void operator()(const cv::Range& range) const + { + Point2f halfWin; + cv::Size winSize; + const Mat& I = *prevImg; + const Mat& J = *nextImg; + const Mat& derivI = *prevDeriv; + const Mat& BI = *rgbPrevImg; + const float FLT_SCALE = (1.f/(1 << 16));//(1.f/(1 << 20)); // 20 + winSize = cv::Size(maxWinSize,maxWinSize); + int winMaskwidth = static_cast(ceil(winSize.width / 16.f)) * 16; + cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); + winMaskMatBuf.setTo(1); + + std::vector param = paramReset; + int cn = I.channels(), cn2 = cn*2; + int winbufwidth = static_cast(ceil(winSize.width / 16.f)) * 16; + cv::Size winBufSize(winbufwidth,winbufwidth); + + + cv::Mat invTensorMat(4,4, CV_32FC1); + cv::Mat mismatchMat(4, 1, CV_32FC1); + cv::Mat resultMat(4, 1, CV_32FC1); + + cv::AutoBuffer _buf(winBufSize.area()*(cn + cn2)); + int derivDepth = DataType::depth; + + Mat IWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn), (deriv_type*)_buf.data()); + Mat derivIWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn2), (deriv_type*)_buf.data() + winBufSize.area()*cn); + + for( int ptidx = range.start; ptidx < range.end; ptidx++ ) + { + 
Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level)); + Point2f nextPt; + if( level == maxLevel ) + { + if( useInitialFlow ) + nextPt = nextPts[ptidx]*(float)(1./(1 << level)); + else + nextPt = prevPt; + } + else + nextPt = nextPts[ptidx]*2.f; + nextPts[ptidx] = nextPt; + + Point2i iprevPt, inextPt; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + int winArea = maxWinSize * maxWinSize; + cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0,0, maxWinSize,maxWinSize)); + winMaskMatBuf.setTo(0); + if( calcWinMaskMat(BI, windowType, iprevPt, + winMaskMat,winSize,halfWin,winArea, + minWinSize,maxWinSize, crossSegmentationThreshold) == false) + continue; + + prevPt -= halfWin; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + + if( iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols || + iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows ) + { + if( level == 0 ) + { + if( status ) + status[ptidx] = 3; + if( err ) + err[ptidx] = 0; + } + continue; + } + + float a = prevPt.x - iprevPt.x; + float b = prevPt.y - iprevPt.y; + const int W_BITS = 14, W_BITS1 = 14; + + int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + + int dstep = (int)(derivI.step/derivI.elemSize1()); + int step = (int)(I.step/I.elemSize1()); + int mStep = (int)(winMaskMat.step/winMaskMat.elemSize1()); + CV_Assert( step == (int)(J.step/J.elemSize1()) ); + + float A11 = 0, A12 = 0, A22 = 0; + float b1 = 0, b2 = 0, b3 = 0, b4 = 0; + + // tensor + float sumIx = 0; + float sumIy = 0; + float sumI = 0; + float sumW = 0; + float w1 = 0, w2 = 0; // -IyI + float dI = 0; // I^2 + float D = 0; + + #ifdef CV_SSE2 + __m128i rightMask = _mm_set_epi16(0, std::numeric_limits::max(), 0, std::numeric_limits::max(), 0, std::numeric_limits::max(), 0, std::numeric_limits::max()); + __m128i leftMask = 
_mm_set_epi16(std::numeric_limits::max(), 0, std::numeric_limits::max(), 0, std::numeric_limits::max(), 0, std::numeric_limits::max(), 0); + + __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128i z = _mm_setzero_si128(); + __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1-1)); + __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1-5-1)); + __m128i mmMask4_epi32; + __m128i mmMaskSet_epi16 = _mm_set1_epi16(std::numeric_limits::max()); + get4BitMask(winSize.width, mmMask4_epi32); + __m128 qA11 = _mm_setzero_ps(), qA12 = _mm_setzero_ps(), qA22 = _mm_setzero_ps(); + #endif + + // extract the patch from the first image, compute covariation matrix of derivatives + int x, y; + for( y = 0; y < winSize.height; y++ ) + { + x = 0; + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; + const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; + + short* Iptr = (short*)(IWinBuf.data + y*IWinBuf.step); + short* dIptr = (short*)(derivIWinBuf.data + y*derivIWinBuf.step); + + + + #ifdef CV_SSE2 + for( ; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 ) + { + __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16); + __m128i mask_0_3_epi16 = _mm_unpacklo_epi16(mask_0_7_epi16, mask_0_7_epi16); + + + __m128i v00, v01, v10, v11, t0, t1; + v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); + v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); + v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); + v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), 
W_BITS1-5); + if( x + 4 > winSize.width) + { + t0 = _mm_and_si128(t0, mmMask4_epi32); + } + t0 = _mm_and_si128(t0, mask_0_3_epi16); + _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0,t0)); + + + v00 = _mm_loadu_si128((const __m128i*)(dsrc)); + v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); + v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep)); + v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); + v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... + if( x + 4 > winSize.width) + { + v00 = _mm_and_si128(v00, mmMask4_epi32); + } + v00 = _mm_and_si128(v00, mask_0_3_epi16); + _mm_storeu_si128((__m128i*)dIptr, v00); + } + #else + + for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2 ) + { + if( winMaskMat.at(y,x) == 0) + { + dIptr[0] = 0; + dIptr[1] = 0; + continue; + } + int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + + src[x+step]*iw10 + src[x+step+cn]*iw11, W_BITS1-5); + int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + + dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 + + dsrc[dstep+cn2+1]*iw11, W_BITS1); + + Iptr[x] = (short)ival; + dIptr[0] = (short)ixval; + dIptr[1] = (short)iyval; + + } + #endif + } + + cv::Mat residualMat = cv::Mat::zeros(winSize.height * (winSize.width + 8) * cn, 1, CV_16SC1); + + cv::Point2f backUpNextPt = nextPt; + nextPt -= halfWin; + Point2f prevDelta(0,0); //relates to h(t-1) + Point2f prevGain(1,0); + cv::Point2f gainVec = gainVecs[ptidx]; + cv::Point2f backUpGain = gainVec; + cv::Size _winSize = winSize; + int j; + __m128i mmMask0, mmMask1, 
mmMask; + getWBitMask(_winSize.width, mmMask0, mmMask1, mmMask); + __m128 mmOnes = _mm_set1_ps(1.f ); + __m128i mmOnes_epi16 = _mm_set1_epi16(1 ); + __m128i mmZeros_epi16 = _mm_set1_epi16(0); + float MEstimatorScale = 1; + int buffIdx = 0; + float c[8]; + cv::Mat GMc0, GMc1, GMc2, GMc3; + cv::Vec4f Mc0, Mc1, Mc2, Mc3; + int noIteration = 0; + int noReusedIteration = 0; + int noSolvedIteration = 0; + for( j = 0; j < criteria.maxCount; j++ ) + { + cv::Point2f delta(0,0); + cv::Point2f deltaGain(0,0); + bool hasSolved = false; + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + float ab = a * b; + if( j == 0 + || (inextPt.x != cvFloor(nextPt.x) || inextPt.y != cvFloor(nextPt.y) || j % 2 != 0) ) + { + inextPt.x = cvFloor(nextPt.x); + inextPt.y = cvFloor(nextPt.y); + + if( inextPt.x < -winSize.width || inextPt.x >= J.cols || + inextPt.y < -winSize.height || inextPt.y >= J.rows ) + { + if( level == 0 && status ) + status[ptidx] = 3; + if (level > 0) + { + nextPts[ptidx] = backUpNextPt; + gainVecs[ptidx] = backUpGain; + } + noIteration++; + break; + } + + + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + ab = a * b; + iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + // mismatch + b1 = 0; + b2 = 0; + b3 = 0; + b4 = 0; + if( j == 0) + { + // tensor + w1 = 0; // -IxI + w2 = 0; // -IyI + dI = 0; // I^2 + sumIx = 0; + sumIy = 0; + sumI = 0; + sumW = 0; + A11 = 0; + A12 = 0; + A22 = 0; + } + + if ( j == 0 ) + { + buffIdx = 0; + for( y = 0; y < winSize.height; y++ ) + { + const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y )*step + (inextPt.x )*cn; + const short* Iptr = (const short*)(IWinBuf.data + (y)*IWinBuf.step); + const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + x = 0; + for( ; x < winSize.width*cn; x++, dIptr 
+= 2) + { + if( maskPtr[x] == 0) + continue; + int diff = static_cast(CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, W_BITS1-5) + - Iptr[x] + Iptr[x] * gainVec.x +gainVec.y); + residualMat.at(buffIdx++) = static_cast(diff); + } + } + /*! Estimation for the residual */ + cv::Mat residualMatRoi(residualMat, cv::Rect(0,0,1, buffIdx)); + MEstimatorScale = (buffIdx == 0) ? 1.f : estimateScale(residualMatRoi); + } + + float eta = 1.f / winArea; + float fParam0 = param[0] * 32.f; + float fParam1 = param[1] * 32.f; + fParam0 = param[0] * MEstimatorScale; + fParam1 = param[1] * MEstimatorScale; + +#ifdef CV_SSE2 + + qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128 qb0[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; + __m128 qb1[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; + __m128 qb2[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; + __m128 qb3[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; + __m128 mmSumW1 = _mm_setzero_ps(), mmSumW2 = _mm_setzero_ps(); + __m128 mmSumI = _mm_setzero_ps(), mmSumW = _mm_setzero_ps(), mmSumDI = _mm_setzero_ps(); + __m128 mmSumIy = _mm_setzero_ps(), mmSumIx = _mm_setzero_ps(); + __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps(); + __m128i mmParam0 = _mm_set1_epi16(MIN(std::numeric_limits::max() -1, static_cast(fParam0))); + __m128i mmParam1 = _mm_set1_epi16(MIN(std::numeric_limits::max()- 1, static_cast(fParam1))); + + + float s2Val = param[2] > 0 ? param[2] : -param[2]; + int s2bitShift = param[2] == 0 ? 
1 : static_cast(ceil(log(200.f / s2Val) / log(2.f))); + __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast(param[2] * (float)(1 << s2bitShift))); + __m128i mmOness_epi16 = _mm_set1_epi16(1 << s2bitShift); + __m128 mmParam2 = _mm_set1_ps(param[2]); + __m128 mmParam2s = _mm_set1_ps(0.01f * param[2]); + __m128 mmParam2s2 = _mm_set1_ps(param[2] * param[2]); + float gainVal = gainVec.x > 0 ? gainVec.x : -gainVec.x; + int bitShift = gainVec.x == 0 ? 1 : static_cast(ceil(log(200.f / gainVal) / log(2.f))); + __m128i mmGainValue_epi16 = _mm_set1_epi16(static_cast(gainVec.x * (float)(1 << bitShift))); + __m128i mmConstValue_epi16 = _mm_set1_epi16(static_cast(gainVec.y)); + __m128i mmEta = _mm_setzero_si128(); + __m128i mmEtaNeg = _mm_set1_epi16(-1); + __m128i mmScale = _mm_set1_epi16(static_cast(MEstimatorScale)); + +#endif + + buffIdx = 0; + float _b0[4] = {0,0,0,0}; + float _b1[4] = {0,0,0,0}; + float _b2[4] = {0,0,0,0}; + float _b3[4] = {0,0,0,0}; + float _bm[4] = {0,0,0,0}; + float _bc[4] = {0,0,0,0}; + /* + */ + for( y = 0; y < _winSize.height; y++ ) + { + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x )*cn; + const short* Iptr = (const short*)(IWinBuf.data + y*IWinBuf.step); + const short* dIptr = (const short*)(derivIWinBuf.data + y*derivIWinBuf.step); + + x = 0; +#ifdef CV_SSE2 + for( ; x <= _winSize.width*cn; x += 8, dIptr += 8*2 ) + { + __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16); + __m128i I_0_7_epi16 = _mm_loadu_si128((const __m128i*)(Iptr + x)); + + __m128i v00 = _mm_unpacklo_epi8( + _mm_loadl_epi64((const __m128i*)(Jptr + x)), z); + __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); + __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); + __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + 
x + step + cn)), z); + + __m128i t0 = _mm_add_epi32 + (_mm_madd_epi16( + _mm_unpacklo_epi16(v00, v01), + qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1-5); + + __m128i lo = _mm_mullo_epi16(mmGainValue_epi16, I_0_7_epi16); + __m128i hi = _mm_mulhi_epi16(mmGainValue_epi16, I_0_7_epi16); + + __m128i Igain_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), bitShift); + __m128i Igain_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), bitShift); + __m128i Igain_epi16 = _mm_packs_epi32(Igain_0_3_epi32, Igain_4_7_epi32); + + + __m128i diffValue = _mm_subs_epi16(_mm_add_epi16(Igain_epi16, mmConstValue_epi16), I_0_7_epi16); + __m128i mmDiffc_epi16[4] = + { + _mm_add_epi16(_mm_slli_epi16(v11, 5), diffValue), + _mm_add_epi16(_mm_slli_epi16(v01, 5), diffValue), + _mm_add_epi16(_mm_slli_epi16(v10, 5), diffValue), + _mm_add_epi16(_mm_slli_epi16(v00, 5), diffValue) + }; + + __m128i mmDiff_epi16 = _mm_add_epi16( _mm_packs_epi32(t0, t1), diffValue); + + + mmDiff_epi16 = _mm_and_si128(mmDiff_epi16, mask_0_7_epi16); + + __m128i scalediffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, mmScale); + mmEta = _mm_add_epi16(mmEta, _mm_add_epi16(_mm_and_si128(scalediffIsPos_epi16, _mm_set1_epi16(2)), _mm_set1_epi16(-1))); + + + __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); + __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); + + + __m128i abs_epi16 = _mm_abs_epi16(mmDiff_epi16); + __m128i bSet2_epi16, bSet1_epi16; + // |It| < sigma1 ? + bSet2_epi16 = _mm_cmplt_epi16(abs_epi16, mmParam1); + // It > 0 ? + __m128i diffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, _mm_setzero_si128()); + // sigma0 < |It| < sigma1 ? 
+ bSet1_epi16 = _mm_and_si128(bSet2_epi16, _mm_cmpgt_epi16(abs_epi16, mmParam0)); + // val = |It| -/+ sigma1 + __m128i tmpParam1_epi16 = _mm_add_epi16(_mm_and_si128(diffIsPos_epi16, _mm_sub_epi16(mmDiff_epi16, mmParam1)), + _mm_andnot_si128(diffIsPos_epi16, _mm_add_epi16(mmDiff_epi16, mmParam1))); + // It == 0 ? |It| > sigma13 + mmDiff_epi16 = _mm_and_si128(bSet2_epi16, mmDiff_epi16); + + for( unsigned int mmi = 0; mmi < 4; mmi++) + { + __m128i mmDiffc_epi16_t = _mm_and_si128(mmDiffc_epi16[mmi], mask_0_7_epi16); + mmDiffc_epi16_t = _mm_and_si128(bSet2_epi16, mmDiffc_epi16_t); + + // It == val ? sigma0 < |It| < sigma1 + mmDiffc_epi16_t = _mm_blendv_epi8(mmDiffc_epi16_t, tmpParam1_epi16, bSet1_epi16); + __m128i tale_epi16_ = _mm_blendv_epi8(mmOness_epi16, mmParam2_epi16, bSet1_epi16); // mask for 0 - 3 + // diff = diff * sigma2 + lo = _mm_mullo_epi16(tale_epi16_, mmDiffc_epi16_t); + hi = _mm_mulhi_epi16(tale_epi16_, mmDiffc_epi16_t); + __m128i diff_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), s2bitShift); + __m128i diff_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), s2bitShift); + + mmDiffc_epi16_t = _mm_packs_epi32(diff_0_3_epi32, diff_4_7_epi32); + __m128i diff1 = _mm_unpackhi_epi16(mmDiffc_epi16_t, mmDiffc_epi16_t); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13... + __m128i diff0 = _mm_unpacklo_epi16(mmDiffc_epi16_t, mmDiffc_epi16_t); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9... + + // Ix * diff / Iy * diff + v10 = _mm_mullo_epi16(Ixy_0, diff0); + v11 = _mm_mulhi_epi16(Ixy_0, diff0); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + + qb0[mmi] = _mm_add_ps(qb0[mmi], _mm_cvtepi32_ps(v00)); + qb1[mmi] = _mm_add_ps(qb1[mmi], _mm_cvtepi32_ps(v10)); + // It * Ix It * Iy [4 ... 
7] + // for set 1 hi sigma 1 + v10 = _mm_mullo_epi16(Ixy_1, diff1); + v11 = _mm_mulhi_epi16(Ixy_1, diff1); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb0[mmi] = _mm_add_ps(qb0[mmi], _mm_cvtepi32_ps(v00)); + qb1[mmi] = _mm_add_ps(qb1[mmi], _mm_cvtepi32_ps(v10)); + // diff * J [0 ... 7] + // for set 1 sigma 1 + // b3 += diff * Iptr[x] + v10 = _mm_mullo_epi16(mmDiffc_epi16_t, I_0_7_epi16); + v11 = _mm_mulhi_epi16(mmDiffc_epi16_t, I_0_7_epi16); + v00 = _mm_unpacklo_epi16(v10, v11); + + v10 = _mm_unpackhi_epi16(v10, v11); + qb2[mmi] = _mm_add_ps(qb2[mmi], _mm_cvtepi32_ps(v00)); + qb2[mmi] = _mm_add_ps(qb2[mmi], _mm_cvtepi32_ps(v10)); + qb3[mmi] = _mm_add_ps(qb3[mmi], _mm_cvtepi32_ps(diff_0_3_epi32)); + qb3[mmi] = _mm_add_ps(qb3[mmi], _mm_cvtepi32_ps(diff_4_7_epi32)); + } + + if( j == 0 ) + { + __m128 bSet1_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet1_epi16)); + __m128 bSet1_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet1_epi16,bSet1_epi16), 16)); + __m128 mask_0_4_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mask_0_7_epi16)); + __m128 mask_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mask_0_7_epi16, mask_0_7_epi16),16))); + + __m128 bSet2_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet2_epi16)); + __m128 bSet2_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet2_epi16, bSet2_epi16),16)); + + __m128 tale_0_3_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_0_3_ps); + __m128 tale_4_7_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_4_7_ps); + tale_0_3_ps = _mm_blendv_ps(mmParam2s, tale_0_3_ps, bSet2_0_3_ps); + tale_4_7_ps = _mm_blendv_ps(mmParam2s, tale_4_7_ps, bSet2_4_7_ps); + + tale_0_3_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_0_3_ps, mask_0_4_ps); + tale_4_7_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_4_7_ps, mask_4_7_ps); + + t0 = _mm_srai_epi32(Ixy_0, 16); // Iy0 Iy1 Iy2 Iy3 + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16); // Ix0 Ix1 Ix2 Ix3 + + __m128 fy = _mm_cvtepi32_ps(t0); + __m128 fx = 
_mm_cvtepi32_ps(t1); + + // 0 ... 3 + __m128 I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(I_0_7_epi16, I_0_7_epi16), 16)); + + // A11 - A22 + __m128 fxtale = _mm_mul_ps(fx, tale_0_3_ps); + __m128 fytale = _mm_mul_ps(fy, tale_0_3_ps); + + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); + mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); + mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); + + // sumIx und sumIy + mmSumIx = _mm_add_ps(mmSumIx, fxtale); + mmSumIy = _mm_add_ps(mmSumIy, fytale); + + mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fxtale)); + mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fytale)); + + // sumI + __m128 I_tale_ps = _mm_mul_ps(I_ps, tale_0_3_ps); + mmSumI = _mm_add_ps(mmSumI,I_tale_ps); + + // sumW + mmSumW = _mm_add_ps(mmSumW, tale_0_3_ps); + + // sumDI + mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps( I_ps, I_tale_ps)); + + + t0 = _mm_srai_epi32(Ixy_1, 16); // Iy8 Iy9 Iy10 Iy11 + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_1, 16), 16); // Ix0 Ix1 Ix2 Ix3 + + fy = _mm_cvtepi32_ps(t0); + fx = _mm_cvtepi32_ps(t1); + + // 4 ... 
7 + I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(I_0_7_epi16, I_0_7_epi16), 16)); + + // A11 - A22 + fxtale = _mm_mul_ps(fx, tale_4_7_ps); + fytale = _mm_mul_ps(fy, tale_4_7_ps); + + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); + mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); + mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); + + // sumIx und sumIy + mmSumIx = _mm_add_ps(mmSumIx, fxtale); + mmSumIy = _mm_add_ps(mmSumIy, fytale); + + mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fxtale)); + mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fytale)); + + // sumI + I_tale_ps = _mm_mul_ps(I_ps, tale_4_7_ps); + mmSumI = _mm_add_ps(mmSumI, I_tale_ps); + + // sumW + mmSumW = _mm_add_ps(mmSumW, tale_4_7_ps); + + // sumDI + mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps( I_ps, I_tale_ps)); + } + + } +#else + for( ; x < _winSize.width*cn; x++, dIptr += 2 ) + { + if( maskPtr[x] == 0) + continue; + + short ixval = static_cast(dIptr[0]); + short iyval = static_cast(dIptr[1]); + int illValue = Iptr[x] * gainVec.x + gainVec.y - Iptr[x]; + + int It[4] = {(Jptr[x+step+cn]<< 5) + illValue, + (Jptr[x+cn]<< 5) + illValue, + (Jptr[x+step]<< 5) + illValue, + (Jptr[x] << 5) + illValue}; + + + float J = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + + Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, + W_BITS1-5); + + int diff = J + illValue; + + + + MEstimatorScale += (diff < MEstimatorScale) ? -eta : eta; + + int abss = (diff < 0) ? 
-diff : diff; + + + // compute the missmatch vector + if( j >= 0) + { + if( abss > fParam1) + { + It[0] = 0; + It[1] = 0; + It[2] = 0; + It[3] = 0; + } + else if( abss > fParam0 && diff >= 0 ) + { + It[0] = param[2] * (It[0] - fParam1); + It[1] = param[2] * (It[1] - fParam1); + It[2] = param[2] * (It[2] - fParam1); + It[3] = param[2] * (It[3] - fParam1); + } + else if( abss > fParam0 && diff < 0 ) + { + It[0] = param[2] * (It[0] + fParam1); + It[1] = param[2] * (It[1] + fParam1); + It[2] = param[2] * (It[2] + fParam1); + It[3] = param[2] * (It[3] + fParam1); + } + } + _b0[0] += (float)(It[0]*dIptr[0]) ; + _b0[1] += (float)(It[1]*dIptr[0]) ; + _b0[2] += (float)(It[2]*dIptr[0]) ; + _b0[3] += (float)(It[3]*dIptr[0]) ; + + + _b1[0] += (float)(It[0]*dIptr[1]) ; + _b1[1] += (float)(It[1]*dIptr[1]) ; + _b1[2] += (float)(It[2]*dIptr[1]) ; + _b1[3] += (float)(It[3]*dIptr[1]) ; + + _b2[0] += (float)(It[0])*Iptr[x] ; + _b2[1] += (float)(It[1])*Iptr[x] ; + _b2[2] += (float)(It[2])*Iptr[x] ; + _b2[3] += (float)(It[3])*Iptr[x] ; + + _b3[0] += (float)(It[0]); + _b3[1] += (float)(It[1]); + _b3[2] += (float)(It[2]); + _b3[3] += (float)(It[3]); + + // compute the Gradient Matrice + if( j == 0) + { + float tale = param[2] * FLT_RESCALE; + if( abss < fParam0 || j < 0) + { + tale = FLT_RESCALE; + } + else if( abss > fParam1) + { + tale *= 0.01f; + } + else + { + tale *= param[2]; + } + if( j == 0 ) + { + A11 += (float)(ixval*ixval)*tale; + A12 += (float)(ixval*iyval)*tale; + A22 += (float)(iyval*iyval)*tale; + } + + dI += Iptr[x] * Iptr[x] * tale; + float dx = static_cast(dIptr[0]) * tale; + float dy = static_cast(dIptr[1]) * tale; + sumIx += dx; + sumIy += dy; + w1 += dx * Iptr[x]; + w2 += dy * Iptr[x]; + sumI += Iptr[x] * tale; + sumW += tale; + } + + } +#endif + } + +#ifdef CV_SSE2 + short etaValues[8]; + _mm_storeu_si128((__m128i*)(etaValues), mmEta); + MEstimatorScale += eta * (etaValues[0] + etaValues[1] + etaValues[2] + etaValues[3] + + etaValues[4] + etaValues[5] + etaValues[6] 
+ etaValues[7]); + +#endif + float CV_DECL_ALIGNED(32) wbuf[4];// + if( j == 0 ) + { +#ifdef CV_SSE2 + _mm_store_ps(wbuf, mmSumW1); + w1 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumW2); + w2 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumDI); + dI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumI); + sumI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumIx); + sumIx = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumIy); + sumIy = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumW); + sumW = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; +#endif + sumIx *= -FLT_SCALE; + sumIy *= -FLT_SCALE; + sumI *=FLT_SCALE; + sumW *= FLT_SCALE; + w1 *= -FLT_SCALE; + w2 *= -FLT_SCALE; + dI *= FLT_SCALE; + + +#ifdef CV_SSE2 + float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];// + + _mm_store_ps(A11buf, mmAxx); + _mm_store_ps(A12buf, mmAxy); + _mm_store_ps(A22buf, mmAyy); + + + A11 = A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; + A12 = A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; + A22 = A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; +#endif + A11 *= FLT_SCALE; // 54866744. 
+ A12 *= FLT_SCALE; // -628764.00 + A22 *= FLT_SCALE; // 19730.000 + + D = - A12*A12*sumI*sumI + dI*sumW*A12*A12 + 2*A12*sumI*sumIx*w2 + 2*A12*sumI*sumIy*w1 + - 2*dI*A12*sumIx*sumIy - 2*sumW*A12*w1*w2 + A11*A22*sumI*sumI - 2*A22*sumI*sumIx*w1 + - 2*A11*sumI*sumIy*w2 - sumIx*sumIx*w2*w2 + A22*dI*sumIx*sumIx + 2*sumIx*sumIy*w1*w2 + - sumIy*sumIy*w1*w1 + A11*dI*sumIy*sumIy + A22*sumW*w1*w1 + A11*sumW*w2*w2 - A11*A22*dI*sumW; + + float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + + 4.f*A12*A12))/(2*winArea); + if( minEig < minEigThreshold || std::abs(D) < FLT_EPSILON ) + { + if( level == 0 && status ) + status[ptidx] = 0; + if( level > 0) + { + nextPts[ptidx] = backUpNextPt; + gainVecs[ptidx] = backUpGain; + } + noIteration++; + break; + } + + + D = (1.f / D); + + invTensorMat.at(0,0) = (A22*sumI*sumI - 2*sumI*sumIy*w2 + dI*sumIy*sumIy + sumW*w2*w2 - A22*dI*sumW)* D; + invTensorMat.at(0,1) = (A12*dI*sumW - A12*sumI * sumI - dI*sumIx*sumIy + sumI*sumIx*w2 + sumI*sumIy*w1 - sumW*w1*w2)* D; + invTensorMat.at(0,2) = (A12*sumI*sumIy - sumIy*sumIy*w1 - A22*sumI*sumIx - A12*sumW*w2 + A22*sumW*w1 + sumIx*sumIy*w2)* D; + invTensorMat.at(0,3) = (A22*dI*sumIx - A12*dI*sumIy - sumIx*w2*w2 + A12*sumI*w2 - A22*sumI*w1 + sumIy*w1*w2) * D; + invTensorMat.at(1,0) = invTensorMat.at(0,1); + invTensorMat.at(1,1) = (A11*sumI * sumI - 2*sumI*sumIx*w1 + dI*sumIx * sumIx + sumW*w1*w1 - A11*dI*sumW) * D; + invTensorMat.at(1,2) = (A12*sumI*sumIx - A11*sumI*sumIy - sumIx * sumIx*w2 + A11*sumW*w2 - A12*sumW*w1 + sumIx*sumIy*w1) * D; + invTensorMat.at(1,3) = (A11*dI*sumIy - sumIy*w1*w1 - A12*dI*sumIx - A11*sumI*w2 + A12*sumI*w1 + sumIx*w1*w2)* D; + invTensorMat.at(2,0) = invTensorMat.at(0,2); + invTensorMat.at(2,1) = invTensorMat.at(1,2); + invTensorMat.at(2,2) = (sumW*A12*A12 - 2*A12*sumIx*sumIy + A22*sumIx*sumIx + A11*sumIy*sumIy - A11*A22*sumW)* D; + invTensorMat.at(2,3) = (A11*A22*sumI - A12*A12*sumI - A11*sumIy*w2 + A12*sumIx*w2 + A12*sumIy*w1 - A22*sumIx*w1)* D; + 
invTensorMat.at(3,0) = invTensorMat.at(0,3); + invTensorMat.at(3,1) = invTensorMat.at(1,3); + invTensorMat.at(3,2) = invTensorMat.at(2,3); + invTensorMat.at(3,3) = (dI*A12*A12 - 2*A12*w1*w2 + A22*w1*w1 + A11*w2*w2 - A11*A22*dI)* D; + } + + + #ifdef CV_SSE2 + float CV_DECL_ALIGNED(16) bbuf[4]; + for(int mmi = 0; mmi < 4; mmi++) + { + + _mm_store_ps(bbuf, _mm_add_ps(qb0[mmi], qb1[mmi])); + _b0[mmi] = bbuf[0] + bbuf[2]; + _b1[mmi] = bbuf[1] + bbuf[3]; + _mm_store_ps(bbuf, qb2[mmi]); + _b2[mmi] = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; + _mm_store_ps(bbuf, qb3[mmi]); + _b3[mmi] = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; + + } + #endif + + _b0[0] *= FLT_SCALE;_b0[1] *= FLT_SCALE;_b0[2] *= FLT_SCALE;_b0[3] *= FLT_SCALE; + _b1[0] *= FLT_SCALE;_b1[1] *= FLT_SCALE;_b1[2] *= FLT_SCALE;_b1[3] *= FLT_SCALE; + _b2[0] *= FLT_SCALE;_b2[1] *= FLT_SCALE;_b2[2] *= FLT_SCALE;_b2[3] *= FLT_SCALE; + _b3[0] *= FLT_SCALE;_b3[1] *= FLT_SCALE;_b3[2] *= FLT_SCALE;_b3[3] *= FLT_SCALE; + + + Mc0[0] = _b0[0] - _b0[1] - _b0[2] + _b0[3]; + Mc0[1] = _b1[0] - _b1[1] - _b1[2] + _b1[3]; + Mc0[2] = -(_b2[0] - _b2[1] - _b2[2] + _b2[3]); + Mc0[3] = -(_b3[0] - _b3[1] - _b3[2] + _b3[3]); + + Mc1[0] = _b0[1] - _b0[3]; + Mc1[1] = _b1[1] - _b1[3]; + Mc1[2] = -(_b2[1] - _b2[3]); + Mc1[3] = -(_b3[1] - _b3[3]); + + + Mc2[0] = _b0[2] - _b0[3]; + Mc2[1] = _b1[2] - _b1[3]; + Mc2[2] = -(_b2[2] - _b2[3]); + Mc2[3] = -(_b3[2] - _b3[3]); + + + Mc3[0] = _b0[3]; + Mc3[1] = _b1[3]; + Mc3[2] = -_b2[3]; + Mc3[3] = -_b3[3]; + + // + c[0] = -Mc0[0]; + c[1] = -Mc1[0]; + c[2] = -Mc2[0]; + c[3] = -Mc3[0]; + c[4] = -Mc0[1]; + c[5] = -Mc1[1]; + c[6] = -Mc2[1]; + c[7] = -Mc3[1]; + + float e0 = 1.f / (c[6] * c[0] - c[4] * c[2]); + float e1 = e0 * 0.5f * (c[6] * c[1] + c[7] * c[0] - c[5] * c[2] - c[4] * c[3]); + float e2 = e0 * (c[1] * c[7] -c[3] * c[5]); + e0 = e1 * e1 - e2; + hasSolved = false; + if ( e0 > 0) + { + e0 = sqrt(e0); + float _y[2] = {-e1 - e0, e0 - e1}; + float c0yc1[2] = {c[0] * _y[0] + c[1], + c[0] * _y[1] + c[1]}; + 
float _x[2] = {-(c[2] * _y[0] + c[3]) / c0yc1[0], + -(c[2] * _y[1] + c[3]) / c0yc1[1]}; + float c0yc2[2] = {c[0] * _y[0] + c[2], + c[0] * _y[1] + c[2]}; + + float c4yc6[2] = {c[4] * _y[0] + c[6], + c[4] * _y[1] + c[6]}; + + float c0xc1[2] = {c[0] * _x[0] + c[1], + c[0] * _x[1] + c[1]}; + + float c4xc5[2] = {c[4] * _x[0] + c[5], + c[4] * _x[1] + c[5]}; + bool isIn1 = (_x[0] >=0 && _x[0] <=1 && _y[0] >= 0 && _y[0] <= 1); + bool isIn2 = (_x[1] >=0 && _x[1] <=1 && _y[1] >= 0 && _y[1] <= 1); + + bool isSolution1 = checkSolution(_x[0], _y[0], c ); + bool isSolution2 = checkSolution(_x[1], _y[1], c ); + bool isSink1 = isIn1 && isSolution1; + bool isSink2 = isIn2 && isSolution2; + + if ( isSink1 != isSink2) + { + a = isSink1 ? _x[0] : _x[1]; + b = isSink1 ? _y[0] : _y[1]; + ab = a * b; + hasSolved = true; + delta.x = inextPt.x + a - nextPt.x; + delta.y = inextPt.y + b - nextPt.y; + + cv::Vec4f mismatchVec = ab * Mc0 + Mc1 *a + Mc2 * b + Mc3; + deltaGain = est_DeltaGain(invTensorMat, mismatchVec); + + } // isIn1 != isIn2 + } + if( hasSolved == false) + noIteration++; + } + else + { + hasSolved = false; + noReusedIteration++; + } + if( hasSolved == false ) + { + + cv::Vec4f mismatchVec = ab * Mc0 + Mc1 *a + Mc2 * b + Mc3; + est_Result(invTensorMat, mismatchVec, delta, deltaGain); + + delta.x = MAX(-1.f, MIN( 1.f , delta.x)); + delta.y = MAX(-1.f, MIN( 1.f , delta.y)); + + + if( j == 0) + prevGain = deltaGain; + gainVec += deltaGain; + nextPt += delta ; + nextPts[ptidx] = nextPt + halfWin; + gainVecs[ptidx]= gainVec; + + } + else + { + nextPt += delta; + nextPts[ptidx] = nextPt + halfWin; + gainVecs[ptidx]= gainVec + deltaGain; + noSolvedIteration++; + break; + } + + + cv::Point2f totalDistance = backUpNextPt - nextPts[ptidx]; + + if(j > 0 && ( + (std::abs(delta.x - prevDelta.x) < 0.01 && std::abs(delta.y - prevDelta.y) < 0.01) + || ((delta.ddot(delta) <= 0.001) && std::abs(prevGain.x - deltaGain.x) < 0.01))) + { + nextPts[ptidx] -= delta*0.5f; + gainVecs[ptidx] -= deltaGain* 
0.5f; + break; + } + + + prevDelta = delta; + prevGain = deltaGain; + } + + } + + } + + const Mat* prevImg; + const Mat* nextImg; + const Mat* prevDeriv; + const Mat* rgbPrevImg; + const Mat* rgbNextImg; + const Point2f* prevPts; + Point2f* nextPts; + uchar* status; + cv::Point2f* gainVecs; // gain vector x -> multiplier y -> offset + float* err; + int maxWinSize; + int minWinSize; + TermCriteria criteria; + int level; + int maxLevel; + int windowType; + float minEigThreshold; + bool useInitialFlow; + std::vector paramReset; + int crossSegmentationThreshold; + + }; + } + } + namespace beplk + { + namespace ica{ + class TrackerInvoker : public cv::ParallelLoopBody + { + public: + TrackerInvoker(const Mat& _prevImg, + const Mat& _prevDeriv, + const Mat& _nextImg, + const Mat& _rgbPrevImg, + const Mat& _rgbNextImg, + const Point2f* _prevPts, + Point2f* _nextPts, + uchar* _status, + float* _err, + int _level, + int _maxLevel, + int _winSize[2], + int _maxIteration, + bool _useInitialFlow, + int _supportRegionType, + int _crossSegmentationThreshold, + float _minEigenValue) + { + prevImg = &_prevImg; + prevDeriv = &_prevDeriv; + nextImg = &_nextImg; + rgbPrevImg = &_rgbPrevImg; + rgbNextImg = &_rgbNextImg; + prevPts = _prevPts; + nextPts = _nextPts; + status = _status; + err = _err; + minWinSize = _winSize[0]; + maxWinSize = _winSize[1]; + criteria.maxCount = _maxIteration; + criteria.epsilon = 0.01; + level = _level; + maxLevel = _maxLevel; + windowType = _supportRegionType; + minEigThreshold = _minEigenValue; + useInitialFlow = _useInitialFlow; + crossSegmentationThreshold = _crossSegmentationThreshold; + } + + TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; + + void operator()(const cv::Range& range) const + { + cv::Size winSize; + cv::Point2f halfWin; + + const Mat& I = *prevImg; + const Mat& J = *nextImg; + const Mat& derivI = *prevDeriv; + const Mat& BI = *rgbPrevImg; + + winSize = cv::Size(maxWinSize,maxWinSize); + int winMaskwidth = 
static_cast(ceil(winSize.width / 8.f)) * 16; + cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); + winMaskMatBuf.setTo(1); + + const float FLT_SCALE = (1.f/(1 << 20)); // 20 + + int j, cn = I.channels(), cn2 = cn*2; + int winbufwidth = static_cast(ceil(winSize.width / 8.f)) * 8; + cv::Size winBufSize(winbufwidth,winbufwidth); + + std::vector _buf(winBufSize.area()*(cn + cn2)); + Mat IWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn), &_buf[0]); + Mat derivIWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn2), &_buf[winBufSize.area()*cn]); + + for( int ptidx = range.start; ptidx < range.end; ptidx++ ) + { + Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level)); + Point2f nextPt; + if( level == maxLevel ) + { + if( useInitialFlow ) + nextPt = nextPts[ptidx]*(float)(1./(1 << level)); + else + nextPt = prevPt; + } + else + nextPt = nextPts[ptidx]*2.f; + nextPts[ptidx] = nextPt; + + Point2i iprevPt, inextPt; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + int winArea = maxWinSize * maxWinSize; + cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0,0, maxWinSize,maxWinSize)); + + if( calcWinMaskMat(BI, windowType, iprevPt, + winMaskMat,winSize,halfWin,winArea, + minWinSize,maxWinSize, crossSegmentationThreshold) == false) + continue; + + prevPt -= halfWin; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + + if( iprevPt.x + halfWin.x < 0 || iprevPt.x + halfWin.x>= derivI.cols || + iprevPt.y + halfWin.y < 0 || iprevPt.y + halfWin.y >= derivI.rows ) + { + if( level == 0 ) + { + if( status ) + status[ptidx] = 3; + if( err ) + err[ptidx] = 0; + } + continue; + } + + float a = prevPt.x - iprevPt.x; + float b = prevPt.y - iprevPt.y; + const int W_BITS = 14, W_BITS1 = 14; + + int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + + int dstep = (int)(derivI.step/derivI.elemSize1()); + int step = 
(int)(I.step/I.elemSize1()); + int mStep = (int)(winMaskMat.step/winMaskMat.elemSize1()); + CV_Assert( step == (int)(J.step/J.elemSize1()) ); + float A11 = 0, A12 = 0, A22 = 0; + + #ifdef CV_SSE2 + __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128i z = _mm_setzero_si128(); + __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1-1)); + __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1-5-1)); + __m128 qA11 = _mm_setzero_ps(), qA12 = _mm_setzero_ps(), qA22 = _mm_setzero_ps(); + __m128i mmMask4_epi32; + get4BitMask(winSize.width, mmMask4_epi32); + #endif + + + int x, y; + for( y = 0; y < winSize.height; y++ ) + { + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; + const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; + + short* Iptr = (short*)(IWinBuf.data + y*IWinBuf.step); + short* dIptr = (short*)(derivIWinBuf.data + y*derivIWinBuf.step); + + x = 0; + + #ifdef CV_SSE2 + for( ; x < winSize.width*cn; x += 4, dsrc += 4*2, dIptr += 4*2 ) + { + __m128i wMask = _mm_set_epi32(MaskSet * maskPtr[x+3], + MaskSet * maskPtr[x+2], + MaskSet * maskPtr[x+1], + MaskSet * maskPtr[x]); + __m128i v00, v01, v10, v11, t0, t1; + v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); + v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); + v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); + v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); + _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0,t0)); + + v00 = _mm_loadu_si128((const __m128i*)(dsrc)); + v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); + v10 
= _mm_loadu_si128((const __m128i*)(dsrc + dstep)); + v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); + v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... + + if( x + 4 > winSize.width) + { + v00 = _mm_and_si128(v00, mmMask4_epi32); + } + v00 = _mm_and_si128(v00, wMask); + + _mm_storeu_si128((__m128i*)dIptr, v00); + t0 = _mm_srai_epi32(v00, 16); // Iy0 Iy1 Iy2 Iy3 + t1 = _mm_srai_epi32(_mm_slli_epi32(v00, 16), 16); // Ix0 Ix1 Ix2 Ix3 + + __m128 fy = _mm_cvtepi32_ps(t0); + __m128 fx = _mm_cvtepi32_ps(t1); + + qA22 = _mm_add_ps(qA22, _mm_mul_ps(fy, fy)); + qA12 = _mm_add_ps(qA12, _mm_mul_ps(fx, fy)); + qA11 = _mm_add_ps(qA11, _mm_mul_ps(fx, fx)); + } + #else + + for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2) + { + if( maskPtr[x] == 0) + { + dIptr[0] = (short)0; + dIptr[1] = (short)0; + + continue; + } + int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + + src[x+step]*iw10 + src[x+step+cn]*iw11, W_BITS1-5); + int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + + dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 + + dsrc[dstep+cn2+1]*iw11, W_BITS1); + + Iptr[x] = (short)ival; + dIptr[0] = (short)ixval; + dIptr[1] = (short)iyval; + A11 += (float)(ixval*ixval); + A12 += (float)(ixval*iyval); + A22 += (float)(iyval*iyval); + + } + + + #endif + + } + + #ifdef CV_SSE2 + float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4]; + _mm_store_ps(A11buf, qA11); + _mm_store_ps(A12buf, qA12); + _mm_store_ps(A22buf, qA22); + A11 += A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; + A12 += A12buf[0] + 
A12buf[1] + A12buf[2] + A12buf[3]; + A22 += A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; + #endif + + A11 *= FLT_SCALE; + A12 *= FLT_SCALE; + A22 *= FLT_SCALE; + + + float D = A11*A22 - A12*A12; + float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + + 4.f*A12*A12))/(2 * winArea); + + if( err ) + err[ptidx] = (float)minEig; + + if( D < FLT_EPSILON ) + { + if( level == 0 && status ) + status[ptidx] = 0; + continue; + } + + D = 1.f/D; + + cv::Point2f backUpNextPt = nextPt; + nextPt -= halfWin; + Point2f prevDelta(0,0); + + float c[8]; + // init the maske + __m128i mmMask_epi32 = _mm_set_epi32(0, std::numeric_limits::max(), 0, std::numeric_limits::max()); + __m128i mmMask_epi16 = _mm_set_epi16(0, std::numeric_limits::max(), 0, std::numeric_limits::max(), + 0, std::numeric_limits::max(), 0, std::numeric_limits::max()); + __m128i mmMask0, mmMask1, mmMask; + getWBitMask(winSize.width, mmMask0, mmMask1, mmMask); + for( j = 0; j < criteria.maxCount; j++ ) + { + cv::Point2f delta; + bool hasSolved = false; + a = nextPt.x - cvFloor(nextPt.x); + b = nextPt.y - cvFloor(nextPt.y); + float ab = a * b; + + + if( (inextPt.x != cvFloor(nextPt.x) || inextPt.y != cvFloor(nextPt.y) || j == 0)) + { + inextPt.x = cvFloor(nextPt.x); + inextPt.y = cvFloor(nextPt.y); + + + if( inextPt.x < -winSize.width || inextPt.x >= J.cols || + inextPt.y < -winSize.height || inextPt.y >= J.rows ) + { + if( level == 0 && status ) + status[ptidx] = 3; + if (level > 0) + nextPts[ptidx] = backUpNextPt; + break; + } + + + iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + + float _b1[4] = {0,0,0,0}; + float _b2[4] = {0,0,0,0}; + #ifdef CV_SSE2 + __m128 qbc0[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; + __m128 qbc1[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; + qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + 
qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps(); + #endif + for( y = 0; y < winSize.height; y++ ) + { + const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x )*cn; + const short* Iptr = (const short*)(IWinBuf.data + ( y)*IWinBuf.step); + const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); + x = 0; + #ifdef CV_SSE2 + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + for( ; x <= winSize.width*cn; x += 8, dIptr += 8*2 ) + { + if( maskPtr[x ] == 0 && maskPtr[x+1] == 0 && maskPtr[x+2] == 0 && maskPtr[x+3] == 0 + && maskPtr[x+4] == 0 && maskPtr[x+5] == 0 && maskPtr[x+6] == 0 && maskPtr[x+7] == 0) + continue; + __m128i diff0 = _mm_loadu_si128((const __m128i*)(Iptr + x)); + __m128i v00 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x)), z); + __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); + __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); + __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); + + __m128i mmDiffc_epi16[4] = + { _mm_subs_epi16(_mm_slli_epi16(v00, 5), diff0), + _mm_subs_epi16(_mm_slli_epi16(v01, 5), diff0), + _mm_subs_epi16(_mm_slli_epi16(v10, 5), diff0), + _mm_subs_epi16(_mm_slli_epi16(v11, 5), diff0) + }; + + __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... 
+ __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); + + if( x > winSize.width*cn - 8) + { + Ixy_0 = _mm_and_si128(Ixy_0, mmMask0); + Ixy_1 = _mm_and_si128(Ixy_1, mmMask1); + } + + __m128i diffc1[4] = + {_mm_unpackhi_epi16(mmDiffc_epi16[0],mmDiffc_epi16[0]), + _mm_unpackhi_epi16(mmDiffc_epi16[1],mmDiffc_epi16[1]), + _mm_unpackhi_epi16(mmDiffc_epi16[2],mmDiffc_epi16[2]), + _mm_unpackhi_epi16(mmDiffc_epi16[3],mmDiffc_epi16[3]) + }; + + __m128i diffc0[4] = + {_mm_unpacklo_epi16(mmDiffc_epi16[0],mmDiffc_epi16[0]), + _mm_unpacklo_epi16(mmDiffc_epi16[1],mmDiffc_epi16[1]), + _mm_unpacklo_epi16(mmDiffc_epi16[2],mmDiffc_epi16[2]), + _mm_unpacklo_epi16(mmDiffc_epi16[3],mmDiffc_epi16[3]) + }; + + + // It * Ix It * Iy [0 ... 3] + //mask split for multiplication + // for set 1 lo sigma 1 + + + v10 = _mm_mullo_epi16(Ixy_0, diffc0[0]); + v11 = _mm_mulhi_epi16(Ixy_0, diffc0[0]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[0] = _mm_add_ps(qbc0[0], _mm_cvtepi32_ps(v00)); + qbc1[0] = _mm_add_ps(qbc1[0], _mm_cvtepi32_ps(v10)); + + v10 = _mm_mullo_epi16(Ixy_0, diffc0[1]); + v11 = _mm_mulhi_epi16(Ixy_0, diffc0[1]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[1] = _mm_add_ps(qbc0[1], _mm_cvtepi32_ps(v00)); + qbc1[1] = _mm_add_ps(qbc1[1], _mm_cvtepi32_ps(v10)); + + v10 = _mm_mullo_epi16(Ixy_0, diffc0[2]); + v11 = _mm_mulhi_epi16(Ixy_0, diffc0[2]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[2] = _mm_add_ps(qbc0[2], _mm_cvtepi32_ps(v00)); + qbc1[2] = _mm_add_ps(qbc1[2], _mm_cvtepi32_ps(v10)); + + v10 = _mm_mullo_epi16(Ixy_0, diffc0[3]); + v11 = _mm_mulhi_epi16(Ixy_0, diffc0[3]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[3] = _mm_add_ps(qbc0[3], _mm_cvtepi32_ps(v00)); + qbc1[3] = _mm_add_ps(qbc1[3], _mm_cvtepi32_ps(v10)); + // It * Ix It * Iy [4 ... 
7] + // for set 1 hi sigma 1 + + v10 = _mm_mullo_epi16(Ixy_1, diffc1[0]); + v11 = _mm_mulhi_epi16(Ixy_1, diffc1[0]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[0] = _mm_add_ps(qbc0[0], _mm_cvtepi32_ps(v00)); + qbc1[0] = _mm_add_ps(qbc1[0], _mm_cvtepi32_ps(v10)); + + v10 = _mm_mullo_epi16(Ixy_1, diffc1[1]); + v11 = _mm_mulhi_epi16(Ixy_1, diffc1[1]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[1] = _mm_add_ps(qbc0[1], _mm_cvtepi32_ps(v00)); + qbc1[1] = _mm_add_ps(qbc1[1], _mm_cvtepi32_ps(v10)); + + v10 = _mm_mullo_epi16(Ixy_1, diffc1[2]); + v11 = _mm_mulhi_epi16(Ixy_1, diffc1[2]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[2] = _mm_add_ps(qbc0[2], _mm_cvtepi32_ps(v00)); + qbc1[2] = _mm_add_ps(qbc1[2], _mm_cvtepi32_ps(v10)); + + v10 = _mm_mullo_epi16(Ixy_1, diffc1[3]); + v11 = _mm_mulhi_epi16(Ixy_1, diffc1[3]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[3] = _mm_add_ps(qbc0[3], _mm_cvtepi32_ps(v00)); + qbc1[3] = _mm_add_ps(qbc1[3], _mm_cvtepi32_ps(v10)); + + } + #else + for( ; x < winSize.width*cn; x++, dIptr += 2 ) + { + if( dIptr[0] == 0 && dIptr[1] == 0) + continue; + /*short It[4] = {CV_DESCALE((Jptr[x]) *(1 << W_BITS),W_BITS1-5) - Iptr[x], + CV_DESCALE((Jptr[x+cn]) *(1 << W_BITS),W_BITS1-5) - Iptr[x], + CV_DESCALE((Jptr[x+step]) *(1 << W_BITS),W_BITS1-5) - Iptr[x], + CV_DESCALE((Jptr[x+step+cn])*(1 << W_BITS),W_BITS1-5) - Iptr[x]};*/ + short It[4] = {(Jptr[x] << 5) - Iptr[x], + (Jptr[x+cn]<< 5) - Iptr[x], + (Jptr[x+step]<< 5) - Iptr[x], + (Jptr[x+step+cn]<< 5) - Iptr[x]}; + + _b1[0] += (float)(It[0]*dIptr[0]); + _b1[1] += (float)(It[1]*dIptr[0]); + _b1[2] += (float)(It[2]*dIptr[0]); + _b1[3] += (float)(It[3]*dIptr[0]); + + _b2[0] += (float)(It[0]*dIptr[1]); + _b2[1] += (float)(It[1]*dIptr[1]); + _b2[2] += (float)(It[2]*dIptr[1]); + _b2[3] += (float)(It[3]*dIptr[1]); + } + #endif + + } + + #ifdef CV_SSE2 + 
float CV_DECL_ALIGNED(16) bbuf[4]; + _mm_store_ps(bbuf, _mm_add_ps(qbc0[0], qbc1[0])); + _b1[0] += bbuf[0] + bbuf[2]; + _b2[0] += bbuf[1] + bbuf[3]; + + _mm_store_ps(bbuf, _mm_add_ps(qbc0[1], qbc1[1])); + _b1[1] += bbuf[0] + bbuf[2]; + _b2[1] += bbuf[1] + bbuf[3]; + + _mm_store_ps(bbuf, _mm_add_ps(qbc0[2], qbc1[2])); + _b1[2] += bbuf[0] + bbuf[2]; + _b2[2] += bbuf[1] + bbuf[3]; + + _mm_store_ps(bbuf, _mm_add_ps(qbc0[3], qbc1[3])); + _b1[3] += bbuf[0] + bbuf[2]; + _b2[3] += bbuf[1] + bbuf[3]; + + #endif + _b1[0] *= FLT_SCALE; + _b1[1] *= FLT_SCALE; + _b1[2] *= FLT_SCALE; + _b1[3] *= FLT_SCALE; + _b2[0] *= FLT_SCALE; + _b2[1] *= FLT_SCALE; + _b2[2] *= FLT_SCALE; + _b2[3] *= FLT_SCALE; + + float c0[4] = { _b1[3] + _b1[0] - _b1[2] - _b1[1], + _b1[1] - _b1[0], + _b1[2] - _b1[0], + _b1[0]}; + float c1[4] = { _b2[3] + _b2[0] - _b2[2] - _b2[1], + _b2[1] - _b2[0], + _b2[2] - _b2[0], + _b2[0]}; + + float DA12 = A12 * D ; + float DA22 = A22 * D ; + float DA11 = A11 * D ; + c[0] = DA12 * c1[0] - DA22 * c0[0]; + c[1] = DA12 * c1[1] - DA22 * c0[1]; + c[2] = DA12 * c1[2] - DA22 * c0[2]; + c[3] = DA12 * c1[3] - DA22 * c0[3]; + c[4] = DA12 * c0[0] - DA11 * c1[0]; + c[5] = DA12 * c0[1] - DA11 * c1[1]; + c[6] = DA12 * c0[2] - DA11 * c1[2]; + c[7] = DA12 * c0[3] - DA11 * c1[3]; + + float e0 = 1.f / (c[6] * c[0] - c[4] * c[2]); + float e1 = e0 * 0.5f * (c[6] * c[1] + c[7] * c[0] - c[5] * c[2] - c[4] * c[3]); + float e2 = e0 * (c[1] * c[7] -c[3] * c[5]); + e0 = e1 * e1 - e2; + if ( e0 >= 0) + { + e0 = sqrt(e0); + float _y[2] = {-e1 - e0, e0 - e1}; + float c0yc1[2] = {c[0] * _y[0] + c[1], + c[0] * _y[1] + c[1]}; + + float _x[2] = {-(c[2] * _y[0] + c[3]) / c0yc1[0], + -(c[2] * _y[1] + c[3]) / c0yc1[1]}; + + float c0yc2[2] = {c[0] * _y[0] + c[2], + c[0] * _y[1] + c[2]}; + + float c4yc6[2] = {c[4] * _y[0] + c[6], + c[4] * _y[1] + c[6]}; + + float c0xc1[2] = {c[0] * _x[0] + c[1], + c[0] * _x[1] + c[1]}; + + float c4xc5[2] = {c[4] * _x[0] + c[5], + c[4] * _x[1] + c[5]}; + bool isIn1 = (_x[0] 
>=0 && _x[0] <=1 && _y[0] >= 0 && _y[0] <= 1); + bool isIn2 = (_x[1] >=0 && _x[1] <=1 && _y[1] >= 0 && _y[1] <= 1); + + + bool isSink1 = isIn1 && checkSolution(_x[0], _y[0], c ); + bool isSink2 = isIn2 && checkSolution(_x[1], _y[1], c ); + + + //if( isSink1 != isSink2 ) + { + if( isSink1 ) + { + hasSolved = true; + delta.x = inextPt.x + _x[0] - nextPt.x; + delta.y = inextPt.y + _y[0] - nextPt.y; + } + if (isSink2 ) + { + hasSolved = true; + delta.x = inextPt.x + _x[1] - nextPt.x; + delta.y = inextPt.y + _y[1] - nextPt.y; + } + } // isIn1 != isIn2 + } // e0 >= 0 + } + else + hasSolved = false; + if(hasSolved == false) + { + delta = Point2f( c[0] * ab + c[1] * a + c[2] * b + c[3], + c[4] * ab + c[5] * a + c[6] * b + c[7]); + nextPt += 0.7 * delta; + nextPts[ptidx] = nextPt + halfWin; + } + else + { + nextPt += delta; + nextPts[ptidx] = nextPt + halfWin; + break; + } + + + if( delta.ddot(delta) <= criteria.epsilon) + break; + + if(j > 0 && std::abs(delta.x - prevDelta.x) < 0.01 && + std::abs(delta.y - prevDelta.y) < 0.01) + { + nextPts[ptidx] -= delta*0.35f; + break; + } + + prevDelta = delta; + + } + + } + } + + + const Mat* prevImg; + const Mat* nextImg; + const Mat* prevDeriv; + const Mat* rgbPrevImg; + const Mat* rgbNextImg; + const Point2f* prevPts; + Point2f* nextPts; + uchar* status; + float* err; + int maxWinSize; + int minWinSize; + TermCriteria criteria; + int level; + int maxLevel; + int windowType; + float minEigThreshold; + bool useInitialFlow; + int crossSegmentationThreshold; + }; +} +} +} +} +#endif \ No newline at end of file diff --git a/modules/optflow/src/rlof/geo_interpolation.cpp b/modules/optflow/src/rlof/geo_interpolation.cpp new file mode 100644 index 00000000000..47e92c96b0d --- /dev/null +++ b/modules/optflow/src/rlof/geo_interpolation.cpp @@ -0,0 +1,525 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// + // + // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 
+ // + // By downloading, copying, installing or using the software you agree to this license. + // If you do not agree to this license, do not download, install, + // copy or use the software. + // + // + // License Agreement + // For Open Source Computer Vision Library + // + // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. + // Copyright (C) 2009, Willow Garage Inc., all rights reserved. + // Third party copyrights are property of their respective owners. + // + // Redistribution and use in source and binary forms, with or without modification, + // are permitted provided that the following conditions are met: + // + // * Redistribution's of source code must retain the above copyright notice, + // this list of conditions and the following disclaimer. + // + // * Redistribution's in binary form must reproduce the above copyright notice, + // this list of conditions and the following disclaimer in the documentation + // and/or other materials provided with the distribution. + // + // * The name of the copyright holders may not be used to endorse or promote products + // derived from this software without specific prior written permission. + // + // This software is provided by the copyright holders and contributors "as is" and + // any express or implied warranties, including, but not limited to, the implied + // warranties of merchantability and fitness for a particular purpose are disclaimed. + // In no event shall the Intel Corporation or contributors be liable for any direct, + // indirect, incidental, special, exemplary, or consequential damages + // (including, but not limited to, procurement of substitute goods or services; + // loss of use, data, or profits; or business interruption) however caused + // and on any theory of liability, whether in contract, strict liability, + // or tort (including negligence or otherwise) arising in any way out of + // the use of this software, even if advised of the possibility of such damage. 
+ // + //M*/ +// This functions have been contributed by Jonas Geisters +#include "precomp.hpp" +#include "geo_interpolation.hpp" +#include +#include +namespace cv +{ + namespace optflow + { + + double K_h1(float x, double h2) + { + return exp(-(0.5 / (h2)) * x); + } + + struct Graph_helper { + int * mem; + int e_size; + Graph_helper(int k, int num_nodes) { + e_size = (2 * k + 1); + mem = (int*)malloc(sizeof(int) * e_size * num_nodes); + memset(mem, 0, sizeof(int) * e_size * num_nodes); + } + inline int size(int id) { + int r_addr = id * (e_size); + return ((int*)mem)[r_addr]; + } + inline int * data(int id) { + int r_addr = id * (e_size)+1; + return mem + r_addr; + } + inline void add(int id, std::pair data) { + int r_addr = id * (e_size); + int size = ++((int*)mem)[r_addr]; + r_addr += 2 * size - 1;//== 1 + 2*(size-1); + ((float*)mem)[r_addr] = data.first; + ((int*)mem)[r_addr + 1] = data.second; + } + inline bool color_in_target(int id, int color) { + int r_addr = id * (e_size); + int size = (((int*)mem)[r_addr]); + r_addr += 2; + for (int i = 0; i < size; i++) { + if (((int*)mem)[r_addr] == color) { + return true; + } + r_addr += 2; + } + return false; + } + + }; + + Mat sgeo_dist(const Mat& gra, int y, int x, float max, Mat &prev) + { + std::vector points; + points.push_back(Point2f(static_cast(x), static_cast(y))); + return sgeo_dist(gra, points, max, prev); + } + Mat sgeo_dist(const Mat& gra, const std::vector & points, float max, Mat &prev) + { + int Dx[] = { -1,0,1,-1,1,-1,0,1 }; + int Dy[] = { -1,-1,-1,0,0,1,1,1 }; + Mat dm(gra.rows, gra.cols, CV_32F, Scalar(max)); + prev = Mat(gra.rows, gra.cols, CV_8U, Scalar(255)); + + std::multimap not_visited_with_value; + + for (auto i = points.begin(); i != points.end(); i++) + { + int y = static_cast(i->y); + int x = static_cast(i->x); + not_visited_with_value.insert(std::pair(0.f, Vec2i(y, x))); + dm.at(y, x) = 0; + } + + bool done = false; + while (!done) + { + if (not_visited_with_value.begin() == 
not_visited_with_value.end()) { + done = true; + break; + } + std::multimap::iterator current_it = not_visited_with_value.begin(); + std::pair current_p = *current_it; + not_visited_with_value.erase(current_it); + + int y = current_p.second[0]; + int x = current_p.second[1]; + float cur_d = current_p.first; + + if (dm.at(y, x) != cur_d) { + continue; + } + + Vec8f gra_e = gra.at(y, x); + + for (int i = 0; i < 8; i++) { + if (gra_e[i] < 0) { + continue; + } + int dx = Dx[i]; + int dy = Dy[i]; + + if (dm.at(y + dy, x + dx) > cur_d + gra_e[i]) { + dm.at(y + dy, x + dx) = cur_d + gra_e[i]; + prev.at(y + dy, x + dx) = static_cast(7 - i); + not_visited_with_value.insert(std::pair(cur_d + gra_e[i], Vec2i(y + dy, x + dx))); + } + + } + } + + + + + return dm; + } + + Mat interpolate_irregular_nn_raster(const std::vector & prevPoints, + const std::vector & nextPoints, + const std::vector & status, + const Mat & i1) + { + Mat gra = getGraph(i1, 0.1f); + int Dx[] = { -1,0,1,-1,1,-1,0,1 }; + int Dy[] = { -1,-1,-1,0,0,1,1,1 }; + int max_rounds = 10; + Mat dirt = Mat(gra.rows, gra.cols, CV_8U, Scalar(0)); + Mat quellknoten = Mat(gra.rows, gra.cols, CV_32S, Scalar(-1)); + Mat dist = Mat(gra.rows, gra.cols, CV_32F, Scalar(std::numeric_limits::max())); + /* + * assign quellknoten ids. 
+ */ + for (int i = 0; i < prevPoints.size(); i++) + { + int x = (int)prevPoints[i].x; + int y = (int)prevPoints[i].y; + if (status[i] == 0) + continue; + dirt.at(y, x) = 1; + dist.at(y, x) = 0; + quellknoten.at(y, x) = i; + } + + bool clean = true; + bool done = false; + int x = 0; + int y = 0; + int rounds = 0; + while (!done) { + /* + * Update x and y + * on even rounds go rasterscanorder , on odd round inverse rasterscanorder + */ + if (rounds % 2 == 0) { + x++; + if (x >= gra.cols) { + x = 0; + y++; + if (y >= gra.rows) { + y = 0; + rounds++; + y = gra.rows - 1; + x = gra.cols - 1; + if (rounds >= max_rounds || clean) { + done = true; + break; + } + } + } + } + else { + x--; + if (x < 0) { + x = gra.cols - 1; + y--; + if (y < 0) { + y = gra.rows - 1; + rounds++; + y = 0; + x = 0; + if (rounds >= max_rounds || clean) { + done = true; + break; + } + } + } + } + if (dirt.at(y, x) == 0) { + continue; + } + dirt.at(y, x) = 0; + + float c_dist = dist.at(y, x); + Vec8f gra_e = gra.at(y, x); + + for (int i = 0; i < 8; i++) { + int tx = Dx[i]; + int ty = Dy[i]; + if (ty == 0 && tx == 0) { + continue; + } + if (x + tx < 0 || x + tx >= gra.cols) { + continue; + } + if (y + ty < 0 || y + ty >= gra.rows) { + continue; + } + if (c_dist > dist.at(y + ty, x + tx)) { + if (c_dist > dist.at(y + ty, x + tx) + gra_e[i]) { + quellknoten.at(y, x) = quellknoten.at(y + ty, x + tx); + dist.at(y, x) = dist.at(y + ty, x + tx) + gra_e[i]; + dirt.at(y, x) = 1; + clean = false; + } + } + else { + if (c_dist + gra_e[i] < dist.at(y + ty, x + tx)) { + quellknoten.at(y + ty, x + tx) = quellknoten.at(y, x); + dist.at(y + ty, x + tx) = dist.at(y, x) + gra_e[i]; + dirt.at(y + ty, x + tx) = 1; + clean = false; + } + } + } + + + + } + + Mat nnFlow(i1.rows, i1.cols, CV_32FC2, Scalar(0)); + for (y = 0; y < i1.rows; y++) { + for (x = 0; x < i1.cols; x++) { + + int id = quellknoten.at(y, x); + if (id != -1) + { + nnFlow.at(y, x) = nextPoints[id] - prevPoints[id]; + } + } + } + return nnFlow; + } + + 
Mat interpolate_irregular_knn( + const std::vector & _prevPoints, + const std::vector & _nextPoints, + const std::vector & status, + const Mat &color_img, + int k, + float pixeldistance) + { + Mat in(color_img.rows, color_img.cols, CV_32FC2); + Mat mask = Mat::zeros(color_img.rows, color_img.cols, CV_8UC1); + + for (unsigned n = 0; n < _prevPoints.size(); n++) + { + if (_prevPoints[n].x >= 0 && _prevPoints[n].y >= 0 && _prevPoints[n].x < color_img.cols && _prevPoints[n].y < color_img.rows) + { + in.at(_prevPoints[n]) = _nextPoints[n] - _prevPoints[n]; + mask.at(_prevPoints[n]) = status[n]; + } + + } + int Dx[] = { -1,0,1,-1,1,-1,0,1 }; + int Dy[] = { -1,-1,-1,0,0,1,1,1 }; + Mat gra = getGraph(color_img, pixeldistance); + Mat nnFlow(in.rows, in.cols, CV_32FC2, Scalar(0)); + + std::multimap my_agents; // > + Graph_helper graph_helper(k, in.rows*in.cols); //< arrivaltime, color> + + + int color = 0; + std::vector flow_point_list; + for (int y = 0; y < in.rows; y++) { + for (int x = 0; x < in.cols; x++) { + if (mask.at(y, x) > 0) { + flow_point_list.push_back(Vec2i(y, x)); + nnFlow.at(y, x) = in.at(y, x); + + int v_id = (y * in.cols + x); + graph_helper.add(v_id, std::pair(0.f, color)); + + + Vec8f gra_e = gra.at(y, x); + for (int i = 0; i < 8; i++) { + if (gra_e[i] < 0) + continue; + int dx = Dx[i]; + int dy = Dy[i]; + int target = (y + dy) * in.cols + (x + dx); + Vec2i agent(target, color); + my_agents.insert(std::pair(gra_e[i], agent)); + + } + color++; + } + } + } + + int global_time = 0; + + bool done = false; + while (!done) { + if (my_agents.size() == 0) { + done = true; + break; + } + global_time++; + + std::multimap::iterator current_it = my_agents.begin(); + std::pair current_p = *current_it; + my_agents.erase(current_it); + + int target = current_p.second[0]; + color = current_p.second[1]; + float arriv_time = current_p.first; + + Vec8f gra_e = gra.at(target);// (y*cols+x) + if (graph_helper.size(target) >= k) { + continue; + } + + bool color_found_in_target 
= graph_helper.color_in_target(target, color); + if (color_found_in_target) { + continue; + } + graph_helper.add(target, std::pair(arriv_time, color)); + + + for (int i = 0; i < 8; i++) { + if (gra_e[i] < 0) + continue; + + int dx = Dx[i]; + int dy = Dy[i]; + int new_target = target + dx + (dy*in.cols); + if (graph_helper.size(new_target) >= k) { + continue; + } + color_found_in_target = graph_helper.color_in_target(new_target, color); + if (color_found_in_target) { + continue; + } + Vec2i new_agent(new_target, color); + my_agents.insert(std::pair(arriv_time + gra_e[i], new_agent)); + + } + } + + Mat ret(in.rows, in.cols*k, CV_32FC2); + for (int y = 0; y < in.rows; y++) { + for (int x = 0; x < in.cols; x++) { + for (int i = 0; i < k; i++) { + float dist = *((float*)(graph_helper.data(y*in.cols + x) + 2 * i)); + float id = *((float*)(graph_helper.data(y*in.cols + x) + 2 * i + 1)); + ret.at(y, k*x + i) = Vec2f(dist, id); + } + } + } + return ret; + + for (int y = 0; y < in.rows; y++) { + for (int x = 0; x < in.cols; x++) { + int v_id = y * in.cols + x; + std::vector > flow_s(k); + + for (int i = 0; i < k; i++) { + color = *((int*)(graph_helper.data(v_id) + 1 + 2 * i)); + int cy = flow_point_list[color][0]; + int cx = flow_point_list[color][1]; + Vec2f flow = in.at(cy, cx); + flow_s[i] = std::make_pair(static_cast(norm(flow)), flow); + + + + } + + nnFlow.at(y, x) = std::min_element(flow_s.begin(), flow_s.end(), [](const std::pair &left, const std::pair &right) { + return left.first < right.first; + })->second; + + + } + } + free(graph_helper.mem); + return nnFlow; + + + } + Mat getGraph(const Mat &image, float edge_length) + { + + int Dx[] = { -1,0,1,-1,1,-1,0,1 }; + int Dy[] = { -1,-1,-1,0,0,1,1,1 }; + Mat gra(image.rows, image.cols, CV_32FC(8)); + + for (int y = 0; y < gra.rows; y++) { + for (int x = 0; x < gra.cols; x++) { + + for (int i = 0; i < 8; i++) { + int dx = Dx[i]; + int dy = Dy[i]; + gra.at(y, x)[i] = -1; + + if (x + dx < 0 || y + dy < 0 || x + dx >= 
gra.cols || y + dy >= gra.rows) { + continue; + } + + if (i < 4) { + int si = 7 - i; + gra.at(y, x)[i] = gra.at(y + dy, x + dx)[si]; + } + else { + float p1 = dx * dx*edge_length*edge_length + dy * dy*edge_length*edge_length; + float p2 = static_cast(image.at(y, x)[0] - image.at(y + dy, x + dx)[0]); + float p3 = static_cast(image.at(y, x)[1] - image.at(y + dy, x + dx)[1]); + float p4 = static_cast(image.at(y, x)[2] - image.at(y + dy, x + dx)[2]); + gra.at(y, x)[i] = sqrt(p1 + p2 * p2 + p3 * p3 + p4 * p4); + } + + } + + } + } + + return gra; + } + + + Mat interpolate_irregular_nn( + const std::vector & _prevPoints, + const std::vector & _nextPoints, + const std::vector & status, + const Mat &color_img, + float pixeldistance) + { + int Dx[] = { -1,0,1,-1,1,-1,0,1 }; + int Dy[] = { -1,-1,-1,0,0,1,1,1 }; + std::vector prevPoints, nextPoints; + std::map, std::pair> flowMap; + for (unsigned n = 0; n < _prevPoints.size(); n++) + { + if (status[n] != 0) + { + flowMap.insert(std::make_pair( + std::make_pair(_prevPoints[n].x, _prevPoints[n].y), + std::make_pair(_nextPoints[n].x, _nextPoints[n].y))); + prevPoints.push_back(_prevPoints[n]); + nextPoints.push_back(_nextPoints[n]); + } + + } + + Mat gra = getGraph(color_img, pixeldistance); + + Mat prev; + Mat geo_dist = sgeo_dist(gra, prevPoints, std::numeric_limits::max(), prev); + + + Mat nnFlow = Mat::zeros(color_img.size(), CV_32FC2); + for (int y = 0; y < nnFlow.rows; y++) + { + for (int x = 0; x < nnFlow.cols; x++) + { + int cx = x; + int cy = y; + while (prev.at(cy, cx) != 255) + { + int i = prev.at(cy, cx); + cx += Dx[i]; + cy += Dy[i]; + } + auto val = flowMap[std::make_pair(static_cast(cx), static_cast(cy))]; + nnFlow.at(y, x) = Vec2f(val.first - cx, val.second - cy); + } + } + return nnFlow; + } + + } +} \ No newline at end of file diff --git a/modules/optflow/src/rlof/geo_interpolation.hpp b/modules/optflow/src/rlof/geo_interpolation.hpp new file mode 100644 index 00000000000..2c9dfbd38c0 --- /dev/null +++ 
b/modules/optflow/src/rlof/geo_interpolation.hpp @@ -0,0 +1,75 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// + // + // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. + // + // By downloading, copying, installing or using the software you agree to this license. + // If you do not agree to this license, do not download, install, + // copy or use the software. + // + // + // License Agreement + // For Open Source Computer Vision Library + // + // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. + // Copyright (C) 2009, Willow Garage Inc., all rights reserved. + // Third party copyrights are property of their respective owners. + // + // Redistribution and use in source and binary forms, with or without modification, + // are permitted provided that the following conditions are met: + // + // * Redistribution's of source code must retain the above copyright notice, + // this list of conditions and the following disclaimer. + // + // * Redistribution's in binary form must reproduce the above copyright notice, + // this list of conditions and the following disclaimer in the documentation + // and/or other materials provided with the distribution. + // + // * The name of the copyright holders may not be used to endorse or promote products + // derived from this software without specific prior written permission. + // + // This software is provided by the copyright holders and contributors "as is" and + // any express or implied warranties, including, but not limited to, the implied + // warranties of merchantability and fitness for a particular purpose are disclaimed. 
+ // In no event shall the Intel Corporation or contributors be liable for any direct, + // indirect, incidental, special, exemplary, or consequential damages + // (including, but not limited to, procurement of substitute goods or services; + // loss of use, data, or profits; or business interruption) however caused + // and on any theory of liability, whether in contract, strict liability, + // or tort (including negligence or otherwise) arising in any way out of + // the use of this software, even if advised of the possibility of such damage. + // + //M*/ +#ifndef _GEO_INTERPOLATION_HPP_ +#define _GEO_INTERPOLATION_HPP_ +#include +namespace cv +{ + namespace optflow + { + typedef Vec Vec8f; + Mat getGraph(const Mat & image, float edge_length); + Mat sgeo_dist(const Mat& gra, int y, int x, float max, Mat &prev); + Mat sgeo_dist(const Mat& gra, const std::vector & points, float max, Mat &prev); + Mat interpolate_irregular_nw(const Mat &in, const Mat &mask, const Mat &color_img, float max_d, float bandwidth, float pixeldistance); + Mat interpolate_irregular_nn( + const std::vector & prevPoints, + const std::vector & nextPoints, + const std::vector & status, + const Mat &color_img, + float pixeldistance); + Mat interpolate_irregular_knn( + const std::vector & _prevPoints, + const std::vector & _nextPoints, + const std::vector & status, + const Mat &color_img, + int k, + float pixeldistance); + + Mat interpolate_irregular_nn_raster(const std::vector & prevPoints, + const std::vector & nextPoints, + const std::vector & status, + const Mat & i1); + + } +} +#endif diff --git a/modules/optflow/src/rlof/plk_invoker.hpp b/modules/optflow/src/rlof/plk_invoker.hpp new file mode 100644 index 00000000000..1a8f5eec380 --- /dev/null +++ b/modules/optflow/src/rlof/plk_invoker.hpp @@ -0,0 +1,1141 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// + // + // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 
+ // + // By downloading, copying, installing or using the software you agree to this license. + // If you do not agree to this license, do not download, install, + // copy or use the software. + // + // + // License Agreement + // For Open Source Computer Vision Library + // + // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. + // Copyright (C) 2009, Willow Garage Inc., all rights reserved. + // Third party copyrights are property of their respective owners. + // + // Redistribution and use in source and binary forms, with or without modification, + // are permitted provided that the following conditions are met: + // + // * Redistribution's of source code must retain the above copyright notice, + // this list of conditions and the following disclaimer. + // + // * Redistribution's in binary form must reproduce the above copyright notice, + // this list of conditions and the following disclaimer in the documentation + // and/or other materials provided with the distribution. + // + // * The name of the copyright holders may not be used to endorse or promote products + // derived from this software without specific prior written permission. + // + // This software is provided by the copyright holders and contributors "as is" and + // any express or implied warranties, including, but not limited to, the implied + // warranties of merchantability and fitness for a particular purpose are disclaimed. + // In no event shall the Intel Corporation or contributors be liable for any direct, + // indirect, incidental, special, exemplary, or consequential damages + // (including, but not limited to, procurement of substitute goods or services; + // loss of use, data, or profits; or business interruption) however caused + // and on any theory of liability, whether in contract, strict liability, + // or tort (including negligence or otherwise) arising in any way out of + // the use of this software, even if advised of the possibility of such damage. 
+ // + //M*/ +#ifndef _PLK_INVOKER_HPP_ +#define _PLK_INVOKER_HPP_ +#include "rlof_invokerbase.hpp" + + +namespace cv{ + namespace optflow{ +namespace plk +{ + +// implementierung ohne SSE +namespace radial +{ +class TrackerInvoker : public cv::ParallelLoopBody +{ +public: + TrackerInvoker( + + const Mat& _prevImg, + const Mat& _prevDeriv, + const Mat& _nextImg, + const Mat& _rgbPrevImg, + const Mat& _rgbNextImg, + const Point2f* _prevPts, + Point2f* _nextPts, + uchar* _status, + float* _err, + Point2f* _gainVecs, + int _level, + int _maxLevel, + int _winSize[2], + int _maxIteration, + bool _useInitialFlow, + int _supportRegionType, + float _minEigenValue, + int _crossSegmentationThreshold) + { + prevImg = &_prevImg; + prevDeriv = &_prevDeriv; + nextImg = &_nextImg; + rgbPrevImg = &_rgbPrevImg; + rgbNextImg = &_rgbNextImg; + prevPts = _prevPts; + nextPts = _nextPts; + status = _status; + err = _err; + gainVecs = _gainVecs; + minWinSize = _winSize[0]; + maxWinSize = _winSize[1]; + criteria.maxCount = _maxIteration; + criteria.epsilon = 0.01; + level = _level; + maxLevel = _maxLevel; + windowType = _supportRegionType; + minEigThreshold = _minEigenValue; + useInitialFlow = _useInitialFlow; + crossSegmentationThreshold = _crossSegmentationThreshold; + } + + TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; + void operator()(const cv::Range& range) const + { + cv::Size winSize; + cv::Point2f halfWin; + + const Mat& I = *prevImg; + const Mat& J = *nextImg; + const Mat& derivI = *prevDeriv; + const Mat& BI = *rgbPrevImg; + + winSize = cv::Size(maxWinSize,maxWinSize); + int winMaskwidth = static_cast(ceil(winSize.width / 16.f)) * 16; + cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); + winMaskMatBuf.setTo(1); + + const float FLT_SCALE = (1.f/(1 << 16));//(1.f/(1 << 20)); // 20 + + int cn = I.channels(), cn2 = cn*2; + int winbufwidth = static_cast(ceil(winSize.width / 16.f)) * 16; + cv::Size winBufSize(winbufwidth,winbufwidth); + + cv::Mat 
invTensorMat(4,4, CV_32FC1); + cv::Mat mismatchMat(4, 1, CV_32FC1); + cv::Mat resultMat(4, 1, CV_32FC1); + + cv::AutoBuffer _buf(winBufSize.area()*(cn + cn2)); + int derivDepth = DataType::depth; + + Mat IWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn), (deriv_type*)_buf.data()); + Mat derivIWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn2), (deriv_type*)_buf.data() + winBufSize.area()*cn); + + for( int ptidx = range.start; ptidx < range.end; ptidx++ ) + { + Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level)); + Point2f nextPt; + if( level == maxLevel ) + { + if( useInitialFlow ) + { + nextPt = nextPts[ptidx]*(float)(1./(1 << level)); + } + else + nextPt = prevPt; + } + else + nextPt = nextPts[ptidx]*2.f; + nextPts[ptidx] = nextPt; + + Point2i iprevPt, inextPt; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + int winArea = maxWinSize * maxWinSize; + cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0,0, maxWinSize,maxWinSize)); + winMaskMatBuf.setTo(0); + if( calcWinMaskMat(BI, windowType, iprevPt, + winMaskMat,winSize,halfWin,winArea, + minWinSize,maxWinSize, crossSegmentationThreshold) == false) + continue; + + + prevPt -= halfWin; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + + + if( iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols || + iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows ) + { + if( level == 0 ) + { + if( status ) + status[ptidx] = 3; + if( err ) + err[ptidx] = 0; + } + continue; + } + + float a = prevPt.x - iprevPt.x; + float b = prevPt.y - iprevPt.y; + const int W_BITS = 14, W_BITS1 = 14; + + int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + + int dstep = (int)(derivI.step/derivI.elemSize1()); + int step = (int)(I.step/I.elemSize1()); + int mStep = (int)(winMaskMat.step/winMaskMat.elemSize1()); + CV_Assert( step == (int)(J.step/J.elemSize1()) ); + 
float A11 = 0, A12 = 0, A22 = 0; + // tensor + float sumIx = 0; + float sumIy = 0; + float sumI = 0; + float sumW = 0; + float w1 = 0, w2 = 0; // -IyI + float dI = 0; // I^2 + float D = 0; + #ifdef CV_SSE2 + __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128i z = _mm_setzero_si128(); + __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1-1)); + __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1-5-1)); + __m128i mmMask4_epi32; + __m128i mmMaskSet_epi16 = _mm_set1_epi16(std::numeric_limits::max()); + get4BitMask(winSize.width, mmMask4_epi32); +#endif + // extract the patch from the first image, compute covariation matrix of derivatives + int x, y; + for( y = 0; y < winSize.height; y++ ) + { + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; + const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; + + short* Iptr = (short*)(IWinBuf.data + y*IWinBuf.step); + short* dIptr = (short*)(derivIWinBuf.data + y*derivIWinBuf.step); + + x = 0; +#ifdef CV_SSE2 + for( ; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 ) + { + __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16); + __m128i mask_0_3_epi16 = _mm_unpacklo_epi16(mask_0_7_epi16, mask_0_7_epi16); + __m128i v00, v01, v10, v11, t0, t1; + v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); + v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); + v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); + v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); + if( x + 4 > 
winSize.width) + { + t0 = _mm_and_si128(t0, mmMask4_epi32); + } + t0 = _mm_and_si128(t0, mask_0_3_epi16); + _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0,t0)); + + + v00 = _mm_loadu_si128((const __m128i*)(dsrc)); + v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); + v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep)); + v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); + v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... + if( x + 4 > winSize.width) + { + v00 = _mm_and_si128(v00, mmMask4_epi32); + } + v00 = _mm_and_si128(v00, mask_0_3_epi16); + _mm_storeu_si128((__m128i*)dIptr, v00); + } +#else + for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2 ) + { + if( maskPtr[x] == 0) + { + dIptr[0] = 0; + dIptr[1] = 0; + continue; + } + int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + + src[x+step]*iw10 + src[x+step+cn]*iw11, W_BITS1-5); + int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + + dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 + + dsrc[dstep+cn2+1]*iw11, W_BITS1); + + Iptr[x] = (short)ival; + dIptr[0] = (short)ixval; + dIptr[1] = (short)iyval; + } +#endif + + } + + cv::Point2f backUpNextPt = nextPt; + nextPt -= halfWin; + Point2f prevDelta(0,0); //relates to h(t-1) + Point2f prevGain(1,0); + cv::Point2f gainVec = gainVecs[ptidx]; + cv::Point2f backUpGain = gainVec; + int j; + for( j = 0; j < criteria.maxCount; j++ ) + { + status[ptidx] = static_cast(j); + inextPt.x = cvFloor(nextPt.x); + inextPt.y = cvFloor(nextPt.y); + + + if( inextPt.x + halfWin.x < 0 || inextPt.x + 
halfWin.x>= derivI.cols || + inextPt.y + halfWin.y < 0 || inextPt.y + halfWin.y >= derivI.rows ) + { + if( level == 0 && status ) + status[ptidx] = 3; + break; + } + + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + float b1,b2,b3,b4; + b1 = 0; + b2 = 0; + b3 = 0; + b4 = 0; + + if( j == 0 ) + { + // tensor + w1 = 0; // -IxI + w2 = 0; // -IyI + dI = 0; // I^2 + sumIx = 0; + sumIy = 0; + sumI = 0; + sumW = 0; + A11 = 0; + A12 = 0; + A22 = 0; + } +#ifdef CV_SSE2 + qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps(), qb2 = _mm_setzero_ps(); + __m128 qb3 = _mm_setzero_ps(); + __m128 mmSumW1 = _mm_setzero_ps(), mmSumW2 = _mm_setzero_ps(); + __m128 mmSumI = _mm_setzero_ps(), mmSumW = _mm_setzero_ps(), mmSumDI = _mm_setzero_ps(); + __m128 mmSumIy = _mm_setzero_ps(), mmSumIx = _mm_setzero_ps(); + __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps(); + float gainVal = gainVec.x > 0 ? gainVec.x : -gainVec.x; + int bitShift = gainVec.x == 0 ? 
1 : static_cast(ceil(log(200.f / gainVal) / log(2.f))); // zwei stellen nach den komma + __m128i mmGainValue_epi16 = _mm_set1_epi16(static_cast(gainVec.x * (float)(1 << bitShift))); + __m128i mmConstValue_epi16 = _mm_set1_epi16(static_cast(gainVec.y)); +#endif + for( y = 0; y < winSize.height; y++ ) + { + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x )*cn; + const short* Iptr = (const short*)(IWinBuf.data + ( y)*IWinBuf.step); + const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); + + x = 0; +#ifdef CV_SSE2 + + for( ; x <= winSize.width*cn; x += 8, dIptr += 8*2 ) + { + // iw = iw << 14 (16384 short / 2) + // iq0 = iw01 |iw00, iw01 |iw00, iw01 |iw00, iw01 |iw00 16 bit + __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16); + + // I [0...8] + __m128i diff0, diff1; + __m128i I_0_7_epi16 = _mm_loadu_si128((const __m128i*)(Iptr + x)); // element 0 to 7 + + __m128i v00 = _mm_unpacklo_epi8( + _mm_loadl_epi64((const __m128i*)(Jptr + x)) // J0, J1, J2, ..., J7, 64 bit nullen + , z); //J0 , 0, J1, 0, J2, 0 ... J7,0 + __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); + __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); + __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); + + + __m128i t0 = _mm_add_epi32 + (_mm_madd_epi16( + _mm_unpacklo_epi16(v00, v01), //J0, , J1, J1, J2, J2, ... 
J3 , J4 + qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + + __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1-5); + + + __m128i Ilo = _mm_mullo_epi16(mmGainValue_epi16, I_0_7_epi16); + __m128i Ihi = _mm_mulhi_epi16(mmGainValue_epi16, I_0_7_epi16); + + __m128i Igain_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(Ilo, Ihi), bitShift); + __m128i Igain_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(Ilo, Ihi), bitShift); + __m128i Igain_epi16 = _mm_packs_epi32(Igain_0_3_epi32, Igain_4_7_epi32); + + // diff = J - I + I*gain.x + gain.y + __m128i mmDiff_epi16 = _mm_add_epi16( + _mm_subs_epi16(_mm_packs_epi32(t0, t1), I_0_7_epi16), // J - I + _mm_add_epi16(Igain_epi16, mmConstValue_epi16)); + + + mmDiff_epi16 = _mm_and_si128(mmDiff_epi16, mask_0_7_epi16); + + + __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... + __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); + + diff1 = _mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13... + diff0 = _mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9... 
+ + v10 = _mm_mullo_epi16(Ixy_0, diff0); + v11 = _mm_mulhi_epi16(Ixy_0, diff0); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); + qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); + + v10 = _mm_mullo_epi16(Ixy_1, diff1); + v11 = _mm_mulhi_epi16(Ixy_1, diff1); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); + qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); + // for set 1 sigma 1 + // b3 += diff * Iptr[x] + Ilo = _mm_mullo_epi16(mmDiff_epi16, I_0_7_epi16); + Ihi = _mm_mulhi_epi16(mmDiff_epi16, I_0_7_epi16); + v00 = _mm_unpacklo_epi16(Ilo, Ihi); + v10 = _mm_unpackhi_epi16(Ilo, Ihi); + qb2 = _mm_add_ps(qb2, _mm_cvtepi32_ps(v00)); + qb2 = _mm_add_ps(qb2, _mm_cvtepi32_ps(v10)); + + // b4+= diff + __m128 mmDiff_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mmDiff_epi16)); // (_mm_srai_epi32(_mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16),16)); + __m128 mmDiff_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16),16))); + qb3 = _mm_add_ps(qb3, mmDiff_0_3_ps); + qb3 = _mm_add_ps(qb3, mmDiff_4_7_ps); + if( j == 0 ) + { + __m128 mask_0_4_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mask_0_7_epi16)); + __m128 mask_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mask_0_7_epi16, mask_0_7_epi16), 16))); + + t0 = _mm_srai_epi32(Ixy_0, 16); // Iy0 Iy1 Iy2 Iy3 + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16); // Ix0 Ix1 Ix2 Ix3 + + __m128 fy = _mm_blendv_ps(_mm_set1_ps(0), _mm_cvtepi32_ps(t0), mask_0_4_ps); + __m128 fx = _mm_blendv_ps(_mm_set1_ps(0), _mm_cvtepi32_ps(t1), mask_0_4_ps); + + // 0 ... 
3 + __m128 I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(I_0_7_epi16, I_0_7_epi16), 16)); + + // A11 - A22 + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fy)); + mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fy)); + mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fx)); + + // sumIx und sumIy + mmSumIx = _mm_add_ps(mmSumIx, fx); + mmSumIy = _mm_add_ps(mmSumIy, fy); + + mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fx)); + mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fy)); + + // sumI + I_ps = _mm_blendv_ps(_mm_set1_ps(0), I_ps, mask_0_4_ps); + mmSumI = _mm_add_ps(mmSumI,I_ps); + + // sumDI + mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps( I_ps, I_ps)); + + t0 = _mm_srai_epi32(Ixy_1, 16); // Iy8 Iy9 Iy10 Iy11 + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_1, 16), 16); // Ix0 Ix1 Ix2 Ix3 + + fy = _mm_blendv_ps(_mm_set1_ps(0), _mm_cvtepi32_ps(t0), mask_4_7_ps); + fx = _mm_blendv_ps(_mm_set1_ps(0), _mm_cvtepi32_ps(t1), mask_4_7_ps); + + // 4 ... 7 + I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(I_0_7_epi16, I_0_7_epi16), 16)); + + + + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fy)); + mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fy)); + mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fx)); + + // sumIx und sumIy + mmSumIx = _mm_add_ps(mmSumIx, fx); + mmSumIy = _mm_add_ps(mmSumIy, fy); + + mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fx)); + mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fy)); + + // sumI + I_ps = _mm_blendv_ps(_mm_set1_ps(0), I_ps, mask_4_7_ps); + mmSumI = _mm_add_ps(mmSumI, I_ps); + + // sumW + // sumDI + mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps( I_ps, I_ps)); + } + + } +#else + for( ; x < winSize.width*cn; x++, dIptr += 2 ) + { + if( maskPtr[x] == 0) + continue; + int J = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + + Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, + W_BITS1-5); + int diff = J - Iptr[x] + static_cast(Iptr[x]) * gainVec.x + gainVec.y; + + b1 += (float)(diff*dIptr[0]) * FLT_RESCALE; + b2 += (float)(diff*dIptr[1]) * FLT_RESCALE; + b3 += (float)(diff) * 
Iptr[x] * FLT_RESCALE; + b4 += (float)(diff); + + // compute the Gradient Matrice + if( j == 0 ) + { + A11 += (float)(dIptr[0]*dIptr[0]); + A12 += (float)(dIptr[0]*dIptr[1]); + A22 += (float)(dIptr[1]*dIptr[1]); + + + dI += Iptr[x] * Iptr[x] * FLT_RESCALE; + float dx = static_cast(dIptr[0]) * FLT_RESCALE; + float dy = static_cast(dIptr[1]) * FLT_RESCALE; + sumIx += dx; + sumIy += dy; + w1 += dx * Iptr[x]; + w2 += dy * Iptr[x]; + sumI += Iptr[x] * FLT_RESCALE; + //sumW += FLT_RESCALE; + + } + } +#endif + } + + float CV_DECL_ALIGNED(16) wbuf[4];// + if( j == 0 ) + { +#ifdef CV_SSE2 + _mm_store_ps(wbuf, mmSumW1); + w1 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumW2); + w2 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumDI); + dI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumI); + sumI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumIx); + sumIx = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumIy); + sumIy = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumW); + sumW = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; +#endif + sumIx *= -FLT_SCALE; + sumIy *= -FLT_SCALE; + sumI *=FLT_SCALE; + sumW = winArea * FLT_SCALE; + w1 *= -FLT_SCALE; + w2 *= -FLT_SCALE; + dI *= FLT_SCALE; +#ifdef CV_SSE2 + float CV_DECL_ALIGNED(32) A11buf[4], A12buf[4], A22buf[4];// + + _mm_store_ps(A11buf, mmAxx); + _mm_store_ps(A12buf, mmAxy); + _mm_store_ps(A22buf, mmAyy); + + A11 = A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; + A12 = A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; + A22 = A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; +#endif + A11 *= FLT_SCALE; + A12 *= FLT_SCALE; + A22 *= FLT_SCALE; + } + +#ifdef CV_SSE2 + float CV_DECL_ALIGNED(16) bbuf[4]; + _mm_store_ps(bbuf, _mm_add_ps(qb0, qb1)); + b1 = bbuf[0] + bbuf[2]; + b2 = bbuf[1] + bbuf[3]; + _mm_store_ps(bbuf, qb2); + b3 = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; + _mm_store_ps(bbuf, qb3); + b4 = bbuf[0] + bbuf[1] + bbuf[2] + 
bbuf[3]; +#endif + + mismatchMat.at(0,0) = b1 * FLT_SCALE; + mismatchMat.at(1,0) = b2 * FLT_SCALE; + mismatchMat.at(2,0) = -b3 * FLT_SCALE; + mismatchMat.at(3,0) = -b4 * FLT_SCALE; + + D = - A12*A12*sumI*sumI + dI*sumW*A12*A12 + 2*A12*sumI*sumIx*w2 + 2*A12*sumI*sumIy*w1 + - 2*dI*A12*sumIx*sumIy - 2*sumW*A12*w1*w2 + A11*A22*sumI*sumI - 2*A22*sumI*sumIx*w1 + - 2*A11*sumI*sumIy*w2 - sumIx*sumIx*w2*w2 + A22*dI*sumIx*sumIx + 2*sumIx*sumIy*w1*w2 + - sumIy*sumIy*w1*w1 + A11*dI*sumIy*sumIy + A22*sumW*w1*w1 + A11*sumW*w2*w2 - A11*A22*dI*sumW; + + + float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + + 4.f*A12*A12))/(2*winArea); + if( minEig < minEigThreshold ) + { + if( level == 0 && status || std::abs(D) < FLT_EPSILON) + status[ptidx] = 0; + if( level > 0) + { + nextPts[ptidx] = backUpNextPt; + gainVecs[ptidx] = backUpGain; + } + break; + } + + + + D = (1.f / D); + invTensorMat.at(0,0) = (A22*sumI*sumI - 2*sumI*sumIy*w2 + dI*sumIy*sumIy + sumW*w2*w2 - A22*dI*sumW)* D; invTensorMat.at(0,1) = (A12*dI*sumW - A12*sumI * sumI - dI*sumIx*sumIy + sumI*sumIx*w2 + sumI*sumIy*w1 - sumW*w1*w2)* D; + invTensorMat.at(0,2) = (A12*sumI*sumIy - sumIy*sumIy*w1 - A22*sumI*sumIx - A12*sumW*w2 + A22*sumW*w1 + sumIx*sumIy*w2)* D; + invTensorMat.at(0,3) = (A22*dI*sumIx - A12*dI*sumIy - sumIx*w2*w2 + A12*sumI*w2 - A22*sumI*w1 + sumIy*w1*w2) * D; + invTensorMat.at(1,0) = invTensorMat.at(0,1); + invTensorMat.at(1,1) = (A11*sumI * sumI - 2*sumI*sumIx*w1 + dI*sumIx * sumIx + sumW*w1*w1 - A11*dI*sumW) * D; + invTensorMat.at(1,2) = (A12*sumI*sumIx - A11*sumI*sumIy - sumIx * sumIx*w2 + A11*sumW*w2 - A12*sumW*w1 + sumIx*sumIy*w1) * D; + invTensorMat.at(1,3) = (A11*dI*sumIy - sumIy*w1*w1 - A12*dI*sumIx - A11*sumI*w2 + A12*sumI*w1 + sumIx*w1*w2)* D; + invTensorMat.at(2,0) = invTensorMat.at(0,2); + invTensorMat.at(2,1) = invTensorMat.at(1,2); + invTensorMat.at(2,2) = (sumW*A12*A12 - 2*A12*sumIx*sumIy + A22*sumIx*sumIx + A11*sumIy*sumIy - A11*A22*sumW)* D; + invTensorMat.at(2,3) = (A11*A22*sumI - 
A12*A12*sumI - A11*sumIy*w2 + A12*sumIx*w2 + A12*sumIy*w1 - A22*sumIx*w1)* D; + invTensorMat.at(3,0) = invTensorMat.at(0,3); + invTensorMat.at(3,1) = invTensorMat.at(1,3); + invTensorMat.at(3,2) = invTensorMat.at(2,3); + invTensorMat.at(3,3) = (dI*A12*A12 - 2*A12*w1*w2 + A22*w1*w1 + A11*w2*w2 - A11*A22*dI)* D; + + resultMat = invTensorMat * mismatchMat; + + // 0.08 -12.10 + Point2f delta(-resultMat.at(0), -resultMat.at(1)); + Point2f deltaGain(-resultMat.at(2), -resultMat.at(3)); + + + if( j == 0) + prevGain = deltaGain; + nextPt += delta; + nextPts[ptidx] = nextPt + halfWin; + gainVecs[ptidx]= gainVec; + if( delta.ddot(delta) <= criteria.epsilon) + break; + if( ( std::abs(delta.x - prevDelta.x) < 0.01 + && std::abs(delta.y - prevDelta.y) < 0.01) + || ((delta.ddot(delta) <= 0.001) + && std::abs(prevGain.x - deltaGain.x) < 0.01) + ) + { + nextPts[ptidx] -= delta*0.5f; + gainVecs[ptidx] -= deltaGain* 0.5f; + break; + } + + if(j > 0 && std::abs(delta.x - prevDelta.x) < 0.01 && + std::abs(delta.y - prevDelta.y) < 0.01) + { + nextPts[ptidx] -= delta*0.5f; + break; + } + + prevDelta = delta; + prevGain = deltaGain; + + } + + + } + + } + + const Mat* prevImg; + const Mat* nextImg; + const Mat* prevDeriv; + const Mat* rgbPrevImg; + const Mat* rgbNextImg; + const Point2f* prevPts; + Point2f* nextPts; + uchar* status; + cv::Point2f* gainVecs; // gain vector x -> multiplier y -> offset + float* err; + int maxWinSize; + int minWinSize; + TermCriteria criteria; + int level; + int maxLevel; + int windowType; + float minEigThreshold; + bool useInitialFlow; + int crossSegmentationThreshold; +}; +} +namespace ica{ +class TrackerInvoker : public cv::ParallelLoopBody +{ +public: + TrackerInvoker(const Mat& _prevImg, + const Mat& _prevDeriv, + const Mat& _nextImg, + const Mat& _rgbPrevImg, + const Mat& _rgbNextImg, + const Point2f* _prevPts, + Point2f* _nextPts, + uchar* _status, + float* _err, + int _level, + int _maxLevel, + int _winSize[2], + int _maxIteration, + bool 
_useInitialFlow, + int _supportRegionType, + int _crossSegmentationThreshold, + float _minEigenValue) + { + prevImg = &_prevImg; + prevDeriv = &_prevDeriv; + nextImg = &_nextImg; + rgbPrevImg = &_rgbPrevImg; + rgbNextImg = &_rgbNextImg; + prevPts = _prevPts; + nextPts = _nextPts; + status = _status; + err = _err; + minWinSize = _winSize[0]; + maxWinSize = _winSize[1]; + criteria.maxCount = _maxIteration; + criteria.epsilon = 0.01; + level = _level; + maxLevel = _maxLevel; + windowType = _supportRegionType; + minEigThreshold = _minEigenValue; + useInitialFlow = _useInitialFlow; + crossSegmentationThreshold = _crossSegmentationThreshold; + } + + TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; + void operator()(const cv::Range& range) const + { + cv::Size winSize; + cv::Point2f halfWin; + + const Mat& I = *prevImg; + const Mat& J = *nextImg; + const Mat& derivI = *prevDeriv; + const Mat& BI = *rgbPrevImg; + + winSize = cv::Size(maxWinSize,maxWinSize); + int winMaskwidth = static_cast(ceil(winSize.width / 8.f)) * 16; + cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); + winMaskMatBuf.setTo(1); + + + + const float FLT_SCALE = (1.f/(1 << 20)); // 20 + + int j, cn = I.channels(), cn2 = cn*2; + int winbufwidth = static_cast(ceil(winSize.width / 8.f)) * 8; + cv::Size winBufSize(winbufwidth,winbufwidth); + + std::vector _buf(winBufSize.area()*(cn + cn2)); + Mat IWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn), &_buf[0]); + Mat derivIWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn2), &_buf[winBufSize.area()*cn]); + + for( int ptidx = range.start; ptidx < range.end; ptidx++ ) + { + Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level)); + Point2f nextPt; + if( level == maxLevel ) + { + if( useInitialFlow ) + nextPt = nextPts[ptidx]*(float)(1./(1 << level)); + else + nextPt = prevPt; + } + else + nextPt = nextPts[ptidx]*2.f; + nextPts[ptidx] = nextPt; + + Point2i iprevPt, inextPt; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + int 
winArea = maxWinSize * maxWinSize; + cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0,0, maxWinSize,maxWinSize)); + + if( calcWinMaskMat(BI, windowType, iprevPt, + winMaskMat,winSize,halfWin,winArea, + minWinSize,maxWinSize, crossSegmentationThreshold) == false) + continue; + + + prevPt -= halfWin; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + + + if( iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols || + iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows ) + { + if( level == 0 ) + { + if( status ) + status[ptidx] = 3; + if( err ) + err[ptidx] = 0; + } + continue; + } + + float a = prevPt.x - iprevPt.x; + float b = prevPt.y - iprevPt.y; + const int W_BITS = 14, W_BITS1 = 14; + + int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + + int dstep = (int)(derivI.step/derivI.elemSize1()); + int step = (int)(I.step/I.elemSize1()); + int mStep = (int)(winMaskMat.step/winMaskMat.elemSize1()); + CV_Assert( step == (int)(J.step/J.elemSize1()) ); + float A11 = 0, A12 = 0, A22 = 0; + +#ifdef CV_SSE2 + __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128i z = _mm_setzero_si128(); + __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1-1)); + __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1-5-1)); + __m128 qA11 = _mm_setzero_ps(), qA12 = _mm_setzero_ps(), qA22 = _mm_setzero_ps(); + __m128i mmMask4_epi32; + get4BitMask(winSize.width, mmMask4_epi32); +#endif + + // extract the patch from the first image, compute covariation matrix of derivatives + int x, y; + for( y = 0; y < winSize.height; y++ ) + { + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; + const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; + + short* 
Iptr = (short*)(IWinBuf.data + y*IWinBuf.step); + short* dIptr = (short*)(derivIWinBuf.data + y*derivIWinBuf.step); + + x = 0; + +#ifdef CV_SSE2 + for( ; x <= winSize.width*cn; x += 4, dsrc += 4*2, dIptr += 4*2 ) + { + __m128i wMask = _mm_set_epi32(MaskSet * maskPtr[x+3], + MaskSet * maskPtr[x+2], + MaskSet * maskPtr[x+1], + MaskSet * maskPtr[x]); + __m128i v00, v01, v10, v11, t0, t1; + v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); + v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); + v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); + v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + + + + + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); + _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0,t0)); + + + + v00 = _mm_loadu_si128((const __m128i*)(dsrc)); + v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); + v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep)); + v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); + v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... 
+ + if( x + 4 > winSize.width) + { + v00 = _mm_and_si128(v00, mmMask4_epi32); + } + v00 = _mm_and_si128(v00, wMask); + + _mm_storeu_si128((__m128i*)dIptr, v00); + t0 = _mm_srai_epi32(v00, 16); // Iy0 Iy1 Iy2 Iy3 + t1 = _mm_srai_epi32(_mm_slli_epi32(v00, 16), 16); // Ix0 Ix1 Ix2 Ix3 + + __m128 fy = _mm_cvtepi32_ps(t0); + __m128 fx = _mm_cvtepi32_ps(t1); + + + qA22 = _mm_add_ps(qA22, _mm_mul_ps(fy, fy)); + qA12 = _mm_add_ps(qA12, _mm_mul_ps(fx, fy)); + qA11 = _mm_add_ps(qA11, _mm_mul_ps(fx, fx)); + } +#else + + for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2 ) + { + if( maskPtr[x] == 0) + { + dIptr[0] = 0; + dIptr[1] = 0; + continue; + } + int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + + src[x+step]*iw10 + src[x+step+cn]*iw11, W_BITS1-5); + int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + + dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 + + dsrc[dstep+cn2+1]*iw11, W_BITS1); + + Iptr[x] = (short)ival; + dIptr[0] = (short)ixval; + dIptr[1] = (short)iyval; + A11 += (float)(ixval*ixval); + A12 += (float)(ixval*iyval); + A22 += (float)(iyval*iyval); + } +#endif + } + +#ifdef CV_SSE2 + float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4]; + _mm_store_ps(A11buf, qA11); + _mm_store_ps(A12buf, qA12); + _mm_store_ps(A22buf, qA22); + A11 += A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; + A12 += A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; + A22 += A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; +#endif + + A11 *= FLT_SCALE; + A12 *= FLT_SCALE; + A22 *= FLT_SCALE; + + float D = A11*A22 - A12*A12; + float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + + 4.f*A12*A12))/(2 * winArea); + + if( err ) + err[ptidx] = (float)minEig; + + if( minEig < minEigThreshold || D < FLT_EPSILON ) + { + if( level == 0 && status ) + status[ptidx] = 0; + continue; + } + + D = 1.f/D; + + nextPt -= halfWin; + Point2f prevDelta(0,0); //relates to h(t-1) + __m128i mmMask0, mmMask1, mmMask; + 
getWBitMask(winSize.width, mmMask0, mmMask1, mmMask); + for( j = 0; j < criteria.maxCount; j++ ) + { + status[ptidx] = static_cast(j); + inextPt.x = cvFloor(nextPt.x); + inextPt.y = cvFloor(nextPt.y); + + + if( inextPt.x + halfWin.x < 0 || inextPt.x + halfWin.x>= derivI.cols || + inextPt.y + halfWin.y < 0 || inextPt.y + halfWin.y >= derivI.rows ) + { + if( level == 0 && status ) + status[ptidx] = 3; + break; + } + + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + float b1 = 0, b2 = 0; +#ifdef CV_SSE2 + qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps(); + __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps(); + + +#endif + for( y = 0; y < winSize.height; y++ ) + { + const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x )*cn; + const short* Iptr = (const short*)(IWinBuf.data + ( y)*IWinBuf.step); + const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); + + x = 0; +#ifdef CV_SSE2 + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + for( ; x <= winSize.width*cn; x += 8, dIptr += 8*2 ) + { + if( maskPtr[x ] == 0 && maskPtr[x+1] == 0 && maskPtr[x+2] == 0 && maskPtr[x+3] == 0 + && maskPtr[x+4] == 0 && maskPtr[x+5] == 0 && maskPtr[x+6] == 0 && maskPtr[x+7] == 0) + continue; + __m128i diff0 = _mm_loadu_si128((const __m128i*)(Iptr + x)), diff1; + __m128i v00 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x)), z); + __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); + __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); + __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), 
z); + + __m128i t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1-5); + + __m128i mmDiff_epi16 = _mm_subs_epi16(_mm_packs_epi32(t0, t1), diff0); + + __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... + __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); + + if( x > winSize.width*cn - 8) + { + Ixy_0 = _mm_and_si128(Ixy_0, mmMask0); + Ixy_1 = _mm_and_si128(Ixy_1, mmMask1); + } + + + diff1 = _mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13... + diff0 = _mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9... + + + v10 = _mm_mullo_epi16(Ixy_0, diff0); + v11 = _mm_mulhi_epi16(Ixy_0, diff0); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); + qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); + + // It * Ix It * Iy [4 ... 
7] + // for set 1 hi sigma 1 + + v10 = _mm_mullo_epi16(Ixy_1, diff1); + v11 = _mm_mulhi_epi16(Ixy_1, diff1); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); + qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); + } +#else + for( ; x < winSize.width*cn; x++, dIptr += 2 ) + { + if( dIptr[0] == 0 && dIptr[1] == 0) + continue; + int diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + + Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, + W_BITS1-5) - Iptr[x]; + + b1 += (float)(diff*dIptr[0]) * FLT_RESCALE; + b2 += (float)(diff*dIptr[1]) * FLT_RESCALE; + } +#endif + } + +#ifdef CV_SSE2 + float CV_DECL_ALIGNED(16) bbuf[4]; + _mm_store_ps(bbuf, _mm_add_ps(qb0, qb1)); + b1 += bbuf[0] + bbuf[2]; + b2 += bbuf[1] + bbuf[3]; +#endif + + b1 *= FLT_SCALE; + b2 *= FLT_SCALE; + + Point2f delta( (float)((A12*b2 - A22*b1) * D), + (float)((A12*b1 - A11*b2) * D)); + + + + nextPt += delta; + nextPts[ptidx] = nextPt + halfWin; + + if( delta.ddot(delta) <= criteria.epsilon) + break; + if(j > 0 && std::abs(delta.x - prevDelta.x) < 0.01 && + std::abs(delta.y - prevDelta.y) < 0.01) + { + nextPts[ptidx] -= delta*0.5f; + break; + } + + prevDelta = delta; + + } + + + + + + } + + } + + const Mat* prevImg; + const Mat* nextImg; + const Mat* prevDeriv; + const Mat* rgbPrevImg; + const Mat* rgbNextImg; + const Point2f* prevPts; + Point2f* nextPts; + uchar* status; + float* err; + int maxWinSize; + int minWinSize; + TermCriteria criteria; + int level; + int maxLevel; + int windowType; + float minEigThreshold; + bool useInitialFlow; + int crossSegmentationThreshold; +}; +} +} +}} +#endif \ No newline at end of file diff --git a/modules/optflow/src/rlof/rlof_invoker.hpp b/modules/optflow/src/rlof/rlof_invoker.hpp new file mode 100644 index 00000000000..6dbdfef441e --- /dev/null +++ b/modules/optflow/src/rlof/rlof_invoker.hpp @@ -0,0 +1,1513 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// + // + // 
IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. + // + // By downloading, copying, installing or using the software you agree to this license. + // If you do not agree to this license, do not download, install, + // copy or use the software. + // + // + // License Agreement + // For Open Source Computer Vision Library + // + // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. + // Copyright (C) 2009, Willow Garage Inc., all rights reserved. + // Third party copyrights are property of their respective owners. + // + // Redistribution and use in source and binary forms, with or without modification, + // are permitted provided that the following conditions are met: + // + // * Redistribution's of source code must retain the above copyright notice, + // this list of conditions and the following disclaimer. + // + // * Redistribution's in binary form must reproduce the above copyright notice, + // this list of conditions and the following disclaimer in the documentation + // and/or other materials provided with the distribution. + // + // * The name of the copyright holders may not be used to endorse or promote products + // derived from this software without specific prior written permission. + // + // This software is provided by the copyright holders and contributors "as is" and + // any express or implied warranties, including, but not limited to, the implied + // warranties of merchantability and fitness for a particular purpose are disclaimed. 
+ // In no event shall the Intel Corporation or contributors be liable for any direct, + // indirect, incidental, special, exemplary, or consequential damages + // (including, but not limited to, procurement of substitute goods or services; + // loss of use, data, or profits; or business interruption) however caused + // and on any theory of liability, whether in contract, strict liability, + // or tort (including negligence or otherwise) arising in any way out of + // the use of this software, even if advised of the possibility of such damage. + // + //M*/ +#ifndef _RLOF_INVOKER_HPP_ +#define _RLOF_INVOKER_HPP_ +#include "rlof_invokerbase.hpp" +namespace cv +{ +namespace optflow +{ +namespace rlof +{ +namespace ica +{ + int calc_diff(int diff,float fParam0,float fParam1,float param_2) + { + int abss = (diff < 0) ? -diff : diff; + if (abss > fParam1) + { + diff = 0; + } + else if (abss > fParam0 && diff >= 0) + { + //diff = fParam1 * (diff - fParam1); + diff = static_cast(param_2 * (diff - fParam1)); + } + else if (abss > fParam0 && diff < 0) + { + //diff = fParam1 * (diff + fParam1); + diff = static_cast(param_2 * (diff + fParam1)); + + } + return diff; + } + class TrackerInvoker : public cv::ParallelLoopBody + { + public: + TrackerInvoker( + const Mat& _prevImg, + const Mat& _prevDeriv, + const Mat& _nextImg, + const Mat& _rgbPrevImg, + const Mat& _rgbNextImg, + const Point2f* _prevPts, + Point2f* _nextPts, + uchar* _status, + float* _err, + int _level, + int _maxLevel, + int _winSize[2], + int _maxIteration, + bool _useInitialFlow, + int _supportRegionType, + std::vector _normSigmas, + float _minEigenValue, + int _crossSegmentationThreshold) + { + prevImg = &_prevImg; + prevDeriv = &_prevDeriv; + nextImg = &_nextImg; + rgbPrevImg = &_rgbPrevImg; + rgbNextImg = &_rgbNextImg; + prevPts = _prevPts; + nextPts = _nextPts; + status = _status; + err = _err; + minWinSize = _winSize[0]; + maxWinSize = _winSize[1]; + criteria.maxCount = _maxIteration; + criteria.epsilon = 
0.01; + level = _level; + maxLevel = _maxLevel; + windowType = _supportRegionType; + minEigThreshold = _minEigenValue; + paramReset = _normSigmas; + useInitialFlow = _useInitialFlow; + crossSegmentationThreshold = _crossSegmentationThreshold; + } + TrackerInvoker & operator=(const TrackerInvoker &) { return *this; }; + void operator()(const cv::Range& range) const + { + cv::Size winSize; + cv::Point2f halfWin; + + const Mat& I = *prevImg; + const Mat& J = *nextImg; + const Mat& derivI = *prevDeriv; + const Mat& BI = *rgbPrevImg; + + winSize = cv::Size(maxWinSize, maxWinSize); + int winMaskwidth = static_cast(ceil(winSize.width / 8.f)) * 16; + cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); + winMaskMatBuf.setTo(1); + + const float FLT_SCALE = (1.f / (1 << 20)); // 20 + + std::vector param = paramReset; + int cn = I.channels(), cn2 = cn * 2; + int winbufwidth = static_cast(ceil(winSize.width / 8.f)) * 8; + cv::Size winBufSize(winbufwidth, winbufwidth); + + std::vector _buf(winBufSize.area()*(cn + cn2)); + Mat IWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn), &_buf[0]); + Mat derivIWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn2), &_buf[winBufSize.area()*cn]); + + for (int ptidx = range.start; ptidx < range.end; ptidx++) + { + Point2f prevPt = prevPts[ptidx] * (float)(1. / (1 << level)); + Point2f nextPt; + if (level == maxLevel) + { + if ( useInitialFlow) + nextPt = nextPts[ptidx] * (float)(1. 
/ (1 << level)); + else + nextPt = prevPt; + } + else + nextPt = nextPts[ptidx] * 2.f; + nextPts[ptidx] = nextPt; + + Point2i iprevPt, inextPt; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + int winArea = maxWinSize * maxWinSize; + cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0, 0, maxWinSize, maxWinSize)); + + if (calcWinMaskMat(BI, windowType, iprevPt, + winMaskMat, winSize, halfWin, winArea, + minWinSize, maxWinSize, crossSegmentationThreshold) == false) + continue; + + + prevPt -= halfWin; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + + if (iprevPt.x + halfWin.x < 0 || iprevPt.x + halfWin.x >= derivI.cols || + iprevPt.y + halfWin.y < 0 || iprevPt.y + halfWin.y >= derivI.rows) + { + if (level == 0) + { + if (status) + status[ptidx] = 3; + if (err) + err[ptidx] = 0; + } + continue; + } + + float a = prevPt.x - iprevPt.x; + float b = prevPt.y - iprevPt.y; + const int W_BITS = 14, W_BITS1 = 14; + + int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + + int dstep = (int)(derivI.step / derivI.elemSize1()); + int step = (int)(I.step / I.elemSize1()); + int mStep = (int)(winMaskMat.step / winMaskMat.elemSize1()); + CV_Assert(step == (int)(J.step / J.elemSize1())); + float A11 = 0, A12 = 0, A22 = 0; + float b1 = 0, b2 = 0; + float D = 0; + float minEig; + +#ifdef CV_SSE2 + __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128i z = _mm_setzero_si128(); + __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1 - 1)); + __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1 - 5 - 1)); + __m128 qA11 = _mm_setzero_ps(), qA12 = _mm_setzero_ps(), qA22 = _mm_setzero_ps(); + __m128i mmMask4_epi32; + __m128i mmMaskSet_epi16 = _mm_set1_epi16(std::numeric_limits::max()); + get4BitMask(winSize.width, mmMask4_epi32); +#endif + + // extract the patch from 
the first image, compute covariation matrix of derivatives + int x, y; + for (y = 0; y < winSize.height; y++) + { + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; + const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; + + short* Iptr = (short*)(IWinBuf.data + y * IWinBuf.step); + short* dIptr = (short*)(derivIWinBuf.data + y * derivIWinBuf.step); + + x = 0; + +#ifdef CV_SSE2 + + for (; x <= winSize.width*cn; x += 4, dsrc += 4 * 2, dIptr += 4 * 2) + { + __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16); + __m128i mask_0_3_epi16 = _mm_unpacklo_epi16(mask_0_7_epi16, mask_0_7_epi16); + + __m128i v00, v01, v10, v11, t0, t1; + v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); + v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); + v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); + v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1 - 5); + if (x + 4 > winSize.width) + { + t0 = _mm_and_si128(t0, mmMask4_epi32); + } + t0 = _mm_and_si128(t0, mask_0_3_epi16); + _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0, t0)); + + v00 = _mm_loadu_si128((const __m128i*)(dsrc)); + v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); + v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep)); + v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + 
_mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); + v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... + if (x + 4 > winSize.width) + { + v00 = _mm_and_si128(v00, mmMask4_epi32); + } + v00 = _mm_and_si128(v00, mask_0_3_epi16); + _mm_storeu_si128((__m128i*)dIptr, v00); + } +#else + + for (; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2) + { + if (maskPtr[x] == 0) + { + dIptr[0] = 0; + dIptr[1] = 0; + continue; + } + int ival = CV_DESCALE(src[x] * iw00 + src[x + cn] * iw01 + + src[x + step] * iw10 + src[x + step + cn] * iw11, W_BITS1 - 5); + int ixval = CV_DESCALE(dsrc[0] * iw00 + dsrc[cn2] * iw01 + + dsrc[dstep] * iw10 + dsrc[dstep + cn2] * iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1] * iw00 + dsrc[cn2 + 1] * iw01 + dsrc[dstep + 1] * iw10 + + dsrc[dstep + cn2 + 1] * iw11, W_BITS1); + + Iptr[x] = (short)ival; + dIptr[0] = (short)ixval; + dIptr[1] = (short)iyval; + } +#endif + } + + cv::Mat residualMat = cv::Mat::zeros(winSize.height * (winSize.width + 8) * cn, 1, CV_16SC1); + + cv::Point2f backUpNextPt = nextPt; + nextPt -= halfWin; + int j; + __m128i mmMask0, mmMask1, mmMask; + getWBitMask(winSize.width, mmMask0, mmMask1, mmMask); + float MEstimatorScale = 1; + int buffIdx = 0; + cv::Point2f prevDelta(0, 0); + + for (j = 0; j < criteria.maxCount; j++) + { + inextPt.x = cvFloor(nextPt.x); + inextPt.y = cvFloor(nextPt.y); + + if (inextPt.x < -winSize.width || inextPt.x >= J.cols || + inextPt.y < -winSize.height || inextPt.y >= J.rows) + { + if (level == 0 && status) + status[ptidx] = 3; + if (level > 0) + nextPts[ptidx] = backUpNextPt; + break; + } + + + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + b1 = 0; + b2 = 0; + + if (j == 0 ) + { + 
A11 = 0; + A12 = 0; + A22 = 0; + } + + if (j == 0 ) + { + buffIdx = 0; + for (y = 0; y < winSize.height; y++) + { + const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x)*cn; + const short* Iptr = (const short*)(IWinBuf.data + (y)*IWinBuf.step); + const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + x = 0; + for (; x < winSize.width*cn; x++, dIptr += 2) + { + if (maskPtr[x] == 0) + continue; + int diff = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + Jptr[x + step] * iw10 + Jptr[x + step + cn] * iw11, W_BITS1 - 5) - Iptr[x]; + residualMat.at(buffIdx++) = static_cast(diff); + } + } + /*! Estimation for the residual */ + cv::Mat residualMatRoi(residualMat, cv::Rect(0, 0, 1, buffIdx)); + MEstimatorScale = (buffIdx == 0) ? 1.f : estimateScale(residualMatRoi); + } + + float eta = 1.f / winArea; + float fParam0 = param[0] * 32.f; + float fParam1 = param[1] * 32.f; + fParam0 = param[0] * MEstimatorScale; + fParam1 = param[1] * MEstimatorScale; +#ifdef CV_SSE2 + qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps(); + __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps(); + __m128i mmParam0 = _mm_set1_epi16(MIN(std::numeric_limits::max() - 1, static_cast(fParam0))); + __m128i mmParam1 = _mm_set1_epi16(MIN(std::numeric_limits::max() - 1, static_cast(fParam1))); + __m128 mmParam2 = _mm_set1_ps(param[2]); + float s2Val = param[2] > 0 ? param[2] : -param[2]; + int s2bitShift = param[2] == 0 ? 
1 : static_cast(ceil(log(200.f / s2Val) / log(2.f))); // zwei stellen nach den komma + __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast(param[2] * (float)(1 << s2bitShift))); + __m128i mmOness_epi16 = _mm_set1_epi16(1 << s2bitShift); + __m128 mmParam2s = _mm_set1_ps(0.01f * param[2]); + __m128 mmParam2s2 = _mm_set1_ps(param[2] * param[2]); + __m128 mmOnes = _mm_set1_ps(1.f); + __m128 mmError0 = _mm_setzero_ps(); + __m128 mmError1 = _mm_setzero_ps(); + __m128i mmEta = _mm_setzero_si128(); + __m128i mmEtaNeg = _mm_set1_epi16(-1); + __m128i mmScale = _mm_set1_epi16(static_cast(MEstimatorScale)); + +#endif + + buffIdx = 0; + for (y = 0; y < winSize.height; y++) + { + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x)*cn; + const short* Iptr = (const short*)(IWinBuf.data + (y)*IWinBuf.step); + const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); + + x = 0; +#ifdef CV_SSE2 + for (; x <= winSize.width*cn; x += 8, dIptr += 8 * 2) + { + __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16); + __m128i diff0, diff1; + __m128i I_0_7_epi16 = _mm_loadu_si128((const __m128i*)(Iptr + x)); // von element 0 bis 7 + + __m128i v00 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x)), z); + __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); + __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); + __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); + + __m128i t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), 
W_BITS1 - 5); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1 - 5); + + __m128i mmDiff_epi16 = _mm_subs_epi16(_mm_packs_epi32(t0, t1), I_0_7_epi16); + + mmDiff_epi16 = _mm_and_si128(mmDiff_epi16, mask_0_7_epi16); + + + __m128i scalediffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, mmScale); + mmEta = _mm_add_epi16(mmEta, _mm_add_epi16(_mm_and_si128(scalediffIsPos_epi16, _mm_set1_epi16(2)), _mm_set1_epi16(-1))); + + + __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... + __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); + + __m128i abs_epi16 = _mm_abs_epi16(mmDiff_epi16); + __m128i bSet2_epi16, bSet1_epi16; + // |It| < sigma1 ? + bSet2_epi16 = _mm_cmplt_epi16(abs_epi16, mmParam1); + // It > 0 ? + __m128i diffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, _mm_setzero_si128()); + // sigma0 < |It| < sigma1 ? + bSet1_epi16 = _mm_and_si128(bSet2_epi16, _mm_cmpgt_epi16(abs_epi16, mmParam0)); + // val = |It| -/+ sigma1 + __m128i tmpParam1_epi16 = _mm_add_epi16(_mm_and_si128(diffIsPos_epi16, _mm_sub_epi16(mmDiff_epi16, mmParam1)), + _mm_andnot_si128(diffIsPos_epi16, _mm_add_epi16(mmDiff_epi16, mmParam1))); + // It == 0 ? |It| > sigma13 + mmDiff_epi16 = _mm_and_si128(bSet2_epi16, mmDiff_epi16); + // It == val ? 
sigma0 < |It| < sigma1 + mmDiff_epi16 = _mm_blendv_epi8(mmDiff_epi16, tmpParam1_epi16, bSet1_epi16); + + + __m128i tale_epi16_ = _mm_blendv_epi8(mmOness_epi16, mmParam2_epi16, bSet1_epi16); // mask for 0 - 3 + // diff = diff * sigma2 + __m128i lo = _mm_mullo_epi16(tale_epi16_, mmDiff_epi16); + __m128i hi = _mm_mulhi_epi16(tale_epi16_, mmDiff_epi16); + __m128i diff_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), s2bitShift); //diff 0_3_epi32 + __m128i diff_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), s2bitShift); // diff 4_7_epi32 + + mmDiff_epi16 = _mm_packs_epi32(diff_0_3_epi32, diff_4_7_epi32); + diff1 = _mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13... + diff0 = _mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9... + + // Ix * diff / Iy * diff + v10 = _mm_mullo_epi16(Ixy_0, diff0); + v11 = _mm_mulhi_epi16(Ixy_0, diff0); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); + qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); + + // It * Ix It * Iy [4 ... 
7] + // for set 1 hi sigma 1 + + v10 = _mm_mullo_epi16(Ixy_1, diff1); + v11 = _mm_mulhi_epi16(Ixy_1, diff1); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); + qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); + + + if (j == 0) + { + __m128 bSet1_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet1_epi16)); + __m128 bSet1_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet1_epi16, bSet1_epi16), 16)); + __m128 mask_0_4_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mask_0_7_epi16)); + __m128 mask_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mask_0_7_epi16, mask_0_7_epi16), 16))); + + __m128 bSet2_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet2_epi16)); + __m128 bSet2_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet2_epi16, bSet2_epi16), 16)); + + __m128 tale_0_3_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_0_3_ps); + __m128 tale_4_7_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_4_7_ps); + tale_0_3_ps = _mm_blendv_ps(mmParam2s, tale_0_3_ps, bSet2_0_3_ps); + tale_4_7_ps = _mm_blendv_ps(mmParam2s, tale_4_7_ps, bSet2_4_7_ps); + + tale_0_3_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_0_3_ps, mask_0_4_ps); + tale_4_7_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_4_7_ps, mask_4_7_ps); + + t0 = _mm_srai_epi32(Ixy_0, 16); // Iy0 Iy1 Iy2 Iy3 + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16); // Ix0 Ix1 Ix2 Ix3 + + __m128 fy = _mm_cvtepi32_ps(t0); + __m128 fx = _mm_cvtepi32_ps(t1); + + // A11 - A22 + __m128 fxtale = _mm_mul_ps(fx, tale_0_3_ps); + __m128 fytale = _mm_mul_ps(fy, tale_0_3_ps); + + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); + mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); + mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); + + t0 = _mm_srai_epi32(Ixy_1, 16); // Iy8 Iy9 Iy10 Iy11 + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_1, 16), 16); // Ix0 Ix1 Ix2 Ix3 + + fy = _mm_cvtepi32_ps(t0); + fx = _mm_cvtepi32_ps(t1); + + // A11 - A22 + fxtale = _mm_mul_ps(fx, 
tale_4_7_ps); + fytale = _mm_mul_ps(fy, tale_4_7_ps); + + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); + mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); + mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); + } + } +#else + for (; x < winSize.width*cn; x++, dIptr += 2) + { + if (maskPtr[x] == 0) + continue; + int diff = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + + Jptr[x + step] * iw10 + Jptr[x + step + cn] * iw11, + W_BITS1 - 5) - Iptr[x]; + + + + + + if (diff > MEstimatorScale) + MEstimatorScale += eta; + if (diff < MEstimatorScale) + MEstimatorScale -= eta; + + calc_diff(diff, fParam0, fParam1, param[2]); + + + float ixval = (float)(dIptr[0]); + float iyval = (float)(dIptr[1]); + b1 += (float)(diff*ixval); + b2 += (float)(diff*iyval); + + if ( j == 0) + { + int abss = (diff < 0) ? -diff : diff; + float tale = param[2] * FLT_RESCALE; + //float tale = fParam1 * FLT_RESCALE; + if (abss < fParam0) + { + tale = FLT_RESCALE; + } + else if (abss > fParam1) + { + tale *= 0.01f; + } + else + { + tale *= param[2]; + } + A11 += (float)(ixval*ixval*tale); + A12 += (float)(ixval*iyval*tale); + A22 += (float)(iyval*iyval*tale); + } + + + } +#endif + } + +#ifdef CV_SSE2 + short etaValues[8]; + _mm_storeu_si128((__m128i*)(etaValues), mmEta); + MEstimatorScale += eta * (etaValues[0] + etaValues[1] + etaValues[2] + etaValues[3] + + etaValues[4] + etaValues[5] + etaValues[6] + etaValues[7]); + + +#endif + if (j == 0) + { +#ifdef CV_SSE2 + float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];// + + _mm_store_ps(A11buf, mmAxx); + _mm_store_ps(A12buf, mmAxy); + _mm_store_ps(A22buf, mmAyy); + + + A11 = A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; + A12 = A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; + A22 = A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; +#endif + A11 *= FLT_SCALE; + A12 *= FLT_SCALE; + A22 *= FLT_SCALE; + + + D = A11 * A22 - A12 * A12; + minEig = (A22 + A11 - std::sqrt((A11 - A22)*(A11 - A22) + + 4.f*A12*A12)) / (2 * winArea); + D = 1.f / D; + + 
if (minEig < minEigThreshold || std::abs(D) < FLT_EPSILON) + { + if (level == 0 && status) + status[ptidx] = 0; + if (level > 0) + { + nextPts[ptidx] = backUpNextPt; + } + break; + } + } + +#ifdef CV_SSE2 + float CV_DECL_ALIGNED(16) bbuf[4]; + _mm_store_ps(bbuf, _mm_add_ps(qb0, qb1)); + b1 += bbuf[0] + bbuf[2]; + b2 += bbuf[1] + bbuf[3]; +#endif + + b1 *= FLT_SCALE; + b2 *= FLT_SCALE; + + + Point2f delta((float)((A12*b2 - A22 * b1) * D), (float)((A12*b1 - A11 * b2) * D)); + + delta.x = (delta.x != delta.x) ? 0 : delta.x; + delta.y = (delta.y != delta.y) ? 0 : delta.y; + + nextPt += delta * 0.7; + nextPts[ptidx] = nextPt + halfWin; + + if (j > 0 && std::abs(delta.x - prevDelta.x) < 0.01 && + std::abs(delta.y - prevDelta.y) < 0.01) + { + nextPts[ptidx] -= delta * 0.5f; + break; + } + + prevDelta = delta; + + } + + } + + } + + const Mat* prevImg; + const Mat* nextImg; + const Mat* prevDeriv; + const Mat* rgbPrevImg; + const Mat* rgbNextImg; + const Point2f* prevPts; + Point2f* nextPts; + uchar* status; + float* err; + int maxWinSize; + int minWinSize; + TermCriteria criteria; + int level; + int maxLevel; + int windowType; + float minEigThreshold; + bool useInitialFlow; + std::vector paramReset; + int crossSegmentationThreshold; + }; +} +namespace radial +{ + class TrackerInvoker : public cv::ParallelLoopBody + { + public: + TrackerInvoker( + const Mat& _prevImg, + const Mat& _prevDeriv, + const Mat& _nextImg, + const Mat& _rgbPrevImg, + const Mat& _rgbNextImg, + const Point2f* _prevPts, + Point2f* _nextPts, + uchar* _status, + float* _err, + Point2f* _gainVecs, + int _level, + int _maxLevel, + int _winSize[2], + int _maxIteration, + bool _useInitialFlow, + int _supportRegionType, + std::vector _normSigmas, + float _minEigenValue, + int _crossSegmentationThreshold) + { + prevImg = &_prevImg; + prevDeriv = &_prevDeriv; + nextImg = &_nextImg; + rgbPrevImg = &_rgbPrevImg; + rgbNextImg = &_rgbNextImg; + prevPts = _prevPts; + nextPts = _nextPts; + status = _status; + err = 
_err; + gainVecs = _gainVecs; + minWinSize = _winSize[0]; + maxWinSize = _winSize[1]; + criteria.maxCount = _maxIteration; + criteria.epsilon = 0.01; + level = _level; + maxLevel = _maxLevel; + windowType = _supportRegionType; + minEigThreshold = _minEigenValue; + paramReset = _normSigmas; + useInitialFlow = _useInitialFlow; + crossSegmentationThreshold = _crossSegmentationThreshold; + } + + TrackerInvoker & operator=(const TrackerInvoker &) { return *this; }; + void operator()(const cv::Range& range) const + { + Point2f halfWin; + cv::Size winSize; + const Mat& I = *prevImg; + const Mat& J = *nextImg; + const Mat& derivI = *prevDeriv; + const Mat& BI = *rgbPrevImg; + + + const float FLT_SCALE = (1.f / (1 << 16)); + + winSize = cv::Size(maxWinSize, maxWinSize); + int winMaskwidth = static_cast(ceil(winSize.width / 16.f)) * 16; + cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); + winMaskMatBuf.setTo(1); + + std::vector param = paramReset; + int cn = I.channels(), cn2 = cn * 2; + int winbufwidth = static_cast(ceil(winSize.width / 16.f)) * 16; + cv::Size winBufSize(winbufwidth, winbufwidth); + + cv::Mat invTensorMat(4, 4, CV_32FC1); + cv::Mat mismatchMat(4, 1, CV_32FC1); + cv::Mat resultMat(4, 1, CV_32FC1); + + + std::vector _buf(winBufSize.area()*(cn + cn2)); + Mat IWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn), &_buf[0]); + Mat derivIWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn2), &_buf[winBufSize.area()*cn]); + + for (int ptidx = range.start; ptidx < range.end; ptidx++) + { + + Point2f prevPt = prevPts[ptidx] * (float)(1. / (1 << level)); + Point2f nextPt; + if (level == maxLevel) + { + if (useInitialFlow) + { + nextPt = nextPts[ptidx] * (float)(1. 
/ (1 << level)); + } + else + nextPt = prevPt; + } + else + { + nextPt = nextPts[ptidx] * 2.f; + + } + nextPts[ptidx] = nextPt; + + Point2i iprevPt, inextPt; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + int winArea = maxWinSize * maxWinSize; + cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0, 0, maxWinSize, maxWinSize)); + winMaskMatBuf.setTo(0); + if (calcWinMaskMat(BI, windowType, iprevPt, + winMaskMat, winSize, halfWin, winArea, + minWinSize, maxWinSize, crossSegmentationThreshold) == false) + continue; + + prevPt -= halfWin; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + + if (iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols || + iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows) + { + if (level == 0) + { + if (status) + status[ptidx] = 3; + if (err) + err[ptidx] = 0; + } + continue; + } + + float a = prevPt.x - iprevPt.x; + float b = prevPt.y - iprevPt.y; + const int W_BITS = 14, W_BITS1 = 14; + + int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + + int dstep = (int)(derivI.step / derivI.elemSize1()); + int step = (int)(I.step / I.elemSize1()); + int mStep = (int)(winMaskMat.step / winMaskMat.elemSize1()); + CV_Assert(step == (int)(J.step / J.elemSize1())); + + float A11 = 0, A12 = 0, A22 = 0; + float b1 = 0, b2 = 0, b3 = 0, b4 = 0; + // tensor + float sumIx = 0; + float sumIy = 0; + float sumI = 0; + float sumW = 0; + float w1 = 0, w2 = 0; // -IyI + float dI = 0; // I^2 + float D = 0; + +#ifdef CV_SSE2 + __m128i rightMask = _mm_set_epi16(0, std::numeric_limits::max(), 0, std::numeric_limits::max(), 0, std::numeric_limits::max(), 0, std::numeric_limits::max()); + __m128i leftMask = _mm_set_epi16(std::numeric_limits::max(), 0, std::numeric_limits::max(), 0, std::numeric_limits::max(), 0, std::numeric_limits::max(), 0); + __m128i qw0 = _mm_set1_epi32(iw00 + 
(iw01 << 16)); + __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128i z = _mm_setzero_si128(); + __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1 - 1)); + __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1 - 5 - 1)); + __m128i mmMask4_epi32; + __m128i mmMaskSet_epi16 = _mm_set1_epi16(std::numeric_limits::max()); + get4BitMask(winSize.width, mmMask4_epi32); + __m128 qA11 = _mm_setzero_ps(), qA12 = _mm_setzero_ps(), qA22 = _mm_setzero_ps(); +#endif + + // extract the patch from the first image, compute covariation matrix of derivatives + int x, y; + for (y = 0; y < winSize.height; y++) + { + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; + const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; + + short* Iptr = (short*)(IWinBuf.data + y * IWinBuf.step); + short* dIptr = (short*)(derivIWinBuf.data + y * derivIWinBuf.step); + + x = 0; + +#ifdef CV_SSE2 + for (; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4 * 2, dIptr += 4 * 2) + { + __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16); + __m128i mask_0_3_epi16 = _mm_unpacklo_epi16(mask_0_7_epi16, mask_0_7_epi16); + __m128i v00, v01, v10, v11, t0, t1; + v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); + v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); + v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); + v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1 - 5); + if (x + 4 > winSize.width) + { + t0 = _mm_and_si128(t0, mmMask4_epi32); + } + t0 = _mm_and_si128(t0, mask_0_3_epi16); + _mm_storel_epi64((__m128i*)(Iptr + x), 
_mm_packs_epi32(t0, t0)); + + + v00 = _mm_loadu_si128((const __m128i*)(dsrc)); + v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); + v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep)); + v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); + v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... + if (x + 4 > winSize.width) + { + v00 = _mm_and_si128(v00, mmMask4_epi32); + } + v00 = _mm_and_si128(v00, mask_0_3_epi16); + _mm_storeu_si128((__m128i*)dIptr, v00); + } +#else + + for (; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2) + { + if (winMaskMat.at(y, x) == 0) + { + dIptr[0] = 0; + dIptr[1] = 0; + continue; + } + int ival = CV_DESCALE(src[x] * iw00 + src[x + cn] * iw01 + + src[x + step] * iw10 + src[x + step + cn] * iw11, W_BITS1 - 5); + int ixval = CV_DESCALE(dsrc[0] * iw00 + dsrc[cn2] * iw01 + + dsrc[dstep] * iw10 + dsrc[dstep + cn2] * iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1] * iw00 + dsrc[cn2 + 1] * iw01 + dsrc[dstep + 1] * iw10 + + dsrc[dstep + cn2 + 1] * iw11, W_BITS1); + + Iptr[x] = (short)ival; + dIptr[0] = (short)ixval; + dIptr[1] = (short)iyval; + + } +#endif + } + + cv::Mat residualMat = cv::Mat::zeros(winSize.height * (winSize.width + 8) * cn, 1, CV_16SC1); + + cv::Point2f backUpNextPt = nextPt; + nextPt -= halfWin; + Point2f prevDelta(0, 0); //related to h(t-1) + Point2f prevGain(0, 0); + cv::Point2f gainVec = gainVecs[ptidx]; + cv::Point2f backUpGain = gainVec; + cv::Size _winSize = winSize; + int j; + __m128i mmMask0, mmMask1, mmMask; + getWBitMask(_winSize.width, mmMask0, mmMask1, mmMask); + __m128 mmOnes = _mm_set1_ps(1.f); + __m128i mmOnes_epi16 = 
_mm_set1_epi16(1); + __m128i mmZeros_epi16 = _mm_set1_epi16(0); + float MEstimatorScale = 1; + int buffIdx = 0; + float minEigValue, maxEigValue; + + for (j = 0; j < criteria.maxCount; j++) + { + + inextPt.x = cvFloor(nextPt.x); + inextPt.y = cvFloor(nextPt.y); + + if (inextPt.x < -winSize.width || inextPt.x >= J.cols || + inextPt.y < -winSize.height || inextPt.y >= J.rows) + { + if (level == 0 && status) + status[ptidx] = 3; + if (level > 0) + { + nextPts[ptidx] = backUpNextPt; + gainVecs[ptidx] = backUpGain; + } + break; + } + + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + + // mismatch + b1 = 0; + b2 = 0; + b3 = 0; + b4 = 0; + if (j == 0) + { + // tensor + w1 = 0; // -IxI + w2 = 0; // -IyI + dI = 0; // I^2 + sumIx = 0; + sumIy = 0; + sumI = 0; + sumW = 0; + A11 = 0; + A12 = 0; + A22 = 0; + } + + if (j == 0 ) + { + buffIdx = 0; + for (y = 0; y < winSize.height; y++) + { + const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x)*cn; + const short* Iptr = (const short*)(IWinBuf.data + (y)*IWinBuf.step); + const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); + x = 0; + for (; x < winSize.width*cn; x++, dIptr += 2) + { + if (dIptr[0] == 0 && dIptr[1] == 0) + continue; + int diff = static_cast(CV_DESCALE( Jptr[x] * iw00 + + Jptr[x + cn] * iw01 + + Jptr[x + step] * iw10 + + Jptr[x + step + cn] * iw11, W_BITS1 - 5) + - Iptr[x] + Iptr[x] * gainVec.x + gainVec.y); + residualMat.at(buffIdx++) = static_cast(diff); + } + } + /*! Estimation for the residual */ + cv::Mat residualMatRoi(residualMat, cv::Rect(0, 0, 1, buffIdx)); + MEstimatorScale = (buffIdx == 0) ? 
1.f : estimateScale(residualMatRoi); + } + + float eta = 1.f / winArea; + float fParam0 = param[0] * 32.f; + float fParam1 = param[1] * 32.f; + fParam0 = param[0] * MEstimatorScale; + fParam1 = param[1] * MEstimatorScale; + +#ifdef CV_SSE2 + + qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps(), qb2 = _mm_setzero_ps(); + __m128 qb3 = _mm_setzero_ps(); + __m128 mmSumW1 = _mm_setzero_ps(), mmSumW2 = _mm_setzero_ps(); + __m128 mmSumI = _mm_setzero_ps(), mmSumW = _mm_setzero_ps(), mmSumDI = _mm_setzero_ps(); + __m128 mmSumIy = _mm_setzero_ps(), mmSumIx = _mm_setzero_ps(); + __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps(); + __m128i mmParam0 = _mm_set1_epi16(MIN(std::numeric_limits::max() - 1, static_cast(fParam0))); + __m128i mmParam1 = _mm_set1_epi16(MIN(std::numeric_limits::max() - 1, static_cast(fParam1))); + + + float s2Val = param[2] > 0 ? param[2] : -param[2]; + int s2bitShift = param[2] == 0 ? 1 : static_cast(ceil(log(200.f / s2Val) / log(2.f))); // zwei stellen nach den komma + __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast(param[2] * (float)(1 << s2bitShift))); + __m128i mmOness_epi16 = _mm_set1_epi16(1 << s2bitShift); + __m128 mmParam2 = _mm_set1_ps(param[2]); + __m128 mmParam2s = _mm_set1_ps(0.01f * param[2]); + __m128 mmParam2s2 = _mm_set1_ps(param[2] * param[2]); + + float gainVal = gainVec.x > 0 ? gainVec.x : -gainVec.x; + int bitShift = gainVec.x == 0 ? 
1 : static_cast(ceil(log(200.f / gainVal) / log(2.f))); // zwei stellen nach den komma + + __m128i mmGainValue_epi16 = _mm_set1_epi16(static_cast(gainVec.x * (float)(1 << bitShift))); + __m128i mmConstValue_epi16 = _mm_set1_epi16(static_cast(gainVec.y)); + __m128i mmEta = _mm_setzero_si128(); + __m128i mmEtaNeg = _mm_set1_epi16(-1); + __m128i mmScale = _mm_set1_epi16(static_cast(MEstimatorScale)); + +#endif + buffIdx = 0; + + for (y = 0; y < _winSize.height; y++) + { + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x)*cn; + const short* Iptr = (const short*)(IWinBuf.data + y * IWinBuf.step); + const short* dIptr = (const short*)(derivIWinBuf.data + y * derivIWinBuf.step); + + x = 0; +#ifdef CV_SSE2 + + + for (; x <= _winSize.width*cn; x += 8, dIptr += 8 * 2) + { + __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16); + __m128i diff0, diff1; + __m128i I_0_7_epi16 = _mm_loadu_si128((const __m128i*)(Iptr + x)); // von element 0 bis 7 + __m128i v00 = _mm_unpacklo_epi8( + _mm_loadl_epi64((const __m128i*)(Jptr + x)) , z); //J0 , 0, J1, 0, J2, 0 ... J7,0 + __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); + __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); + __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); + + __m128i t0 = _mm_add_epi32 + (_mm_madd_epi16( + _mm_unpacklo_epi16(v00, v01), //J0, , J1, J1, J2, J2, ... 
J3 , J4 + qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1 - 5); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1 - 5); + + + __m128i lo = _mm_mullo_epi16(mmGainValue_epi16, I_0_7_epi16); + __m128i hi = _mm_mulhi_epi16(mmGainValue_epi16, I_0_7_epi16); + + __m128i Igain_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), bitShift); + __m128i Igain_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), bitShift); + __m128i Igain_epi16 = _mm_packs_epi32(Igain_0_3_epi32, Igain_4_7_epi32); + + // diff = J - I + I*gain.x + gain.y + __m128i mmDiff_epi16 = _mm_add_epi16( + _mm_subs_epi16(_mm_packs_epi32(t0, t1), I_0_7_epi16), // J - I + _mm_add_epi16(Igain_epi16, mmConstValue_epi16)); + + + mmDiff_epi16 = _mm_and_si128(mmDiff_epi16, mask_0_7_epi16); + + + __m128i scalediffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, mmScale); + mmEta = _mm_add_epi16(mmEta, _mm_add_epi16(_mm_and_si128(scalediffIsPos_epi16, _mm_set1_epi16(2)), _mm_set1_epi16(-1))); + + + __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... + __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); + + + __m128i abs_epi16 = _mm_abs_epi16(mmDiff_epi16); + __m128i bSet2_epi16, bSet1_epi16; + // |It| < sigma1 ? + bSet2_epi16 = _mm_cmplt_epi16(abs_epi16, mmParam1); + // It > 0 ? + __m128i diffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, _mm_setzero_si128()); + // sigma0 < |It| < sigma1 ? + bSet1_epi16 = _mm_and_si128(bSet2_epi16, _mm_cmpgt_epi16(abs_epi16, mmParam0)); + // val = |It| -/+ sigma1 + __m128i tmpParam1_epi16 = _mm_add_epi16(_mm_and_si128(diffIsPos_epi16, _mm_sub_epi16(mmDiff_epi16, mmParam1)), + _mm_andnot_si128(diffIsPos_epi16, _mm_add_epi16(mmDiff_epi16, mmParam1))); + // It == 0 ? 
|It| > sigma13 + mmDiff_epi16 = _mm_and_si128(bSet2_epi16, mmDiff_epi16); + // It == val ? sigma0 < |It| < sigma1 + mmDiff_epi16 = _mm_blendv_epi8(mmDiff_epi16, tmpParam1_epi16, bSet1_epi16); + + + __m128i tale_epi16_ = _mm_blendv_epi8(mmOness_epi16, mmParam2_epi16, bSet1_epi16); // mask for 0 - 3 + // diff = diff * sigma2 + lo = _mm_mullo_epi16(tale_epi16_, mmDiff_epi16); + hi = _mm_mulhi_epi16(tale_epi16_, mmDiff_epi16); + __m128i diff_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), s2bitShift); //diff 0_3_epi32 + __m128i diff_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), s2bitShift); // diff 4_7_epi32 + + mmDiff_epi16 = _mm_packs_epi32(diff_0_3_epi32, diff_4_7_epi32); // ,da das ergebniss kleiner als 16 bit sein sollte + + diff1 = _mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13... + diff0 = _mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9... + + // Ix * diff / Iy * diff + v10 = _mm_mullo_epi16(Ixy_0, diff0); + v11 = _mm_mulhi_epi16(Ixy_0, diff0); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); + qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); + + // It * Ix It * Iy [4 ... 7] + // for set 1 hi sigma 1 + + v10 = _mm_mullo_epi16(Ixy_1, diff1); + v11 = _mm_mulhi_epi16(Ixy_1, diff1); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); + qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); + // diff * J [0 ... 
7] + // for set 1 sigma 1 + // b3 += diff * Iptr[x] + + v10 = _mm_mullo_epi16(mmDiff_epi16, I_0_7_epi16); + v11 = _mm_mulhi_epi16(mmDiff_epi16, I_0_7_epi16); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb2 = _mm_add_ps(qb2, _mm_cvtepi32_ps(v00)); + qb2 = _mm_add_ps(qb2, _mm_cvtepi32_ps(v10)); + + qb3 = _mm_add_ps(qb3, _mm_cvtepi32_ps(diff_0_3_epi32)); + qb3 = _mm_add_ps(qb3, _mm_cvtepi32_ps(diff_4_7_epi32)); + + if (j == 0) + { + __m128 bSet1_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet1_epi16)); + __m128 bSet1_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet1_epi16, bSet1_epi16), 16)); + __m128 mask_0_4_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mask_0_7_epi16)); + __m128 mask_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mask_0_7_epi16, mask_0_7_epi16), 16))); + + __m128 bSet2_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet2_epi16)); + __m128 bSet2_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet2_epi16, bSet2_epi16), 16)); + + __m128 tale_0_3_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_0_3_ps); + __m128 tale_4_7_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_4_7_ps); + tale_0_3_ps = _mm_blendv_ps(mmParam2s, tale_0_3_ps, bSet2_0_3_ps); + tale_4_7_ps = _mm_blendv_ps(mmParam2s, tale_4_7_ps, bSet2_4_7_ps); + + tale_0_3_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_0_3_ps, mask_0_4_ps); + tale_4_7_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_4_7_ps, mask_4_7_ps); + + t0 = _mm_srai_epi32(Ixy_0, 16); // Iy0 Iy1 Iy2 Iy3 + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16); // Ix0 Ix1 Ix2 Ix3 + + __m128 fy = _mm_cvtepi32_ps(t0); + __m128 fx = _mm_cvtepi32_ps(t1); + + // 0 ... 
3 + __m128 I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(I_0_7_epi16, I_0_7_epi16), 16)); + + // A11 - A22 + __m128 fxtale = _mm_mul_ps(fx, tale_0_3_ps); + __m128 fytale = _mm_mul_ps(fy, tale_0_3_ps); + + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); + mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); + mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); + + // sumIx und sumIy + mmSumIx = _mm_add_ps(mmSumIx, fxtale); + mmSumIy = _mm_add_ps(mmSumIy, fytale); + + mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fxtale)); + mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fytale)); + + // sumI + __m128 I_tale_ps = _mm_mul_ps(I_ps, tale_0_3_ps); + mmSumI = _mm_add_ps(mmSumI, I_tale_ps); + + // sumW + mmSumW = _mm_add_ps(mmSumW, tale_0_3_ps); + + // sumDI + mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps(I_ps, I_tale_ps)); + + t0 = _mm_srai_epi32(Ixy_1, 16); // Iy8 Iy9 Iy10 Iy11 + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_1, 16), 16); // Ix0 Ix1 Ix2 Ix3 + + fy = _mm_cvtepi32_ps(t0); + fx = _mm_cvtepi32_ps(t1); + + // 4 ... 
7 + I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(I_0_7_epi16, I_0_7_epi16), 16)); + + // A11 - A22 + fxtale = _mm_mul_ps(fx, tale_4_7_ps); + fytale = _mm_mul_ps(fy, tale_4_7_ps); + + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); + mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); + mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); + + // sumIx und sumIy + mmSumIx = _mm_add_ps(mmSumIx, fxtale); + mmSumIy = _mm_add_ps(mmSumIy, fytale); + + mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fxtale)); + mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fytale)); + + // sumI + I_tale_ps = _mm_mul_ps(I_ps, tale_4_7_ps); + mmSumI = _mm_add_ps(mmSumI, I_tale_ps); + + // sumW + mmSumW = _mm_add_ps(mmSumW, tale_4_7_ps); + + // sumDI + mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps(I_ps, I_tale_ps)); + } + + } +#else + for (; x < _winSize.width*cn; x++, dIptr += 2) + { + if (maskPtr[x] == 0) + continue; + + float J = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + + Jptr[x + step] * iw10 + Jptr[x + step + cn] * iw11, + W_BITS1 - 5); + short ixval = static_cast(dIptr[0]); + short iyval = static_cast(dIptr[1]); + int diff = J - static_cast(Iptr[x]) + Iptr[x] * gainVec.x + gainVec.y; + int abss = (diff < 0) ? 
-diff : diff; + if (diff > MEstimatorScale) + MEstimatorScale += eta; + if (diff < MEstimatorScale) + MEstimatorScale -= eta; + // compute the missmatch vector + + if (abss > fParam1) + { + diff = 0.f; + } + else if (abss > fParam0 && diff >= 0) + { + diff = param[2] * (diff - fParam1); + } + else if (abss > fParam0 && diff < 0) + { + diff = param[2] * (diff + fParam1); + } + + + b1 += (float)(diff*ixval); + b2 += (float)(diff*iyval); ; + b3 += (float)(diff)* Iptr[x]; + b4 += (float)(diff); + + + // compute the Gradient Matrice + if (j == 0) + { + float tale = param[2] * FLT_RESCALE; + if (abss < fParam0 || j < 0) + { + tale = FLT_RESCALE; + } + else if (abss > fParam1) + { + tale *= 0.01f; + } + else + { + tale *= param[2]; + } + if (j == 0) + { + A11 += (float)(ixval*ixval)*tale; + A12 += (float)(ixval*iyval)*tale; + A22 += (float)(iyval*iyval)*tale; + } + + dI += Iptr[x] * Iptr[x] * tale; + float dx = static_cast(dIptr[0]) * tale; + float dy = static_cast(dIptr[1]) * tale; + sumIx += dx; + sumIy += dy; + w1 += dx * Iptr[x]; + w2 += dy * Iptr[x]; + sumI += Iptr[x] * tale; + sumW += tale; + } + + } +#endif + } + + +#ifdef CV_SSE2 + short etaValues[8]; + _mm_storeu_si128((__m128i*)(etaValues), mmEta); + MEstimatorScale += eta * (etaValues[0] + etaValues[1] + etaValues[2] + etaValues[3] + + etaValues[4] + etaValues[5] + etaValues[6] + etaValues[7]); + +#endif + float CV_DECL_ALIGNED(32) wbuf[4];// + if (j == 0) + { +#ifdef CV_SSE2 + _mm_store_ps(wbuf, mmSumW1); + w1 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumW2); + w2 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumDI); + dI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumI); + sumI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumIx); + sumIx = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumIy); + sumIy = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumW); + sumW = wbuf[0] + wbuf[1] + wbuf[2] + 
wbuf[3]; +#endif + sumIx *= -FLT_SCALE; + sumIy *= -FLT_SCALE; + sumI *= FLT_SCALE; + sumW *= FLT_SCALE; + w1 *= -FLT_SCALE; + w2 *= -FLT_SCALE; + dI *= FLT_SCALE; + + +#ifdef CV_SSE2 + float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];// + + _mm_store_ps(A11buf, mmAxx); + _mm_store_ps(A12buf, mmAxy); + _mm_store_ps(A22buf, mmAyy); + + + A11 = A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; + A12 = A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; + A22 = A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; +#endif + A11 *= FLT_SCALE; + A12 *= FLT_SCALE; + A22 *= FLT_SCALE; + + + } + + +#ifdef CV_SSE2 + float CV_DECL_ALIGNED(16) bbuf[4]; + _mm_store_ps(bbuf, _mm_add_ps(qb0, qb1)); + b1 = bbuf[0] + bbuf[2]; + b2 = bbuf[1] + bbuf[3]; + _mm_store_ps(bbuf, qb2); + b3 = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; + _mm_store_ps(bbuf, qb3); + b4 = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; + +#endif + + mismatchMat.at(0, 0) = b1 * FLT_SCALE; + mismatchMat.at(1, 0) = b2 * FLT_SCALE; + mismatchMat.at(2, 0) = -b3 * FLT_SCALE; + mismatchMat.at(3, 0) = -b4 * FLT_SCALE; + + D = -A12 * A12*sumI*sumI + dI * sumW*A12*A12 + 2 * A12*sumI*sumIx*w2 + 2 * A12*sumI*sumIy*w1 + - 2 * dI*A12*sumIx*sumIy - 2 * sumW*A12*w1*w2 + A11 * A22*sumI*sumI - 2 * A22*sumI*sumIx*w1 + - 2 * A11*sumI*sumIy*w2 - sumIx * sumIx*w2*w2 + A22 * dI*sumIx*sumIx + 2 * sumIx*sumIy*w1*w2 + - sumIy * sumIy*w1*w1 + A11 * dI*sumIy*sumIy + A22 * sumW*w1*w1 + A11 * sumW*w2*w2 - A11 * A22*dI*sumW; + + float sqrtVal = std::sqrt((A11 - A22)*(A11 - A22) + 4.f*A12*A12); + minEigValue = (A22 + A11 - sqrtVal) / (2.0f*winArea);; + maxEigValue = (A22 + A11 + sqrtVal) / (2.0f*winArea); + if (minEigValue < minEigThreshold || std::abs(D) < FLT_EPSILON) + { + if (level == 0 && status) + status[ptidx] = 0; + if (level > 0) + { + nextPts[ptidx] = backUpNextPt; + gainVecs[ptidx] = backUpGain; + } + break; + } + + D = (1.f / D); + + invTensorMat.at(0, 0) = (A22*sumI*sumI - 2 * sumI*sumIy*w2 + dI * sumIy*sumIy + sumW * w2*w2 - A22 * dI*sumW)* D; 
invTensorMat.at(0, 1) = (A12*dI*sumW - A12 * sumI * sumI - dI * sumIx*sumIy + sumI * sumIx*w2 + sumI * sumIy*w1 - sumW * w1*w2)* D; + invTensorMat.at(0, 2) = (A12*sumI*sumIy - sumIy * sumIy*w1 - A22 * sumI*sumIx - A12 * sumW*w2 + A22 * sumW*w1 + sumIx * sumIy*w2)* D; + invTensorMat.at(0, 3) = (A22*dI*sumIx - A12 * dI*sumIy - sumIx * w2*w2 + A12 * sumI*w2 - A22 * sumI*w1 + sumIy * w1*w2) * D; + invTensorMat.at(1, 0) = invTensorMat.at(0, 1); + invTensorMat.at(1, 1) = (A11*sumI * sumI - 2 * sumI*sumIx*w1 + dI * sumIx * sumIx + sumW * w1*w1 - A11 * dI*sumW) * D; + invTensorMat.at(1, 2) = (A12*sumI*sumIx - A11 * sumI*sumIy - sumIx * sumIx*w2 + A11 * sumW*w2 - A12 * sumW*w1 + sumIx * sumIy*w1) * D; + invTensorMat.at(1, 3) = (A11*dI*sumIy - sumIy * w1*w1 - A12 * dI*sumIx - A11 * sumI*w2 + A12 * sumI*w1 + sumIx * w1*w2)* D; + invTensorMat.at(2, 0) = invTensorMat.at(0, 2); + invTensorMat.at(2, 1) = invTensorMat.at(1, 2); + invTensorMat.at(2, 2) = (sumW*A12*A12 - 2 * A12*sumIx*sumIy + A22 * sumIx*sumIx + A11 * sumIy*sumIy - A11 * A22*sumW)* D; + invTensorMat.at(2, 3) = (A11*A22*sumI - A12 * A12*sumI - A11 * sumIy*w2 + A12 * sumIx*w2 + A12 * sumIy*w1 - A22 * sumIx*w1)* D; + invTensorMat.at(3, 0) = invTensorMat.at(0, 3); + invTensorMat.at(3, 1) = invTensorMat.at(1, 3); + invTensorMat.at(3, 2) = invTensorMat.at(2, 3); + invTensorMat.at(3, 3) = (dI*A12*A12 - 2 * A12*w1*w2 + A22 * w1*w1 + A11 * w2*w2 - A11 * A22*dI)* D; + + resultMat = invTensorMat * mismatchMat; + + + Point2f delta(-resultMat.at(0), -resultMat.at(1)); + Point2f deltaGain(resultMat.at(2), resultMat.at(3)); + + + + + if (j == 0) + prevGain = deltaGain; + gainVec += deltaGain * 0.8; + nextPt += delta * 0.8; + nextPts[ptidx] = nextPt + halfWin; + gainVecs[ptidx] = gainVec; + + + + cv::Point2f totalDistance = backUpNextPt - nextPts[ptidx]; + + if (// j > 1 && + (std::abs(delta.x - prevDelta.x) < 0.01 && std::abs(delta.y - prevDelta.y) < 0.01) + || ((delta.ddot(delta) <= 0.001) && std::abs(prevGain.x - deltaGain.x) < 
0.01) + ) + { + nextPts[ptidx] -= delta * 0.5f; + gainVecs[ptidx] -= deltaGain * 0.5f; + break; + } + + prevDelta = delta; + prevGain = deltaGain; + } + + } + + } + + const Mat* prevImg; + const Mat* nextImg; + const Mat* prevDeriv; + const Mat* rgbPrevImg; + const Mat* rgbNextImg; + const Point2f* prevPts; + Point2f* nextPts; + uchar* status; + cv::Point2f* gainVecs; + float* err; + int maxWinSize; + int minWinSize; + TermCriteria criteria; + int level; + int maxLevel; + int windowType; + float minEigThreshold; + bool useInitialFlow; + std::vector paramReset; + int crossSegmentationThreshold; + }; +} +} +} +} +#endif \ No newline at end of file diff --git a/modules/optflow/src/rlof/rlof_invokerbase.hpp b/modules/optflow/src/rlof/rlof_invokerbase.hpp new file mode 100644 index 00000000000..cb6243bf2aa --- /dev/null +++ b/modules/optflow/src/rlof/rlof_invokerbase.hpp @@ -0,0 +1,309 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// + // + // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. + // + // By downloading, copying, installing or using the software you agree to this license. + // If you do not agree to this license, do not download, install, + // copy or use the software. + // + // + // License Agreement + // For Open Source Computer Vision Library + // + // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. + // Copyright (C) 2009, Willow Garage Inc., all rights reserved. + // Third party copyrights are property of their respective owners. + // + // Redistribution and use in source and binary forms, with or without modification, + // are permitted provided that the following conditions are met: + // + // * Redistribution's of source code must retain the above copyright notice, + // this list of conditions and the following disclaimer. 
+ // + // * Redistribution's in binary form must reproduce the above copyright notice, + // this list of conditions and the following disclaimer in the documentation + // and/or other materials provided with the distribution. + // + // * The name of the copyright holders may not be used to endorse or promote products + // derived from this software without specific prior written permission. + // + // This software is provided by the copyright holders and contributors "as is" and + // any express or implied warranties, including, but not limited to, the implied + // warranties of merchantability and fitness for a particular purpose are disclaimed. + // In no event shall the Intel Corporation or contributors be liable for any direct, + // indirect, incidental, special, exemplary, or consequential damages + // (including, but not limited to, procurement of substitute goods or services; + // loss of use, data, or profits; or business interruption) however caused + // and on any theory of liability, whether in contract, strict liability, + // or tort (including negligence or otherwise) arising in any way out of + // the use of this software, even if advised of the possibility of such damage. + // + //M*/ +#ifndef _RLOF_INVOKERBASE_HPP_ +#define _RLOF_INVOKERBASE_HPP_ + + +#define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n)) +#define FLT_RESCALE 1 +#ifndef _MSC_VER +#include +#endif + +#include "opencv2/core.hpp" +#include "opencv2/video.hpp" +#include "rlof_localflow.h" +#include +#include "opencv2/core/hal/intrin.hpp" +using namespace std; +using namespace cv; + + +namespace cv +{ + namespace optflow + { + typedef short deriv_type; + inline void get4BitMask(const int & width, __m128i & mask) + { + int noBits = width - static_cast(floor(width / 4.f) * 4.f); + unsigned int val[4]; + for (int n = 0; n < 4; n++) + { + val[n] = (noBits > n) ? 
(std::numeric_limits::max()) : 0; + } + mask = _mm_set_epi32(val[3], val[2], val[1], val[0]); + + } + inline void getWBitMask(const int & width, __m128i & t0, __m128i & t1, __m128i & t2) + { + int noBits = width - static_cast(floor(width / 8.f) * 8.f); + unsigned short val[8]; + for (int n = 0; n < 8; n++) + { + val[n] = (noBits > n) ? (0xffff) : 0; + } + t1 = _mm_set_epi16(val[7], val[7], val[6], val[6], val[5], val[5], val[4], val[4]); + t0 = _mm_set_epi16(val[3], val[3], val[2], val[2], val[1], val[1], val[0], val[0]); + t2 = _mm_set_epi16(val[7], val[6], val[5], val[4], val[3], val[2], val[1], val[0]); + } + + typedef uchar tMaskType; +#define tCVMaskType CV_8UC1 +#define MaskSet 0xffffffff + + inline bool notSameColor(const cv::Point3_ & ref, int r, int c, const cv::Mat & img, int winSize, int threshold) + { + if (r >= img.rows + winSize || c >= img.cols + winSize || r < -winSize || c < -winSize) return true; + int step = static_cast(img.step1()); + const cv::Point3_ * tval = img.ptr>(); + tval += c; + tval += r * step / 3; + int R = std::abs((int)ref.x - (int)tval->x); + int G = std::abs((int)ref.y - (int)tval->y); + int B = std::abs((int)ref.z - (int)tval->z); + int diff = MAX(R, MAX(G, B)); + return diff > threshold; + } + + /*! 
Estimate the mask with points of the same color as the middle point + *\param prevGray input CV_8UC1 + *\param prevImg input CV_8UC3 + *\param nextImg input CV_8UC3 + *\param prevPoint global position of the middle point of the mask window + *\param nextPoint global position of the middle point of the mask window + *\param winPointMask mask matrice with 1 labeling points contained and 0 point is not contained by the segment + *\param noPoints number of points contained by the segment + *\param winRoi rectangle of the region of interesset in global coordinates + *\param minWinSize, + *\param threshold, + *\param useBothImages + */ + static void getLocalPatch( + const cv::Mat & prevImg, + const cv::Point2i & prevPoint, // feature points + cv::Mat & winPointMask, + int & noPoints, + cv::Rect & winRoi, + int minWinSize, + int threshold) + { + int maxWinSizeH = (winPointMask.cols - 1) / 2; + winRoi.x = prevPoint.x; + winRoi.y = prevPoint.y; + winRoi.width = winPointMask.cols; + winRoi.height = winPointMask.rows; + + if (minWinSize == winPointMask.cols) + { + winRoi.x = prevPoint.x - maxWinSizeH; + winRoi.y = prevPoint.y - maxWinSizeH; + winPointMask.setTo(1); + noPoints = winPointMask.size().area(); + return; + } + noPoints = 0; + int c = prevPoint.x; + int r = prevPoint.y; + int c_left = c - 1; + int c_right = c + 1; + int r_top = r - 1; + int r_bottom = r; + int border_left = c - maxWinSizeH; + int border_right = c + maxWinSizeH; + int border_top = r - maxWinSizeH; + int border_bottom = r + maxWinSizeH; + int c_local_diff = prevPoint.x - maxWinSizeH; + int r_local_diff = prevPoint.y - maxWinSizeH; + int _c = c - c_local_diff; + int _r = r - r_local_diff; + int min_r = _r; + int max_r = _r; + int min_c = _c; + int max_c = _c; + // horizontal line + if (r < 0 || r >= prevImg.rows || c < 0 || c >= prevImg.cols) + { + noPoints = 0; + return; + } + cv::Point3_ val1 = prevImg.at>(r, c); // middle grayvalue + cv::Point3_ tval; + + //vertical line + for (int dr = r_top; dr >= 
border_top; dr--) + { + if (notSameColor(val1, dr, c, prevImg, maxWinSizeH, threshold)) + break; + + int _dr = dr - r_local_diff; + min_r = MIN(min_r, _dr); + max_r = MAX(max_r, _dr); + winPointMask.at(_dr, _c) = 1; + noPoints++; + } + for (int dr = r_bottom; dr < border_bottom; dr++) + { + if (notSameColor(val1, dr, c, prevImg, maxWinSizeH, threshold) + ) + break; + + int _dr = dr - r_local_diff; + min_r = MIN(min_r, _dr); + max_r = MAX(max_r, _dr); + winPointMask.at(_dr, _c) = 1; + noPoints++; + } + // accumulate along the vertical line and the search line that was still labled + for (int dr = min_r + r_local_diff; dr <= max_r + r_local_diff; dr++) + { + int _dr = dr - r_local_diff; + if (winPointMask.at(_dr, _c) == 0) + { + winPointMask.row(_dr).setTo(0); + continue; + } + bool skip = false; + int _dc = c_right - c_local_diff; + for (int dc = c_right; dc < border_right; dc++, _dc++) + { + if (skip == false) + { + if (notSameColor(val1, dr, dc, prevImg, maxWinSizeH, threshold)) + skip = true; + } + if (skip == false) + { + min_c = MIN(min_c, _dc); + max_c = MAX(max_c, _dc); + winPointMask.at(_dr, _dc) = 1; + noPoints++; + } + else + winPointMask.at(_dr, _dc) = 0; + } + + skip = false; + _dc = c_left - c_local_diff; + for (int dc = c_left; dc >= border_left; dc--, _dc--) + { + if (skip == false) + { + if (notSameColor(val1, dr, dc, prevImg, maxWinSizeH, threshold)) + skip = true; + } + if (skip == false) + { + min_c = MIN(min_c, _dc); + max_c = MAX(max_c, _dc); + winPointMask.at(_dr, _dc) = 1; + noPoints++; + } + else + winPointMask.at(_dr, _dc) = 0; + } + } + + + // get the initial small window + if (noPoints < minWinSize * minWinSize) + { + cv::Rect roi(winPointMask.cols / 2 - (minWinSize - 1) / 2, + winPointMask.rows / 2 - (minWinSize - 1) / 2, + minWinSize, minWinSize); + cv::Mat(winPointMask, roi).setTo(1); + min_c = MIN(MIN(min_c, roi.tl().x), roi.br().x - 1); + max_c = MAX(MAX(max_c, roi.tl().x), roi.br().x - 1); + min_r = MIN(MIN(min_r, roi.tl().y), 
roi.br().y - 1); + max_r = MAX(MAX(max_r, roi.tl().y), roi.br().y - 1); + noPoints += minWinSize * minWinSize; + } + winRoi.x = c_local_diff + min_c; + winRoi.y = r_local_diff + min_r; + winRoi.width = max_c - min_c + 1; + winRoi.height = max_r - min_r + 1; + winPointMask = winPointMask(cv::Rect(min_c, min_r, winRoi.width, winRoi.height)); + } + + inline bool calcWinMaskMat( const cv::Mat & BI, + const int windowType, + const cv::Point2i & iprevPt, + cv::Mat & winMaskMat, + cv::Size & winSize, + cv::Point2f & halfWin, + int & winArea, + const int minWinSize, + const int maxWinSize, + const int crossSegmentationThreshold) + { + + if (windowType == RLOFOpticalFlowParameter::SR_CROSS && maxWinSize != minWinSize) + { + // patch generation + cv::Rect winRoi; + getLocalPatch(BI, iprevPt, winMaskMat, winArea, winRoi, minWinSize, crossSegmentationThreshold); + if (winArea == 0) + return false; + winSize = winRoi.size(); + halfWin = Point2f(static_cast(iprevPt.x - winRoi.tl().x), + static_cast(iprevPt.y - winRoi.tl().y)); + } + else + { + winSize = cv::Size(maxWinSize, maxWinSize); + halfWin = Point2f((winSize.width - 1) / 2.f, (winSize.height - 1) / 2.f); + winMaskMat.setTo(1); + } + return true; + } + + inline short estimateScale(cv::Mat & residuals) + { + cv::Mat absMat = cv::abs(residuals); + return quickselect(absMat, absMat.rows / 2); + } + } + + +} + +#endif \ No newline at end of file diff --git a/modules/optflow/src/rlof/rlof_localflow.cpp b/modules/optflow/src/rlof/rlof_localflow.cpp new file mode 100644 index 00000000000..3f995fd5477 --- /dev/null +++ b/modules/optflow/src/rlof/rlof_localflow.cpp @@ -0,0 +1,586 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// + // + // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. + // + // By downloading, copying, installing or using the software you agree to this license. 
+ // If you do not agree to this license, do not download, install, + // copy or use the software. + // + // + // License Agreement + // For Open Source Computer Vision Library + // + // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. + // Copyright (C) 2009, Willow Garage Inc., all rights reserved. + // Third party copyrights are property of their respective owners. + // + // Redistribution and use in source and binary forms, with or without modification, + // are permitted provided that the following conditions are met: + // + // * Redistribution's of source code must retain the above copyright notice, + // this list of conditions and the following disclaimer. + // + // * Redistribution's in binary form must reproduce the above copyright notice, + // this list of conditions and the following disclaimer in the documentation + // and/or other materials provided with the distribution. + // + // * The name of the copyright holders may not be used to endorse or promote products + // derived from this software without specific prior written permission. + // + // This software is provided by the copyright holders and contributors "as is" and + // any express or implied warranties, including, but not limited to, the implied + // warranties of merchantability and fitness for a particular purpose are disclaimed. + // In no event shall the Intel Corporation or contributors be liable for any direct, + // indirect, incidental, special, exemplary, or consequential damages + // (including, but not limited to, procurement of substitute goods or services; + // loss of use, data, or profits; or business interruption) however caused + // and on any theory of liability, whether in contract, strict liability, + // or tort (including negligence or otherwise) arising in any way out of + // the use of this software, even if advised of the possibility of such damage. 
+ // + //M*/ +#include "precomp.hpp" +#include "rlof_localflow.h" +#ifdef HAVE_OPENCV_CALIB3D +#include "opencv2/calib3d.hpp" +#endif +#include "berlof_invoker.hpp" +#include "rlof_invoker.hpp" +#include "plk_invoker.hpp" +using namespace std; +using namespace cv; + +namespace cv +{ + namespace detail + { + + typedef short deriv_type; + } +namespace +{ + static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst) + { + using namespace cv; + using cv::detail::deriv_type; + int rows = src.rows, cols = src.cols, cn = src.channels(), colsn = cols * cn, depth = src.depth(); + CV_Assert(depth == CV_8U); + dst.create(rows, cols, CV_MAKETYPE(DataType::depth, cn * 2)); + + int x, y, delta = (int)alignSize((cols + 2)*cn, 16); + AutoBuffer _tempBuf(delta * 2 + 64); + deriv_type *trow0 = alignPtr(_tempBuf.data() + cn, 16), *trow1 = alignPtr(trow0 + delta, 16); + +#if CV_SIMD128 + v_int16x8 c3 = v_setall_s16(3), c10 = v_setall_s16(10); + bool haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); +#endif + + for (y = 0; y < rows; y++) + { + const uchar* srow0 = src.ptr(y > 0 ? y - 1 : rows > 1 ? 1 : 0); + const uchar* srow1 = src.ptr(y); + const uchar* srow2 = src.ptr(y < rows - 1 ? y + 1 : rows > 1 ? rows - 2 : 0); + deriv_type* drow = dst.ptr(y); + + // do vertical convolution + x = 0; +#if CV_SIMD128 + if (haveSIMD) + { + for (; x <= colsn - 8; x += 8) + { + v_int16x8 s0 = v_reinterpret_as_s16(v_load_expand(srow0 + x)); + v_int16x8 s1 = v_reinterpret_as_s16(v_load_expand(srow1 + x)); + v_int16x8 s2 = v_reinterpret_as_s16(v_load_expand(srow2 + x)); + + v_int16x8 t1 = s2 - s0; + v_int16x8 t0 = v_mul_wrap(s0 + s2, c3) + v_mul_wrap(s1, c10); + + v_store(trow0 + x, t0); + v_store(trow1 + x, t1); + } + } +#endif + + for (; x < colsn; x++) + { + int t0 = (srow0[x] + srow2[x]) * 3 + srow1[x] * 10; + int t1 = srow2[x] - srow0[x]; + trow0[x] = (deriv_type)t0; + trow1[x] = (deriv_type)t1; + } + + // make border + int x0 = (cols > 1 ? 
1 : 0)*cn, x1 = (cols > 1 ? cols - 2 : 0)*cn; + for (int k = 0; k < cn; k++) + { + trow0[-cn + k] = trow0[x0 + k]; trow0[colsn + k] = trow0[x1 + k]; + trow1[-cn + k] = trow1[x0 + k]; trow1[colsn + k] = trow1[x1 + k]; + } + + // do horizontal convolution, interleave the results and store them to dst + x = 0; +#if CV_SIMD128 + if (haveSIMD) + { + for (; x <= colsn - 8; x += 8) + { + v_int16x8 s0 = v_load(trow0 + x - cn); + v_int16x8 s1 = v_load(trow0 + x + cn); + v_int16x8 s2 = v_load(trow1 + x - cn); + v_int16x8 s3 = v_load(trow1 + x); + v_int16x8 s4 = v_load(trow1 + x + cn); + + v_int16x8 t0 = s1 - s0; + v_int16x8 t1 = v_mul_wrap(s2 + s4, c3) + v_mul_wrap(s3, c10); + + v_store_interleave((drow + x * 2), t0, t1); + } + } +#endif + for (; x < colsn; x++) + { + deriv_type t0 = (deriv_type)(trow0[x + cn] - trow0[x - cn]); + deriv_type t1 = (deriv_type)((trow1[x + cn] + trow1[x - cn]) * 3 + trow1[x] * 10); + drow[x * 2] = t0; drow[x * 2 + 1] = t1; + } + } + } + +}//namespace + namespace optflow + { + bool isrobust(const RLOFOpticalFlowParameter & param) + { + return (param.normSigma0 != std::numeric_limits::max()) + && (param.normSigma1 != std::numeric_limits::max()); + } + std::vector get_norm(float sigma0, float sigma1) + { + std::vector result = { sigma0, sigma1, sigma0 / (sigma0 - sigma1), sigma0 * sigma1 }; + return result; + } + + + int buildOpticalFlowPyramidScale(InputArray _img, OutputArrayOfArrays pyramid, Size winSize, int maxLevel, bool withDerivatives, + int pyrBorder, int derivBorder, bool tryReuseInputImage, float levelScale[2]) + { + Mat img = _img.getMat(); + CV_Assert(img.depth() == CV_8U && winSize.width > 2 && winSize.height > 2); + int pyrstep = withDerivatives ? 
2 : 1; + + pyramid.create(1, (maxLevel + 1) * pyrstep, 0 /*type*/, -1, true); + + int derivType = CV_MAKETYPE(DataType::depth, img.channels() * 2); + + //level 0 + bool lvl0IsSet = false; + if (tryReuseInputImage && img.isSubmatrix() && (pyrBorder & BORDER_ISOLATED) == 0) + { + Size wholeSize; + Point ofs; + img.locateROI(wholeSize, ofs); + if (ofs.x >= winSize.width && ofs.y >= winSize.height + && ofs.x + img.cols + winSize.width <= wholeSize.width + && ofs.y + img.rows + winSize.height <= wholeSize.height) + { + pyramid.getMatRef(0) = img; + lvl0IsSet = true; + } + } + + if (!lvl0IsSet) + { + Mat& temp = pyramid.getMatRef(0); + + if (!temp.empty()) + temp.adjustROI(winSize.height, winSize.height, winSize.width, winSize.width); + if (temp.type() != img.type() || temp.cols != winSize.width * 2 + img.cols || temp.rows != winSize.height * 2 + img.rows) + temp.create(img.rows + winSize.height * 2, img.cols + winSize.width * 2, img.type()); + + if (pyrBorder == BORDER_TRANSPARENT) + img.copyTo(temp(Rect(winSize.width, winSize.height, img.cols, img.rows))); + else + copyMakeBorder(img, temp, winSize.height, winSize.height, winSize.width, winSize.width, pyrBorder); + temp.adjustROI(-winSize.height, -winSize.height, -winSize.width, -winSize.width); + } + + Size sz = img.size(); + Mat prevLevel = pyramid.getMatRef(0); + Mat thisLevel = prevLevel; + + for (int level = 0; level <= maxLevel; ++level) + { + if (level != 0) + { + Mat& temp = pyramid.getMatRef(level * pyrstep); + + if (!temp.empty()) + temp.adjustROI(winSize.height, winSize.height, winSize.width, winSize.width); + if (temp.type() != img.type() || temp.cols != winSize.width * 2 + sz.width || temp.rows != winSize.height * 2 + sz.height) + temp.create(sz.height + winSize.height * 2, sz.width + winSize.width * 2, img.type()); + + thisLevel = temp(Rect(winSize.width, winSize.height, sz.width, sz.height)); + pyrDown(prevLevel, thisLevel, sz); + + if (pyrBorder != BORDER_TRANSPARENT) + copyMakeBorder(thisLevel, temp, 
winSize.height, winSize.height, winSize.width, winSize.width, pyrBorder | BORDER_ISOLATED); + temp.adjustROI(-winSize.height, -winSize.height, -winSize.width, -winSize.width); + } + + if (withDerivatives) + { + Mat& deriv = pyramid.getMatRef(level * pyrstep + 1); + + if (!deriv.empty()) + deriv.adjustROI(winSize.height, winSize.height, winSize.width, winSize.width); + if (deriv.type() != derivType || deriv.cols != winSize.width * 2 + sz.width || deriv.rows != winSize.height * 2 + sz.height) + deriv.create(sz.height + winSize.height * 2, sz.width + winSize.width * 2, derivType); + + Mat derivI = deriv(Rect(winSize.width, winSize.height, sz.width, sz.height)); + calcSharrDeriv(thisLevel, derivI); + + if (derivBorder != BORDER_TRANSPARENT) + copyMakeBorder(derivI, deriv, winSize.height, winSize.height, winSize.width, winSize.width, derivBorder | BORDER_ISOLATED); + deriv.adjustROI(-winSize.height, -winSize.height, -winSize.width, -winSize.width); + } + + sz = Size(static_cast((sz.width + 1) / levelScale[0]), + static_cast((sz.height + 1) / levelScale[1])); + if (sz.width <= winSize.width || sz.height <= winSize.height) + { + pyramid.create(1, (level + 1) * pyrstep, 0 /*type*/, -1, true);//check this + return level; + } + + prevLevel = thisLevel; + } + + return maxLevel; + } + int CImageBuffer::buildPyramid(cv::Size winSize, int maxLevel, float levelScale[2]) + { + if (m_Overwrite == false) + return m_maxLevel; + m_maxLevel = buildOpticalFlowPyramidScale(m_Image, m_ImagePyramid, winSize, maxLevel, false, 4, 0, true, levelScale); + return m_maxLevel; + } + + void calcLocalOpticalFlowCore( + Ptr prevPyramids[2], + Ptr currPyramids[2], + InputArray _prevPts, + InputOutputArray _nextPts, + const RLOFOpticalFlowParameter & param) + { + + bool useAdditionalRGB = param.supportRegionType == RLOFOpticalFlowParameter::SR_CROSS; + int iWinSize = param.largeWinSize; + int winSizes[2] = { iWinSize, iWinSize }; + if (param.supportRegionType != RLOFOpticalFlowParameter::SR_FIXED) + { 
+ winSizes[0] = param.smallWinSize; + } + //cv::Size winSize(iWinSize, iWinSize); + cv::TermCriteria criteria(cv::TermCriteria::COUNT | cv::TermCriteria::EPS, param.maxIteration, 0.01); + std::vector rlofNorm = get_norm(param.normSigma0, param.normSigma1); + CV_Assert(winSizes[0] <= winSizes[1]); + + Mat prevPtsMat = _prevPts.getMat(); + const int derivDepth = DataType::depth; + + CV_Assert(param.maxLevel >= 0 && iWinSize > 2); + + int level = 0, npoints; + CV_Assert((npoints = prevPtsMat.checkVector(2, CV_32F, true)) >= 0); + + if (!(param.useInitialFlow)) + _nextPts.create(prevPtsMat.size(), prevPtsMat.type(), -1, true); + + Mat nextPtsMat = _nextPts.getMat(); + CV_Assert(nextPtsMat.checkVector(2, CV_32F, true) == npoints); + + const Point2f* prevPts = (const Point2f*)prevPtsMat.data; + Point2f* nextPts = (Point2f*)nextPtsMat.data; + std::vector status(npoints); + std::vector err(npoints); + std::vector gainPts(npoints); + + float levelScale[2] = { 2.f,2.f }; + + int maxLevel = prevPyramids[0]->buildPyramid(cv::Size(iWinSize, iWinSize), param.maxLevel, levelScale); + + maxLevel = currPyramids[0]->buildPyramid(cv::Size(iWinSize, iWinSize), maxLevel, levelScale); + if (useAdditionalRGB) + { + prevPyramids[1]->buildPyramid(cv::Size(iWinSize, iWinSize), maxLevel, levelScale); + currPyramids[1]->buildPyramid(cv::Size(iWinSize, iWinSize), maxLevel, levelScale); + } + + if ((criteria.type & TermCriteria::COUNT) == 0) + criteria.maxCount = 30; + else + criteria.maxCount = std::min(std::max(criteria.maxCount, 0), 100); + if ((criteria.type & TermCriteria::EPS) == 0) + criteria.epsilon = 0.001; + else + criteria.epsilon = std::min(std::max(criteria.epsilon, 0.), 10.); + criteria.epsilon *= criteria.epsilon; + + // dI/dx ~ Ix, dI/dy ~ Iy + Mat derivIBuf; + derivIBuf.create(prevPyramids[0]->m_ImagePyramid[0].rows + iWinSize * 2, prevPyramids[0]->m_ImagePyramid[0].cols + iWinSize * 2, CV_MAKETYPE(derivDepth, prevPyramids[0]->m_ImagePyramid[0].channels() * 2)); + + for (level 
= maxLevel; level >= 0; level--) + { + Mat derivI; + Size imgSize = prevPyramids[0]->getImage(level).size(); + Mat _derivI(imgSize.height + iWinSize * 2, imgSize.width + iWinSize * 2, derivIBuf.type(), derivIBuf.data); + derivI = _derivI(Rect(iWinSize, iWinSize, imgSize.width, imgSize.height)); + calcSharrDeriv(prevPyramids[0]->getImage(level), derivI); + copyMakeBorder(derivI, _derivI, iWinSize, iWinSize, iWinSize, iWinSize, BORDER_CONSTANT | BORDER_ISOLATED); + + cv::Mat tRGBPrevPyr; + cv::Mat tRGBNextPyr; + if (useAdditionalRGB) + { + tRGBPrevPyr = prevPyramids[1]->getImage(level); + tRGBNextPyr = prevPyramids[1]->getImage(level); + + prevPyramids[1]->m_Overwrite = false; + currPyramids[1]->m_Overwrite = false; + } + + cv::Mat prevImage = prevPyramids[0]->getImage(level); + cv::Mat currImage = currPyramids[0]->getImage(level); + + // apply plk like tracker + if (isrobust(param) == false) + { + if (param.useIlluminationModel) + { + cv::parallel_for_(cv::Range(0, npoints), + plk::radial::TrackerInvoker( + prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, + prevPts, nextPts, &status[0], &err[0], &gainPts[0], + level, maxLevel, winSizes, + param.maxIteration, + param.useInitialFlow, + param.supportRegionType, + param.minEigenValue, + param.crossSegmentationThreshold)); + } + else + { + if (param.solverType == RLOFOpticalFlowParameter::SolverType::ST_STANDART) + { + cv::parallel_for_(cv::Range(0, npoints), + plk::ica::TrackerInvoker( + prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, + prevPts, nextPts, &status[0], &err[0], + level, maxLevel, winSizes, + param.maxIteration, + param.useInitialFlow, + param.supportRegionType, + param.crossSegmentationThreshold, param.minEigenValue)); + } + else + { + cv::parallel_for_(cv::Range(0, npoints), + beplk::ica::TrackerInvoker(prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, + prevPts, nextPts, &status[0], &err[0], + level, maxLevel, winSizes, + param.maxIteration, + param.useInitialFlow, + 
param.supportRegionType, + param.crossSegmentationThreshold, + param.minEigenValue)); + } + } + } + // for robust models + else + { + if (param.useIlluminationModel) + { + if (param.solverType == RLOFOpticalFlowParameter::SolverType::ST_STANDART) + { + cv::parallel_for_(cv::Range(0, npoints), + rlof::radial::TrackerInvoker( + prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, + prevPts, nextPts, &status[0], &err[0], &gainPts[0], + level, maxLevel, winSizes, + param.maxIteration, + param.useInitialFlow, + param.supportRegionType, + rlofNorm, + param.minEigenValue, + param.crossSegmentationThreshold)); + } + else + { + cv::parallel_for_(cv::Range(0, npoints), + berlof::radial::TrackerInvoker(prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, + prevPts, nextPts, &status[0], &err[0], &gainPts[0], + level, maxLevel, winSizes, + param.maxIteration, + param.useInitialFlow, + param.supportRegionType, + param.crossSegmentationThreshold, + rlofNorm, + param.minEigenValue)); + } + } + else + { + + if (param.solverType == RLOFOpticalFlowParameter::SolverType::ST_STANDART) + { + cv::parallel_for_(cv::Range(0, npoints), + rlof::ica::TrackerInvoker( + prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, + prevPts, nextPts, &status[0], &err[0], + level, maxLevel, winSizes, + param.maxIteration, + param.useInitialFlow, + param.supportRegionType, + rlofNorm, + param.minEigenValue, + param.crossSegmentationThreshold)); + } + else + { + cv::parallel_for_(cv::Range(0, npoints), + berlof::ica::TrackerInvoker(prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, + prevPts, nextPts, &status[0], &err[0], + level, maxLevel, winSizes, + param.maxIteration, + param.useInitialFlow, + param.supportRegionType, + param.crossSegmentationThreshold, + rlofNorm, + param.minEigenValue)); + } + + } + } + + prevPyramids[0]->m_Overwrite = true; + currPyramids[0]->m_Overwrite = true; + } + } + + void preprocess(Ptr prevPyramids[2], + Ptr currPyramids[2], + const std::vector & prevPoints, + 
std::vector & currPoints, + const RLOFOpticalFlowParameter & param) + { + cv::Mat mask, homography; + if (param.useGlobalMotionPrior == false) + return; + + currPoints.resize(prevPoints.size()); + + RLOFOpticalFlowParameter gmeTrackerParam = param; + gmeTrackerParam.useGlobalMotionPrior = false; + gmeTrackerParam.largeWinSize = 17; + // use none robust tracker for global motion estimation since it is faster + gmeTrackerParam.normSigma0 = std::numeric_limits::max(); + gmeTrackerParam.maxIteration = MAX(15, param.maxIteration); + gmeTrackerParam.minEigenValue = 0.000001f; + + std::vector gmPrevPoints, gmCurrPoints, gmCheckPoints; + + // Initialize point grid + int stepr = prevPyramids[0]->m_Image.rows / 30; + int stepc = prevPyramids[0]->m_Image.rows / 40; + for (int r = stepr / 2; r < prevPyramids[0]->m_Image.rows; r += stepr) + { + for (int c = stepc / 2; c < prevPyramids[0]->m_Image.cols; c += stepc) + { + gmPrevPoints.push_back(cv::Point2f(static_cast(c), static_cast(r))); + } + } + + // perform motion estimation + calcLocalOpticalFlowCore(prevPyramids, currPyramids, gmPrevPoints, gmCurrPoints, gmeTrackerParam); + calcLocalOpticalFlowCore(currPyramids, prevPyramids, gmCurrPoints, gmCheckPoints, gmeTrackerParam); + + cv::Mat prevPointsMat(static_cast(gmPrevPoints.size()), 1, CV_32FC2); + cv::Mat currPointsMat(static_cast(gmPrevPoints.size()), 1, CV_32FC2); + cv::Mat distMat(static_cast(gmPrevPoints.size()), 1, CV_32FC1); + + // Forward backward confidence to estimate optimal ransac reprojection error + int noPoints = 0; + for (unsigned int n = 0; n < gmPrevPoints.size(); n++) + { + cv::Point2f dist = gmCheckPoints[n] - gmPrevPoints[n]; + + cv::Point2f flow = gmCurrPoints[n] - gmPrevPoints[n]; + prevPointsMat.at(noPoints) = gmPrevPoints[n]; + currPointsMat.at(noPoints) = gmCurrPoints[n]; + distMat.at(noPoints) = flow.x * flow.x + flow.y* flow.y; + if (isnan(distMat.at(noPoints)) == false) + noPoints++; + } + + float medianDist = (param.globalMotionRansacThreshold 
== 0) ? 1.f : + quickselect(distMat, static_cast(noPoints * static_cast(param.globalMotionRansacThreshold) / 100.f)); + medianDist = sqrt(medianDist); + + if (noPoints < 8) + return; + + cv::findHomography(prevPointsMat(cv::Rect(0, 0, 1, noPoints)), currPointsMat(cv::Rect(0, 0, 1, noPoints)), cv::RANSAC, medianDist, mask).convertTo(homography, CV_32FC1); + + if (homography.empty()) + return; + + cv::perspectiveTransform(prevPoints, currPoints, homography); + } + + void calcLocalOpticalFlow( + const Mat prevImage, + const Mat currImage, + Ptr prevPyramids[2], + Ptr currPyramids[2], + const std::vector & prevPoints, + std::vector & currPoints, + const RLOFOpticalFlowParameter & param) + { + if (prevImage.empty() == false && currImage.empty()== false) + { + prevPyramids[0]->m_Overwrite = true; + currPyramids[0]->m_Overwrite = true; + prevPyramids[1]->m_Overwrite = true; + currPyramids[1]->m_Overwrite = true; + if (prevImage.type() == CV_8UC3) + { + prevPyramids[0]->setGrayFromRGB(prevImage); + currPyramids[0]->setGrayFromRGB(currImage); + prevPyramids[1]->setImage(prevImage); + currPyramids[1]->setImage(currImage); + + if (param.supportRegionType == RLOFOpticalFlowParameter::SR_CROSS) + { + prevPyramids[1]->setBlurFromRGB(prevImage); + currPyramids[1]->setBlurFromRGB(currImage); + } + } + else + { + prevPyramids[0]->setImage(prevImage); + currPyramids[0]->setImage(currImage); + } + } + preprocess(prevPyramids, currPyramids, prevPoints, currPoints, param); + RLOFOpticalFlowParameter internParam = param; + if (param.useGlobalMotionPrior == true) + internParam.useInitialFlow = true; + calcLocalOpticalFlowCore(prevPyramids, currPyramids, prevPoints, currPoints, internParam); + } + } +} + \ No newline at end of file diff --git a/modules/optflow/src/rlof/rlof_localflow.h b/modules/optflow/src/rlof/rlof_localflow.h new file mode 100644 index 00000000000..6f6a0fc5e2a --- /dev/null +++ b/modules/optflow/src/rlof/rlof_localflow.h @@ -0,0 +1,158 @@ 
+/*M/////////////////////////////////////////////////////////////////////////////////////// + // + // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. + // + // By downloading, copying, installing or using the software you agree to this license. + // If you do not agree to this license, do not download, install, + // copy or use the software. + // + // + // License Agreement + // For Open Source Computer Vision Library + // + // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. + // Copyright (C) 2009, Willow Garage Inc., all rights reserved. + // Third party copyrights are property of their respective owners. + // + // Redistribution and use in source and binary forms, with or without modification, + // are permitted provided that the following conditions are met: + // + // * Redistribution's of source code must retain the above copyright notice, + // this list of conditions and the following disclaimer. + // + // * Redistribution's in binary form must reproduce the above copyright notice, + // this list of conditions and the following disclaimer in the documentation + // and/or other materials provided with the distribution. + // + // * The name of the copyright holders may not be used to endorse or promote products + // derived from this software without specific prior written permission. + // + // This software is provided by the copyright holders and contributors "as is" and + // any express or implied warranties, including, but not limited to, the implied + // warranties of merchantability and fitness for a particular purpose are disclaimed. 
+ // In no event shall the Intel Corporation or contributors be liable for any direct, + // indirect, incidental, special, exemplary, or consequential damages + // (including, but not limited to, procurement of substitute goods or services; + // loss of use, data, or profits; or business interruption) however caused + // and on any theory of liability, whether in contract, strict liability, + // or tort (including negligence or otherwise) arising in any way out of + // the use of this software, even if advised of the possibility of such damage. + // + //M*/ +#ifndef _RLOF_LOCALFLOW_H_ +#define _RLOF_LOCALFLOW_H_ +#include +#include +#include +#include +#include "opencv2/core.hpp" +#include "opencv2/imgproc.hpp" +#include "opencv2/optflow/rlofflow.hpp" +//! Fast median estimation method based on @cite Tibshirani2008. This implementation relates to http://www.stat.cmu.edu/~ryantibs/median/ +using namespace cv; +template +T quickselect(const Mat & inp, int k) +{ + unsigned long i; + unsigned long ir; + unsigned long j; + unsigned long l; + unsigned long mid; + Mat values = inp.clone(); + T a; + + l = 0; + ir = MAX(values.rows, values.cols) - 1; + while (true) + { + if (ir <= l + 1) + { + if (ir == l + 1 && values.at(ir) < values.at(l)) + std::swap(values.at(l), values.at(ir)); + return values.at(k); + } + else + { + mid = (l + ir) >> 1; + std::swap(values.at(mid), values.at(l+1)); + if (values.at(l) > values.at(ir)) + std::swap(values.at(l), values.at(ir)); + if (values.at(l+1) > values.at(ir)) + std::swap(values.at(l+1), values.at(ir)); + if (values.at(l) > values.at(l+1)) + std::swap(values.at(l), values.at(l+1)); + i = l + 1; + j = ir; + a = values.at(l+1); + while (true) + { + do + { + i++; + } + while (values.at(i) < a); + do + { + j--; + } + while (values.at(j) > a); + if (j < i) break; + std::swap(values.at(i), values.at(j)); + } + values.at(l+1) = values.at(j); + values.at(j) = a; + if (j >= static_cast(k)) ir = j - 1; + if (j <= static_cast(k)) l = i; + } + } 
+} + +namespace cv +{ + namespace optflow + { + class CImageBuffer + { + public: + CImageBuffer() + : m_Overwrite(true) + {}; + void setGrayFromRGB(const cv::Mat & inp) + { + if(m_Overwrite) + cv::cvtColor(inp, m_Image, cv::COLOR_BGR2GRAY); + } + void setImage(const cv::Mat & inp) + { + if(m_Overwrite) + inp.copyTo(m_Image); + } + void setBlurFromRGB(const cv::Mat & inp) + { + //cv::medianBlur(constNextImage, blurNextImg, 7); + if(m_Overwrite) + cv::GaussianBlur(inp, m_BlurredImage, cv::Size(7,7), -1); + } + + int buildPyramid(cv::Size winSize, int maxLevel, float levelScale[2]); + cv::Mat & getImage(int level) {return m_ImagePyramid[level];} + + std::vector m_ImagePyramid; + cv::Mat m_BlurredImage; + cv::Mat m_Image; + std::vector m_CrossPyramid; + int m_maxLevel; + bool m_Overwrite; + }; + + void calcLocalOpticalFlow( + const Mat prevImage, + const Mat currImage, + Ptr prevPyramids[2], + Ptr currPyramids[2], + const std::vector & prevPoints, + std::vector & currPoints, + const RLOFOpticalFlowParameter & param); +} +} +#endif \ No newline at end of file diff --git a/modules/optflow/src/rlofflow.cpp b/modules/optflow/src/rlofflow.cpp new file mode 100644 index 00000000000..1bdc45a0221 --- /dev/null +++ b/modules/optflow/src/rlofflow.cpp @@ -0,0 +1,385 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// + // + // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. + // + // By downloading, copying, installing or using the software you agree to this license. + // If you do not agree to this license, do not download, install, + // copy or use the software. + // + // + // License Agreement + // For Open Source Computer Vision Library + // + // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. + // Copyright (C) 2009, Willow Garage Inc., all rights reserved. + // Third party copyrights are property of their respective owners. 
+ // + // Redistribution and use in source and binary forms, with or without modification, + // are permitted provided that the following conditions are met: + // + // * Redistribution's of source code must retain the above copyright notice, + // this list of conditions and the following disclaimer. + // + // * Redistribution's in binary form must reproduce the above copyright notice, + // this list of conditions and the following disclaimer in the documentation + // and/or other materials provided with the distribution. + // + // * The name of the copyright holders may not be used to endorse or promote products + // derived from this software without specific prior written permission. + // + // This software is provided by the copyright holders and contributors "as is" and + // any express or implied warranties, including, but not limited to, the implied + // warranties of merchantability and fitness for a particular purpose are disclaimed. + // In no event shall the Intel Corporation or contributors be liable for any direct, + // indirect, incidental, special, exemplary, or consequential damages + // (including, but not limited to, procurement of substitute goods or services; + // loss of use, data, or profits; or business interruption) however caused + // and on any theory of liability, whether in contract, strict liability, + // or tort (including negligence or otherwise) arising in any way out of + // the use of this software, even if advised of the possibility of such damage. 
+ // + //M*/ +#include "precomp.hpp" +#include "rlof/rlof_localflow.h" +#include "rlof/geo_interpolation.hpp" +#include "opencv2/ximgproc.hpp" +namespace cv +{ + namespace optflow + { + + class DenseOpticalFlowRLOFImpl : public DenseRLOFOpticalFlow + { + public: + DenseOpticalFlowRLOFImpl() + : forwardBackwardThreshold(1.f) + , gridStep(6, 6) + , interp_type(InterpolationType::INTERP_GEO) + , k(128) + , sigma(0.05f) + , lambda(999.f) + , fgs_lambda(500.0f) + , fgs_sigma(1.5f) + , use_post_proc(true) + + { + prevPyramid[0] = cv::Ptr< CImageBuffer>(new CImageBuffer); + prevPyramid[1] = cv::Ptr< CImageBuffer>(new CImageBuffer); + currPyramid[0] = cv::Ptr< CImageBuffer>(new CImageBuffer); + currPyramid[1] = cv::Ptr< CImageBuffer>(new CImageBuffer); + } + virtual void setRLOFOpticalFlowParameter(RLOFOpticalFlowParameter val) CV_OVERRIDE { param = val; } + virtual RLOFOpticalFlowParameter getRLOFOpticalFlowParameter() const CV_OVERRIDE { return param; } + + virtual float getForwardBackward() const CV_OVERRIDE { return forwardBackwardThreshold; } + virtual void setForwardBackward(float val) CV_OVERRIDE { forwardBackwardThreshold = val; } + + virtual void setInterpolation(InterpolationType val) CV_OVERRIDE { interp_type = val; } + virtual InterpolationType getInterpolation() const CV_OVERRIDE { return interp_type; } + + virtual Size getGridStep() const CV_OVERRIDE { return gridStep; } + virtual void setGridStep(Size val) CV_OVERRIDE { gridStep = val; } + + virtual int getEPICK() const CV_OVERRIDE { return k; } + virtual void setEPICK(int val) CV_OVERRIDE { k = val; } + + virtual float getEPICSigma() const CV_OVERRIDE { return sigma; } + virtual void setEPICSigma(float val) CV_OVERRIDE { sigma = val; } + + virtual float getEPICLambda() const CV_OVERRIDE { return lambda; } + virtual void setEPICLambda(float val) CV_OVERRIDE { lambda = val; } + + virtual float getFgsLambda() const CV_OVERRIDE { return fgs_lambda; } + virtual void setFgsLambda(float val) CV_OVERRIDE { 
fgs_lambda = val; } + + virtual float getFgsSigma() const CV_OVERRIDE { return fgs_sigma; } + virtual void setFgsSigma(float val) CV_OVERRIDE { fgs_sigma = val; } + + virtual bool getUsePostProc() const CV_OVERRIDE { return use_post_proc; } + virtual void setUsePostProc(bool val) CV_OVERRIDE { use_post_proc = val; } + + virtual void calc(InputArray I0, InputArray I1, InputOutputArray flow) CV_OVERRIDE + { + CV_Assert(!I0.empty() && I0.depth() == CV_8U && (I0.channels() == 3 || I0.channels() == 1)); + CV_Assert(!I1.empty() && I1.depth() == CV_8U && (I1.channels() == 3 || I1.channels() == 1)); + CV_Assert(I0.sameSize(I1)); + if (param.supportRegionType == RLOFOpticalFlowParameter::SR_CROSS) + CV_Assert( I0.channels() == 3 && I1.channels() == 3); + CV_Assert(interp_type == InterpolationType::INTERP_EPIC || interp_type == InterpolationType::INTERP_GEO); + + Mat prevImage = I0.getMat(); + Mat currImage = I1.getMat(); + int noPoints = prevImage.cols * prevImage.rows; + std::vector prevPoints(noPoints); + std::vector currPoints, refPoints; + noPoints = 0; + cv::Size grid_h = gridStep / 2; + for (int r = grid_h.height; r < prevImage.rows; r += gridStep.height) + { + for (int c = grid_h.width; c < prevImage.cols; c += gridStep.width) + { + prevPoints[noPoints++] = cv::Point2f(static_cast(c), static_cast(r)); + } + } + prevPoints.erase(prevPoints.begin() + noPoints, prevPoints.end()); + currPoints.resize(prevPoints.size()); + + calcLocalOpticalFlow(prevImage, currImage, prevPyramid, currPyramid, prevPoints, currPoints, param); + + flow.create(prevImage.size(), CV_32FC2); + Mat dense_flow = flow.getMat(); + + std::vector filtered_prevPoints; + std::vector filtered_currPoints; + if (gridStep == cv::Size(1, 1) && forwardBackwardThreshold <= 0) + { + for (unsigned int n = 0; n < prevPoints.size(); n++) + { + dense_flow.at(prevPoints[n]) = currPoints[n] - prevPoints[n]; + } + return; + } + if (forwardBackwardThreshold > 0) + { + // reuse image pyramids + 
calcLocalOpticalFlow(currImage, prevImage, currPyramid, prevPyramid, currPoints, refPoints, param); + + filtered_prevPoints.resize(prevPoints.size()); + filtered_currPoints.resize(prevPoints.size()); + float sqrForwardBackwardThreshold = forwardBackwardThreshold * forwardBackwardThreshold; + noPoints = 0; + for (unsigned int r = 0; r < refPoints.size(); r++) + { + Point2f diff = refPoints[r] - prevPoints[r]; + if (diff.x * diff.x + diff.y * diff.y < sqrForwardBackwardThreshold) + { + filtered_prevPoints[noPoints] = prevPoints[r]; + filtered_currPoints[noPoints++] = currPoints[r]; + } + } + + filtered_prevPoints.erase(filtered_prevPoints.begin() + noPoints, filtered_prevPoints.end()); + filtered_currPoints.erase(filtered_currPoints.begin() + noPoints, filtered_currPoints.end()); + + } + else + { + filtered_prevPoints = prevPoints; + filtered_currPoints = currPoints; + } + + if (interp_type == InterpolationType::INTERP_EPIC) + { + Ptr gd = ximgproc::createEdgeAwareInterpolator(); + gd->setK(k); + gd->setSigma(sigma); + gd->setLambda(lambda); + gd->setUsePostProcessing(false); + gd->interpolate(prevImage, filtered_prevPoints, currImage, filtered_currPoints, dense_flow); + } + else + { + Mat blurredPrevImage, blurredCurrImage; + GaussianBlur(prevImage, blurredPrevImage, cv::Size(5, 5), -1); + std::vector status(filtered_currPoints.size(), 1); + interpolate_irregular_nn_raster(filtered_prevPoints, filtered_currPoints, status, blurredPrevImage).copyTo(dense_flow); + std::vector vecMats; + std::vector vecMats2(2); + cv::split(dense_flow, vecMats); + cv::bilateralFilter(vecMats[0], vecMats2[0], 5, 2, 20); + cv::bilateralFilter(vecMats[1], vecMats2[1], 5, 2, 20); + cv::merge(vecMats2, dense_flow); + } + if (use_post_proc) + { + ximgproc::fastGlobalSmootherFilter(prevImage, flow, flow, fgs_lambda, fgs_sigma); + } + } + + virtual void collectGarbage() CV_OVERRIDE + { + prevPyramid[0].release(); + prevPyramid[1].release(); + currPyramid[0].release(); + 
currPyramid[1].release(); + } + + protected: + RLOFOpticalFlowParameter param; + float forwardBackwardThreshold; + Ptr prevPyramid[2]; + Ptr currPyramid[2]; + cv::Size gridStep; + InterpolationType interp_type; + int k; + float sigma; + float lambda; + float fgs_lambda; + float fgs_sigma; + bool use_post_proc; + }; + + Ptr DenseRLOFOpticalFlow::create( + const RLOFOpticalFlowParameter & rlofParam, + float forwardBackwardThreshold, + cv::Size gridStep, + InterpolationType interp_type, + int epicK, + float epicSigma, + float epicLambda, + float fgs_lambda, + float fgs_sigma) + { + Ptr algo = makePtr(); + algo->setRLOFOpticalFlowParameter(rlofParam); + algo->setForwardBackward(forwardBackwardThreshold); + algo->setGridStep(gridStep); + algo->setInterpolation(interp_type); + algo->setEPICK(epicK); + algo->setEPICSigma(epicSigma); + algo->setEPICLambda(epicLambda); + algo->setFgsLambda(fgs_lambda); + algo->setFgsSigma(fgs_sigma); + return algo; + } + + class SparseRLOFOpticalFlowImpl : public SparseRLOFOpticalFlow + { + public: + SparseRLOFOpticalFlowImpl() + : forwardBackwardThreshold(1.f) + { + prevPyramid[0] = cv::Ptr< CImageBuffer>(new CImageBuffer); + prevPyramid[1] = cv::Ptr< CImageBuffer>(new CImageBuffer); + currPyramid[0] = cv::Ptr< CImageBuffer>(new CImageBuffer); + currPyramid[1] = cv::Ptr< CImageBuffer>(new CImageBuffer); + } + virtual void setRLOFOpticalFlowParameter(RLOFOpticalFlowParameter val) CV_OVERRIDE { param = val; } + virtual RLOFOpticalFlowParameter getRLOFOpticalFlowParameter() const CV_OVERRIDE { return param; } + + virtual float getForwardBackward() const CV_OVERRIDE { return forwardBackwardThreshold; } + virtual void setForwardBackward(float val) CV_OVERRIDE { forwardBackwardThreshold = val; } + + virtual void calc(InputArray prevImg, InputArray nextImg, + InputArray prevPts, InputOutputArray nextPts, + OutputArray status, + OutputArray err) CV_OVERRIDE + { + CV_Assert(!prevImg.empty() && prevImg.depth() == CV_8U && (prevImg.channels() == 3 || 
prevImg.channels() == 1)); + CV_Assert(!nextImg.empty() && nextImg.depth() == CV_8U && (nextImg.channels() == 3 || nextImg.channels() == 1)); + CV_Assert(prevImg.sameSize(nextImg)); + + //CV_Assert(param.supportRegionType == RLOFOpticalFlowParameter::SR_CROSS && I0.channels() == 3 && I1.channels() == 3); + + Mat prevImage = prevImg.getMat(); + Mat nextImage = nextImg.getMat(); + Mat prevPtsMat = prevPts.getMat(); + + if (param.useInitialFlow == false) + nextPts.create(prevPtsMat.size(), prevPtsMat.type(), -1, true); + + int npoints = 0; + CV_Assert((npoints = prevPtsMat.checkVector(2, CV_32F, true)) >= 0); + + if (npoints == 0) + { + nextPts.release(); + status.release(); + err.release(); + return; + } + + Mat nextPtsMat = nextPts.getMat(); + CV_Assert(nextPtsMat.checkVector(2, CV_32F, true) == npoints); + std::vector prevPoints(npoints), nextPoints(npoints), refPoints; + prevPtsMat.copyTo(cv::Mat(1, npoints, CV_32FC2, &prevPoints[0])); + if (param.useInitialFlow ) + nextPtsMat.copyTo(cv::Mat(1, nextPtsMat.cols, CV_32FC2, &nextPoints[0])); + + cv::Mat statusMat; + cv::Mat errorMat; + if (status.needed() || forwardBackwardThreshold > 0) + { + status.create((int)npoints, 1, CV_8U, -1, true); + statusMat = status.getMat(); + statusMat.setTo(1); + } + + if (err.needed() || forwardBackwardThreshold > 0) + { + err.create((int)npoints, 1, CV_32F, -1, true); + errorMat = err.getMat(); + errorMat.setTo(0); + } + + calcLocalOpticalFlow(prevImage, nextImage, prevPyramid, currPyramid, prevPoints, nextPoints, param); + cv::Mat(1,npoints , CV_32FC2, &nextPoints[0]).copyTo(nextPtsMat); + if (forwardBackwardThreshold > 0) + { + // reuse image pyramids + calcLocalOpticalFlow(nextImage, prevImage, currPyramid, prevPyramid, nextPoints, refPoints, param); + } + for (unsigned int r = 0; r < refPoints.size(); r++) + { + Point2f diff = refPoints[r] - prevPoints[r]; + errorMat.at(r) = sqrt(diff.x * diff.x + diff.y * diff.y); + if (errorMat.at(r) <= forwardBackwardThreshold) + 
statusMat.at(r) = 0; + } + + } + + protected: + RLOFOpticalFlowParameter param; + float forwardBackwardThreshold; + Ptr prevPyramid[2]; + Ptr currPyramid[2]; + }; + + Ptr SparseRLOFOpticalFlow::create( + const RLOFOpticalFlowParameter & rlofParam, + float forwardBackwardThreshold) + { + Ptr algo = makePtr(); + algo->setRLOFOpticalFlowParameter(rlofParam); + algo->setForwardBackward(forwardBackwardThreshold); + return algo; + } + + void calcOpticalFlowDenseRLOF(InputArray I0, InputArray I1, InputOutputArray flow, + RLOFOpticalFlowParameter rlofParam , + float forewardBackwardThreshold, Size gridStep, + DenseRLOFOpticalFlow::InterpolationType interp_type, + int epicK, float epicSigma, + bool use_post_proc, float fgsLambda, float fgsSigma) + { + Ptr algo = DenseRLOFOpticalFlow::create( + rlofParam, forewardBackwardThreshold, gridStep, interp_type, + epicK, epicSigma, use_post_proc, fgsLambda, fgsSigma); + algo->calc(I0, I1, flow); + algo->collectGarbage(); + } + + void calcOpticalFlowSparseRLOF(InputArray prevImg, InputArray nextImg, + InputArray prevPts, InputOutputArray nextPts, + OutputArray status, OutputArray err, + RLOFOpticalFlowParameter rlofParam, + float forewardBackwardThreshold) + { + Ptr algo = SparseRLOFOpticalFlow::create( + rlofParam, forewardBackwardThreshold); + algo->calc(prevImg, nextImg, prevPts, nextPts, status, err); + } + Ptr createOptFlow_DenseRLOF() + { + return DenseRLOFOpticalFlow::create(); + } + + Ptr createOptFlow_SparseRLOF() + { + return SparseRLOFOpticalFlow::create(); + } + } + + +} \ No newline at end of file diff --git a/modules/optflow/test/test_OF_accuracy.cpp b/modules/optflow/test/test_OF_accuracy.cpp index 6a5ed702e48..d2aedf2573a 100644 --- a/modules/optflow/test/test_OF_accuracy.cpp +++ b/modules/optflow/test/test_OF_accuracy.cpp @@ -83,7 +83,20 @@ static float calcRMSE(Mat flow1, Mat flow2) } return (float)sqrt(sum / (1e-9 + counter)); } - +static float calcRMSE(vector prevPts, vector currPts, Mat flow) +{ + vector ee; + 
for (unsigned int n = 0; n < prevPts.size(); n++) + { + Point2f gtFlow = flow.at(prevPts[n]); + if (isFlowCorrect(gtFlow.x) && isFlowCorrect(gtFlow.y)) + { + Point2f diffFlow = (currPts[n] - prevPts[n]) - gtFlow; + ee.push_back(sqrt(diffFlow.x * diffFlow.x + diffFlow.y * diffFlow.y)); + } + } + return mean(ee).val[0]; +} static float calcAvgEPE(vector< pair > corr, Mat flow) { double sum = 0; @@ -111,9 +124,13 @@ static float calcAvgEPE(vector< pair > corr, Mat flow) bool readRubberWhale(Mat &dst_frame_1, Mat &dst_frame_2, Mat &dst_GT) { - const string frame1_path = getRubberWhaleFrame1(); - const string frame2_path = getRubberWhaleFrame2(); - const string gt_flow_path = getRubberWhaleGroundTruth(); + string frame1_path = getRubberWhaleFrame1(); + string frame2_path = getRubberWhaleFrame2(); + string gt_flow_path = getRubberWhaleGroundTruth(); + // removing space may be an issue on windows machines + frame1_path.erase(std::remove_if(frame1_path.begin(), frame1_path.end(), isspace), frame1_path.end()); + frame2_path.erase(std::remove_if(frame2_path.begin(), frame2_path.end(), isspace), frame2_path.end()); + gt_flow_path.erase(std::remove_if(gt_flow_path.begin(), gt_flow_path.end(), isspace), gt_flow_path.end()); dst_frame_1 = imread(frame1_path); dst_frame_2 = imread(frame2_path); @@ -125,6 +142,7 @@ bool readRubberWhale(Mat &dst_frame_1, Mat &dst_frame_2, Mat &dst_GT) return true; } + TEST(DenseOpticalFlow_SimpleFlow, ReferenceAccuracy) { Mat frame1, frame2, GT; @@ -157,6 +175,108 @@ TEST(DenseOpticalFlow_DeepFlow, ReferenceAccuracy) EXPECT_LE(calcRMSE(GT, flow), target_RMSE); } +TEST(SparseOpticalFlow, ReferenceAccuracy) +{ + // with the following test each invoker class should be tested once + Mat frame1, frame2, GT; + ASSERT_TRUE(readRubberWhale(frame1, frame2, GT)); + vector prevPts, currPts; + + for (int r = 0; r < frame1.rows; r+=10) + { + for (int c = 0; c < frame1.cols; c+=10) + { + prevPts.push_back(Point2f(static_cast(c), static_cast(r))); + } + } + 
vector status(prevPts.size()); + vector err(prevPts.size()); + Ptr algo = SparseRLOFOpticalFlow::create(); + algo->setForwardBackward(0.0f); + + RLOFOpticalFlowParameter param; + param.supportRegionType = RLOFOpticalFlowParameter::SR_CROSS; + param.useIlluminationModel = true; + + param.solverType = RLOFOpticalFlowParameter::ST_BILINEAR; + algo->setRLOFOpticalFlowParameter(param); + algo->calc(frame1, frame2, prevPts, currPts, status, err); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.22f); + + param.solverType = RLOFOpticalFlowParameter::ST_STANDART; + algo->setRLOFOpticalFlowParameter(param); + algo->calc(frame1, frame2, prevPts, currPts, status, err); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.34f); + + param.useIlluminationModel = false; + + param.solverType = RLOFOpticalFlowParameter::ST_BILINEAR; + algo->setRLOFOpticalFlowParameter(param); + algo->calc(frame1, frame2, prevPts, currPts, status, err); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.24f); + + param.solverType = RLOFOpticalFlowParameter::ST_STANDART; + algo->setRLOFOpticalFlowParameter(param); + algo->calc(frame1, frame2, prevPts, currPts, status, err); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.23f); + + param.normSigma0 = numeric_limits::max(); + param.normSigma1 = numeric_limits::max(); + param.useIlluminationModel = true; + + param.solverType = RLOFOpticalFlowParameter::ST_BILINEAR; + algo->setRLOFOpticalFlowParameter(param); + algo->calc(frame1, frame2, prevPts, currPts, status, err); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.26f); + + param.solverType = RLOFOpticalFlowParameter::ST_STANDART; + algo->setRLOFOpticalFlowParameter(param); + algo->calc(frame1, frame2, prevPts, currPts, status, err); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.26f); + + param.useIlluminationModel = false; + + param.solverType = RLOFOpticalFlowParameter::ST_BILINEAR; + algo->setRLOFOpticalFlowParameter(param); + algo->calc(frame1, frame2, prevPts, currPts, status, err); + EXPECT_LE(calcRMSE(prevPts, 
currPts, GT), 0.25f); + + param.solverType = RLOFOpticalFlowParameter::ST_STANDART; + algo->setRLOFOpticalFlowParameter(param); + algo->calc(frame1, frame2, prevPts, currPts, status, err); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.26f); +} + +TEST(DenseOpticalFlow_RLOF, ReferenceAccuracy) +{ + Mat frame1, frame2, GT; + ASSERT_TRUE(readRubberWhale(frame1, frame2, GT)); + + Mat flow; + Ptr algo = DenseRLOFOpticalFlow::create(); + RLOFOpticalFlowParameter param; + param.supportRegionType = RLOFOpticalFlowParameter::SR_CROSS; + param.solverType = RLOFOpticalFlowParameter::ST_BILINEAR; + algo->setRLOFOpticalFlowParameter(param); + algo->setForwardBackward(1.0f); + algo->setGridStep(cv::Size(4, 4)); + + algo->setInterpolation(DenseRLOFOpticalFlow::INTERP_EPIC); + algo->calc(frame1, frame2, flow); + + ASSERT_EQ(GT.rows, flow.rows); + ASSERT_EQ(GT.cols, flow.cols); + EXPECT_LE(calcRMSE(GT, flow), 0.36f); + + algo->setInterpolation(DenseRLOFOpticalFlow::INTERP_GEO); + algo->calc(frame1, frame2, flow); + + ASSERT_EQ(GT.rows, flow.rows); + ASSERT_EQ(GT.cols, flow.cols); + EXPECT_LE(calcRMSE(GT, flow), 0.49f); + +} + TEST(DenseOpticalFlow_SparseToDenseFlow, ReferenceAccuracy) { Mat frame1, frame2, GT; From 69ef7842458d0f9fefcbbcfa998b653e40ac4cc5 Mon Sep 17 00:00:00 2001 From: senst Date: Mon, 10 Dec 2018 20:50:19 +0100 Subject: [PATCH 02/25] - exchange tabs with spaces - fix optflow.bib indentation - remove optflow_o.hpp - change RLOFOpticalFlowParameter interfaces to Ptr to remove error on building. 
Fix warnings --- modules/optflow/doc/optflow.bib | 52 +- .../include/opencv2/optflow/rlofflow.hpp | 873 ++-- modules/optflow/include/opencv2/optflow_o.hpp | 309 -- modules/optflow/perf/perf_rlof.cpp | 159 +- modules/optflow/src/rlof/berlof_invoker.hpp | 4328 ++++++++--------- .../optflow/src/rlof/geo_interpolation.cpp | 996 ++-- .../optflow/src/rlof/geo_interpolation.hpp | 94 +- modules/optflow/src/rlof/plk_invoker.hpp | 1341 +++-- modules/optflow/src/rlof/rlof_invoker.hpp | 2865 ++++++----- modules/optflow/src/rlof/rlof_invokerbase.hpp | 498 +- modules/optflow/src/rlof/rlof_localflow.cpp | 1082 ++--- modules/optflow/src/rlof/rlof_localflow.h | 232 +- modules/optflow/src/rlofflow.cpp | 712 ++- modules/optflow/test/test_OF_accuracy.cpp | 216 +- 14 files changed, 6491 insertions(+), 7266 deletions(-) delete mode 100644 modules/optflow/include/opencv2/optflow_o.hpp diff --git a/modules/optflow/doc/optflow.bib b/modules/optflow/doc/optflow.bib index 616686e2698..746cecb6732 100644 --- a/modules/optflow/doc/optflow.bib +++ b/modules/optflow/doc/optflow.bib @@ -63,36 +63,36 @@ @inproceedings{Wang_2016_CVPR } @inproceedings{Geistert2016, - author = {Jonas Geistert and Tobias Senst and Thomas Sikora}, - title = {Robust Local Optical Flow: Dense Motion Vector Field Interpolation}, - booktitle = {Picture Coding Symposium}, - year = {2016}, - pages = {1--5}, + author = {Jonas Geistert and Tobias Senst and Thomas Sikora}, + title = {Robust Local Optical Flow: Dense Motion Vector Field Interpolation}, + booktitle = {Picture Coding Symposium}, + year = {2016}, + pages = {1--5}, } -@inproceedings{Senst2016, - author = {Tobias Senst and Jonas Geistert and Thomas Sikora}, - title = {Robust local optical flow: Long-range motions and varying illuminations}, - booktitle = {IEEE International Conference on Image Processing}, - year = {2016}, - pages = {4478--4482}, -} +@inproceedings{Senst2016, + author = {Tobias Senst and Jonas Geistert and Thomas Sikora}, + title = {Robust local 
optical flow: Long-range motions and varying illuminations}, + booktitle = {IEEE International Conference on Image Processing}, + year = {2016}, + pages = {4478--4482}, +} -@inproceedings{Senst2014, - author = {Tobias Senst and Thilo Borgmann and Ivo Keller and Thomas Sikora}, - title = {Cross based Robust Local Optical Flow}, - booktitle = {21th IEEE International Conference on Image Processing}, - year = {2014}, - pages = {1967--1971}, -} +@inproceedings{Senst2014, + author = {Tobias Senst and Thilo Borgmann and Ivo Keller and Thomas Sikora}, + title = {Cross based Robust Local Optical Flow}, + booktitle = {21th IEEE International Conference on Image Processing}, + year = {2014}, + pages = {1967--1971}, +} -@inproceedings{Senst2013, - author = {Tobias Senst and Jonas Geistert and Ivo Keller and Thomas Sikora}, - title = {Robust Local Optical Flow Estimation using Bilinear Equations for Sparse Motion Estimation}, - booktitle = {20th IEEE International Conference on Image Processing}, - year = {2013}, - pages = {2499--2503}, -} +@inproceedings{Senst2013, + author = {Tobias Senst and Jonas Geistert and Ivo Keller and Thomas Sikora}, + title = {Robust Local Optical Flow Estimation using Bilinear Equations for Sparse Motion Estimation}, + booktitle = {20th IEEE International Conference on Image Processing}, + year = {2013}, + pages = {2499--2503}, +} @article{Senst2012, author = {Tobias Senst and Volker Eiselein and Thomas Sikora}, diff --git a/modules/optflow/include/opencv2/optflow/rlofflow.hpp b/modules/optflow/include/opencv2/optflow/rlofflow.hpp index e9accdab5c4..d1a3d6f53ad 100644 --- a/modules/optflow/include/opencv2/optflow/rlofflow.hpp +++ b/modules/optflow/include/opencv2/optflow/rlofflow.hpp @@ -1,57 +1,6 @@ -/* -By downloading, copying, installing or using the software you agree to this -license. If you do not agree to this license, do not download, install, -copy or use the software. 
- - - License Agreement - For Open Source Computer Vision Library - (3-clause BSD License) - -Copyright (C) 2018, OpenCV Foundation, all rights reserved. -Third party copyrights are property of their respective owners. - -Redistribution and use in source and binary forms, with or without modification, -are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - - * Neither the names of the copyright holders nor the names of the contributors - may be used to endorse or promote products derived from this software - without specific prior written permission. - -This software is provided by the copyright holders and contributors "as is" and -any express or implied warranties, including, but not limited to, the implied -warranties of merchantability and fitness for a particular purpose are -disclaimed. In no event shall copyright holders or contributors be liable for -any direct, indirect, incidental, special, exemplary, or consequential damages -(including, but not limited to, procurement of substitute goods or services; -loss of use, data, or profits; or business interruption) however caused -and on any theory of liability, whether in contract, strict liability, -or tort (including negligence or otherwise) arising in any way out of -the use of this software, even if advised of the possibility of such damage. 
-*/ - -/** - * @file rloflow.hpp - * @author Tobias Senst - * @brief Implementation of the robust local optical flow algorithm for sparse and dense motion - * estimation from the following papers: - * http://elvera.nue.tu-berlin.de/files/1349Senst2012.pdf - * http://elvera.nue.tu-berlin.de/files/1422Senst2013.pdf - * http://elvera.nue.tu-berlin.de/files/1448Senst2014.pdf - * http://elvera.nue.tu-berlin.de/files/1498Geistert2016.pdf - * http://elvera.nue.tu-berlin.de/files/1496Senst2016.pdf - * - * - * @cite Senst2012, @cite Senst2013, @cite Senst2014, @cite Senst2016 or @cite Geistert2016 - * -*/ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. #ifndef __OPENCV_OPTFLOW_RLOFFLOW_HPP__ #define __OPENCV_OPTFLOW_RLOFFLOW_HPP__ @@ -60,423 +9,425 @@ the use of this software, even if advised of the possibility of such damage. namespace cv { - namespace optflow +namespace optflow +{ + //! @addtogroup optflow + //! @{ + + /** @brief This is used store and set up the parameters of the robust local optical flow (RLOF) algoritm. + * + * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as + * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). + * This RLOF implementation can be seen as an improved pyramidal iterative Lucas-Kanade and includes + * a set of improving modules. The main improvements in respect to the pyramidal iterative Lucas-Kanade + * are: + * - A more robust redecending M-estimator framework (see @cite Senst2012) to improve the accuracy at + * motion boundaries and appearing and disappearing pixels. + * - an adaptive support region strategies to improve the accuracy at motion boundaries to reduce the + * corona effect, i.e oversmoothing of the PLK at motion/object boundaries. 
The cross-based segementation + * strategy (SR_CROSS) proposed in @cite Senst2014 uses a simple segmenation approach to obtain the optimal + * shape of the support region. + * - To deal with illumination changes (outdoor sequences and shadow) the intensity constancy assumption + * based optical flow equation has been adopt with the Gennert and Negahdaripour illumination model + * (see @cite Senst2016). This model can be switched on/off with the useIlluminationModel variable. + * - By using a global motion prior initialization (see @cite Senst2016) of the iterative refinement + * the accuracy could be significantly improved for large displacements. This initialization can be + * switched on and of with useGlobalMotionPrior variable. + * + * The RLOF can be computed with the SparseOpticalFlow class or function interface to track a set of features + * or with the DenseOpticalFlow class or function interface to compute dense optical flow. + * + * @see optflow::DenseRLOFOpticalFlow, optflow::calcOpticalFlowDenseRLOF(), optflow::SparseRLOFOpticalFlow, optflow::calcOpticalFlowSparseRLOF() + */ + enum SupportRegionType { + SR_FIXED = 0, /**< Apply a constant support region */ + SR_CROSS = 1 /**< Apply a adaptive support region obtained by cross-based segmentation + * as described in @cite Senst2014 + */ + }; + enum SolverType { + ST_STANDART = 0, /**< Apply standard iterative refinement */ + ST_BILINEAR = 1 /**< Apply optimized iterative refinement based bilinear equation solutions + * as described in @cite Senst2013 + */ + }; + + enum InterpolationType { - //! @addtogroup optflow - //! @{ + INTERP_GEO = 0, /**< Fast geodesic interpolation, see @cite Geister2016 */ + INTERP_EPIC = 1, /**< Edge-preserving interpolation, see @cite Revaud2015,Geistert2016. 
*/ + }; + + class CV_EXPORTS_W RLOFOpticalFlowParameter{ + public: + RLOFOpticalFlowParameter() + :solverType(ST_BILINEAR) + ,supportRegionType(SR_CROSS) + ,normSigma0(3.2f) + ,normSigma1(7.f) + ,smallWinSize(9) + ,largeWinSize(21) + ,crossSegmentationThreshold(25) + ,maxLevel(3) + ,useInitialFlow(false) + ,useIlluminationModel(true) + ,useGlobalMotionPrior(true) + ,maxIteration(30) + ,minEigenValue(0.0001f) + ,globalMotionRansacThreshold(10) + {} + SolverType solverType; + /**< Variable specifies the iterative refinement strategy. Please consider citing @cite Senst2013 when + * using ST_BILINEAR. + */ - /** @brief This is used store and set up the parameters of the robust local optical flow (RLOF) algoritm. - * - * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as - * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). - * This RLOF implementation can be seen as an improved pyramidal iterative Lucas-Kanade and includes - * a set of improving modules. The main improvements in respect to the pyramidal iterative Lucas-Kanade - * are: - * - A more robust redecending M-estimator framework (see @cite Senst2012) to improve the accuracy at - * motion boundaries and appearing and disappearing pixels. - * - an adaptive support region strategies to improve the accuracy at motion boundaries to reduce the - * corona effect, i.e oversmoothing of the PLK at motion/object boundaries. The cross-based segementation - * strategy (SR_CROSS) proposed in @cite Senst2014 uses a simple segmenation approach to obtain the optimal - * shape of the support region. - * - To deal with illumination changes (outdoor sequences and shadow) the intensity constancy assumption - * based optical flow equation has been adopt with the Gennert and Negahdaripour illumination model - * (see @cite Senst2016). This model can be switched on/off with the useIlluminationModel variable. 
- * - By using a global motion prior initialization (see @cite Senst2016) of the iterative refinement - * the accuracy could be significantly improved for large displacements. This initialization can be - * switched on and of with useGlobalMotionPrior variable. - * - * The RLOF can be computed with the SparseOpticalFlow class or function interface to track a set of features - * or with the DenseOpticalFlow class or function interface to compute dense optical flow. - * - * @see optflow::DenseRLOFOpticalFlow, optflow::calcOpticalFlowDenseRLOF(), optflow::SparseRLOFOpticalFlow, optflow::calcOpticalFlowSparseRLOF() - */ - class RLOFOpticalFlowParameter { - public: - RLOFOpticalFlowParameter() - :solverType(ST_BILINEAR) - ,supportRegionType(SR_CROSS) - ,normSigma0(3.2f) - ,normSigma1(7.f) - ,smallWinSize(9) - ,largeWinSize(21) - ,crossSegmentationThreshold(25) - ,useInitialFlow(false) - ,useIlluminationModel(true) - ,useGlobalMotionPrior(true) - ,maxLevel(3) - ,maxIteration(30) - ,minEigenValue(0.0001f) - ,globalMotionRansacThreshold(10) - {} - enum SupportRegionType { - SR_FIXED = 0, /**< Apply a constant support region */ - SR_CROSS /**< Apply a adaptive support region obtained by cross-based segmentation - * as described in @cite Senst2014 - */ - }; - enum SolverType { - ST_STANDART = 0, /**< Apply standard iterative refinement */ - ST_BILINEAR /**< Apply optimized iterative refinement based bilinear equation solutions - * as described in @cite Senst2013 - */ - }; - SolverType solverType; - /**< Variable specifies the iterative refinement strategy. Please consider citing @cite Senst2013 when - * using ST_BILINEAR. - */ - - SupportRegionType supportRegionType; - /**< Variable specifies the support region shape extraction or shrinking strategy. - */ - - float normSigma0; - /**< \$ \sigma_0 \$ paramter of the shrinked Hampel norm introduced in @cite Senst2012. 
If - * \$ \sigma_0 \$ = std::numeric_limist::max() the least-square estimator will be used - * instead of the M-estimator. Althoug M-estimator is more robust against outlier in the support - * region the least-square can be fast in computation. - */ - float normSigma1; - /**< \$ \sigma_1 \$ paramter of the shrinked Hampel norm introduced in @cite Senst2012. If - * \$ \sigma_1 \$ = std::numeric_limist::max() the least-square estimator will be used - * instead of the M-estimator. Althoug M-estimator is more robust against outlier in the support - * region the least-square can be fast in computation. - */ - int smallWinSize; - /**< Minimal window size of the support region. This parameter is only used if supportRegionType is SR_CROSS. - */ - int largeWinSize; - /**< Maximal window size of the support region. If supportRegionType is SR_FIXED this gives the exact support - * region size. The speed of the RLOF is related to the applied win sizes. The smaller the window size the lower is the runtime, - * but the more sensitive to noise is the method. - */ - int crossSegmentationThreshold; - /**< Color similarity threshold used by cross-based segmentation following @cite Senst2014 . - * (Only used if supportRegionType is SR_CROSS). With the cross-bassed segmentation - * motion boundaries can be computed more accurately. - */ - int maxLevel; - /**< Maximal number of pyramid level used. The large this value is the more likely it is - * to obtain accurate solutions for long-range motions. The runtime is linear related to - * this parameter. - */ - bool useInitialFlow; - /**< Use next point list as initial values. A good intialization can imporve the algortihm - * accuracy and reduce the runtime by a faster convergence of the iteration refinement. - */ - bool useIlluminationModel; - /**< Use the Gennert and Negahdaripour illumination model instead of the intensity brigthness - * constraint. 
(Probosed in @cite Senst2016 ) This model is defined as follow: - * \f[ I(\mathbf{x},t) + m \cdot I(\mathbf{x},t) + c = I(\mathbf{x},t+1) \f[ - * and contains with m and c a multiplicative and additive term which makes the estimate - * more robust against illumination changes. The computational complexity is increased by - * enabling the illumination model. - */ - bool useGlobalMotionPrior; - /**< Use global motion prior initialisation has been introduced in @cite Senst2016 . It - * allows to be more accurate for long-range motion. The computational complexity is - * slightly increased by enabling the global motion prior initialisation. - */ - int maxIteration; - /**< Number of maximal iterations used for the iterative refinement. Lower values can - * reduce the runtime but also the accuracy. - */ - float minEigenValue; - /**< Threshold for the minimal eigenvalue of the gradient matrix defines when to abort the - * iterative refinement. - */ - float globalMotionRansacThreshold; - /**< To apply the global motion prior motion vectors will be computed on a regulary sampled which - * are the basis for Homography estimation using RANSAC. The reprojection threshold is based on - * n-th percentil (given by this value [0 ... 100]) of the forward backward errors relating to - * the motion vectors. See @cite Senst2016 for more details. - */ - }; + SupportRegionType supportRegionType; + /**< Variable specifies the support region shape extraction or shrinking strategy. + */ - /** @brief Fast dense optical flow computation based on robust local optical flow (RLOF) algorithms and sparse-to-dense interpolation - * scheme. - * - * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as - * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). - * - * The sparse-to-dense interpolation scheme allows for fast computation of dense optical flow using RLOF (see @cite Geistert2016). 
- * For this scheme the following steps are applied: - * -# motion vector seeded at a regular sampled grid are computed. The sparsity of this grid can be configured with setGridStep - * -# (optinally) errornous motion vectors are filter based on the forward backward confidence. The threshold can be configured - * with setForwardBackward. The filter is only applied if the threshold >0 but than the runtime is doubled due to the estimation - * of the backward flow. - * -# Vector field interpolation is applied to the motion vector set to obtain a dense vector field. - * - * For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details. - * Parameters have been described in @cite Senst2012 @cite Senst2013 @cite Senst2014 and @cite Senst2016. - * - * @note If the grid size is set to (1,1) and the forward backward threshold <= 0 that the dense optical flow field is purely. - * computed with the RLOF. - * - * @see optflow::calcOpticalFlowDenseRLOF(), optflow::RLOFOpticalFlowParameter - */ - class CV_EXPORTS_W DenseRLOFOpticalFlow : public DenseOpticalFlow - { - public: - enum InterpolationType - { - INTERP_GEO = 0, /**< Fast geodesic interpolation, see @cite Geister2016 */ - INTERP_EPIC, /**< Edge-preserving interpolation, see @cite Revaud2015,Geistert2016. */ - }; - //! @brief Configuration of the RLOF alogrithm. - /** - @see optflow::RLOFOpticalFlowParameter, getRLOFOpticalFlowParameter - */ - CV_WRAP virtual void setRLOFOpticalFlowParameter(RLOFOpticalFlowParameter val) = 0; - /** @copybrief setRLOFOpticalFlowParameter - @see optflow::RLOFOpticalFlowParameter, setRLOFOpticalFlowParameter - */ - CV_WRAP virtual RLOFOpticalFlowParameter getRLOFOpticalFlowParameter() const = 0; - //! @brief Threshold for the forward backward confidence check - /**For each grid point \f$ \mathbf{x} \f$ a motion vector \f$ d_{I0,I1}(\mathbf{x}) \f$ is computed. 
- * If the forward backward error \f[ EP_{FB} = || d_{I0,I1} + d_{I1,I0} || \f] - * is larger than threshold given by this function then the motion vector will not be used by the following - * vector field interpolation. \f$ d_{I1,I0} \f$ denotes the backward flow. Note, the forward backward test - * will only be applied if the threshold > 0. This may results into a doubled runtime for the motion estimation. - * @see getForwardBackward, setGridStep - */ - CV_WRAP virtual void setForwardBackward(float val) = 0; - /** @copybrief setForwardBackward - @see setForwardBackward - */ - CV_WRAP virtual float getForwardBackward() const = 0; - //! @brief Size of the grid to spawn the motion vectors. - /** For each grid point a motion vector is computed. Some motion vectors will be removed due to the forwatd backward - * threshold (if set >0). The rest will be the base of the vector field interpolation. - * @see getForwardBackward, setGridStep - */ - CV_WRAP virtual Size getGridStep() const = 0; - /** @copybrief getGridStep - * @see getGridStep - */ - CV_WRAP virtual void setGridStep(Size val) = 0; + float normSigma0; + /**< \$ \sigma_0 \$ parameter of the shrunken Hampel norm introduced in @cite Senst2012. If + * \$ \sigma_0 \$ = std::numeric_limits::max() the least-square estimator will be used + * instead of the M-estimator. Although M-estimator is more robust against outliers in the support + * region the least-square can be faster in computation. + */ + float normSigma1; + /**< \$ \sigma_1 \$ parameter of the shrunken Hampel norm introduced in @cite Senst2012. If + * \$ \sigma_1 \$ = std::numeric_limits::max() the least-square estimator will be used + * instead of the M-estimator. Although M-estimator is more robust against outliers in the support + * region the least-square can be faster in computation. + */ + int smallWinSize; + /**< Minimal window size of the support region. This parameter is only used if supportRegionType is SR_CROSS.
+ */ + int largeWinSize; + /**< Maximal window size of the support region. If supportRegionType is SR_FIXED this gives the exact support + * region size. The speed of the RLOF is related to the applied win sizes. The smaller the window size the lower is the runtime, + * but the more sensitive to noise is the method. + */ + int crossSegmentationThreshold; + /**< Color similarity threshold used by cross-based segmentation following @cite Senst2014 . + * (Only used if supportRegionType is SR_CROSS). With the cross-based segmentation + * motion boundaries can be computed more accurately. + */ + int maxLevel; + /**< Maximal number of pyramid levels used. The larger this value is the more likely it is + * to obtain accurate solutions for long-range motions. The runtime is linearly related to + * this parameter. + */ + bool useInitialFlow; + /**< Use next point list as initial values. A good initialization can improve the algorithm + * accuracy and reduce the runtime by a faster convergence of the iteration refinement. + */ + bool useIlluminationModel; + /**< Use the Gennert and Negahdaripour illumination model instead of the intensity brightness + * constraint. (Proposed in @cite Senst2016 ) This model is defined as follows: + * \f[ I(\mathbf{x},t) + m \cdot I(\mathbf{x},t) + c = I(\mathbf{x},t+1) \f] + * and contains with m and c a multiplicative and additive term which makes the estimate + * more robust against illumination changes. The computational complexity is increased by + * enabling the illumination model. + */ + bool useGlobalMotionPrior; + /**< Use global motion prior initialisation has been introduced in @cite Senst2016 . It + * allows to be more accurate for long-range motion. The computational complexity is + * slightly increased by enabling the global motion prior initialisation. + */ + int maxIteration; + /**< Number of maximal iterations used for the iterative refinement. Lower values can + * reduce the runtime but also the accuracy.
+ */ + float minEigenValue; + /**< Threshold for the minimal eigenvalue of the gradient matrix defines when to abort the + * iterative refinement. + */ + float globalMotionRansacThreshold; + /**< To apply the global motion prior motion vectors will be computed on a regularly sampled grid which + * are the basis for Homography estimation using RANSAC. The reprojection threshold is based on + * n-th percentile (given by this value [0 ... 100]) of the forward backward errors relating to + * the motion vectors. See @cite Senst2016 for more details. + */ + }; - //! @brief Interpolation used to compute the dense optical flow. - /** Two interpolation algorithms have been supported - * - **INTERP_GEO** applies the fast geodesic interpolation, see @cite Geister2016. - * - **INTERP_EPIC_RESIDUAL** applies the edge-preserving interpolation, see @cite Revaud2015,Geistert2016. - * @see ximgproc::EdgeAwareInterpolator, getInterpolation - */ - CV_WRAP virtual void setInterpolation(InterpolationType val) = 0; - /** @copybrief setInterpolation - * @see ximgproc::EdgeAwareInterpolator, setInterpolation - */ - CV_WRAP virtual InterpolationType getInterpolation() const = 0; - //! @brief see ximgproc::EdgeAwareInterpolator() K value. - /** K is a number of nearest-neighbor matches considered, when fitting a locally affine - * model. Usually it should be around 128. However, lower values would make the interpolation noticeably faster. - * @see ximgproc::EdgeAwareInterpolator, setEPICK - */ - CV_WRAP virtual int getEPICK() const = 0; - /** @copybrief getEPICK - * @see ximgproc::EdgeAwareInterpolator, getEPICK - */ - CV_WRAP virtual void setEPICK(int val) = 0; - //! @brief see ximgproc::EdgeAwareInterpolator() sigma value. - /** Sigma is a parameter defining how fast the weights decrease in the locally-weighted affine - * fitting. Higher values can help preserve fine details, lower values can help to get rid of noise in the - * output flow.
- * @see ximgproc::EdgeAwareInterpolator, setEPICSigma - */ - CV_WRAP virtual float getEPICSigma() const = 0; - /** @copybrief getEPICSigma - * @see ximgproc::EdgeAwareInterpolator, getEPICSigma - */ - CV_WRAP virtual void setEPICSigma(float val) = 0; - //! @brief see ximgproc::EdgeAwareInterpolator() lambda value. - /** Lambda is a parameter defining the weight of the edge-aware term in geodesic distance, - * should be in the range of 0 to 1000. - * @see ximgproc::EdgeAwareInterpolator, setEPICSigma - */ - CV_WRAP virtual float getEPICLambda() const = 0; - /** @copybrief getEPICLambda - * @see ximgproc::EdgeAwareInterpolator, getEPICLambda - */ - CV_WRAP virtual void setEPICLambda(float val) = 0; - //! @brief see ximgproc::EdgeAwareInterpolator(). - /** Sets the respective fastGlobalSmootherFilter() parameter. - * @see ximgproc::EdgeAwareInterpolator, setFgsLambda - */ - CV_WRAP virtual float getFgsLambda() const = 0; - /** @copybrief getFgsLambda - * @see ximgproc::EdgeAwareInterpolator, ximgproc::fastGlobalSmootherFilter, getFgsLambda - */ - CV_WRAP virtual void setFgsLambda(float val) = 0; - //! @brief see ximgproc::EdgeAwareInterpolator(). - /** Sets the respective fastGlobalSmootherFilter() parameter. - * @see ximgproc::EdgeAwareInterpolator, ximgproc::fastGlobalSmootherFilter, setFgsSigma - */ - CV_WRAP virtual float getFgsSigma() const = 0; - /** @copybrief getFgsSigma - * @see ximgproc::EdgeAwareInterpolator, ximgproc::fastGlobalSmootherFilter, getFgsSigma - */ - CV_WRAP virtual void setFgsSigma(float val) = 0; - //! @brief enables ximgproc::fastGlobalSmootherFilter - /** - * @see getUsePostProc - */ - CV_WRAP virtual void setUsePostProc(bool val) = 0; - /** @copybrief setUsePostProc - * @see ximgproc::fastGlobalSmootherFilter, setUsePostProc - */ - CV_WRAP virtual bool getUsePostProc() const = 0; - //! 
@brief Creates instance of optflow::DenseRLOFOpticalFlow - /** - * @param rlofParam see optflow::RLOFOpticalFlowParameter - * @param forwardBackwardThreshold see setForwardBackward - * @param gridStep see setGridStep - * @param epicK see setEPICK - * @param epicSigma see setEPICSigma - * @param epicLambda see setEPICLambda - * @param fgsLambda see setFgsLambda - * @param fgsSigma see setFgsSigma - */ - CV_WRAP static Ptr create( - const RLOFOpticalFlowParameter & rlofParam = RLOFOpticalFlowParameter(), - float forwardBackwardThreshold = 1.f, - Size gridStep = Size(6, 6), - InterpolationType interp_type = InterpolationType::INTERP_EPIC, - int epicK = 128, - float epicSigma = 0.05f, - float epicLambda = 999.0f, - float fgsLambda = 500.0f, - float fgsSigma = 1.5f); - }; + /** @brief Fast dense optical flow computation based on robust local optical flow (RLOF) algorithms and sparse-to-dense interpolation + * scheme. + * + * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as + * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). + * + * The sparse-to-dense interpolation scheme allows for fast computation of dense optical flow using RLOF (see @cite Geistert2016). + * For this scheme the following steps are applied: + * -# motion vector seeded at a regular sampled grid are computed. The sparsity of this grid can be configured with setGridStep + * -# (optinally) errornous motion vectors are filter based on the forward backward confidence. The threshold can be configured + * with setForwardBackward. The filter is only applied if the threshold >0 but than the runtime is doubled due to the estimation + * of the backward flow. + * -# Vector field interpolation is applied to the motion vector set to obtain a dense vector field. + * + * For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details. 
+ * Parameters have been described in @cite Senst2012 @cite Senst2013 @cite Senst2014 and @cite Senst2016. + * + * @note If the grid size is set to (1,1) and the forward backward threshold <= 0 then the dense optical flow field is purely + * computed with the RLOF. + * + * @see optflow::calcOpticalFlowDenseRLOF(), optflow::RLOFOpticalFlowParameter + */ + class CV_EXPORTS_W DenseRLOFOpticalFlow : public DenseOpticalFlow + { + public: + //! @brief Configuration of the RLOF algorithm. + /** + @see optflow::RLOFOpticalFlowParameter, getRLOFOpticalFlowParameter + */ + CV_WRAP virtual void setRLOFOpticalFlowParameter(Ptr val) = 0; + /** @copybrief setRLOFOpticalFlowParameter + @see optflow::RLOFOpticalFlowParameter, setRLOFOpticalFlowParameter + */ + CV_WRAP virtual Ptr getRLOFOpticalFlowParameter() const = 0; + //! @brief Threshold for the forward backward confidence check + /**For each grid point \f$ \mathbf{x} \f$ a motion vector \f$ d_{I0,I1}(\mathbf{x}) \f$ is computed. + * If the forward backward error \f[ EP_{FB} = || d_{I0,I1} + d_{I1,I0} || \f] + * is larger than threshold given by this function then the motion vector will not be used by the following + * vector field interpolation. \f$ d_{I1,I0} \f$ denotes the backward flow. Note, the forward backward test + * will only be applied if the threshold > 0. This may result in a doubled runtime for the motion estimation. + * @see getForwardBackward, setGridStep + */ + CV_WRAP virtual void setForwardBackward(float val) = 0; + /** @copybrief setForwardBackward + @see setForwardBackward + */ + CV_WRAP virtual float getForwardBackward() const = 0; + //! @brief Size of the grid to spawn the motion vectors. + /** For each grid point a motion vector is computed. Some motion vectors will be removed due to the forward backward + * threshold (if set >0). The rest will be the base of the vector field interpolation.
+ * @see getForwardBackward, setGridStep + */ + CV_WRAP virtual Size getGridStep() const = 0; + /** @copybrief getGridStep + * @see getGridStep + */ + CV_WRAP virtual void setGridStep(Size val) = 0; + + //! @brief Interpolation used to compute the dense optical flow. + /** Two interpolation algorithms have been supported + * - **INTERP_GEO** applies the fast geodesic interpolation, see @cite Geister2016. + * - **INTERP_EPIC_RESIDUAL** applies the edge-preserving interpolation, see @cite Revaud2015,Geistert2016. + * @see ximgproc::EdgeAwareInterpolator, getInterpolation + */ + CV_WRAP virtual void setInterpolation(InterpolationType val) = 0; + /** @copybrief setInterpolation + * @see ximgproc::EdgeAwareInterpolator, setInterpolation + */ + CV_WRAP virtual InterpolationType getInterpolation() const = 0; + //! @brief see ximgproc::EdgeAwareInterpolator() K value. + /** K is a number of nearest-neighbor matches considered, when fitting a locally affine + * model. Usually it should be around 128. However, lower values would make the interpolation noticeably faster. + * @see ximgproc::EdgeAwareInterpolator, setEPICK + */ + CV_WRAP virtual int getEPICK() const = 0; + /** @copybrief getEPICK + * @see ximgproc::EdgeAwareInterpolator, getEPICK + */ + CV_WRAP virtual void setEPICK(int val) = 0; + //! @brief see ximgproc::EdgeAwareInterpolator() sigma value. + /** Sigma is a parameter defining how fast the weights decrease in the locally-weighted affine + * fitting. Higher values can help preserve fine details, lower values can help to get rid of noise in the + * output flow. + * @see ximgproc::EdgeAwareInterpolator, setEPICSigma + */ + CV_WRAP virtual float getEPICSigma() const = 0; + /** @copybrief getEPICSigma + * @see ximgproc::EdgeAwareInterpolator, getEPICSigma + */ + CV_WRAP virtual void setEPICSigma(float val) = 0; + //! @brief see ximgproc::EdgeAwareInterpolator() lambda value. 
+ /** Lambda is a parameter defining the weight of the edge-aware term in geodesic distance, + * should be in the range of 0 to 1000. + * @see ximgproc::EdgeAwareInterpolator, setEPICSigma + */ + CV_WRAP virtual float getEPICLambda() const = 0; + /** @copybrief getEPICLambda + * @see ximgproc::EdgeAwareInterpolator, getEPICLambda + */ + CV_WRAP virtual void setEPICLambda(float val) = 0; + //! @brief see ximgproc::EdgeAwareInterpolator(). + /** Sets the respective fastGlobalSmootherFilter() parameter. + * @see ximgproc::EdgeAwareInterpolator, setFgsLambda + */ + CV_WRAP virtual float getFgsLambda() const = 0; + /** @copybrief getFgsLambda + * @see ximgproc::EdgeAwareInterpolator, ximgproc::fastGlobalSmootherFilter, getFgsLambda + */ + CV_WRAP virtual void setFgsLambda(float val) = 0; + //! @brief see ximgproc::EdgeAwareInterpolator(). + /** Sets the respective fastGlobalSmootherFilter() parameter. + * @see ximgproc::EdgeAwareInterpolator, ximgproc::fastGlobalSmootherFilter, setFgsSigma + */ + CV_WRAP virtual float getFgsSigma() const = 0; + /** @copybrief getFgsSigma + * @see ximgproc::EdgeAwareInterpolator, ximgproc::fastGlobalSmootherFilter, getFgsSigma + */ + CV_WRAP virtual void setFgsSigma(float val) = 0; + //! @brief enables ximgproc::fastGlobalSmootherFilter + /** + * @see getUsePostProc + */ + CV_WRAP virtual void setUsePostProc(bool val) = 0; + /** @copybrief setUsePostProc + * @see ximgproc::fastGlobalSmootherFilter, setUsePostProc + */ + CV_WRAP virtual bool getUsePostProc() const = 0; + //! 
@brief Creates instance of optflow::DenseRLOFOpticalFlow + /** + * @param rlofParam see optflow::RLOFOpticalFlowParameter + * @param forwardBackwardThreshold see setForwardBackward + * @param gridStep see setGridStep + * @param epicK see setEPICK + * @param epicSigma see setEPICSigma + * @param epicLambda see setEPICLambda + * @param fgsLambda see setFgsLambda + * @param fgsSigma see setFgsSigma + */ + CV_WRAP static Ptr create( + Ptr rlofParam = Ptr(), + float forwardBackwardThreshold = 1.f, + Size gridStep = Size(6, 6), + InterpolationType interp_type = InterpolationType::INTERP_EPIC, + int epicK = 128, + float epicSigma = 0.05f, + float epicLambda = 999.0f, + float fgsLambda = 500.0f, + float fgsSigma = 1.5f); + }; - /** @brief Class used for calculation sparse optical flow and feature tracking with robust local optical flow (RLOF) algorithms. - * - * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as - * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). - * - * For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details. - * Parameters have been described in @cite Senst2012, @cite Senst2013, @cite Senst2014 and @cite Senst2016. - @sa optflow::calcOpticalFlowSparseRLOF(), optflow::RLOFOpticalFlowParameter + /** @brief Class used for calculation sparse optical flow and feature tracking with robust local optical flow (RLOF) algorithms. + * + * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as + * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). + * + * For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details. + * Parameters have been described in @cite Senst2012, @cite Senst2013, @cite Senst2014 and @cite Senst2016. 
+ @sa optflow::calcOpticalFlowSparseRLOF(), optflow::RLOFOpticalFlowParameter - */ - class CV_EXPORTS_W SparseRLOFOpticalFlow : public SparseOpticalFlow - { - public: - /** @copydoc DenseRLOFOpticalFlow::setRLOFOpticalFlowParameter - */ - CV_WRAP virtual void setRLOFOpticalFlowParameter(RLOFOpticalFlowParameter val) = 0; - /** @copybrief setRLOFOpticalFlowParameter - * @see setRLOFOpticalFlowParameter - */ - CV_WRAP virtual RLOFOpticalFlowParameter getRLOFOpticalFlowParameter() const = 0; - //! @brief Threshold for the forward backward confidence check - /** For each feature point a motion vector \f$ d_{I0,I1}(\mathbf{x}) \f$ is computed. - * If the forward backward error \f[ EP_{FB} = || d_{I0,I1} + d_{I1,I0} || \f] - * is larger than threshold given by this function then the status will not be used by the following - * vector field interpolation. \f$ d_{I1,I0} \f$ denotes the backward flow. Note, the forward backward test - * will only be applied if the threshold > 0. This may results into a doubled runtime for the motion estimation. - * @see setForwardBackward - */ - CV_WRAP virtual void setForwardBackward(float val) = 0; - /** @copybrief setForwardBackward - * @see setForwardBackward - */ - CV_WRAP virtual float getForwardBackward() const = 0; - - //! @brief Creates instance of SparseRLOFOpticalFlow - /** - * @param rlofParam see setRLOFOpticalFlowParameter - * @param forwardBackwardThreshold see setForwardBackward - */ - CV_WRAP static Ptr create( - const RLOFOpticalFlowParameter & rlofParam = RLOFOpticalFlowParameter(), - float forwardBackwardThreshold = 1.f); + */ + class CV_EXPORTS_W SparseRLOFOpticalFlow : public SparseOpticalFlow + { + public: + /** @copydoc DenseRLOFOpticalFlow::setRLOFOpticalFlowParameter + */ + CV_WRAP virtual void setRLOFOpticalFlowParameter(Ptr val) = 0; + /** @copybrief setRLOFOpticalFlowParameter + * @see setRLOFOpticalFlowParameter + */ + CV_WRAP virtual Ptr getRLOFOpticalFlowParameter() const = 0; + //! 
@brief Threshold for the forward backward confidence check + /** For each feature point a motion vector \f$ d_{I0,I1}(\mathbf{x}) \f$ is computed. + * If the forward backward error \f[ EP_{FB} = || d_{I0,I1} + d_{I1,I0} || \f] + * is larger than threshold given by this function then the status will not be used by the following + * vector field interpolation. \f$ d_{I1,I0} \f$ denotes the backward flow. Note, the forward backward test + * will only be applied if the threshold > 0. This may results into a doubled runtime for the motion estimation. + * @see setForwardBackward + */ + CV_WRAP virtual void setForwardBackward(float val) = 0; + /** @copybrief setForwardBackward + * @see setForwardBackward + */ + CV_WRAP virtual float getForwardBackward() const = 0; + + //! @brief Creates instance of SparseRLOFOpticalFlow + /** + * @param rlofParam see setRLOFOpticalFlowParameter + * @param forwardBackwardThreshold see setForwardBackward + */ + CV_WRAP static Ptr create( + Ptr rlofParam = Ptr(), + float forwardBackwardThreshold = 1.f); - }; + }; - /** @brief Fast dense optical flow computation based on robust local optical flow (RLOF) algorithms and sparse-to-dense interpolation scheme. - * - * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as - * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). - * - * The sparse-to-dense interpolation scheme allows for fast computation of dense optical flow using RLOF (see @cite Geistert2016). - * For this scheme the following steps are applied: - * -# motion vector seeded at a regular sampled grid are computed. The sparsity of this grid can be configured with setGridStep - * -# (optinally) errornous motion vectors are filter based on the forward backward confidence. The threshold can be configured - * with setForwardBackward. 
The filter is only applied if the threshold >0 but than the runtime is doubled due to the estimation - * of the backward flow. - * -# Vector field interpolation is applied to the motion vector set to obtain a dense vector field. - * - * @param I0 first 8-bit input image. If The cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType - * = SupportRegionType::SR_CROSS) image has to be a 8-bit 3 channel image. - * @param I1 second 8-bit input image. If The cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType - * = SupportRegionType::SR_CROSS) image has to be a 8-bit 3 channel image. - * @param flow computed flow image that has the same size as I0 and type CV_32FC2. - * @param rlofParam see optflow::RLOFOpticalFlowParameter - * @param forwardBackwardThreshold Threshold for the forward backward confidence check. - * For each grid point \f$ \mathbf{x} \f$ a motion vector \f$ d_{I0,I1}(\mathbf{x}) \f$ is computed. - * If the forward backward error \f[ EP_{FB} = || d_{I0,I1} + d_{I1,I0} || \f] - * is larger than threshold given by this function then the motion vector will not be used by the following - * vector field interpolation. \f$ d_{I1,I0} \f$ denotes the backward flow. Note, the forward backward test - * will only be applied if the threshold > 0. This may results into a doubled runtime for the motion estimation. - * @param gridStep Size of the grid to spawn the motion vectors. For each grid point a motion vector is computed. - * Some motion vectors will be removed due to the forwatd backward threshold (if set >0). The rest will be the - * base of the vector field interpolation. - * @param epicK see ximgproc::EdgeAwareInterpolator() sets the respective parameter. - * @param epicSigma see ximgproc::EdgeAwareInterpolator() sets the respective parameter. - * @param epicLambda see ximgproc::EdgeAwareInterpolator() sets the respective parameter. 
- * @param use_post_proc enables ximgproc::fastGlobalSmootherFilter() parameter. - * @param fgsLambda sets the respective ximgproc::fastGlobalSmootherFilter() parameter. - * @param fgsSigma sets the respective ximgproc::fastGlobalSmootherFilter() parameter. - * - * Parameters have been described in @cite Senst2012, @cite Senst2013, @cite Senst2014, @cite Senst2016. - * For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details. - * @note If the grid size is set to (1,1) and the forward backward threshold <= 0 that the dense optical flow field is purely - * computed with the RLOF. - * - * @sa optflow::DenseRLOFOpticalFlow, optflow::RLOFOpticalFlowParameter - */ - CV_EXPORTS_W void calcOpticalFlowDenseRLOF(InputArray I0, InputArray I1, InputOutputArray flow, - RLOFOpticalFlowParameter rlofParam = RLOFOpticalFlowParameter(), - float forewardBackwardThreshold = 0, Size gridStep = Size(6, 6), - DenseRLOFOpticalFlow::InterpolationType interp_type = DenseRLOFOpticalFlow::InterpolationType::INTERP_EPIC, - int epicK = 128, float epicSigma = 0.05f, - bool use_post_proc = true, float fgsLambda = 500.0f, float fgsSigma = 1.5f); + /** @brief Fast dense optical flow computation based on robust local optical flow (RLOF) algorithms and sparse-to-dense interpolation scheme. + * + * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as + * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). + * + * The sparse-to-dense interpolation scheme allows for fast computation of dense optical flow using RLOF (see @cite Geistert2016). + * For this scheme the following steps are applied: + * -# motion vector seeded at a regular sampled grid are computed. The sparsity of this grid can be configured with setGridStep + * -# (optinally) errornous motion vectors are filter based on the forward backward confidence. The threshold can be configured + * with setForwardBackward. 
The filter is only applied if the threshold >0 but than the runtime is doubled due to the estimation + * of the backward flow. + * -# Vector field interpolation is applied to the motion vector set to obtain a dense vector field. + * + * @param I0 first 8-bit input image. If The cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType + * = SupportRegionType::SR_CROSS) image has to be a 8-bit 3 channel image. + * @param I1 second 8-bit input image. If The cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType + * = SupportRegionType::SR_CROSS) image has to be a 8-bit 3 channel image. + * @param flow computed flow image that has the same size as I0 and type CV_32FC2. + * @param rlofParam see optflow::RLOFOpticalFlowParameter + * @param forwardBackwardThreshold Threshold for the forward backward confidence check. + * For each grid point \f$ \mathbf{x} \f$ a motion vector \f$ d_{I0,I1}(\mathbf{x}) \f$ is computed. + * If the forward backward error \f[ EP_{FB} = || d_{I0,I1} + d_{I1,I0} || \f] + * is larger than threshold given by this function then the motion vector will not be used by the following + * vector field interpolation. \f$ d_{I1,I0} \f$ denotes the backward flow. Note, the forward backward test + * will only be applied if the threshold > 0. This may results into a doubled runtime for the motion estimation. + * @param gridStep Size of the grid to spawn the motion vectors. For each grid point a motion vector is computed. + * Some motion vectors will be removed due to the forwatd backward threshold (if set >0). The rest will be the + * base of the vector field interpolation. + * @param epicK see ximgproc::EdgeAwareInterpolator() sets the respective parameter. + * @param epicSigma see ximgproc::EdgeAwareInterpolator() sets the respective parameter. + * @param epicLambda see ximgproc::EdgeAwareInterpolator() sets the respective parameter. 
+ * @param use_post_proc enables ximgproc::fastGlobalSmootherFilter() parameter. + * @param fgsLambda sets the respective ximgproc::fastGlobalSmootherFilter() parameter. + * @param fgsSigma sets the respective ximgproc::fastGlobalSmootherFilter() parameter. + * + * Parameters have been described in @cite Senst2012, @cite Senst2013, @cite Senst2014, @cite Senst2016. + * For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details. + * @note If the grid size is set to (1,1) and the forward backward threshold <= 0 that the dense optical flow field is purely + * computed with the RLOF. + * + * @sa optflow::DenseRLOFOpticalFlow, optflow::RLOFOpticalFlowParameter + */ + CV_EXPORTS_W void calcOpticalFlowDenseRLOF(InputArray I0, InputArray I1, InputOutputArray flow, + Ptr rlofParam = Ptr(), + float forewardBackwardThreshold = 0, Size gridStep = Size(6, 6), + InterpolationType interp_type = InterpolationType::INTERP_EPIC, + int epicK = 128, float epicSigma = 0.05f, + bool use_post_proc = true, float fgsLambda = 500.0f, float fgsSigma = 1.5f); - /** @brief Calculates fast optical flow for a sparse feature set using the robust local optical flow (RLOF) similar - * to optflow::calcOpticalFlowPyrLK(). - * - * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as - * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). - * - * @param I0 first 8-bit input image. If The cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType - * = SupportRegionType::SR_CROSS) image has to be a 8-bit 3 channel image. - * @param I1 second 8-bit input image. If The cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType - * = SupportRegionType::SR_CROSS) image has to be a 8-bit 3 channel image. 
- * @param prevPts vector of 2D points for which the flow needs to be found; point coordinates must be single-precision - * floating-point numbers. - * @param nextPts output vector of 2D points (with single-precision floating-point coordinates) containing the calculated - * new positions of input features in the second image; when optflow::RLOFOpticalFlowParameter::useInitialFlow variable is true the vector must - * have the same size as in the input and contain the initialization point correspondences. - * @param status output status vector (of unsigned chars); each element of the vector is set to 1 if the flow for the - * corresponding features has passed the forward backward check. - * @param err output vector of errors; each element of the vector is set to the forward backward error for the corresponding feature. - * @param rlofParam see optflow::RLOFOpticalFlowParameter - * @param forwardBackwardThreshold Threshold for the forward backward confidence check. If forewardBackwardThreshold <=0 the forward - * - * Parameters have been described in @cite Senst2012, @cite Senst2013, @cite Senst2014 and @cite Senst2016. - * For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details. - */ - CV_EXPORTS_W void calcOpticalFlowSparseRLOF(InputArray prevImg, InputArray nextImg, - InputArray prevPts, InputOutputArray nextPts, - OutputArray status, OutputArray err, - RLOFOpticalFlowParameter rlofParam = RLOFOpticalFlowParameter(), - float forewardBackwardThreshold = 0); + /** @brief Calculates fast optical flow for a sparse feature set using the robust local optical flow (RLOF) similar + * to optflow::calcOpticalFlowPyrLK(). + * + * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as + * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). + * + * @param I0 first 8-bit input image. 
If The cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType + * = SupportRegionType::SR_CROSS) image has to be a 8-bit 3 channel image. + * @param I1 second 8-bit input image. If The cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType + * = SupportRegionType::SR_CROSS) image has to be a 8-bit 3 channel image. + * @param prevPts vector of 2D points for which the flow needs to be found; point coordinates must be single-precision + * floating-point numbers. + * @param nextPts output vector of 2D points (with single-precision floating-point coordinates) containing the calculated + * new positions of input features in the second image; when optflow::RLOFOpticalFlowParameter::useInitialFlow variable is true the vector must + * have the same size as in the input and contain the initialization point correspondences. + * @param status output status vector (of unsigned chars); each element of the vector is set to 1 if the flow for the + * corresponding features has passed the forward backward check. + * @param err output vector of errors; each element of the vector is set to the forward backward error for the corresponding feature. + * @param rlofParam see optflow::RLOFOpticalFlowParameter + * @param forwardBackwardThreshold Threshold for the forward backward confidence check. If forewardBackwardThreshold <=0 the forward + * + * Parameters have been described in @cite Senst2012, @cite Senst2013, @cite Senst2014 and @cite Senst2016. + * For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details. + */ + CV_EXPORTS_W void calcOpticalFlowSparseRLOF(InputArray prevImg, InputArray nextImg, + InputArray prevPts, InputOutputArray nextPts, + OutputArray status, OutputArray err, + Ptr rlofParam = Ptr(), + float forewardBackwardThreshold = 0); - //! 
Additional interface to the Dense RLOF algorithm - optflow::calcOpticalFlowDenseRLOF() - CV_EXPORTS_W Ptr createOptFlow_DenseRLOF(); + //! Additional interface to the Dense RLOF algorithm - optflow::calcOpticalFlowDenseRLOF() + CV_EXPORTS_W Ptr createOptFlow_DenseRLOF(); - //! Additional interface to the Sparse RLOF algorithm - optflow::calcOpticalFlowSparseRLOF() - CV_EXPORTS_W Ptr createOptFlow_SparseRLOF(); - //! @} - } + //! Additional interface to the Sparse RLOF algorithm - optflow::calcOpticalFlowSparseRLOF() + CV_EXPORTS_W Ptr createOptFlow_SparseRLOF(); + //! @} +} } #endif diff --git a/modules/optflow/include/opencv2/optflow_o.hpp b/modules/optflow/include/opencv2/optflow_o.hpp deleted file mode 100644 index 65802750d1a..00000000000 --- a/modules/optflow/include/opencv2/optflow_o.hpp +++ /dev/null @@ -1,309 +0,0 @@ -/* -By downloading, copying, installing or using the software you agree to this -license. If you do not agree to this license, do not download, install, -copy or use the software. - - - License Agreement - For Open Source Computer Vision Library - (3-clause BSD License) - -Copyright (C) 2013, OpenCV Foundation, all rights reserved. -Third party copyrights are property of their respective owners. - -Redistribution and use in source and binary forms, with or without modification, -are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - - * Neither the names of the copyright holders nor the names of the contributors - may be used to endorse or promote products derived from this software - without specific prior written permission. 
- -This software is provided by the copyright holders and contributors "as is" and -any express or implied warranties, including, but not limited to, the implied -warranties of merchantability and fitness for a particular purpose are -disclaimed. In no event shall copyright holders or contributors be liable for -any direct, indirect, incidental, special, exemplary, or consequential damages -(including, but not limited to, procurement of substitute goods or services; -loss of use, data, or profits; or business interruption) however caused -and on any theory of liability, whether in contract, strict liability, -or tort (including negligence or otherwise) arising in any way out of -the use of this software, even if advised of the possibility of such damage. -*/ - -#ifndef __OPENCV_OPTFLOW_HPP__ -#define __OPENCV_OPTFLOW_HPP__ - -#include "opencv2/core.hpp" -#include "opencv2/video.hpp" - -/** -@defgroup optflow Optical Flow Algorithms - -Dense optical flow algorithms compute motion for each point: - -- cv::optflow::calcOpticalFlowSF -- cv::optflow::createOptFlow_DeepFlow - -Motion templates is alternative technique for detecting motion and computing its direction. -See samples/motempl.py. - -- cv::motempl::updateMotionHistory -- cv::motempl::calcMotionGradient -- cv::motempl::calcGlobalOrientation -- cv::motempl::segmentMotion - -Functions reading and writing .flo files in "Middlebury" format, see: - -- cv::optflow::readOpticalFlow -- cv::optflow::writeOpticalFlow - - */ - -#include "opencv2/optflow/pcaflow.hpp" -#include "opencv2/optflow/sparse_matching_gpc.hpp" - -namespace cv -{ -namespace optflow -{ - -//! @addtogroup optflow -//! @{ - -/** @overload */ -CV_EXPORTS_W void calcOpticalFlowSF( InputArray from, InputArray to, OutputArray flow, - int layers, int averaging_block_size, int max_flow); - -/** @brief Calculate an optical flow using "SimpleFlow" algorithm. - -@param from First 8-bit 3-channel image. 
-@param to Second 8-bit 3-channel image of the same size as prev -@param flow computed flow image that has the same size as prev and type CV_32FC2 -@param layers Number of layers -@param averaging_block_size Size of block through which we sum up when calculate cost function -for pixel -@param max_flow maximal flow that we search at each level -@param sigma_dist vector smooth spatial sigma parameter -@param sigma_color vector smooth color sigma parameter -@param postprocess_window window size for postprocess cross bilateral filter -@param sigma_dist_fix spatial sigma for postprocess cross bilateralf filter -@param sigma_color_fix color sigma for postprocess cross bilateral filter -@param occ_thr threshold for detecting occlusions -@param upscale_averaging_radius window size for bilateral upscale operation -@param upscale_sigma_dist spatial sigma for bilateral upscale operation -@param upscale_sigma_color color sigma for bilateral upscale operation -@param speed_up_thr threshold to detect point with irregular flow - where flow should be -recalculated after upscale - -See @cite Tao2012 . And site of project - . - -@note - - An example using the simpleFlow algorithm can be found at samples/simpleflow_demo.cpp - */ -CV_EXPORTS_W void calcOpticalFlowSF( InputArray from, InputArray to, OutputArray flow, int layers, - int averaging_block_size, int max_flow, - double sigma_dist, double sigma_color, int postprocess_window, - double sigma_dist_fix, double sigma_color_fix, double occ_thr, - int upscale_averaging_radius, double upscale_sigma_dist, - double upscale_sigma_color, double speed_up_thr ); - -/** @brief Fast dense optical flow based on PyrLK sparse matches interpolation. - -@param from first 8-bit 3-channel or 1-channel image. -@param to second 8-bit 3-channel or 1-channel image of the same size as from -@param flow computed flow image that has the same size as from and CV_32FC2 type -@param grid_step stride used in sparse match computation. 
Lower values usually - result in higher quality but slow down the algorithm. -@param k number of nearest-neighbor matches considered, when fitting a locally affine - model. Lower values can make the algorithm noticeably faster at the cost of - some quality degradation. -@param sigma parameter defining how fast the weights decrease in the locally-weighted affine - fitting. Higher values can help preserve fine details, lower values can help to get rid - of the noise in the output flow. -@param use_post_proc defines whether the ximgproc::fastGlobalSmootherFilter() is used - for post-processing after interpolation -@param fgs_lambda see the respective parameter of the ximgproc::fastGlobalSmootherFilter() -@param fgs_sigma see the respective parameter of the ximgproc::fastGlobalSmootherFilter() - */ -CV_EXPORTS_W void calcOpticalFlowSparseToDense ( InputArray from, InputArray to, OutputArray flow, - int grid_step = 8, int k = 128, float sigma = 0.05f, - bool use_post_proc = true, float fgs_lambda = 500.0f, - float fgs_sigma = 1.5f ); - -/** @brief DeepFlow optical flow algorithm implementation. - -The class implements the DeepFlow optical flow algorithm described in @cite Weinzaepfel2013 . See -also . 
-Parameters - class fields - that may be modified after creating a class instance: -- member float alpha -Smoothness assumption weight -- member float delta -Color constancy assumption weight -- member float gamma -Gradient constancy weight -- member float sigma -Gaussian smoothing parameter -- member int minSize -Minimal dimension of an image in the pyramid (next, smaller images in the pyramid are generated -until one of the dimensions reaches this size) -- member float downscaleFactor -Scaling factor in the image pyramid (must be \< 1) -- member int fixedPointIterations -How many iterations on each level of the pyramid -- member int sorIterations -Iterations of Succesive Over-Relaxation (solver) -- member float omega -Relaxation factor in SOR - */ -CV_EXPORTS_W Ptr createOptFlow_DeepFlow(); - -//! Additional interface to the SimpleFlow algorithm - calcOpticalFlowSF() -CV_EXPORTS_W Ptr createOptFlow_SimpleFlow(); - -//! Additional interface to the Farneback's algorithm - calcOpticalFlowFarneback() -CV_EXPORTS_W Ptr createOptFlow_Farneback(); - -//! Additional interface to the SparseToDenseFlow algorithm - calcOpticalFlowSparseToDense() -CV_EXPORTS_W Ptr createOptFlow_SparseToDense(); - -/** @brief "Dual TV L1" Optical Flow Algorithm. - -The class implements the "Dual TV L1" optical flow algorithm described in @cite Zach2007 and -@cite Javier2012 . -Here are important members of the class that control the algorithm, which you can set after -constructing the class instance: - -- member double tau - Time step of the numerical scheme. - -- member double lambda - Weight parameter for the data term, attachment parameter. This is the most relevant - parameter, which determines the smoothness of the output. The smaller this parameter is, - the smoother the solutions we obtain. It depends on the range of motions of the images, so - its value should be adapted to each image sequence. - -- member double theta - Weight parameter for (u - v)\^2, tightness parameter. 
It serves as a link between the - attachment and the regularization terms. In theory, it should have a small value in order - to maintain both parts in correspondence. The method is stable for a large range of values - of this parameter. - -- member int nscales - Number of scales used to create the pyramid of images. - -- member int warps - Number of warpings per scale. Represents the number of times that I1(x+u0) and grad( - I1(x+u0) ) are computed per scale. This is a parameter that assures the stability of the - method. It also affects the running time, so it is a compromise between speed and - accuracy. - -- member double epsilon - Stopping criterion threshold used in the numerical scheme, which is a trade-off between - precision and running time. A small value will yield more accurate solutions at the - expense of a slower convergence. - -- member int iterations - Stopping criterion iterations number used in the numerical scheme. - -C. Zach, T. Pock and H. Bischof, "A Duality Based Approach for Realtime TV-L1 Optical Flow". -Javier Sanchez, Enric Meinhardt-Llopis and Gabriele Facciolo. "TV-L1 Optical Flow Estimation". -*/ -class CV_EXPORTS_W DualTVL1OpticalFlow : public DenseOpticalFlow -{ -public: - //! @brief Time step of the numerical scheme - /** @see setTau */ - CV_WRAP virtual double getTau() const = 0; - /** @copybrief getTau @see getTau */ - CV_WRAP virtual void setTau(double val) = 0; - //! @brief Weight parameter for the data term, attachment parameter - /** @see setLambda */ - CV_WRAP virtual double getLambda() const = 0; - /** @copybrief getLambda @see getLambda */ - CV_WRAP virtual void setLambda(double val) = 0; - //! @brief Weight parameter for (u - v)^2, tightness parameter - /** @see setTheta */ - CV_WRAP virtual double getTheta() const = 0; - /** @copybrief getTheta @see getTheta */ - CV_WRAP virtual void setTheta(double val) = 0; - //! 
@brief coefficient for additional illumination variation term - /** @see setGamma */ - CV_WRAP virtual double getGamma() const = 0; - /** @copybrief getGamma @see getGamma */ - CV_WRAP virtual void setGamma(double val) = 0; - //! @brief Number of scales used to create the pyramid of images - /** @see setScalesNumber */ - CV_WRAP virtual int getScalesNumber() const = 0; - /** @copybrief getScalesNumber @see getScalesNumber */ - CV_WRAP virtual void setScalesNumber(int val) = 0; - //! @brief Number of warpings per scale - /** @see setWarpingsNumber */ - CV_WRAP virtual int getWarpingsNumber() const = 0; - /** @copybrief getWarpingsNumber @see getWarpingsNumber */ - CV_WRAP virtual void setWarpingsNumber(int val) = 0; - //! @brief Stopping criterion threshold used in the numerical scheme, which is a trade-off between precision and running time - /** @see setEpsilon */ - CV_WRAP virtual double getEpsilon() const = 0; - /** @copybrief getEpsilon @see getEpsilon */ - CV_WRAP virtual void setEpsilon(double val) = 0; - //! @brief Inner iterations (between outlier filtering) used in the numerical scheme - /** @see setInnerIterations */ - CV_WRAP virtual int getInnerIterations() const = 0; - /** @copybrief getInnerIterations @see getInnerIterations */ - CV_WRAP virtual void setInnerIterations(int val) = 0; - //! @brief Outer iterations (number of inner loops) used in the numerical scheme - /** @see setOuterIterations */ - CV_WRAP virtual int getOuterIterations() const = 0; - /** @copybrief getOuterIterations @see getOuterIterations */ - CV_WRAP virtual void setOuterIterations(int val) = 0; - //! @brief Use initial flow - /** @see setUseInitialFlow */ - CV_WRAP virtual bool getUseInitialFlow() const = 0; - /** @copybrief getUseInitialFlow @see getUseInitialFlow */ - CV_WRAP virtual void setUseInitialFlow(bool val) = 0; - //! 
@brief Step between scales (<1) - /** @see setScaleStep */ - CV_WRAP virtual double getScaleStep() const = 0; - /** @copybrief getScaleStep @see getScaleStep */ - CV_WRAP virtual void setScaleStep(double val) = 0; - //! @brief Median filter kernel size (1 = no filter) (3 or 5) - /** @see setMedianFiltering */ - CV_WRAP virtual int getMedianFiltering() const = 0; - /** @copybrief getMedianFiltering @see getMedianFiltering */ - CV_WRAP virtual void setMedianFiltering(int val) = 0; - - /** @brief Creates instance of cv::DualTVL1OpticalFlow*/ - CV_WRAP static Ptr create( - double tau = 0.25, - double lambda = 0.15, - double theta = 0.3, - int nscales = 5, - int warps = 5, - double epsilon = 0.01, - int innnerIterations = 30, - int outerIterations = 10, - double scaleStep = 0.8, - double gamma = 0.0, - int medianFiltering = 5, - bool useInitialFlow = false); -}; - -/** @brief Creates instance of cv::DenseOpticalFlow -*/ -CV_EXPORTS_W Ptr createOptFlow_DualTVL1(); - -//! @} - -} //optflow -} - -#include "opencv2/optflow/motempl.hpp" - -#endif diff --git a/modules/optflow/perf/perf_rlof.cpp b/modules/optflow/perf/perf_rlof.cpp index ea48a62be59..4c87f51ef78 100644 --- a/modules/optflow/perf/perf_rlof.cpp +++ b/modules/optflow/perf/perf_rlof.cpp @@ -1,89 +1,88 @@ #include "perf_precomp.hpp" namespace opencv_test { - namespace { - - typedef tuple ST_SR_IM_Sparse_t; - typedef TestBaseWithParam ST_SR_IM_Sparse; - PERF_TEST_P(ST_SR_IM_Sparse, OpticalFlow_SparseRLOF, - testing::Combine( - testing::Values("ST_BILINEAR", "ST_STANDART"), - testing::Values("SR_CROSS", "SR_FIXED"), - testing::Values(true, false)) - ) - { - Size sz(640,480); - string frame1_path = "B:/workspace/opencv_extra/testdata/cv/optflow/RubberWhale1.png"; - string frame2_path = "B:/workspace/opencv_extra/testdata/cv/optflow/RubberWhale2.png"; - frame1_path.erase(std::remove_if(frame1_path.begin(), frame1_path.end(), isspace), frame1_path.end()); - frame2_path.erase(std::remove_if(frame2_path.begin(), 
frame2_path.end(), isspace), frame2_path.end()); - Mat frame1 = imread(frame1_path); - Mat frame2 = imread(frame2_path); - ASSERT_FALSE(frame1.empty()); - ASSERT_FALSE(frame2.empty()); - vector prevPts, currPts; - for (int r = 0; r < frame1.rows; r += 10) - { - for (int c = 0; c < frame1.cols; c += 10) - { - prevPts.push_back(Point2f(static_cast(c), static_cast(r))); - } - } - vector status(prevPts.size()); - vector err(prevPts.size()); + namespace { + + typedef tuple ST_SR_IM_Sparse_t; + typedef TestBaseWithParam ST_SR_IM_Sparse; + PERF_TEST_P(ST_SR_IM_Sparse, OpticalFlow_SparseRLOF, + testing::Combine( + testing::Values("ST_BILINEAR", "ST_STANDART"), + testing::Values("SR_CROSS", "SR_FIXED"), + testing::Values(true, false)) + ) + { + string frame1_path = TS::ptr()->get_data_path() + "/cv/optflow/RubberWhale1.png"; + string frame2_path = TS::ptr()->get_data_path() + "/cv/optflow/RubberWhale2.png"; + frame1_path.erase(std::remove_if(frame1_path.begin(), frame1_path.end(), isspace), frame1_path.end()); + frame2_path.erase(std::remove_if(frame2_path.begin(), frame2_path.end(), isspace), frame2_path.end()); + Mat frame1 = imread(frame1_path); + Mat frame2 = imread(frame2_path); + ASSERT_FALSE(frame1.empty()); + ASSERT_FALSE(frame2.empty()); + vector prevPts, currPts; + for (int r = 0; r < frame1.rows; r += 10) + { + for (int c = 0; c < frame1.cols; c += 10) + { + prevPts.push_back(Point2f(static_cast(c), static_cast(r))); + } + } + vector status(prevPts.size()); + vector err(prevPts.size()); - RLOFOpticalFlowParameter param; - if (get<0>(GetParam()) == "ST_BILINEAR") - param.solverType = RLOFOpticalFlowParameter::ST_BILINEAR; - if (get<0>(GetParam()) == "ST_STANDART") - param.solverType = RLOFOpticalFlowParameter::ST_STANDART; - if (get<1>(GetParam()) == "SR_CROSS") - param.supportRegionType = RLOFOpticalFlowParameter::SR_CROSS; - if (get<1>(GetParam()) == "SR_FIXED") - param.supportRegionType = RLOFOpticalFlowParameter::SR_FIXED; - param.useIlluminationModel = 
get<2>(GetParam()); + Ptr param; + if (get<0>(GetParam()) == "ST_BILINEAR") + param->solverType = ST_BILINEAR; + if (get<0>(GetParam()) == "ST_STANDART") + param->solverType = ST_STANDART; + if (get<1>(GetParam()) == "SR_CROSS") + param->supportRegionType = SR_CROSS; + if (get<1>(GetParam()) == "SR_FIXED") + param->supportRegionType = SR_FIXED; + param->useIlluminationModel = get<2>(GetParam()); - TEST_CYCLE_N(1) - { - calcOpticalFlowSparseRLOF(frame1, frame2, prevPts, currPts, status, err, param, 1.f); - } + TEST_CYCLE_N(1) + { + calcOpticalFlowSparseRLOF(frame1, frame2, prevPts, currPts, status, err, param, 1.f); + } - SANITY_CHECK_NOTHING(); - } + SANITY_CHECK_NOTHING(); + } - typedef tuple INTERP_GRID_Dense_t; - typedef TestBaseWithParam INTERP_GRID_Dense; - PERF_TEST_P(INTERP_GRID_Dense, OpticalFlow_DenseRLOF, - testing::Combine( - testing::Values("INTERP_EPIC", "INTERP_GEO"), - testing::Values(4,10,50)) - ) - { - Mat flow; - string frame1_path = TS::ptr()->get_data_path() + "/cv/optflow/RubberWhale1.png"; - string frame2_path = TS::ptr()->get_data_path() + "/cv/optflow/RubberWhale2.png"; - // removing space may be an issue on windows machines - frame1_path.erase(std::remove_if(frame1_path.begin(), frame1_path.end(), isspace), frame1_path.end()); - frame2_path.erase(std::remove_if(frame2_path.begin(), frame2_path.end(), isspace), frame2_path.end()); - Mat frame1 = imread(frame1_path); - Mat frame2 = imread(frame2_path); - ASSERT_FALSE(frame1.empty()); - ASSERT_FALSE(frame2.empty()); - - RLOFOpticalFlowParameter param; - Ptr< DenseRLOFOpticalFlow> algo = DenseRLOFOpticalFlow::create(); - DenseRLOFOpticalFlow::InterpolationType interp_type; - if (get<0>(GetParam()) == "INTERP_EPIC") - interp_type = DenseRLOFOpticalFlow::InterpolationType::INTERP_EPIC; - if (get<0>(GetParam()) == "INTERP_GEO") - interp_type = DenseRLOFOpticalFlow::InterpolationType::INTERP_GEO; - + typedef tuple INTERP_GRID_Dense_t; + typedef TestBaseWithParam INTERP_GRID_Dense; + 
PERF_TEST_P(INTERP_GRID_Dense, OpticalFlow_DenseRLOF, + testing::Combine( + testing::Values("INTERP_EPIC", "INTERP_GEO"), + testing::Values(4,10,50)) + ) + { + Mat flow; + string frame1_path = TS::ptr()->get_data_path() + "/cv/optflow/RubberWhale1.png"; + string frame2_path = TS::ptr()->get_data_path() + "/cv/optflow/RubberWhale2.png"; + // removing space may be an issue on windows machines + frame1_path.erase(std::remove_if(frame1_path.begin(), frame1_path.end(), isspace), frame1_path.end()); + frame2_path.erase(std::remove_if(frame2_path.begin(), frame2_path.end(), isspace), frame2_path.end()); + Mat frame1 = imread(frame1_path); + Mat frame2 = imread(frame2_path); + ASSERT_FALSE(frame1.empty()); + ASSERT_FALSE(frame2.empty()); + + Ptr param; + Ptr< DenseRLOFOpticalFlow> algo = DenseRLOFOpticalFlow::create(); + InterpolationType interp_type; + if (get<0>(GetParam()) == "INTERP_EPIC") + interp_type = INTERP_EPIC; + if (get<0>(GetParam()) == "INTERP_GEO") + interp_type = INTERP_GEO; + - TEST_CYCLE_N(5) - { - calcOpticalFlowDenseRLOF(frame1, frame2,flow, param, 1.0f, Size(get<1>(GetParam()), get<1>(GetParam())), interp_type); - } + TEST_CYCLE_N(5) + { + calcOpticalFlowDenseRLOF(frame1, frame2,flow, param, 1.0f, Size(get<1>(GetParam()), get<1>(GetParam())), interp_type); + } - SANITY_CHECK_NOTHING(); - } - } + SANITY_CHECK_NOTHING(); + } + } } // namespace diff --git a/modules/optflow/src/rlof/berlof_invoker.hpp b/modules/optflow/src/rlof/berlof_invoker.hpp index d903d1733db..437e807d810 100644 --- a/modules/optflow/src/rlof/berlof_invoker.hpp +++ b/modules/optflow/src/rlof/berlof_invoker.hpp @@ -1,44 +1,6 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// - // - // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. - // - // By downloading, copying, installing or using the software you agree to this license. - // If you do not agree to this license, do not download, install, - // copy or use the software. 
- // - // - // License Agreement - // For Open Source Computer Vision Library - // - // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. - // Copyright (C) 2009, Willow Garage Inc., all rights reserved. - // Third party copyrights are property of their respective owners. - // - // Redistribution and use in source and binary forms, with or without modification, - // are permitted provided that the following conditions are met: - // - // * Redistribution's of source code must retain the above copyright notice, - // this list of conditions and the following disclaimer. - // - // * Redistribution's in binary form must reproduce the above copyright notice, - // this list of conditions and the following disclaimer in the documentation - // and/or other materials provided with the distribution. - // - // * The name of the copyright holders may not be used to endorse or promote products - // derived from this software without specific prior written permission. - // - // This software is provided by the copyright holders and contributors "as is" and - // any express or implied warranties, including, but not limited to, the implied - // warranties of merchantability and fitness for a particular purpose are disclaimed. - // In no event shall the Intel Corporation or contributors be liable for any direct, - // indirect, incidental, special, exemplary, or consequential damages - // (including, but not limited to, procurement of substitute goods or services; - // loss of use, data, or profits; or business interruption) however caused - // and on any theory of liability, whether in contract, strict liability, - // or tort (including negligence or otherwise) arising in any way out of - // the use of this software, even if advised of the possibility of such damage. - // - //M*/ +// This file is part of OpenCV project. 
+// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. #ifndef _BERLOF_INVOKER_HPP_ #define _BERLOF_INVOKER_HPP_ #include "rlof_invokerbase.hpp" @@ -46,2184 +8,2122 @@ namespace cv{ namespace optflow{ - - bool checkSolution(float x, float y, float * c ) - { - float _a = x - 0.002f; - float _b = y - 0.002f; - cv::Point2f tl( c[0] * _a * _b + c[1] * _a + c[2] * _b + c[3], c[4] * _a * _b + c[5] * _a + c[6] * _b + c[7]); - _a = x + 0.002f; - cv::Point2f tr( c[0] * _a * _b + c[1] * _a + c[2] * _b + c[3], c[4] * _a * _b + c[5] * _a + c[6] * _b + c[7]); - _b = y + 0.002f; - cv::Point2f br( c[0] * _a * _b + c[1] * _a + c[2] * _b + c[3], c[4] * _a * _b + c[5] * _a + c[6] * _b + c[7]); - _a = x - 0.002f; - cv::Point2f bl( c[0] * _a * _b + c[1] * _a + c[2] * _b + c[3], c[4] * _a * _b + c[5] * _a + c[6] * _b + c[7]); - return (tl.x >= 0 && tl.y >= 0) && (tr.x <= 0 && tr.y >= 0) - && (bl.x >= 0 && bl.y <= 0) && (br.x <= 0 && br.y <= 0); - } - - inline cv::Point2f est_DeltaGain( const cv::Mat & src, const cv::Vec4f & val) - { - return cv::Point2f( - src.at(2,0) * val[0] + src.at(2,1) * val[1] + src.at(2,2) * val[2] + src.at(2,3) * val[3], - src.at(3,0) * val[0] + src.at(3,1) * val[1] + src.at(3,2) * val[2] + src.at(3,3) * val[3]); - } - inline void est_Result( const cv::Mat & src, const cv::Vec4f & val, cv::Point2f & delta, cv::Point2f & gain) - { - - delta = cv::Point2f( - -(src.at(0,0) * val[0] + src.at(0,1) * val[1] + src.at(0,2) * val[2] + src.at(0,3) * val[3]), - -(src.at(1,0) * val[0] + src.at(1,1) * val[1] + src.at(1,2) * val[2] + src.at(1,3) * val[3])); - - gain = cv::Point2f( - src.at(2,0) * val[0] + src.at(2,1) * val[1] + src.at(2,2) * val[2] + src.at(2,3) * val[3], - src.at(3,0) * val[0] + src.at(3,1) * val[1] + src.at(3,2) * val[2] + src.at(3,3) * val[3]); - } - - namespace berlof - { - - namespace ica - { - - class TrackerInvoker : public cv::ParallelLoopBody - { - 
public: - TrackerInvoker( - const Mat& _prevImg, - const Mat& _prevDeriv, - const Mat& _nextImg, - const Mat& _rgbPrevImg, - const Mat& _rgbNextImg, - const Point2f* _prevPts, - Point2f* _nextPts, - uchar* _status, - float* _err, - int _level, - int _maxLevel, - int _winSize[2], - int _maxIteration, - bool _useInitialFlow, - int _supportRegionType, - int _crossSegmentationThreshold, - std::vector _normSigmas, - float _minEigenValue) - { - prevImg = &_prevImg; - prevDeriv = &_prevDeriv; - nextImg = &_nextImg; - rgbPrevImg = &_rgbPrevImg; - rgbNextImg = &_rgbNextImg; - prevPts = _prevPts; - nextPts = _nextPts; - status = _status; - err = _err; - minWinSize = _winSize[0]; - maxWinSize = _winSize[1]; - criteria.maxCount = _maxIteration; - criteria.epsilon = 0.01; - level = _level; - maxLevel = _maxLevel; - windowType = _supportRegionType; - minEigThreshold = _minEigenValue; - paramReset = _normSigmas; - useInitialFlow = _useInitialFlow; - crossSegmentationThreshold = _crossSegmentationThreshold; - } - - TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; - void operator()(const cv::Range& range) const - { - - Point2f halfWin; - cv::Size winSize; - const Mat& I = *prevImg; - const Mat& J = *nextImg; - const Mat& derivI = *prevDeriv; - const Mat& BI = *rgbPrevImg; - - - const float FLT_SCALE = (1.f/(1 << 16)); - - winSize = cv::Size(maxWinSize,maxWinSize); - int winMaskwidth = static_cast(ceil(winSize.width / 16.f)) * 16; - cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); - winMaskMatBuf.setTo(1); - - std::vector param = paramReset; - int j, cn = I.channels(), cn2 = cn*2; - int winbufwidth = static_cast(ceil(winSize.width / 16.f)) * 16; - cv::Size winBufSize(winbufwidth,winbufwidth); - - cv::AutoBuffer _buf(winBufSize.area()*(cn + cn2)); - int derivDepth = DataType::depth; - - Mat IWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn), (deriv_type*)_buf.data()); - Mat derivIWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn2), (deriv_type*)_buf.data() 
+ winBufSize.area()*cn); - - - for( int ptidx = range.start; ptidx < range.end; ptidx++ ) - { - Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level)); - Point2f nextPt; - if( level == maxLevel ) - { - if( useInitialFlow ) - nextPt = nextPts[ptidx]*(float)(1./(1 << level)); - else - nextPt = prevPt; - } - else - nextPt = nextPts[ptidx]*2.f; - nextPts[ptidx] = nextPt; - - Point2i iprevPt, inextPt; - iprevPt.x = cvFloor(prevPt.x); - iprevPt.y = cvFloor(prevPt.y); - int winArea = maxWinSize * maxWinSize; - cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0,0, maxWinSize,maxWinSize)); - winMaskMatBuf.setTo(0); - if( calcWinMaskMat(BI, windowType, iprevPt, - winMaskMat,winSize,halfWin,winArea, - minWinSize,maxWinSize, crossSegmentationThreshold) == false) - continue; - - prevPt -= halfWin; - iprevPt.x = cvFloor(prevPt.x); - iprevPt.y = cvFloor(prevPt.y); - - if( iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols || - iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows ) - { - if( level == 0 ) - { - if( status ) - status[ptidx] = 3; - if( err ) - err[ptidx] = 0; - } - continue; - } - - float a = prevPt.x - iprevPt.x; - float b = prevPt.y - iprevPt.y; - const int W_BITS = 14, W_BITS1 = 14; - - int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); - int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); - int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); - int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - - int dstep = (int)(derivI.step/derivI.elemSize1()); - int step = (int)(I.step/I.elemSize1()); - int mStep = (int)(winMaskMat.step/winMaskMat.elemSize1()); - CV_Assert( step == (int)(J.step/J.elemSize1()) ); - - float A11 = 0, A12 = 0, A22 = 0; - float b1 = 0, b2 = 0; - float D = 0; - - // extract the patch from the first image, compute covariation matrix of derivatives - int x, y; - for( y = 0; y < winSize.height; y++ ) - { - const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; - const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + 
iprevPt.x*cn2; - - short* Iptr = (short*)(IWinBuf.data + y*IWinBuf.step); - short* dIptr = (short*)(derivIWinBuf.data + y*derivIWinBuf.step); - x = 0; - for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2 ) - { - if( winMaskMat.at(y,x) == 0) - { - dIptr[0] = 0; - dIptr[1] = 0; - continue; - } - int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + - src[x+step]*iw10 + src[x+step+cn]*iw11, W_BITS1-5); - int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + - dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1); - int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 + - dsrc[dstep+cn2+1]*iw11, W_BITS1); - Iptr[x] = (short)ival; - dIptr[0] = (short)ixval; - dIptr[1] = (short)iyval; - } - } - - cv::Mat residualMat = cv::Mat::zeros(winSize.height * (winSize.width + 8) * cn, 1, CV_16SC1); - cv::Point2f backUpNextPt = nextPt; - nextPt -= halfWin; - Point2f prevDelta(0,0); //denotes h(t-1) - cv::Size _winSize = winSize; - __m128i mmMask0, mmMask1, mmMask; - getWBitMask(_winSize.width, mmMask0, mmMask1, mmMask); - __m128 mmOnes = _mm_set1_ps(1.f ); - __m128i mmOnes_epi16 = _mm_set1_epi16(1 ); - __m128i mmZeros_epi16 = _mm_set1_epi16(0); - float MEstimatorScale = 1; - int buffIdx = 0; - float c[8]; - cv::Mat GMc0, GMc1, GMc2, GMc3; - cv::Vec2f Mc0, Mc1, Mc2, Mc3; - int noIteration = 0; - int noReusedIteration = 0; - int noSolvedIteration = 0; - for( j = 0; j < criteria.maxCount; j++ ) - { - cv::Point2f delta(0,0); - cv::Point2f deltaGain(0,0); - bool hasSolved = false; - a = nextPt.x - inextPt.x; - b = nextPt.y - inextPt.y; - float ab = a * b; - if( j == 0 - || ( inextPt.x != cvFloor(nextPt.x) || inextPt.y != cvFloor(nextPt.y) || j % 2 != 0 )) - { - inextPt.x = cvFloor(nextPt.x); - inextPt.y = cvFloor(nextPt.y); - - if( inextPt.x < -winSize.width || inextPt.x >= J.cols || - inextPt.y < -winSize.height || inextPt.y >= J.rows ) - { - if( level == 0 && status ) - status[ptidx] = 3; - break; - } - - - a = nextPt.x - inextPt.x; - b = nextPt.y - inextPt.y; 
- ab = a * b; - iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); - iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); - iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); - iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - // mismatch - b1 = 0; - b2 = 0; - if( j == 0 ) - { - A11 = 0; - A12 = 0; - A22 = 0; - } - - if ( j == 0 ) - { - buffIdx = 0; - for( y = 0; y < winSize.height; y++ ) - { - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y )*step + (inextPt.x )*cn; - const short* Iptr = (const short*)(IWinBuf.data + (y)*IWinBuf.step); - const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - x = 0; - for( ; x < winSize.width*cn; x++, dIptr += 2) - { - if( maskPtr[x] == 0) - continue; - int diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, W_BITS1-5) - - Iptr[x]; - residualMat.at(buffIdx++) = static_cast(diff); - } - } - /*! Estimation for the residual */ - cv::Mat residualMatRoi(residualMat, cv::Rect(0,0,1, buffIdx)); - MEstimatorScale = (buffIdx == 0) ? 
1.f : estimateScale(residualMatRoi); - } - - float eta = 1.f / winArea; - float fParam0 = param[0] * 32.f; - float fParam1 = param[1] * 32.f; - fParam0 = param[0] * MEstimatorScale; - fParam1 = param[1] * MEstimatorScale; - - - buffIdx = 0; - float _b0[4] = {0,0,0,0}; - float _b1[4] = {0,0,0,0}; - - /* - */ - for( y = 0; y < _winSize.height; y++ ) - { - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x )*cn; - const short* Iptr = (const short*)(IWinBuf.data + y*IWinBuf.step); - const short* dIptr = (const short*)(derivIWinBuf.data + y*derivIWinBuf.step); - - x = 0; - for( ; x < _winSize.width*cn; x++, dIptr += 2 ) - { - if( maskPtr[x] == 0) - continue; - int illValue = - Iptr[x]; - - float It[4] = {static_cast((Jptr[x+step+cn]<< 5) + illValue), - static_cast((Jptr[x+cn]<< 5) + illValue), - static_cast((Jptr[x+step]<< 5) + illValue), - static_cast((Jptr[x] << 5) + illValue)}; - - - - int J_val = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + - Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, - W_BITS1-5); - - - int diff = J_val + illValue; - - - MEstimatorScale += (diff < MEstimatorScale) ? -eta : eta; - - int abss = (diff < 0) ? 
-diff : diff; - - // compute the missmatch vector - if( j >= 0) - { - if( abss > fParam1) - { - It[0] = 0; - It[1] = 0; - It[2] = 0; - It[3] = 0; - } - else if( abss > fParam0 && diff >= 0 ) - { - It[0] = param[2] * (It[0] - fParam1); - It[1] = param[2] * (It[1] - fParam1); - It[2] = param[2] * (It[2] - fParam1); - It[3] = param[2] * (It[3] - fParam1); - } - else if( abss > fParam0 && diff < 0 ) - { - It[0] = param[2] * (It[0] + fParam1); - It[1] = param[2] * (It[1] + fParam1); - It[2] = param[2] * (It[2] + fParam1); - It[3] = param[2] * (It[3] + fParam1); - } - } - - float It0 = It[0]; - float It1 = It[1]; - float It2 = It[2]; - float It3 = It[3]; - - float ixval = static_cast(dIptr[0]); - float iyval = static_cast(dIptr[1]); - _b0[0] += It0 * ixval; - _b0[1] += It1 * ixval; - _b0[2] += It2 * ixval; - _b0[3] += It3 * ixval; - - - _b1[0] += It0*iyval; - _b1[1] += It1*iyval; - _b1[2] += It2*iyval; - _b1[3] += It3*iyval; - - - // compute the Gradient Matrice - if( j == 0) - { - float tale = param[2] * FLT_RESCALE; - if( abss < fParam0 || j < 0) - { - tale = FLT_RESCALE; - } - else if( abss > fParam1) - { - tale *= 0.01f; - } - else - { - tale *= param[2]; - } - - A11 += (float)(ixval*ixval)*tale; - A12 += (float)(ixval*iyval)*tale; - A22 += (float)(iyval*iyval)*tale; - - } - } - } - - if( j == 0 ) - { - - A11 *= FLT_SCALE; // 54866744. 
- A12 *= FLT_SCALE; // -628764.00 - A22 *= FLT_SCALE; // 19730.000 - - D = A11 * A22 - A12 * A12; - float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + - 4.f*A12*A12))/(2*winArea); - - if( minEig < minEigThreshold || std::abs(D) < FLT_EPSILON) - { - if( level == 0 && status ) - status[ptidx] = 0; - if( level > 0) - { - nextPts[ptidx] = backUpNextPt; - } - noIteration++; - break; - } - - D = (1.f / D); - - } - - _b0[0] *= FLT_SCALE;_b0[1] *= FLT_SCALE;_b0[2] *= FLT_SCALE;_b0[3] *= FLT_SCALE; - _b1[0] *= FLT_SCALE;_b1[1] *= FLT_SCALE;_b1[2] *= FLT_SCALE;_b1[3] *= FLT_SCALE; - - - Mc0[0] = _b0[0] - _b0[1] - _b0[2] + _b0[3]; - Mc0[1] = _b1[0] - _b1[1] - _b1[2] + _b1[3]; - - Mc1[0] = _b0[1] - _b0[3]; - Mc1[1] = _b1[1] - _b1[3]; - - - Mc2[0] = _b0[2] - _b0[3]; - Mc2[1] = _b1[2] - _b1[3]; - - - Mc3[0] = _b0[3]; - Mc3[1] = _b1[3]; - - c[0] = -Mc0[0]; - c[1] = -Mc1[0]; - c[2] = -Mc2[0]; - c[3] = -Mc3[0]; - c[4] = -Mc0[1]; - c[5] = -Mc1[1]; - c[6] = -Mc2[1]; - c[7] = -Mc3[1]; - - float e0 = 1.f / (c[6] * c[0] - c[4] * c[2]); - float e1 = e0 * 0.5f * (c[6] * c[1] + c[7] * c[0] - c[5] * c[2] - c[4] * c[3]); - float e2 = e0 * (c[1] * c[7] -c[3] * c[5]); - e0 = e1 * e1 - e2; - hasSolved = false; - if ( e0 > 0) - { - e0 = sqrt(e0); - float _y[2] = {-e1 - e0, e0 - e1}; - float c0yc1[2] = {c[0] * _y[0] + c[1], - c[0] * _y[1] + c[1]}; - float _x[2] = {-(c[2] * _y[0] + c[3]) / c0yc1[0], - -(c[2] * _y[1] + c[3]) / c0yc1[1]}; - float c0yc2[2] = {c[0] * _y[0] + c[2], - c[0] * _y[1] + c[2]}; - - float c4yc6[2] = {c[4] * _y[0] + c[6], - c[4] * _y[1] + c[6]}; - - float c0xc1[2] = {c[0] * _x[0] + c[1], - c[0] * _x[1] + c[1]}; - - float c4xc5[2] = {c[4] * _x[0] + c[5], - c[4] * _x[1] + c[5]}; - bool isIn1 = (_x[0] >=0 && _x[0] <=1 && _y[0] >= 0 && _y[0] <= 1); - bool isIn2 = (_x[1] >=0 && _x[1] <=1 && _y[1] >= 0 && _y[1] <= 1); - - bool isSolution1 = checkSolution(_x[0], _y[0], c ); - bool isSolution2 = checkSolution(_x[1], _y[1], c ); - bool isSink1 = isIn1 && isSolution1; - bool 
isSink2 = isIn2 && isSolution2; - - if ( isSink1 != isSink2) - { - a = isSink1 ? _x[0] : _x[1]; - b = isSink1 ? _y[0] : _y[1]; - ab = a * b; - hasSolved = true; - delta.x = inextPt.x + a - nextPt.x; - delta.y = inextPt.y + b - nextPt.y; - } // isIn1 != isIn2 - } - if( hasSolved == false) - noIteration++; - } - else - { - hasSolved = false; - noReusedIteration++; - } - if( hasSolved == false ) - { - - cv::Vec2f mismatchVec = ab * Mc0 + Mc1 *a + Mc2 * b + Mc3; - delta.x = (A12 * mismatchVec.val[1] - A22 * mismatchVec.val[0]) * D; - delta.y = (A12 * mismatchVec.val[0] - A11 * mismatchVec.val[1]) * D; - delta.x = MAX(-1.f, MIN(1.f, delta.x)); - delta.y = MAX(-1.f, MIN(1.f, delta.y)); - nextPt += delta; - nextPts[ptidx] = nextPt + halfWin; - } - else - { - nextPt += delta; - nextPts[ptidx] = nextPt + halfWin; - noSolvedIteration++; - break; - } - - delta.x = ( delta.x != delta.x) ? 0 : delta.x; - delta.y = ( delta.y != delta.y) ? 0 : delta.y; - - - cv::Point2f totalDistance = backUpNextPt - nextPts[ptidx]; - - if(j > 0 && ( - (std::abs(delta.x - prevDelta.x) < 0.01 && std::abs(delta.y - prevDelta.y) < 0.01))) - { - nextPts[ptidx] -= delta*0.5f; - break; - } - - - prevDelta = delta; - } - - } - - } - - const Mat* prevImg; - const Mat* nextImg; - const Mat* prevDeriv; - const Mat* rgbPrevImg; - const Mat* rgbNextImg; - const Point2f* prevPts; - Point2f* nextPts; - uchar* status; - float* err; - int maxWinSize; - int minWinSize; - TermCriteria criteria; - int level; - int maxLevel; - int windowType; - float minEigThreshold; - bool useInitialFlow; - std::vector paramReset; - int crossSegmentationThreshold; - - }; - } - namespace radial - { - class TrackerInvoker : public cv::ParallelLoopBody - { - public: - TrackerInvoker( - const Mat& _prevImg, - const Mat& _prevDeriv, - const Mat& _nextImg, - const Mat& _rgbPrevImg, - const Mat& _rgbNextImg, - const Point2f* _prevPts, - Point2f* _nextPts, - uchar* _status, - float* _err, - Point2f* _gainVecs, - int _level, - int 
_maxLevel, - int _winSize[2], - int _maxIteration, - bool _useInitialFlow, - int _supportRegionType, - int _crossSegmentationThreshold, - std::vector _normSigmas, - float _minEigenValue) - { - prevImg = &_prevImg; - prevDeriv = &_prevDeriv; - nextImg = &_nextImg; - rgbPrevImg = &_rgbPrevImg; - rgbNextImg = &_rgbNextImg; - prevPts = _prevPts; - nextPts = _nextPts; - status = _status; - err = _err; - gainVecs = _gainVecs; - minWinSize = _winSize[0]; - maxWinSize = _winSize[1]; - criteria.maxCount = _maxIteration; - criteria.epsilon = 0.01; - level = _level; - maxLevel = _maxLevel; - windowType = _supportRegionType; - minEigThreshold = _minEigenValue; - paramReset = _normSigmas; - useInitialFlow = _useInitialFlow; - crossSegmentationThreshold = _crossSegmentationThreshold; - } - - TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; - void operator()(const cv::Range& range) const - { - Point2f halfWin; - cv::Size winSize; - const Mat& I = *prevImg; - const Mat& J = *nextImg; - const Mat& derivI = *prevDeriv; - const Mat& BI = *rgbPrevImg; - const float FLT_SCALE = (1.f/(1 << 16));//(1.f/(1 << 20)); // 20 - winSize = cv::Size(maxWinSize,maxWinSize); - int winMaskwidth = static_cast(ceil(winSize.width / 16.f)) * 16; - cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); - winMaskMatBuf.setTo(1); - - std::vector param = paramReset; - int cn = I.channels(), cn2 = cn*2; - int winbufwidth = static_cast(ceil(winSize.width / 16.f)) * 16; - cv::Size winBufSize(winbufwidth,winbufwidth); - - - cv::Mat invTensorMat(4,4, CV_32FC1); - cv::Mat mismatchMat(4, 1, CV_32FC1); - cv::Mat resultMat(4, 1, CV_32FC1); - - cv::AutoBuffer _buf(winBufSize.area()*(cn + cn2)); - int derivDepth = DataType::depth; - - Mat IWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn), (deriv_type*)_buf.data()); - Mat derivIWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn2), (deriv_type*)_buf.data() + winBufSize.area()*cn); - - for( int ptidx = range.start; ptidx < range.end; ptidx++ ) - { - 
Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level)); - Point2f nextPt; - if( level == maxLevel ) - { - if( useInitialFlow ) - nextPt = nextPts[ptidx]*(float)(1./(1 << level)); - else - nextPt = prevPt; - } - else - nextPt = nextPts[ptidx]*2.f; - nextPts[ptidx] = nextPt; - - Point2i iprevPt, inextPt; - iprevPt.x = cvFloor(prevPt.x); - iprevPt.y = cvFloor(prevPt.y); - int winArea = maxWinSize * maxWinSize; - cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0,0, maxWinSize,maxWinSize)); - winMaskMatBuf.setTo(0); - if( calcWinMaskMat(BI, windowType, iprevPt, - winMaskMat,winSize,halfWin,winArea, - minWinSize,maxWinSize, crossSegmentationThreshold) == false) - continue; - - prevPt -= halfWin; - iprevPt.x = cvFloor(prevPt.x); - iprevPt.y = cvFloor(prevPt.y); - - if( iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols || - iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows ) - { - if( level == 0 ) - { - if( status ) - status[ptidx] = 3; - if( err ) - err[ptidx] = 0; - } - continue; - } - - float a = prevPt.x - iprevPt.x; - float b = prevPt.y - iprevPt.y; - const int W_BITS = 14, W_BITS1 = 14; - - int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); - int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); - int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); - int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - - int dstep = (int)(derivI.step/derivI.elemSize1()); - int step = (int)(I.step/I.elemSize1()); - int mStep = (int)(winMaskMat.step/winMaskMat.elemSize1()); - CV_Assert( step == (int)(J.step/J.elemSize1()) ); - - float A11 = 0, A12 = 0, A22 = 0; - float b1 = 0, b2 = 0, b3 = 0, b4 = 0; - - // tensor - float sumIx = 0; - float sumIy = 0; - float sumI = 0; - float sumW = 0; - float w1 = 0, w2 = 0; // -IyI - float dI = 0; // I^2 - float D = 0; - - #ifdef CV_SSE2 - __m128i rightMask = _mm_set_epi16(0, std::numeric_limits::max(), 0, std::numeric_limits::max(), 0, std::numeric_limits::max(), 0, std::numeric_limits::max()); - __m128i leftMask = 
_mm_set_epi16(std::numeric_limits::max(), 0, std::numeric_limits::max(), 0, std::numeric_limits::max(), 0, std::numeric_limits::max(), 0); - - __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); - __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); - __m128i z = _mm_setzero_si128(); - __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1-1)); - __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1-5-1)); - __m128i mmMask4_epi32; - __m128i mmMaskSet_epi16 = _mm_set1_epi16(std::numeric_limits::max()); - get4BitMask(winSize.width, mmMask4_epi32); - __m128 qA11 = _mm_setzero_ps(), qA12 = _mm_setzero_ps(), qA22 = _mm_setzero_ps(); - #endif - - // extract the patch from the first image, compute covariation matrix of derivatives - int x, y; - for( y = 0; y < winSize.height; y++ ) - { - x = 0; - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; - const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; - - short* Iptr = (short*)(IWinBuf.data + y*IWinBuf.step); - short* dIptr = (short*)(derivIWinBuf.data + y*derivIWinBuf.step); - - - - #ifdef CV_SSE2 - for( ; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 ) - { - __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16); - __m128i mask_0_3_epi16 = _mm_unpacklo_epi16(mask_0_7_epi16, mask_0_7_epi16); - - - __m128i v00, v01, v10, v11, t0, t1; - v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); - v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); - v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); - v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); - - t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); - t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), 
W_BITS1-5); - if( x + 4 > winSize.width) - { - t0 = _mm_and_si128(t0, mmMask4_epi32); - } - t0 = _mm_and_si128(t0, mask_0_3_epi16); - _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0,t0)); - - - v00 = _mm_loadu_si128((const __m128i*)(dsrc)); - v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); - v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep)); - v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); - - t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); - t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); - t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); - t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); - v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... - if( x + 4 > winSize.width) - { - v00 = _mm_and_si128(v00, mmMask4_epi32); - } - v00 = _mm_and_si128(v00, mask_0_3_epi16); - _mm_storeu_si128((__m128i*)dIptr, v00); - } - #else - - for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2 ) - { - if( winMaskMat.at(y,x) == 0) - { - dIptr[0] = 0; - dIptr[1] = 0; - continue; - } - int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + - src[x+step]*iw10 + src[x+step+cn]*iw11, W_BITS1-5); - int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + - dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1); - int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 + - dsrc[dstep+cn2+1]*iw11, W_BITS1); - - Iptr[x] = (short)ival; - dIptr[0] = (short)ixval; - dIptr[1] = (short)iyval; - - } - #endif - } - - cv::Mat residualMat = cv::Mat::zeros(winSize.height * (winSize.width + 8) * cn, 1, CV_16SC1); - - cv::Point2f backUpNextPt = nextPt; - nextPt -= halfWin; - Point2f prevDelta(0,0); //relates to h(t-1) - Point2f prevGain(1,0); - cv::Point2f gainVec = gainVecs[ptidx]; - cv::Point2f backUpGain = gainVec; - cv::Size _winSize = winSize; - int j; - __m128i mmMask0, mmMask1, 
mmMask; - getWBitMask(_winSize.width, mmMask0, mmMask1, mmMask); - __m128 mmOnes = _mm_set1_ps(1.f ); - __m128i mmOnes_epi16 = _mm_set1_epi16(1 ); - __m128i mmZeros_epi16 = _mm_set1_epi16(0); - float MEstimatorScale = 1; - int buffIdx = 0; - float c[8]; - cv::Mat GMc0, GMc1, GMc2, GMc3; - cv::Vec4f Mc0, Mc1, Mc2, Mc3; - int noIteration = 0; - int noReusedIteration = 0; - int noSolvedIteration = 0; - for( j = 0; j < criteria.maxCount; j++ ) - { - cv::Point2f delta(0,0); - cv::Point2f deltaGain(0,0); - bool hasSolved = false; - a = nextPt.x - inextPt.x; - b = nextPt.y - inextPt.y; - float ab = a * b; - if( j == 0 - || (inextPt.x != cvFloor(nextPt.x) || inextPt.y != cvFloor(nextPt.y) || j % 2 != 0) ) - { - inextPt.x = cvFloor(nextPt.x); - inextPt.y = cvFloor(nextPt.y); - - if( inextPt.x < -winSize.width || inextPt.x >= J.cols || - inextPt.y < -winSize.height || inextPt.y >= J.rows ) - { - if( level == 0 && status ) - status[ptidx] = 3; - if (level > 0) - { - nextPts[ptidx] = backUpNextPt; - gainVecs[ptidx] = backUpGain; - } - noIteration++; - break; - } - - - a = nextPt.x - inextPt.x; - b = nextPt.y - inextPt.y; - ab = a * b; - iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); - iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); - iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); - iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - // mismatch - b1 = 0; - b2 = 0; - b3 = 0; - b4 = 0; - if( j == 0) - { - // tensor - w1 = 0; // -IxI - w2 = 0; // -IyI - dI = 0; // I^2 - sumIx = 0; - sumIy = 0; - sumI = 0; - sumW = 0; - A11 = 0; - A12 = 0; - A22 = 0; - } - - if ( j == 0 ) - { - buffIdx = 0; - for( y = 0; y < winSize.height; y++ ) - { - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y )*step + (inextPt.x )*cn; - const short* Iptr = (const short*)(IWinBuf.data + (y)*IWinBuf.step); - const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - x = 0; - for( ; x < winSize.width*cn; x++, dIptr 
+= 2) - { - if( maskPtr[x] == 0) - continue; - int diff = static_cast(CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, W_BITS1-5) - - Iptr[x] + Iptr[x] * gainVec.x +gainVec.y); - residualMat.at(buffIdx++) = static_cast(diff); - } - } - /*! Estimation for the residual */ - cv::Mat residualMatRoi(residualMat, cv::Rect(0,0,1, buffIdx)); - MEstimatorScale = (buffIdx == 0) ? 1.f : estimateScale(residualMatRoi); - } - - float eta = 1.f / winArea; - float fParam0 = param[0] * 32.f; - float fParam1 = param[1] * 32.f; - fParam0 = param[0] * MEstimatorScale; - fParam1 = param[1] * MEstimatorScale; - -#ifdef CV_SSE2 - - qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); - qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); - __m128 qb0[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; - __m128 qb1[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; - __m128 qb2[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; - __m128 qb3[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; - __m128 mmSumW1 = _mm_setzero_ps(), mmSumW2 = _mm_setzero_ps(); - __m128 mmSumI = _mm_setzero_ps(), mmSumW = _mm_setzero_ps(), mmSumDI = _mm_setzero_ps(); - __m128 mmSumIy = _mm_setzero_ps(), mmSumIx = _mm_setzero_ps(); - __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps(); - __m128i mmParam0 = _mm_set1_epi16(MIN(std::numeric_limits::max() -1, static_cast(fParam0))); - __m128i mmParam1 = _mm_set1_epi16(MIN(std::numeric_limits::max()- 1, static_cast(fParam1))); - - - float s2Val = param[2] > 0 ? param[2] : -param[2]; - int s2bitShift = param[2] == 0 ? 
1 : static_cast(ceil(log(200.f / s2Val) / log(2.f))); - __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast(param[2] * (float)(1 << s2bitShift))); - __m128i mmOness_epi16 = _mm_set1_epi16(1 << s2bitShift); - __m128 mmParam2 = _mm_set1_ps(param[2]); - __m128 mmParam2s = _mm_set1_ps(0.01f * param[2]); - __m128 mmParam2s2 = _mm_set1_ps(param[2] * param[2]); - float gainVal = gainVec.x > 0 ? gainVec.x : -gainVec.x; - int bitShift = gainVec.x == 0 ? 1 : static_cast(ceil(log(200.f / gainVal) / log(2.f))); - __m128i mmGainValue_epi16 = _mm_set1_epi16(static_cast(gainVec.x * (float)(1 << bitShift))); - __m128i mmConstValue_epi16 = _mm_set1_epi16(static_cast(gainVec.y)); - __m128i mmEta = _mm_setzero_si128(); - __m128i mmEtaNeg = _mm_set1_epi16(-1); - __m128i mmScale = _mm_set1_epi16(static_cast(MEstimatorScale)); - + + +inline bool checkSolution(float x, float y, float * c ) +{ + float _a = x - 0.002f; + float _b = y - 0.002f; + cv::Point2f tl( c[0] * _a * _b + c[1] * _a + c[2] * _b + c[3], c[4] * _a * _b + c[5] * _a + c[6] * _b + c[7]); + _a = x + 0.002f; + cv::Point2f tr( c[0] * _a * _b + c[1] * _a + c[2] * _b + c[3], c[4] * _a * _b + c[5] * _a + c[6] * _b + c[7]); + _b = y + 0.002f; + cv::Point2f br( c[0] * _a * _b + c[1] * _a + c[2] * _b + c[3], c[4] * _a * _b + c[5] * _a + c[6] * _b + c[7]); + _a = x - 0.002f; + cv::Point2f bl( c[0] * _a * _b + c[1] * _a + c[2] * _b + c[3], c[4] * _a * _b + c[5] * _a + c[6] * _b + c[7]); + return (tl.x >= 0 && tl.y >= 0) && (tr.x <= 0 && tr.y >= 0) + && (bl.x >= 0 && bl.y <= 0) && (br.x <= 0 && br.y <= 0); +} + +inline cv::Point2f est_DeltaGain( const cv::Mat & src, const cv::Vec4f & val) +{ + return cv::Point2f( + src.at(2,0) * val[0] + src.at(2,1) * val[1] + src.at(2,2) * val[2] + src.at(2,3) * val[3], + src.at(3,0) * val[0] + src.at(3,1) * val[1] + src.at(3,2) * val[2] + src.at(3,3) * val[3]); +} +inline void est_Result( const cv::Mat & src, const cv::Vec4f & val, cv::Point2f & delta, cv::Point2f & gain) +{ + + delta = cv::Point2f( 
+ -(src.at(0,0) * val[0] + src.at(0,1) * val[1] + src.at(0,2) * val[2] + src.at(0,3) * val[3]), + -(src.at(1,0) * val[0] + src.at(1,1) * val[1] + src.at(1,2) * val[2] + src.at(1,3) * val[3])); + + gain = cv::Point2f( + src.at(2,0) * val[0] + src.at(2,1) * val[1] + src.at(2,2) * val[2] + src.at(2,3) * val[3], + src.at(3,0) * val[0] + src.at(3,1) * val[1] + src.at(3,2) * val[2] + src.at(3,3) * val[3]); +} + +namespace berlof +{ + + namespace ica + { + + class TrackerInvoker : public cv::ParallelLoopBody + { + public: + TrackerInvoker( + const Mat& _prevImg, + const Mat& _prevDeriv, + const Mat& _nextImg, + const Mat& _rgbPrevImg, + const Mat& _rgbNextImg, + const Point2f* _prevPts, + Point2f* _nextPts, + uchar* _status, + float* _err, + int _level, + int _maxLevel, + int _winSize[2], + int _maxIteration, + bool _useInitialFlow, + int _supportRegionType, + int _crossSegmentationThreshold, + std::vector _normSigmas, + float _minEigenValue) + { + prevImg = &_prevImg; + prevDeriv = &_prevDeriv; + nextImg = &_nextImg; + rgbPrevImg = &_rgbPrevImg; + rgbNextImg = &_rgbNextImg; + prevPts = _prevPts; + nextPts = _nextPts; + status = _status; + err = _err; + minWinSize = _winSize[0]; + maxWinSize = _winSize[1]; + criteria.maxCount = _maxIteration; + criteria.epsilon = 0.01; + level = _level; + maxLevel = _maxLevel; + windowType = _supportRegionType; + minEigThreshold = _minEigenValue; + paramReset = _normSigmas; + useInitialFlow = _useInitialFlow; + crossSegmentationThreshold = _crossSegmentationThreshold; + } + + TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; + void operator()(const cv::Range& range) const + { + + Point2f halfWin; + cv::Size winSize; + const Mat& I = *prevImg; + const Mat& J = *nextImg; + const Mat& derivI = *prevDeriv; + const Mat& BI = *rgbPrevImg; + + + const float FLT_SCALE = (1.f/(1 << 16)); + + winSize = cv::Size(maxWinSize,maxWinSize); + int winMaskwidth = static_cast(ceil(winSize.width / 16.f)) * 16; + cv::Mat 
winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); + winMaskMatBuf.setTo(1); + + std::vector param = paramReset; + int j, cn = I.channels(), cn2 = cn*2; + int winbufwidth = static_cast(ceil(winSize.width / 16.f)) * 16; + cv::Size winBufSize(winbufwidth,winbufwidth); + + cv::AutoBuffer _buf(winBufSize.area()*(cn + cn2)); + int derivDepth = DataType::depth; + + Mat IWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn), (deriv_type*)_buf.data()); + Mat derivIWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn2), (deriv_type*)_buf.data() + winBufSize.area()*cn); + + + for( int ptidx = range.start; ptidx < range.end; ptidx++ ) + { + Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level)); + Point2f nextPt; + if( level == maxLevel ) + { + if( useInitialFlow ) + nextPt = nextPts[ptidx]*(float)(1./(1 << level)); + else + nextPt = prevPt; + } + else + nextPt = nextPts[ptidx]*2.f; + nextPts[ptidx] = nextPt; + + Point2i iprevPt, inextPt; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + int winArea = maxWinSize * maxWinSize; + cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0,0, maxWinSize,maxWinSize)); + winMaskMatBuf.setTo(0); + if( calcWinMaskMat(BI, windowType, iprevPt, + winMaskMat,winSize,halfWin,winArea, + minWinSize,maxWinSize, crossSegmentationThreshold) == false) + continue; + + prevPt -= halfWin; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + + if( iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols || + iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows ) + { + if( level == 0 ) + { + if( status ) + status[ptidx] = 3; + if( err ) + err[ptidx] = 0; + } + continue; + } + + float a = prevPt.x - iprevPt.x; + float b = prevPt.y - iprevPt.y; + const int W_BITS = 14, W_BITS1 = 14; + + int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + + int dstep = (int)(derivI.step/derivI.elemSize1()); 
+ int step = (int)(I.step/I.elemSize1()); + int mStep = (int)(winMaskMat.step/winMaskMat.elemSize1()); + CV_Assert( step == (int)(J.step/J.elemSize1()) ); + + float A11 = 0, A12 = 0, A22 = 0; + float D = 0; + + // extract the patch from the first image, compute covariation matrix of derivatives + int x, y; + for( y = 0; y < winSize.height; y++ ) + { + const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; + const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; + + short* Iptr = (short*)(IWinBuf.data + y*IWinBuf.step); + short* dIptr = (short*)(derivIWinBuf.data + y*derivIWinBuf.step); + x = 0; + for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2 ) + { + if( winMaskMat.at(y,x) == 0) + { + dIptr[0] = 0; + dIptr[1] = 0; + continue; + } + int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + + src[x+step]*iw10 + src[x+step+cn]*iw11, W_BITS1-5); + int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + + dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 + + dsrc[dstep+cn2+1]*iw11, W_BITS1); + Iptr[x] = (short)ival; + dIptr[0] = (short)ixval; + dIptr[1] = (short)iyval; + } + } + + cv::Mat residualMat = cv::Mat::zeros(winSize.height * (winSize.width + 8) * cn, 1, CV_16SC1); + cv::Point2f backUpNextPt = nextPt; + nextPt -= halfWin; + Point2f prevDelta(0,0); //denotes h(t-1) + cv::Size _winSize = winSize; + __m128i mmMask0, mmMask1, mmMask; + getWBitMask(_winSize.width, mmMask0, mmMask1, mmMask); + float MEstimatorScale = 1; + int buffIdx = 0; + float c[8]; + cv::Mat GMc0, GMc1, GMc2, GMc3; + cv::Vec2f Mc0, Mc1, Mc2, Mc3; + int noIteration = 0; + int noReusedIteration = 0; + int noSolvedIteration = 0; + for( j = 0; j < criteria.maxCount; j++ ) + { + cv::Point2f delta(0,0); + cv::Point2f deltaGain(0,0); + bool hasSolved = false; + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + float ab = a * b; + if( j == 0 + || ( inextPt.x != 
cvFloor(nextPt.x) || inextPt.y != cvFloor(nextPt.y) || j % 2 != 0 )) + { + inextPt.x = cvFloor(nextPt.x); + inextPt.y = cvFloor(nextPt.y); + + if( inextPt.x < -winSize.width || inextPt.x >= J.cols || + inextPt.y < -winSize.height || inextPt.y >= J.rows ) + { + if( level == 0 && status ) + status[ptidx] = 3; + break; + } + + + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + ab = a * b; + iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + // mismatch + if( j == 0 ) + { + A11 = 0; + A12 = 0; + A22 = 0; + } + + if ( j == 0 ) + { + buffIdx = 0; + for( y = 0; y < winSize.height; y++ ) + { + const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y )*step + (inextPt.x )*cn; + const short* Iptr = (const short*)(IWinBuf.data + (y)*IWinBuf.step); + const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + x = 0; + for( ; x < winSize.width*cn; x++, dIptr += 2) + { + if( maskPtr[x] == 0) + continue; + int diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, W_BITS1-5) + - Iptr[x]; + residualMat.at(buffIdx++) = static_cast(diff); + } + } + /*! Estimation for the residual */ + cv::Mat residualMatRoi(residualMat, cv::Rect(0,0,1, buffIdx)); + MEstimatorScale = (buffIdx == 0) ? 
1.f : estimateScale(residualMatRoi); + } + + float eta = 1.f / winArea; + float fParam0 = param[0] * 32.f; + float fParam1 = param[1] * 32.f; + fParam0 = param[0] * MEstimatorScale; + fParam1 = param[1] * MEstimatorScale; + + + buffIdx = 0; + float _b0[4] = {0,0,0,0}; + float _b1[4] = {0,0,0,0}; + + /* + */ + for( y = 0; y < _winSize.height; y++ ) + { + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x )*cn; + const short* Iptr = (const short*)(IWinBuf.data + y*IWinBuf.step); + const short* dIptr = (const short*)(derivIWinBuf.data + y*derivIWinBuf.step); + + x = 0; + for( ; x < _winSize.width*cn; x++, dIptr += 2 ) + { + if( maskPtr[x] == 0) + continue; + int illValue = - Iptr[x]; + + float It[4] = {static_cast((Jptr[x+step+cn]<< 5) + illValue), + static_cast((Jptr[x+cn]<< 5) + illValue), + static_cast((Jptr[x+step]<< 5) + illValue), + static_cast((Jptr[x] << 5) + illValue)}; + + + + int J_val = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + + Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, + W_BITS1-5); + + + int diff = J_val + illValue; + + + MEstimatorScale += (diff < MEstimatorScale) ? -eta : eta; + + int abss = (diff < 0) ? 
-diff : diff; + + // compute the missmatch vector + if( j >= 0) + { + if( abss > fParam1) + { + It[0] = 0; + It[1] = 0; + It[2] = 0; + It[3] = 0; + } + else if( abss > fParam0 && diff >= 0 ) + { + It[0] = param[2] * (It[0] - fParam1); + It[1] = param[2] * (It[1] - fParam1); + It[2] = param[2] * (It[2] - fParam1); + It[3] = param[2] * (It[3] - fParam1); + } + else if( abss > fParam0 && diff < 0 ) + { + It[0] = param[2] * (It[0] + fParam1); + It[1] = param[2] * (It[1] + fParam1); + It[2] = param[2] * (It[2] + fParam1); + It[3] = param[2] * (It[3] + fParam1); + } + } + + float It0 = It[0]; + float It1 = It[1]; + float It2 = It[2]; + float It3 = It[3]; + + float ixval = static_cast(dIptr[0]); + float iyval = static_cast(dIptr[1]); + _b0[0] += It0 * ixval; + _b0[1] += It1 * ixval; + _b0[2] += It2 * ixval; + _b0[3] += It3 * ixval; + + + _b1[0] += It0*iyval; + _b1[1] += It1*iyval; + _b1[2] += It2*iyval; + _b1[3] += It3*iyval; + + + // compute the Gradient Matrice + if( j == 0) + { + float tale = param[2] * FLT_RESCALE; + if( abss < fParam0 || j < 0) + { + tale = FLT_RESCALE; + } + else if( abss > fParam1) + { + tale *= 0.01f; + } + else + { + tale *= param[2]; + } + + A11 += (float)(ixval*ixval)*tale; + A12 += (float)(ixval*iyval)*tale; + A22 += (float)(iyval*iyval)*tale; + + } + } + } + + if( j == 0 ) + { + + A11 *= FLT_SCALE; // 54866744. 
+ A12 *= FLT_SCALE; // -628764.00 + A22 *= FLT_SCALE; // 19730.000 + + D = A11 * A22 - A12 * A12; + float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + + 4.f*A12*A12))/(2*winArea); + + if( minEig < minEigThreshold || std::abs(D) < FLT_EPSILON) + { + if( level == 0 && status ) + status[ptidx] = 0; + if( level > 0) + { + nextPts[ptidx] = backUpNextPt; + } + noIteration++; + break; + } + + D = (1.f / D); + + } + + _b0[0] *= FLT_SCALE;_b0[1] *= FLT_SCALE;_b0[2] *= FLT_SCALE;_b0[3] *= FLT_SCALE; + _b1[0] *= FLT_SCALE;_b1[1] *= FLT_SCALE;_b1[2] *= FLT_SCALE;_b1[3] *= FLT_SCALE; + + + Mc0[0] = _b0[0] - _b0[1] - _b0[2] + _b0[3]; + Mc0[1] = _b1[0] - _b1[1] - _b1[2] + _b1[3]; + + Mc1[0] = _b0[1] - _b0[3]; + Mc1[1] = _b1[1] - _b1[3]; + + + Mc2[0] = _b0[2] - _b0[3]; + Mc2[1] = _b1[2] - _b1[3]; + + + Mc3[0] = _b0[3]; + Mc3[1] = _b1[3]; + + c[0] = -Mc0[0]; + c[1] = -Mc1[0]; + c[2] = -Mc2[0]; + c[3] = -Mc3[0]; + c[4] = -Mc0[1]; + c[5] = -Mc1[1]; + c[6] = -Mc2[1]; + c[7] = -Mc3[1]; + + float e0 = 1.f / (c[6] * c[0] - c[4] * c[2]); + float e1 = e0 * 0.5f * (c[6] * c[1] + c[7] * c[0] - c[5] * c[2] - c[4] * c[3]); + float e2 = e0 * (c[1] * c[7] -c[3] * c[5]); + e0 = e1 * e1 - e2; + hasSolved = false; + if ( e0 > 0) + { + e0 = sqrt(e0); + float _y[2] = {-e1 - e0, e0 - e1}; + float c0yc1[2] = {c[0] * _y[0] + c[1], + c[0] * _y[1] + c[1]}; + float _x[2] = {-(c[2] * _y[0] + c[3]) / c0yc1[0], + -(c[2] * _y[1] + c[3]) / c0yc1[1]}; + bool isIn1 = (_x[0] >=0 && _x[0] <=1 && _y[0] >= 0 && _y[0] <= 1); + bool isIn2 = (_x[1] >=0 && _x[1] <=1 && _y[1] >= 0 && _y[1] <= 1); + + bool isSolution1 = checkSolution(_x[0], _y[0], c ); + bool isSolution2 = checkSolution(_x[1], _y[1], c ); + bool isSink1 = isIn1 && isSolution1; + bool isSink2 = isIn2 && isSolution2; + + if ( isSink1 != isSink2) + { + a = isSink1 ? _x[0] : _x[1]; + b = isSink1 ? 
_y[0] : _y[1]; + ab = a * b; + hasSolved = true; + delta.x = inextPt.x + a - nextPt.x; + delta.y = inextPt.y + b - nextPt.y; + } // isIn1 != isIn2 + } + if( hasSolved == false) + noIteration++; + } + else + { + hasSolved = false; + noReusedIteration++; + } + if( hasSolved == false ) + { + + cv::Vec2f mismatchVec = ab * Mc0 + Mc1 *a + Mc2 * b + Mc3; + delta.x = (A12 * mismatchVec.val[1] - A22 * mismatchVec.val[0]) * D; + delta.y = (A12 * mismatchVec.val[0] - A11 * mismatchVec.val[1]) * D; + delta.x = MAX(-1.f, MIN(1.f, delta.x)); + delta.y = MAX(-1.f, MIN(1.f, delta.y)); + nextPt += delta; + nextPts[ptidx] = nextPt + halfWin; + } + else + { + nextPt += delta; + nextPts[ptidx] = nextPt + halfWin; + noSolvedIteration++; + break; + } + + delta.x = ( delta.x != delta.x) ? 0 : delta.x; + delta.y = ( delta.y != delta.y) ? 0 : delta.y; + + if(j > 0 && ( + (std::abs(delta.x - prevDelta.x) < 0.01 && std::abs(delta.y - prevDelta.y) < 0.01))) + { + nextPts[ptidx] -= delta*0.5f; + break; + } + + + prevDelta = delta; + } + + } + + } + + const Mat* prevImg; + const Mat* nextImg; + const Mat* prevDeriv; + const Mat* rgbPrevImg; + const Mat* rgbNextImg; + const Point2f* prevPts; + Point2f* nextPts; + uchar* status; + float* err; + int maxWinSize; + int minWinSize; + TermCriteria criteria; + int level; + int maxLevel; + int windowType; + float minEigThreshold; + bool useInitialFlow; + std::vector paramReset; + int crossSegmentationThreshold; + + }; + } + namespace radial + { + class TrackerInvoker : public cv::ParallelLoopBody + { + public: + TrackerInvoker( + const Mat& _prevImg, + const Mat& _prevDeriv, + const Mat& _nextImg, + const Mat& _rgbPrevImg, + const Mat& _rgbNextImg, + const Point2f* _prevPts, + Point2f* _nextPts, + uchar* _status, + float* _err, + Point2f* _gainVecs, + int _level, + int _maxLevel, + int _winSize[2], + int _maxIteration, + bool _useInitialFlow, + int _supportRegionType, + int _crossSegmentationThreshold, + std::vector _normSigmas, + float _minEigenValue) 
+ { + prevImg = &_prevImg; + prevDeriv = &_prevDeriv; + nextImg = &_nextImg; + rgbPrevImg = &_rgbPrevImg; + rgbNextImg = &_rgbNextImg; + prevPts = _prevPts; + nextPts = _nextPts; + status = _status; + err = _err; + gainVecs = _gainVecs; + minWinSize = _winSize[0]; + maxWinSize = _winSize[1]; + criteria.maxCount = _maxIteration; + criteria.epsilon = 0.01; + level = _level; + maxLevel = _maxLevel; + windowType = _supportRegionType; + minEigThreshold = _minEigenValue; + paramReset = _normSigmas; + useInitialFlow = _useInitialFlow; + crossSegmentationThreshold = _crossSegmentationThreshold; + } + + TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; + void operator()(const cv::Range& range) const + { + Point2f halfWin; + cv::Size winSize; + const Mat& I = *prevImg; + const Mat& J = *nextImg; + const Mat& derivI = *prevDeriv; + const Mat& BI = *rgbPrevImg; + const float FLT_SCALE = (1.f/(1 << 16));//(1.f/(1 << 20)); // 20 + winSize = cv::Size(maxWinSize,maxWinSize); + int winMaskwidth = static_cast(ceil(winSize.width / 16.f)) * 16; + cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); + winMaskMatBuf.setTo(1); + + std::vector param = paramReset; + int cn = I.channels(), cn2 = cn*2; + int winbufwidth = static_cast(ceil(winSize.width / 16.f)) * 16; + cv::Size winBufSize(winbufwidth,winbufwidth); + + + cv::Mat invTensorMat(4,4, CV_32FC1); + cv::Mat mismatchMat(4, 1, CV_32FC1); + cv::Mat resultMat(4, 1, CV_32FC1); + + cv::AutoBuffer _buf(winBufSize.area()*(cn + cn2)); + int derivDepth = DataType::depth; + + Mat IWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn), (deriv_type*)_buf.data()); + Mat derivIWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn2), (deriv_type*)_buf.data() + winBufSize.area()*cn); + + for( int ptidx = range.start; ptidx < range.end; ptidx++ ) + { + Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level)); + Point2f nextPt; + if( level == maxLevel ) + { + if( useInitialFlow ) + nextPt = nextPts[ptidx]*(float)(1./(1 << level)); + else 
+ nextPt = prevPt; + } + else + nextPt = nextPts[ptidx]*2.f; + nextPts[ptidx] = nextPt; + + Point2i iprevPt, inextPt; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + int winArea = maxWinSize * maxWinSize; + cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0,0, maxWinSize,maxWinSize)); + winMaskMatBuf.setTo(0); + if( calcWinMaskMat(BI, windowType, iprevPt, + winMaskMat,winSize,halfWin,winArea, + minWinSize,maxWinSize, crossSegmentationThreshold) == false) + continue; + + prevPt -= halfWin; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + + if( iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols || + iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows ) + { + if( level == 0 ) + { + if( status ) + status[ptidx] = 3; + if( err ) + err[ptidx] = 0; + } + continue; + } + + float a = prevPt.x - iprevPt.x; + float b = prevPt.y - iprevPt.y; + const int W_BITS = 14, W_BITS1 = 14; + + int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + + int dstep = (int)(derivI.step/derivI.elemSize1()); + int step = (int)(I.step/I.elemSize1()); + int mStep = (int)(winMaskMat.step/winMaskMat.elemSize1()); + CV_Assert( step == (int)(J.step/J.elemSize1()) ); + + float A11 = 0, A12 = 0, A22 = 0; + + // tensor + float sumIx = 0; + float sumIy = 0; + float sumI = 0; + float sumW = 0; + float w1 = 0, w2 = 0; // -IyI + float dI = 0; // I^2 + float D = 0; + + #ifdef CV_SSE4_1 + + __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128i z = _mm_setzero_si128(); + __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1-1)); + __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1-5-1)); + __m128i mmMask4_epi32; + __m128i mmMaskSet_epi16 = _mm_set1_epi16(std::numeric_limits::max()); + get4BitMask(winSize.width, mmMask4_epi32); + #endif + + // extract the patch from the first 
image, compute covariation matrix of derivatives + int x, y; + for( y = 0; y < winSize.height; y++ ) + { + x = 0; + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; + const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; + + short* Iptr = (short*)(IWinBuf.data + y*IWinBuf.step); + short* dIptr = (short*)(derivIWinBuf.data + y*derivIWinBuf.step); + + + + #ifdef CV_SSE4_1 + for( ; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 ) + { + __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16); + __m128i mask_0_3_epi16 = _mm_unpacklo_epi16(mask_0_7_epi16, mask_0_7_epi16); + + + __m128i v00, v01, v10, v11, t0, t1; + v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); + v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); + v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); + v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); + if( x + 4 > winSize.width) + { + t0 = _mm_and_si128(t0, mmMask4_epi32); + } + t0 = _mm_and_si128(t0, mask_0_3_epi16); + _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0,t0)); + + + v00 = _mm_loadu_si128((const __m128i*)(dsrc)); + v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); + v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep)); + v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, 
v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); + v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... + if( x + 4 > winSize.width) + { + v00 = _mm_and_si128(v00, mmMask4_epi32); + } + v00 = _mm_and_si128(v00, mask_0_3_epi16); + _mm_storeu_si128((__m128i*)dIptr, v00); + } + #else + + for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2 ) + { + if( winMaskMat.at(y,x) == 0) + { + dIptr[0] = 0; + dIptr[1] = 0; + continue; + } + int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + + src[x+step]*iw10 + src[x+step+cn]*iw11, W_BITS1-5); + int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + + dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 + + dsrc[dstep+cn2+1]*iw11, W_BITS1); + + Iptr[x] = (short)ival; + dIptr[0] = (short)ixval; + dIptr[1] = (short)iyval; + + } + #endif + } + + cv::Mat residualMat = cv::Mat::zeros(winSize.height * (winSize.width + 8) * cn, 1, CV_16SC1); + + cv::Point2f backUpNextPt = nextPt; + nextPt -= halfWin; + Point2f prevDelta(0,0); //relates to h(t-1) + Point2f prevGain(1,0); + cv::Point2f gainVec = gainVecs[ptidx]; + cv::Point2f backUpGain = gainVec; + cv::Size _winSize = winSize; + int j; + __m128i mmMask0, mmMask1, mmMask; + getWBitMask(_winSize.width, mmMask0, mmMask1, mmMask); + __m128 mmOnes = _mm_set1_ps(1.f ); + float MEstimatorScale = 1; + int buffIdx = 0; + float c[8]; + cv::Mat GMc0, GMc1, GMc2, GMc3; + cv::Vec4f Mc0, Mc1, Mc2, Mc3; + int noIteration = 0; + int noReusedIteration = 0; + int noSolvedIteration = 0; + for( j = 0; j < criteria.maxCount; j++ ) + { + cv::Point2f delta(0,0); + cv::Point2f deltaGain(0,0); + bool hasSolved = false; + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + float ab = a * b; + if( j == 0 + || (inextPt.x != cvFloor(nextPt.x) || inextPt.y != cvFloor(nextPt.y) || j % 2 != 0) ) + { + inextPt.x = cvFloor(nextPt.x); + inextPt.y = 
cvFloor(nextPt.y); + + if( inextPt.x < -winSize.width || inextPt.x >= J.cols || + inextPt.y < -winSize.height || inextPt.y >= J.rows ) + { + if( level == 0 && status ) + status[ptidx] = 3; + if (level > 0) + { + nextPts[ptidx] = backUpNextPt; + gainVecs[ptidx] = backUpGain; + } + noIteration++; + break; + } + + + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + ab = a * b; + iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + // mismatch + + if( j == 0) + { + // tensor + w1 = 0; // -IxI + w2 = 0; // -IyI + dI = 0; // I^2 + sumIx = 0; + sumIy = 0; + sumI = 0; + sumW = 0; + A11 = 0; + A12 = 0; + A22 = 0; + } + + if ( j == 0 ) + { + buffIdx = 0; + for( y = 0; y < winSize.height; y++ ) + { + const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y )*step + (inextPt.x )*cn; + const short* Iptr = (const short*)(IWinBuf.data + (y)*IWinBuf.step); + const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + x = 0; + for( ; x < winSize.width*cn; x++, dIptr += 2) + { + if( maskPtr[x] == 0) + continue; + int diff = static_cast(CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, W_BITS1-5) + - Iptr[x] + Iptr[x] * gainVec.x +gainVec.y); + residualMat.at(buffIdx++) = static_cast(diff); + } + } + /*! Estimation for the residual */ + cv::Mat residualMatRoi(residualMat, cv::Rect(0,0,1, buffIdx)); + MEstimatorScale = (buffIdx == 0) ? 
1.f : estimateScale(residualMatRoi); + } + + float eta = 1.f / winArea; + float fParam0 = param[0] * 32.f; + float fParam1 = param[1] * 32.f; + fParam0 = param[0] * MEstimatorScale; + fParam1 = param[1] * MEstimatorScale; + +#ifdef CV_SSE4_1 + + qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128 qb0[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; + __m128 qb1[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; + __m128 qb2[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; + __m128 qb3[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; + __m128 mmSumW1 = _mm_setzero_ps(), mmSumW2 = _mm_setzero_ps(); + __m128 mmSumI = _mm_setzero_ps(), mmSumW = _mm_setzero_ps(), mmSumDI = _mm_setzero_ps(); + __m128 mmSumIy = _mm_setzero_ps(), mmSumIx = _mm_setzero_ps(); + __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps(); + __m128i mmParam0 = _mm_set1_epi16(MIN(std::numeric_limits::max() -1, static_cast(fParam0))); + __m128i mmParam1 = _mm_set1_epi16(MIN(std::numeric_limits::max()- 1, static_cast(fParam1))); + + + float s2Val = param[2] > 0 ? param[2] : -param[2]; + int s2bitShift = param[2] == 0 ? 1 : static_cast(ceil(log(200.f / s2Val) / log(2.f))); + __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast(param[2] * (float)(1 << s2bitShift))); + __m128i mmOness_epi16 = _mm_set1_epi16(1 << s2bitShift); + __m128 mmParam2s = _mm_set1_ps(0.01f * param[2]); + __m128 mmParam2s2 = _mm_set1_ps(param[2] * param[2]); + float gainVal = gainVec.x > 0 ? gainVec.x : -gainVec.x; + int bitShift = gainVec.x == 0 ? 
1 : static_cast(ceil(log(200.f / gainVal) / log(2.f))); + __m128i mmGainValue_epi16 = _mm_set1_epi16(static_cast(gainVec.x * (float)(1 << bitShift))); + __m128i mmConstValue_epi16 = _mm_set1_epi16(static_cast(gainVec.y)); + __m128i mmEta = _mm_setzero_si128(); + __m128i mmScale = _mm_set1_epi16(static_cast(MEstimatorScale)); + #endif - - buffIdx = 0; - float _b0[4] = {0,0,0,0}; - float _b1[4] = {0,0,0,0}; - float _b2[4] = {0,0,0,0}; - float _b3[4] = {0,0,0,0}; - float _bm[4] = {0,0,0,0}; - float _bc[4] = {0,0,0,0}; - /* - */ - for( y = 0; y < _winSize.height; y++ ) - { - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x )*cn; - const short* Iptr = (const short*)(IWinBuf.data + y*IWinBuf.step); - const short* dIptr = (const short*)(derivIWinBuf.data + y*derivIWinBuf.step); - - x = 0; -#ifdef CV_SSE2 - for( ; x <= _winSize.width*cn; x += 8, dIptr += 8*2 ) - { - __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16); - __m128i I_0_7_epi16 = _mm_loadu_si128((const __m128i*)(Iptr + x)); - - __m128i v00 = _mm_unpacklo_epi8( - _mm_loadl_epi64((const __m128i*)(Jptr + x)), z); - __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); - __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); - __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); - - __m128i t0 = _mm_add_epi32 - (_mm_madd_epi16( - _mm_unpacklo_epi16(v00, v01), - qw0), - _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); - __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); - - t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); - t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1-5); - - __m128i lo = _mm_mullo_epi16(mmGainValue_epi16, I_0_7_epi16); - 
__m128i hi = _mm_mulhi_epi16(mmGainValue_epi16, I_0_7_epi16); - - __m128i Igain_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), bitShift); - __m128i Igain_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), bitShift); - __m128i Igain_epi16 = _mm_packs_epi32(Igain_0_3_epi32, Igain_4_7_epi32); - - - __m128i diffValue = _mm_subs_epi16(_mm_add_epi16(Igain_epi16, mmConstValue_epi16), I_0_7_epi16); - __m128i mmDiffc_epi16[4] = - { - _mm_add_epi16(_mm_slli_epi16(v11, 5), diffValue), - _mm_add_epi16(_mm_slli_epi16(v01, 5), diffValue), - _mm_add_epi16(_mm_slli_epi16(v10, 5), diffValue), - _mm_add_epi16(_mm_slli_epi16(v00, 5), diffValue) - }; - - __m128i mmDiff_epi16 = _mm_add_epi16( _mm_packs_epi32(t0, t1), diffValue); - - - mmDiff_epi16 = _mm_and_si128(mmDiff_epi16, mask_0_7_epi16); - - __m128i scalediffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, mmScale); - mmEta = _mm_add_epi16(mmEta, _mm_add_epi16(_mm_and_si128(scalediffIsPos_epi16, _mm_set1_epi16(2)), _mm_set1_epi16(-1))); - - - __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); - __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); - - - __m128i abs_epi16 = _mm_abs_epi16(mmDiff_epi16); - __m128i bSet2_epi16, bSet1_epi16; - // |It| < sigma1 ? - bSet2_epi16 = _mm_cmplt_epi16(abs_epi16, mmParam1); - // It > 0 ? - __m128i diffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, _mm_setzero_si128()); - // sigma0 < |It| < sigma1 ? - bSet1_epi16 = _mm_and_si128(bSet2_epi16, _mm_cmpgt_epi16(abs_epi16, mmParam0)); - // val = |It| -/+ sigma1 - __m128i tmpParam1_epi16 = _mm_add_epi16(_mm_and_si128(diffIsPos_epi16, _mm_sub_epi16(mmDiff_epi16, mmParam1)), - _mm_andnot_si128(diffIsPos_epi16, _mm_add_epi16(mmDiff_epi16, mmParam1))); - // It == 0 ? 
|It| > sigma13 - mmDiff_epi16 = _mm_and_si128(bSet2_epi16, mmDiff_epi16); - - for( unsigned int mmi = 0; mmi < 4; mmi++) - { - __m128i mmDiffc_epi16_t = _mm_and_si128(mmDiffc_epi16[mmi], mask_0_7_epi16); - mmDiffc_epi16_t = _mm_and_si128(bSet2_epi16, mmDiffc_epi16_t); - - // It == val ? sigma0 < |It| < sigma1 - mmDiffc_epi16_t = _mm_blendv_epi8(mmDiffc_epi16_t, tmpParam1_epi16, bSet1_epi16); - __m128i tale_epi16_ = _mm_blendv_epi8(mmOness_epi16, mmParam2_epi16, bSet1_epi16); // mask for 0 - 3 - // diff = diff * sigma2 - lo = _mm_mullo_epi16(tale_epi16_, mmDiffc_epi16_t); - hi = _mm_mulhi_epi16(tale_epi16_, mmDiffc_epi16_t); - __m128i diff_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), s2bitShift); - __m128i diff_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), s2bitShift); - - mmDiffc_epi16_t = _mm_packs_epi32(diff_0_3_epi32, diff_4_7_epi32); - __m128i diff1 = _mm_unpackhi_epi16(mmDiffc_epi16_t, mmDiffc_epi16_t); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13... - __m128i diff0 = _mm_unpacklo_epi16(mmDiffc_epi16_t, mmDiffc_epi16_t); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9... - - // Ix * diff / Iy * diff - v10 = _mm_mullo_epi16(Ixy_0, diff0); - v11 = _mm_mulhi_epi16(Ixy_0, diff0); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - - qb0[mmi] = _mm_add_ps(qb0[mmi], _mm_cvtepi32_ps(v00)); - qb1[mmi] = _mm_add_ps(qb1[mmi], _mm_cvtepi32_ps(v10)); - // It * Ix It * Iy [4 ... 7] - // for set 1 hi sigma 1 - v10 = _mm_mullo_epi16(Ixy_1, diff1); - v11 = _mm_mulhi_epi16(Ixy_1, diff1); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qb0[mmi] = _mm_add_ps(qb0[mmi], _mm_cvtepi32_ps(v00)); - qb1[mmi] = _mm_add_ps(qb1[mmi], _mm_cvtepi32_ps(v10)); - // diff * J [0 ... 
7] - // for set 1 sigma 1 - // b3 += diff * Iptr[x] - v10 = _mm_mullo_epi16(mmDiffc_epi16_t, I_0_7_epi16); - v11 = _mm_mulhi_epi16(mmDiffc_epi16_t, I_0_7_epi16); - v00 = _mm_unpacklo_epi16(v10, v11); - - v10 = _mm_unpackhi_epi16(v10, v11); - qb2[mmi] = _mm_add_ps(qb2[mmi], _mm_cvtepi32_ps(v00)); - qb2[mmi] = _mm_add_ps(qb2[mmi], _mm_cvtepi32_ps(v10)); - qb3[mmi] = _mm_add_ps(qb3[mmi], _mm_cvtepi32_ps(diff_0_3_epi32)); - qb3[mmi] = _mm_add_ps(qb3[mmi], _mm_cvtepi32_ps(diff_4_7_epi32)); - } - - if( j == 0 ) - { - __m128 bSet1_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet1_epi16)); - __m128 bSet1_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet1_epi16,bSet1_epi16), 16)); - __m128 mask_0_4_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mask_0_7_epi16)); - __m128 mask_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mask_0_7_epi16, mask_0_7_epi16),16))); - - __m128 bSet2_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet2_epi16)); - __m128 bSet2_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet2_epi16, bSet2_epi16),16)); - - __m128 tale_0_3_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_0_3_ps); - __m128 tale_4_7_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_4_7_ps); - tale_0_3_ps = _mm_blendv_ps(mmParam2s, tale_0_3_ps, bSet2_0_3_ps); - tale_4_7_ps = _mm_blendv_ps(mmParam2s, tale_4_7_ps, bSet2_4_7_ps); - - tale_0_3_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_0_3_ps, mask_0_4_ps); - tale_4_7_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_4_7_ps, mask_4_7_ps); - - t0 = _mm_srai_epi32(Ixy_0, 16); // Iy0 Iy1 Iy2 Iy3 - t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16); // Ix0 Ix1 Ix2 Ix3 - - __m128 fy = _mm_cvtepi32_ps(t0); - __m128 fx = _mm_cvtepi32_ps(t1); - - // 0 ... 
3 - __m128 I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(I_0_7_epi16, I_0_7_epi16), 16)); - - // A11 - A22 - __m128 fxtale = _mm_mul_ps(fx, tale_0_3_ps); - __m128 fytale = _mm_mul_ps(fy, tale_0_3_ps); - - mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); - mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); - mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); - - // sumIx und sumIy - mmSumIx = _mm_add_ps(mmSumIx, fxtale); - mmSumIy = _mm_add_ps(mmSumIy, fytale); - - mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fxtale)); - mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fytale)); - - // sumI - __m128 I_tale_ps = _mm_mul_ps(I_ps, tale_0_3_ps); - mmSumI = _mm_add_ps(mmSumI,I_tale_ps); - - // sumW - mmSumW = _mm_add_ps(mmSumW, tale_0_3_ps); - - // sumDI - mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps( I_ps, I_tale_ps)); - - - t0 = _mm_srai_epi32(Ixy_1, 16); // Iy8 Iy9 Iy10 Iy11 - t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_1, 16), 16); // Ix0 Ix1 Ix2 Ix3 - - fy = _mm_cvtepi32_ps(t0); - fx = _mm_cvtepi32_ps(t1); - - // 4 ... 
7 - I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(I_0_7_epi16, I_0_7_epi16), 16)); - - // A11 - A22 - fxtale = _mm_mul_ps(fx, tale_4_7_ps); - fytale = _mm_mul_ps(fy, tale_4_7_ps); - - mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); - mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); - mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); - - // sumIx und sumIy - mmSumIx = _mm_add_ps(mmSumIx, fxtale); - mmSumIy = _mm_add_ps(mmSumIy, fytale); - - mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fxtale)); - mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fytale)); - - // sumI - I_tale_ps = _mm_mul_ps(I_ps, tale_4_7_ps); - mmSumI = _mm_add_ps(mmSumI, I_tale_ps); - - // sumW - mmSumW = _mm_add_ps(mmSumW, tale_4_7_ps); - - // sumDI - mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps( I_ps, I_tale_ps)); - } - - } + + buffIdx = 0; + float _b0[4] = {0,0,0,0}; + float _b1[4] = {0,0,0,0}; + float _b2[4] = {0,0,0,0}; + float _b3[4] = {0,0,0,0}; + /* + */ + for( y = 0; y < _winSize.height; y++ ) + { + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x )*cn; + const short* Iptr = (const short*)(IWinBuf.data + y*IWinBuf.step); + const short* dIptr = (const short*)(derivIWinBuf.data + y*derivIWinBuf.step); + + x = 0; +#ifdef CV_SSE4_1 + for( ; x <= _winSize.width*cn; x += 8, dIptr += 8*2 ) + { + __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16); + __m128i I_0_7_epi16 = _mm_loadu_si128((const __m128i*)(Iptr + x)); + + __m128i v00 = _mm_unpacklo_epi8( + _mm_loadl_epi64((const __m128i*)(Jptr + x)), z); + __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); + __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); + __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); + + __m128i t0 = _mm_add_epi32 + 
(_mm_madd_epi16( + _mm_unpacklo_epi16(v00, v01), + qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1-5); + + __m128i lo = _mm_mullo_epi16(mmGainValue_epi16, I_0_7_epi16); + __m128i hi = _mm_mulhi_epi16(mmGainValue_epi16, I_0_7_epi16); + + __m128i Igain_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), bitShift); + __m128i Igain_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), bitShift); + __m128i Igain_epi16 = _mm_packs_epi32(Igain_0_3_epi32, Igain_4_7_epi32); + + + __m128i diffValue = _mm_subs_epi16(_mm_add_epi16(Igain_epi16, mmConstValue_epi16), I_0_7_epi16); + __m128i mmDiffc_epi16[4] = + { + _mm_add_epi16(_mm_slli_epi16(v11, 5), diffValue), + _mm_add_epi16(_mm_slli_epi16(v01, 5), diffValue), + _mm_add_epi16(_mm_slli_epi16(v10, 5), diffValue), + _mm_add_epi16(_mm_slli_epi16(v00, 5), diffValue) + }; + + __m128i mmDiff_epi16 = _mm_add_epi16( _mm_packs_epi32(t0, t1), diffValue); + + + mmDiff_epi16 = _mm_and_si128(mmDiff_epi16, mask_0_7_epi16); + + __m128i scalediffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, mmScale); + mmEta = _mm_add_epi16(mmEta, _mm_add_epi16(_mm_and_si128(scalediffIsPos_epi16, _mm_set1_epi16(2)), _mm_set1_epi16(-1))); + + + __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); + __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); + + + __m128i abs_epi16 = _mm_abs_epi16(mmDiff_epi16); + __m128i bSet2_epi16, bSet1_epi16; + // |It| < sigma1 ? + bSet2_epi16 = _mm_cmplt_epi16(abs_epi16, mmParam1); + // It > 0 ? + __m128i diffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, _mm_setzero_si128()); + // sigma0 < |It| < sigma1 ? 
+ bSet1_epi16 = _mm_and_si128(bSet2_epi16, _mm_cmpgt_epi16(abs_epi16, mmParam0)); + // val = |It| -/+ sigma1 + __m128i tmpParam1_epi16 = _mm_add_epi16(_mm_and_si128(diffIsPos_epi16, _mm_sub_epi16(mmDiff_epi16, mmParam1)), + _mm_andnot_si128(diffIsPos_epi16, _mm_add_epi16(mmDiff_epi16, mmParam1))); + // It == 0 ? |It| > sigma13 + mmDiff_epi16 = _mm_and_si128(bSet2_epi16, mmDiff_epi16); + + for( unsigned int mmi = 0; mmi < 4; mmi++) + { + __m128i mmDiffc_epi16_t = _mm_and_si128(mmDiffc_epi16[mmi], mask_0_7_epi16); + mmDiffc_epi16_t = _mm_and_si128(bSet2_epi16, mmDiffc_epi16_t); + + // It == val ? sigma0 < |It| < sigma1 + mmDiffc_epi16_t = _mm_blendv_epi8(mmDiffc_epi16_t, tmpParam1_epi16, bSet1_epi16); + __m128i tale_epi16_ = _mm_blendv_epi8(mmOness_epi16, mmParam2_epi16, bSet1_epi16); // mask for 0 - 3 + // diff = diff * sigma2 + lo = _mm_mullo_epi16(tale_epi16_, mmDiffc_epi16_t); + hi = _mm_mulhi_epi16(tale_epi16_, mmDiffc_epi16_t); + __m128i diff_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), s2bitShift); + __m128i diff_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), s2bitShift); + + mmDiffc_epi16_t = _mm_packs_epi32(diff_0_3_epi32, diff_4_7_epi32); + __m128i diff1 = _mm_unpackhi_epi16(mmDiffc_epi16_t, mmDiffc_epi16_t); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13... + __m128i diff0 = _mm_unpacklo_epi16(mmDiffc_epi16_t, mmDiffc_epi16_t); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9... + + // Ix * diff / Iy * diff + v10 = _mm_mullo_epi16(Ixy_0, diff0); + v11 = _mm_mulhi_epi16(Ixy_0, diff0); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + + qb0[mmi] = _mm_add_ps(qb0[mmi], _mm_cvtepi32_ps(v00)); + qb1[mmi] = _mm_add_ps(qb1[mmi], _mm_cvtepi32_ps(v10)); + // It * Ix It * Iy [4 ... 
7] + // for set 1 hi sigma 1 + v10 = _mm_mullo_epi16(Ixy_1, diff1); + v11 = _mm_mulhi_epi16(Ixy_1, diff1); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb0[mmi] = _mm_add_ps(qb0[mmi], _mm_cvtepi32_ps(v00)); + qb1[mmi] = _mm_add_ps(qb1[mmi], _mm_cvtepi32_ps(v10)); + // diff * J [0 ... 7] + // for set 1 sigma 1 + // b3 += diff * Iptr[x] + v10 = _mm_mullo_epi16(mmDiffc_epi16_t, I_0_7_epi16); + v11 = _mm_mulhi_epi16(mmDiffc_epi16_t, I_0_7_epi16); + v00 = _mm_unpacklo_epi16(v10, v11); + + v10 = _mm_unpackhi_epi16(v10, v11); + qb2[mmi] = _mm_add_ps(qb2[mmi], _mm_cvtepi32_ps(v00)); + qb2[mmi] = _mm_add_ps(qb2[mmi], _mm_cvtepi32_ps(v10)); + qb3[mmi] = _mm_add_ps(qb3[mmi], _mm_cvtepi32_ps(diff_0_3_epi32)); + qb3[mmi] = _mm_add_ps(qb3[mmi], _mm_cvtepi32_ps(diff_4_7_epi32)); + } + + if( j == 0 ) + { + __m128 bSet1_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet1_epi16)); + __m128 bSet1_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet1_epi16,bSet1_epi16), 16)); + __m128 mask_0_4_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mask_0_7_epi16)); + __m128 mask_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mask_0_7_epi16, mask_0_7_epi16),16))); + + __m128 bSet2_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet2_epi16)); + __m128 bSet2_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet2_epi16, bSet2_epi16),16)); + + __m128 tale_0_3_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_0_3_ps); + __m128 tale_4_7_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_4_7_ps); + tale_0_3_ps = _mm_blendv_ps(mmParam2s, tale_0_3_ps, bSet2_0_3_ps); + tale_4_7_ps = _mm_blendv_ps(mmParam2s, tale_4_7_ps, bSet2_4_7_ps); + + tale_0_3_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_0_3_ps, mask_0_4_ps); + tale_4_7_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_4_7_ps, mask_4_7_ps); + + t0 = _mm_srai_epi32(Ixy_0, 16); // Iy0 Iy1 Iy2 Iy3 + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16); // Ix0 Ix1 Ix2 Ix3 + + __m128 fy = _mm_cvtepi32_ps(t0); + __m128 fx = 
_mm_cvtepi32_ps(t1); + + // 0 ... 3 + __m128 I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(I_0_7_epi16, I_0_7_epi16), 16)); + + // A11 - A22 + __m128 fxtale = _mm_mul_ps(fx, tale_0_3_ps); + __m128 fytale = _mm_mul_ps(fy, tale_0_3_ps); + + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); + mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); + mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); + + // sumIx und sumIy + mmSumIx = _mm_add_ps(mmSumIx, fxtale); + mmSumIy = _mm_add_ps(mmSumIy, fytale); + + mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fxtale)); + mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fytale)); + + // sumI + __m128 I_tale_ps = _mm_mul_ps(I_ps, tale_0_3_ps); + mmSumI = _mm_add_ps(mmSumI,I_tale_ps); + + // sumW + mmSumW = _mm_add_ps(mmSumW, tale_0_3_ps); + + // sumDI + mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps( I_ps, I_tale_ps)); + + + t0 = _mm_srai_epi32(Ixy_1, 16); // Iy8 Iy9 Iy10 Iy11 + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_1, 16), 16); // Ix0 Ix1 Ix2 Ix3 + + fy = _mm_cvtepi32_ps(t0); + fx = _mm_cvtepi32_ps(t1); + + // 4 ... 
7 + I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(I_0_7_epi16, I_0_7_epi16), 16)); + + // A11 - A22 + fxtale = _mm_mul_ps(fx, tale_4_7_ps); + fytale = _mm_mul_ps(fy, tale_4_7_ps); + + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); + mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); + mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); + + // sumIx und sumIy + mmSumIx = _mm_add_ps(mmSumIx, fxtale); + mmSumIy = _mm_add_ps(mmSumIy, fytale); + + mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fxtale)); + mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fytale)); + + // sumI + I_tale_ps = _mm_mul_ps(I_ps, tale_4_7_ps); + mmSumI = _mm_add_ps(mmSumI, I_tale_ps); + + // sumW + mmSumW = _mm_add_ps(mmSumW, tale_4_7_ps); + + // sumDI + mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps( I_ps, I_tale_ps)); + } + + } #else - for( ; x < _winSize.width*cn; x++, dIptr += 2 ) - { - if( maskPtr[x] == 0) - continue; - - short ixval = static_cast(dIptr[0]); - short iyval = static_cast(dIptr[1]); - int illValue = Iptr[x] * gainVec.x + gainVec.y - Iptr[x]; - - int It[4] = {(Jptr[x+step+cn]<< 5) + illValue, - (Jptr[x+cn]<< 5) + illValue, - (Jptr[x+step]<< 5) + illValue, - (Jptr[x] << 5) + illValue}; - - - float J = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + - Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, - W_BITS1-5); - - int diff = J + illValue; - - - - MEstimatorScale += (diff < MEstimatorScale) ? -eta : eta; - - int abss = (diff < 0) ? 
-diff : diff; - - - // compute the missmatch vector - if( j >= 0) - { - if( abss > fParam1) - { - It[0] = 0; - It[1] = 0; - It[2] = 0; - It[3] = 0; - } - else if( abss > fParam0 && diff >= 0 ) - { - It[0] = param[2] * (It[0] - fParam1); - It[1] = param[2] * (It[1] - fParam1); - It[2] = param[2] * (It[2] - fParam1); - It[3] = param[2] * (It[3] - fParam1); - } - else if( abss > fParam0 && diff < 0 ) - { - It[0] = param[2] * (It[0] + fParam1); - It[1] = param[2] * (It[1] + fParam1); - It[2] = param[2] * (It[2] + fParam1); - It[3] = param[2] * (It[3] + fParam1); - } - } - _b0[0] += (float)(It[0]*dIptr[0]) ; - _b0[1] += (float)(It[1]*dIptr[0]) ; - _b0[2] += (float)(It[2]*dIptr[0]) ; - _b0[3] += (float)(It[3]*dIptr[0]) ; - - - _b1[0] += (float)(It[0]*dIptr[1]) ; - _b1[1] += (float)(It[1]*dIptr[1]) ; - _b1[2] += (float)(It[2]*dIptr[1]) ; - _b1[3] += (float)(It[3]*dIptr[1]) ; - - _b2[0] += (float)(It[0])*Iptr[x] ; - _b2[1] += (float)(It[1])*Iptr[x] ; - _b2[2] += (float)(It[2])*Iptr[x] ; - _b2[3] += (float)(It[3])*Iptr[x] ; - - _b3[0] += (float)(It[0]); - _b3[1] += (float)(It[1]); - _b3[2] += (float)(It[2]); - _b3[3] += (float)(It[3]); - - // compute the Gradient Matrice - if( j == 0) - { - float tale = param[2] * FLT_RESCALE; - if( abss < fParam0 || j < 0) - { - tale = FLT_RESCALE; - } - else if( abss > fParam1) - { - tale *= 0.01f; - } - else - { - tale *= param[2]; - } - if( j == 0 ) - { - A11 += (float)(ixval*ixval)*tale; - A12 += (float)(ixval*iyval)*tale; - A22 += (float)(iyval*iyval)*tale; - } - - dI += Iptr[x] * Iptr[x] * tale; - float dx = static_cast(dIptr[0]) * tale; - float dy = static_cast(dIptr[1]) * tale; - sumIx += dx; - sumIy += dy; - w1 += dx * Iptr[x]; - w2 += dy * Iptr[x]; - sumI += Iptr[x] * tale; - sumW += tale; - } - - } + for( ; x < _winSize.width*cn; x++, dIptr += 2 ) + { + if( maskPtr[x] == 0) + continue; + + short ixval = static_cast(dIptr[0]); + short iyval = static_cast(dIptr[1]); + int illValue = Iptr[x] * gainVec.x + gainVec.y - Iptr[x]; + + 
int It[4] = {(Jptr[x+step+cn]<< 5) + illValue, + (Jptr[x+cn]<< 5) + illValue, + (Jptr[x+step]<< 5) + illValue, + (Jptr[x] << 5) + illValue}; + + + float J = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + + Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, + W_BITS1-5); + + int diff = J + illValue; + + + + MEstimatorScale += (diff < MEstimatorScale) ? -eta : eta; + + int abss = (diff < 0) ? -diff : diff; + + + // compute the missmatch vector + if( j >= 0) + { + if( abss > fParam1) + { + It[0] = 0; + It[1] = 0; + It[2] = 0; + It[3] = 0; + } + else if( abss > fParam0 && diff >= 0 ) + { + It[0] = param[2] * (It[0] - fParam1); + It[1] = param[2] * (It[1] - fParam1); + It[2] = param[2] * (It[2] - fParam1); + It[3] = param[2] * (It[3] - fParam1); + } + else if( abss > fParam0 && diff < 0 ) + { + It[0] = param[2] * (It[0] + fParam1); + It[1] = param[2] * (It[1] + fParam1); + It[2] = param[2] * (It[2] + fParam1); + It[3] = param[2] * (It[3] + fParam1); + } + } + _b0[0] += (float)(It[0]*dIptr[0]) ; + _b0[1] += (float)(It[1]*dIptr[0]) ; + _b0[2] += (float)(It[2]*dIptr[0]) ; + _b0[3] += (float)(It[3]*dIptr[0]) ; + + + _b1[0] += (float)(It[0]*dIptr[1]) ; + _b1[1] += (float)(It[1]*dIptr[1]) ; + _b1[2] += (float)(It[2]*dIptr[1]) ; + _b1[3] += (float)(It[3]*dIptr[1]) ; + + _b2[0] += (float)(It[0])*Iptr[x] ; + _b2[1] += (float)(It[1])*Iptr[x] ; + _b2[2] += (float)(It[2])*Iptr[x] ; + _b2[3] += (float)(It[3])*Iptr[x] ; + + _b3[0] += (float)(It[0]); + _b3[1] += (float)(It[1]); + _b3[2] += (float)(It[2]); + _b3[3] += (float)(It[3]); + + // compute the Gradient Matrice + if( j == 0) + { + float tale = param[2] * FLT_RESCALE; + if( abss < fParam0 || j < 0) + { + tale = FLT_RESCALE; + } + else if( abss > fParam1) + { + tale *= 0.01f; + } + else + { + tale *= param[2]; + } + if( j == 0 ) + { + A11 += (float)(ixval*ixval)*tale; + A12 += (float)(ixval*iyval)*tale; + A22 += (float)(iyval*iyval)*tale; + } + + dI += Iptr[x] * Iptr[x] * tale; + float dx = static_cast(dIptr[0]) * tale; + float dy = 
static_cast(dIptr[1]) * tale; + sumIx += dx; + sumIy += dy; + w1 += dx * Iptr[x]; + w2 += dy * Iptr[x]; + sumI += Iptr[x] * tale; + sumW += tale; + } + + } #endif - } - -#ifdef CV_SSE2 - short etaValues[8]; - _mm_storeu_si128((__m128i*)(etaValues), mmEta); - MEstimatorScale += eta * (etaValues[0] + etaValues[1] + etaValues[2] + etaValues[3] - + etaValues[4] + etaValues[5] + etaValues[6] + etaValues[7]); - -#endif - float CV_DECL_ALIGNED(32) wbuf[4];// - if( j == 0 ) - { -#ifdef CV_SSE2 - _mm_store_ps(wbuf, mmSumW1); - w1 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumW2); - w2 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumDI); - dI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumI); - sumI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumIx); - sumIx = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumIy); - sumIy = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumW); - sumW = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + } + +#ifdef CV_SSE4_1 + short etaValues[8]; + _mm_storeu_si128((__m128i*)(etaValues), mmEta); + MEstimatorScale += eta * (etaValues[0] + etaValues[1] + etaValues[2] + etaValues[3] + + etaValues[4] + etaValues[5] + etaValues[6] + etaValues[7]); + +#endif + float CV_DECL_ALIGNED(32) wbuf[4];// + if( j == 0 ) + { +#ifdef CV_SSE4_1 + _mm_store_ps(wbuf, mmSumW1); + w1 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumW2); + w2 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumDI); + dI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumI); + sumI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumIx); + sumIx = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumIy); + sumIy = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumW); + sumW = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; #endif - sumIx *= -FLT_SCALE; - sumIy *= -FLT_SCALE; - sumI *=FLT_SCALE; - sumW *= 
FLT_SCALE; - w1 *= -FLT_SCALE; - w2 *= -FLT_SCALE; - dI *= FLT_SCALE; - - -#ifdef CV_SSE2 - float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];// - - _mm_store_ps(A11buf, mmAxx); - _mm_store_ps(A12buf, mmAxy); - _mm_store_ps(A22buf, mmAyy); - - - A11 = A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; - A12 = A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; - A22 = A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; + sumIx *= -FLT_SCALE; + sumIy *= -FLT_SCALE; + sumI *=FLT_SCALE; + sumW *= FLT_SCALE; + w1 *= -FLT_SCALE; + w2 *= -FLT_SCALE; + dI *= FLT_SCALE; + + +#ifdef CV_SSE4_1 + float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];// + + _mm_store_ps(A11buf, mmAxx); + _mm_store_ps(A12buf, mmAxy); + _mm_store_ps(A22buf, mmAyy); + + + A11 = A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; + A12 = A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; + A22 = A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; #endif - A11 *= FLT_SCALE; // 54866744. - A12 *= FLT_SCALE; // -628764.00 - A22 *= FLT_SCALE; // 19730.000 - - D = - A12*A12*sumI*sumI + dI*sumW*A12*A12 + 2*A12*sumI*sumIx*w2 + 2*A12*sumI*sumIy*w1 - - 2*dI*A12*sumIx*sumIy - 2*sumW*A12*w1*w2 + A11*A22*sumI*sumI - 2*A22*sumI*sumIx*w1 - - 2*A11*sumI*sumIy*w2 - sumIx*sumIx*w2*w2 + A22*dI*sumIx*sumIx + 2*sumIx*sumIy*w1*w2 - - sumIy*sumIy*w1*w1 + A11*dI*sumIy*sumIy + A22*sumW*w1*w1 + A11*sumW*w2*w2 - A11*A22*dI*sumW; - - float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + - 4.f*A12*A12))/(2*winArea); - if( minEig < minEigThreshold || std::abs(D) < FLT_EPSILON ) - { - if( level == 0 && status ) - status[ptidx] = 0; - if( level > 0) - { - nextPts[ptidx] = backUpNextPt; - gainVecs[ptidx] = backUpGain; - } - noIteration++; - break; - } - - - D = (1.f / D); - - invTensorMat.at(0,0) = (A22*sumI*sumI - 2*sumI*sumIy*w2 + dI*sumIy*sumIy + sumW*w2*w2 - A22*dI*sumW)* D; - invTensorMat.at(0,1) = (A12*dI*sumW - A12*sumI * sumI - dI*sumIx*sumIy + sumI*sumIx*w2 + sumI*sumIy*w1 - sumW*w1*w2)* D; - invTensorMat.at(0,2) = 
(A12*sumI*sumIy - sumIy*sumIy*w1 - A22*sumI*sumIx - A12*sumW*w2 + A22*sumW*w1 + sumIx*sumIy*w2)* D; - invTensorMat.at(0,3) = (A22*dI*sumIx - A12*dI*sumIy - sumIx*w2*w2 + A12*sumI*w2 - A22*sumI*w1 + sumIy*w1*w2) * D; - invTensorMat.at(1,0) = invTensorMat.at(0,1); - invTensorMat.at(1,1) = (A11*sumI * sumI - 2*sumI*sumIx*w1 + dI*sumIx * sumIx + sumW*w1*w1 - A11*dI*sumW) * D; - invTensorMat.at(1,2) = (A12*sumI*sumIx - A11*sumI*sumIy - sumIx * sumIx*w2 + A11*sumW*w2 - A12*sumW*w1 + sumIx*sumIy*w1) * D; - invTensorMat.at(1,3) = (A11*dI*sumIy - sumIy*w1*w1 - A12*dI*sumIx - A11*sumI*w2 + A12*sumI*w1 + sumIx*w1*w2)* D; - invTensorMat.at(2,0) = invTensorMat.at(0,2); - invTensorMat.at(2,1) = invTensorMat.at(1,2); - invTensorMat.at(2,2) = (sumW*A12*A12 - 2*A12*sumIx*sumIy + A22*sumIx*sumIx + A11*sumIy*sumIy - A11*A22*sumW)* D; - invTensorMat.at(2,3) = (A11*A22*sumI - A12*A12*sumI - A11*sumIy*w2 + A12*sumIx*w2 + A12*sumIy*w1 - A22*sumIx*w1)* D; - invTensorMat.at(3,0) = invTensorMat.at(0,3); - invTensorMat.at(3,1) = invTensorMat.at(1,3); - invTensorMat.at(3,2) = invTensorMat.at(2,3); - invTensorMat.at(3,3) = (dI*A12*A12 - 2*A12*w1*w2 + A22*w1*w1 + A11*w2*w2 - A11*A22*dI)* D; - } - - - #ifdef CV_SSE2 - float CV_DECL_ALIGNED(16) bbuf[4]; - for(int mmi = 0; mmi < 4; mmi++) - { - - _mm_store_ps(bbuf, _mm_add_ps(qb0[mmi], qb1[mmi])); - _b0[mmi] = bbuf[0] + bbuf[2]; - _b1[mmi] = bbuf[1] + bbuf[3]; - _mm_store_ps(bbuf, qb2[mmi]); - _b2[mmi] = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; - _mm_store_ps(bbuf, qb3[mmi]); - _b3[mmi] = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; - - } - #endif - - _b0[0] *= FLT_SCALE;_b0[1] *= FLT_SCALE;_b0[2] *= FLT_SCALE;_b0[3] *= FLT_SCALE; - _b1[0] *= FLT_SCALE;_b1[1] *= FLT_SCALE;_b1[2] *= FLT_SCALE;_b1[3] *= FLT_SCALE; - _b2[0] *= FLT_SCALE;_b2[1] *= FLT_SCALE;_b2[2] *= FLT_SCALE;_b2[3] *= FLT_SCALE; - _b3[0] *= FLT_SCALE;_b3[1] *= FLT_SCALE;_b3[2] *= FLT_SCALE;_b3[3] *= FLT_SCALE; - - - Mc0[0] = _b0[0] - _b0[1] - _b0[2] + _b0[3]; - Mc0[1] = _b1[0] - _b1[1] - 
_b1[2] + _b1[3]; - Mc0[2] = -(_b2[0] - _b2[1] - _b2[2] + _b2[3]); - Mc0[3] = -(_b3[0] - _b3[1] - _b3[2] + _b3[3]); - - Mc1[0] = _b0[1] - _b0[3]; - Mc1[1] = _b1[1] - _b1[3]; - Mc1[2] = -(_b2[1] - _b2[3]); - Mc1[3] = -(_b3[1] - _b3[3]); - - - Mc2[0] = _b0[2] - _b0[3]; - Mc2[1] = _b1[2] - _b1[3]; - Mc2[2] = -(_b2[2] - _b2[3]); - Mc2[3] = -(_b3[2] - _b3[3]); - - - Mc3[0] = _b0[3]; - Mc3[1] = _b1[3]; - Mc3[2] = -_b2[3]; - Mc3[3] = -_b3[3]; - - // - c[0] = -Mc0[0]; - c[1] = -Mc1[0]; - c[2] = -Mc2[0]; - c[3] = -Mc3[0]; - c[4] = -Mc0[1]; - c[5] = -Mc1[1]; - c[6] = -Mc2[1]; - c[7] = -Mc3[1]; - - float e0 = 1.f / (c[6] * c[0] - c[4] * c[2]); - float e1 = e0 * 0.5f * (c[6] * c[1] + c[7] * c[0] - c[5] * c[2] - c[4] * c[3]); - float e2 = e0 * (c[1] * c[7] -c[3] * c[5]); - e0 = e1 * e1 - e2; - hasSolved = false; - if ( e0 > 0) - { - e0 = sqrt(e0); - float _y[2] = {-e1 - e0, e0 - e1}; - float c0yc1[2] = {c[0] * _y[0] + c[1], - c[0] * _y[1] + c[1]}; - float _x[2] = {-(c[2] * _y[0] + c[3]) / c0yc1[0], - -(c[2] * _y[1] + c[3]) / c0yc1[1]}; - float c0yc2[2] = {c[0] * _y[0] + c[2], - c[0] * _y[1] + c[2]}; - - float c4yc6[2] = {c[4] * _y[0] + c[6], - c[4] * _y[1] + c[6]}; - - float c0xc1[2] = {c[0] * _x[0] + c[1], - c[0] * _x[1] + c[1]}; - - float c4xc5[2] = {c[4] * _x[0] + c[5], - c[4] * _x[1] + c[5]}; - bool isIn1 = (_x[0] >=0 && _x[0] <=1 && _y[0] >= 0 && _y[0] <= 1); - bool isIn2 = (_x[1] >=0 && _x[1] <=1 && _y[1] >= 0 && _y[1] <= 1); - - bool isSolution1 = checkSolution(_x[0], _y[0], c ); - bool isSolution2 = checkSolution(_x[1], _y[1], c ); - bool isSink1 = isIn1 && isSolution1; - bool isSink2 = isIn2 && isSolution2; - - if ( isSink1 != isSink2) - { - a = isSink1 ? _x[0] : _x[1]; - b = isSink1 ? 
_y[0] : _y[1]; - ab = a * b; - hasSolved = true; - delta.x = inextPt.x + a - nextPt.x; - delta.y = inextPt.y + b - nextPt.y; - - cv::Vec4f mismatchVec = ab * Mc0 + Mc1 *a + Mc2 * b + Mc3; - deltaGain = est_DeltaGain(invTensorMat, mismatchVec); - - } // isIn1 != isIn2 - } - if( hasSolved == false) - noIteration++; - } - else - { - hasSolved = false; - noReusedIteration++; - } - if( hasSolved == false ) - { - - cv::Vec4f mismatchVec = ab * Mc0 + Mc1 *a + Mc2 * b + Mc3; - est_Result(invTensorMat, mismatchVec, delta, deltaGain); - - delta.x = MAX(-1.f, MIN( 1.f , delta.x)); - delta.y = MAX(-1.f, MIN( 1.f , delta.y)); - - - if( j == 0) - prevGain = deltaGain; - gainVec += deltaGain; - nextPt += delta ; - nextPts[ptidx] = nextPt + halfWin; - gainVecs[ptidx]= gainVec; - - } - else - { - nextPt += delta; - nextPts[ptidx] = nextPt + halfWin; - gainVecs[ptidx]= gainVec + deltaGain; - noSolvedIteration++; - break; - } - - - cv::Point2f totalDistance = backUpNextPt - nextPts[ptidx]; - - if(j > 0 && ( - (std::abs(delta.x - prevDelta.x) < 0.01 && std::abs(delta.y - prevDelta.y) < 0.01) - || ((delta.ddot(delta) <= 0.001) && std::abs(prevGain.x - deltaGain.x) < 0.01))) - { - nextPts[ptidx] -= delta*0.5f; - gainVecs[ptidx] -= deltaGain* 0.5f; - break; - } - - - prevDelta = delta; - prevGain = deltaGain; - } - - } - - } - - const Mat* prevImg; - const Mat* nextImg; - const Mat* prevDeriv; - const Mat* rgbPrevImg; - const Mat* rgbNextImg; - const Point2f* prevPts; - Point2f* nextPts; - uchar* status; - cv::Point2f* gainVecs; // gain vector x -> multiplier y -> offset - float* err; - int maxWinSize; - int minWinSize; - TermCriteria criteria; - int level; - int maxLevel; - int windowType; - float minEigThreshold; - bool useInitialFlow; - std::vector paramReset; - int crossSegmentationThreshold; - - }; - } - } - namespace beplk - { - namespace ica{ - class TrackerInvoker : public cv::ParallelLoopBody - { - public: - TrackerInvoker(const Mat& _prevImg, - const Mat& _prevDeriv, - const 
Mat& _nextImg, - const Mat& _rgbPrevImg, - const Mat& _rgbNextImg, - const Point2f* _prevPts, - Point2f* _nextPts, - uchar* _status, - float* _err, - int _level, - int _maxLevel, - int _winSize[2], - int _maxIteration, - bool _useInitialFlow, - int _supportRegionType, - int _crossSegmentationThreshold, - float _minEigenValue) - { - prevImg = &_prevImg; - prevDeriv = &_prevDeriv; - nextImg = &_nextImg; - rgbPrevImg = &_rgbPrevImg; - rgbNextImg = &_rgbNextImg; - prevPts = _prevPts; - nextPts = _nextPts; - status = _status; - err = _err; - minWinSize = _winSize[0]; - maxWinSize = _winSize[1]; - criteria.maxCount = _maxIteration; - criteria.epsilon = 0.01; - level = _level; - maxLevel = _maxLevel; - windowType = _supportRegionType; - minEigThreshold = _minEigenValue; - useInitialFlow = _useInitialFlow; - crossSegmentationThreshold = _crossSegmentationThreshold; - } - - TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; - - void operator()(const cv::Range& range) const - { - cv::Size winSize; - cv::Point2f halfWin; - - const Mat& I = *prevImg; - const Mat& J = *nextImg; - const Mat& derivI = *prevDeriv; - const Mat& BI = *rgbPrevImg; - - winSize = cv::Size(maxWinSize,maxWinSize); - int winMaskwidth = static_cast(ceil(winSize.width / 8.f)) * 16; - cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); - winMaskMatBuf.setTo(1); - - const float FLT_SCALE = (1.f/(1 << 20)); // 20 - - int j, cn = I.channels(), cn2 = cn*2; - int winbufwidth = static_cast(ceil(winSize.width / 8.f)) * 8; - cv::Size winBufSize(winbufwidth,winbufwidth); - - std::vector _buf(winBufSize.area()*(cn + cn2)); - Mat IWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn), &_buf[0]); - Mat derivIWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn2), &_buf[winBufSize.area()*cn]); - - for( int ptidx = range.start; ptidx < range.end; ptidx++ ) - { - Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level)); - Point2f nextPt; - if( level == maxLevel ) - { - if( useInitialFlow ) - nextPt = 
nextPts[ptidx]*(float)(1./(1 << level)); - else - nextPt = prevPt; - } - else - nextPt = nextPts[ptidx]*2.f; - nextPts[ptidx] = nextPt; - - Point2i iprevPt, inextPt; - iprevPt.x = cvFloor(prevPt.x); - iprevPt.y = cvFloor(prevPt.y); - int winArea = maxWinSize * maxWinSize; - cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0,0, maxWinSize,maxWinSize)); - - if( calcWinMaskMat(BI, windowType, iprevPt, - winMaskMat,winSize,halfWin,winArea, - minWinSize,maxWinSize, crossSegmentationThreshold) == false) - continue; - - prevPt -= halfWin; - iprevPt.x = cvFloor(prevPt.x); - iprevPt.y = cvFloor(prevPt.y); - - if( iprevPt.x + halfWin.x < 0 || iprevPt.x + halfWin.x>= derivI.cols || - iprevPt.y + halfWin.y < 0 || iprevPt.y + halfWin.y >= derivI.rows ) - { - if( level == 0 ) - { - if( status ) - status[ptidx] = 3; - if( err ) - err[ptidx] = 0; - } - continue; - } - - float a = prevPt.x - iprevPt.x; - float b = prevPt.y - iprevPt.y; - const int W_BITS = 14, W_BITS1 = 14; - - int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); - int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); - int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); - int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - - int dstep = (int)(derivI.step/derivI.elemSize1()); - int step = (int)(I.step/I.elemSize1()); - int mStep = (int)(winMaskMat.step/winMaskMat.elemSize1()); - CV_Assert( step == (int)(J.step/J.elemSize1()) ); - float A11 = 0, A12 = 0, A22 = 0; - - #ifdef CV_SSE2 - __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); - __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); - __m128i z = _mm_setzero_si128(); - __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1-1)); - __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1-5-1)); - __m128 qA11 = _mm_setzero_ps(), qA12 = _mm_setzero_ps(), qA22 = _mm_setzero_ps(); - __m128i mmMask4_epi32; - get4BitMask(winSize.width, mmMask4_epi32); - #endif - - - int x, y; - for( y = 0; y < winSize.height; y++ ) - { - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - const 
uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; - const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; - - short* Iptr = (short*)(IWinBuf.data + y*IWinBuf.step); - short* dIptr = (short*)(derivIWinBuf.data + y*derivIWinBuf.step); - - x = 0; - - #ifdef CV_SSE2 - for( ; x < winSize.width*cn; x += 4, dsrc += 4*2, dIptr += 4*2 ) - { - __m128i wMask = _mm_set_epi32(MaskSet * maskPtr[x+3], - MaskSet * maskPtr[x+2], - MaskSet * maskPtr[x+1], - MaskSet * maskPtr[x]); - __m128i v00, v01, v10, v11, t0, t1; - v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); - v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); - v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); - v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); - - t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); - - t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); - _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0,t0)); - - v00 = _mm_loadu_si128((const __m128i*)(dsrc)); - v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); - v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep)); - v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); - - t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); - t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); - t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); - t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); - v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... 
- - if( x + 4 > winSize.width) - { - v00 = _mm_and_si128(v00, mmMask4_epi32); - } - v00 = _mm_and_si128(v00, wMask); - - _mm_storeu_si128((__m128i*)dIptr, v00); - t0 = _mm_srai_epi32(v00, 16); // Iy0 Iy1 Iy2 Iy3 - t1 = _mm_srai_epi32(_mm_slli_epi32(v00, 16), 16); // Ix0 Ix1 Ix2 Ix3 - - __m128 fy = _mm_cvtepi32_ps(t0); - __m128 fx = _mm_cvtepi32_ps(t1); - - qA22 = _mm_add_ps(qA22, _mm_mul_ps(fy, fy)); - qA12 = _mm_add_ps(qA12, _mm_mul_ps(fx, fy)); - qA11 = _mm_add_ps(qA11, _mm_mul_ps(fx, fx)); - } - #else - - for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2) - { - if( maskPtr[x] == 0) - { - dIptr[0] = (short)0; - dIptr[1] = (short)0; - - continue; - } - int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + - src[x+step]*iw10 + src[x+step+cn]*iw11, W_BITS1-5); - int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + - dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1); - int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 + - dsrc[dstep+cn2+1]*iw11, W_BITS1); - - Iptr[x] = (short)ival; - dIptr[0] = (short)ixval; - dIptr[1] = (short)iyval; - A11 += (float)(ixval*ixval); - A12 += (float)(ixval*iyval); - A22 += (float)(iyval*iyval); - - } - - - #endif - - } - - #ifdef CV_SSE2 - float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4]; - _mm_store_ps(A11buf, qA11); - _mm_store_ps(A12buf, qA12); - _mm_store_ps(A22buf, qA22); - A11 += A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; - A12 += A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; - A22 += A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; - #endif - - A11 *= FLT_SCALE; - A12 *= FLT_SCALE; - A22 *= FLT_SCALE; - - - float D = A11*A22 - A12*A12; - float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + - 4.f*A12*A12))/(2 * winArea); - - if( err ) - err[ptidx] = (float)minEig; - - if( D < FLT_EPSILON ) - { - if( level == 0 && status ) - status[ptidx] = 0; - continue; - } - - D = 1.f/D; - - cv::Point2f backUpNextPt = nextPt; - nextPt -= halfWin; - Point2f prevDelta(0,0); - - float c[8]; - // 
init the maske - __m128i mmMask_epi32 = _mm_set_epi32(0, std::numeric_limits::max(), 0, std::numeric_limits::max()); - __m128i mmMask_epi16 = _mm_set_epi16(0, std::numeric_limits::max(), 0, std::numeric_limits::max(), - 0, std::numeric_limits::max(), 0, std::numeric_limits::max()); - __m128i mmMask0, mmMask1, mmMask; - getWBitMask(winSize.width, mmMask0, mmMask1, mmMask); - for( j = 0; j < criteria.maxCount; j++ ) - { - cv::Point2f delta; - bool hasSolved = false; - a = nextPt.x - cvFloor(nextPt.x); - b = nextPt.y - cvFloor(nextPt.y); - float ab = a * b; - - - if( (inextPt.x != cvFloor(nextPt.x) || inextPt.y != cvFloor(nextPt.y) || j == 0)) - { - inextPt.x = cvFloor(nextPt.x); - inextPt.y = cvFloor(nextPt.y); - - - if( inextPt.x < -winSize.width || inextPt.x >= J.cols || - inextPt.y < -winSize.height || inextPt.y >= J.rows ) - { - if( level == 0 && status ) - status[ptidx] = 3; - if (level > 0) - nextPts[ptidx] = backUpNextPt; - break; - } - - - iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); - iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); - iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); - iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - - float _b1[4] = {0,0,0,0}; - float _b2[4] = {0,0,0,0}; - #ifdef CV_SSE2 - __m128 qbc0[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; - __m128 qbc1[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; - qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); - qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); - __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps(); - #endif - for( y = 0; y < winSize.height; y++ ) - { - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x )*cn; - const short* Iptr = (const short*)(IWinBuf.data + ( y)*IWinBuf.step); - const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); - x = 0; - #ifdef CV_SSE2 - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - for( ; x <= 
winSize.width*cn; x += 8, dIptr += 8*2 ) - { - if( maskPtr[x ] == 0 && maskPtr[x+1] == 0 && maskPtr[x+2] == 0 && maskPtr[x+3] == 0 - && maskPtr[x+4] == 0 && maskPtr[x+5] == 0 && maskPtr[x+6] == 0 && maskPtr[x+7] == 0) - continue; - __m128i diff0 = _mm_loadu_si128((const __m128i*)(Iptr + x)); - __m128i v00 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x)), z); - __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); - __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); - __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); - - __m128i mmDiffc_epi16[4] = - { _mm_subs_epi16(_mm_slli_epi16(v00, 5), diff0), - _mm_subs_epi16(_mm_slli_epi16(v01, 5), diff0), - _mm_subs_epi16(_mm_slli_epi16(v10, 5), diff0), - _mm_subs_epi16(_mm_slli_epi16(v11, 5), diff0) - }; - - __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... - __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); - - if( x > winSize.width*cn - 8) - { - Ixy_0 = _mm_and_si128(Ixy_0, mmMask0); - Ixy_1 = _mm_and_si128(Ixy_1, mmMask1); - } - - __m128i diffc1[4] = - {_mm_unpackhi_epi16(mmDiffc_epi16[0],mmDiffc_epi16[0]), - _mm_unpackhi_epi16(mmDiffc_epi16[1],mmDiffc_epi16[1]), - _mm_unpackhi_epi16(mmDiffc_epi16[2],mmDiffc_epi16[2]), - _mm_unpackhi_epi16(mmDiffc_epi16[3],mmDiffc_epi16[3]) - }; - - __m128i diffc0[4] = - {_mm_unpacklo_epi16(mmDiffc_epi16[0],mmDiffc_epi16[0]), - _mm_unpacklo_epi16(mmDiffc_epi16[1],mmDiffc_epi16[1]), - _mm_unpacklo_epi16(mmDiffc_epi16[2],mmDiffc_epi16[2]), - _mm_unpacklo_epi16(mmDiffc_epi16[3],mmDiffc_epi16[3]) - }; - - - // It * Ix It * Iy [0 ... 
3] - //mask split for multiplication - // for set 1 lo sigma 1 - - - v10 = _mm_mullo_epi16(Ixy_0, diffc0[0]); - v11 = _mm_mulhi_epi16(Ixy_0, diffc0[0]); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qbc0[0] = _mm_add_ps(qbc0[0], _mm_cvtepi32_ps(v00)); - qbc1[0] = _mm_add_ps(qbc1[0], _mm_cvtepi32_ps(v10)); - - v10 = _mm_mullo_epi16(Ixy_0, diffc0[1]); - v11 = _mm_mulhi_epi16(Ixy_0, diffc0[1]); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qbc0[1] = _mm_add_ps(qbc0[1], _mm_cvtepi32_ps(v00)); - qbc1[1] = _mm_add_ps(qbc1[1], _mm_cvtepi32_ps(v10)); - - v10 = _mm_mullo_epi16(Ixy_0, diffc0[2]); - v11 = _mm_mulhi_epi16(Ixy_0, diffc0[2]); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qbc0[2] = _mm_add_ps(qbc0[2], _mm_cvtepi32_ps(v00)); - qbc1[2] = _mm_add_ps(qbc1[2], _mm_cvtepi32_ps(v10)); - - v10 = _mm_mullo_epi16(Ixy_0, diffc0[3]); - v11 = _mm_mulhi_epi16(Ixy_0, diffc0[3]); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qbc0[3] = _mm_add_ps(qbc0[3], _mm_cvtepi32_ps(v00)); - qbc1[3] = _mm_add_ps(qbc1[3], _mm_cvtepi32_ps(v10)); - // It * Ix It * Iy [4 ... 
7] - // for set 1 hi sigma 1 - - v10 = _mm_mullo_epi16(Ixy_1, diffc1[0]); - v11 = _mm_mulhi_epi16(Ixy_1, diffc1[0]); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qbc0[0] = _mm_add_ps(qbc0[0], _mm_cvtepi32_ps(v00)); - qbc1[0] = _mm_add_ps(qbc1[0], _mm_cvtepi32_ps(v10)); - - v10 = _mm_mullo_epi16(Ixy_1, diffc1[1]); - v11 = _mm_mulhi_epi16(Ixy_1, diffc1[1]); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qbc0[1] = _mm_add_ps(qbc0[1], _mm_cvtepi32_ps(v00)); - qbc1[1] = _mm_add_ps(qbc1[1], _mm_cvtepi32_ps(v10)); - - v10 = _mm_mullo_epi16(Ixy_1, diffc1[2]); - v11 = _mm_mulhi_epi16(Ixy_1, diffc1[2]); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qbc0[2] = _mm_add_ps(qbc0[2], _mm_cvtepi32_ps(v00)); - qbc1[2] = _mm_add_ps(qbc1[2], _mm_cvtepi32_ps(v10)); - - v10 = _mm_mullo_epi16(Ixy_1, diffc1[3]); - v11 = _mm_mulhi_epi16(Ixy_1, diffc1[3]); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qbc0[3] = _mm_add_ps(qbc0[3], _mm_cvtepi32_ps(v00)); - qbc1[3] = _mm_add_ps(qbc1[3], _mm_cvtepi32_ps(v10)); - - } - #else - for( ; x < winSize.width*cn; x++, dIptr += 2 ) - { - if( dIptr[0] == 0 && dIptr[1] == 0) - continue; - /*short It[4] = {CV_DESCALE((Jptr[x]) *(1 << W_BITS),W_BITS1-5) - Iptr[x], - CV_DESCALE((Jptr[x+cn]) *(1 << W_BITS),W_BITS1-5) - Iptr[x], - CV_DESCALE((Jptr[x+step]) *(1 << W_BITS),W_BITS1-5) - Iptr[x], - CV_DESCALE((Jptr[x+step+cn])*(1 << W_BITS),W_BITS1-5) - Iptr[x]};*/ - short It[4] = {(Jptr[x] << 5) - Iptr[x], - (Jptr[x+cn]<< 5) - Iptr[x], - (Jptr[x+step]<< 5) - Iptr[x], - (Jptr[x+step+cn]<< 5) - Iptr[x]}; - - _b1[0] += (float)(It[0]*dIptr[0]); - _b1[1] += (float)(It[1]*dIptr[0]); - _b1[2] += (float)(It[2]*dIptr[0]); - _b1[3] += (float)(It[3]*dIptr[0]); - - _b2[0] += (float)(It[0]*dIptr[1]); - _b2[1] += (float)(It[1]*dIptr[1]); - _b2[2] += (float)(It[2]*dIptr[1]); - _b2[3] += (float)(It[3]*dIptr[1]); - } - #endif - - } - - #ifdef CV_SSE2 - 
float CV_DECL_ALIGNED(16) bbuf[4]; - _mm_store_ps(bbuf, _mm_add_ps(qbc0[0], qbc1[0])); - _b1[0] += bbuf[0] + bbuf[2]; - _b2[0] += bbuf[1] + bbuf[3]; - - _mm_store_ps(bbuf, _mm_add_ps(qbc0[1], qbc1[1])); - _b1[1] += bbuf[0] + bbuf[2]; - _b2[1] += bbuf[1] + bbuf[3]; - - _mm_store_ps(bbuf, _mm_add_ps(qbc0[2], qbc1[2])); - _b1[2] += bbuf[0] + bbuf[2]; - _b2[2] += bbuf[1] + bbuf[3]; - - _mm_store_ps(bbuf, _mm_add_ps(qbc0[3], qbc1[3])); - _b1[3] += bbuf[0] + bbuf[2]; - _b2[3] += bbuf[1] + bbuf[3]; - - #endif - _b1[0] *= FLT_SCALE; - _b1[1] *= FLT_SCALE; - _b1[2] *= FLT_SCALE; - _b1[3] *= FLT_SCALE; - _b2[0] *= FLT_SCALE; - _b2[1] *= FLT_SCALE; - _b2[2] *= FLT_SCALE; - _b2[3] *= FLT_SCALE; - - float c0[4] = { _b1[3] + _b1[0] - _b1[2] - _b1[1], - _b1[1] - _b1[0], - _b1[2] - _b1[0], - _b1[0]}; - float c1[4] = { _b2[3] + _b2[0] - _b2[2] - _b2[1], - _b2[1] - _b2[0], - _b2[2] - _b2[0], - _b2[0]}; - - float DA12 = A12 * D ; - float DA22 = A22 * D ; - float DA11 = A11 * D ; - c[0] = DA12 * c1[0] - DA22 * c0[0]; - c[1] = DA12 * c1[1] - DA22 * c0[1]; - c[2] = DA12 * c1[2] - DA22 * c0[2]; - c[3] = DA12 * c1[3] - DA22 * c0[3]; - c[4] = DA12 * c0[0] - DA11 * c1[0]; - c[5] = DA12 * c0[1] - DA11 * c1[1]; - c[6] = DA12 * c0[2] - DA11 * c1[2]; - c[7] = DA12 * c0[3] - DA11 * c1[3]; - - float e0 = 1.f / (c[6] * c[0] - c[4] * c[2]); - float e1 = e0 * 0.5f * (c[6] * c[1] + c[7] * c[0] - c[5] * c[2] - c[4] * c[3]); - float e2 = e0 * (c[1] * c[7] -c[3] * c[5]); - e0 = e1 * e1 - e2; - if ( e0 >= 0) - { - e0 = sqrt(e0); - float _y[2] = {-e1 - e0, e0 - e1}; - float c0yc1[2] = {c[0] * _y[0] + c[1], - c[0] * _y[1] + c[1]}; - - float _x[2] = {-(c[2] * _y[0] + c[3]) / c0yc1[0], - -(c[2] * _y[1] + c[3]) / c0yc1[1]}; - - float c0yc2[2] = {c[0] * _y[0] + c[2], - c[0] * _y[1] + c[2]}; - - float c4yc6[2] = {c[4] * _y[0] + c[6], - c[4] * _y[1] + c[6]}; - - float c0xc1[2] = {c[0] * _x[0] + c[1], - c[0] * _x[1] + c[1]}; - - float c4xc5[2] = {c[4] * _x[0] + c[5], - c[4] * _x[1] + c[5]}; - bool isIn1 = (_x[0] 
>=0 && _x[0] <=1 && _y[0] >= 0 && _y[0] <= 1); - bool isIn2 = (_x[1] >=0 && _x[1] <=1 && _y[1] >= 0 && _y[1] <= 1); - - - bool isSink1 = isIn1 && checkSolution(_x[0], _y[0], c ); - bool isSink2 = isIn2 && checkSolution(_x[1], _y[1], c ); - - - //if( isSink1 != isSink2 ) - { - if( isSink1 ) - { - hasSolved = true; - delta.x = inextPt.x + _x[0] - nextPt.x; - delta.y = inextPt.y + _y[0] - nextPt.y; - } - if (isSink2 ) - { - hasSolved = true; - delta.x = inextPt.x + _x[1] - nextPt.x; - delta.y = inextPt.y + _y[1] - nextPt.y; - } - } // isIn1 != isIn2 - } // e0 >= 0 - } - else - hasSolved = false; - if(hasSolved == false) - { - delta = Point2f( c[0] * ab + c[1] * a + c[2] * b + c[3], - c[4] * ab + c[5] * a + c[6] * b + c[7]); - nextPt += 0.7 * delta; - nextPts[ptidx] = nextPt + halfWin; - } - else - { - nextPt += delta; - nextPts[ptidx] = nextPt + halfWin; - break; - } - - - if( delta.ddot(delta) <= criteria.epsilon) - break; - - if(j > 0 && std::abs(delta.x - prevDelta.x) < 0.01 && - std::abs(delta.y - prevDelta.y) < 0.01) - { - nextPts[ptidx] -= delta*0.35f; - break; - } - - prevDelta = delta; - - } - - } - } - - - const Mat* prevImg; - const Mat* nextImg; - const Mat* prevDeriv; - const Mat* rgbPrevImg; - const Mat* rgbNextImg; - const Point2f* prevPts; - Point2f* nextPts; - uchar* status; - float* err; - int maxWinSize; - int minWinSize; - TermCriteria criteria; - int level; - int maxLevel; - int windowType; - float minEigThreshold; - bool useInitialFlow; - int crossSegmentationThreshold; - }; + A11 *= FLT_SCALE; // 54866744. 
+ A12 *= FLT_SCALE; // -628764.00 + A22 *= FLT_SCALE; // 19730.000 + + D = - A12*A12*sumI*sumI + dI*sumW*A12*A12 + 2*A12*sumI*sumIx*w2 + 2*A12*sumI*sumIy*w1 + - 2*dI*A12*sumIx*sumIy - 2*sumW*A12*w1*w2 + A11*A22*sumI*sumI - 2*A22*sumI*sumIx*w1 + - 2*A11*sumI*sumIy*w2 - sumIx*sumIx*w2*w2 + A22*dI*sumIx*sumIx + 2*sumIx*sumIy*w1*w2 + - sumIy*sumIy*w1*w1 + A11*dI*sumIy*sumIy + A22*sumW*w1*w1 + A11*sumW*w2*w2 - A11*A22*dI*sumW; + + float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + + 4.f*A12*A12))/(2*winArea); + if( minEig < minEigThreshold || std::abs(D) < FLT_EPSILON ) + { + if( level == 0 && status ) + status[ptidx] = 0; + if( level > 0) + { + nextPts[ptidx] = backUpNextPt; + gainVecs[ptidx] = backUpGain; + } + noIteration++; + break; + } + + + D = (1.f / D); + + invTensorMat.at(0,0) = (A22*sumI*sumI - 2*sumI*sumIy*w2 + dI*sumIy*sumIy + sumW*w2*w2 - A22*dI*sumW)* D; + invTensorMat.at(0,1) = (A12*dI*sumW - A12*sumI * sumI - dI*sumIx*sumIy + sumI*sumIx*w2 + sumI*sumIy*w1 - sumW*w1*w2)* D; + invTensorMat.at(0,2) = (A12*sumI*sumIy - sumIy*sumIy*w1 - A22*sumI*sumIx - A12*sumW*w2 + A22*sumW*w1 + sumIx*sumIy*w2)* D; + invTensorMat.at(0,3) = (A22*dI*sumIx - A12*dI*sumIy - sumIx*w2*w2 + A12*sumI*w2 - A22*sumI*w1 + sumIy*w1*w2) * D; + invTensorMat.at(1,0) = invTensorMat.at(0,1); + invTensorMat.at(1,1) = (A11*sumI * sumI - 2*sumI*sumIx*w1 + dI*sumIx * sumIx + sumW*w1*w1 - A11*dI*sumW) * D; + invTensorMat.at(1,2) = (A12*sumI*sumIx - A11*sumI*sumIy - sumIx * sumIx*w2 + A11*sumW*w2 - A12*sumW*w1 + sumIx*sumIy*w1) * D; + invTensorMat.at(1,3) = (A11*dI*sumIy - sumIy*w1*w1 - A12*dI*sumIx - A11*sumI*w2 + A12*sumI*w1 + sumIx*w1*w2)* D; + invTensorMat.at(2,0) = invTensorMat.at(0,2); + invTensorMat.at(2,1) = invTensorMat.at(1,2); + invTensorMat.at(2,2) = (sumW*A12*A12 - 2*A12*sumIx*sumIy + A22*sumIx*sumIx + A11*sumIy*sumIy - A11*A22*sumW)* D; + invTensorMat.at(2,3) = (A11*A22*sumI - A12*A12*sumI - A11*sumIy*w2 + A12*sumIx*w2 + A12*sumIy*w1 - A22*sumIx*w1)* D; + 
invTensorMat.at(3,0) = invTensorMat.at(0,3); + invTensorMat.at(3,1) = invTensorMat.at(1,3); + invTensorMat.at(3,2) = invTensorMat.at(2,3); + invTensorMat.at(3,3) = (dI*A12*A12 - 2*A12*w1*w2 + A22*w1*w1 + A11*w2*w2 - A11*A22*dI)* D; + } + + + #ifdef CV_SSE4_1 + float CV_DECL_ALIGNED(16) bbuf[4]; + for(int mmi = 0; mmi < 4; mmi++) + { + + _mm_store_ps(bbuf, _mm_add_ps(qb0[mmi], qb1[mmi])); + _b0[mmi] = bbuf[0] + bbuf[2]; + _b1[mmi] = bbuf[1] + bbuf[3]; + _mm_store_ps(bbuf, qb2[mmi]); + _b2[mmi] = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; + _mm_store_ps(bbuf, qb3[mmi]); + _b3[mmi] = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; + + } + #endif + + _b0[0] *= FLT_SCALE;_b0[1] *= FLT_SCALE;_b0[2] *= FLT_SCALE;_b0[3] *= FLT_SCALE; + _b1[0] *= FLT_SCALE;_b1[1] *= FLT_SCALE;_b1[2] *= FLT_SCALE;_b1[3] *= FLT_SCALE; + _b2[0] *= FLT_SCALE;_b2[1] *= FLT_SCALE;_b2[2] *= FLT_SCALE;_b2[3] *= FLT_SCALE; + _b3[0] *= FLT_SCALE;_b3[1] *= FLT_SCALE;_b3[2] *= FLT_SCALE;_b3[3] *= FLT_SCALE; + + + Mc0[0] = _b0[0] - _b0[1] - _b0[2] + _b0[3]; + Mc0[1] = _b1[0] - _b1[1] - _b1[2] + _b1[3]; + Mc0[2] = -(_b2[0] - _b2[1] - _b2[2] + _b2[3]); + Mc0[3] = -(_b3[0] - _b3[1] - _b3[2] + _b3[3]); + + Mc1[0] = _b0[1] - _b0[3]; + Mc1[1] = _b1[1] - _b1[3]; + Mc1[2] = -(_b2[1] - _b2[3]); + Mc1[3] = -(_b3[1] - _b3[3]); + + + Mc2[0] = _b0[2] - _b0[3]; + Mc2[1] = _b1[2] - _b1[3]; + Mc2[2] = -(_b2[2] - _b2[3]); + Mc2[3] = -(_b3[2] - _b3[3]); + + + Mc3[0] = _b0[3]; + Mc3[1] = _b1[3]; + Mc3[2] = -_b2[3]; + Mc3[3] = -_b3[3]; + + // + c[0] = -Mc0[0]; + c[1] = -Mc1[0]; + c[2] = -Mc2[0]; + c[3] = -Mc3[0]; + c[4] = -Mc0[1]; + c[5] = -Mc1[1]; + c[6] = -Mc2[1]; + c[7] = -Mc3[1]; + + float e0 = 1.f / (c[6] * c[0] - c[4] * c[2]); + float e1 = e0 * 0.5f * (c[6] * c[1] + c[7] * c[0] - c[5] * c[2] - c[4] * c[3]); + float e2 = e0 * (c[1] * c[7] -c[3] * c[5]); + e0 = e1 * e1 - e2; + hasSolved = false; + if ( e0 > 0) + { + e0 = sqrt(e0); + float _y[2] = {-e1 - e0, e0 - e1}; + float c0yc1[2] = {c[0] * _y[0] + c[1], + c[0] * _y[1] + c[1]}; 
+ float _x[2] = {-(c[2] * _y[0] + c[3]) / c0yc1[0], + -(c[2] * _y[1] + c[3]) / c0yc1[1]}; + bool isIn1 = (_x[0] >=0 && _x[0] <=1 && _y[0] >= 0 && _y[0] <= 1); + bool isIn2 = (_x[1] >=0 && _x[1] <=1 && _y[1] >= 0 && _y[1] <= 1); + + bool isSolution1 = checkSolution(_x[0], _y[0], c ); + bool isSolution2 = checkSolution(_x[1], _y[1], c ); + bool isSink1 = isIn1 && isSolution1; + bool isSink2 = isIn2 && isSolution2; + + if ( isSink1 != isSink2) + { + a = isSink1 ? _x[0] : _x[1]; + b = isSink1 ? _y[0] : _y[1]; + ab = a * b; + hasSolved = true; + delta.x = inextPt.x + a - nextPt.x; + delta.y = inextPt.y + b - nextPt.y; + + cv::Vec4f mismatchVec = ab * Mc0 + Mc1 *a + Mc2 * b + Mc3; + deltaGain = est_DeltaGain(invTensorMat, mismatchVec); + + } // isIn1 != isIn2 + } + if( hasSolved == false) + noIteration++; + } + else + { + hasSolved = false; + noReusedIteration++; + } + if( hasSolved == false ) + { + + cv::Vec4f mismatchVec = ab * Mc0 + Mc1 *a + Mc2 * b + Mc3; + est_Result(invTensorMat, mismatchVec, delta, deltaGain); + + delta.x = MAX(-1.f, MIN( 1.f , delta.x)); + delta.y = MAX(-1.f, MIN( 1.f , delta.y)); + + + if( j == 0) + prevGain = deltaGain; + gainVec += deltaGain; + nextPt += delta ; + nextPts[ptidx] = nextPt + halfWin; + gainVecs[ptidx]= gainVec; + + } + else + { + nextPt += delta; + nextPts[ptidx] = nextPt + halfWin; + gainVecs[ptidx]= gainVec + deltaGain; + noSolvedIteration++; + break; + } + + if(j > 0 && ( + (std::abs(delta.x - prevDelta.x) < 0.01 && std::abs(delta.y - prevDelta.y) < 0.01) + || ((delta.ddot(delta) <= 0.001) && std::abs(prevGain.x - deltaGain.x) < 0.01))) + { + nextPts[ptidx] -= delta*0.5f; + gainVecs[ptidx] -= deltaGain* 0.5f; + break; + } + + + prevDelta = delta; + prevGain = deltaGain; + } + + } + + } + + const Mat* prevImg; + const Mat* nextImg; + const Mat* prevDeriv; + const Mat* rgbPrevImg; + const Mat* rgbNextImg; + const Point2f* prevPts; + Point2f* nextPts; + uchar* status; + cv::Point2f* gainVecs; // gain vector x -> multiplier y -> 
offset + float* err; + int maxWinSize; + int minWinSize; + TermCriteria criteria; + int level; + int maxLevel; + int windowType; + float minEigThreshold; + bool useInitialFlow; + std::vector paramReset; + int crossSegmentationThreshold; + + }; + } +} +namespace beplk +{ + namespace ica{ + class TrackerInvoker : public cv::ParallelLoopBody + { + public: + TrackerInvoker(const Mat& _prevImg, + const Mat& _prevDeriv, + const Mat& _nextImg, + const Mat& _rgbPrevImg, + const Mat& _rgbNextImg, + const Point2f* _prevPts, + Point2f* _nextPts, + uchar* _status, + float* _err, + int _level, + int _maxLevel, + int _winSize[2], + int _maxIteration, + bool _useInitialFlow, + int _supportRegionType, + int _crossSegmentationThreshold, + float _minEigenValue) + { + prevImg = &_prevImg; + prevDeriv = &_prevDeriv; + nextImg = &_nextImg; + rgbPrevImg = &_rgbPrevImg; + rgbNextImg = &_rgbNextImg; + prevPts = _prevPts; + nextPts = _nextPts; + status = _status; + err = _err; + minWinSize = _winSize[0]; + maxWinSize = _winSize[1]; + criteria.maxCount = _maxIteration; + criteria.epsilon = 0.01; + level = _level; + maxLevel = _maxLevel; + windowType = _supportRegionType; + minEigThreshold = _minEigenValue; + useInitialFlow = _useInitialFlow; + crossSegmentationThreshold = _crossSegmentationThreshold; + } + + TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; + + void operator()(const cv::Range& range) const + { + cv::Size winSize; + cv::Point2f halfWin; + + const Mat& I = *prevImg; + const Mat& J = *nextImg; + const Mat& derivI = *prevDeriv; + const Mat& BI = *rgbPrevImg; + + winSize = cv::Size(maxWinSize,maxWinSize); + int winMaskwidth = static_cast(ceil(winSize.width / 8.f)) * 16; + cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); + winMaskMatBuf.setTo(1); + + const float FLT_SCALE = (1.f/(1 << 20)); // 20 + + int j, cn = I.channels(), cn2 = cn*2; + int winbufwidth = static_cast(ceil(winSize.width / 8.f)) * 8; + cv::Size winBufSize(winbufwidth,winbufwidth); 
+ + std::vector _buf(winBufSize.area()*(cn + cn2)); + Mat IWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn), &_buf[0]); + Mat derivIWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn2), &_buf[winBufSize.area()*cn]); + + for( int ptidx = range.start; ptidx < range.end; ptidx++ ) + { + Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level)); + Point2f nextPt; + if( level == maxLevel ) + { + if( useInitialFlow ) + nextPt = nextPts[ptidx]*(float)(1./(1 << level)); + else + nextPt = prevPt; + } + else + nextPt = nextPts[ptidx]*2.f; + nextPts[ptidx] = nextPt; + + Point2i iprevPt, inextPt; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + int winArea = maxWinSize * maxWinSize; + cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0,0, maxWinSize,maxWinSize)); + + if( calcWinMaskMat(BI, windowType, iprevPt, + winMaskMat,winSize,halfWin,winArea, + minWinSize,maxWinSize, crossSegmentationThreshold) == false) + continue; + + prevPt -= halfWin; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + + if( iprevPt.x + halfWin.x < 0 || iprevPt.x + halfWin.x>= derivI.cols || + iprevPt.y + halfWin.y < 0 || iprevPt.y + halfWin.y >= derivI.rows ) + { + if( level == 0 ) + { + if( status ) + status[ptidx] = 3; + if( err ) + err[ptidx] = 0; + } + continue; + } + + float a = prevPt.x - iprevPt.x; + float b = prevPt.y - iprevPt.y; + const int W_BITS = 14, W_BITS1 = 14; + + int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + + int dstep = (int)(derivI.step/derivI.elemSize1()); + int step = (int)(I.step/I.elemSize1()); + int mStep = (int)(winMaskMat.step/winMaskMat.elemSize1()); + CV_Assert( step == (int)(J.step/J.elemSize1()) ); + float A11 = 0, A12 = 0, A22 = 0; + + #ifdef CV_SSE4_1 + __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128i z = _mm_setzero_si128(); + __m128i 
qdelta_d = _mm_set1_epi32(1 << (W_BITS1-1)); + __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1-5-1)); + __m128 qA11 = _mm_setzero_ps(), qA12 = _mm_setzero_ps(), qA22 = _mm_setzero_ps(); + __m128i mmMask4_epi32; + get4BitMask(winSize.width, mmMask4_epi32); + #endif + + + int x, y; + for( y = 0; y < winSize.height; y++ ) + { + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; + const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; + + short* Iptr = (short*)(IWinBuf.data + y*IWinBuf.step); + short* dIptr = (short*)(derivIWinBuf.data + y*derivIWinBuf.step); + + x = 0; + + #ifdef CV_SSE4_1 + for( ; x < winSize.width*cn; x += 4, dsrc += 4*2, dIptr += 4*2 ) + { + __m128i wMask = _mm_set_epi32(MaskSet * maskPtr[x+3], + MaskSet * maskPtr[x+2], + MaskSet * maskPtr[x+1], + MaskSet * maskPtr[x]); + __m128i v00, v01, v10, v11, t0, t1; + v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); + v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); + v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); + v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); + _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0,t0)); + + v00 = _mm_loadu_si128((const __m128i*)(dsrc)); + v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); + v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep)); + v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + 
_mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); + v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... + + if( x + 4 > winSize.width) + { + v00 = _mm_and_si128(v00, mmMask4_epi32); + } + v00 = _mm_and_si128(v00, wMask); + + _mm_storeu_si128((__m128i*)dIptr, v00); + t0 = _mm_srai_epi32(v00, 16); // Iy0 Iy1 Iy2 Iy3 + t1 = _mm_srai_epi32(_mm_slli_epi32(v00, 16), 16); // Ix0 Ix1 Ix2 Ix3 + + __m128 fy = _mm_cvtepi32_ps(t0); + __m128 fx = _mm_cvtepi32_ps(t1); + + qA22 = _mm_add_ps(qA22, _mm_mul_ps(fy, fy)); + qA12 = _mm_add_ps(qA12, _mm_mul_ps(fx, fy)); + qA11 = _mm_add_ps(qA11, _mm_mul_ps(fx, fx)); + } + #else + + for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2) + { + if( maskPtr[x] == 0) + { + dIptr[0] = (short)0; + dIptr[1] = (short)0; + + continue; + } + int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + + src[x+step]*iw10 + src[x+step+cn]*iw11, W_BITS1-5); + int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + + dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 + + dsrc[dstep+cn2+1]*iw11, W_BITS1); + + Iptr[x] = (short)ival; + dIptr[0] = (short)ixval; + dIptr[1] = (short)iyval; + A11 += (float)(ixval*ixval); + A12 += (float)(ixval*iyval); + A22 += (float)(iyval*iyval); + + } + + + #endif + + } + + #ifdef CV_SSE4_1 + float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4]; + _mm_store_ps(A11buf, qA11); + _mm_store_ps(A12buf, qA12); + _mm_store_ps(A22buf, qA22); + A11 += A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; + A12 += A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; + A22 += A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; + #endif + + A11 *= FLT_SCALE; + A12 *= FLT_SCALE; + A22 *= FLT_SCALE; + + + float D = A11*A22 - A12*A12; + float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + + 4.f*A12*A12))/(2 * winArea); + + if( err ) + err[ptidx] = 
(float)minEig; + + if( D < FLT_EPSILON ) + { + if( level == 0 && status ) + status[ptidx] = 0; + continue; + } + + D = 1.f/D; + + cv::Point2f backUpNextPt = nextPt; + nextPt -= halfWin; + Point2f prevDelta(0,0); + + float c[8]; + __m128i mmMask0, mmMask1, mmMask; + getWBitMask(winSize.width, mmMask0, mmMask1, mmMask); + for( j = 0; j < criteria.maxCount; j++ ) + { + cv::Point2f delta; + bool hasSolved = false; + a = nextPt.x - cvFloor(nextPt.x); + b = nextPt.y - cvFloor(nextPt.y); + float ab = a * b; + + + if( (inextPt.x != cvFloor(nextPt.x) || inextPt.y != cvFloor(nextPt.y) || j == 0)) + { + inextPt.x = cvFloor(nextPt.x); + inextPt.y = cvFloor(nextPt.y); + + + if( inextPt.x < -winSize.width || inextPt.x >= J.cols || + inextPt.y < -winSize.height || inextPt.y >= J.rows ) + { + if( level == 0 && status ) + status[ptidx] = 3; + if (level > 0) + nextPts[ptidx] = backUpNextPt; + break; + } + + + iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + + float _b1[4] = {0,0,0,0}; + float _b2[4] = {0,0,0,0}; + #ifdef CV_SSE4_1 + __m128 qbc0[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; + __m128 qbc1[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; + qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + #endif + for( y = 0; y < winSize.height; y++ ) + { + const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x )*cn; + const short* Iptr = (const short*)(IWinBuf.data + ( y)*IWinBuf.step); + const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); + x = 0; + #ifdef CV_SSE4_1 + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + for( ; x <= winSize.width*cn; x += 8, dIptr += 8*2 ) + { + if( maskPtr[x ] == 0 && maskPtr[x+1] == 0 && maskPtr[x+2] == 0 && maskPtr[x+3] == 0 + && maskPtr[x+4] == 
0 && maskPtr[x+5] == 0 && maskPtr[x+6] == 0 && maskPtr[x+7] == 0) + continue; + __m128i diff0 = _mm_loadu_si128((const __m128i*)(Iptr + x)); + __m128i v00 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x)), z); + __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); + __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); + __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); + + __m128i mmDiffc_epi16[4] = + { _mm_subs_epi16(_mm_slli_epi16(v00, 5), diff0), + _mm_subs_epi16(_mm_slli_epi16(v01, 5), diff0), + _mm_subs_epi16(_mm_slli_epi16(v10, 5), diff0), + _mm_subs_epi16(_mm_slli_epi16(v11, 5), diff0) + }; + + __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... + __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); + + if( x > winSize.width*cn - 8) + { + Ixy_0 = _mm_and_si128(Ixy_0, mmMask0); + Ixy_1 = _mm_and_si128(Ixy_1, mmMask1); + } + + __m128i diffc1[4] = + {_mm_unpackhi_epi16(mmDiffc_epi16[0],mmDiffc_epi16[0]), + _mm_unpackhi_epi16(mmDiffc_epi16[1],mmDiffc_epi16[1]), + _mm_unpackhi_epi16(mmDiffc_epi16[2],mmDiffc_epi16[2]), + _mm_unpackhi_epi16(mmDiffc_epi16[3],mmDiffc_epi16[3]) + }; + + __m128i diffc0[4] = + {_mm_unpacklo_epi16(mmDiffc_epi16[0],mmDiffc_epi16[0]), + _mm_unpacklo_epi16(mmDiffc_epi16[1],mmDiffc_epi16[1]), + _mm_unpacklo_epi16(mmDiffc_epi16[2],mmDiffc_epi16[2]), + _mm_unpacklo_epi16(mmDiffc_epi16[3],mmDiffc_epi16[3]) + }; + + + // It * Ix It * Iy [0 ... 
3] + //mask split for multiplication + // for set 1 lo sigma 1 + + + v10 = _mm_mullo_epi16(Ixy_0, diffc0[0]); + v11 = _mm_mulhi_epi16(Ixy_0, diffc0[0]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[0] = _mm_add_ps(qbc0[0], _mm_cvtepi32_ps(v00)); + qbc1[0] = _mm_add_ps(qbc1[0], _mm_cvtepi32_ps(v10)); + + v10 = _mm_mullo_epi16(Ixy_0, diffc0[1]); + v11 = _mm_mulhi_epi16(Ixy_0, diffc0[1]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[1] = _mm_add_ps(qbc0[1], _mm_cvtepi32_ps(v00)); + qbc1[1] = _mm_add_ps(qbc1[1], _mm_cvtepi32_ps(v10)); + + v10 = _mm_mullo_epi16(Ixy_0, diffc0[2]); + v11 = _mm_mulhi_epi16(Ixy_0, diffc0[2]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[2] = _mm_add_ps(qbc0[2], _mm_cvtepi32_ps(v00)); + qbc1[2] = _mm_add_ps(qbc1[2], _mm_cvtepi32_ps(v10)); + + v10 = _mm_mullo_epi16(Ixy_0, diffc0[3]); + v11 = _mm_mulhi_epi16(Ixy_0, diffc0[3]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[3] = _mm_add_ps(qbc0[3], _mm_cvtepi32_ps(v00)); + qbc1[3] = _mm_add_ps(qbc1[3], _mm_cvtepi32_ps(v10)); + // It * Ix It * Iy [4 ... 
7] + // for set 1 hi sigma 1 + + v10 = _mm_mullo_epi16(Ixy_1, diffc1[0]); + v11 = _mm_mulhi_epi16(Ixy_1, diffc1[0]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[0] = _mm_add_ps(qbc0[0], _mm_cvtepi32_ps(v00)); + qbc1[0] = _mm_add_ps(qbc1[0], _mm_cvtepi32_ps(v10)); + + v10 = _mm_mullo_epi16(Ixy_1, diffc1[1]); + v11 = _mm_mulhi_epi16(Ixy_1, diffc1[1]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[1] = _mm_add_ps(qbc0[1], _mm_cvtepi32_ps(v00)); + qbc1[1] = _mm_add_ps(qbc1[1], _mm_cvtepi32_ps(v10)); + + v10 = _mm_mullo_epi16(Ixy_1, diffc1[2]); + v11 = _mm_mulhi_epi16(Ixy_1, diffc1[2]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[2] = _mm_add_ps(qbc0[2], _mm_cvtepi32_ps(v00)); + qbc1[2] = _mm_add_ps(qbc1[2], _mm_cvtepi32_ps(v10)); + + v10 = _mm_mullo_epi16(Ixy_1, diffc1[3]); + v11 = _mm_mulhi_epi16(Ixy_1, diffc1[3]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[3] = _mm_add_ps(qbc0[3], _mm_cvtepi32_ps(v00)); + qbc1[3] = _mm_add_ps(qbc1[3], _mm_cvtepi32_ps(v10)); + + } + #else + for( ; x < winSize.width*cn; x++, dIptr += 2 ) + { + if( dIptr[0] == 0 && dIptr[1] == 0) + continue; + /*short It[4] = {CV_DESCALE((Jptr[x]) *(1 << W_BITS),W_BITS1-5) - Iptr[x], + CV_DESCALE((Jptr[x+cn]) *(1 << W_BITS),W_BITS1-5) - Iptr[x], + CV_DESCALE((Jptr[x+step]) *(1 << W_BITS),W_BITS1-5) - Iptr[x], + CV_DESCALE((Jptr[x+step+cn])*(1 << W_BITS),W_BITS1-5) - Iptr[x]};*/ + short It[4] = {(Jptr[x] << 5) - Iptr[x], + (Jptr[x+cn]<< 5) - Iptr[x], + (Jptr[x+step]<< 5) - Iptr[x], + (Jptr[x+step+cn]<< 5) - Iptr[x]}; + + _b1[0] += (float)(It[0]*dIptr[0]); + _b1[1] += (float)(It[1]*dIptr[0]); + _b1[2] += (float)(It[2]*dIptr[0]); + _b1[3] += (float)(It[3]*dIptr[0]); + + _b2[0] += (float)(It[0]*dIptr[1]); + _b2[1] += (float)(It[1]*dIptr[1]); + _b2[2] += (float)(It[2]*dIptr[1]); + _b2[3] += (float)(It[3]*dIptr[1]); + } + #endif + + } + + #ifdef CV_SSE4_1 + 
float CV_DECL_ALIGNED(16) bbuf[4]; + _mm_store_ps(bbuf, _mm_add_ps(qbc0[0], qbc1[0])); + _b1[0] += bbuf[0] + bbuf[2]; + _b2[0] += bbuf[1] + bbuf[3]; + + _mm_store_ps(bbuf, _mm_add_ps(qbc0[1], qbc1[1])); + _b1[1] += bbuf[0] + bbuf[2]; + _b2[1] += bbuf[1] + bbuf[3]; + + _mm_store_ps(bbuf, _mm_add_ps(qbc0[2], qbc1[2])); + _b1[2] += bbuf[0] + bbuf[2]; + _b2[2] += bbuf[1] + bbuf[3]; + + _mm_store_ps(bbuf, _mm_add_ps(qbc0[3], qbc1[3])); + _b1[3] += bbuf[0] + bbuf[2]; + _b2[3] += bbuf[1] + bbuf[3]; + + #endif + _b1[0] *= FLT_SCALE; + _b1[1] *= FLT_SCALE; + _b1[2] *= FLT_SCALE; + _b1[3] *= FLT_SCALE; + _b2[0] *= FLT_SCALE; + _b2[1] *= FLT_SCALE; + _b2[2] *= FLT_SCALE; + _b2[3] *= FLT_SCALE; + + float c0[4] = { _b1[3] + _b1[0] - _b1[2] - _b1[1], + _b1[1] - _b1[0], + _b1[2] - _b1[0], + _b1[0]}; + float c1[4] = { _b2[3] + _b2[0] - _b2[2] - _b2[1], + _b2[1] - _b2[0], + _b2[2] - _b2[0], + _b2[0]}; + + float DA12 = A12 * D ; + float DA22 = A22 * D ; + float DA11 = A11 * D ; + c[0] = DA12 * c1[0] - DA22 * c0[0]; + c[1] = DA12 * c1[1] - DA22 * c0[1]; + c[2] = DA12 * c1[2] - DA22 * c0[2]; + c[3] = DA12 * c1[3] - DA22 * c0[3]; + c[4] = DA12 * c0[0] - DA11 * c1[0]; + c[5] = DA12 * c0[1] - DA11 * c1[1]; + c[6] = DA12 * c0[2] - DA11 * c1[2]; + c[7] = DA12 * c0[3] - DA11 * c1[3]; + + float e0 = 1.f / (c[6] * c[0] - c[4] * c[2]); + float e1 = e0 * 0.5f * (c[6] * c[1] + c[7] * c[0] - c[5] * c[2] - c[4] * c[3]); + float e2 = e0 * (c[1] * c[7] -c[3] * c[5]); + e0 = e1 * e1 - e2; + if ( e0 >= 0) + { + e0 = sqrt(e0); + float _y[2] = {-e1 - e0, e0 - e1}; + float c0yc1[2] = {c[0] * _y[0] + c[1], + c[0] * _y[1] + c[1]}; + + float _x[2] = {-(c[2] * _y[0] + c[3]) / c0yc1[0], + -(c[2] * _y[1] + c[3]) / c0yc1[1]}; + + bool isIn1 = (_x[0] >=0 && _x[0] <=1 && _y[0] >= 0 && _y[0] <= 1); + bool isIn2 = (_x[1] >=0 && _x[1] <=1 && _y[1] >= 0 && _y[1] <= 1); + + + bool isSink1 = isIn1 && checkSolution(_x[0], _y[0], c ); + bool isSink2 = isIn2 && checkSolution(_x[1], _y[1], c ); + + + //if( isSink1 != 
isSink2 ) + { + if( isSink1 ) + { + hasSolved = true; + delta.x = inextPt.x + _x[0] - nextPt.x; + delta.y = inextPt.y + _y[0] - nextPt.y; + } + if (isSink2 ) + { + hasSolved = true; + delta.x = inextPt.x + _x[1] - nextPt.x; + delta.y = inextPt.y + _y[1] - nextPt.y; + } + } // isIn1 != isIn2 + } // e0 >= 0 + } + else + hasSolved = false; + if(hasSolved == false) + { + delta = Point2f( c[0] * ab + c[1] * a + c[2] * b + c[3], + c[4] * ab + c[5] * a + c[6] * b + c[7]); + nextPt += 0.7 * delta; + nextPts[ptidx] = nextPt + halfWin; + } + else + { + nextPt += delta; + nextPts[ptidx] = nextPt + halfWin; + break; + } + + + if( delta.ddot(delta) <= criteria.epsilon) + break; + + if(j > 0 && std::abs(delta.x - prevDelta.x) < 0.01 && + std::abs(delta.y - prevDelta.y) < 0.01) + { + nextPts[ptidx] -= delta*0.35f; + break; + } + + prevDelta = delta; + + } + + } + } + + + const Mat* prevImg; + const Mat* nextImg; + const Mat* prevDeriv; + const Mat* rgbPrevImg; + const Mat* rgbNextImg; + const Point2f* prevPts; + Point2f* nextPts; + uchar* status; + float* err; + int maxWinSize; + int minWinSize; + TermCriteria criteria; + int level; + int maxLevel; + int windowType; + float minEigThreshold; + bool useInitialFlow; + int crossSegmentationThreshold; + }; } } } diff --git a/modules/optflow/src/rlof/geo_interpolation.cpp b/modules/optflow/src/rlof/geo_interpolation.cpp index 47e92c96b0d..823070bf5e9 100644 --- a/modules/optflow/src/rlof/geo_interpolation.cpp +++ b/modules/optflow/src/rlof/geo_interpolation.cpp @@ -1,44 +1,6 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// - // - // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. - // - // By downloading, copying, installing or using the software you agree to this license. - // If you do not agree to this license, do not download, install, - // copy or use the software. 
- // - // - // License Agreement - // For Open Source Computer Vision Library - // - // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. - // Copyright (C) 2009, Willow Garage Inc., all rights reserved. - // Third party copyrights are property of their respective owners. - // - // Redistribution and use in source and binary forms, with or without modification, - // are permitted provided that the following conditions are met: - // - // * Redistribution's of source code must retain the above copyright notice, - // this list of conditions and the following disclaimer. - // - // * Redistribution's in binary form must reproduce the above copyright notice, - // this list of conditions and the following disclaimer in the documentation - // and/or other materials provided with the distribution. - // - // * The name of the copyright holders may not be used to endorse or promote products - // derived from this software without specific prior written permission. - // - // This software is provided by the copyright holders and contributors "as is" and - // any express or implied warranties, including, but not limited to, the implied - // warranties of merchantability and fitness for a particular purpose are disclaimed. - // In no event shall the Intel Corporation or contributors be liable for any direct, - // indirect, incidental, special, exemplary, or consequential damages - // (including, but not limited to, procurement of substitute goods or services; - // loss of use, data, or profits; or business interruption) however caused - // and on any theory of liability, whether in contract, strict liability, - // or tort (including negligence or otherwise) arising in any way out of - // the use of this software, even if advised of the possibility of such damage. - // - //M*/ +// This file is part of OpenCV project. 
+// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. // This functions have been contributed by Jonas Geisters #include "precomp.hpp" #include "geo_interpolation.hpp" @@ -46,480 +8,480 @@ #include namespace cv { - namespace optflow - { - - double K_h1(float x, double h2) - { - return exp(-(0.5 / (h2)) * x); - } - - struct Graph_helper { - int * mem; - int e_size; - Graph_helper(int k, int num_nodes) { - e_size = (2 * k + 1); - mem = (int*)malloc(sizeof(int) * e_size * num_nodes); - memset(mem, 0, sizeof(int) * e_size * num_nodes); - } - inline int size(int id) { - int r_addr = id * (e_size); - return ((int*)mem)[r_addr]; - } - inline int * data(int id) { - int r_addr = id * (e_size)+1; - return mem + r_addr; - } - inline void add(int id, std::pair data) { - int r_addr = id * (e_size); - int size = ++((int*)mem)[r_addr]; - r_addr += 2 * size - 1;//== 1 + 2*(size-1); - ((float*)mem)[r_addr] = data.first; - ((int*)mem)[r_addr + 1] = data.second; - } - inline bool color_in_target(int id, int color) { - int r_addr = id * (e_size); - int size = (((int*)mem)[r_addr]); - r_addr += 2; - for (int i = 0; i < size; i++) { - if (((int*)mem)[r_addr] == color) { - return true; - } - r_addr += 2; - } - return false; - } - - }; - - Mat sgeo_dist(const Mat& gra, int y, int x, float max, Mat &prev) - { - std::vector points; - points.push_back(Point2f(static_cast(x), static_cast(y))); - return sgeo_dist(gra, points, max, prev); - } - Mat sgeo_dist(const Mat& gra, const std::vector & points, float max, Mat &prev) - { - int Dx[] = { -1,0,1,-1,1,-1,0,1 }; - int Dy[] = { -1,-1,-1,0,0,1,1,1 }; - Mat dm(gra.rows, gra.cols, CV_32F, Scalar(max)); - prev = Mat(gra.rows, gra.cols, CV_8U, Scalar(255)); - - std::multimap not_visited_with_value; - - for (auto i = points.begin(); i != points.end(); i++) - { - int y = static_cast(i->y); - int x = static_cast(i->x); - 
not_visited_with_value.insert(std::pair(0.f, Vec2i(y, x))); - dm.at(y, x) = 0; - } - - bool done = false; - while (!done) - { - if (not_visited_with_value.begin() == not_visited_with_value.end()) { - done = true; - break; - } - std::multimap::iterator current_it = not_visited_with_value.begin(); - std::pair current_p = *current_it; - not_visited_with_value.erase(current_it); - - int y = current_p.second[0]; - int x = current_p.second[1]; - float cur_d = current_p.first; - - if (dm.at(y, x) != cur_d) { - continue; - } - - Vec8f gra_e = gra.at(y, x); - - for (int i = 0; i < 8; i++) { - if (gra_e[i] < 0) { - continue; - } - int dx = Dx[i]; - int dy = Dy[i]; - - if (dm.at(y + dy, x + dx) > cur_d + gra_e[i]) { - dm.at(y + dy, x + dx) = cur_d + gra_e[i]; - prev.at(y + dy, x + dx) = static_cast(7 - i); - not_visited_with_value.insert(std::pair(cur_d + gra_e[i], Vec2i(y + dy, x + dx))); - } - - } - } - - - - - return dm; - } - - Mat interpolate_irregular_nn_raster(const std::vector & prevPoints, - const std::vector & nextPoints, - const std::vector & status, - const Mat & i1) - { - Mat gra = getGraph(i1, 0.1f); - int Dx[] = { -1,0,1,-1,1,-1,0,1 }; - int Dy[] = { -1,-1,-1,0,0,1,1,1 }; - int max_rounds = 10; - Mat dirt = Mat(gra.rows, gra.cols, CV_8U, Scalar(0)); - Mat quellknoten = Mat(gra.rows, gra.cols, CV_32S, Scalar(-1)); - Mat dist = Mat(gra.rows, gra.cols, CV_32F, Scalar(std::numeric_limits::max())); - /* - * assign quellknoten ids. 
- */ - for (int i = 0; i < prevPoints.size(); i++) - { - int x = (int)prevPoints[i].x; - int y = (int)prevPoints[i].y; - if (status[i] == 0) - continue; - dirt.at(y, x) = 1; - dist.at(y, x) = 0; - quellknoten.at(y, x) = i; - } - - bool clean = true; - bool done = false; - int x = 0; - int y = 0; - int rounds = 0; - while (!done) { - /* - * Update x and y - * on even rounds go rasterscanorder , on odd round inverse rasterscanorder - */ - if (rounds % 2 == 0) { - x++; - if (x >= gra.cols) { - x = 0; - y++; - if (y >= gra.rows) { - y = 0; - rounds++; - y = gra.rows - 1; - x = gra.cols - 1; - if (rounds >= max_rounds || clean) { - done = true; - break; - } - } - } - } - else { - x--; - if (x < 0) { - x = gra.cols - 1; - y--; - if (y < 0) { - y = gra.rows - 1; - rounds++; - y = 0; - x = 0; - if (rounds >= max_rounds || clean) { - done = true; - break; - } - } - } - } - if (dirt.at(y, x) == 0) { - continue; - } - dirt.at(y, x) = 0; - - float c_dist = dist.at(y, x); - Vec8f gra_e = gra.at(y, x); - - for (int i = 0; i < 8; i++) { - int tx = Dx[i]; - int ty = Dy[i]; - if (ty == 0 && tx == 0) { - continue; - } - if (x + tx < 0 || x + tx >= gra.cols) { - continue; - } - if (y + ty < 0 || y + ty >= gra.rows) { - continue; - } - if (c_dist > dist.at(y + ty, x + tx)) { - if (c_dist > dist.at(y + ty, x + tx) + gra_e[i]) { - quellknoten.at(y, x) = quellknoten.at(y + ty, x + tx); - dist.at(y, x) = dist.at(y + ty, x + tx) + gra_e[i]; - dirt.at(y, x) = 1; - clean = false; - } - } - else { - if (c_dist + gra_e[i] < dist.at(y + ty, x + tx)) { - quellknoten.at(y + ty, x + tx) = quellknoten.at(y, x); - dist.at(y + ty, x + tx) = dist.at(y, x) + gra_e[i]; - dirt.at(y + ty, x + tx) = 1; - clean = false; - } - } - } - - - - } - - Mat nnFlow(i1.rows, i1.cols, CV_32FC2, Scalar(0)); - for (y = 0; y < i1.rows; y++) { - for (x = 0; x < i1.cols; x++) { - - int id = quellknoten.at(y, x); - if (id != -1) - { - nnFlow.at(y, x) = nextPoints[id] - prevPoints[id]; - } - } - } - return nnFlow; - } - - 
Mat interpolate_irregular_knn( - const std::vector & _prevPoints, - const std::vector & _nextPoints, - const std::vector & status, - const Mat &color_img, - int k, - float pixeldistance) - { - Mat in(color_img.rows, color_img.cols, CV_32FC2); - Mat mask = Mat::zeros(color_img.rows, color_img.cols, CV_8UC1); - - for (unsigned n = 0; n < _prevPoints.size(); n++) - { - if (_prevPoints[n].x >= 0 && _prevPoints[n].y >= 0 && _prevPoints[n].x < color_img.cols && _prevPoints[n].y < color_img.rows) - { - in.at(_prevPoints[n]) = _nextPoints[n] - _prevPoints[n]; - mask.at(_prevPoints[n]) = status[n]; - } - - } - int Dx[] = { -1,0,1,-1,1,-1,0,1 }; - int Dy[] = { -1,-1,-1,0,0,1,1,1 }; - Mat gra = getGraph(color_img, pixeldistance); - Mat nnFlow(in.rows, in.cols, CV_32FC2, Scalar(0)); - - std::multimap my_agents; // > - Graph_helper graph_helper(k, in.rows*in.cols); //< arrivaltime, color> - - - int color = 0; - std::vector flow_point_list; - for (int y = 0; y < in.rows; y++) { - for (int x = 0; x < in.cols; x++) { - if (mask.at(y, x) > 0) { - flow_point_list.push_back(Vec2i(y, x)); - nnFlow.at(y, x) = in.at(y, x); - - int v_id = (y * in.cols + x); - graph_helper.add(v_id, std::pair(0.f, color)); - - - Vec8f gra_e = gra.at(y, x); - for (int i = 0; i < 8; i++) { - if (gra_e[i] < 0) - continue; - int dx = Dx[i]; - int dy = Dy[i]; - int target = (y + dy) * in.cols + (x + dx); - Vec2i agent(target, color); - my_agents.insert(std::pair(gra_e[i], agent)); - - } - color++; - } - } - } - - int global_time = 0; - - bool done = false; - while (!done) { - if (my_agents.size() == 0) { - done = true; - break; - } - global_time++; - - std::multimap::iterator current_it = my_agents.begin(); - std::pair current_p = *current_it; - my_agents.erase(current_it); - - int target = current_p.second[0]; - color = current_p.second[1]; - float arriv_time = current_p.first; - - Vec8f gra_e = gra.at(target);// (y*cols+x) - if (graph_helper.size(target) >= k) { - continue; - } - - bool color_found_in_target 
= graph_helper.color_in_target(target, color); - if (color_found_in_target) { - continue; - } - graph_helper.add(target, std::pair(arriv_time, color)); - - - for (int i = 0; i < 8; i++) { - if (gra_e[i] < 0) - continue; - - int dx = Dx[i]; - int dy = Dy[i]; - int new_target = target + dx + (dy*in.cols); - if (graph_helper.size(new_target) >= k) { - continue; - } - color_found_in_target = graph_helper.color_in_target(new_target, color); - if (color_found_in_target) { - continue; - } - Vec2i new_agent(new_target, color); - my_agents.insert(std::pair(arriv_time + gra_e[i], new_agent)); - - } - } - - Mat ret(in.rows, in.cols*k, CV_32FC2); - for (int y = 0; y < in.rows; y++) { - for (int x = 0; x < in.cols; x++) { - for (int i = 0; i < k; i++) { - float dist = *((float*)(graph_helper.data(y*in.cols + x) + 2 * i)); - float id = *((float*)(graph_helper.data(y*in.cols + x) + 2 * i + 1)); - ret.at(y, k*x + i) = Vec2f(dist, id); - } - } - } - return ret; - - for (int y = 0; y < in.rows; y++) { - for (int x = 0; x < in.cols; x++) { - int v_id = y * in.cols + x; - std::vector > flow_s(k); - - for (int i = 0; i < k; i++) { - color = *((int*)(graph_helper.data(v_id) + 1 + 2 * i)); - int cy = flow_point_list[color][0]; - int cx = flow_point_list[color][1]; - Vec2f flow = in.at(cy, cx); - flow_s[i] = std::make_pair(static_cast(norm(flow)), flow); - - - - } - - nnFlow.at(y, x) = std::min_element(flow_s.begin(), flow_s.end(), [](const std::pair &left, const std::pair &right) { - return left.first < right.first; - })->second; - - - } - } - free(graph_helper.mem); - return nnFlow; - - - } - Mat getGraph(const Mat &image, float edge_length) - { - - int Dx[] = { -1,0,1,-1,1,-1,0,1 }; - int Dy[] = { -1,-1,-1,0,0,1,1,1 }; - Mat gra(image.rows, image.cols, CV_32FC(8)); - - for (int y = 0; y < gra.rows; y++) { - for (int x = 0; x < gra.cols; x++) { - - for (int i = 0; i < 8; i++) { - int dx = Dx[i]; - int dy = Dy[i]; - gra.at(y, x)[i] = -1; - - if (x + dx < 0 || y + dy < 0 || x + dx >= 
gra.cols || y + dy >= gra.rows) { - continue; - } - - if (i < 4) { - int si = 7 - i; - gra.at(y, x)[i] = gra.at(y + dy, x + dx)[si]; - } - else { - float p1 = dx * dx*edge_length*edge_length + dy * dy*edge_length*edge_length; - float p2 = static_cast(image.at(y, x)[0] - image.at(y + dy, x + dx)[0]); - float p3 = static_cast(image.at(y, x)[1] - image.at(y + dy, x + dx)[1]); - float p4 = static_cast(image.at(y, x)[2] - image.at(y + dy, x + dx)[2]); - gra.at(y, x)[i] = sqrt(p1 + p2 * p2 + p3 * p3 + p4 * p4); - } - - } - - } - } - - return gra; - } - - - Mat interpolate_irregular_nn( - const std::vector & _prevPoints, - const std::vector & _nextPoints, - const std::vector & status, - const Mat &color_img, - float pixeldistance) - { - int Dx[] = { -1,0,1,-1,1,-1,0,1 }; - int Dy[] = { -1,-1,-1,0,0,1,1,1 }; - std::vector prevPoints, nextPoints; - std::map, std::pair> flowMap; - for (unsigned n = 0; n < _prevPoints.size(); n++) - { - if (status[n] != 0) - { - flowMap.insert(std::make_pair( - std::make_pair(_prevPoints[n].x, _prevPoints[n].y), - std::make_pair(_nextPoints[n].x, _nextPoints[n].y))); - prevPoints.push_back(_prevPoints[n]); - nextPoints.push_back(_nextPoints[n]); - } - - } - - Mat gra = getGraph(color_img, pixeldistance); - - Mat prev; - Mat geo_dist = sgeo_dist(gra, prevPoints, std::numeric_limits::max(), prev); - - - Mat nnFlow = Mat::zeros(color_img.size(), CV_32FC2); - for (int y = 0; y < nnFlow.rows; y++) - { - for (int x = 0; x < nnFlow.cols; x++) - { - int cx = x; - int cy = y; - while (prev.at(cy, cx) != 255) - { - int i = prev.at(cy, cx); - cx += Dx[i]; - cy += Dy[i]; - } - auto val = flowMap[std::make_pair(static_cast(cx), static_cast(cy))]; - nnFlow.at(y, x) = Vec2f(val.first - cx, val.second - cy); - } - } - return nnFlow; - } - - } +namespace optflow +{ + + double K_h1(float x, double h2) + { + return exp(-(0.5 / (h2)) * x); + } + + struct Graph_helper { + int * mem; + int e_size; + Graph_helper(int k, int num_nodes) { + e_size = (2 * k + 1); + 
mem = (int*)malloc(sizeof(int) * e_size * num_nodes); + memset(mem, 0, sizeof(int) * e_size * num_nodes); + } + inline int size(int id) { + int r_addr = id * (e_size); + return ((int*)mem)[r_addr]; + } + inline int * data(int id) { + int r_addr = id * (e_size)+1; + return mem + r_addr; + } + inline void add(int id, std::pair data) { + int r_addr = id * (e_size); + int size = ++((int*)mem)[r_addr]; + r_addr += 2 * size - 1;//== 1 + 2*(size-1); + ((float*)mem)[r_addr] = data.first; + ((int*)mem)[r_addr + 1] = data.second; + } + inline bool color_in_target(int id, int color) { + int r_addr = id * (e_size); + int size = (((int*)mem)[r_addr]); + r_addr += 2; + for (int i = 0; i < size; i++) { + if (((int*)mem)[r_addr] == color) { + return true; + } + r_addr += 2; + } + return false; + } + + }; + + Mat sgeo_dist(const Mat& gra, int y, int x, float max, Mat &prev) + { + std::vector points; + points.push_back(Point2f(static_cast(x), static_cast(y))); + return sgeo_dist(gra, points, max, prev); + } + Mat sgeo_dist(const Mat& gra, const std::vector & points, float max, Mat &prev) + { + int Dx[] = { -1,0,1,-1,1,-1,0,1 }; + int Dy[] = { -1,-1,-1,0,0,1,1,1 }; + Mat dm(gra.rows, gra.cols, CV_32F, Scalar(max)); + prev = Mat(gra.rows, gra.cols, CV_8U, Scalar(255)); + + std::multimap not_visited_with_value; + + for (auto i = points.begin(); i != points.end(); i++) + { + int y = static_cast(i->y); + int x = static_cast(i->x); + not_visited_with_value.insert(std::pair(0.f, Vec2i(y, x))); + dm.at(y, x) = 0; + } + + bool done = false; + while (!done) + { + if (not_visited_with_value.begin() == not_visited_with_value.end()) { + done = true; + break; + } + std::multimap::iterator current_it = not_visited_with_value.begin(); + std::pair current_p = *current_it; + not_visited_with_value.erase(current_it); + + int y = current_p.second[0]; + int x = current_p.second[1]; + float cur_d = current_p.first; + + if (dm.at(y, x) != cur_d) { + continue; + } + + Vec8f gra_e = gra.at(y, x); + + for 
(int i = 0; i < 8; i++) { + if (gra_e[i] < 0) { + continue; + } + int dx = Dx[i]; + int dy = Dy[i]; + + if (dm.at(y + dy, x + dx) > cur_d + gra_e[i]) { + dm.at(y + dy, x + dx) = cur_d + gra_e[i]; + prev.at(y + dy, x + dx) = static_cast(7 - i); + not_visited_with_value.insert(std::pair(cur_d + gra_e[i], Vec2i(y + dy, x + dx))); + } + + } + } + + + + + return dm; + } + + Mat interpolate_irregular_nn_raster(const std::vector & prevPoints, + const std::vector & nextPoints, + const std::vector & status, + const Mat & i1) + { + Mat gra = getGraph(i1, 0.1f); + int Dx[] = { -1,0,1,-1,1,-1,0,1 }; + int Dy[] = { -1,-1,-1,0,0,1,1,1 }; + int max_rounds = 10; + Mat dirt = Mat(gra.rows, gra.cols, CV_8U, Scalar(0)); + Mat quellknoten = Mat(gra.rows, gra.cols, CV_32S, Scalar(-1)); + Mat dist = Mat(gra.rows, gra.cols, CV_32F, Scalar(std::numeric_limits::max())); + /* + * assign quellknoten ids. + */ + for (int i = 0; i < prevPoints.size(); i++) + { + int x = (int)prevPoints[i].x; + int y = (int)prevPoints[i].y; + if (status[i] == 0) + continue; + dirt.at(y, x) = 1; + dist.at(y, x) = 0; + quellknoten.at(y, x) = i; + } + + bool clean = true; + bool done = false; + int x = 0; + int y = 0; + int rounds = 0; + while (!done) { + /* + * Update x and y + * on even rounds go rasterscanorder , on odd round inverse rasterscanorder + */ + if (rounds % 2 == 0) { + x++; + if (x >= gra.cols) { + x = 0; + y++; + if (y >= gra.rows) { + y = 0; + rounds++; + y = gra.rows - 1; + x = gra.cols - 1; + if (rounds >= max_rounds || clean) { + done = true; + break; + } + } + } + } + else { + x--; + if (x < 0) { + x = gra.cols - 1; + y--; + if (y < 0) { + y = gra.rows - 1; + rounds++; + y = 0; + x = 0; + if (rounds >= max_rounds || clean) { + done = true; + break; + } + } + } + } + if (dirt.at(y, x) == 0) { + continue; + } + dirt.at(y, x) = 0; + + float c_dist = dist.at(y, x); + Vec8f gra_e = gra.at(y, x); + + for (int i = 0; i < 8; i++) { + int tx = Dx[i]; + int ty = Dy[i]; + if (ty == 0 && tx == 0) { + 
continue; + } + if (x + tx < 0 || x + tx >= gra.cols) { + continue; + } + if (y + ty < 0 || y + ty >= gra.rows) { + continue; + } + if (c_dist > dist.at(y + ty, x + tx)) { + if (c_dist > dist.at(y + ty, x + tx) + gra_e[i]) { + quellknoten.at(y, x) = quellknoten.at(y + ty, x + tx); + dist.at(y, x) = dist.at(y + ty, x + tx) + gra_e[i]; + dirt.at(y, x) = 1; + clean = false; + } + } + else { + if (c_dist + gra_e[i] < dist.at(y + ty, x + tx)) { + quellknoten.at(y + ty, x + tx) = quellknoten.at(y, x); + dist.at(y + ty, x + tx) = dist.at(y, x) + gra_e[i]; + dirt.at(y + ty, x + tx) = 1; + clean = false; + } + } + } + + + + } + + Mat nnFlow(i1.rows, i1.cols, CV_32FC2, Scalar(0)); + for (y = 0; y < i1.rows; y++) { + for (x = 0; x < i1.cols; x++) { + + int id = quellknoten.at(y, x); + if (id != -1) + { + nnFlow.at(y, x) = nextPoints[id] - prevPoints[id]; + } + } + } + return nnFlow; + } + + Mat interpolate_irregular_knn( + const std::vector & _prevPoints, + const std::vector & _nextPoints, + const std::vector & status, + const Mat &color_img, + int k, + float pixeldistance) + { + Mat in(color_img.rows, color_img.cols, CV_32FC2); + Mat mask = Mat::zeros(color_img.rows, color_img.cols, CV_8UC1); + + for (unsigned n = 0; n < _prevPoints.size(); n++) + { + if (_prevPoints[n].x >= 0 && _prevPoints[n].y >= 0 && _prevPoints[n].x < color_img.cols && _prevPoints[n].y < color_img.rows) + { + in.at(_prevPoints[n]) = _nextPoints[n] - _prevPoints[n]; + mask.at(_prevPoints[n]) = status[n]; + } + + } + int Dx[] = { -1,0,1,-1,1,-1,0,1 }; + int Dy[] = { -1,-1,-1,0,0,1,1,1 }; + Mat gra = getGraph(color_img, pixeldistance); + Mat nnFlow(in.rows, in.cols, CV_32FC2, Scalar(0)); + + std::multimap my_agents; // > + Graph_helper graph_helper(k, in.rows*in.cols); //< arrivaltime, color> + + + int color = 0; + std::vector flow_point_list; + for (int y = 0; y < in.rows; y++) { + for (int x = 0; x < in.cols; x++) { + if (mask.at(y, x) > 0) { + flow_point_list.push_back(Vec2i(y, x)); + nnFlow.at(y, x) = 
in.at(y, x); + + int v_id = (y * in.cols + x); + graph_helper.add(v_id, std::pair(0.f, color)); + + + Vec8f gra_e = gra.at(y, x); + for (int i = 0; i < 8; i++) { + if (gra_e[i] < 0) + continue; + int dx = Dx[i]; + int dy = Dy[i]; + int target = (y + dy) * in.cols + (x + dx); + Vec2i agent(target, color); + my_agents.insert(std::pair(gra_e[i], agent)); + + } + color++; + } + } + } + + int global_time = 0; + + bool done = false; + while (!done) { + if (my_agents.size() == 0) { + done = true; + break; + } + global_time++; + + std::multimap::iterator current_it = my_agents.begin(); + std::pair current_p = *current_it; + my_agents.erase(current_it); + + int target = current_p.second[0]; + color = current_p.second[1]; + float arriv_time = current_p.first; + + Vec8f gra_e = gra.at(target);// (y*cols+x) + if (graph_helper.size(target) >= k) { + continue; + } + + bool color_found_in_target = graph_helper.color_in_target(target, color); + if (color_found_in_target) { + continue; + } + graph_helper.add(target, std::pair(arriv_time, color)); + + + for (int i = 0; i < 8; i++) { + if (gra_e[i] < 0) + continue; + + int dx = Dx[i]; + int dy = Dy[i]; + int new_target = target + dx + (dy*in.cols); + if (graph_helper.size(new_target) >= k) { + continue; + } + color_found_in_target = graph_helper.color_in_target(new_target, color); + if (color_found_in_target) { + continue; + } + Vec2i new_agent(new_target, color); + my_agents.insert(std::pair(arriv_time + gra_e[i], new_agent)); + + } + } + + Mat ret(in.rows, in.cols*k, CV_32FC2); + for (int y = 0; y < in.rows; y++) { + for (int x = 0; x < in.cols; x++) { + for (int i = 0; i < k; i++) { + float dist = *((float*)(graph_helper.data(y*in.cols + x) + 2 * i)); + float id = *((float*)(graph_helper.data(y*in.cols + x) + 2 * i + 1)); + ret.at(y, k*x + i) = Vec2f(dist, id); + } + } + } + return ret; + + for (int y = 0; y < in.rows; y++) { + for (int x = 0; x < in.cols; x++) { + int v_id = y * in.cols + x; + std::vector > flow_s(k); + + for 
(int i = 0; i < k; i++) { + color = *((int*)(graph_helper.data(v_id) + 1 + 2 * i)); + int cy = flow_point_list[color][0]; + int cx = flow_point_list[color][1]; + Vec2f flow = in.at(cy, cx); + flow_s[i] = std::make_pair(static_cast(norm(flow)), flow); + + + + } + + nnFlow.at(y, x) = std::min_element(flow_s.begin(), flow_s.end(), [](const std::pair &left, const std::pair &right) { + return left.first < right.first; + })->second; + + + } + } + free(graph_helper.mem); + return nnFlow; + + + } + Mat getGraph(const Mat &image, float edge_length) + { + + int Dx[] = { -1,0,1,-1,1,-1,0,1 }; + int Dy[] = { -1,-1,-1,0,0,1,1,1 }; + Mat gra(image.rows, image.cols, CV_32FC(8)); + + for (int y = 0; y < gra.rows; y++) { + for (int x = 0; x < gra.cols; x++) { + + for (int i = 0; i < 8; i++) { + int dx = Dx[i]; + int dy = Dy[i]; + gra.at(y, x)[i] = -1; + + if (x + dx < 0 || y + dy < 0 || x + dx >= gra.cols || y + dy >= gra.rows) { + continue; + } + + if (i < 4) { + int si = 7 - i; + gra.at(y, x)[i] = gra.at(y + dy, x + dx)[si]; + } + else { + float p1 = dx * dx*edge_length*edge_length + dy * dy*edge_length*edge_length; + float p2 = static_cast(image.at(y, x)[0] - image.at(y + dy, x + dx)[0]); + float p3 = static_cast(image.at(y, x)[1] - image.at(y + dy, x + dx)[1]); + float p4 = static_cast(image.at(y, x)[2] - image.at(y + dy, x + dx)[2]); + gra.at(y, x)[i] = sqrt(p1 + p2 * p2 + p3 * p3 + p4 * p4); + } + + } + + } + } + + return gra; + } + + + Mat interpolate_irregular_nn( + const std::vector & _prevPoints, + const std::vector & _nextPoints, + const std::vector & status, + const Mat &color_img, + float pixeldistance) + { + int Dx[] = { -1,0,1,-1,1,-1,0,1 }; + int Dy[] = { -1,-1,-1,0,0,1,1,1 }; + std::vector prevPoints, nextPoints; + std::map, std::pair> flowMap; + for (unsigned n = 0; n < _prevPoints.size(); n++) + { + if (status[n] != 0) + { + flowMap.insert(std::make_pair( + std::make_pair(_prevPoints[n].x, _prevPoints[n].y), + std::make_pair(_nextPoints[n].x, _nextPoints[n].y))); 
+ prevPoints.push_back(_prevPoints[n]); + nextPoints.push_back(_nextPoints[n]); + } + + } + + Mat gra = getGraph(color_img, pixeldistance); + + Mat prev; + Mat geo_dist = sgeo_dist(gra, prevPoints, std::numeric_limits::max(), prev); + + + Mat nnFlow = Mat::zeros(color_img.size(), CV_32FC2); + for (int y = 0; y < nnFlow.rows; y++) + { + for (int x = 0; x < nnFlow.cols; x++) + { + int cx = x; + int cy = y; + while (prev.at(cy, cx) != 255) + { + int i = prev.at(cy, cx); + cx += Dx[i]; + cy += Dy[i]; + } + auto val = flowMap[std::make_pair(static_cast(cx), static_cast(cy))]; + nnFlow.at(y, x) = Vec2f(val.first - cx, val.second - cy); + } + } + return nnFlow; + } + +} } \ No newline at end of file diff --git a/modules/optflow/src/rlof/geo_interpolation.hpp b/modules/optflow/src/rlof/geo_interpolation.hpp index 2c9dfbd38c0..60aef39bedc 100644 --- a/modules/optflow/src/rlof/geo_interpolation.hpp +++ b/modules/optflow/src/rlof/geo_interpolation.hpp @@ -1,75 +1,37 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// - // - // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. - // - // By downloading, copying, installing or using the software you agree to this license. - // If you do not agree to this license, do not download, install, - // copy or use the software. - // - // - // License Agreement - // For Open Source Computer Vision Library - // - // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. - // Copyright (C) 2009, Willow Garage Inc., all rights reserved. - // Third party copyrights are property of their respective owners. - // - // Redistribution and use in source and binary forms, with or without modification, - // are permitted provided that the following conditions are met: - // - // * Redistribution's of source code must retain the above copyright notice, - // this list of conditions and the following disclaimer. 
- // - // * Redistribution's in binary form must reproduce the above copyright notice, - // this list of conditions and the following disclaimer in the documentation - // and/or other materials provided with the distribution. - // - // * The name of the copyright holders may not be used to endorse or promote products - // derived from this software without specific prior written permission. - // - // This software is provided by the copyright holders and contributors "as is" and - // any express or implied warranties, including, but not limited to, the implied - // warranties of merchantability and fitness for a particular purpose are disclaimed. - // In no event shall the Intel Corporation or contributors be liable for any direct, - // indirect, incidental, special, exemplary, or consequential damages - // (including, but not limited to, procurement of substitute goods or services; - // loss of use, data, or profits; or business interruption) however caused - // and on any theory of liability, whether in contract, strict liability, - // or tort (including negligence or otherwise) arising in any way out of - // the use of this software, even if advised of the possibility of such damage. - // - //M*/ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
#ifndef _GEO_INTERPOLATION_HPP_ #define _GEO_INTERPOLATION_HPP_ #include namespace cv { - namespace optflow - { - typedef Vec Vec8f; - Mat getGraph(const Mat & image, float edge_length); - Mat sgeo_dist(const Mat& gra, int y, int x, float max, Mat &prev); - Mat sgeo_dist(const Mat& gra, const std::vector & points, float max, Mat &prev); - Mat interpolate_irregular_nw(const Mat &in, const Mat &mask, const Mat &color_img, float max_d, float bandwidth, float pixeldistance); - Mat interpolate_irregular_nn( - const std::vector & prevPoints, - const std::vector & nextPoints, - const std::vector & status, - const Mat &color_img, - float pixeldistance); - Mat interpolate_irregular_knn( - const std::vector & _prevPoints, - const std::vector & _nextPoints, - const std::vector & status, - const Mat &color_img, - int k, - float pixeldistance); +namespace optflow +{ + typedef Vec Vec8f; + Mat getGraph(const Mat & image, float edge_length); + Mat sgeo_dist(const Mat& gra, int y, int x, float max, Mat &prev); + Mat sgeo_dist(const Mat& gra, const std::vector & points, float max, Mat &prev); + Mat interpolate_irregular_nw(const Mat &in, const Mat &mask, const Mat &color_img, float max_d, float bandwidth, float pixeldistance); + Mat interpolate_irregular_nn( + const std::vector & prevPoints, + const std::vector & nextPoints, + const std::vector & status, + const Mat &color_img, + float pixeldistance); + Mat interpolate_irregular_knn( + const std::vector & _prevPoints, + const std::vector & _nextPoints, + const std::vector & status, + const Mat &color_img, + int k, + float pixeldistance); - Mat interpolate_irregular_nn_raster(const std::vector & prevPoints, - const std::vector & nextPoints, - const std::vector & status, - const Mat & i1); + Mat interpolate_irregular_nn_raster(const std::vector & prevPoints, + const std::vector & nextPoints, + const std::vector & status, + const Mat & i1); - } +} } #endif diff --git a/modules/optflow/src/rlof/plk_invoker.hpp 
b/modules/optflow/src/rlof/plk_invoker.hpp index 1a8f5eec380..2637fa7fbe7 100644 --- a/modules/optflow/src/rlof/plk_invoker.hpp +++ b/modules/optflow/src/rlof/plk_invoker.hpp @@ -1,51 +1,13 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// - // - // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. - // - // By downloading, copying, installing or using the software you agree to this license. - // If you do not agree to this license, do not download, install, - // copy or use the software. - // - // - // License Agreement - // For Open Source Computer Vision Library - // - // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. - // Copyright (C) 2009, Willow Garage Inc., all rights reserved. - // Third party copyrights are property of their respective owners. - // - // Redistribution and use in source and binary forms, with or without modification, - // are permitted provided that the following conditions are met: - // - // * Redistribution's of source code must retain the above copyright notice, - // this list of conditions and the following disclaimer. - // - // * Redistribution's in binary form must reproduce the above copyright notice, - // this list of conditions and the following disclaimer in the documentation - // and/or other materials provided with the distribution. - // - // * The name of the copyright holders may not be used to endorse or promote products - // derived from this software without specific prior written permission. - // - // This software is provided by the copyright holders and contributors "as is" and - // any express or implied warranties, including, but not limited to, the implied - // warranties of merchantability and fitness for a particular purpose are disclaimed. 
- // In no event shall the Intel Corporation or contributors be liable for any direct, - // indirect, incidental, special, exemplary, or consequential damages - // (including, but not limited to, procurement of substitute goods or services; - // loss of use, data, or profits; or business interruption) however caused - // and on any theory of liability, whether in contract, strict liability, - // or tort (including negligence or otherwise) arising in any way out of - // the use of this software, even if advised of the possibility of such damage. - // - //M*/ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. #ifndef _PLK_INVOKER_HPP_ #define _PLK_INVOKER_HPP_ #include "rlof_invokerbase.hpp" namespace cv{ - namespace optflow{ +namespace optflow{ namespace plk { @@ -57,89 +19,89 @@ class TrackerInvoker : public cv::ParallelLoopBody public: TrackerInvoker( - const Mat& _prevImg, - const Mat& _prevDeriv, - const Mat& _nextImg, - const Mat& _rgbPrevImg, - const Mat& _rgbNextImg, - const Point2f* _prevPts, - Point2f* _nextPts, - uchar* _status, - float* _err, - Point2f* _gainVecs, - int _level, - int _maxLevel, - int _winSize[2], - int _maxIteration, - bool _useInitialFlow, - int _supportRegionType, - float _minEigenValue, - int _crossSegmentationThreshold) - { - prevImg = &_prevImg; - prevDeriv = &_prevDeriv; - nextImg = &_nextImg; - rgbPrevImg = &_rgbPrevImg; - rgbNextImg = &_rgbNextImg; - prevPts = _prevPts; - nextPts = _nextPts; - status = _status; - err = _err; - gainVecs = _gainVecs; - minWinSize = _winSize[0]; - maxWinSize = _winSize[1]; - criteria.maxCount = _maxIteration; - criteria.epsilon = 0.01; - level = _level; - maxLevel = _maxLevel; - windowType = _supportRegionType; - minEigThreshold = _minEigenValue; - useInitialFlow = _useInitialFlow; - crossSegmentationThreshold = _crossSegmentationThreshold; - } + const Mat& 
_prevImg, + const Mat& _prevDeriv, + const Mat& _nextImg, + const Mat& _rgbPrevImg, + const Mat& _rgbNextImg, + const Point2f* _prevPts, + Point2f* _nextPts, + uchar* _status, + float* _err, + Point2f* _gainVecs, + int _level, + int _maxLevel, + int _winSize[2], + int _maxIteration, + bool _useInitialFlow, + int _supportRegionType, + float _minEigenValue, + int _crossSegmentationThreshold) + { + prevImg = &_prevImg; + prevDeriv = &_prevDeriv; + nextImg = &_nextImg; + rgbPrevImg = &_rgbPrevImg; + rgbNextImg = &_rgbNextImg; + prevPts = _prevPts; + nextPts = _nextPts; + status = _status; + err = _err; + gainVecs = _gainVecs; + minWinSize = _winSize[0]; + maxWinSize = _winSize[1]; + criteria.maxCount = _maxIteration; + criteria.epsilon = 0.01; + level = _level; + maxLevel = _maxLevel; + windowType = _supportRegionType; + minEigThreshold = _minEigenValue; + useInitialFlow = _useInitialFlow; + crossSegmentationThreshold = _crossSegmentationThreshold; + } - TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; + TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; void operator()(const cv::Range& range) const { - cv::Size winSize; - cv::Point2f halfWin; + cv::Size winSize; + cv::Point2f halfWin; - const Mat& I = *prevImg; - const Mat& J = *nextImg; + const Mat& I = *prevImg; + const Mat& J = *nextImg; const Mat& derivI = *prevDeriv; - const Mat& BI = *rgbPrevImg; + const Mat& BI = *rgbPrevImg; - winSize = cv::Size(maxWinSize,maxWinSize); - int winMaskwidth = static_cast(ceil(winSize.width / 16.f)) * 16; - cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); - winMaskMatBuf.setTo(1); - - const float FLT_SCALE = (1.f/(1 << 16));//(1.f/(1 << 20)); // 20 + winSize = cv::Size(maxWinSize,maxWinSize); + int winMaskwidth = static_cast(ceil(winSize.width / 16.f)) * 16; + cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); + winMaskMatBuf.setTo(1); + + const float FLT_SCALE = (1.f/(1 << 16));//(1.f/(1 << 20)); // 20 int cn = 
I.channels(), cn2 = cn*2; - int winbufwidth = static_cast(ceil(winSize.width / 16.f)) * 16; - cv::Size winBufSize(winbufwidth,winbufwidth); + int winbufwidth = static_cast(ceil(winSize.width / 16.f)) * 16; + cv::Size winBufSize(winbufwidth,winbufwidth); - cv::Mat invTensorMat(4,4, CV_32FC1); - cv::Mat mismatchMat(4, 1, CV_32FC1); - cv::Mat resultMat(4, 1, CV_32FC1); + cv::Mat invTensorMat(4,4, CV_32FC1); + cv::Mat mismatchMat(4, 1, CV_32FC1); + cv::Mat resultMat(4, 1, CV_32FC1); - cv::AutoBuffer _buf(winBufSize.area()*(cn + cn2)); - int derivDepth = DataType::depth; + cv::AutoBuffer _buf(winBufSize.area()*(cn + cn2)); + int derivDepth = DataType::depth; - Mat IWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn), (deriv_type*)_buf.data()); - Mat derivIWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn2), (deriv_type*)_buf.data() + winBufSize.area()*cn); + Mat IWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn), (deriv_type*)_buf.data()); + Mat derivIWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn2), (deriv_type*)_buf.data() + winBufSize.area()*cn); for( int ptidx = range.start; ptidx < range.end; ptidx++ ) { - Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level)); + Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level)); Point2f nextPt; if( level == maxLevel ) { if( useInitialFlow ) - { + { nextPt = nextPts[ptidx]*(float)(1./(1 << level)); - } + } else nextPt = prevPt; } @@ -147,25 +109,25 @@ class TrackerInvoker : public cv::ParallelLoopBody nextPt = nextPts[ptidx]*2.f; nextPts[ptidx] = nextPt; - Point2i iprevPt, inextPt; - iprevPt.x = cvFloor(prevPt.x); - iprevPt.y = cvFloor(prevPt.y); - int winArea = maxWinSize * maxWinSize; - cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0,0, maxWinSize,maxWinSize)); - winMaskMatBuf.setTo(0); - if( calcWinMaskMat(BI, windowType, iprevPt, - winMaskMat,winSize,halfWin,winArea, - minWinSize,maxWinSize, crossSegmentationThreshold) == false) - continue; - - - prevPt -= halfWin; - iprevPt.x = cvFloor(prevPt.x); + Point2i iprevPt, inextPt; + 
iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + int winArea = maxWinSize * maxWinSize; + cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0,0, maxWinSize,maxWinSize)); + winMaskMatBuf.setTo(0); + if( calcWinMaskMat(BI, windowType, iprevPt, + winMaskMat,winSize,halfWin,winArea, + minWinSize,maxWinSize, crossSegmentationThreshold) == false) + continue; + + + prevPt -= halfWin; + iprevPt.x = cvFloor(prevPt.x); iprevPt.y = cvFloor(prevPt.y); - if( iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols || - iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows ) + if( iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols || + iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows ) { if( level == 0 ) { @@ -188,32 +150,32 @@ class TrackerInvoker : public cv::ParallelLoopBody int dstep = (int)(derivI.step/derivI.elemSize1()); int step = (int)(I.step/I.elemSize1()); - int mStep = (int)(winMaskMat.step/winMaskMat.elemSize1()); + int mStep = (int)(winMaskMat.step/winMaskMat.elemSize1()); CV_Assert( step == (int)(J.step/J.elemSize1()) ); float A11 = 0, A12 = 0, A22 = 0; // tensor - float sumIx = 0; - float sumIy = 0; - float sumI = 0; - float sumW = 0; - float w1 = 0, w2 = 0; // -IyI - float dI = 0; // I^2 - float D = 0; - #ifdef CV_SSE2 + float sumIx = 0; + float sumIy = 0; + float sumI = 0; + float sumW = 0; + float w1 = 0, w2 = 0; // -IyI + float dI = 0; // I^2 + float D = 0; + #ifdef CV_SSE4_1 __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); __m128i z = _mm_setzero_si128(); __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1-1)); __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1-5-1)); - __m128i mmMask4_epi32; - __m128i mmMaskSet_epi16 = _mm_set1_epi16(std::numeric_limits::max()); - get4BitMask(winSize.width, mmMask4_epi32); + __m128i mmMask4_epi32; + __m128i mmMaskSet_epi16 = _mm_set1_epi16(std::numeric_limits::max()); + get4BitMask(winSize.width, mmMask4_epi32); #endif // extract the patch from the first 
image, compute covariation matrix of derivatives int x, y; for( y = 0; y < winSize.height; y++ ) { - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; @@ -221,12 +183,12 @@ class TrackerInvoker : public cv::ParallelLoopBody short* dIptr = (short*)(derivIWinBuf.data + y*derivIWinBuf.step); x = 0; -#ifdef CV_SSE2 +#ifdef CV_SSE4_1 for( ; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 ) { - __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16); - __m128i mask_0_3_epi16 = _mm_unpacklo_epi16(mask_0_7_epi16, mask_0_7_epi16); - __m128i v00, v01, v10, v11, t0, t1; + __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16); + __m128i mask_0_3_epi16 = _mm_unpacklo_epi16(mask_0_7_epi16, mask_0_7_epi16); + __m128i v00, v01, v10, v11, t0, t1; v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); @@ -234,12 +196,12 @@ class TrackerInvoker : public cv::ParallelLoopBody t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); - t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); - if( x + 4 > winSize.width) - { - t0 = _mm_and_si128(t0, mmMask4_epi32); - } - t0 = _mm_and_si128(t0, mask_0_3_epi16); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); + if( x + 4 > winSize.width) + { + t0 = _mm_and_si128(t0, mmMask4_epi32); + } + t0 = _mm_and_si128(t0, mask_0_3_epi16); _mm_storel_epi64((__m128i*)(Iptr + x), 
_mm_packs_epi32(t0,t0)); @@ -256,57 +218,57 @@ class TrackerInvoker : public cv::ParallelLoopBody t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... if( x + 4 > winSize.width) - { - v00 = _mm_and_si128(v00, mmMask4_epi32); - } - v00 = _mm_and_si128(v00, mask_0_3_epi16); + { + v00 = _mm_and_si128(v00, mmMask4_epi32); + } + v00 = _mm_and_si128(v00, mask_0_3_epi16); _mm_storeu_si128((__m128i*)dIptr, v00); } #else for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2 ) { - if( maskPtr[x] == 0) - { - dIptr[0] = 0; - dIptr[1] = 0; - continue; - } - int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + + if( maskPtr[x] == 0) + { + dIptr[0] = 0; + dIptr[1] = 0; + continue; + } + int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + src[x+step]*iw10 + src[x+step+cn]*iw11, W_BITS1-5); int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1); int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 + dsrc[dstep+cn2+1]*iw11, W_BITS1); - Iptr[x] = (short)ival; - dIptr[0] = (short)ixval; - dIptr[1] = (short)iyval; + Iptr[x] = (short)ival; + dIptr[0] = (short)ixval; + dIptr[1] = (short)iyval; } #endif } - cv::Point2f backUpNextPt = nextPt; - nextPt -= halfWin; - Point2f prevDelta(0,0); //relates to h(t-1) - Point2f prevGain(1,0); - cv::Point2f gainVec = gainVecs[ptidx]; - cv::Point2f backUpGain = gainVec; - int j; + cv::Point2f backUpNextPt = nextPt; + nextPt -= halfWin; + Point2f prevDelta(0,0); //relates to h(t-1) + Point2f prevGain(1,0); + cv::Point2f gainVec = gainVecs[ptidx]; + cv::Point2f backUpGain = gainVec; + int j; for( j = 0; j < criteria.maxCount; j++ ) { - status[ptidx] = static_cast(j); + status[ptidx] = static_cast(j); inextPt.x = cvFloor(nextPt.x); inextPt.y = cvFloor(nextPt.y); - - if( inextPt.x + halfWin.x < 0 || inextPt.x + halfWin.x>= derivI.cols || - inextPt.y + halfWin.y < 0 || inextPt.y + halfWin.y >= derivI.rows ) - { - if( level 
== 0 && status ) - status[ptidx] = 3; - break; - } + + if( inextPt.x + halfWin.x < 0 || inextPt.x + halfWin.x>= derivI.cols || + inextPt.y + halfWin.y < 0 || inextPt.y + halfWin.y >= derivI.rows ) + { + if( level == 0 && status ) + status[ptidx] = 3; + break; + } a = nextPt.x - inextPt.x; b = nextPt.y - inextPt.y; @@ -314,467 +276,467 @@ class TrackerInvoker : public cv::ParallelLoopBody iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - float b1,b2,b3,b4; + float b1,b2,b3,b4; b1 = 0; - b2 = 0; - b3 = 0; - b4 = 0; - - if( j == 0 ) - { - // tensor - w1 = 0; // -IxI - w2 = 0; // -IyI - dI = 0; // I^2 - sumIx = 0; - sumIy = 0; - sumI = 0; - sumW = 0; - A11 = 0; - A12 = 0; - A22 = 0; - } -#ifdef CV_SSE2 + b2 = 0; + b3 = 0; + b4 = 0; + + if( j == 0 ) + { + // tensor + w1 = 0; // -IxI + w2 = 0; // -IyI + dI = 0; // I^2 + sumIx = 0; + sumIy = 0; + sumI = 0; + sumW = 0; + A11 = 0; + A12 = 0; + A22 = 0; + } +#ifdef CV_SSE4_1 qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); __m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps(), qb2 = _mm_setzero_ps(); - __m128 qb3 = _mm_setzero_ps(); - __m128 mmSumW1 = _mm_setzero_ps(), mmSumW2 = _mm_setzero_ps(); - __m128 mmSumI = _mm_setzero_ps(), mmSumW = _mm_setzero_ps(), mmSumDI = _mm_setzero_ps(); - __m128 mmSumIy = _mm_setzero_ps(), mmSumIx = _mm_setzero_ps(); - __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps(); - float gainVal = gainVec.x > 0 ? gainVec.x : -gainVec.x; - int bitShift = gainVec.x == 0 ? 
1 : static_cast(ceil(log(200.f / gainVal) / log(2.f))); // zwei stellen nach den komma - __m128i mmGainValue_epi16 = _mm_set1_epi16(static_cast(gainVec.x * (float)(1 << bitShift))); - __m128i mmConstValue_epi16 = _mm_set1_epi16(static_cast(gainVec.y)); + __m128 qb3 = _mm_setzero_ps(); + __m128 mmSumW1 = _mm_setzero_ps(), mmSumW2 = _mm_setzero_ps(); + __m128 mmSumI = _mm_setzero_ps(), mmSumW = _mm_setzero_ps(), mmSumDI = _mm_setzero_ps(); + __m128 mmSumIy = _mm_setzero_ps(), mmSumIx = _mm_setzero_ps(); + __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps(); + float gainVal = gainVec.x > 0 ? gainVec.x : -gainVec.x; + int bitShift = gainVec.x == 0 ? 1 : static_cast(ceil(log(200.f / gainVal) / log(2.f))); // zwei stellen nach den komma + __m128i mmGainValue_epi16 = _mm_set1_epi16(static_cast(gainVec.x * (float)(1 << bitShift))); + __m128i mmConstValue_epi16 = _mm_set1_epi16(static_cast(gainVec.y)); #endif - for( y = 0; y < winSize.height; y++ ) + for( y = 0; y < winSize.height; y++ ) { - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x )*cn; const short* Iptr = (const short*)(IWinBuf.data + ( y)*IWinBuf.step); const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); x = 0; -#ifdef CV_SSE2 - +#ifdef CV_SSE4_1 + for( ; x <= winSize.width*cn; x += 8, dIptr += 8*2 ) { - // iw = iw << 14 (16384 short / 2) - // iq0 = iw01 |iw00, iw01 |iw00, iw01 |iw00, iw01 |iw00 16 bit - __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16); - - // I [0...8] + // iw = iw << 14 (16384 short / 2) + // iq0 = iw01 |iw00, iw01 |iw00, iw01 |iw00, iw01 |iw00 16 bit + __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16); + + // I 
[0...8] __m128i diff0, diff1; - __m128i I_0_7_epi16 = _mm_loadu_si128((const __m128i*)(Iptr + x)); // element 0 to 7 - + __m128i I_0_7_epi16 = _mm_loadu_si128((const __m128i*)(Iptr + x)); // element 0 to 7 + __m128i v00 = _mm_unpacklo_epi8( - _mm_loadl_epi64((const __m128i*)(Jptr + x)) // J0, J1, J2, ..., J7, 64 bit nullen - , z); //J0 , 0, J1, 0, J2, 0 ... J7,0 + _mm_loadl_epi64((const __m128i*)(Jptr + x)) // J0, J1, J2, ..., J7, 64 bit nullen + , z); //J0 , 0, J1, 0, J2, 0 ... J7,0 __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); - - __m128i t0 = _mm_add_epi32 - (_mm_madd_epi16( - _mm_unpacklo_epi16(v00, v01), //J0, , J1, J1, J2, J2, ... J3 , J4 - qw0), + + __m128i t0 = _mm_add_epi32 + (_mm_madd_epi16( + _mm_unpacklo_epi16(v00, v01), //J0, , J1, J1, J2, J2, ... J3 , J4 + qw0), _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); - - __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + + __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); - t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1-5); - - - __m128i Ilo = _mm_mullo_epi16(mmGainValue_epi16, I_0_7_epi16); - __m128i Ihi = _mm_mulhi_epi16(mmGainValue_epi16, I_0_7_epi16); - - __m128i Igain_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(Ilo, Ihi), bitShift); - __m128i Igain_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(Ilo, Ihi), bitShift); - __m128i Igain_epi16 = _mm_packs_epi32(Igain_0_3_epi32, Igain_4_7_epi32); - - // diff = J - I + I*gain.x + gain.y - __m128i mmDiff_epi16 = _mm_add_epi16( - _mm_subs_epi16(_mm_packs_epi32(t0, t1), I_0_7_epi16), // J - I - 
_mm_add_epi16(Igain_epi16, mmConstValue_epi16)); - - - mmDiff_epi16 = _mm_and_si128(mmDiff_epi16, mask_0_7_epi16); - - - __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... + + + __m128i Ilo = _mm_mullo_epi16(mmGainValue_epi16, I_0_7_epi16); + __m128i Ihi = _mm_mulhi_epi16(mmGainValue_epi16, I_0_7_epi16); + + __m128i Igain_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(Ilo, Ihi), bitShift); + __m128i Igain_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(Ilo, Ihi), bitShift); + __m128i Igain_epi16 = _mm_packs_epi32(Igain_0_3_epi32, Igain_4_7_epi32); + + // diff = J - I + I*gain.x + gain.y + __m128i mmDiff_epi16 = _mm_add_epi16( + _mm_subs_epi16(_mm_packs_epi32(t0, t1), I_0_7_epi16), // J - I + _mm_add_epi16(Igain_epi16, mmConstValue_epi16)); + + + mmDiff_epi16 = _mm_and_si128(mmDiff_epi16, mask_0_7_epi16); + + + __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); - - diff1 = _mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13... + + diff1 = _mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13... diff0 = _mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9... 
- v10 = _mm_mullo_epi16(Ixy_0, diff0); + v10 = _mm_mullo_epi16(Ixy_0, diff0); v11 = _mm_mulhi_epi16(Ixy_0, diff0); v00 = _mm_unpacklo_epi16(v10, v11); v10 = _mm_unpackhi_epi16(v10, v11); - qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); - qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); + qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); + qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); - v10 = _mm_mullo_epi16(Ixy_1, diff1); - v11 = _mm_mulhi_epi16(Ixy_1, diff1); + v10 = _mm_mullo_epi16(Ixy_1, diff1); + v11 = _mm_mulhi_epi16(Ixy_1, diff1); v00 = _mm_unpacklo_epi16(v10, v11); v10 = _mm_unpackhi_epi16(v10, v11); - qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); - qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); - // for set 1 sigma 1 - // b3 += diff * Iptr[x] - Ilo = _mm_mullo_epi16(mmDiff_epi16, I_0_7_epi16); - Ihi = _mm_mulhi_epi16(mmDiff_epi16, I_0_7_epi16); - v00 = _mm_unpacklo_epi16(Ilo, Ihi); - v10 = _mm_unpackhi_epi16(Ilo, Ihi); - qb2 = _mm_add_ps(qb2, _mm_cvtepi32_ps(v00)); - qb2 = _mm_add_ps(qb2, _mm_cvtepi32_ps(v10)); - - // b4+= diff - __m128 mmDiff_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mmDiff_epi16)); // (_mm_srai_epi32(_mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16),16)); - __m128 mmDiff_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16),16))); - qb3 = _mm_add_ps(qb3, mmDiff_0_3_ps); - qb3 = _mm_add_ps(qb3, mmDiff_4_7_ps); - if( j == 0 ) - { - __m128 mask_0_4_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mask_0_7_epi16)); - __m128 mask_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mask_0_7_epi16, mask_0_7_epi16), 16))); - - t0 = _mm_srai_epi32(Ixy_0, 16); // Iy0 Iy1 Iy2 Iy3 - t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16); // Ix0 Ix1 Ix2 Ix3 - - __m128 fy = _mm_blendv_ps(_mm_set1_ps(0), _mm_cvtepi32_ps(t0), mask_0_4_ps); - __m128 fx = _mm_blendv_ps(_mm_set1_ps(0), _mm_cvtepi32_ps(t1), mask_0_4_ps); - - // 0 ... 
3 - __m128 I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(I_0_7_epi16, I_0_7_epi16), 16)); - - // A11 - A22 - mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fy)); - mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fy)); - mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fx)); - - // sumIx und sumIy - mmSumIx = _mm_add_ps(mmSumIx, fx); - mmSumIy = _mm_add_ps(mmSumIy, fy); - - mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fx)); - mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fy)); - - // sumI - I_ps = _mm_blendv_ps(_mm_set1_ps(0), I_ps, mask_0_4_ps); - mmSumI = _mm_add_ps(mmSumI,I_ps); - - // sumDI - mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps( I_ps, I_ps)); - - t0 = _mm_srai_epi32(Ixy_1, 16); // Iy8 Iy9 Iy10 Iy11 - t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_1, 16), 16); // Ix0 Ix1 Ix2 Ix3 - - fy = _mm_blendv_ps(_mm_set1_ps(0), _mm_cvtepi32_ps(t0), mask_4_7_ps); - fx = _mm_blendv_ps(_mm_set1_ps(0), _mm_cvtepi32_ps(t1), mask_4_7_ps); - - // 4 ... 7 - I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(I_0_7_epi16, I_0_7_epi16), 16)); - - - - mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fy)); - mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fy)); - mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fx)); - - // sumIx und sumIy - mmSumIx = _mm_add_ps(mmSumIx, fx); - mmSumIy = _mm_add_ps(mmSumIy, fy); - - mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fx)); - mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fy)); - - // sumI - I_ps = _mm_blendv_ps(_mm_set1_ps(0), I_ps, mask_4_7_ps); - mmSumI = _mm_add_ps(mmSumI, I_ps); - - // sumW - // sumDI - mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps( I_ps, I_ps)); - } - - } + qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); + qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); + // for set 1 sigma 1 + // b3 += diff * Iptr[x] + Ilo = _mm_mullo_epi16(mmDiff_epi16, I_0_7_epi16); + Ihi = _mm_mulhi_epi16(mmDiff_epi16, I_0_7_epi16); + v00 = _mm_unpacklo_epi16(Ilo, Ihi); + v10 = _mm_unpackhi_epi16(Ilo, Ihi); + qb2 = _mm_add_ps(qb2, _mm_cvtepi32_ps(v00)); + qb2 = _mm_add_ps(qb2, 
_mm_cvtepi32_ps(v10)); + + // b4+= diff + __m128 mmDiff_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mmDiff_epi16)); // (_mm_srai_epi32(_mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16),16)); + __m128 mmDiff_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16),16))); + qb3 = _mm_add_ps(qb3, mmDiff_0_3_ps); + qb3 = _mm_add_ps(qb3, mmDiff_4_7_ps); + if( j == 0 ) + { + __m128 mask_0_4_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mask_0_7_epi16)); + __m128 mask_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mask_0_7_epi16, mask_0_7_epi16), 16))); + + t0 = _mm_srai_epi32(Ixy_0, 16); // Iy0 Iy1 Iy2 Iy3 + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16); // Ix0 Ix1 Ix2 Ix3 + + __m128 fy = _mm_blendv_ps(_mm_set1_ps(0), _mm_cvtepi32_ps(t0), mask_0_4_ps); + __m128 fx = _mm_blendv_ps(_mm_set1_ps(0), _mm_cvtepi32_ps(t1), mask_0_4_ps); + + // 0 ... 3 + __m128 I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(I_0_7_epi16, I_0_7_epi16), 16)); + + // A11 - A22 + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fy)); + mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fy)); + mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fx)); + + // sumIx und sumIy + mmSumIx = _mm_add_ps(mmSumIx, fx); + mmSumIy = _mm_add_ps(mmSumIy, fy); + + mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fx)); + mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fy)); + + // sumI + I_ps = _mm_blendv_ps(_mm_set1_ps(0), I_ps, mask_0_4_ps); + mmSumI = _mm_add_ps(mmSumI,I_ps); + + // sumDI + mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps( I_ps, I_ps)); + + t0 = _mm_srai_epi32(Ixy_1, 16); // Iy8 Iy9 Iy10 Iy11 + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_1, 16), 16); // Ix0 Ix1 Ix2 Ix3 + + fy = _mm_blendv_ps(_mm_set1_ps(0), _mm_cvtepi32_ps(t0), mask_4_7_ps); + fx = _mm_blendv_ps(_mm_set1_ps(0), _mm_cvtepi32_ps(t1), mask_4_7_ps); + + // 4 ... 
7 + I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(I_0_7_epi16, I_0_7_epi16), 16)); + + + + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fy)); + mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fy)); + mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fx)); + + // sumIx und sumIy + mmSumIx = _mm_add_ps(mmSumIx, fx); + mmSumIy = _mm_add_ps(mmSumIy, fy); + + mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fx)); + mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fy)); + + // sumI + I_ps = _mm_blendv_ps(_mm_set1_ps(0), I_ps, mask_4_7_ps); + mmSumI = _mm_add_ps(mmSumI, I_ps); + + // sumW + // sumDI + mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps( I_ps, I_ps)); + } + + } #else for( ; x < winSize.width*cn; x++, dIptr += 2 ) { - if( maskPtr[x] == 0) - continue; + if( maskPtr[x] == 0) + continue; int J = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, W_BITS1-5); - int diff = J - Iptr[x] + static_cast(Iptr[x]) * gainVec.x + gainVec.y; - - b1 += (float)(diff*dIptr[0]) * FLT_RESCALE; + int diff = J - Iptr[x] + static_cast(Iptr[x]) * gainVec.x + gainVec.y; + + b1 += (float)(diff*dIptr[0]) * FLT_RESCALE; b2 += (float)(diff*dIptr[1]) * FLT_RESCALE; - b3 += (float)(diff) * Iptr[x] * FLT_RESCALE; - b4 += (float)(diff); - - // compute the Gradient Matrice - if( j == 0 ) - { - A11 += (float)(dIptr[0]*dIptr[0]); - A12 += (float)(dIptr[0]*dIptr[1]); - A22 += (float)(dIptr[1]*dIptr[1]); - - - dI += Iptr[x] * Iptr[x] * FLT_RESCALE; - float dx = static_cast(dIptr[0]) * FLT_RESCALE; - float dy = static_cast(dIptr[1]) * FLT_RESCALE; - sumIx += dx; - sumIy += dy; - w1 += dx * Iptr[x]; - w2 += dy * Iptr[x]; - sumI += Iptr[x] * FLT_RESCALE; - //sumW += FLT_RESCALE; - - } + b3 += (float)(diff) * Iptr[x] * FLT_RESCALE; + b4 += (float)(diff); + + // compute the Gradient Matrice + if( j == 0 ) + { + A11 += (float)(dIptr[0]*dIptr[0]); + A12 += (float)(dIptr[0]*dIptr[1]); + A22 += (float)(dIptr[1]*dIptr[1]); + + + dI += Iptr[x] * Iptr[x] * FLT_RESCALE; + float dx = 
static_cast(dIptr[0]) * FLT_RESCALE; + float dy = static_cast(dIptr[1]) * FLT_RESCALE; + sumIx += dx; + sumIy += dy; + w1 += dx * Iptr[x]; + w2 += dy * Iptr[x]; + sumI += Iptr[x] * FLT_RESCALE; + //sumW += FLT_RESCALE; + + } } #endif } - - float CV_DECL_ALIGNED(16) wbuf[4];// - if( j == 0 ) - { -#ifdef CV_SSE2 - _mm_store_ps(wbuf, mmSumW1); - w1 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumW2); - w2 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumDI); - dI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumI); - sumI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumIx); - sumIx = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumIy); - sumIy = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumW); - sumW = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + + float CV_DECL_ALIGNED(16) wbuf[4];// + if( j == 0 ) + { +#ifdef CV_SSE4_1 + _mm_store_ps(wbuf, mmSumW1); + w1 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumW2); + w2 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumDI); + dI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumI); + sumI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumIx); + sumIx = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumIy); + sumIy = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumW); + sumW = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; #endif - sumIx *= -FLT_SCALE; - sumIy *= -FLT_SCALE; - sumI *=FLT_SCALE; - sumW = winArea * FLT_SCALE; - w1 *= -FLT_SCALE; - w2 *= -FLT_SCALE; - dI *= FLT_SCALE; -#ifdef CV_SSE2 - float CV_DECL_ALIGNED(32) A11buf[4], A12buf[4], A22buf[4];// - - _mm_store_ps(A11buf, mmAxx); - _mm_store_ps(A12buf, mmAxy); - _mm_store_ps(A22buf, mmAyy); - - A11 = A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; - A12 = A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; - A22 = A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; + sumIx *= 
-FLT_SCALE; + sumIy *= -FLT_SCALE; + sumI *=FLT_SCALE; + sumW = winArea * FLT_SCALE; + w1 *= -FLT_SCALE; + w2 *= -FLT_SCALE; + dI *= FLT_SCALE; +#ifdef CV_SSE4_1 + float CV_DECL_ALIGNED(32) A11buf[4], A12buf[4], A22buf[4];// + + _mm_store_ps(A11buf, mmAxx); + _mm_store_ps(A12buf, mmAxy); + _mm_store_ps(A22buf, mmAyy); + + A11 = A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; + A12 = A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; + A22 = A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; #endif - A11 *= FLT_SCALE; - A12 *= FLT_SCALE; - A22 *= FLT_SCALE; - } + A11 *= FLT_SCALE; + A12 *= FLT_SCALE; + A22 *= FLT_SCALE; + } -#ifdef CV_SSE2 +#ifdef CV_SSE4_1 float CV_DECL_ALIGNED(16) bbuf[4]; _mm_store_ps(bbuf, _mm_add_ps(qb0, qb1)); - b1 = bbuf[0] + bbuf[2]; - b2 = bbuf[1] + bbuf[3]; - _mm_store_ps(bbuf, qb2); - b3 = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; - _mm_store_ps(bbuf, qb3); - b4 = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; + b1 = bbuf[0] + bbuf[2]; + b2 = bbuf[1] + bbuf[3]; + _mm_store_ps(bbuf, qb2); + b3 = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; + _mm_store_ps(bbuf, qb3); + b4 = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; #endif - mismatchMat.at(0,0) = b1 * FLT_SCALE; - mismatchMat.at(1,0) = b2 * FLT_SCALE; - mismatchMat.at(2,0) = -b3 * FLT_SCALE; - mismatchMat.at(3,0) = -b4 * FLT_SCALE; - - D = - A12*A12*sumI*sumI + dI*sumW*A12*A12 + 2*A12*sumI*sumIx*w2 + 2*A12*sumI*sumIy*w1 - - 2*dI*A12*sumIx*sumIy - 2*sumW*A12*w1*w2 + A11*A22*sumI*sumI - 2*A22*sumI*sumIx*w1 - - 2*A11*sumI*sumIy*w2 - sumIx*sumIx*w2*w2 + A22*dI*sumIx*sumIx + 2*sumIx*sumIy*w1*w2 - - sumIy*sumIy*w1*w1 + A11*dI*sumIy*sumIy + A22*sumW*w1*w1 + A11*sumW*w2*w2 - A11*A22*dI*sumW; - - - float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + + mismatchMat.at(0,0) = b1 * FLT_SCALE; + mismatchMat.at(1,0) = b2 * FLT_SCALE; + mismatchMat.at(2,0) = -b3 * FLT_SCALE; + mismatchMat.at(3,0) = -b4 * FLT_SCALE; + + D = - A12*A12*sumI*sumI + dI*sumW*A12*A12 + 2*A12*sumI*sumIx*w2 + 2*A12*sumI*sumIy*w1 + - 2*dI*A12*sumIx*sumIy 
- 2*sumW*A12*w1*w2 + A11*A22*sumI*sumI - 2*A22*sumI*sumIx*w1 + - 2*A11*sumI*sumIy*w2 - sumIx*sumIx*w2*w2 + A22*dI*sumIx*sumIx + 2*sumIx*sumIy*w1*w2 + - sumIy*sumIy*w1*w1 + A11*dI*sumIy*sumIy + A22*sumW*w1*w1 + A11*sumW*w2*w2 - A11*A22*dI*sumW; + + + float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + 4.f*A12*A12))/(2*winArea); - if( minEig < minEigThreshold ) - { - if( level == 0 && status || std::abs(D) < FLT_EPSILON) - status[ptidx] = 0; - if( level > 0) - { - nextPts[ptidx] = backUpNextPt; - gainVecs[ptidx] = backUpGain; - } - break; - } - - - - D = (1.f / D); - invTensorMat.at(0,0) = (A22*sumI*sumI - 2*sumI*sumIy*w2 + dI*sumIy*sumIy + sumW*w2*w2 - A22*dI*sumW)* D; invTensorMat.at(0,1) = (A12*dI*sumW - A12*sumI * sumI - dI*sumIx*sumIy + sumI*sumIx*w2 + sumI*sumIy*w1 - sumW*w1*w2)* D; - invTensorMat.at(0,2) = (A12*sumI*sumIy - sumIy*sumIy*w1 - A22*sumI*sumIx - A12*sumW*w2 + A22*sumW*w1 + sumIx*sumIy*w2)* D; - invTensorMat.at(0,3) = (A22*dI*sumIx - A12*dI*sumIy - sumIx*w2*w2 + A12*sumI*w2 - A22*sumI*w1 + sumIy*w1*w2) * D; - invTensorMat.at(1,0) = invTensorMat.at(0,1); - invTensorMat.at(1,1) = (A11*sumI * sumI - 2*sumI*sumIx*w1 + dI*sumIx * sumIx + sumW*w1*w1 - A11*dI*sumW) * D; - invTensorMat.at(1,2) = (A12*sumI*sumIx - A11*sumI*sumIy - sumIx * sumIx*w2 + A11*sumW*w2 - A12*sumW*w1 + sumIx*sumIy*w1) * D; - invTensorMat.at(1,3) = (A11*dI*sumIy - sumIy*w1*w1 - A12*dI*sumIx - A11*sumI*w2 + A12*sumI*w1 + sumIx*w1*w2)* D; - invTensorMat.at(2,0) = invTensorMat.at(0,2); - invTensorMat.at(2,1) = invTensorMat.at(1,2); - invTensorMat.at(2,2) = (sumW*A12*A12 - 2*A12*sumIx*sumIy + A22*sumIx*sumIx + A11*sumIy*sumIy - A11*A22*sumW)* D; - invTensorMat.at(2,3) = (A11*A22*sumI - A12*A12*sumI - A11*sumIy*w2 + A12*sumIx*w2 + A12*sumIy*w1 - A22*sumIx*w1)* D; - invTensorMat.at(3,0) = invTensorMat.at(0,3); - invTensorMat.at(3,1) = invTensorMat.at(1,3); - invTensorMat.at(3,2) = invTensorMat.at(2,3); - invTensorMat.at(3,3) = (dI*A12*A12 - 2*A12*w1*w2 + A22*w1*w1 + A11*w2*w2 - 
A11*A22*dI)* D; - - resultMat = invTensorMat * mismatchMat; + if( minEig < minEigThreshold ) + { + if( (level == 0 && status) || std::abs(D) < FLT_EPSILON) + status[ptidx] = 0; + if( level > 0) + { + nextPts[ptidx] = backUpNextPt; + gainVecs[ptidx] = backUpGain; + } + break; + } + + + + D = (1.f / D); + invTensorMat.at(0,0) = (A22*sumI*sumI - 2*sumI*sumIy*w2 + dI*sumIy*sumIy + sumW*w2*w2 - A22*dI*sumW)* D; invTensorMat.at(0,1) = (A12*dI*sumW - A12*sumI * sumI - dI*sumIx*sumIy + sumI*sumIx*w2 + sumI*sumIy*w1 - sumW*w1*w2)* D; + invTensorMat.at(0,2) = (A12*sumI*sumIy - sumIy*sumIy*w1 - A22*sumI*sumIx - A12*sumW*w2 + A22*sumW*w1 + sumIx*sumIy*w2)* D; + invTensorMat.at(0,3) = (A22*dI*sumIx - A12*dI*sumIy - sumIx*w2*w2 + A12*sumI*w2 - A22*sumI*w1 + sumIy*w1*w2) * D; + invTensorMat.at(1,0) = invTensorMat.at(0,1); + invTensorMat.at(1,1) = (A11*sumI * sumI - 2*sumI*sumIx*w1 + dI*sumIx * sumIx + sumW*w1*w1 - A11*dI*sumW) * D; + invTensorMat.at(1,2) = (A12*sumI*sumIx - A11*sumI*sumIy - sumIx * sumIx*w2 + A11*sumW*w2 - A12*sumW*w1 + sumIx*sumIy*w1) * D; + invTensorMat.at(1,3) = (A11*dI*sumIy - sumIy*w1*w1 - A12*dI*sumIx - A11*sumI*w2 + A12*sumI*w1 + sumIx*w1*w2)* D; + invTensorMat.at(2,0) = invTensorMat.at(0,2); + invTensorMat.at(2,1) = invTensorMat.at(1,2); + invTensorMat.at(2,2) = (sumW*A12*A12 - 2*A12*sumIx*sumIy + A22*sumIx*sumIx + A11*sumIy*sumIy - A11*A22*sumW)* D; + invTensorMat.at(2,3) = (A11*A22*sumI - A12*A12*sumI - A11*sumIy*w2 + A12*sumIx*w2 + A12*sumIy*w1 - A22*sumIx*w1)* D; + invTensorMat.at(3,0) = invTensorMat.at(0,3); + invTensorMat.at(3,1) = invTensorMat.at(1,3); + invTensorMat.at(3,2) = invTensorMat.at(2,3); + invTensorMat.at(3,3) = (dI*A12*A12 - 2*A12*w1*w2 + A22*w1*w1 + A11*w2*w2 - A11*A22*dI)* D; + + resultMat = invTensorMat * mismatchMat; // 0.08 -12.10 - Point2f delta(-resultMat.at(0), -resultMat.at(1)); - Point2f deltaGain(-resultMat.at(2), -resultMat.at(3)); - - - if( j == 0) - prevGain = deltaGain; - nextPt += delta; + Point2f delta(-resultMat.at(0), 
-resultMat.at(1)); + Point2f deltaGain(-resultMat.at(2), -resultMat.at(3)); + + + if( j == 0) + prevGain = deltaGain; + nextPt += delta; nextPts[ptidx] = nextPt + halfWin; - gainVecs[ptidx]= gainVec; + gainVecs[ptidx]= gainVec; if( delta.ddot(delta) <= criteria.epsilon) - break; - if( ( std::abs(delta.x - prevDelta.x) < 0.01 - && std::abs(delta.y - prevDelta.y) < 0.01) - || ((delta.ddot(delta) <= 0.001) - && std::abs(prevGain.x - deltaGain.x) < 0.01) - ) - { - nextPts[ptidx] -= delta*0.5f; - gainVecs[ptidx] -= deltaGain* 0.5f; - break; - } - - if(j > 0 && std::abs(delta.x - prevDelta.x) < 0.01 && - std::abs(delta.y - prevDelta.y) < 0.01) - { - nextPts[ptidx] -= delta*0.5f; - break; - } - - prevDelta = delta; - prevGain = deltaGain; - + break; + if( ( std::abs(delta.x - prevDelta.x) < 0.01 + && std::abs(delta.y - prevDelta.y) < 0.01) + || ((delta.ddot(delta) <= 0.001) + && std::abs(prevGain.x - deltaGain.x) < 0.01) + ) + { + nextPts[ptidx] -= delta*0.5f; + gainVecs[ptidx] -= deltaGain* 0.5f; + break; + } + + if(j > 0 && std::abs(delta.x - prevDelta.x) < 0.01 && + std::abs(delta.y - prevDelta.y) < 0.01) + { + nextPts[ptidx] -= delta*0.5f; + break; + } + + prevDelta = delta; + prevGain = deltaGain; + } - - + + } } - const Mat* prevImg; - const Mat* nextImg; - const Mat* prevDeriv; - const Mat* rgbPrevImg; - const Mat* rgbNextImg; - const Point2f* prevPts; - Point2f* nextPts; - uchar* status; - cv::Point2f* gainVecs; // gain vector x -> multiplier y -> offset - float* err; - int maxWinSize; - int minWinSize; - TermCriteria criteria; - int level; - int maxLevel; - int windowType; - float minEigThreshold; - bool useInitialFlow; - int crossSegmentationThreshold; + const Mat* prevImg; + const Mat* nextImg; + const Mat* prevDeriv; + const Mat* rgbPrevImg; + const Mat* rgbNextImg; + const Point2f* prevPts; + Point2f* nextPts; + uchar* status; + cv::Point2f* gainVecs; // gain vector x -> multiplier y -> offset + float* err; + int maxWinSize; + int minWinSize; + TermCriteria 
criteria; + int level; + int maxLevel; + int windowType; + float minEigThreshold; + bool useInitialFlow; + int crossSegmentationThreshold; }; } namespace ica{ class TrackerInvoker : public cv::ParallelLoopBody { public: - TrackerInvoker(const Mat& _prevImg, - const Mat& _prevDeriv, - const Mat& _nextImg, - const Mat& _rgbPrevImg, - const Mat& _rgbNextImg, - const Point2f* _prevPts, - Point2f* _nextPts, - uchar* _status, - float* _err, - int _level, - int _maxLevel, - int _winSize[2], - int _maxIteration, - bool _useInitialFlow, - int _supportRegionType, - int _crossSegmentationThreshold, - float _minEigenValue) - { - prevImg = &_prevImg; - prevDeriv = &_prevDeriv; - nextImg = &_nextImg; - rgbPrevImg = &_rgbPrevImg; - rgbNextImg = &_rgbNextImg; - prevPts = _prevPts; - nextPts = _nextPts; - status = _status; - err = _err; - minWinSize = _winSize[0]; - maxWinSize = _winSize[1]; - criteria.maxCount = _maxIteration; - criteria.epsilon = 0.01; - level = _level; - maxLevel = _maxLevel; - windowType = _supportRegionType; - minEigThreshold = _minEigenValue; - useInitialFlow = _useInitialFlow; - crossSegmentationThreshold = _crossSegmentationThreshold; - } + TrackerInvoker(const Mat& _prevImg, + const Mat& _prevDeriv, + const Mat& _nextImg, + const Mat& _rgbPrevImg, + const Mat& _rgbNextImg, + const Point2f* _prevPts, + Point2f* _nextPts, + uchar* _status, + float* _err, + int _level, + int _maxLevel, + int _winSize[2], + int _maxIteration, + bool _useInitialFlow, + int _supportRegionType, + int _crossSegmentationThreshold, + float _minEigenValue) + { + prevImg = &_prevImg; + prevDeriv = &_prevDeriv; + nextImg = &_nextImg; + rgbPrevImg = &_rgbPrevImg; + rgbNextImg = &_rgbNextImg; + prevPts = _prevPts; + nextPts = _nextPts; + status = _status; + err = _err; + minWinSize = _winSize[0]; + maxWinSize = _winSize[1]; + criteria.maxCount = _maxIteration; + criteria.epsilon = 0.01; + level = _level; + maxLevel = _maxLevel; + windowType = _supportRegionType; + minEigThreshold = 
_minEigenValue; + useInitialFlow = _useInitialFlow; + crossSegmentationThreshold = _crossSegmentationThreshold; + } - TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; + TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; void operator()(const cv::Range& range) const { - cv::Size winSize; - cv::Point2f halfWin; + cv::Size winSize; + cv::Point2f halfWin; - const Mat& I = *prevImg; - const Mat& J = *nextImg; + const Mat& I = *prevImg; + const Mat& J = *nextImg; const Mat& derivI = *prevDeriv; - const Mat& BI = *rgbPrevImg; + const Mat& BI = *rgbPrevImg; - winSize = cv::Size(maxWinSize,maxWinSize); - int winMaskwidth = static_cast(ceil(winSize.width / 8.f)) * 16; - cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); - winMaskMatBuf.setTo(1); - - + winSize = cv::Size(maxWinSize,maxWinSize); + int winMaskwidth = static_cast(ceil(winSize.width / 8.f)) * 16; + cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); + winMaskMatBuf.setTo(1); + + - const float FLT_SCALE = (1.f/(1 << 20)); // 20 + const float FLT_SCALE = (1.f/(1 << 20)); // 20 int j, cn = I.channels(), cn2 = cn*2; - int winbufwidth = static_cast(ceil(winSize.width / 8.f)) * 8; - cv::Size winBufSize(winbufwidth,winbufwidth); + int winbufwidth = static_cast(ceil(winSize.width / 8.f)) * 8; + cv::Size winBufSize(winbufwidth,winbufwidth); - std::vector _buf(winBufSize.area()*(cn + cn2)); - Mat IWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn), &_buf[0]); - Mat derivIWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn2), &_buf[winBufSize.area()*cn]); + std::vector _buf(winBufSize.area()*(cn + cn2)); + Mat IWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn), &_buf[0]); + Mat derivIWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn2), &_buf[winBufSize.area()*cn]); for( int ptidx = range.start; ptidx < range.end; ptidx++ ) { - Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level)); + Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level)); Point2f nextPt; if( level == maxLevel ) { @@ -787,25 
+749,25 @@ class TrackerInvoker : public cv::ParallelLoopBody nextPt = nextPts[ptidx]*2.f; nextPts[ptidx] = nextPt; - Point2i iprevPt, inextPt; - iprevPt.x = cvFloor(prevPt.x); - iprevPt.y = cvFloor(prevPt.y); - int winArea = maxWinSize * maxWinSize; - cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0,0, maxWinSize,maxWinSize)); + Point2i iprevPt, inextPt; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + int winArea = maxWinSize * maxWinSize; + cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0,0, maxWinSize,maxWinSize)); - if( calcWinMaskMat(BI, windowType, iprevPt, - winMaskMat,winSize,halfWin,winArea, - minWinSize,maxWinSize, crossSegmentationThreshold) == false) - continue; + if( calcWinMaskMat(BI, windowType, iprevPt, + winMaskMat,winSize,halfWin,winArea, + minWinSize,maxWinSize, crossSegmentationThreshold) == false) + continue; - - prevPt -= halfWin; - iprevPt.x = cvFloor(prevPt.x); + + prevPt -= halfWin; + iprevPt.x = cvFloor(prevPt.x); iprevPt.y = cvFloor(prevPt.y); - if( iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols || - iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows ) + if( iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols || + iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows ) { if( level == 0 ) { @@ -828,26 +790,26 @@ class TrackerInvoker : public cv::ParallelLoopBody int dstep = (int)(derivI.step/derivI.elemSize1()); int step = (int)(I.step/I.elemSize1()); - int mStep = (int)(winMaskMat.step/winMaskMat.elemSize1()); + int mStep = (int)(winMaskMat.step/winMaskMat.elemSize1()); CV_Assert( step == (int)(J.step/J.elemSize1()) ); float A11 = 0, A12 = 0, A22 = 0; -#ifdef CV_SSE2 +#ifdef CV_SSE4_1 __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); __m128i z = _mm_setzero_si128(); __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1-1)); __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1-5-1)); __m128 qA11 = _mm_setzero_ps(), qA12 = _mm_setzero_ps(), qA22 = _mm_setzero_ps(); - 
__m128i mmMask4_epi32; - get4BitMask(winSize.width, mmMask4_epi32); + __m128i mmMask4_epi32; + get4BitMask(winSize.width, mmMask4_epi32); #endif // extract the patch from the first image, compute covariation matrix of derivatives int x, y; for( y = 0; y < winSize.height; y++ ) { - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; @@ -856,13 +818,13 @@ class TrackerInvoker : public cv::ParallelLoopBody x = 0; -#ifdef CV_SSE2 +#ifdef CV_SSE4_1 for( ; x <= winSize.width*cn; x += 4, dsrc += 4*2, dIptr += 4*2 ) { - __m128i wMask = _mm_set_epi32(MaskSet * maskPtr[x+3], - MaskSet * maskPtr[x+2], - MaskSet * maskPtr[x+1], - MaskSet * maskPtr[x]); + __m128i wMask = _mm_set_epi32(MaskSet * maskPtr[x+3], + MaskSet * maskPtr[x+2], + MaskSet * maskPtr[x+1], + MaskSet * maskPtr[x]); __m128i v00, v01, v10, v11, t0, t1; v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); @@ -872,13 +834,13 @@ class TrackerInvoker : public cv::ParallelLoopBody t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); - - + + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0,t0)); - + v00 = _mm_loadu_si128((const __m128i*)(dsrc)); v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); @@ -893,11 +855,11 @@ class TrackerInvoker : public cv::ParallelLoopBody t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... 
- if( x + 4 > winSize.width) - { - v00 = _mm_and_si128(v00, mmMask4_epi32); - } - v00 = _mm_and_si128(v00, wMask); + if( x + 4 > winSize.width) + { + v00 = _mm_and_si128(v00, mmMask4_epi32); + } + v00 = _mm_and_si128(v00, wMask); _mm_storeu_si128((__m128i*)dIptr, v00); t0 = _mm_srai_epi32(v00, 16); // Iy0 Iy1 Iy2 Iy3 @@ -906,8 +868,8 @@ class TrackerInvoker : public cv::ParallelLoopBody __m128 fy = _mm_cvtepi32_ps(t0); __m128 fx = _mm_cvtepi32_ps(t1); - - qA22 = _mm_add_ps(qA22, _mm_mul_ps(fy, fy)); + + qA22 = _mm_add_ps(qA22, _mm_mul_ps(fy, fy)); qA12 = _mm_add_ps(qA12, _mm_mul_ps(fx, fy)); qA11 = _mm_add_ps(qA11, _mm_mul_ps(fx, fx)); } @@ -915,30 +877,30 @@ class TrackerInvoker : public cv::ParallelLoopBody for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2 ) { - if( maskPtr[x] == 0) - { - dIptr[0] = 0; - dIptr[1] = 0; - continue; - } - int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + + if( maskPtr[x] == 0) + { + dIptr[0] = 0; + dIptr[1] = 0; + continue; + } + int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + src[x+step]*iw10 + src[x+step+cn]*iw11, W_BITS1-5); int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1); int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 + dsrc[dstep+cn2+1]*iw11, W_BITS1); - Iptr[x] = (short)ival; - dIptr[0] = (short)ixval; - dIptr[1] = (short)iyval; - A11 += (float)(ixval*ixval); - A12 += (float)(ixval*iyval); - A22 += (float)(iyval*iyval); + Iptr[x] = (short)ival; + dIptr[0] = (short)ixval; + dIptr[1] = (short)iyval; + A11 += (float)(ixval*ixval); + A12 += (float)(ixval*iyval); + A22 += (float)(iyval*iyval); } #endif } -#ifdef CV_SSE2 +#ifdef CV_SSE4_1 float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4]; _mm_store_ps(A11buf, qA11); _mm_store_ps(A12buf, qA12); @@ -948,18 +910,18 @@ class TrackerInvoker : public cv::ParallelLoopBody A22 += A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; #endif - A11 *= FLT_SCALE; + A11 *= FLT_SCALE; A12 *= 
FLT_SCALE; A22 *= FLT_SCALE; float D = A11*A22 - A12*A12; float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + - 4.f*A12*A12))/(2 * winArea); + 4.f*A12*A12))/(2 * winArea); if( err ) err[ptidx] = (float)minEig; - if( minEig < minEigThreshold || D < FLT_EPSILON ) + if( minEig < minEigThreshold || D < FLT_EPSILON ) { if( level == 0 && status ) status[ptidx] = 0; @@ -969,23 +931,23 @@ class TrackerInvoker : public cv::ParallelLoopBody D = 1.f/D; nextPt -= halfWin; - Point2f prevDelta(0,0); //relates to h(t-1) - __m128i mmMask0, mmMask1, mmMask; - getWBitMask(winSize.width, mmMask0, mmMask1, mmMask); + Point2f prevDelta(0,0); //relates to h(t-1) + __m128i mmMask0, mmMask1, mmMask; + getWBitMask(winSize.width, mmMask0, mmMask1, mmMask); for( j = 0; j < criteria.maxCount; j++ ) { - status[ptidx] = static_cast(j); + status[ptidx] = static_cast(j); inextPt.x = cvFloor(nextPt.x); inextPt.y = cvFloor(nextPt.y); - - if( inextPt.x + halfWin.x < 0 || inextPt.x + halfWin.x>= derivI.cols || - inextPt.y + halfWin.y < 0 || inextPt.y + halfWin.y >= derivI.rows ) - { - if( level == 0 && status ) - status[ptidx] = 3; - break; - } + + if( inextPt.x + halfWin.x < 0 || inextPt.x + halfWin.x>= derivI.cols || + inextPt.y + halfWin.y < 0 || inextPt.y + halfWin.y >= derivI.rows ) + { + if( level == 0 && status ) + status[ptidx] = 3; + break; + } a = nextPt.x - inextPt.x; b = nextPt.y - inextPt.y; @@ -994,30 +956,29 @@ class TrackerInvoker : public cv::ParallelLoopBody iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; float b1 = 0, b2 = 0; -#ifdef CV_SSE2 +#ifdef CV_SSE4_1 qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); __m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps(); - __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps(); - + #endif - for( y = 0; y < winSize.height; y++ ) + for( y = 0; y < winSize.height; y++ ) { const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step 
+ (inextPt.x )*cn; const short* Iptr = (const short*)(IWinBuf.data + ( y)*IWinBuf.step); const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); x = 0; -#ifdef CV_SSE2 - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; +#ifdef CV_SSE4_1 + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; for( ; x <= winSize.width*cn; x += 8, dIptr += 8*2 ) { - if( maskPtr[x ] == 0 && maskPtr[x+1] == 0 && maskPtr[x+2] == 0 && maskPtr[x+3] == 0 - && maskPtr[x+4] == 0 && maskPtr[x+5] == 0 && maskPtr[x+6] == 0 && maskPtr[x+7] == 0) - continue; + if( maskPtr[x ] == 0 && maskPtr[x+1] == 0 && maskPtr[x+2] == 0 && maskPtr[x+3] == 0 + && maskPtr[x+4] == 0 && maskPtr[x+5] == 0 && maskPtr[x+6] == 0 && maskPtr[x+7] == 0) + continue; __m128i diff0 = _mm_loadu_si128((const __m128i*)(Iptr + x)), diff1; - __m128i v00 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x)), z); + __m128i v00 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x)), z); __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); @@ -1029,44 +990,44 @@ class TrackerInvoker : public cv::ParallelLoopBody t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1-5); - __m128i mmDiff_epi16 = _mm_subs_epi16(_mm_packs_epi32(t0, t1), diff0); + __m128i mmDiff_epi16 = _mm_subs_epi16(_mm_packs_epi32(t0, t1), diff0); __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... 
__m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); - - if( x > winSize.width*cn - 8) - { - Ixy_0 = _mm_and_si128(Ixy_0, mmMask0); - Ixy_1 = _mm_and_si128(Ixy_1, mmMask1); - } - - diff1 = _mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13... + if( x > winSize.width*cn - 8) + { + Ixy_0 = _mm_and_si128(Ixy_0, mmMask0); + Ixy_1 = _mm_and_si128(Ixy_1, mmMask1); + } + + + diff1 = _mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13... diff0 = _mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9... - + - v10 = _mm_mullo_epi16(Ixy_0, diff0); + v10 = _mm_mullo_epi16(Ixy_0, diff0); v11 = _mm_mulhi_epi16(Ixy_0, diff0); v00 = _mm_unpacklo_epi16(v10, v11); v10 = _mm_unpackhi_epi16(v10, v11); - qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); - qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); - - // It * Ix It * Iy [4 ... 7] - // for set 1 hi sigma 1 - - v10 = _mm_mullo_epi16(Ixy_1, diff1); - v11 = _mm_mulhi_epi16(Ixy_1, diff1); + qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); + qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); + + // It * Ix It * Iy [4 ... 
7] + // for set 1 hi sigma 1 + + v10 = _mm_mullo_epi16(Ixy_1, diff1); + v11 = _mm_mulhi_epi16(Ixy_1, diff1); v00 = _mm_unpacklo_epi16(v10, v11); v10 = _mm_unpackhi_epi16(v10, v11); - qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); - qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); - } + qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); + qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); + } #else for( ; x < winSize.width*cn; x++, dIptr += 2 ) { - if( dIptr[0] == 0 && dIptr[1] == 0) - continue; + if( dIptr[0] == 0 && dIptr[1] == 0) + continue; int diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, W_BITS1-5) - Iptr[x]; @@ -1076,15 +1037,15 @@ class TrackerInvoker : public cv::ParallelLoopBody } #endif } - -#ifdef CV_SSE2 + +#ifdef CV_SSE4_1 float CV_DECL_ALIGNED(16) bbuf[4]; _mm_store_ps(bbuf, _mm_add_ps(qb0, qb1)); b1 += bbuf[0] + bbuf[2]; b2 += bbuf[1] + bbuf[3]; #endif - b1 *= FLT_SCALE; + b1 *= FLT_SCALE; b2 *= FLT_SCALE; Point2f delta( (float)((A12*b2 - A22*b1) * D), @@ -1096,44 +1057,44 @@ class TrackerInvoker : public cv::ParallelLoopBody nextPts[ptidx] = nextPt + halfWin; if( delta.ddot(delta) <= criteria.epsilon) - break; - if(j > 0 && std::abs(delta.x - prevDelta.x) < 0.01 && - std::abs(delta.y - prevDelta.y) < 0.01) - { - nextPts[ptidx] -= delta*0.5f; - break; - } - - prevDelta = delta; - + break; + if(j > 0 && std::abs(delta.x - prevDelta.x) < 0.01 && + std::abs(delta.y - prevDelta.y) < 0.01) + { + nextPts[ptidx] -= delta*0.5f; + break; + } + + prevDelta = delta; + } - - + + - - + + } } - const Mat* prevImg; - const Mat* nextImg; - const Mat* prevDeriv; - const Mat* rgbPrevImg; - const Mat* rgbNextImg; - const Point2f* prevPts; - Point2f* nextPts; - uchar* status; - float* err; - int maxWinSize; - int minWinSize; - TermCriteria criteria; - int level; - int maxLevel; - int windowType; - float minEigThreshold; - bool useInitialFlow; - int crossSegmentationThreshold; + const Mat* prevImg; + const Mat* nextImg; + const Mat* 
prevDeriv; + const Mat* rgbPrevImg; + const Mat* rgbNextImg; + const Point2f* prevPts; + Point2f* nextPts; + uchar* status; + float* err; + int maxWinSize; + int minWinSize; + TermCriteria criteria; + int level; + int maxLevel; + int windowType; + float minEigThreshold; + bool useInitialFlow; + int crossSegmentationThreshold; }; } } diff --git a/modules/optflow/src/rlof/rlof_invoker.hpp b/modules/optflow/src/rlof/rlof_invoker.hpp index 6dbdfef441e..fb1ad16f353 100644 --- a/modules/optflow/src/rlof/rlof_invoker.hpp +++ b/modules/optflow/src/rlof/rlof_invoker.hpp @@ -1,44 +1,6 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// - // - // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. - // - // By downloading, copying, installing or using the software you agree to this license. - // If you do not agree to this license, do not download, install, - // copy or use the software. - // - // - // License Agreement - // For Open Source Computer Vision Library - // - // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. - // Copyright (C) 2009, Willow Garage Inc., all rights reserved. - // Third party copyrights are property of their respective owners. - // - // Redistribution and use in source and binary forms, with or without modification, - // are permitted provided that the following conditions are met: - // - // * Redistribution's of source code must retain the above copyright notice, - // this list of conditions and the following disclaimer. - // - // * Redistribution's in binary form must reproduce the above copyright notice, - // this list of conditions and the following disclaimer in the documentation - // and/or other materials provided with the distribution. - // - // * The name of the copyright holders may not be used to endorse or promote products - // derived from this software without specific prior written permission. 
- // - // This software is provided by the copyright holders and contributors "as is" and - // any express or implied warranties, including, but not limited to, the implied - // warranties of merchantability and fitness for a particular purpose are disclaimed. - // In no event shall the Intel Corporation or contributors be liable for any direct, - // indirect, incidental, special, exemplary, or consequential damages - // (including, but not limited to, procurement of substitute goods or services; - // loss of use, data, or profits; or business interruption) however caused - // and on any theory of liability, whether in contract, strict liability, - // or tort (including negligence or otherwise) arising in any way out of - // the use of this software, even if advised of the possibility of such damage. - // - //M*/ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. #ifndef _RLOF_INVOKER_HPP_ #define _RLOF_INVOKER_HPP_ #include "rlof_invokerbase.hpp" @@ -50,1462 +12,1447 @@ namespace rlof { namespace ica { - int calc_diff(int diff,float fParam0,float fParam1,float param_2) - { - int abss = (diff < 0) ? 
-diff : diff; - if (abss > fParam1) - { - diff = 0; - } - else if (abss > fParam0 && diff >= 0) - { - //diff = fParam1 * (diff - fParam1); - diff = static_cast(param_2 * (diff - fParam1)); - } - else if (abss > fParam0 && diff < 0) - { - //diff = fParam1 * (diff + fParam1); - diff = static_cast(param_2 * (diff + fParam1)); - - } - return diff; - } - class TrackerInvoker : public cv::ParallelLoopBody - { - public: - TrackerInvoker( - const Mat& _prevImg, - const Mat& _prevDeriv, - const Mat& _nextImg, - const Mat& _rgbPrevImg, - const Mat& _rgbNextImg, - const Point2f* _prevPts, - Point2f* _nextPts, - uchar* _status, - float* _err, - int _level, - int _maxLevel, - int _winSize[2], - int _maxIteration, - bool _useInitialFlow, - int _supportRegionType, - std::vector _normSigmas, - float _minEigenValue, - int _crossSegmentationThreshold) - { - prevImg = &_prevImg; - prevDeriv = &_prevDeriv; - nextImg = &_nextImg; - rgbPrevImg = &_rgbPrevImg; - rgbNextImg = &_rgbNextImg; - prevPts = _prevPts; - nextPts = _nextPts; - status = _status; - err = _err; - minWinSize = _winSize[0]; - maxWinSize = _winSize[1]; - criteria.maxCount = _maxIteration; - criteria.epsilon = 0.01; - level = _level; - maxLevel = _maxLevel; - windowType = _supportRegionType; - minEigThreshold = _minEigenValue; - paramReset = _normSigmas; - useInitialFlow = _useInitialFlow; - crossSegmentationThreshold = _crossSegmentationThreshold; - } - TrackerInvoker & operator=(const TrackerInvoker &) { return *this; }; - void operator()(const cv::Range& range) const - { - cv::Size winSize; - cv::Point2f halfWin; - - const Mat& I = *prevImg; - const Mat& J = *nextImg; - const Mat& derivI = *prevDeriv; - const Mat& BI = *rgbPrevImg; - - winSize = cv::Size(maxWinSize, maxWinSize); - int winMaskwidth = static_cast(ceil(winSize.width / 8.f)) * 16; - cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); - winMaskMatBuf.setTo(1); - - const float FLT_SCALE = (1.f / (1 << 20)); // 20 - - std::vector param = 
paramReset; - int cn = I.channels(), cn2 = cn * 2; - int winbufwidth = static_cast(ceil(winSize.width / 8.f)) * 8; - cv::Size winBufSize(winbufwidth, winbufwidth); - - std::vector _buf(winBufSize.area()*(cn + cn2)); - Mat IWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn), &_buf[0]); - Mat derivIWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn2), &_buf[winBufSize.area()*cn]); - - for (int ptidx = range.start; ptidx < range.end; ptidx++) - { - Point2f prevPt = prevPts[ptidx] * (float)(1. / (1 << level)); - Point2f nextPt; - if (level == maxLevel) - { - if ( useInitialFlow) - nextPt = nextPts[ptidx] * (float)(1. / (1 << level)); - else - nextPt = prevPt; - } - else - nextPt = nextPts[ptidx] * 2.f; - nextPts[ptidx] = nextPt; - - Point2i iprevPt, inextPt; - iprevPt.x = cvFloor(prevPt.x); - iprevPt.y = cvFloor(prevPt.y); - int winArea = maxWinSize * maxWinSize; - cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0, 0, maxWinSize, maxWinSize)); - - if (calcWinMaskMat(BI, windowType, iprevPt, - winMaskMat, winSize, halfWin, winArea, - minWinSize, maxWinSize, crossSegmentationThreshold) == false) - continue; - - - prevPt -= halfWin; - iprevPt.x = cvFloor(prevPt.x); - iprevPt.y = cvFloor(prevPt.y); - - if (iprevPt.x + halfWin.x < 0 || iprevPt.x + halfWin.x >= derivI.cols || - iprevPt.y + halfWin.y < 0 || iprevPt.y + halfWin.y >= derivI.rows) - { - if (level == 0) - { - if (status) - status[ptidx] = 3; - if (err) - err[ptidx] = 0; - } - continue; - } - - float a = prevPt.x - iprevPt.x; - float b = prevPt.y - iprevPt.y; - const int W_BITS = 14, W_BITS1 = 14; - - int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); - int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); - int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); - int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - - int dstep = (int)(derivI.step / derivI.elemSize1()); - int step = (int)(I.step / I.elemSize1()); - int mStep = (int)(winMaskMat.step / winMaskMat.elemSize1()); - CV_Assert(step == (int)(J.step / J.elemSize1())); - float A11 = 0, A12 = 
0, A22 = 0; - float b1 = 0, b2 = 0; - float D = 0; - float minEig; - -#ifdef CV_SSE2 - __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); - __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); - __m128i z = _mm_setzero_si128(); - __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1 - 1)); - __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1 - 5 - 1)); - __m128 qA11 = _mm_setzero_ps(), qA12 = _mm_setzero_ps(), qA22 = _mm_setzero_ps(); - __m128i mmMask4_epi32; - __m128i mmMaskSet_epi16 = _mm_set1_epi16(std::numeric_limits::max()); - get4BitMask(winSize.width, mmMask4_epi32); + inline int calc_diff(int diff,float fParam0,float fParam1,float param_2) + { + int abss = (diff < 0) ? -diff : diff; + if (abss > fParam1) + { + diff = 0; + } + else if (abss > fParam0 && diff >= 0) + { + //diff = fParam1 * (diff - fParam1); + diff = static_cast(param_2 * (diff - fParam1)); + } + else if (abss > fParam0 && diff < 0) + { + //diff = fParam1 * (diff + fParam1); + diff = static_cast(param_2 * (diff + fParam1)); + + } + return diff; + } + class TrackerInvoker : public cv::ParallelLoopBody + { + public: + TrackerInvoker( + const Mat& _prevImg, + const Mat& _prevDeriv, + const Mat& _nextImg, + const Mat& _rgbPrevImg, + const Mat& _rgbNextImg, + const Point2f* _prevPts, + Point2f* _nextPts, + uchar* _status, + float* _err, + int _level, + int _maxLevel, + int _winSize[2], + int _maxIteration, + bool _useInitialFlow, + int _supportRegionType, + std::vector _normSigmas, + float _minEigenValue, + int _crossSegmentationThreshold) + { + prevImg = &_prevImg; + prevDeriv = &_prevDeriv; + nextImg = &_nextImg; + rgbPrevImg = &_rgbPrevImg; + rgbNextImg = &_rgbNextImg; + prevPts = _prevPts; + nextPts = _nextPts; + status = _status; + err = _err; + minWinSize = _winSize[0]; + maxWinSize = _winSize[1]; + criteria.maxCount = _maxIteration; + criteria.epsilon = 0.01; + level = _level; + maxLevel = _maxLevel; + windowType = _supportRegionType; + minEigThreshold = _minEigenValue; + paramReset = _normSigmas; + 
useInitialFlow = _useInitialFlow; + crossSegmentationThreshold = _crossSegmentationThreshold; + } + TrackerInvoker & operator=(const TrackerInvoker &) { return *this; }; + void operator()(const cv::Range& range) const + { + cv::Size winSize; + cv::Point2f halfWin; + + const Mat& I = *prevImg; + const Mat& J = *nextImg; + const Mat& derivI = *prevDeriv; + const Mat& BI = *rgbPrevImg; + + winSize = cv::Size(maxWinSize, maxWinSize); + int winMaskwidth = static_cast(ceil(winSize.width / 8.f)) * 16; + cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); + winMaskMatBuf.setTo(1); + + const float FLT_SCALE = (1.f / (1 << 20)); // 20 + + std::vector param = paramReset; + int cn = I.channels(), cn2 = cn * 2; + int winbufwidth = static_cast(ceil(winSize.width / 8.f)) * 8; + cv::Size winBufSize(winbufwidth, winbufwidth); + + std::vector _buf(winBufSize.area()*(cn + cn2)); + Mat IWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn), &_buf[0]); + Mat derivIWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn2), &_buf[winBufSize.area()*cn]); + + for (int ptidx = range.start; ptidx < range.end; ptidx++) + { + Point2f prevPt = prevPts[ptidx] * (float)(1. / (1 << level)); + Point2f nextPt; + if (level == maxLevel) + { + if ( useInitialFlow) + nextPt = nextPts[ptidx] * (float)(1. 
/ (1 << level)); + else + nextPt = prevPt; + } + else + nextPt = nextPts[ptidx] * 2.f; + nextPts[ptidx] = nextPt; + + Point2i iprevPt, inextPt; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + int winArea = maxWinSize * maxWinSize; + cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0, 0, maxWinSize, maxWinSize)); + + if (calcWinMaskMat(BI, windowType, iprevPt, + winMaskMat, winSize, halfWin, winArea, + minWinSize, maxWinSize, crossSegmentationThreshold) == false) + continue; + + + prevPt -= halfWin; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + + if (iprevPt.x + halfWin.x < 0 || iprevPt.x + halfWin.x >= derivI.cols || + iprevPt.y + halfWin.y < 0 || iprevPt.y + halfWin.y >= derivI.rows) + { + if (level == 0) + { + if (status) + status[ptidx] = 3; + if (err) + err[ptidx] = 0; + } + continue; + } + + float a = prevPt.x - iprevPt.x; + float b = prevPt.y - iprevPt.y; + const int W_BITS = 14, W_BITS1 = 14; + + int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + + int dstep = (int)(derivI.step / derivI.elemSize1()); + int step = (int)(I.step / I.elemSize1()); + int mStep = (int)(winMaskMat.step / winMaskMat.elemSize1()); + CV_Assert(step == (int)(J.step / J.elemSize1())); + float A11 = 0, A12 = 0, A22 = 0; + float b1 = 0, b2 = 0; + float D = 0; + float minEig; + +#ifdef CV_SSE4_1 + __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128i z = _mm_setzero_si128(); + __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1 - 1)); + __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1 - 5 - 1)); + __m128i mmMask4_epi32; + __m128i mmMaskSet_epi16 = _mm_set1_epi16(std::numeric_limits::max()); + get4BitMask(winSize.width, mmMask4_epi32); #endif - // extract the patch from the first image, compute covariation matrix of derivatives - int x, y; - for (y = 0; 
y < winSize.height; y++) - { - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; - const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; - - short* Iptr = (short*)(IWinBuf.data + y * IWinBuf.step); - short* dIptr = (short*)(derivIWinBuf.data + y * derivIWinBuf.step); - - x = 0; - -#ifdef CV_SSE2 - - for (; x <= winSize.width*cn; x += 4, dsrc += 4 * 2, dIptr += 4 * 2) - { - __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16); - __m128i mask_0_3_epi16 = _mm_unpacklo_epi16(mask_0_7_epi16, mask_0_7_epi16); - - __m128i v00, v01, v10, v11, t0, t1; - v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); - v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); - v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); - v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); - - t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); - t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1 - 5); - if (x + 4 > winSize.width) - { - t0 = _mm_and_si128(t0, mmMask4_epi32); - } - t0 = _mm_and_si128(t0, mask_0_3_epi16); - _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0, t0)); - - v00 = _mm_loadu_si128((const __m128i*)(dsrc)); - v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); - v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep)); - v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); - - t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); - t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); - t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); - t1 
= _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); - v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... - if (x + 4 > winSize.width) - { - v00 = _mm_and_si128(v00, mmMask4_epi32); - } - v00 = _mm_and_si128(v00, mask_0_3_epi16); - _mm_storeu_si128((__m128i*)dIptr, v00); - } + // extract the patch from the first image, compute covariation matrix of derivatives + int x, y; + for (y = 0; y < winSize.height; y++) + { + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; + const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; + + short* Iptr = (short*)(IWinBuf.data + y * IWinBuf.step); + short* dIptr = (short*)(derivIWinBuf.data + y * derivIWinBuf.step); + + x = 0; + +#ifdef CV_SSE4_1 + + for (; x <= winSize.width*cn; x += 4, dsrc += 4 * 2, dIptr += 4 * 2) + { + __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16); + __m128i mask_0_3_epi16 = _mm_unpacklo_epi16(mask_0_7_epi16, mask_0_7_epi16); + + __m128i v00, v01, v10, v11, t0, t1; + v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); + v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); + v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); + v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1 - 5); + if (x + 4 > winSize.width) + { + t0 = _mm_and_si128(t0, mmMask4_epi32); + } + t0 = _mm_and_si128(t0, mask_0_3_epi16); + _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0, t0)); + + v00 = _mm_loadu_si128((const __m128i*)(dsrc)); + v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); + v10 = _mm_loadu_si128((const 
__m128i*)(dsrc + dstep)); + v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); + v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... + if (x + 4 > winSize.width) + { + v00 = _mm_and_si128(v00, mmMask4_epi32); + } + v00 = _mm_and_si128(v00, mask_0_3_epi16); + _mm_storeu_si128((__m128i*)dIptr, v00); + } #else - for (; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2) - { - if (maskPtr[x] == 0) - { - dIptr[0] = 0; - dIptr[1] = 0; - continue; - } - int ival = CV_DESCALE(src[x] * iw00 + src[x + cn] * iw01 + - src[x + step] * iw10 + src[x + step + cn] * iw11, W_BITS1 - 5); - int ixval = CV_DESCALE(dsrc[0] * iw00 + dsrc[cn2] * iw01 + - dsrc[dstep] * iw10 + dsrc[dstep + cn2] * iw11, W_BITS1); - int iyval = CV_DESCALE(dsrc[1] * iw00 + dsrc[cn2 + 1] * iw01 + dsrc[dstep + 1] * iw10 + - dsrc[dstep + cn2 + 1] * iw11, W_BITS1); - - Iptr[x] = (short)ival; - dIptr[0] = (short)ixval; - dIptr[1] = (short)iyval; - } + for (; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2) + { + if (maskPtr[x] == 0) + { + dIptr[0] = 0; + dIptr[1] = 0; + continue; + } + int ival = CV_DESCALE(src[x] * iw00 + src[x + cn] * iw01 + + src[x + step] * iw10 + src[x + step + cn] * iw11, W_BITS1 - 5); + int ixval = CV_DESCALE(dsrc[0] * iw00 + dsrc[cn2] * iw01 + + dsrc[dstep] * iw10 + dsrc[dstep + cn2] * iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1] * iw00 + dsrc[cn2 + 1] * iw01 + dsrc[dstep + 1] * iw10 + + dsrc[dstep + cn2 + 1] * iw11, W_BITS1); + + Iptr[x] = (short)ival; + dIptr[0] = (short)ixval; + dIptr[1] = (short)iyval; + } #endif - } - - cv::Mat residualMat = cv::Mat::zeros(winSize.height * (winSize.width + 8) * cn, 1, 
CV_16SC1); - - cv::Point2f backUpNextPt = nextPt; - nextPt -= halfWin; - int j; - __m128i mmMask0, mmMask1, mmMask; - getWBitMask(winSize.width, mmMask0, mmMask1, mmMask); - float MEstimatorScale = 1; - int buffIdx = 0; - cv::Point2f prevDelta(0, 0); - - for (j = 0; j < criteria.maxCount; j++) - { - inextPt.x = cvFloor(nextPt.x); - inextPt.y = cvFloor(nextPt.y); - - if (inextPt.x < -winSize.width || inextPt.x >= J.cols || - inextPt.y < -winSize.height || inextPt.y >= J.rows) - { - if (level == 0 && status) - status[ptidx] = 3; - if (level > 0) - nextPts[ptidx] = backUpNextPt; - break; - } - - - a = nextPt.x - inextPt.x; - b = nextPt.y - inextPt.y; - iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); - iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); - iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); - iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - b1 = 0; - b2 = 0; - - if (j == 0 ) - { - A11 = 0; - A12 = 0; - A22 = 0; - } - - if (j == 0 ) - { - buffIdx = 0; - for (y = 0; y < winSize.height; y++) - { - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x)*cn; - const short* Iptr = (const short*)(IWinBuf.data + (y)*IWinBuf.step); - const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - x = 0; - for (; x < winSize.width*cn; x++, dIptr += 2) - { - if (maskPtr[x] == 0) - continue; - int diff = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + Jptr[x + step] * iw10 + Jptr[x + step + cn] * iw11, W_BITS1 - 5) - Iptr[x]; - residualMat.at(buffIdx++) = static_cast(diff); - } - } - /*! Estimation for the residual */ - cv::Mat residualMatRoi(residualMat, cv::Rect(0, 0, 1, buffIdx)); - MEstimatorScale = (buffIdx == 0) ? 
1.f : estimateScale(residualMatRoi); - } - - float eta = 1.f / winArea; - float fParam0 = param[0] * 32.f; - float fParam1 = param[1] * 32.f; - fParam0 = param[0] * MEstimatorScale; - fParam1 = param[1] * MEstimatorScale; -#ifdef CV_SSE2 - qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); - qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); - __m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps(); - __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps(); - __m128i mmParam0 = _mm_set1_epi16(MIN(std::numeric_limits::max() - 1, static_cast(fParam0))); - __m128i mmParam1 = _mm_set1_epi16(MIN(std::numeric_limits::max() - 1, static_cast(fParam1))); - __m128 mmParam2 = _mm_set1_ps(param[2]); - float s2Val = param[2] > 0 ? param[2] : -param[2]; - int s2bitShift = param[2] == 0 ? 1 : static_cast(ceil(log(200.f / s2Val) / log(2.f))); // zwei stellen nach den komma - __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast(param[2] * (float)(1 << s2bitShift))); - __m128i mmOness_epi16 = _mm_set1_epi16(1 << s2bitShift); - __m128 mmParam2s = _mm_set1_ps(0.01f * param[2]); - __m128 mmParam2s2 = _mm_set1_ps(param[2] * param[2]); - __m128 mmOnes = _mm_set1_ps(1.f); - __m128 mmError0 = _mm_setzero_ps(); - __m128 mmError1 = _mm_setzero_ps(); - __m128i mmEta = _mm_setzero_si128(); - __m128i mmEtaNeg = _mm_set1_epi16(-1); - __m128i mmScale = _mm_set1_epi16(static_cast(MEstimatorScale)); + } + + cv::Mat residualMat = cv::Mat::zeros(winSize.height * (winSize.width + 8) * cn, 1, CV_16SC1); + + cv::Point2f backUpNextPt = nextPt; + nextPt -= halfWin; + int j; + __m128i mmMask0, mmMask1, mmMask; + getWBitMask(winSize.width, mmMask0, mmMask1, mmMask); + float MEstimatorScale = 1; + int buffIdx = 0; + cv::Point2f prevDelta(0, 0); + + for (j = 0; j < criteria.maxCount; j++) + { + inextPt.x = cvFloor(nextPt.x); + inextPt.y = cvFloor(nextPt.y); + + if (inextPt.x < -winSize.width || inextPt.x >= J.cols || + inextPt.y < -winSize.height || inextPt.y >= J.rows) + { + if (level == 0 && 
status) + status[ptidx] = 3; + if (level > 0) + nextPts[ptidx] = backUpNextPt; + break; + } + + + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + b1 = 0; + b2 = 0; + + if (j == 0 ) + { + A11 = 0; + A12 = 0; + A22 = 0; + } + + if (j == 0 ) + { + buffIdx = 0; + for (y = 0; y < winSize.height; y++) + { + const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x)*cn; + const short* Iptr = (const short*)(IWinBuf.data + (y)*IWinBuf.step); + const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + x = 0; + for (; x < winSize.width*cn; x++, dIptr += 2) + { + if (maskPtr[x] == 0) + continue; + int diff = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + Jptr[x + step] * iw10 + Jptr[x + step + cn] * iw11, W_BITS1 - 5) - Iptr[x]; + residualMat.at(buffIdx++) = static_cast(diff); + } + } + /*! Estimation for the residual */ + cv::Mat residualMatRoi(residualMat, cv::Rect(0, 0, 1, buffIdx)); + MEstimatorScale = (buffIdx == 0) ? 1.f : estimateScale(residualMatRoi); + } + + float eta = 1.f / winArea; + float fParam0 = param[0] * 32.f; + float fParam1 = param[1] * 32.f; + fParam0 = param[0] * MEstimatorScale; + fParam1 = param[1] * MEstimatorScale; +#ifdef CV_SSE4_1 + qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps(); + __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps(); + __m128i mmParam0 = _mm_set1_epi16(MIN(std::numeric_limits::max() - 1, static_cast(fParam0))); + __m128i mmParam1 = _mm_set1_epi16(MIN(std::numeric_limits::max() - 1, static_cast(fParam1))); + float s2Val = param[2] > 0 ? param[2] : -param[2]; + int s2bitShift = param[2] == 0 ? 
1 : static_cast(ceil(log(200.f / s2Val) / log(2.f))); // zwei stellen nach den komma + __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast(param[2] * (float)(1 << s2bitShift))); + __m128i mmOness_epi16 = _mm_set1_epi16(1 << s2bitShift); + __m128 mmParam2s = _mm_set1_ps(0.01f * param[2]); + __m128 mmParam2s2 = _mm_set1_ps(param[2] * param[2]); + __m128 mmOnes = _mm_set1_ps(1.f); + __m128i mmEta = _mm_setzero_si128(); + __m128i mmScale = _mm_set1_epi16(static_cast(MEstimatorScale)); #endif - - buffIdx = 0; - for (y = 0; y < winSize.height; y++) - { - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x)*cn; - const short* Iptr = (const short*)(IWinBuf.data + (y)*IWinBuf.step); - const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); - - x = 0; -#ifdef CV_SSE2 - for (; x <= winSize.width*cn; x += 8, dIptr += 8 * 2) - { - __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16); - __m128i diff0, diff1; - __m128i I_0_7_epi16 = _mm_loadu_si128((const __m128i*)(Iptr + x)); // von element 0 bis 7 - - __m128i v00 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x)), z); - __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); - __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); - __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); - - __m128i t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); - __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); - t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1 - 5); - t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1 - 5); - - __m128i mmDiff_epi16 = 
_mm_subs_epi16(_mm_packs_epi32(t0, t1), I_0_7_epi16); - - mmDiff_epi16 = _mm_and_si128(mmDiff_epi16, mask_0_7_epi16); - - - __m128i scalediffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, mmScale); - mmEta = _mm_add_epi16(mmEta, _mm_add_epi16(_mm_and_si128(scalediffIsPos_epi16, _mm_set1_epi16(2)), _mm_set1_epi16(-1))); - - - __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... - __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); - - __m128i abs_epi16 = _mm_abs_epi16(mmDiff_epi16); - __m128i bSet2_epi16, bSet1_epi16; - // |It| < sigma1 ? - bSet2_epi16 = _mm_cmplt_epi16(abs_epi16, mmParam1); - // It > 0 ? - __m128i diffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, _mm_setzero_si128()); - // sigma0 < |It| < sigma1 ? - bSet1_epi16 = _mm_and_si128(bSet2_epi16, _mm_cmpgt_epi16(abs_epi16, mmParam0)); - // val = |It| -/+ sigma1 - __m128i tmpParam1_epi16 = _mm_add_epi16(_mm_and_si128(diffIsPos_epi16, _mm_sub_epi16(mmDiff_epi16, mmParam1)), - _mm_andnot_si128(diffIsPos_epi16, _mm_add_epi16(mmDiff_epi16, mmParam1))); - // It == 0 ? |It| > sigma13 - mmDiff_epi16 = _mm_and_si128(bSet2_epi16, mmDiff_epi16); - // It == val ? sigma0 < |It| < sigma1 - mmDiff_epi16 = _mm_blendv_epi8(mmDiff_epi16, tmpParam1_epi16, bSet1_epi16); - - - __m128i tale_epi16_ = _mm_blendv_epi8(mmOness_epi16, mmParam2_epi16, bSet1_epi16); // mask for 0 - 3 - // diff = diff * sigma2 - __m128i lo = _mm_mullo_epi16(tale_epi16_, mmDiff_epi16); - __m128i hi = _mm_mulhi_epi16(tale_epi16_, mmDiff_epi16); - __m128i diff_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), s2bitShift); //diff 0_3_epi32 - __m128i diff_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), s2bitShift); // diff 4_7_epi32 - - mmDiff_epi16 = _mm_packs_epi32(diff_0_3_epi32, diff_4_7_epi32); - diff1 = _mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13... 
- diff0 = _mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9... - - // Ix * diff / Iy * diff - v10 = _mm_mullo_epi16(Ixy_0, diff0); - v11 = _mm_mulhi_epi16(Ixy_0, diff0); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); - qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); - - // It * Ix It * Iy [4 ... 7] - // for set 1 hi sigma 1 - - v10 = _mm_mullo_epi16(Ixy_1, diff1); - v11 = _mm_mulhi_epi16(Ixy_1, diff1); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); - qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); - - - if (j == 0) - { - __m128 bSet1_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet1_epi16)); - __m128 bSet1_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet1_epi16, bSet1_epi16), 16)); - __m128 mask_0_4_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mask_0_7_epi16)); - __m128 mask_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mask_0_7_epi16, mask_0_7_epi16), 16))); - - __m128 bSet2_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet2_epi16)); - __m128 bSet2_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet2_epi16, bSet2_epi16), 16)); - - __m128 tale_0_3_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_0_3_ps); - __m128 tale_4_7_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_4_7_ps); - tale_0_3_ps = _mm_blendv_ps(mmParam2s, tale_0_3_ps, bSet2_0_3_ps); - tale_4_7_ps = _mm_blendv_ps(mmParam2s, tale_4_7_ps, bSet2_4_7_ps); - - tale_0_3_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_0_3_ps, mask_0_4_ps); - tale_4_7_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_4_7_ps, mask_4_7_ps); - - t0 = _mm_srai_epi32(Ixy_0, 16); // Iy0 Iy1 Iy2 Iy3 - t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16); // Ix0 Ix1 Ix2 Ix3 - - __m128 fy = _mm_cvtepi32_ps(t0); - __m128 fx = _mm_cvtepi32_ps(t1); - - // A11 - A22 - __m128 fxtale = _mm_mul_ps(fx, tale_0_3_ps); - __m128 fytale = 
_mm_mul_ps(fy, tale_0_3_ps); - - mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); - mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); - mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); - - t0 = _mm_srai_epi32(Ixy_1, 16); // Iy8 Iy9 Iy10 Iy11 - t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_1, 16), 16); // Ix0 Ix1 Ix2 Ix3 - - fy = _mm_cvtepi32_ps(t0); - fx = _mm_cvtepi32_ps(t1); - - // A11 - A22 - fxtale = _mm_mul_ps(fx, tale_4_7_ps); - fytale = _mm_mul_ps(fy, tale_4_7_ps); - - mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); - mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); - mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); - } - } + + buffIdx = 0; + for (y = 0; y < winSize.height; y++) + { + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x)*cn; + const short* Iptr = (const short*)(IWinBuf.data + (y)*IWinBuf.step); + const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); + + x = 0; +#ifdef CV_SSE4_1 + for (; x <= winSize.width*cn; x += 8, dIptr += 8 * 2) + { + __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16); + __m128i diff0, diff1; + __m128i I_0_7_epi16 = _mm_loadu_si128((const __m128i*)(Iptr + x)); // von element 0 bis 7 + + __m128i v00 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x)), z); + __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); + __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); + __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); + + __m128i t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + t0 = 
_mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1 - 5); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1 - 5); + + __m128i mmDiff_epi16 = _mm_subs_epi16(_mm_packs_epi32(t0, t1), I_0_7_epi16); + + mmDiff_epi16 = _mm_and_si128(mmDiff_epi16, mask_0_7_epi16); + + + __m128i scalediffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, mmScale); + mmEta = _mm_add_epi16(mmEta, _mm_add_epi16(_mm_and_si128(scalediffIsPos_epi16, _mm_set1_epi16(2)), _mm_set1_epi16(-1))); + + + __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... + __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); + + __m128i abs_epi16 = _mm_abs_epi16(mmDiff_epi16); + __m128i bSet2_epi16, bSet1_epi16; + // |It| < sigma1 ? + bSet2_epi16 = _mm_cmplt_epi16(abs_epi16, mmParam1); + // It > 0 ? + __m128i diffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, _mm_setzero_si128()); + // sigma0 < |It| < sigma1 ? + bSet1_epi16 = _mm_and_si128(bSet2_epi16, _mm_cmpgt_epi16(abs_epi16, mmParam0)); + // val = |It| -/+ sigma1 + __m128i tmpParam1_epi16 = _mm_add_epi16(_mm_and_si128(diffIsPos_epi16, _mm_sub_epi16(mmDiff_epi16, mmParam1)), + _mm_andnot_si128(diffIsPos_epi16, _mm_add_epi16(mmDiff_epi16, mmParam1))); + // It == 0 ? |It| > sigma13 + mmDiff_epi16 = _mm_and_si128(bSet2_epi16, mmDiff_epi16); + // It == val ? 
sigma0 < |It| < sigma1 + mmDiff_epi16 = _mm_blendv_epi8(mmDiff_epi16, tmpParam1_epi16, bSet1_epi16); + + + __m128i tale_epi16_ = _mm_blendv_epi8(mmOness_epi16, mmParam2_epi16, bSet1_epi16); // mask for 0 - 3 + // diff = diff * sigma2 + __m128i lo = _mm_mullo_epi16(tale_epi16_, mmDiff_epi16); + __m128i hi = _mm_mulhi_epi16(tale_epi16_, mmDiff_epi16); + __m128i diff_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), s2bitShift); //diff 0_3_epi32 + __m128i diff_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), s2bitShift); // diff 4_7_epi32 + + mmDiff_epi16 = _mm_packs_epi32(diff_0_3_epi32, diff_4_7_epi32); + diff1 = _mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13... + diff0 = _mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9... + + // Ix * diff / Iy * diff + v10 = _mm_mullo_epi16(Ixy_0, diff0); + v11 = _mm_mulhi_epi16(Ixy_0, diff0); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); + qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); + + // It * Ix It * Iy [4 ... 
7] + // for set 1 hi sigma 1 + + v10 = _mm_mullo_epi16(Ixy_1, diff1); + v11 = _mm_mulhi_epi16(Ixy_1, diff1); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); + qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); + + + if (j == 0) + { + __m128 bSet1_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet1_epi16)); + __m128 bSet1_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet1_epi16, bSet1_epi16), 16)); + __m128 mask_0_4_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mask_0_7_epi16)); + __m128 mask_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mask_0_7_epi16, mask_0_7_epi16), 16))); + + __m128 bSet2_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet2_epi16)); + __m128 bSet2_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet2_epi16, bSet2_epi16), 16)); + + __m128 tale_0_3_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_0_3_ps); + __m128 tale_4_7_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_4_7_ps); + tale_0_3_ps = _mm_blendv_ps(mmParam2s, tale_0_3_ps, bSet2_0_3_ps); + tale_4_7_ps = _mm_blendv_ps(mmParam2s, tale_4_7_ps, bSet2_4_7_ps); + + tale_0_3_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_0_3_ps, mask_0_4_ps); + tale_4_7_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_4_7_ps, mask_4_7_ps); + + t0 = _mm_srai_epi32(Ixy_0, 16); // Iy0 Iy1 Iy2 Iy3 + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16); // Ix0 Ix1 Ix2 Ix3 + + __m128 fy = _mm_cvtepi32_ps(t0); + __m128 fx = _mm_cvtepi32_ps(t1); + + // A11 - A22 + __m128 fxtale = _mm_mul_ps(fx, tale_0_3_ps); + __m128 fytale = _mm_mul_ps(fy, tale_0_3_ps); + + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); + mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); + mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); + + t0 = _mm_srai_epi32(Ixy_1, 16); // Iy8 Iy9 Iy10 Iy11 + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_1, 16), 16); // Ix0 Ix1 Ix2 Ix3 + + fy = _mm_cvtepi32_ps(t0); + fx = _mm_cvtepi32_ps(t1); + + // A11 - A22 + fxtale = _mm_mul_ps(fx, 
tale_4_7_ps); + fytale = _mm_mul_ps(fy, tale_4_7_ps); + + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); + mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); + mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); + } + } #else - for (; x < winSize.width*cn; x++, dIptr += 2) - { - if (maskPtr[x] == 0) - continue; - int diff = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + - Jptr[x + step] * iw10 + Jptr[x + step + cn] * iw11, - W_BITS1 - 5) - Iptr[x]; - - - - - - if (diff > MEstimatorScale) - MEstimatorScale += eta; - if (diff < MEstimatorScale) - MEstimatorScale -= eta; - - calc_diff(diff, fParam0, fParam1, param[2]); - - - float ixval = (float)(dIptr[0]); - float iyval = (float)(dIptr[1]); - b1 += (float)(diff*ixval); - b2 += (float)(diff*iyval); - - if ( j == 0) - { - int abss = (diff < 0) ? -diff : diff; - float tale = param[2] * FLT_RESCALE; - //float tale = fParam1 * FLT_RESCALE; - if (abss < fParam0) - { - tale = FLT_RESCALE; - } - else if (abss > fParam1) - { - tale *= 0.01f; - } - else - { - tale *= param[2]; - } - A11 += (float)(ixval*ixval*tale); - A12 += (float)(ixval*iyval*tale); - A22 += (float)(iyval*iyval*tale); - } - - - } + for (; x < winSize.width*cn; x++, dIptr += 2) + { + if (maskPtr[x] == 0) + continue; + int diff = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + + Jptr[x + step] * iw10 + Jptr[x + step + cn] * iw11, + W_BITS1 - 5) - Iptr[x]; + + + + + + if (diff > MEstimatorScale) + MEstimatorScale += eta; + if (diff < MEstimatorScale) + MEstimatorScale -= eta; + + calc_diff(diff, fParam0, fParam1, param[2]); + + + float ixval = (float)(dIptr[0]); + float iyval = (float)(dIptr[1]); + b1 += (float)(diff*ixval); + b2 += (float)(diff*iyval); + + if ( j == 0) + { + int abss = (diff < 0) ? 
-diff : diff; + float tale = param[2] * FLT_RESCALE; + //float tale = fParam1 * FLT_RESCALE; + if (abss < fParam0) + { + tale = FLT_RESCALE; + } + else if (abss > fParam1) + { + tale *= 0.01f; + } + else + { + tale *= param[2]; + } + A11 += (float)(ixval*ixval*tale); + A12 += (float)(ixval*iyval*tale); + A22 += (float)(iyval*iyval*tale); + } + + + } #endif - } - -#ifdef CV_SSE2 - short etaValues[8]; - _mm_storeu_si128((__m128i*)(etaValues), mmEta); - MEstimatorScale += eta * (etaValues[0] + etaValues[1] + etaValues[2] + etaValues[3] - + etaValues[4] + etaValues[5] + etaValues[6] + etaValues[7]); - - + } + +#ifdef CV_SSE4_1 + short etaValues[8]; + _mm_storeu_si128((__m128i*)(etaValues), mmEta); + MEstimatorScale += eta * (etaValues[0] + etaValues[1] + etaValues[2] + etaValues[3] + + etaValues[4] + etaValues[5] + etaValues[6] + etaValues[7]); + + #endif - if (j == 0) - { -#ifdef CV_SSE2 - float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];// + if (j == 0) + { +#ifdef CV_SSE4_1 + float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];// - _mm_store_ps(A11buf, mmAxx); - _mm_store_ps(A12buf, mmAxy); - _mm_store_ps(A22buf, mmAyy); + _mm_store_ps(A11buf, mmAxx); + _mm_store_ps(A12buf, mmAxy); + _mm_store_ps(A22buf, mmAyy); - A11 = A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; - A12 = A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; - A22 = A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; + A11 = A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; + A12 = A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; + A22 = A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; #endif - A11 *= FLT_SCALE; - A12 *= FLT_SCALE; - A22 *= FLT_SCALE; - - - D = A11 * A22 - A12 * A12; - minEig = (A22 + A11 - std::sqrt((A11 - A22)*(A11 - A22) + - 4.f*A12*A12)) / (2 * winArea); - D = 1.f / D; - - if (minEig < minEigThreshold || std::abs(D) < FLT_EPSILON) - { - if (level == 0 && status) - status[ptidx] = 0; - if (level > 0) - { - nextPts[ptidx] = backUpNextPt; - } - break; - } - } - -#ifdef CV_SSE2 - 
float CV_DECL_ALIGNED(16) bbuf[4]; - _mm_store_ps(bbuf, _mm_add_ps(qb0, qb1)); - b1 += bbuf[0] + bbuf[2]; - b2 += bbuf[1] + bbuf[3]; + A11 *= FLT_SCALE; + A12 *= FLT_SCALE; + A22 *= FLT_SCALE; + + + D = A11 * A22 - A12 * A12; + minEig = (A22 + A11 - std::sqrt((A11 - A22)*(A11 - A22) + + 4.f*A12*A12)) / (2 * winArea); + D = 1.f / D; + + if (minEig < minEigThreshold || std::abs(D) < FLT_EPSILON) + { + if (level == 0 && status) + status[ptidx] = 0; + if (level > 0) + { + nextPts[ptidx] = backUpNextPt; + } + break; + } + } + +#ifdef CV_SSE4_1 + float CV_DECL_ALIGNED(16) bbuf[4]; + _mm_store_ps(bbuf, _mm_add_ps(qb0, qb1)); + b1 += bbuf[0] + bbuf[2]; + b2 += bbuf[1] + bbuf[3]; #endif - b1 *= FLT_SCALE; - b2 *= FLT_SCALE; - - - Point2f delta((float)((A12*b2 - A22 * b1) * D), (float)((A12*b1 - A11 * b2) * D)); - - delta.x = (delta.x != delta.x) ? 0 : delta.x; - delta.y = (delta.y != delta.y) ? 0 : delta.y; - - nextPt += delta * 0.7; - nextPts[ptidx] = nextPt + halfWin; - - if (j > 0 && std::abs(delta.x - prevDelta.x) < 0.01 && - std::abs(delta.y - prevDelta.y) < 0.01) - { - nextPts[ptidx] -= delta * 0.5f; - break; - } - - prevDelta = delta; - - } - - } - - } - - const Mat* prevImg; - const Mat* nextImg; - const Mat* prevDeriv; - const Mat* rgbPrevImg; - const Mat* rgbNextImg; - const Point2f* prevPts; - Point2f* nextPts; - uchar* status; - float* err; - int maxWinSize; - int minWinSize; - TermCriteria criteria; - int level; - int maxLevel; - int windowType; - float minEigThreshold; - bool useInitialFlow; - std::vector paramReset; - int crossSegmentationThreshold; - }; + b1 *= FLT_SCALE; + b2 *= FLT_SCALE; + + + Point2f delta((float)((A12*b2 - A22 * b1) * D), (float)((A12*b1 - A11 * b2) * D)); + + delta.x = (delta.x != delta.x) ? 0 : delta.x; + delta.y = (delta.y != delta.y) ? 
0 : delta.y; + + nextPt += delta * 0.7; + nextPts[ptidx] = nextPt + halfWin; + + if (j > 0 && std::abs(delta.x - prevDelta.x) < 0.01 && + std::abs(delta.y - prevDelta.y) < 0.01) + { + nextPts[ptidx] -= delta * 0.5f; + break; + } + + prevDelta = delta; + + } + + } + + } + + const Mat* prevImg; + const Mat* nextImg; + const Mat* prevDeriv; + const Mat* rgbPrevImg; + const Mat* rgbNextImg; + const Point2f* prevPts; + Point2f* nextPts; + uchar* status; + float* err; + int maxWinSize; + int minWinSize; + TermCriteria criteria; + int level; + int maxLevel; + int windowType; + float minEigThreshold; + bool useInitialFlow; + std::vector paramReset; + int crossSegmentationThreshold; + }; } namespace radial { - class TrackerInvoker : public cv::ParallelLoopBody - { - public: - TrackerInvoker( - const Mat& _prevImg, - const Mat& _prevDeriv, - const Mat& _nextImg, - const Mat& _rgbPrevImg, - const Mat& _rgbNextImg, - const Point2f* _prevPts, - Point2f* _nextPts, - uchar* _status, - float* _err, - Point2f* _gainVecs, - int _level, - int _maxLevel, - int _winSize[2], - int _maxIteration, - bool _useInitialFlow, - int _supportRegionType, - std::vector _normSigmas, - float _minEigenValue, - int _crossSegmentationThreshold) - { - prevImg = &_prevImg; - prevDeriv = &_prevDeriv; - nextImg = &_nextImg; - rgbPrevImg = &_rgbPrevImg; - rgbNextImg = &_rgbNextImg; - prevPts = _prevPts; - nextPts = _nextPts; - status = _status; - err = _err; - gainVecs = _gainVecs; - minWinSize = _winSize[0]; - maxWinSize = _winSize[1]; - criteria.maxCount = _maxIteration; - criteria.epsilon = 0.01; - level = _level; - maxLevel = _maxLevel; - windowType = _supportRegionType; - minEigThreshold = _minEigenValue; - paramReset = _normSigmas; - useInitialFlow = _useInitialFlow; - crossSegmentationThreshold = _crossSegmentationThreshold; - } - - TrackerInvoker & operator=(const TrackerInvoker &) { return *this; }; - void operator()(const cv::Range& range) const - { - Point2f halfWin; - cv::Size winSize; - const 
Mat& I = *prevImg; - const Mat& J = *nextImg; - const Mat& derivI = *prevDeriv; - const Mat& BI = *rgbPrevImg; - - - const float FLT_SCALE = (1.f / (1 << 16)); - - winSize = cv::Size(maxWinSize, maxWinSize); - int winMaskwidth = static_cast(ceil(winSize.width / 16.f)) * 16; - cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); - winMaskMatBuf.setTo(1); - - std::vector param = paramReset; - int cn = I.channels(), cn2 = cn * 2; - int winbufwidth = static_cast(ceil(winSize.width / 16.f)) * 16; - cv::Size winBufSize(winbufwidth, winbufwidth); - - cv::Mat invTensorMat(4, 4, CV_32FC1); - cv::Mat mismatchMat(4, 1, CV_32FC1); - cv::Mat resultMat(4, 1, CV_32FC1); - - - std::vector _buf(winBufSize.area()*(cn + cn2)); - Mat IWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn), &_buf[0]); - Mat derivIWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn2), &_buf[winBufSize.area()*cn]); - - for (int ptidx = range.start; ptidx < range.end; ptidx++) - { - - Point2f prevPt = prevPts[ptidx] * (float)(1. / (1 << level)); - Point2f nextPt; - if (level == maxLevel) - { - if (useInitialFlow) - { - nextPt = nextPts[ptidx] * (float)(1. 
/ (1 << level)); - } - else - nextPt = prevPt; - } - else - { - nextPt = nextPts[ptidx] * 2.f; - - } - nextPts[ptidx] = nextPt; - - Point2i iprevPt, inextPt; - iprevPt.x = cvFloor(prevPt.x); - iprevPt.y = cvFloor(prevPt.y); - int winArea = maxWinSize * maxWinSize; - cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0, 0, maxWinSize, maxWinSize)); - winMaskMatBuf.setTo(0); - if (calcWinMaskMat(BI, windowType, iprevPt, - winMaskMat, winSize, halfWin, winArea, - minWinSize, maxWinSize, crossSegmentationThreshold) == false) - continue; - - prevPt -= halfWin; - iprevPt.x = cvFloor(prevPt.x); - iprevPt.y = cvFloor(prevPt.y); - - if (iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols || - iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows) - { - if (level == 0) - { - if (status) - status[ptidx] = 3; - if (err) - err[ptidx] = 0; - } - continue; - } - - float a = prevPt.x - iprevPt.x; - float b = prevPt.y - iprevPt.y; - const int W_BITS = 14, W_BITS1 = 14; - - int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); - int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); - int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); - int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - - int dstep = (int)(derivI.step / derivI.elemSize1()); - int step = (int)(I.step / I.elemSize1()); - int mStep = (int)(winMaskMat.step / winMaskMat.elemSize1()); - CV_Assert(step == (int)(J.step / J.elemSize1())); - - float A11 = 0, A12 = 0, A22 = 0; - float b1 = 0, b2 = 0, b3 = 0, b4 = 0; - // tensor - float sumIx = 0; - float sumIy = 0; - float sumI = 0; - float sumW = 0; - float w1 = 0, w2 = 0; // -IyI - float dI = 0; // I^2 - float D = 0; - -#ifdef CV_SSE2 - __m128i rightMask = _mm_set_epi16(0, std::numeric_limits::max(), 0, std::numeric_limits::max(), 0, std::numeric_limits::max(), 0, std::numeric_limits::max()); - __m128i leftMask = _mm_set_epi16(std::numeric_limits::max(), 0, std::numeric_limits::max(), 0, std::numeric_limits::max(), 0, std::numeric_limits::max(), 0); - __m128i qw0 = _mm_set1_epi32(iw00 + 
(iw01 << 16)); - __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); - __m128i z = _mm_setzero_si128(); - __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1 - 1)); - __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1 - 5 - 1)); - __m128i mmMask4_epi32; - __m128i mmMaskSet_epi16 = _mm_set1_epi16(std::numeric_limits::max()); - get4BitMask(winSize.width, mmMask4_epi32); - __m128 qA11 = _mm_setzero_ps(), qA12 = _mm_setzero_ps(), qA22 = _mm_setzero_ps(); + class TrackerInvoker : public cv::ParallelLoopBody + { + public: + TrackerInvoker( + const Mat& _prevImg, + const Mat& _prevDeriv, + const Mat& _nextImg, + const Mat& _rgbPrevImg, + const Mat& _rgbNextImg, + const Point2f* _prevPts, + Point2f* _nextPts, + uchar* _status, + float* _err, + Point2f* _gainVecs, + int _level, + int _maxLevel, + int _winSize[2], + int _maxIteration, + bool _useInitialFlow, + int _supportRegionType, + std::vector _normSigmas, + float _minEigenValue, + int _crossSegmentationThreshold) + { + prevImg = &_prevImg; + prevDeriv = &_prevDeriv; + nextImg = &_nextImg; + rgbPrevImg = &_rgbPrevImg; + rgbNextImg = &_rgbNextImg; + prevPts = _prevPts; + nextPts = _nextPts; + status = _status; + err = _err; + gainVecs = _gainVecs; + minWinSize = _winSize[0]; + maxWinSize = _winSize[1]; + criteria.maxCount = _maxIteration; + criteria.epsilon = 0.01; + level = _level; + maxLevel = _maxLevel; + windowType = _supportRegionType; + minEigThreshold = _minEigenValue; + paramReset = _normSigmas; + useInitialFlow = _useInitialFlow; + crossSegmentationThreshold = _crossSegmentationThreshold; + } + + TrackerInvoker & operator=(const TrackerInvoker &) { return *this; }; + void operator()(const cv::Range& range) const + { + Point2f halfWin; + cv::Size winSize; + const Mat& I = *prevImg; + const Mat& J = *nextImg; + const Mat& derivI = *prevDeriv; + const Mat& BI = *rgbPrevImg; + + + const float FLT_SCALE = (1.f / (1 << 16)); + + winSize = cv::Size(maxWinSize, maxWinSize); + int winMaskwidth = static_cast(ceil(winSize.width / 
16.f)) * 16; + cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); + winMaskMatBuf.setTo(1); + + std::vector param = paramReset; + int cn = I.channels(), cn2 = cn * 2; + int winbufwidth = static_cast(ceil(winSize.width / 16.f)) * 16; + cv::Size winBufSize(winbufwidth, winbufwidth); + + cv::Mat invTensorMat(4, 4, CV_32FC1); + cv::Mat mismatchMat(4, 1, CV_32FC1); + cv::Mat resultMat(4, 1, CV_32FC1); + + + std::vector _buf(winBufSize.area()*(cn + cn2)); + Mat IWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn), &_buf[0]); + Mat derivIWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn2), &_buf[winBufSize.area()*cn]); + + for (int ptidx = range.start; ptidx < range.end; ptidx++) + { + + Point2f prevPt = prevPts[ptidx] * (float)(1. / (1 << level)); + Point2f nextPt; + if (level == maxLevel) + { + if (useInitialFlow) + { + nextPt = nextPts[ptidx] * (float)(1. / (1 << level)); + } + else + nextPt = prevPt; + } + else + { + nextPt = nextPts[ptidx] * 2.f; + + } + nextPts[ptidx] = nextPt; + + Point2i iprevPt, inextPt; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + int winArea = maxWinSize * maxWinSize; + cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0, 0, maxWinSize, maxWinSize)); + winMaskMatBuf.setTo(0); + if (calcWinMaskMat(BI, windowType, iprevPt, + winMaskMat, winSize, halfWin, winArea, + minWinSize, maxWinSize, crossSegmentationThreshold) == false) + continue; + + prevPt -= halfWin; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + + if (iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols || + iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows) + { + if (level == 0) + { + if (status) + status[ptidx] = 3; + if (err) + err[ptidx] = 0; + } + continue; + } + + float a = prevPt.x - iprevPt.x; + float b = prevPt.y - iprevPt.y; + const int W_BITS = 14, W_BITS1 = 14; + + int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + int iw11 = (1 << 
W_BITS) - iw00 - iw01 - iw10; + + int dstep = (int)(derivI.step / derivI.elemSize1()); + int step = (int)(I.step / I.elemSize1()); + int mStep = (int)(winMaskMat.step / winMaskMat.elemSize1()); + CV_Assert(step == (int)(J.step / J.elemSize1())); + + float A11 = 0, A12 = 0, A22 = 0; + float b1 = 0, b2 = 0, b3 = 0, b4 = 0; + // tensor + float sumIx = 0; + float sumIy = 0; + float sumI = 0; + float sumW = 0; + float w1 = 0, w2 = 0; // -IyI + float dI = 0; // I^2 + float D = 0; + +#ifdef CV_SSE4_1 + __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128i z = _mm_setzero_si128(); + __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1 - 1)); + __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1 - 5 - 1)); + __m128i mmMask4_epi32; + __m128i mmMaskSet_epi16 = _mm_set1_epi16(std::numeric_limits::max()); + get4BitMask(winSize.width, mmMask4_epi32); #endif - // extract the patch from the first image, compute covariation matrix of derivatives - int x, y; - for (y = 0; y < winSize.height; y++) - { - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; - const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; - - short* Iptr = (short*)(IWinBuf.data + y * IWinBuf.step); - short* dIptr = (short*)(derivIWinBuf.data + y * derivIWinBuf.step); - - x = 0; - -#ifdef CV_SSE2 - for (; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4 * 2, dIptr += 4 * 2) - { - __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16); - __m128i mask_0_3_epi16 = _mm_unpacklo_epi16(mask_0_7_epi16, mask_0_7_epi16); - __m128i v00, v01, v10, v11, t0, t1; - v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); - v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); - v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + 
step)), z); - v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); - - t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); - t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1 - 5); - if (x + 4 > winSize.width) - { - t0 = _mm_and_si128(t0, mmMask4_epi32); - } - t0 = _mm_and_si128(t0, mask_0_3_epi16); - _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0, t0)); - - - v00 = _mm_loadu_si128((const __m128i*)(dsrc)); - v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); - v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep)); - v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); - - t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); - t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); - t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); - t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); - v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... 
- if (x + 4 > winSize.width) - { - v00 = _mm_and_si128(v00, mmMask4_epi32); - } - v00 = _mm_and_si128(v00, mask_0_3_epi16); - _mm_storeu_si128((__m128i*)dIptr, v00); - } + // extract the patch from the first image, compute covariation matrix of derivatives + int x, y; + for (y = 0; y < winSize.height; y++) + { + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; + const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; + + short* Iptr = (short*)(IWinBuf.data + y * IWinBuf.step); + short* dIptr = (short*)(derivIWinBuf.data + y * derivIWinBuf.step); + + x = 0; + +#ifdef CV_SSE4_1 + for (; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4 * 2, dIptr += 4 * 2) + { + __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16); + __m128i mask_0_3_epi16 = _mm_unpacklo_epi16(mask_0_7_epi16, mask_0_7_epi16); + __m128i v00, v01, v10, v11, t0, t1; + v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); + v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); + v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); + v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1 - 5); + if (x + 4 > winSize.width) + { + t0 = _mm_and_si128(t0, mmMask4_epi32); + } + t0 = _mm_and_si128(t0, mask_0_3_epi16); + _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0, t0)); + + + v00 = _mm_loadu_si128((const __m128i*)(dsrc)); + v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); + v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep)); + v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); + + t0 = 
_mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); + v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... + if (x + 4 > winSize.width) + { + v00 = _mm_and_si128(v00, mmMask4_epi32); + } + v00 = _mm_and_si128(v00, mask_0_3_epi16); + _mm_storeu_si128((__m128i*)dIptr, v00); + } #else - for (; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2) - { - if (winMaskMat.at(y, x) == 0) - { - dIptr[0] = 0; - dIptr[1] = 0; - continue; - } - int ival = CV_DESCALE(src[x] * iw00 + src[x + cn] * iw01 + - src[x + step] * iw10 + src[x + step + cn] * iw11, W_BITS1 - 5); - int ixval = CV_DESCALE(dsrc[0] * iw00 + dsrc[cn2] * iw01 + - dsrc[dstep] * iw10 + dsrc[dstep + cn2] * iw11, W_BITS1); - int iyval = CV_DESCALE(dsrc[1] * iw00 + dsrc[cn2 + 1] * iw01 + dsrc[dstep + 1] * iw10 + - dsrc[dstep + cn2 + 1] * iw11, W_BITS1); - - Iptr[x] = (short)ival; - dIptr[0] = (short)ixval; - dIptr[1] = (short)iyval; - - } + for (; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2) + { + if (winMaskMat.at(y, x) == 0) + { + dIptr[0] = 0; + dIptr[1] = 0; + continue; + } + int ival = CV_DESCALE(src[x] * iw00 + src[x + cn] * iw01 + + src[x + step] * iw10 + src[x + step + cn] * iw11, W_BITS1 - 5); + int ixval = CV_DESCALE(dsrc[0] * iw00 + dsrc[cn2] * iw01 + + dsrc[dstep] * iw10 + dsrc[dstep + cn2] * iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1] * iw00 + dsrc[cn2 + 1] * iw01 + dsrc[dstep + 1] * iw10 + + dsrc[dstep + cn2 + 1] * iw11, W_BITS1); + + Iptr[x] = (short)ival; + dIptr[0] = (short)ixval; + dIptr[1] = (short)iyval; + + } #endif - } - - cv::Mat residualMat = cv::Mat::zeros(winSize.height * (winSize.width + 8) * cn, 1, CV_16SC1); - - cv::Point2f backUpNextPt = nextPt; - nextPt -= halfWin; - 
Point2f prevDelta(0, 0); //related to h(t-1) - Point2f prevGain(0, 0); - cv::Point2f gainVec = gainVecs[ptidx]; - cv::Point2f backUpGain = gainVec; - cv::Size _winSize = winSize; - int j; - __m128i mmMask0, mmMask1, mmMask; - getWBitMask(_winSize.width, mmMask0, mmMask1, mmMask); - __m128 mmOnes = _mm_set1_ps(1.f); - __m128i mmOnes_epi16 = _mm_set1_epi16(1); - __m128i mmZeros_epi16 = _mm_set1_epi16(0); - float MEstimatorScale = 1; - int buffIdx = 0; - float minEigValue, maxEigValue; - - for (j = 0; j < criteria.maxCount; j++) - { - - inextPt.x = cvFloor(nextPt.x); - inextPt.y = cvFloor(nextPt.y); - - if (inextPt.x < -winSize.width || inextPt.x >= J.cols || - inextPt.y < -winSize.height || inextPt.y >= J.rows) - { - if (level == 0 && status) - status[ptidx] = 3; - if (level > 0) - { - nextPts[ptidx] = backUpNextPt; - gainVecs[ptidx] = backUpGain; - } - break; - } - - a = nextPt.x - inextPt.x; - b = nextPt.y - inextPt.y; - iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); - iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); - iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); - iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - - // mismatch - b1 = 0; - b2 = 0; - b3 = 0; - b4 = 0; - if (j == 0) - { - // tensor - w1 = 0; // -IxI - w2 = 0; // -IyI - dI = 0; // I^2 - sumIx = 0; - sumIy = 0; - sumI = 0; - sumW = 0; - A11 = 0; - A12 = 0; - A22 = 0; - } - - if (j == 0 ) - { - buffIdx = 0; - for (y = 0; y < winSize.height; y++) - { - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x)*cn; - const short* Iptr = (const short*)(IWinBuf.data + (y)*IWinBuf.step); - const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); - x = 0; - for (; x < winSize.width*cn; x++, dIptr += 2) - { - if (dIptr[0] == 0 && dIptr[1] == 0) - continue; - int diff = static_cast(CV_DESCALE( Jptr[x] * iw00 + - Jptr[x + cn] * iw01 + - Jptr[x + step] * iw10 + - Jptr[x + step + cn] * iw11, W_BITS1 - 5) - - Iptr[x] + Iptr[x] * gainVec.x + gainVec.y); - residualMat.at(buffIdx++) = 
static_cast(diff); - } - } - /*! Estimation for the residual */ - cv::Mat residualMatRoi(residualMat, cv::Rect(0, 0, 1, buffIdx)); - MEstimatorScale = (buffIdx == 0) ? 1.f : estimateScale(residualMatRoi); - } - - float eta = 1.f / winArea; - float fParam0 = param[0] * 32.f; - float fParam1 = param[1] * 32.f; - fParam0 = param[0] * MEstimatorScale; - fParam1 = param[1] * MEstimatorScale; - -#ifdef CV_SSE2 - - qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); - qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); - __m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps(), qb2 = _mm_setzero_ps(); - __m128 qb3 = _mm_setzero_ps(); - __m128 mmSumW1 = _mm_setzero_ps(), mmSumW2 = _mm_setzero_ps(); - __m128 mmSumI = _mm_setzero_ps(), mmSumW = _mm_setzero_ps(), mmSumDI = _mm_setzero_ps(); - __m128 mmSumIy = _mm_setzero_ps(), mmSumIx = _mm_setzero_ps(); - __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps(); - __m128i mmParam0 = _mm_set1_epi16(MIN(std::numeric_limits::max() - 1, static_cast(fParam0))); - __m128i mmParam1 = _mm_set1_epi16(MIN(std::numeric_limits::max() - 1, static_cast(fParam1))); - - - float s2Val = param[2] > 0 ? param[2] : -param[2]; - int s2bitShift = param[2] == 0 ? 1 : static_cast(ceil(log(200.f / s2Val) / log(2.f))); // zwei stellen nach den komma - __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast(param[2] * (float)(1 << s2bitShift))); - __m128i mmOness_epi16 = _mm_set1_epi16(1 << s2bitShift); - __m128 mmParam2 = _mm_set1_ps(param[2]); - __m128 mmParam2s = _mm_set1_ps(0.01f * param[2]); - __m128 mmParam2s2 = _mm_set1_ps(param[2] * param[2]); - - float gainVal = gainVec.x > 0 ? gainVec.x : -gainVec.x; - int bitShift = gainVec.x == 0 ? 
1 : static_cast(ceil(log(200.f / gainVal) / log(2.f))); // zwei stellen nach den komma - - __m128i mmGainValue_epi16 = _mm_set1_epi16(static_cast(gainVec.x * (float)(1 << bitShift))); - __m128i mmConstValue_epi16 = _mm_set1_epi16(static_cast(gainVec.y)); - __m128i mmEta = _mm_setzero_si128(); - __m128i mmEtaNeg = _mm_set1_epi16(-1); - __m128i mmScale = _mm_set1_epi16(static_cast(MEstimatorScale)); + } + + cv::Mat residualMat = cv::Mat::zeros(winSize.height * (winSize.width + 8) * cn, 1, CV_16SC1); + + cv::Point2f backUpNextPt = nextPt; + nextPt -= halfWin; + Point2f prevDelta(0, 0); //related to h(t-1) + Point2f prevGain(0, 0); + cv::Point2f gainVec = gainVecs[ptidx]; + cv::Point2f backUpGain = gainVec; + cv::Size _winSize = winSize; + int j; + __m128i mmMask0, mmMask1, mmMask; + getWBitMask(_winSize.width, mmMask0, mmMask1, mmMask); + __m128 mmOnes = _mm_set1_ps(1.f); + float MEstimatorScale = 1; + int buffIdx = 0; + float minEigValue; + + for (j = 0; j < criteria.maxCount; j++) + { + + inextPt.x = cvFloor(nextPt.x); + inextPt.y = cvFloor(nextPt.y); + + if (inextPt.x < -winSize.width || inextPt.x >= J.cols || + inextPt.y < -winSize.height || inextPt.y >= J.rows) + { + if (level == 0 && status) + status[ptidx] = 3; + if (level > 0) + { + nextPts[ptidx] = backUpNextPt; + gainVecs[ptidx] = backUpGain; + } + break; + } + + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + + // mismatch + b1 = 0; + b2 = 0; + b3 = 0; + b4 = 0; + if (j == 0) + { + // tensor + w1 = 0; // -IxI + w2 = 0; // -IyI + dI = 0; // I^2 + sumIx = 0; + sumIy = 0; + sumI = 0; + sumW = 0; + A11 = 0; + A12 = 0; + A22 = 0; + } + + if (j == 0 ) + { + buffIdx = 0; + for (y = 0; y < winSize.height; y++) + { + const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x)*cn; + const short* Iptr = 
(const short*)(IWinBuf.data + (y)*IWinBuf.step); + const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); + x = 0; + for (; x < winSize.width*cn; x++, dIptr += 2) + { + if (dIptr[0] == 0 && dIptr[1] == 0) + continue; + int diff = static_cast(CV_DESCALE( Jptr[x] * iw00 + + Jptr[x + cn] * iw01 + + Jptr[x + step] * iw10 + + Jptr[x + step + cn] * iw11, W_BITS1 - 5) + - Iptr[x] + Iptr[x] * gainVec.x + gainVec.y); + residualMat.at(buffIdx++) = static_cast(diff); + } + } + /*! Estimation for the residual */ + cv::Mat residualMatRoi(residualMat, cv::Rect(0, 0, 1, buffIdx)); + MEstimatorScale = (buffIdx == 0) ? 1.f : estimateScale(residualMatRoi); + } + + float eta = 1.f / winArea; + float fParam0 = param[0] * 32.f; + float fParam1 = param[1] * 32.f; + fParam0 = param[0] * MEstimatorScale; + fParam1 = param[1] * MEstimatorScale; + +#ifdef CV_SSE4_1 + + qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps(), qb2 = _mm_setzero_ps(); + __m128 qb3 = _mm_setzero_ps(); + __m128 mmSumW1 = _mm_setzero_ps(), mmSumW2 = _mm_setzero_ps(); + __m128 mmSumI = _mm_setzero_ps(), mmSumW = _mm_setzero_ps(), mmSumDI = _mm_setzero_ps(); + __m128 mmSumIy = _mm_setzero_ps(), mmSumIx = _mm_setzero_ps(); + __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps(); + __m128i mmParam0 = _mm_set1_epi16(MIN(std::numeric_limits::max() - 1, static_cast(fParam0))); + __m128i mmParam1 = _mm_set1_epi16(MIN(std::numeric_limits::max() - 1, static_cast(fParam1))); + + + float s2Val = param[2] > 0 ? param[2] : -param[2]; + int s2bitShift = param[2] == 0 ? 
1 : static_cast(ceil(log(200.f / s2Val) / log(2.f))); // zwei stellen nach den komma + __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast(param[2] * (float)(1 << s2bitShift))); + __m128i mmOness_epi16 = _mm_set1_epi16(1 << s2bitShift); + __m128 mmParam2s = _mm_set1_ps(0.01f * param[2]); + __m128 mmParam2s2 = _mm_set1_ps(param[2] * param[2]); + + float gainVal = gainVec.x > 0 ? gainVec.x : -gainVec.x; + int bitShift = gainVec.x == 0 ? 1 : static_cast(ceil(log(200.f / gainVal) / log(2.f))); // zwei stellen nach den komma + + __m128i mmGainValue_epi16 = _mm_set1_epi16(static_cast(gainVec.x * (float)(1 << bitShift))); + __m128i mmConstValue_epi16 = _mm_set1_epi16(static_cast(gainVec.y)); + __m128i mmEta = _mm_setzero_si128(); + __m128i mmScale = _mm_set1_epi16(static_cast(MEstimatorScale)); #endif - buffIdx = 0; - - for (y = 0; y < _winSize.height; y++) - { - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x)*cn; - const short* Iptr = (const short*)(IWinBuf.data + y * IWinBuf.step); - const short* dIptr = (const short*)(derivIWinBuf.data + y * derivIWinBuf.step); - - x = 0; -#ifdef CV_SSE2 - - - for (; x <= _winSize.width*cn; x += 8, dIptr += 8 * 2) - { - __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16); - __m128i diff0, diff1; - __m128i I_0_7_epi16 = _mm_loadu_si128((const __m128i*)(Iptr + x)); // von element 0 bis 7 - __m128i v00 = _mm_unpacklo_epi8( - _mm_loadl_epi64((const __m128i*)(Jptr + x)) , z); //J0 , 0, J1, 0, J2, 0 ... 
J7,0 - __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); - __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); - __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); - - __m128i t0 = _mm_add_epi32 - (_mm_madd_epi16( - _mm_unpacklo_epi16(v00, v01), //J0, , J1, J1, J2, J2, ... J3 , J4 - qw0), - _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); - __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); - - t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1 - 5); - t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1 - 5); - - - __m128i lo = _mm_mullo_epi16(mmGainValue_epi16, I_0_7_epi16); - __m128i hi = _mm_mulhi_epi16(mmGainValue_epi16, I_0_7_epi16); - - __m128i Igain_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), bitShift); - __m128i Igain_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), bitShift); - __m128i Igain_epi16 = _mm_packs_epi32(Igain_0_3_epi32, Igain_4_7_epi32); - - // diff = J - I + I*gain.x + gain.y - __m128i mmDiff_epi16 = _mm_add_epi16( - _mm_subs_epi16(_mm_packs_epi32(t0, t1), I_0_7_epi16), // J - I - _mm_add_epi16(Igain_epi16, mmConstValue_epi16)); - - - mmDiff_epi16 = _mm_and_si128(mmDiff_epi16, mask_0_7_epi16); - - - __m128i scalediffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, mmScale); - mmEta = _mm_add_epi16(mmEta, _mm_add_epi16(_mm_and_si128(scalediffIsPos_epi16, _mm_set1_epi16(2)), _mm_set1_epi16(-1))); - - - __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... - __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); - - - __m128i abs_epi16 = _mm_abs_epi16(mmDiff_epi16); - __m128i bSet2_epi16, bSet1_epi16; - // |It| < sigma1 ? - bSet2_epi16 = _mm_cmplt_epi16(abs_epi16, mmParam1); - // It > 0 ? - __m128i diffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, _mm_setzero_si128()); - // sigma0 < |It| < sigma1 ? 
- bSet1_epi16 = _mm_and_si128(bSet2_epi16, _mm_cmpgt_epi16(abs_epi16, mmParam0)); - // val = |It| -/+ sigma1 - __m128i tmpParam1_epi16 = _mm_add_epi16(_mm_and_si128(diffIsPos_epi16, _mm_sub_epi16(mmDiff_epi16, mmParam1)), - _mm_andnot_si128(diffIsPos_epi16, _mm_add_epi16(mmDiff_epi16, mmParam1))); - // It == 0 ? |It| > sigma13 - mmDiff_epi16 = _mm_and_si128(bSet2_epi16, mmDiff_epi16); - // It == val ? sigma0 < |It| < sigma1 - mmDiff_epi16 = _mm_blendv_epi8(mmDiff_epi16, tmpParam1_epi16, bSet1_epi16); - - - __m128i tale_epi16_ = _mm_blendv_epi8(mmOness_epi16, mmParam2_epi16, bSet1_epi16); // mask for 0 - 3 - // diff = diff * sigma2 - lo = _mm_mullo_epi16(tale_epi16_, mmDiff_epi16); - hi = _mm_mulhi_epi16(tale_epi16_, mmDiff_epi16); - __m128i diff_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), s2bitShift); //diff 0_3_epi32 - __m128i diff_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), s2bitShift); // diff 4_7_epi32 - - mmDiff_epi16 = _mm_packs_epi32(diff_0_3_epi32, diff_4_7_epi32); // ,da das ergebniss kleiner als 16 bit sein sollte - - diff1 = _mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13... - diff0 = _mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9... - - // Ix * diff / Iy * diff - v10 = _mm_mullo_epi16(Ixy_0, diff0); - v11 = _mm_mulhi_epi16(Ixy_0, diff0); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); - qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); - - // It * Ix It * Iy [4 ... 7] - // for set 1 hi sigma 1 - - v10 = _mm_mullo_epi16(Ixy_1, diff1); - v11 = _mm_mulhi_epi16(Ixy_1, diff1); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); - qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); - // diff * J [0 ... 
7] - // for set 1 sigma 1 - // b3 += diff * Iptr[x] - - v10 = _mm_mullo_epi16(mmDiff_epi16, I_0_7_epi16); - v11 = _mm_mulhi_epi16(mmDiff_epi16, I_0_7_epi16); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qb2 = _mm_add_ps(qb2, _mm_cvtepi32_ps(v00)); - qb2 = _mm_add_ps(qb2, _mm_cvtepi32_ps(v10)); - - qb3 = _mm_add_ps(qb3, _mm_cvtepi32_ps(diff_0_3_epi32)); - qb3 = _mm_add_ps(qb3, _mm_cvtepi32_ps(diff_4_7_epi32)); - - if (j == 0) - { - __m128 bSet1_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet1_epi16)); - __m128 bSet1_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet1_epi16, bSet1_epi16), 16)); - __m128 mask_0_4_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mask_0_7_epi16)); - __m128 mask_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mask_0_7_epi16, mask_0_7_epi16), 16))); - - __m128 bSet2_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet2_epi16)); - __m128 bSet2_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet2_epi16, bSet2_epi16), 16)); - - __m128 tale_0_3_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_0_3_ps); - __m128 tale_4_7_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_4_7_ps); - tale_0_3_ps = _mm_blendv_ps(mmParam2s, tale_0_3_ps, bSet2_0_3_ps); - tale_4_7_ps = _mm_blendv_ps(mmParam2s, tale_4_7_ps, bSet2_4_7_ps); - - tale_0_3_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_0_3_ps, mask_0_4_ps); - tale_4_7_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_4_7_ps, mask_4_7_ps); - - t0 = _mm_srai_epi32(Ixy_0, 16); // Iy0 Iy1 Iy2 Iy3 - t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16); // Ix0 Ix1 Ix2 Ix3 - - __m128 fy = _mm_cvtepi32_ps(t0); - __m128 fx = _mm_cvtepi32_ps(t1); - - // 0 ... 
3 - __m128 I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(I_0_7_epi16, I_0_7_epi16), 16)); - - // A11 - A22 - __m128 fxtale = _mm_mul_ps(fx, tale_0_3_ps); - __m128 fytale = _mm_mul_ps(fy, tale_0_3_ps); - - mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); - mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); - mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); - - // sumIx und sumIy - mmSumIx = _mm_add_ps(mmSumIx, fxtale); - mmSumIy = _mm_add_ps(mmSumIy, fytale); - - mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fxtale)); - mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fytale)); - - // sumI - __m128 I_tale_ps = _mm_mul_ps(I_ps, tale_0_3_ps); - mmSumI = _mm_add_ps(mmSumI, I_tale_ps); - - // sumW - mmSumW = _mm_add_ps(mmSumW, tale_0_3_ps); - - // sumDI - mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps(I_ps, I_tale_ps)); - - t0 = _mm_srai_epi32(Ixy_1, 16); // Iy8 Iy9 Iy10 Iy11 - t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_1, 16), 16); // Ix0 Ix1 Ix2 Ix3 - - fy = _mm_cvtepi32_ps(t0); - fx = _mm_cvtepi32_ps(t1); - - // 4 ... 
7 - I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(I_0_7_epi16, I_0_7_epi16), 16)); - - // A11 - A22 - fxtale = _mm_mul_ps(fx, tale_4_7_ps); - fytale = _mm_mul_ps(fy, tale_4_7_ps); - - mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); - mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); - mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); - - // sumIx und sumIy - mmSumIx = _mm_add_ps(mmSumIx, fxtale); - mmSumIy = _mm_add_ps(mmSumIy, fytale); - - mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fxtale)); - mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fytale)); - - // sumI - I_tale_ps = _mm_mul_ps(I_ps, tale_4_7_ps); - mmSumI = _mm_add_ps(mmSumI, I_tale_ps); - - // sumW - mmSumW = _mm_add_ps(mmSumW, tale_4_7_ps); - - // sumDI - mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps(I_ps, I_tale_ps)); - } + buffIdx = 0; + + for (y = 0; y < _winSize.height; y++) + { + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x)*cn; + const short* Iptr = (const short*)(IWinBuf.data + y * IWinBuf.step); + const short* dIptr = (const short*)(derivIWinBuf.data + y * derivIWinBuf.step); + + x = 0; +#ifdef CV_SSE4_1 + + + for (; x <= _winSize.width*cn; x += 8, dIptr += 8 * 2) + { + __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16); + __m128i diff0, diff1; + __m128i I_0_7_epi16 = _mm_loadu_si128((const __m128i*)(Iptr + x)); // von element 0 bis 7 + __m128i v00 = _mm_unpacklo_epi8( + _mm_loadl_epi64((const __m128i*)(Jptr + x)) , z); //J0 , 0, J1, 0, J2, 0 ... 
J7,0 + __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); + __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); + __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); + + __m128i t0 = _mm_add_epi32 + (_mm_madd_epi16( + _mm_unpacklo_epi16(v00, v01), //J0, , J1, J1, J2, J2, ... J3 , J4 + qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1 - 5); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1 - 5); + + + __m128i lo = _mm_mullo_epi16(mmGainValue_epi16, I_0_7_epi16); + __m128i hi = _mm_mulhi_epi16(mmGainValue_epi16, I_0_7_epi16); + + __m128i Igain_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), bitShift); + __m128i Igain_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), bitShift); + __m128i Igain_epi16 = _mm_packs_epi32(Igain_0_3_epi32, Igain_4_7_epi32); + + // diff = J - I + I*gain.x + gain.y + __m128i mmDiff_epi16 = _mm_add_epi16( + _mm_subs_epi16(_mm_packs_epi32(t0, t1), I_0_7_epi16), // J - I + _mm_add_epi16(Igain_epi16, mmConstValue_epi16)); + + + mmDiff_epi16 = _mm_and_si128(mmDiff_epi16, mask_0_7_epi16); + + + __m128i scalediffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, mmScale); + mmEta = _mm_add_epi16(mmEta, _mm_add_epi16(_mm_and_si128(scalediffIsPos_epi16, _mm_set1_epi16(2)), _mm_set1_epi16(-1))); + + + __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... + __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); + + + __m128i abs_epi16 = _mm_abs_epi16(mmDiff_epi16); + __m128i bSet2_epi16, bSet1_epi16; + // |It| < sigma1 ? + bSet2_epi16 = _mm_cmplt_epi16(abs_epi16, mmParam1); + // It > 0 ? + __m128i diffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, _mm_setzero_si128()); + // sigma0 < |It| < sigma1 ? 
+ bSet1_epi16 = _mm_and_si128(bSet2_epi16, _mm_cmpgt_epi16(abs_epi16, mmParam0)); + // val = |It| -/+ sigma1 + __m128i tmpParam1_epi16 = _mm_add_epi16(_mm_and_si128(diffIsPos_epi16, _mm_sub_epi16(mmDiff_epi16, mmParam1)), + _mm_andnot_si128(diffIsPos_epi16, _mm_add_epi16(mmDiff_epi16, mmParam1))); + // It == 0 ? |It| > sigma13 + mmDiff_epi16 = _mm_and_si128(bSet2_epi16, mmDiff_epi16); + // It == val ? sigma0 < |It| < sigma1 + mmDiff_epi16 = _mm_blendv_epi8(mmDiff_epi16, tmpParam1_epi16, bSet1_epi16); + + + __m128i tale_epi16_ = _mm_blendv_epi8(mmOness_epi16, mmParam2_epi16, bSet1_epi16); // mask for 0 - 3 + // diff = diff * sigma2 + lo = _mm_mullo_epi16(tale_epi16_, mmDiff_epi16); + hi = _mm_mulhi_epi16(tale_epi16_, mmDiff_epi16); + __m128i diff_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), s2bitShift); //diff 0_3_epi32 + __m128i diff_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), s2bitShift); // diff 4_7_epi32 + + mmDiff_epi16 = _mm_packs_epi32(diff_0_3_epi32, diff_4_7_epi32); // ,da das ergebniss kleiner als 16 bit sein sollte + + diff1 = _mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13... + diff0 = _mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9... + + // Ix * diff / Iy * diff + v10 = _mm_mullo_epi16(Ixy_0, diff0); + v11 = _mm_mulhi_epi16(Ixy_0, diff0); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); + qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); + + // It * Ix It * Iy [4 ... 7] + // for set 1 hi sigma 1 + + v10 = _mm_mullo_epi16(Ixy_1, diff1); + v11 = _mm_mulhi_epi16(Ixy_1, diff1); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); + qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); + // diff * J [0 ... 
7] + // for set 1 sigma 1 + // b3 += diff * Iptr[x] + + v10 = _mm_mullo_epi16(mmDiff_epi16, I_0_7_epi16); + v11 = _mm_mulhi_epi16(mmDiff_epi16, I_0_7_epi16); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb2 = _mm_add_ps(qb2, _mm_cvtepi32_ps(v00)); + qb2 = _mm_add_ps(qb2, _mm_cvtepi32_ps(v10)); + + qb3 = _mm_add_ps(qb3, _mm_cvtepi32_ps(diff_0_3_epi32)); + qb3 = _mm_add_ps(qb3, _mm_cvtepi32_ps(diff_4_7_epi32)); + + if (j == 0) + { + __m128 bSet1_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet1_epi16)); + __m128 bSet1_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet1_epi16, bSet1_epi16), 16)); + __m128 mask_0_4_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mask_0_7_epi16)); + __m128 mask_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mask_0_7_epi16, mask_0_7_epi16), 16))); + + __m128 bSet2_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet2_epi16)); + __m128 bSet2_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet2_epi16, bSet2_epi16), 16)); + + __m128 tale_0_3_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_0_3_ps); + __m128 tale_4_7_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_4_7_ps); + tale_0_3_ps = _mm_blendv_ps(mmParam2s, tale_0_3_ps, bSet2_0_3_ps); + tale_4_7_ps = _mm_blendv_ps(mmParam2s, tale_4_7_ps, bSet2_4_7_ps); + + tale_0_3_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_0_3_ps, mask_0_4_ps); + tale_4_7_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_4_7_ps, mask_4_7_ps); + + t0 = _mm_srai_epi32(Ixy_0, 16); // Iy0 Iy1 Iy2 Iy3 + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16); // Ix0 Ix1 Ix2 Ix3 + + __m128 fy = _mm_cvtepi32_ps(t0); + __m128 fx = _mm_cvtepi32_ps(t1); + + // 0 ... 
3 + __m128 I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(I_0_7_epi16, I_0_7_epi16), 16)); + + // A11 - A22 + __m128 fxtale = _mm_mul_ps(fx, tale_0_3_ps); + __m128 fytale = _mm_mul_ps(fy, tale_0_3_ps); + + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); + mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); + mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); + + // sumIx und sumIy + mmSumIx = _mm_add_ps(mmSumIx, fxtale); + mmSumIy = _mm_add_ps(mmSumIy, fytale); + + mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fxtale)); + mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fytale)); + + // sumI + __m128 I_tale_ps = _mm_mul_ps(I_ps, tale_0_3_ps); + mmSumI = _mm_add_ps(mmSumI, I_tale_ps); + + // sumW + mmSumW = _mm_add_ps(mmSumW, tale_0_3_ps); + + // sumDI + mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps(I_ps, I_tale_ps)); + + t0 = _mm_srai_epi32(Ixy_1, 16); // Iy8 Iy9 Iy10 Iy11 + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_1, 16), 16); // Ix0 Ix1 Ix2 Ix3 + + fy = _mm_cvtepi32_ps(t0); + fx = _mm_cvtepi32_ps(t1); + + // 4 ... 
7 + I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(I_0_7_epi16, I_0_7_epi16), 16)); + + // A11 - A22 + fxtale = _mm_mul_ps(fx, tale_4_7_ps); + fytale = _mm_mul_ps(fy, tale_4_7_ps); + + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); + mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); + mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); + + // sumIx und sumIy + mmSumIx = _mm_add_ps(mmSumIx, fxtale); + mmSumIy = _mm_add_ps(mmSumIy, fytale); + + mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fxtale)); + mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fytale)); + + // sumI + I_tale_ps = _mm_mul_ps(I_ps, tale_4_7_ps); + mmSumI = _mm_add_ps(mmSumI, I_tale_ps); + + // sumW + mmSumW = _mm_add_ps(mmSumW, tale_4_7_ps); + + // sumDI + mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps(I_ps, I_tale_ps)); + } - } + } #else - for (; x < _winSize.width*cn; x++, dIptr += 2) - { - if (maskPtr[x] == 0) - continue; - - float J = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + - Jptr[x + step] * iw10 + Jptr[x + step + cn] * iw11, - W_BITS1 - 5); - short ixval = static_cast(dIptr[0]); - short iyval = static_cast(dIptr[1]); - int diff = J - static_cast(Iptr[x]) + Iptr[x] * gainVec.x + gainVec.y; - int abss = (diff < 0) ? 
-diff : diff; - if (diff > MEstimatorScale) - MEstimatorScale += eta; - if (diff < MEstimatorScale) - MEstimatorScale -= eta; - // compute the missmatch vector - - if (abss > fParam1) - { - diff = 0.f; - } - else if (abss > fParam0 && diff >= 0) - { - diff = param[2] * (diff - fParam1); - } - else if (abss > fParam0 && diff < 0) - { - diff = param[2] * (diff + fParam1); - } - - - b1 += (float)(diff*ixval); - b2 += (float)(diff*iyval); ; - b3 += (float)(diff)* Iptr[x]; - b4 += (float)(diff); - - - // compute the Gradient Matrice - if (j == 0) - { - float tale = param[2] * FLT_RESCALE; - if (abss < fParam0 || j < 0) - { - tale = FLT_RESCALE; - } - else if (abss > fParam1) - { - tale *= 0.01f; - } - else - { - tale *= param[2]; - } - if (j == 0) - { - A11 += (float)(ixval*ixval)*tale; - A12 += (float)(ixval*iyval)*tale; - A22 += (float)(iyval*iyval)*tale; - } - - dI += Iptr[x] * Iptr[x] * tale; - float dx = static_cast(dIptr[0]) * tale; - float dy = static_cast(dIptr[1]) * tale; - sumIx += dx; - sumIy += dy; - w1 += dx * Iptr[x]; - w2 += dy * Iptr[x]; - sumI += Iptr[x] * tale; - sumW += tale; - } - - } + for (; x < _winSize.width*cn; x++, dIptr += 2) + { + if (maskPtr[x] == 0) + continue; + + float J = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + + Jptr[x + step] * iw10 + Jptr[x + step + cn] * iw11, + W_BITS1 - 5); + short ixval = static_cast(dIptr[0]); + short iyval = static_cast(dIptr[1]); + int diff = J - static_cast(Iptr[x]) + Iptr[x] * gainVec.x + gainVec.y; + int abss = (diff < 0) ? 
-diff : diff; + if (diff > MEstimatorScale) + MEstimatorScale += eta; + if (diff < MEstimatorScale) + MEstimatorScale -= eta; + // compute the missmatch vector + + if (abss > fParam1) + { + diff = 0.f; + } + else if (abss > fParam0 && diff >= 0) + { + diff = param[2] * (diff - fParam1); + } + else if (abss > fParam0 && diff < 0) + { + diff = param[2] * (diff + fParam1); + } + + + b1 += (float)(diff*ixval); + b2 += (float)(diff*iyval); ; + b3 += (float)(diff)* Iptr[x]; + b4 += (float)(diff); + + + // compute the Gradient Matrice + if (j == 0) + { + float tale = param[2] * FLT_RESCALE; + if (abss < fParam0 || j < 0) + { + tale = FLT_RESCALE; + } + else if (abss > fParam1) + { + tale *= 0.01f; + } + else + { + tale *= param[2]; + } + if (j == 0) + { + A11 += (float)(ixval*ixval)*tale; + A12 += (float)(ixval*iyval)*tale; + A22 += (float)(iyval*iyval)*tale; + } + + dI += Iptr[x] * Iptr[x] * tale; + float dx = static_cast(dIptr[0]) * tale; + float dy = static_cast(dIptr[1]) * tale; + sumIx += dx; + sumIy += dy; + w1 += dx * Iptr[x]; + w2 += dy * Iptr[x]; + sumI += Iptr[x] * tale; + sumW += tale; + } + + } #endif - } - - -#ifdef CV_SSE2 - short etaValues[8]; - _mm_storeu_si128((__m128i*)(etaValues), mmEta); - MEstimatorScale += eta * (etaValues[0] + etaValues[1] + etaValues[2] + etaValues[3] - + etaValues[4] + etaValues[5] + etaValues[6] + etaValues[7]); - -#endif - float CV_DECL_ALIGNED(32) wbuf[4];// - if (j == 0) - { -#ifdef CV_SSE2 - _mm_store_ps(wbuf, mmSumW1); - w1 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumW2); - w2 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumDI); - dI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumI); - sumI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumIx); - sumIx = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumIy); - sumIy = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumW); - sumW = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; 
+ } + + +#ifdef CV_SSE4_1 + short etaValues[8]; + _mm_storeu_si128((__m128i*)(etaValues), mmEta); + MEstimatorScale += eta * (etaValues[0] + etaValues[1] + etaValues[2] + etaValues[3] + + etaValues[4] + etaValues[5] + etaValues[6] + etaValues[7]); + +#endif + float CV_DECL_ALIGNED(32) wbuf[4];// + if (j == 0) + { +#ifdef CV_SSE4_1 + _mm_store_ps(wbuf, mmSumW1); + w1 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumW2); + w2 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumDI); + dI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumI); + sumI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumIx); + sumIx = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumIy); + sumIy = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumW); + sumW = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; #endif - sumIx *= -FLT_SCALE; - sumIy *= -FLT_SCALE; - sumI *= FLT_SCALE; - sumW *= FLT_SCALE; - w1 *= -FLT_SCALE; - w2 *= -FLT_SCALE; - dI *= FLT_SCALE; + sumIx *= -FLT_SCALE; + sumIy *= -FLT_SCALE; + sumI *= FLT_SCALE; + sumW *= FLT_SCALE; + w1 *= -FLT_SCALE; + w2 *= -FLT_SCALE; + dI *= FLT_SCALE; -#ifdef CV_SSE2 - float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];// +#ifdef CV_SSE4_1 - _mm_store_ps(A11buf, mmAxx); - _mm_store_ps(A12buf, mmAxy); - _mm_store_ps(A22buf, mmAyy); + float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];// + _mm_store_ps(A11buf, mmAxx); + _mm_store_ps(A12buf, mmAxy); + _mm_store_ps(A22buf, mmAyy); - A11 = A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; - A12 = A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; - A22 = A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; + + A11 = A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; + A12 = A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; + A22 = A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; #endif - A11 *= FLT_SCALE; - A12 *= FLT_SCALE; - A22 *= FLT_SCALE; - - - } - - -#ifdef CV_SSE2 - float CV_DECL_ALIGNED(16) bbuf[4]; - 
_mm_store_ps(bbuf, _mm_add_ps(qb0, qb1)); - b1 = bbuf[0] + bbuf[2]; - b2 = bbuf[1] + bbuf[3]; - _mm_store_ps(bbuf, qb2); - b3 = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; - _mm_store_ps(bbuf, qb3); - b4 = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; - + A11 *= FLT_SCALE; + A12 *= FLT_SCALE; + A22 *= FLT_SCALE; + + + } + + +#ifdef CV_SSE4_1 + float CV_DECL_ALIGNED(16) bbuf[4]; + _mm_store_ps(bbuf, _mm_add_ps(qb0, qb1)); + b1 = bbuf[0] + bbuf[2]; + b2 = bbuf[1] + bbuf[3]; + _mm_store_ps(bbuf, qb2); + b3 = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; + _mm_store_ps(bbuf, qb3); + b4 = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; + #endif - mismatchMat.at(0, 0) = b1 * FLT_SCALE; - mismatchMat.at(1, 0) = b2 * FLT_SCALE; - mismatchMat.at(2, 0) = -b3 * FLT_SCALE; - mismatchMat.at(3, 0) = -b4 * FLT_SCALE; - - D = -A12 * A12*sumI*sumI + dI * sumW*A12*A12 + 2 * A12*sumI*sumIx*w2 + 2 * A12*sumI*sumIy*w1 - - 2 * dI*A12*sumIx*sumIy - 2 * sumW*A12*w1*w2 + A11 * A22*sumI*sumI - 2 * A22*sumI*sumIx*w1 - - 2 * A11*sumI*sumIy*w2 - sumIx * sumIx*w2*w2 + A22 * dI*sumIx*sumIx + 2 * sumIx*sumIy*w1*w2 - - sumIy * sumIy*w1*w1 + A11 * dI*sumIy*sumIy + A22 * sumW*w1*w1 + A11 * sumW*w2*w2 - A11 * A22*dI*sumW; - - float sqrtVal = std::sqrt((A11 - A22)*(A11 - A22) + 4.f*A12*A12); - minEigValue = (A22 + A11 - sqrtVal) / (2.0f*winArea);; - maxEigValue = (A22 + A11 + sqrtVal) / (2.0f*winArea); - if (minEigValue < minEigThreshold || std::abs(D) < FLT_EPSILON) - { - if (level == 0 && status) - status[ptidx] = 0; - if (level > 0) - { - nextPts[ptidx] = backUpNextPt; - gainVecs[ptidx] = backUpGain; - } - break; - } - - D = (1.f / D); - - invTensorMat.at(0, 0) = (A22*sumI*sumI - 2 * sumI*sumIy*w2 + dI * sumIy*sumIy + sumW * w2*w2 - A22 * dI*sumW)* D; invTensorMat.at(0, 1) = (A12*dI*sumW - A12 * sumI * sumI - dI * sumIx*sumIy + sumI * sumIx*w2 + sumI * sumIy*w1 - sumW * w1*w2)* D; - invTensorMat.at(0, 2) = (A12*sumI*sumIy - sumIy * sumIy*w1 - A22 * sumI*sumIx - A12 * sumW*w2 + A22 * sumW*w1 + sumIx * sumIy*w2)* D; - 
invTensorMat.at(0, 3) = (A22*dI*sumIx - A12 * dI*sumIy - sumIx * w2*w2 + A12 * sumI*w2 - A22 * sumI*w1 + sumIy * w1*w2) * D; - invTensorMat.at(1, 0) = invTensorMat.at(0, 1); - invTensorMat.at(1, 1) = (A11*sumI * sumI - 2 * sumI*sumIx*w1 + dI * sumIx * sumIx + sumW * w1*w1 - A11 * dI*sumW) * D; - invTensorMat.at(1, 2) = (A12*sumI*sumIx - A11 * sumI*sumIy - sumIx * sumIx*w2 + A11 * sumW*w2 - A12 * sumW*w1 + sumIx * sumIy*w1) * D; - invTensorMat.at(1, 3) = (A11*dI*sumIy - sumIy * w1*w1 - A12 * dI*sumIx - A11 * sumI*w2 + A12 * sumI*w1 + sumIx * w1*w2)* D; - invTensorMat.at(2, 0) = invTensorMat.at(0, 2); - invTensorMat.at(2, 1) = invTensorMat.at(1, 2); - invTensorMat.at(2, 2) = (sumW*A12*A12 - 2 * A12*sumIx*sumIy + A22 * sumIx*sumIx + A11 * sumIy*sumIy - A11 * A22*sumW)* D; - invTensorMat.at(2, 3) = (A11*A22*sumI - A12 * A12*sumI - A11 * sumIy*w2 + A12 * sumIx*w2 + A12 * sumIy*w1 - A22 * sumIx*w1)* D; - invTensorMat.at(3, 0) = invTensorMat.at(0, 3); - invTensorMat.at(3, 1) = invTensorMat.at(1, 3); - invTensorMat.at(3, 2) = invTensorMat.at(2, 3); - invTensorMat.at(3, 3) = (dI*A12*A12 - 2 * A12*w1*w2 + A22 * w1*w1 + A11 * w2*w2 - A11 * A22*dI)* D; - - resultMat = invTensorMat * mismatchMat; - - - Point2f delta(-resultMat.at(0), -resultMat.at(1)); - Point2f deltaGain(resultMat.at(2), resultMat.at(3)); - - - - - if (j == 0) - prevGain = deltaGain; - gainVec += deltaGain * 0.8; - nextPt += delta * 0.8; - nextPts[ptidx] = nextPt + halfWin; - gainVecs[ptidx] = gainVec; - - - - cv::Point2f totalDistance = backUpNextPt - nextPts[ptidx]; - - if (// j > 1 && - (std::abs(delta.x - prevDelta.x) < 0.01 && std::abs(delta.y - prevDelta.y) < 0.01) - || ((delta.ddot(delta) <= 0.001) && std::abs(prevGain.x - deltaGain.x) < 0.01) - ) - { - nextPts[ptidx] -= delta * 0.5f; - gainVecs[ptidx] -= deltaGain * 0.5f; - break; - } - - prevDelta = delta; - prevGain = deltaGain; - } - - } - - } - - const Mat* prevImg; - const Mat* nextImg; - const Mat* prevDeriv; - const Mat* rgbPrevImg; - const Mat* 
rgbNextImg; - const Point2f* prevPts; - Point2f* nextPts; - uchar* status; - cv::Point2f* gainVecs; - float* err; - int maxWinSize; - int minWinSize; - TermCriteria criteria; - int level; - int maxLevel; - int windowType; - float minEigThreshold; - bool useInitialFlow; - std::vector paramReset; - int crossSegmentationThreshold; - }; + mismatchMat.at(0, 0) = b1 * FLT_SCALE; + mismatchMat.at(1, 0) = b2 * FLT_SCALE; + mismatchMat.at(2, 0) = -b3 * FLT_SCALE; + mismatchMat.at(3, 0) = -b4 * FLT_SCALE; + + D = -A12 * A12*sumI*sumI + dI * sumW*A12*A12 + 2 * A12*sumI*sumIx*w2 + 2 * A12*sumI*sumIy*w1 + - 2 * dI*A12*sumIx*sumIy - 2 * sumW*A12*w1*w2 + A11 * A22*sumI*sumI - 2 * A22*sumI*sumIx*w1 + - 2 * A11*sumI*sumIy*w2 - sumIx * sumIx*w2*w2 + A22 * dI*sumIx*sumIx + 2 * sumIx*sumIy*w1*w2 + - sumIy * sumIy*w1*w1 + A11 * dI*sumIy*sumIy + A22 * sumW*w1*w1 + A11 * sumW*w2*w2 - A11 * A22*dI*sumW; + + float sqrtVal = std::sqrt((A11 - A22)*(A11 - A22) + 4.f*A12*A12); + minEigValue = (A22 + A11 - sqrtVal) / (2.0f*winArea); + if (minEigValue < minEigThreshold || std::abs(D) < FLT_EPSILON) + { + if (level == 0 && status) + status[ptidx] = 0; + if (level > 0) + { + nextPts[ptidx] = backUpNextPt; + gainVecs[ptidx] = backUpGain; + } + break; + } + + D = (1.f / D); + + invTensorMat.at(0, 0) = (A22*sumI*sumI - 2 * sumI*sumIy*w2 + dI * sumIy*sumIy + sumW * w2*w2 - A22 * dI*sumW)* D; invTensorMat.at(0, 1) = (A12*dI*sumW - A12 * sumI * sumI - dI * sumIx*sumIy + sumI * sumIx*w2 + sumI * sumIy*w1 - sumW * w1*w2)* D; + invTensorMat.at(0, 2) = (A12*sumI*sumIy - sumIy * sumIy*w1 - A22 * sumI*sumIx - A12 * sumW*w2 + A22 * sumW*w1 + sumIx * sumIy*w2)* D; + invTensorMat.at(0, 3) = (A22*dI*sumIx - A12 * dI*sumIy - sumIx * w2*w2 + A12 * sumI*w2 - A22 * sumI*w1 + sumIy * w1*w2) * D; + invTensorMat.at(1, 0) = invTensorMat.at(0, 1); + invTensorMat.at(1, 1) = (A11*sumI * sumI - 2 * sumI*sumIx*w1 + dI * sumIx * sumIx + sumW * w1*w1 - A11 * dI*sumW) * D; + invTensorMat.at(1, 2) = (A12*sumI*sumIx - A11 * 
sumI*sumIy - sumIx * sumIx*w2 + A11 * sumW*w2 - A12 * sumW*w1 + sumIx * sumIy*w1) * D; + invTensorMat.at(1, 3) = (A11*dI*sumIy - sumIy * w1*w1 - A12 * dI*sumIx - A11 * sumI*w2 + A12 * sumI*w1 + sumIx * w1*w2)* D; + invTensorMat.at(2, 0) = invTensorMat.at(0, 2); + invTensorMat.at(2, 1) = invTensorMat.at(1, 2); + invTensorMat.at(2, 2) = (sumW*A12*A12 - 2 * A12*sumIx*sumIy + A22 * sumIx*sumIx + A11 * sumIy*sumIy - A11 * A22*sumW)* D; + invTensorMat.at(2, 3) = (A11*A22*sumI - A12 * A12*sumI - A11 * sumIy*w2 + A12 * sumIx*w2 + A12 * sumIy*w1 - A22 * sumIx*w1)* D; + invTensorMat.at(3, 0) = invTensorMat.at(0, 3); + invTensorMat.at(3, 1) = invTensorMat.at(1, 3); + invTensorMat.at(3, 2) = invTensorMat.at(2, 3); + invTensorMat.at(3, 3) = (dI*A12*A12 - 2 * A12*w1*w2 + A22 * w1*w1 + A11 * w2*w2 - A11 * A22*dI)* D; + + resultMat = invTensorMat * mismatchMat; + + + Point2f delta(-resultMat.at(0), -resultMat.at(1)); + Point2f deltaGain(resultMat.at(2), resultMat.at(3)); + + + + + if (j == 0) + prevGain = deltaGain; + gainVec += deltaGain * 0.8; + nextPt += delta * 0.8; + nextPts[ptidx] = nextPt + halfWin; + gainVecs[ptidx] = gainVec; + + + if (// j > 1 && + (std::abs(delta.x - prevDelta.x) < 0.01 && std::abs(delta.y - prevDelta.y) < 0.01) + || ((delta.ddot(delta) <= 0.001) && std::abs(prevGain.x - deltaGain.x) < 0.01) + ) + { + nextPts[ptidx] -= delta * 0.5f; + gainVecs[ptidx] -= deltaGain * 0.5f; + break; + } + + prevDelta = delta; + prevGain = deltaGain; + } + + } + + } + + const Mat* prevImg; + const Mat* nextImg; + const Mat* prevDeriv; + const Mat* rgbPrevImg; + const Mat* rgbNextImg; + const Point2f* prevPts; + Point2f* nextPts; + uchar* status; + cv::Point2f* gainVecs; + float* err; + int maxWinSize; + int minWinSize; + TermCriteria criteria; + int level; + int maxLevel; + int windowType; + float minEigThreshold; + bool useInitialFlow; + std::vector paramReset; + int crossSegmentationThreshold; + }; } } } diff --git a/modules/optflow/src/rlof/rlof_invokerbase.hpp 
b/modules/optflow/src/rlof/rlof_invokerbase.hpp index cb6243bf2aa..cb03f61384b 100644 --- a/modules/optflow/src/rlof/rlof_invokerbase.hpp +++ b/modules/optflow/src/rlof/rlof_invokerbase.hpp @@ -1,44 +1,6 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// - // - // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. - // - // By downloading, copying, installing or using the software you agree to this license. - // If you do not agree to this license, do not download, install, - // copy or use the software. - // - // - // License Agreement - // For Open Source Computer Vision Library - // - // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. - // Copyright (C) 2009, Willow Garage Inc., all rights reserved. - // Third party copyrights are property of their respective owners. - // - // Redistribution and use in source and binary forms, with or without modification, - // are permitted provided that the following conditions are met: - // - // * Redistribution's of source code must retain the above copyright notice, - // this list of conditions and the following disclaimer. - // - // * Redistribution's in binary form must reproduce the above copyright notice, - // this list of conditions and the following disclaimer in the documentation - // and/or other materials provided with the distribution. - // - // * The name of the copyright holders may not be used to endorse or promote products - // derived from this software without specific prior written permission. - // - // This software is provided by the copyright holders and contributors "as is" and - // any express or implied warranties, including, but not limited to, the implied - // warranties of merchantability and fitness for a particular purpose are disclaimed. 
- // In no event shall the Intel Corporation or contributors be liable for any direct, - // indirect, incidental, special, exemplary, or consequential damages - // (including, but not limited to, procurement of substitute goods or services; - // loss of use, data, or profits; or business interruption) however caused - // and on any theory of liability, whether in contract, strict liability, - // or tort (including negligence or otherwise) arising in any way out of - // the use of this software, even if advised of the possibility of such damage. - // - //M*/ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. #ifndef _RLOF_INVOKERBASE_HPP_ #define _RLOF_INVOKERBASE_HPP_ @@ -60,250 +22,250 @@ using namespace cv; namespace cv { - namespace optflow - { - typedef short deriv_type; - inline void get4BitMask(const int & width, __m128i & mask) - { - int noBits = width - static_cast(floor(width / 4.f) * 4.f); - unsigned int val[4]; - for (int n = 0; n < 4; n++) - { - val[n] = (noBits > n) ? (std::numeric_limits::max()) : 0; - } - mask = _mm_set_epi32(val[3], val[2], val[1], val[0]); +namespace optflow +{ + typedef short deriv_type; + inline void get4BitMask(const int & width, __m128i & mask) + { + int noBits = width - static_cast(floor(width / 4.f) * 4.f); + unsigned int val[4]; + for (int n = 0; n < 4; n++) + { + val[n] = (noBits > n) ? (std::numeric_limits::max()) : 0; + } + mask = _mm_set_epi32(val[3], val[2], val[1], val[0]); - } - inline void getWBitMask(const int & width, __m128i & t0, __m128i & t1, __m128i & t2) - { - int noBits = width - static_cast(floor(width / 8.f) * 8.f); - unsigned short val[8]; - for (int n = 0; n < 8; n++) - { - val[n] = (noBits > n) ? 
(0xffff) : 0; - } - t1 = _mm_set_epi16(val[7], val[7], val[6], val[6], val[5], val[5], val[4], val[4]); - t0 = _mm_set_epi16(val[3], val[3], val[2], val[2], val[1], val[1], val[0], val[0]); - t2 = _mm_set_epi16(val[7], val[6], val[5], val[4], val[3], val[2], val[1], val[0]); - } + } + inline void getWBitMask(const int & width, __m128i & t0, __m128i & t1, __m128i & t2) + { + int noBits = width - static_cast(floor(width / 8.f) * 8.f); + unsigned short val[8]; + for (int n = 0; n < 8; n++) + { + val[n] = (noBits > n) ? (0xffff) : 0; + } + t1 = _mm_set_epi16(val[7], val[7], val[6], val[6], val[5], val[5], val[4], val[4]); + t0 = _mm_set_epi16(val[3], val[3], val[2], val[2], val[1], val[1], val[0], val[0]); + t2 = _mm_set_epi16(val[7], val[6], val[5], val[4], val[3], val[2], val[1], val[0]); + } - typedef uchar tMaskType; + typedef uchar tMaskType; #define tCVMaskType CV_8UC1 #define MaskSet 0xffffffff - inline bool notSameColor(const cv::Point3_ & ref, int r, int c, const cv::Mat & img, int winSize, int threshold) - { - if (r >= img.rows + winSize || c >= img.cols + winSize || r < -winSize || c < -winSize) return true; - int step = static_cast(img.step1()); - const cv::Point3_ * tval = img.ptr>(); - tval += c; - tval += r * step / 3; - int R = std::abs((int)ref.x - (int)tval->x); - int G = std::abs((int)ref.y - (int)tval->y); - int B = std::abs((int)ref.z - (int)tval->z); - int diff = MAX(R, MAX(G, B)); - return diff > threshold; - } + inline bool notSameColor(const cv::Point3_ & ref, int r, int c, const cv::Mat & img, int winSize, int threshold) + { + if (r >= img.rows + winSize || c >= img.cols + winSize || r < -winSize || c < -winSize) return true; + int step = static_cast(img.step1()); + const cv::Point3_ * tval = img.ptr>(); + tval += c; + tval += r * step / 3; + int R = std::abs((int)ref.x - (int)tval->x); + int G = std::abs((int)ref.y - (int)tval->y); + int B = std::abs((int)ref.z - (int)tval->z); + int diff = MAX(R, MAX(G, B)); + return diff > threshold; + } - 
/*! Estimate the mask with points of the same color as the middle point - *\param prevGray input CV_8UC1 - *\param prevImg input CV_8UC3 - *\param nextImg input CV_8UC3 - *\param prevPoint global position of the middle point of the mask window - *\param nextPoint global position of the middle point of the mask window - *\param winPointMask mask matrice with 1 labeling points contained and 0 point is not contained by the segment - *\param noPoints number of points contained by the segment - *\param winRoi rectangle of the region of interesset in global coordinates - *\param minWinSize, - *\param threshold, - *\param useBothImages - */ - static void getLocalPatch( - const cv::Mat & prevImg, - const cv::Point2i & prevPoint, // feature points - cv::Mat & winPointMask, - int & noPoints, - cv::Rect & winRoi, - int minWinSize, - int threshold) - { - int maxWinSizeH = (winPointMask.cols - 1) / 2; - winRoi.x = prevPoint.x; - winRoi.y = prevPoint.y; - winRoi.width = winPointMask.cols; - winRoi.height = winPointMask.rows; + /*! 
Estimate the mask with points of the same color as the middle point + *\param prevGray input CV_8UC1 + *\param prevImg input CV_8UC3 + *\param nextImg input CV_8UC3 + *\param prevPoint global position of the middle point of the mask window + *\param nextPoint global position of the middle point of the mask window + *\param winPointMask mask matrice with 1 labeling points contained and 0 point is not contained by the segment + *\param noPoints number of points contained by the segment + *\param winRoi rectangle of the region of interesset in global coordinates + *\param minWinSize, + *\param threshold, + *\param useBothImages + */ + static void getLocalPatch( + const cv::Mat & prevImg, + const cv::Point2i & prevPoint, // feature points + cv::Mat & winPointMask, + int & noPoints, + cv::Rect & winRoi, + int minWinSize, + int threshold) + { + int maxWinSizeH = (winPointMask.cols - 1) / 2; + winRoi.x = prevPoint.x; + winRoi.y = prevPoint.y; + winRoi.width = winPointMask.cols; + winRoi.height = winPointMask.rows; - if (minWinSize == winPointMask.cols) - { - winRoi.x = prevPoint.x - maxWinSizeH; - winRoi.y = prevPoint.y - maxWinSizeH; - winPointMask.setTo(1); - noPoints = winPointMask.size().area(); - return; - } - noPoints = 0; - int c = prevPoint.x; - int r = prevPoint.y; - int c_left = c - 1; - int c_right = c + 1; - int r_top = r - 1; - int r_bottom = r; - int border_left = c - maxWinSizeH; - int border_right = c + maxWinSizeH; - int border_top = r - maxWinSizeH; - int border_bottom = r + maxWinSizeH; - int c_local_diff = prevPoint.x - maxWinSizeH; - int r_local_diff = prevPoint.y - maxWinSizeH; - int _c = c - c_local_diff; - int _r = r - r_local_diff; - int min_r = _r; - int max_r = _r; - int min_c = _c; - int max_c = _c; - // horizontal line - if (r < 0 || r >= prevImg.rows || c < 0 || c >= prevImg.cols) - { - noPoints = 0; - return; - } - cv::Point3_ val1 = prevImg.at>(r, c); // middle grayvalue - cv::Point3_ tval; + if (minWinSize == winPointMask.cols) + { + 
winRoi.x = prevPoint.x - maxWinSizeH; + winRoi.y = prevPoint.y - maxWinSizeH; + winPointMask.setTo(1); + noPoints = winPointMask.size().area(); + return; + } + noPoints = 0; + int c = prevPoint.x; + int r = prevPoint.y; + int c_left = c - 1; + int c_right = c + 1; + int r_top = r - 1; + int r_bottom = r; + int border_left = c - maxWinSizeH; + int border_right = c + maxWinSizeH; + int border_top = r - maxWinSizeH; + int border_bottom = r + maxWinSizeH; + int c_local_diff = prevPoint.x - maxWinSizeH; + int r_local_diff = prevPoint.y - maxWinSizeH; + int _c = c - c_local_diff; + int _r = r - r_local_diff; + int min_r = _r; + int max_r = _r; + int min_c = _c; + int max_c = _c; + // horizontal line + if (r < 0 || r >= prevImg.rows || c < 0 || c >= prevImg.cols) + { + noPoints = 0; + return; + } + cv::Point3_ val1 = prevImg.at>(r, c); // middle grayvalue + cv::Point3_ tval; - //vertical line - for (int dr = r_top; dr >= border_top; dr--) - { - if (notSameColor(val1, dr, c, prevImg, maxWinSizeH, threshold)) - break; + //vertical line + for (int dr = r_top; dr >= border_top; dr--) + { + if (notSameColor(val1, dr, c, prevImg, maxWinSizeH, threshold)) + break; - int _dr = dr - r_local_diff; - min_r = MIN(min_r, _dr); - max_r = MAX(max_r, _dr); - winPointMask.at(_dr, _c) = 1; - noPoints++; - } - for (int dr = r_bottom; dr < border_bottom; dr++) - { - if (notSameColor(val1, dr, c, prevImg, maxWinSizeH, threshold) - ) - break; + int _dr = dr - r_local_diff; + min_r = MIN(min_r, _dr); + max_r = MAX(max_r, _dr); + winPointMask.at(_dr, _c) = 1; + noPoints++; + } + for (int dr = r_bottom; dr < border_bottom; dr++) + { + if (notSameColor(val1, dr, c, prevImg, maxWinSizeH, threshold) + ) + break; - int _dr = dr - r_local_diff; - min_r = MIN(min_r, _dr); - max_r = MAX(max_r, _dr); - winPointMask.at(_dr, _c) = 1; - noPoints++; - } - // accumulate along the vertical line and the search line that was still labled - for (int dr = min_r + r_local_diff; dr <= max_r + r_local_diff; dr++) - { 
- int _dr = dr - r_local_diff; - if (winPointMask.at(_dr, _c) == 0) - { - winPointMask.row(_dr).setTo(0); - continue; - } - bool skip = false; - int _dc = c_right - c_local_diff; - for (int dc = c_right; dc < border_right; dc++, _dc++) - { - if (skip == false) - { - if (notSameColor(val1, dr, dc, prevImg, maxWinSizeH, threshold)) - skip = true; - } - if (skip == false) - { - min_c = MIN(min_c, _dc); - max_c = MAX(max_c, _dc); - winPointMask.at(_dr, _dc) = 1; - noPoints++; - } - else - winPointMask.at(_dr, _dc) = 0; - } + int _dr = dr - r_local_diff; + min_r = MIN(min_r, _dr); + max_r = MAX(max_r, _dr); + winPointMask.at(_dr, _c) = 1; + noPoints++; + } + // accumulate along the vertical line and the search line that was still labled + for (int dr = min_r + r_local_diff; dr <= max_r + r_local_diff; dr++) + { + int _dr = dr - r_local_diff; + if (winPointMask.at(_dr, _c) == 0) + { + winPointMask.row(_dr).setTo(0); + continue; + } + bool skip = false; + int _dc = c_right - c_local_diff; + for (int dc = c_right; dc < border_right; dc++, _dc++) + { + if (skip == false) + { + if (notSameColor(val1, dr, dc, prevImg, maxWinSizeH, threshold)) + skip = true; + } + if (skip == false) + { + min_c = MIN(min_c, _dc); + max_c = MAX(max_c, _dc); + winPointMask.at(_dr, _dc) = 1; + noPoints++; + } + else + winPointMask.at(_dr, _dc) = 0; + } - skip = false; - _dc = c_left - c_local_diff; - for (int dc = c_left; dc >= border_left; dc--, _dc--) - { - if (skip == false) - { - if (notSameColor(val1, dr, dc, prevImg, maxWinSizeH, threshold)) - skip = true; - } - if (skip == false) - { - min_c = MIN(min_c, _dc); - max_c = MAX(max_c, _dc); - winPointMask.at(_dr, _dc) = 1; - noPoints++; - } - else - winPointMask.at(_dr, _dc) = 0; - } - } + skip = false; + _dc = c_left - c_local_diff; + for (int dc = c_left; dc >= border_left; dc--, _dc--) + { + if (skip == false) + { + if (notSameColor(val1, dr, dc, prevImg, maxWinSizeH, threshold)) + skip = true; + } + if (skip == false) + { + min_c = 
MIN(min_c, _dc); + max_c = MAX(max_c, _dc); + winPointMask.at(_dr, _dc) = 1; + noPoints++; + } + else + winPointMask.at(_dr, _dc) = 0; + } + } - // get the initial small window - if (noPoints < minWinSize * minWinSize) - { - cv::Rect roi(winPointMask.cols / 2 - (minWinSize - 1) / 2, - winPointMask.rows / 2 - (minWinSize - 1) / 2, - minWinSize, minWinSize); - cv::Mat(winPointMask, roi).setTo(1); - min_c = MIN(MIN(min_c, roi.tl().x), roi.br().x - 1); - max_c = MAX(MAX(max_c, roi.tl().x), roi.br().x - 1); - min_r = MIN(MIN(min_r, roi.tl().y), roi.br().y - 1); - max_r = MAX(MAX(max_r, roi.tl().y), roi.br().y - 1); - noPoints += minWinSize * minWinSize; - } - winRoi.x = c_local_diff + min_c; - winRoi.y = r_local_diff + min_r; - winRoi.width = max_c - min_c + 1; - winRoi.height = max_r - min_r + 1; - winPointMask = winPointMask(cv::Rect(min_c, min_r, winRoi.width, winRoi.height)); - } + // get the initial small window + if (noPoints < minWinSize * minWinSize) + { + cv::Rect roi(winPointMask.cols / 2 - (minWinSize - 1) / 2, + winPointMask.rows / 2 - (minWinSize - 1) / 2, + minWinSize, minWinSize); + cv::Mat(winPointMask, roi).setTo(1); + min_c = MIN(MIN(min_c, roi.tl().x), roi.br().x - 1); + max_c = MAX(MAX(max_c, roi.tl().x), roi.br().x - 1); + min_r = MIN(MIN(min_r, roi.tl().y), roi.br().y - 1); + max_r = MAX(MAX(max_r, roi.tl().y), roi.br().y - 1); + noPoints += minWinSize * minWinSize; + } + winRoi.x = c_local_diff + min_c; + winRoi.y = r_local_diff + min_r; + winRoi.width = max_c - min_c + 1; + winRoi.height = max_r - min_r + 1; + winPointMask = winPointMask(cv::Rect(min_c, min_r, winRoi.width, winRoi.height)); + } - inline bool calcWinMaskMat( const cv::Mat & BI, - const int windowType, - const cv::Point2i & iprevPt, - cv::Mat & winMaskMat, - cv::Size & winSize, - cv::Point2f & halfWin, - int & winArea, - const int minWinSize, - const int maxWinSize, - const int crossSegmentationThreshold) - { + inline bool calcWinMaskMat( const cv::Mat & BI, + const int windowType, 
+ const cv::Point2i & iprevPt, + cv::Mat & winMaskMat, + cv::Size & winSize, + cv::Point2f & halfWin, + int & winArea, + const int minWinSize, + const int maxWinSize, + const int crossSegmentationThreshold) + { - if (windowType == RLOFOpticalFlowParameter::SR_CROSS && maxWinSize != minWinSize) - { - // patch generation - cv::Rect winRoi; - getLocalPatch(BI, iprevPt, winMaskMat, winArea, winRoi, minWinSize, crossSegmentationThreshold); - if (winArea == 0) - return false; - winSize = winRoi.size(); - halfWin = Point2f(static_cast(iprevPt.x - winRoi.tl().x), - static_cast(iprevPt.y - winRoi.tl().y)); - } - else - { - winSize = cv::Size(maxWinSize, maxWinSize); - halfWin = Point2f((winSize.width - 1) / 2.f, (winSize.height - 1) / 2.f); - winMaskMat.setTo(1); - } - return true; - } + if (windowType == SR_CROSS && maxWinSize != minWinSize) + { + // patch generation + cv::Rect winRoi; + getLocalPatch(BI, iprevPt, winMaskMat, winArea, winRoi, minWinSize, crossSegmentationThreshold); + if (winArea == 0) + return false; + winSize = winRoi.size(); + halfWin = Point2f(static_cast(iprevPt.x - winRoi.tl().x), + static_cast(iprevPt.y - winRoi.tl().y)); + } + else + { + winSize = cv::Size(maxWinSize, maxWinSize); + halfWin = Point2f((winSize.width - 1) / 2.f, (winSize.height - 1) / 2.f); + winMaskMat.setTo(1); + } + return true; + } - inline short estimateScale(cv::Mat & residuals) - { - cv::Mat absMat = cv::abs(residuals); - return quickselect(absMat, absMat.rows / 2); - } - } + inline short estimateScale(cv::Mat & residuals) + { + cv::Mat absMat = cv::abs(residuals); + return quickselect(absMat, absMat.rows / 2); + } +} - + } #endif \ No newline at end of file diff --git a/modules/optflow/src/rlof/rlof_localflow.cpp b/modules/optflow/src/rlof/rlof_localflow.cpp index 3f995fd5477..b34a9c26e30 100644 --- a/modules/optflow/src/rlof/rlof_localflow.cpp +++ b/modules/optflow/src/rlof/rlof_localflow.cpp @@ -1,44 +1,6 @@ 
-/*M/////////////////////////////////////////////////////////////////////////////////////// - // - // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. - // - // By downloading, copying, installing or using the software you agree to this license. - // If you do not agree to this license, do not download, install, - // copy or use the software. - // - // - // License Agreement - // For Open Source Computer Vision Library - // - // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. - // Copyright (C) 2009, Willow Garage Inc., all rights reserved. - // Third party copyrights are property of their respective owners. - // - // Redistribution and use in source and binary forms, with or without modification, - // are permitted provided that the following conditions are met: - // - // * Redistribution's of source code must retain the above copyright notice, - // this list of conditions and the following disclaimer. - // - // * Redistribution's in binary form must reproduce the above copyright notice, - // this list of conditions and the following disclaimer in the documentation - // and/or other materials provided with the distribution. - // - // * The name of the copyright holders may not be used to endorse or promote products - // derived from this software without specific prior written permission. - // - // This software is provided by the copyright holders and contributors "as is" and - // any express or implied warranties, including, but not limited to, the implied - // warranties of merchantability and fitness for a particular purpose are disclaimed. 
- // In no event shall the Intel Corporation or contributors be liable for any direct, - // indirect, incidental, special, exemplary, or consequential damages - // (including, but not limited to, procurement of substitute goods or services; - // loss of use, data, or profits; or business interruption) however caused - // and on any theory of liability, whether in contract, strict liability, - // or tort (including negligence or otherwise) arising in any way out of - // the use of this software, even if advised of the possibility of such damage. - // - //M*/ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. #include "precomp.hpp" #include "rlof_localflow.h" #ifdef HAVE_OPENCV_CALIB3D @@ -52,535 +14,541 @@ using namespace cv; namespace cv { - namespace detail - { + namespace detail + { - typedef short deriv_type; - } + typedef short deriv_type; + } namespace { - static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst) - { - using namespace cv; - using cv::detail::deriv_type; - int rows = src.rows, cols = src.cols, cn = src.channels(), colsn = cols * cn, depth = src.depth(); - CV_Assert(depth == CV_8U); - dst.create(rows, cols, CV_MAKETYPE(DataType::depth, cn * 2)); - - int x, y, delta = (int)alignSize((cols + 2)*cn, 16); - AutoBuffer _tempBuf(delta * 2 + 64); - deriv_type *trow0 = alignPtr(_tempBuf.data() + cn, 16), *trow1 = alignPtr(trow0 + delta, 16); + static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst) + { + using namespace cv; + using cv::detail::deriv_type; + int rows = src.rows, cols = src.cols, cn = src.channels(), colsn = cols * cn, depth = src.depth(); + CV_Assert(depth == CV_8U); + dst.create(rows, cols, CV_MAKETYPE(DataType::depth, cn * 2)); + + int x, y, delta = (int)alignSize((cols + 2)*cn, 16); + AutoBuffer _tempBuf(delta * 2 + 64); + deriv_type *trow0 = alignPtr(_tempBuf.data() + cn, 16), 
*trow1 = alignPtr(trow0 + delta, 16); #if CV_SIMD128 - v_int16x8 c3 = v_setall_s16(3), c10 = v_setall_s16(10); - bool haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); + v_int16x8 c3 = v_setall_s16(3), c10 = v_setall_s16(10); + bool haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); #endif - for (y = 0; y < rows; y++) - { - const uchar* srow0 = src.ptr(y > 0 ? y - 1 : rows > 1 ? 1 : 0); - const uchar* srow1 = src.ptr(y); - const uchar* srow2 = src.ptr(y < rows - 1 ? y + 1 : rows > 1 ? rows - 2 : 0); - deriv_type* drow = dst.ptr(y); + for (y = 0; y < rows; y++) + { + const uchar* srow0 = src.ptr(y > 0 ? y - 1 : rows > 1 ? 1 : 0); + const uchar* srow1 = src.ptr(y); + const uchar* srow2 = src.ptr(y < rows - 1 ? y + 1 : rows > 1 ? rows - 2 : 0); + deriv_type* drow = dst.ptr(y); - // do vertical convolution - x = 0; + // do vertical convolution + x = 0; #if CV_SIMD128 - if (haveSIMD) - { - for (; x <= colsn - 8; x += 8) - { - v_int16x8 s0 = v_reinterpret_as_s16(v_load_expand(srow0 + x)); - v_int16x8 s1 = v_reinterpret_as_s16(v_load_expand(srow1 + x)); - v_int16x8 s2 = v_reinterpret_as_s16(v_load_expand(srow2 + x)); - - v_int16x8 t1 = s2 - s0; - v_int16x8 t0 = v_mul_wrap(s0 + s2, c3) + v_mul_wrap(s1, c10); - - v_store(trow0 + x, t0); - v_store(trow1 + x, t1); - } - } + if (haveSIMD) + { + for (; x <= colsn - 8; x += 8) + { + v_int16x8 s0 = v_reinterpret_as_s16(v_load_expand(srow0 + x)); + v_int16x8 s1 = v_reinterpret_as_s16(v_load_expand(srow1 + x)); + v_int16x8 s2 = v_reinterpret_as_s16(v_load_expand(srow2 + x)); + + v_int16x8 t1 = s2 - s0; + v_int16x8 t0 = v_mul_wrap(s0 + s2, c3) + v_mul_wrap(s1, c10); + + v_store(trow0 + x, t0); + v_store(trow1 + x, t1); + } + } #endif - for (; x < colsn; x++) - { - int t0 = (srow0[x] + srow2[x]) * 3 + srow1[x] * 10; - int t1 = srow2[x] - srow0[x]; - trow0[x] = (deriv_type)t0; - trow1[x] = (deriv_type)t1; - } - - // make border - int x0 = (cols > 1 ? 
1 : 0)*cn, x1 = (cols > 1 ? cols - 2 : 0)*cn; - for (int k = 0; k < cn; k++) - { - trow0[-cn + k] = trow0[x0 + k]; trow0[colsn + k] = trow0[x1 + k]; - trow1[-cn + k] = trow1[x0 + k]; trow1[colsn + k] = trow1[x1 + k]; - } - - // do horizontal convolution, interleave the results and store them to dst - x = 0; + for (; x < colsn; x++) + { + int t0 = (srow0[x] + srow2[x]) * 3 + srow1[x] * 10; + int t1 = srow2[x] - srow0[x]; + trow0[x] = (deriv_type)t0; + trow1[x] = (deriv_type)t1; + } + + // make border + int x0 = (cols > 1 ? 1 : 0)*cn, x1 = (cols > 1 ? cols - 2 : 0)*cn; + for (int k = 0; k < cn; k++) + { + trow0[-cn + k] = trow0[x0 + k]; trow0[colsn + k] = trow0[x1 + k]; + trow1[-cn + k] = trow1[x0 + k]; trow1[colsn + k] = trow1[x1 + k]; + } + + // do horizontal convolution, interleave the results and store them to dst + x = 0; #if CV_SIMD128 - if (haveSIMD) - { - for (; x <= colsn - 8; x += 8) - { - v_int16x8 s0 = v_load(trow0 + x - cn); - v_int16x8 s1 = v_load(trow0 + x + cn); - v_int16x8 s2 = v_load(trow1 + x - cn); - v_int16x8 s3 = v_load(trow1 + x); - v_int16x8 s4 = v_load(trow1 + x + cn); - - v_int16x8 t0 = s1 - s0; - v_int16x8 t1 = v_mul_wrap(s2 + s4, c3) + v_mul_wrap(s3, c10); - - v_store_interleave((drow + x * 2), t0, t1); - } - } + if (haveSIMD) + { + for (; x <= colsn - 8; x += 8) + { + v_int16x8 s0 = v_load(trow0 + x - cn); + v_int16x8 s1 = v_load(trow0 + x + cn); + v_int16x8 s2 = v_load(trow1 + x - cn); + v_int16x8 s3 = v_load(trow1 + x); + v_int16x8 s4 = v_load(trow1 + x + cn); + + v_int16x8 t0 = s1 - s0; + v_int16x8 t1 = v_mul_wrap(s2 + s4, c3) + v_mul_wrap(s3, c10); + + v_store_interleave((drow + x * 2), t0, t1); + } + } #endif - for (; x < colsn; x++) - { - deriv_type t0 = (deriv_type)(trow0[x + cn] - trow0[x - cn]); - deriv_type t1 = (deriv_type)((trow1[x + cn] + trow1[x - cn]) * 3 + trow1[x] * 10); - drow[x * 2] = t0; drow[x * 2 + 1] = t1; - } - } - } + for (; x < colsn; x++) + { + deriv_type t0 = (deriv_type)(trow0[x + cn] - trow0[x - cn]); + 
deriv_type t1 = (deriv_type)((trow1[x + cn] + trow1[x - cn]) * 3 + trow1[x] * 10); + drow[x * 2] = t0; drow[x * 2 + 1] = t1; + } + } + } }//namespace - namespace optflow - { - bool isrobust(const RLOFOpticalFlowParameter & param) - { - return (param.normSigma0 != std::numeric_limits::max()) - && (param.normSigma1 != std::numeric_limits::max()); - } - std::vector get_norm(float sigma0, float sigma1) - { - std::vector result = { sigma0, sigma1, sigma0 / (sigma0 - sigma1), sigma0 * sigma1 }; - return result; - } - - - int buildOpticalFlowPyramidScale(InputArray _img, OutputArrayOfArrays pyramid, Size winSize, int maxLevel, bool withDerivatives, - int pyrBorder, int derivBorder, bool tryReuseInputImage, float levelScale[2]) - { - Mat img = _img.getMat(); - CV_Assert(img.depth() == CV_8U && winSize.width > 2 && winSize.height > 2); - int pyrstep = withDerivatives ? 2 : 1; - - pyramid.create(1, (maxLevel + 1) * pyrstep, 0 /*type*/, -1, true); - - int derivType = CV_MAKETYPE(DataType::depth, img.channels() * 2); - - //level 0 - bool lvl0IsSet = false; - if (tryReuseInputImage && img.isSubmatrix() && (pyrBorder & BORDER_ISOLATED) == 0) - { - Size wholeSize; - Point ofs; - img.locateROI(wholeSize, ofs); - if (ofs.x >= winSize.width && ofs.y >= winSize.height - && ofs.x + img.cols + winSize.width <= wholeSize.width - && ofs.y + img.rows + winSize.height <= wholeSize.height) - { - pyramid.getMatRef(0) = img; - lvl0IsSet = true; - } - } - - if (!lvl0IsSet) - { - Mat& temp = pyramid.getMatRef(0); - - if (!temp.empty()) - temp.adjustROI(winSize.height, winSize.height, winSize.width, winSize.width); - if (temp.type() != img.type() || temp.cols != winSize.width * 2 + img.cols || temp.rows != winSize.height * 2 + img.rows) - temp.create(img.rows + winSize.height * 2, img.cols + winSize.width * 2, img.type()); - - if (pyrBorder == BORDER_TRANSPARENT) - img.copyTo(temp(Rect(winSize.width, winSize.height, img.cols, img.rows))); - else - copyMakeBorder(img, temp, winSize.height, 
winSize.height, winSize.width, winSize.width, pyrBorder); - temp.adjustROI(-winSize.height, -winSize.height, -winSize.width, -winSize.width); - } - - Size sz = img.size(); - Mat prevLevel = pyramid.getMatRef(0); - Mat thisLevel = prevLevel; - - for (int level = 0; level <= maxLevel; ++level) - { - if (level != 0) - { - Mat& temp = pyramid.getMatRef(level * pyrstep); - - if (!temp.empty()) - temp.adjustROI(winSize.height, winSize.height, winSize.width, winSize.width); - if (temp.type() != img.type() || temp.cols != winSize.width * 2 + sz.width || temp.rows != winSize.height * 2 + sz.height) - temp.create(sz.height + winSize.height * 2, sz.width + winSize.width * 2, img.type()); - - thisLevel = temp(Rect(winSize.width, winSize.height, sz.width, sz.height)); - pyrDown(prevLevel, thisLevel, sz); - - if (pyrBorder != BORDER_TRANSPARENT) - copyMakeBorder(thisLevel, temp, winSize.height, winSize.height, winSize.width, winSize.width, pyrBorder | BORDER_ISOLATED); - temp.adjustROI(-winSize.height, -winSize.height, -winSize.width, -winSize.width); - } - - if (withDerivatives) - { - Mat& deriv = pyramid.getMatRef(level * pyrstep + 1); - - if (!deriv.empty()) - deriv.adjustROI(winSize.height, winSize.height, winSize.width, winSize.width); - if (deriv.type() != derivType || deriv.cols != winSize.width * 2 + sz.width || deriv.rows != winSize.height * 2 + sz.height) - deriv.create(sz.height + winSize.height * 2, sz.width + winSize.width * 2, derivType); - - Mat derivI = deriv(Rect(winSize.width, winSize.height, sz.width, sz.height)); - calcSharrDeriv(thisLevel, derivI); - - if (derivBorder != BORDER_TRANSPARENT) - copyMakeBorder(derivI, deriv, winSize.height, winSize.height, winSize.width, winSize.width, derivBorder | BORDER_ISOLATED); - deriv.adjustROI(-winSize.height, -winSize.height, -winSize.width, -winSize.width); - } - - sz = Size(static_cast((sz.width + 1) / levelScale[0]), - static_cast((sz.height + 1) / levelScale[1])); - if (sz.width <= winSize.width || sz.height <= 
winSize.height) - { - pyramid.create(1, (level + 1) * pyrstep, 0 /*type*/, -1, true);//check this - return level; - } - - prevLevel = thisLevel; - } - - return maxLevel; - } - int CImageBuffer::buildPyramid(cv::Size winSize, int maxLevel, float levelScale[2]) - { - if (m_Overwrite == false) - return m_maxLevel; - m_maxLevel = buildOpticalFlowPyramidScale(m_Image, m_ImagePyramid, winSize, maxLevel, false, 4, 0, true, levelScale); - return m_maxLevel; - } - - void calcLocalOpticalFlowCore( - Ptr prevPyramids[2], - Ptr currPyramids[2], - InputArray _prevPts, - InputOutputArray _nextPts, - const RLOFOpticalFlowParameter & param) - { - - bool useAdditionalRGB = param.supportRegionType == RLOFOpticalFlowParameter::SR_CROSS; - int iWinSize = param.largeWinSize; - int winSizes[2] = { iWinSize, iWinSize }; - if (param.supportRegionType != RLOFOpticalFlowParameter::SR_FIXED) - { - winSizes[0] = param.smallWinSize; - } - //cv::Size winSize(iWinSize, iWinSize); - cv::TermCriteria criteria(cv::TermCriteria::COUNT | cv::TermCriteria::EPS, param.maxIteration, 0.01); - std::vector rlofNorm = get_norm(param.normSigma0, param.normSigma1); - CV_Assert(winSizes[0] <= winSizes[1]); - - Mat prevPtsMat = _prevPts.getMat(); - const int derivDepth = DataType::depth; - - CV_Assert(param.maxLevel >= 0 && iWinSize > 2); - - int level = 0, npoints; - CV_Assert((npoints = prevPtsMat.checkVector(2, CV_32F, true)) >= 0); - - if (!(param.useInitialFlow)) - _nextPts.create(prevPtsMat.size(), prevPtsMat.type(), -1, true); - - Mat nextPtsMat = _nextPts.getMat(); - CV_Assert(nextPtsMat.checkVector(2, CV_32F, true) == npoints); - - const Point2f* prevPts = (const Point2f*)prevPtsMat.data; - Point2f* nextPts = (Point2f*)nextPtsMat.data; - std::vector status(npoints); - std::vector err(npoints); - std::vector gainPts(npoints); - - float levelScale[2] = { 2.f,2.f }; - - int maxLevel = prevPyramids[0]->buildPyramid(cv::Size(iWinSize, iWinSize), param.maxLevel, levelScale); - - maxLevel = 
currPyramids[0]->buildPyramid(cv::Size(iWinSize, iWinSize), maxLevel, levelScale); - if (useAdditionalRGB) - { - prevPyramids[1]->buildPyramid(cv::Size(iWinSize, iWinSize), maxLevel, levelScale); - currPyramids[1]->buildPyramid(cv::Size(iWinSize, iWinSize), maxLevel, levelScale); - } - - if ((criteria.type & TermCriteria::COUNT) == 0) - criteria.maxCount = 30; - else - criteria.maxCount = std::min(std::max(criteria.maxCount, 0), 100); - if ((criteria.type & TermCriteria::EPS) == 0) - criteria.epsilon = 0.001; - else - criteria.epsilon = std::min(std::max(criteria.epsilon, 0.), 10.); - criteria.epsilon *= criteria.epsilon; - - // dI/dx ~ Ix, dI/dy ~ Iy - Mat derivIBuf; - derivIBuf.create(prevPyramids[0]->m_ImagePyramid[0].rows + iWinSize * 2, prevPyramids[0]->m_ImagePyramid[0].cols + iWinSize * 2, CV_MAKETYPE(derivDepth, prevPyramids[0]->m_ImagePyramid[0].channels() * 2)); - - for (level = maxLevel; level >= 0; level--) - { - Mat derivI; - Size imgSize = prevPyramids[0]->getImage(level).size(); - Mat _derivI(imgSize.height + iWinSize * 2, imgSize.width + iWinSize * 2, derivIBuf.type(), derivIBuf.data); - derivI = _derivI(Rect(iWinSize, iWinSize, imgSize.width, imgSize.height)); - calcSharrDeriv(prevPyramids[0]->getImage(level), derivI); - copyMakeBorder(derivI, _derivI, iWinSize, iWinSize, iWinSize, iWinSize, BORDER_CONSTANT | BORDER_ISOLATED); - - cv::Mat tRGBPrevPyr; - cv::Mat tRGBNextPyr; - if (useAdditionalRGB) - { - tRGBPrevPyr = prevPyramids[1]->getImage(level); - tRGBNextPyr = prevPyramids[1]->getImage(level); - - prevPyramids[1]->m_Overwrite = false; - currPyramids[1]->m_Overwrite = false; - } - - cv::Mat prevImage = prevPyramids[0]->getImage(level); - cv::Mat currImage = currPyramids[0]->getImage(level); - - // apply plk like tracker - if (isrobust(param) == false) - { - if (param.useIlluminationModel) - { - cv::parallel_for_(cv::Range(0, npoints), - plk::radial::TrackerInvoker( - prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, - prevPts, nextPts, 
&status[0], &err[0], &gainPts[0], - level, maxLevel, winSizes, - param.maxIteration, - param.useInitialFlow, - param.supportRegionType, - param.minEigenValue, - param.crossSegmentationThreshold)); - } - else - { - if (param.solverType == RLOFOpticalFlowParameter::SolverType::ST_STANDART) - { - cv::parallel_for_(cv::Range(0, npoints), - plk::ica::TrackerInvoker( - prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, - prevPts, nextPts, &status[0], &err[0], - level, maxLevel, winSizes, - param.maxIteration, - param.useInitialFlow, - param.supportRegionType, - param.crossSegmentationThreshold, param.minEigenValue)); - } - else - { - cv::parallel_for_(cv::Range(0, npoints), - beplk::ica::TrackerInvoker(prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, - prevPts, nextPts, &status[0], &err[0], - level, maxLevel, winSizes, - param.maxIteration, - param.useInitialFlow, - param.supportRegionType, - param.crossSegmentationThreshold, - param.minEigenValue)); - } - } - } - // for robust models - else - { - if (param.useIlluminationModel) - { - if (param.solverType == RLOFOpticalFlowParameter::SolverType::ST_STANDART) - { - cv::parallel_for_(cv::Range(0, npoints), - rlof::radial::TrackerInvoker( - prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, - prevPts, nextPts, &status[0], &err[0], &gainPts[0], - level, maxLevel, winSizes, - param.maxIteration, - param.useInitialFlow, - param.supportRegionType, - rlofNorm, - param.minEigenValue, - param.crossSegmentationThreshold)); - } - else - { - cv::parallel_for_(cv::Range(0, npoints), - berlof::radial::TrackerInvoker(prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, - prevPts, nextPts, &status[0], &err[0], &gainPts[0], - level, maxLevel, winSizes, - param.maxIteration, - param.useInitialFlow, - param.supportRegionType, - param.crossSegmentationThreshold, - rlofNorm, - param.minEigenValue)); - } - } - else - { - - if (param.solverType == RLOFOpticalFlowParameter::SolverType::ST_STANDART) - { - 
cv::parallel_for_(cv::Range(0, npoints), - rlof::ica::TrackerInvoker( - prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, - prevPts, nextPts, &status[0], &err[0], - level, maxLevel, winSizes, - param.maxIteration, - param.useInitialFlow, - param.supportRegionType, - rlofNorm, - param.minEigenValue, - param.crossSegmentationThreshold)); - } - else - { - cv::parallel_for_(cv::Range(0, npoints), - berlof::ica::TrackerInvoker(prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, - prevPts, nextPts, &status[0], &err[0], - level, maxLevel, winSizes, - param.maxIteration, - param.useInitialFlow, - param.supportRegionType, - param.crossSegmentationThreshold, - rlofNorm, - param.minEigenValue)); - } - - } - } - - prevPyramids[0]->m_Overwrite = true; - currPyramids[0]->m_Overwrite = true; - } - } - - void preprocess(Ptr prevPyramids[2], - Ptr currPyramids[2], - const std::vector & prevPoints, - std::vector & currPoints, - const RLOFOpticalFlowParameter & param) - { - cv::Mat mask, homography; - if (param.useGlobalMotionPrior == false) - return; - - currPoints.resize(prevPoints.size()); - - RLOFOpticalFlowParameter gmeTrackerParam = param; - gmeTrackerParam.useGlobalMotionPrior = false; - gmeTrackerParam.largeWinSize = 17; - // use none robust tracker for global motion estimation since it is faster - gmeTrackerParam.normSigma0 = std::numeric_limits::max(); - gmeTrackerParam.maxIteration = MAX(15, param.maxIteration); - gmeTrackerParam.minEigenValue = 0.000001f; - - std::vector gmPrevPoints, gmCurrPoints, gmCheckPoints; - - // Initialize point grid - int stepr = prevPyramids[0]->m_Image.rows / 30; - int stepc = prevPyramids[0]->m_Image.rows / 40; - for (int r = stepr / 2; r < prevPyramids[0]->m_Image.rows; r += stepr) - { - for (int c = stepc / 2; c < prevPyramids[0]->m_Image.cols; c += stepc) - { - gmPrevPoints.push_back(cv::Point2f(static_cast(c), static_cast(r))); - } - } - - // perform motion estimation - calcLocalOpticalFlowCore(prevPyramids, currPyramids, 
gmPrevPoints, gmCurrPoints, gmeTrackerParam); - calcLocalOpticalFlowCore(currPyramids, prevPyramids, gmCurrPoints, gmCheckPoints, gmeTrackerParam); - - cv::Mat prevPointsMat(static_cast(gmPrevPoints.size()), 1, CV_32FC2); - cv::Mat currPointsMat(static_cast(gmPrevPoints.size()), 1, CV_32FC2); - cv::Mat distMat(static_cast(gmPrevPoints.size()), 1, CV_32FC1); - - // Forward backward confidence to estimate optimal ransac reprojection error - int noPoints = 0; - for (unsigned int n = 0; n < gmPrevPoints.size(); n++) - { - cv::Point2f dist = gmCheckPoints[n] - gmPrevPoints[n]; - - cv::Point2f flow = gmCurrPoints[n] - gmPrevPoints[n]; - prevPointsMat.at(noPoints) = gmPrevPoints[n]; - currPointsMat.at(noPoints) = gmCurrPoints[n]; - distMat.at(noPoints) = flow.x * flow.x + flow.y* flow.y; - if (isnan(distMat.at(noPoints)) == false) - noPoints++; - } - - float medianDist = (param.globalMotionRansacThreshold == 0) ? 1.f : - quickselect(distMat, static_cast(noPoints * static_cast(param.globalMotionRansacThreshold) / 100.f)); - medianDist = sqrt(medianDist); - - if (noPoints < 8) - return; - - cv::findHomography(prevPointsMat(cv::Rect(0, 0, 1, noPoints)), currPointsMat(cv::Rect(0, 0, 1, noPoints)), cv::RANSAC, medianDist, mask).convertTo(homography, CV_32FC1); - - if (homography.empty()) - return; - - cv::perspectiveTransform(prevPoints, currPoints, homography); - } - - void calcLocalOpticalFlow( - const Mat prevImage, - const Mat currImage, - Ptr prevPyramids[2], - Ptr currPyramids[2], - const std::vector & prevPoints, - std::vector & currPoints, - const RLOFOpticalFlowParameter & param) - { - if (prevImage.empty() == false && currImage.empty()== false) - { - prevPyramids[0]->m_Overwrite = true; - currPyramids[0]->m_Overwrite = true; - prevPyramids[1]->m_Overwrite = true; - currPyramids[1]->m_Overwrite = true; - if (prevImage.type() == CV_8UC3) - { - prevPyramids[0]->setGrayFromRGB(prevImage); - currPyramids[0]->setGrayFromRGB(currImage); - 
prevPyramids[1]->setImage(prevImage); - currPyramids[1]->setImage(currImage); - - if (param.supportRegionType == RLOFOpticalFlowParameter::SR_CROSS) - { - prevPyramids[1]->setBlurFromRGB(prevImage); - currPyramids[1]->setBlurFromRGB(currImage); - } - } - else - { - prevPyramids[0]->setImage(prevImage); - currPyramids[0]->setImage(currImage); - } - } - preprocess(prevPyramids, currPyramids, prevPoints, currPoints, param); - RLOFOpticalFlowParameter internParam = param; - if (param.useGlobalMotionPrior == true) - internParam.useInitialFlow = true; - calcLocalOpticalFlowCore(prevPyramids, currPyramids, prevPoints, currPoints, internParam); - } - } +namespace optflow +{ + int buildOpticalFlowPyramidScale(InputArray _img, OutputArrayOfArrays pyramid, Size winSize, int maxLevel, bool withDerivatives, + int pyrBorder, int derivBorder, bool tryReuseInputImage, float levelScale[2]); + void calcLocalOpticalFlowCore(Ptr prevPyramids[2], Ptr currPyramids[2], InputArray _prevPts, + InputOutputArray _nextPts, const RLOFOpticalFlowParameter & param); + + void preprocess(Ptr prevPyramids[2], Ptr currPyramids[2], const std::vector & prevPoints, + std::vector & currPoints, const RLOFOpticalFlowParameter & param); + + inline bool isrobust(const RLOFOpticalFlowParameter & param) + { + return (param.normSigma0 != std::numeric_limits::max()) + && (param.normSigma1 != std::numeric_limits::max()); + } + inline std::vector get_norm(float sigma0, float sigma1) + { + std::vector result = { sigma0, sigma1, sigma0 / (sigma0 - sigma1), sigma0 * sigma1 }; + return result; + } + + + int buildOpticalFlowPyramidScale(InputArray _img, OutputArrayOfArrays pyramid, Size winSize, int maxLevel, bool withDerivatives, + int pyrBorder, int derivBorder, bool tryReuseInputImage, float levelScale[2]) + { + Mat img = _img.getMat(); + CV_Assert(img.depth() == CV_8U && winSize.width > 2 && winSize.height > 2); + int pyrstep = withDerivatives ? 
2 : 1; + + pyramid.create(1, (maxLevel + 1) * pyrstep, 0 /*type*/, -1, true); + + int derivType = CV_MAKETYPE(DataType::depth, img.channels() * 2); + + //level 0 + bool lvl0IsSet = false; + if (tryReuseInputImage && img.isSubmatrix() && (pyrBorder & BORDER_ISOLATED) == 0) + { + Size wholeSize; + Point ofs; + img.locateROI(wholeSize, ofs); + if (ofs.x >= winSize.width && ofs.y >= winSize.height + && ofs.x + img.cols + winSize.width <= wholeSize.width + && ofs.y + img.rows + winSize.height <= wholeSize.height) + { + pyramid.getMatRef(0) = img; + lvl0IsSet = true; + } + } + + if (!lvl0IsSet) + { + Mat& temp = pyramid.getMatRef(0); + + if (!temp.empty()) + temp.adjustROI(winSize.height, winSize.height, winSize.width, winSize.width); + if (temp.type() != img.type() || temp.cols != winSize.width * 2 + img.cols || temp.rows != winSize.height * 2 + img.rows) + temp.create(img.rows + winSize.height * 2, img.cols + winSize.width * 2, img.type()); + + if (pyrBorder == BORDER_TRANSPARENT) + img.copyTo(temp(Rect(winSize.width, winSize.height, img.cols, img.rows))); + else + copyMakeBorder(img, temp, winSize.height, winSize.height, winSize.width, winSize.width, pyrBorder); + temp.adjustROI(-winSize.height, -winSize.height, -winSize.width, -winSize.width); + } + + Size sz = img.size(); + Mat prevLevel = pyramid.getMatRef(0); + Mat thisLevel = prevLevel; + + for (int level = 0; level <= maxLevel; ++level) + { + if (level != 0) + { + Mat& temp = pyramid.getMatRef(level * pyrstep); + + if (!temp.empty()) + temp.adjustROI(winSize.height, winSize.height, winSize.width, winSize.width); + if (temp.type() != img.type() || temp.cols != winSize.width * 2 + sz.width || temp.rows != winSize.height * 2 + sz.height) + temp.create(sz.height + winSize.height * 2, sz.width + winSize.width * 2, img.type()); + + thisLevel = temp(Rect(winSize.width, winSize.height, sz.width, sz.height)); + pyrDown(prevLevel, thisLevel, sz); + + if (pyrBorder != BORDER_TRANSPARENT) + copyMakeBorder(thisLevel, temp, 
winSize.height, winSize.height, winSize.width, winSize.width, pyrBorder | BORDER_ISOLATED); + temp.adjustROI(-winSize.height, -winSize.height, -winSize.width, -winSize.width); + } + + if (withDerivatives) + { + Mat& deriv = pyramid.getMatRef(level * pyrstep + 1); + + if (!deriv.empty()) + deriv.adjustROI(winSize.height, winSize.height, winSize.width, winSize.width); + if (deriv.type() != derivType || deriv.cols != winSize.width * 2 + sz.width || deriv.rows != winSize.height * 2 + sz.height) + deriv.create(sz.height + winSize.height * 2, sz.width + winSize.width * 2, derivType); + + Mat derivI = deriv(Rect(winSize.width, winSize.height, sz.width, sz.height)); + calcSharrDeriv(thisLevel, derivI); + + if (derivBorder != BORDER_TRANSPARENT) + copyMakeBorder(derivI, deriv, winSize.height, winSize.height, winSize.width, winSize.width, derivBorder | BORDER_ISOLATED); + deriv.adjustROI(-winSize.height, -winSize.height, -winSize.width, -winSize.width); + } + + sz = Size(static_cast((sz.width + 1) / levelScale[0]), + static_cast((sz.height + 1) / levelScale[1])); + if (sz.width <= winSize.width || sz.height <= winSize.height) + { + pyramid.create(1, (level + 1) * pyrstep, 0 /*type*/, -1, true);//check this + return level; + } + + prevLevel = thisLevel; + } + + return maxLevel; + } + int CImageBuffer::buildPyramid(cv::Size winSize, int maxLevel, float levelScale[2]) + { + if (m_Overwrite == false) + return m_maxLevel; + m_maxLevel = buildOpticalFlowPyramidScale(m_Image, m_ImagePyramid, winSize, maxLevel, false, 4, 0, true, levelScale); + return m_maxLevel; + } + + void calcLocalOpticalFlowCore( + Ptr prevPyramids[2], + Ptr currPyramids[2], + InputArray _prevPts, + InputOutputArray _nextPts, + const RLOFOpticalFlowParameter & param) + { + + bool useAdditionalRGB = param.supportRegionType == SR_CROSS; + int iWinSize = param.largeWinSize; + int winSizes[2] = { iWinSize, iWinSize }; + if (param.supportRegionType != SR_FIXED) + { + winSizes[0] = param.smallWinSize; + } + 
//cv::Size winSize(iWinSize, iWinSize); + cv::TermCriteria criteria(cv::TermCriteria::COUNT | cv::TermCriteria::EPS, param.maxIteration, 0.01); + std::vector rlofNorm = get_norm(param.normSigma0, param.normSigma1); + CV_Assert(winSizes[0] <= winSizes[1]); + + Mat prevPtsMat = _prevPts.getMat(); + const int derivDepth = DataType::depth; + + CV_Assert(param.maxLevel >= 0 && iWinSize > 2); + + int level = 0, npoints; + CV_Assert((npoints = prevPtsMat.checkVector(2, CV_32F, true)) >= 0); + + if (!(param.useInitialFlow)) + _nextPts.create(prevPtsMat.size(), prevPtsMat.type(), -1, true); + + Mat nextPtsMat = _nextPts.getMat(); + CV_Assert(nextPtsMat.checkVector(2, CV_32F, true) == npoints); + + const Point2f* prevPts = (const Point2f*)prevPtsMat.data; + Point2f* nextPts = (Point2f*)nextPtsMat.data; + std::vector status(npoints); + std::vector err(npoints); + std::vector gainPts(npoints); + + float levelScale[2] = { 2.f,2.f }; + + int maxLevel = prevPyramids[0]->buildPyramid(cv::Size(iWinSize, iWinSize), param.maxLevel, levelScale); + + maxLevel = currPyramids[0]->buildPyramid(cv::Size(iWinSize, iWinSize), maxLevel, levelScale); + if (useAdditionalRGB) + { + prevPyramids[1]->buildPyramid(cv::Size(iWinSize, iWinSize), maxLevel, levelScale); + currPyramids[1]->buildPyramid(cv::Size(iWinSize, iWinSize), maxLevel, levelScale); + } + + if ((criteria.type & TermCriteria::COUNT) == 0) + criteria.maxCount = 30; + else + criteria.maxCount = std::min(std::max(criteria.maxCount, 0), 100); + if ((criteria.type & TermCriteria::EPS) == 0) + criteria.epsilon = 0.001; + else + criteria.epsilon = std::min(std::max(criteria.epsilon, 0.), 10.); + criteria.epsilon *= criteria.epsilon; + + // dI/dx ~ Ix, dI/dy ~ Iy + Mat derivIBuf; + derivIBuf.create(prevPyramids[0]->m_ImagePyramid[0].rows + iWinSize * 2, prevPyramids[0]->m_ImagePyramid[0].cols + iWinSize * 2, CV_MAKETYPE(derivDepth, prevPyramids[0]->m_ImagePyramid[0].channels() * 2)); + + for (level = maxLevel; level >= 0; level--) + { + Mat 
derivI; + Size imgSize = prevPyramids[0]->getImage(level).size(); + Mat _derivI(imgSize.height + iWinSize * 2, imgSize.width + iWinSize * 2, derivIBuf.type(), derivIBuf.data); + derivI = _derivI(Rect(iWinSize, iWinSize, imgSize.width, imgSize.height)); + calcSharrDeriv(prevPyramids[0]->getImage(level), derivI); + copyMakeBorder(derivI, _derivI, iWinSize, iWinSize, iWinSize, iWinSize, BORDER_CONSTANT | BORDER_ISOLATED); + + cv::Mat tRGBPrevPyr; + cv::Mat tRGBNextPyr; + if (useAdditionalRGB) + { + tRGBPrevPyr = prevPyramids[1]->getImage(level); + tRGBNextPyr = prevPyramids[1]->getImage(level); + + prevPyramids[1]->m_Overwrite = false; + currPyramids[1]->m_Overwrite = false; + } + + cv::Mat prevImage = prevPyramids[0]->getImage(level); + cv::Mat currImage = currPyramids[0]->getImage(level); + + // apply plk like tracker + if (isrobust(param) == false) + { + if (param.useIlluminationModel) + { + cv::parallel_for_(cv::Range(0, npoints), + plk::radial::TrackerInvoker( + prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, + prevPts, nextPts, &status[0], &err[0], &gainPts[0], + level, maxLevel, winSizes, + param.maxIteration, + param.useInitialFlow, + param.supportRegionType, + param.minEigenValue, + param.crossSegmentationThreshold)); + } + else + { + if (param.solverType == SolverType::ST_STANDART) + { + cv::parallel_for_(cv::Range(0, npoints), + plk::ica::TrackerInvoker( + prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, + prevPts, nextPts, &status[0], &err[0], + level, maxLevel, winSizes, + param.maxIteration, + param.useInitialFlow, + param.supportRegionType, + param.crossSegmentationThreshold, + param.minEigenValue)); + } + else + { + cv::parallel_for_(cv::Range(0, npoints), + beplk::ica::TrackerInvoker(prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, + prevPts, nextPts, &status[0], &err[0], + level, maxLevel, winSizes, + param.maxIteration, + param.useInitialFlow, + param.supportRegionType, + param.crossSegmentationThreshold, + 
param.minEigenValue)); + } + } + } + // for robust models + else + { + if (param.useIlluminationModel) + { + if (param.solverType == SolverType::ST_STANDART) + { + cv::parallel_for_(cv::Range(0, npoints), + rlof::radial::TrackerInvoker( + prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, + prevPts, nextPts, &status[0], &err[0], &gainPts[0], + level, maxLevel, winSizes, + param.maxIteration, + param.useInitialFlow, + param.supportRegionType, + rlofNorm, + param.minEigenValue, + param.crossSegmentationThreshold)); + } + else + { + cv::parallel_for_(cv::Range(0, npoints), + berlof::radial::TrackerInvoker(prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, + prevPts, nextPts, &status[0], &err[0], &gainPts[0], + level, maxLevel, winSizes, + param.maxIteration, + param.useInitialFlow, + param.supportRegionType, + param.crossSegmentationThreshold, + rlofNorm, + param.minEigenValue)); + } + } + else + { + + if (param.solverType == SolverType::ST_STANDART) + { + cv::parallel_for_(cv::Range(0, npoints), + rlof::ica::TrackerInvoker( + prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, + prevPts, nextPts, &status[0], &err[0], + level, maxLevel, winSizes, + param.maxIteration, + param.useInitialFlow, + param.supportRegionType, + rlofNorm, + param.minEigenValue, + param.crossSegmentationThreshold)); + } + else + { + cv::parallel_for_(cv::Range(0, npoints), + berlof::ica::TrackerInvoker(prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, + prevPts, nextPts, &status[0], &err[0], + level, maxLevel, winSizes, + param.maxIteration, + param.useInitialFlow, + param.supportRegionType, + param.crossSegmentationThreshold, + rlofNorm, + param.minEigenValue)); + } + + } + } + + prevPyramids[0]->m_Overwrite = true; + currPyramids[0]->m_Overwrite = true; + } + } + + void preprocess(Ptr prevPyramids[2], + Ptr currPyramids[2], + const std::vector & prevPoints, + std::vector & currPoints, + const RLOFOpticalFlowParameter & param) + { + cv::Mat mask, homography; + if 
(param.useGlobalMotionPrior == false) + return; + + currPoints.resize(prevPoints.size()); + + RLOFOpticalFlowParameter gmeTrackerParam = param; + gmeTrackerParam.useGlobalMotionPrior = false; + gmeTrackerParam.largeWinSize = 17; + // use none robust tracker for global motion estimation since it is faster + gmeTrackerParam.normSigma0 = std::numeric_limits::max(); + gmeTrackerParam.maxIteration = MAX(15, param.maxIteration); + gmeTrackerParam.minEigenValue = 0.000001f; + + std::vector gmPrevPoints, gmCurrPoints; + + // Initialize point grid + int stepr = prevPyramids[0]->m_Image.rows / 30; + int stepc = prevPyramids[0]->m_Image.rows / 40; + for (int r = stepr / 2; r < prevPyramids[0]->m_Image.rows; r += stepr) + { + for (int c = stepc / 2; c < prevPyramids[0]->m_Image.cols; c += stepc) + { + gmPrevPoints.push_back(cv::Point2f(static_cast(c), static_cast(r))); + } + } + + // perform motion estimation + calcLocalOpticalFlowCore(prevPyramids, currPyramids, gmPrevPoints, gmCurrPoints, gmeTrackerParam); + + cv::Mat prevPointsMat(static_cast(gmPrevPoints.size()), 1, CV_32FC2); + cv::Mat currPointsMat(static_cast(gmPrevPoints.size()), 1, CV_32FC2); + cv::Mat distMat(static_cast(gmPrevPoints.size()), 1, CV_32FC1); + + // Forward backward confidence to estimate optimal ransac reprojection error + int noPoints = 0; + for (unsigned int n = 0; n < gmPrevPoints.size(); n++) + { + cv::Point2f flow = gmCurrPoints[n] - gmPrevPoints[n]; + prevPointsMat.at(noPoints) = gmPrevPoints[n]; + currPointsMat.at(noPoints) = gmCurrPoints[n]; + distMat.at(noPoints) = flow.x * flow.x + flow.y* flow.y; + if (isnan(distMat.at(noPoints)) == false) + noPoints++; + } + + float medianDist = (param.globalMotionRansacThreshold == 0) ? 
1.f : + quickselect(distMat, static_cast(noPoints * static_cast(param.globalMotionRansacThreshold) / 100.f)); + medianDist = sqrt(medianDist); + + if (noPoints < 8) + return; + + cv::findHomography(prevPointsMat(cv::Rect(0, 0, 1, noPoints)), currPointsMat(cv::Rect(0, 0, 1, noPoints)), cv::RANSAC, medianDist, mask).convertTo(homography, CV_32FC1); + + if (homography.empty()) + return; + + cv::perspectiveTransform(prevPoints, currPoints, homography); + } + + void calcLocalOpticalFlow( + const Mat prevImage, + const Mat currImage, + Ptr prevPyramids[2], + Ptr currPyramids[2], + const std::vector & prevPoints, + std::vector & currPoints, + const RLOFOpticalFlowParameter & param) + { + if (prevImage.empty() == false && currImage.empty()== false) + { + prevPyramids[0]->m_Overwrite = true; + currPyramids[0]->m_Overwrite = true; + prevPyramids[1]->m_Overwrite = true; + currPyramids[1]->m_Overwrite = true; + if (prevImage.type() == CV_8UC3) + { + prevPyramids[0]->setGrayFromRGB(prevImage); + currPyramids[0]->setGrayFromRGB(currImage); + prevPyramids[1]->setImage(prevImage); + currPyramids[1]->setImage(currImage); + + if (param.supportRegionType == SR_CROSS) + { + prevPyramids[1]->setBlurFromRGB(prevImage); + currPyramids[1]->setBlurFromRGB(currImage); + } + } + else + { + prevPyramids[0]->setImage(prevImage); + currPyramids[0]->setImage(currImage); + } + } + preprocess(prevPyramids, currPyramids, prevPoints, currPoints, param); + RLOFOpticalFlowParameter internParam = param; + if (param.useGlobalMotionPrior == true) + internParam.useInitialFlow = true; + calcLocalOpticalFlowCore(prevPyramids, currPyramids, prevPoints, currPoints, internParam); + } +} } - \ No newline at end of file + \ No newline at end of file diff --git a/modules/optflow/src/rlof/rlof_localflow.h b/modules/optflow/src/rlof/rlof_localflow.h index 6f6a0fc5e2a..bd59721f277 100644 --- a/modules/optflow/src/rlof/rlof_localflow.h +++ b/modules/optflow/src/rlof/rlof_localflow.h @@ -1,44 +1,6 @@ 
-/*M/////////////////////////////////////////////////////////////////////////////////////// - // - // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. - // - // By downloading, copying, installing or using the software you agree to this license. - // If you do not agree to this license, do not download, install, - // copy or use the software. - // - // - // License Agreement - // For Open Source Computer Vision Library - // - // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. - // Copyright (C) 2009, Willow Garage Inc., all rights reserved. - // Third party copyrights are property of their respective owners. - // - // Redistribution and use in source and binary forms, with or without modification, - // are permitted provided that the following conditions are met: - // - // * Redistribution's of source code must retain the above copyright notice, - // this list of conditions and the following disclaimer. - // - // * Redistribution's in binary form must reproduce the above copyright notice, - // this list of conditions and the following disclaimer in the documentation - // and/or other materials provided with the distribution. - // - // * The name of the copyright holders may not be used to endorse or promote products - // derived from this software without specific prior written permission. - // - // This software is provided by the copyright holders and contributors "as is" and - // any express or implied warranties, including, but not limited to, the implied - // warranties of merchantability and fitness for a particular purpose are disclaimed. 
- // In no event shall the Intel Corporation or contributors be liable for any direct, - // indirect, incidental, special, exemplary, or consequential damages - // (including, but not limited to, procurement of substitute goods or services; - // loss of use, data, or profits; or business interruption) however caused - // and on any theory of liability, whether in contract, strict liability, - // or tort (including negligence or otherwise) arising in any way out of - // the use of this software, even if advised of the possibility of such damage. - // - //M*/ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. #ifndef _RLOF_LOCALFLOW_H_ #define _RLOF_LOCALFLOW_H_ #include @@ -53,106 +15,106 @@ using namespace cv; template T quickselect(const Mat & inp, int k) { - unsigned long i; - unsigned long ir; - unsigned long j; - unsigned long l; - unsigned long mid; - Mat values = inp.clone(); - T a; + unsigned long i; + unsigned long ir; + unsigned long j; + unsigned long l; + unsigned long mid; + Mat values = inp.clone(); + T a; - l = 0; - ir = MAX(values.rows, values.cols) - 1; - while (true) - { - if (ir <= l + 1) - { - if (ir == l + 1 && values.at(ir) < values.at(l)) - std::swap(values.at(l), values.at(ir)); - return values.at(k); - } - else - { - mid = (l + ir) >> 1; - std::swap(values.at(mid), values.at(l+1)); - if (values.at(l) > values.at(ir)) - std::swap(values.at(l), values.at(ir)); - if (values.at(l+1) > values.at(ir)) - std::swap(values.at(l+1), values.at(ir)); - if (values.at(l) > values.at(l+1)) - std::swap(values.at(l), values.at(l+1)); - i = l + 1; - j = ir; - a = values.at(l+1); - while (true) - { - do - { - i++; - } - while (values.at(i) < a); - do - { - j--; - } - while (values.at(j) > a); - if (j < i) break; - std::swap(values.at(i), values.at(j)); - } - values.at(l+1) = values.at(j); - values.at(j) = a; - if (j 
>= static_cast(k)) ir = j - 1; - if (j <= static_cast(k)) l = i; - } - } + l = 0; + ir = MAX(values.rows, values.cols) - 1; + while (true) + { + if (ir <= l + 1) + { + if (ir == l + 1 && values.at(ir) < values.at(l)) + std::swap(values.at(l), values.at(ir)); + return values.at(k); + } + else + { + mid = (l + ir) >> 1; + std::swap(values.at(mid), values.at(l+1)); + if (values.at(l) > values.at(ir)) + std::swap(values.at(l), values.at(ir)); + if (values.at(l+1) > values.at(ir)) + std::swap(values.at(l+1), values.at(ir)); + if (values.at(l) > values.at(l+1)) + std::swap(values.at(l), values.at(l+1)); + i = l + 1; + j = ir; + a = values.at(l+1); + while (true) + { + do + { + i++; + } + while (values.at(i) < a); + do + { + j--; + } + while (values.at(j) > a); + if (j < i) break; + std::swap(values.at(i), values.at(j)); + } + values.at(l+1) = values.at(j); + values.at(j) = a; + if (j >= static_cast(k)) ir = j - 1; + if (j <= static_cast(k)) l = i; + } + } } namespace cv { - namespace optflow - { - class CImageBuffer - { - public: - CImageBuffer() - : m_Overwrite(true) - {}; - void setGrayFromRGB(const cv::Mat & inp) - { - if(m_Overwrite) - cv::cvtColor(inp, m_Image, cv::COLOR_BGR2GRAY); - } - void setImage(const cv::Mat & inp) - { - if(m_Overwrite) - inp.copyTo(m_Image); - } - void setBlurFromRGB(const cv::Mat & inp) - { - //cv::medianBlur(constNextImage, blurNextImg, 7); - if(m_Overwrite) - cv::GaussianBlur(inp, m_BlurredImage, cv::Size(7,7), -1); - } - - int buildPyramid(cv::Size winSize, int maxLevel, float levelScale[2]); - cv::Mat & getImage(int level) {return m_ImagePyramid[level];} +namespace optflow +{ +class CImageBuffer +{ +public: + CImageBuffer() + : m_Overwrite(true) + {}; + void setGrayFromRGB(const cv::Mat & inp) + { + if(m_Overwrite) + cv::cvtColor(inp, m_Image, cv::COLOR_BGR2GRAY); + } + void setImage(const cv::Mat & inp) + { + if(m_Overwrite) + inp.copyTo(m_Image); + } + void setBlurFromRGB(const cv::Mat & inp) + { + //cv::medianBlur(constNextImage, 
blurNextImg, 7); + if(m_Overwrite) + cv::GaussianBlur(inp, m_BlurredImage, cv::Size(7,7), -1); + } + + int buildPyramid(cv::Size winSize, int maxLevel, float levelScale[2]); + cv::Mat & getImage(int level) {return m_ImagePyramid[level];} + + std::vector m_ImagePyramid; + cv::Mat m_BlurredImage; + cv::Mat m_Image; + std::vector m_CrossPyramid; + int m_maxLevel; + bool m_Overwrite; +}; - std::vector m_ImagePyramid; - cv::Mat m_BlurredImage; - cv::Mat m_Image; - std::vector m_CrossPyramid; - int m_maxLevel; - bool m_Overwrite; - }; - - void calcLocalOpticalFlow( - const Mat prevImage, - const Mat currImage, - Ptr prevPyramids[2], - Ptr currPyramids[2], - const std::vector & prevPoints, - std::vector & currPoints, - const RLOFOpticalFlowParameter & param); +void calcLocalOpticalFlow( + const Mat prevImage, + const Mat currImage, + Ptr prevPyramids[2], + Ptr currPyramids[2], + const std::vector & prevPoints, + std::vector & currPoints, + const RLOFOpticalFlowParameter & param); } } #endif \ No newline at end of file diff --git a/modules/optflow/src/rlofflow.cpp b/modules/optflow/src/rlofflow.cpp index 1bdc45a0221..d7309c15210 100644 --- a/modules/optflow/src/rlofflow.cpp +++ b/modules/optflow/src/rlofflow.cpp @@ -1,385 +1,345 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// - // - // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. - // - // By downloading, copying, installing or using the software you agree to this license. - // If you do not agree to this license, do not download, install, - // copy or use the software. - // - // - // License Agreement - // For Open Source Computer Vision Library - // - // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. - // Copyright (C) 2009, Willow Garage Inc., all rights reserved. - // Third party copyrights are property of their respective owners. 
- // - // Redistribution and use in source and binary forms, with or without modification, - // are permitted provided that the following conditions are met: - // - // * Redistribution's of source code must retain the above copyright notice, - // this list of conditions and the following disclaimer. - // - // * Redistribution's in binary form must reproduce the above copyright notice, - // this list of conditions and the following disclaimer in the documentation - // and/or other materials provided with the distribution. - // - // * The name of the copyright holders may not be used to endorse or promote products - // derived from this software without specific prior written permission. - // - // This software is provided by the copyright holders and contributors "as is" and - // any express or implied warranties, including, but not limited to, the implied - // warranties of merchantability and fitness for a particular purpose are disclaimed. - // In no event shall the Intel Corporation or contributors be liable for any direct, - // indirect, incidental, special, exemplary, or consequential damages - // (including, but not limited to, procurement of substitute goods or services; - // loss of use, data, or profits; or business interruption) however caused - // and on any theory of liability, whether in contract, strict liability, - // or tort (including negligence or otherwise) arising in any way out of - // the use of this software, even if advised of the possibility of such damage. - // - //M*/ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
#include "precomp.hpp" #include "rlof/rlof_localflow.h" #include "rlof/geo_interpolation.hpp" #include "opencv2/ximgproc.hpp" namespace cv { - namespace optflow - { - - class DenseOpticalFlowRLOFImpl : public DenseRLOFOpticalFlow - { - public: - DenseOpticalFlowRLOFImpl() - : forwardBackwardThreshold(1.f) - , gridStep(6, 6) - , interp_type(InterpolationType::INTERP_GEO) - , k(128) - , sigma(0.05f) - , lambda(999.f) - , fgs_lambda(500.0f) - , fgs_sigma(1.5f) - , use_post_proc(true) - - { - prevPyramid[0] = cv::Ptr< CImageBuffer>(new CImageBuffer); - prevPyramid[1] = cv::Ptr< CImageBuffer>(new CImageBuffer); - currPyramid[0] = cv::Ptr< CImageBuffer>(new CImageBuffer); - currPyramid[1] = cv::Ptr< CImageBuffer>(new CImageBuffer); - } - virtual void setRLOFOpticalFlowParameter(RLOFOpticalFlowParameter val) CV_OVERRIDE { param = val; } - virtual RLOFOpticalFlowParameter getRLOFOpticalFlowParameter() const CV_OVERRIDE { return param; } - - virtual float getForwardBackward() const CV_OVERRIDE { return forwardBackwardThreshold; } - virtual void setForwardBackward(float val) CV_OVERRIDE { forwardBackwardThreshold = val; } - - virtual void setInterpolation(InterpolationType val) CV_OVERRIDE { interp_type = val; } - virtual InterpolationType getInterpolation() const CV_OVERRIDE { return interp_type; } - - virtual Size getGridStep() const CV_OVERRIDE { return gridStep; } - virtual void setGridStep(Size val) CV_OVERRIDE { gridStep = val; } - - virtual int getEPICK() const CV_OVERRIDE { return k; } - virtual void setEPICK(int val) CV_OVERRIDE { k = val; } - - virtual float getEPICSigma() const CV_OVERRIDE { return sigma; } - virtual void setEPICSigma(float val) CV_OVERRIDE { sigma = val; } - - virtual float getEPICLambda() const CV_OVERRIDE { return lambda; } - virtual void setEPICLambda(float val) CV_OVERRIDE { lambda = val; } - - virtual float getFgsLambda() const CV_OVERRIDE { return fgs_lambda; } - virtual void setFgsLambda(float val) CV_OVERRIDE { fgs_lambda = val; } - - 
virtual float getFgsSigma() const CV_OVERRIDE { return fgs_sigma; } - virtual void setFgsSigma(float val) CV_OVERRIDE { fgs_sigma = val; } - - virtual bool getUsePostProc() const CV_OVERRIDE { return use_post_proc; } - virtual void setUsePostProc(bool val) CV_OVERRIDE { use_post_proc = val; } - - virtual void calc(InputArray I0, InputArray I1, InputOutputArray flow) CV_OVERRIDE - { - CV_Assert(!I0.empty() && I0.depth() == CV_8U && (I0.channels() == 3 || I0.channels() == 1)); - CV_Assert(!I1.empty() && I1.depth() == CV_8U && (I1.channels() == 3 || I1.channels() == 1)); - CV_Assert(I0.sameSize(I1)); - if (param.supportRegionType == RLOFOpticalFlowParameter::SR_CROSS) - CV_Assert( I0.channels() == 3 && I1.channels() == 3); - CV_Assert(interp_type == InterpolationType::INTERP_EPIC || interp_type == InterpolationType::INTERP_GEO); - - Mat prevImage = I0.getMat(); - Mat currImage = I1.getMat(); - int noPoints = prevImage.cols * prevImage.rows; - std::vector prevPoints(noPoints); - std::vector currPoints, refPoints; - noPoints = 0; - cv::Size grid_h = gridStep / 2; - for (int r = grid_h.height; r < prevImage.rows; r += gridStep.height) - { - for (int c = grid_h.width; c < prevImage.cols; c += gridStep.width) - { - prevPoints[noPoints++] = cv::Point2f(static_cast(c), static_cast(r)); - } - } - prevPoints.erase(prevPoints.begin() + noPoints, prevPoints.end()); - currPoints.resize(prevPoints.size()); - - calcLocalOpticalFlow(prevImage, currImage, prevPyramid, currPyramid, prevPoints, currPoints, param); - - flow.create(prevImage.size(), CV_32FC2); - Mat dense_flow = flow.getMat(); - - std::vector filtered_prevPoints; - std::vector filtered_currPoints; - if (gridStep == cv::Size(1, 1) && forwardBackwardThreshold <= 0) - { - for (unsigned int n = 0; n < prevPoints.size(); n++) - { - dense_flow.at(prevPoints[n]) = currPoints[n] - prevPoints[n]; - } - return; - } - if (forwardBackwardThreshold > 0) - { - // reuse image pyramids - calcLocalOpticalFlow(currImage, prevImage, 
currPyramid, prevPyramid, currPoints, refPoints, param); - - filtered_prevPoints.resize(prevPoints.size()); - filtered_currPoints.resize(prevPoints.size()); - float sqrForwardBackwardThreshold = forwardBackwardThreshold * forwardBackwardThreshold; - noPoints = 0; - for (unsigned int r = 0; r < refPoints.size(); r++) - { - Point2f diff = refPoints[r] - prevPoints[r]; - if (diff.x * diff.x + diff.y * diff.y < sqrForwardBackwardThreshold) - { - filtered_prevPoints[noPoints] = prevPoints[r]; - filtered_currPoints[noPoints++] = currPoints[r]; - } - } - - filtered_prevPoints.erase(filtered_prevPoints.begin() + noPoints, filtered_prevPoints.end()); - filtered_currPoints.erase(filtered_currPoints.begin() + noPoints, filtered_currPoints.end()); - - } - else - { - filtered_prevPoints = prevPoints; - filtered_currPoints = currPoints; - } - - if (interp_type == InterpolationType::INTERP_EPIC) - { - Ptr gd = ximgproc::createEdgeAwareInterpolator(); - gd->setK(k); - gd->setSigma(sigma); - gd->setLambda(lambda); - gd->setUsePostProcessing(false); - gd->interpolate(prevImage, filtered_prevPoints, currImage, filtered_currPoints, dense_flow); - } - else - { - Mat blurredPrevImage, blurredCurrImage; - GaussianBlur(prevImage, blurredPrevImage, cv::Size(5, 5), -1); - std::vector status(filtered_currPoints.size(), 1); - interpolate_irregular_nn_raster(filtered_prevPoints, filtered_currPoints, status, blurredPrevImage).copyTo(dense_flow); - std::vector vecMats; - std::vector vecMats2(2); - cv::split(dense_flow, vecMats); - cv::bilateralFilter(vecMats[0], vecMats2[0], 5, 2, 20); - cv::bilateralFilter(vecMats[1], vecMats2[1], 5, 2, 20); - cv::merge(vecMats2, dense_flow); - } - if (use_post_proc) - { - ximgproc::fastGlobalSmootherFilter(prevImage, flow, flow, fgs_lambda, fgs_sigma); - } - } - - virtual void collectGarbage() CV_OVERRIDE - { - prevPyramid[0].release(); - prevPyramid[1].release(); - currPyramid[0].release(); - currPyramid[1].release(); - } - - protected: - 
RLOFOpticalFlowParameter param; - float forwardBackwardThreshold; - Ptr prevPyramid[2]; - Ptr currPyramid[2]; - cv::Size gridStep; - InterpolationType interp_type; - int k; - float sigma; - float lambda; - float fgs_lambda; - float fgs_sigma; - bool use_post_proc; - }; - - Ptr DenseRLOFOpticalFlow::create( - const RLOFOpticalFlowParameter & rlofParam, - float forwardBackwardThreshold, - cv::Size gridStep, - InterpolationType interp_type, - int epicK, - float epicSigma, - float epicLambda, - float fgs_lambda, - float fgs_sigma) - { - Ptr algo = makePtr(); - algo->setRLOFOpticalFlowParameter(rlofParam); - algo->setForwardBackward(forwardBackwardThreshold); - algo->setGridStep(gridStep); - algo->setInterpolation(interp_type); - algo->setEPICK(epicK); - algo->setEPICSigma(epicSigma); - algo->setEPICLambda(epicLambda); - algo->setFgsLambda(fgs_lambda); - algo->setFgsSigma(fgs_sigma); - return algo; - } - - class SparseRLOFOpticalFlowImpl : public SparseRLOFOpticalFlow - { - public: - SparseRLOFOpticalFlowImpl() - : forwardBackwardThreshold(1.f) - { - prevPyramid[0] = cv::Ptr< CImageBuffer>(new CImageBuffer); - prevPyramid[1] = cv::Ptr< CImageBuffer>(new CImageBuffer); - currPyramid[0] = cv::Ptr< CImageBuffer>(new CImageBuffer); - currPyramid[1] = cv::Ptr< CImageBuffer>(new CImageBuffer); - } - virtual void setRLOFOpticalFlowParameter(RLOFOpticalFlowParameter val) CV_OVERRIDE { param = val; } - virtual RLOFOpticalFlowParameter getRLOFOpticalFlowParameter() const CV_OVERRIDE { return param; } - - virtual float getForwardBackward() const CV_OVERRIDE { return forwardBackwardThreshold; } - virtual void setForwardBackward(float val) CV_OVERRIDE { forwardBackwardThreshold = val; } - - virtual void calc(InputArray prevImg, InputArray nextImg, - InputArray prevPts, InputOutputArray nextPts, - OutputArray status, - OutputArray err) CV_OVERRIDE - { - CV_Assert(!prevImg.empty() && prevImg.depth() == CV_8U && (prevImg.channels() == 3 || prevImg.channels() == 1)); - 
CV_Assert(!nextImg.empty() && nextImg.depth() == CV_8U && (nextImg.channels() == 3 || nextImg.channels() == 1)); - CV_Assert(prevImg.sameSize(nextImg)); - - //CV_Assert(param.supportRegionType == RLOFOpticalFlowParameter::SR_CROSS && I0.channels() == 3 && I1.channels() == 3); - - Mat prevImage = prevImg.getMat(); - Mat nextImage = nextImg.getMat(); - Mat prevPtsMat = prevPts.getMat(); - - if (param.useInitialFlow == false) - nextPts.create(prevPtsMat.size(), prevPtsMat.type(), -1, true); - - int npoints = 0; - CV_Assert((npoints = prevPtsMat.checkVector(2, CV_32F, true)) >= 0); - - if (npoints == 0) - { - nextPts.release(); - status.release(); - err.release(); - return; - } - - Mat nextPtsMat = nextPts.getMat(); - CV_Assert(nextPtsMat.checkVector(2, CV_32F, true) == npoints); - std::vector prevPoints(npoints), nextPoints(npoints), refPoints; - prevPtsMat.copyTo(cv::Mat(1, npoints, CV_32FC2, &prevPoints[0])); - if (param.useInitialFlow ) - nextPtsMat.copyTo(cv::Mat(1, nextPtsMat.cols, CV_32FC2, &nextPoints[0])); - - cv::Mat statusMat; - cv::Mat errorMat; - if (status.needed() || forwardBackwardThreshold > 0) - { - status.create((int)npoints, 1, CV_8U, -1, true); - statusMat = status.getMat(); - statusMat.setTo(1); - } - - if (err.needed() || forwardBackwardThreshold > 0) - { - err.create((int)npoints, 1, CV_32F, -1, true); - errorMat = err.getMat(); - errorMat.setTo(0); - } - - calcLocalOpticalFlow(prevImage, nextImage, prevPyramid, currPyramid, prevPoints, nextPoints, param); - cv::Mat(1,npoints , CV_32FC2, &nextPoints[0]).copyTo(nextPtsMat); - if (forwardBackwardThreshold > 0) - { - // reuse image pyramids - calcLocalOpticalFlow(nextImage, prevImage, currPyramid, prevPyramid, nextPoints, refPoints, param); - } - for (unsigned int r = 0; r < refPoints.size(); r++) - { - Point2f diff = refPoints[r] - prevPoints[r]; - errorMat.at(r) = sqrt(diff.x * diff.x + diff.y * diff.y); - if (errorMat.at(r) <= forwardBackwardThreshold) - statusMat.at(r) = 0; - } - - } - - 
protected: - RLOFOpticalFlowParameter param; - float forwardBackwardThreshold; - Ptr prevPyramid[2]; - Ptr currPyramid[2]; - }; - - Ptr SparseRLOFOpticalFlow::create( - const RLOFOpticalFlowParameter & rlofParam, - float forwardBackwardThreshold) - { - Ptr algo = makePtr(); - algo->setRLOFOpticalFlowParameter(rlofParam); - algo->setForwardBackward(forwardBackwardThreshold); - return algo; - } - - void calcOpticalFlowDenseRLOF(InputArray I0, InputArray I1, InputOutputArray flow, - RLOFOpticalFlowParameter rlofParam , - float forewardBackwardThreshold, Size gridStep, - DenseRLOFOpticalFlow::InterpolationType interp_type, - int epicK, float epicSigma, - bool use_post_proc, float fgsLambda, float fgsSigma) - { - Ptr algo = DenseRLOFOpticalFlow::create( - rlofParam, forewardBackwardThreshold, gridStep, interp_type, - epicK, epicSigma, use_post_proc, fgsLambda, fgsSigma); - algo->calc(I0, I1, flow); - algo->collectGarbage(); - } - - void calcOpticalFlowSparseRLOF(InputArray prevImg, InputArray nextImg, - InputArray prevPts, InputOutputArray nextPts, - OutputArray status, OutputArray err, - RLOFOpticalFlowParameter rlofParam, - float forewardBackwardThreshold) - { - Ptr algo = SparseRLOFOpticalFlow::create( - rlofParam, forewardBackwardThreshold); - algo->calc(prevImg, nextImg, prevPts, nextPts, status, err); - } - Ptr createOptFlow_DenseRLOF() - { - return DenseRLOFOpticalFlow::create(); - } - - Ptr createOptFlow_SparseRLOF() - { - return SparseRLOFOpticalFlow::create(); - } - } +namespace optflow +{ + + class DenseOpticalFlowRLOFImpl : public DenseRLOFOpticalFlow + { + public: + DenseOpticalFlowRLOFImpl() + : forwardBackwardThreshold(1.f) + , gridStep(6, 6) + , interp_type(InterpolationType::INTERP_GEO) + , k(128) + , sigma(0.05f) + , lambda(999.f) + , fgs_lambda(500.0f) + , fgs_sigma(1.5f) + , use_post_proc(true) + + { + prevPyramid[0] = cv::Ptr< CImageBuffer>(new CImageBuffer); + prevPyramid[1] = cv::Ptr< CImageBuffer>(new CImageBuffer); + currPyramid[0] = cv::Ptr< 
CImageBuffer>(new CImageBuffer); + currPyramid[1] = cv::Ptr< CImageBuffer>(new CImageBuffer); + } + virtual void setRLOFOpticalFlowParameter(Ptr val) CV_OVERRIDE { }//param = *val; } + virtual Ptr getRLOFOpticalFlowParameter() const CV_OVERRIDE { return Ptr(); } + + virtual float getForwardBackward() const CV_OVERRIDE { return forwardBackwardThreshold; } + virtual void setForwardBackward(float val) CV_OVERRIDE { forwardBackwardThreshold = val; } + + virtual void setInterpolation(InterpolationType val) CV_OVERRIDE { interp_type = val; } + virtual InterpolationType getInterpolation() const CV_OVERRIDE { return interp_type; } + + virtual Size getGridStep() const CV_OVERRIDE { return gridStep; } + virtual void setGridStep(Size val) CV_OVERRIDE { gridStep = val; } + + virtual int getEPICK() const CV_OVERRIDE { return k; } + virtual void setEPICK(int val) CV_OVERRIDE { k = val; } + + virtual float getEPICSigma() const CV_OVERRIDE { return sigma; } + virtual void setEPICSigma(float val) CV_OVERRIDE { sigma = val; } + + virtual float getEPICLambda() const CV_OVERRIDE { return lambda; } + virtual void setEPICLambda(float val) CV_OVERRIDE { lambda = val; } + + virtual float getFgsLambda() const CV_OVERRIDE { return fgs_lambda; } + virtual void setFgsLambda(float val) CV_OVERRIDE { fgs_lambda = val; } + + virtual float getFgsSigma() const CV_OVERRIDE { return fgs_sigma; } + virtual void setFgsSigma(float val) CV_OVERRIDE { fgs_sigma = val; } + + virtual bool getUsePostProc() const CV_OVERRIDE { return use_post_proc; } + virtual void setUsePostProc(bool val) CV_OVERRIDE { use_post_proc = val; } + + virtual void calc(InputArray I0, InputArray I1, InputOutputArray flow) CV_OVERRIDE + { + CV_Assert(!I0.empty() && I0.depth() == CV_8U && (I0.channels() == 3 || I0.channels() == 1)); + CV_Assert(!I1.empty() && I1.depth() == CV_8U && (I1.channels() == 3 || I1.channels() == 1)); + CV_Assert(I0.sameSize(I1)); + if (param.supportRegionType == SR_CROSS) + CV_Assert( I0.channels() == 3 && 
I1.channels() == 3); + CV_Assert(interp_type == InterpolationType::INTERP_EPIC || interp_type == InterpolationType::INTERP_GEO); + + Mat prevImage = I0.getMat(); + Mat currImage = I1.getMat(); + int noPoints = prevImage.cols * prevImage.rows; + std::vector prevPoints(noPoints); + std::vector currPoints, refPoints; + noPoints = 0; + cv::Size grid_h = gridStep / 2; + for (int r = grid_h.height; r < prevImage.rows; r += gridStep.height) + { + for (int c = grid_h.width; c < prevImage.cols; c += gridStep.width) + { + prevPoints[noPoints++] = cv::Point2f(static_cast(c), static_cast(r)); + } + } + prevPoints.erase(prevPoints.begin() + noPoints, prevPoints.end()); + currPoints.resize(prevPoints.size()); + + calcLocalOpticalFlow(prevImage, currImage, prevPyramid, currPyramid, prevPoints, currPoints, param); + + flow.create(prevImage.size(), CV_32FC2); + Mat dense_flow = flow.getMat(); + + std::vector filtered_prevPoints; + std::vector filtered_currPoints; + if (gridStep == cv::Size(1, 1) && forwardBackwardThreshold <= 0) + { + for (unsigned int n = 0; n < prevPoints.size(); n++) + { + dense_flow.at(prevPoints[n]) = currPoints[n] - prevPoints[n]; + } + return; + } + if (forwardBackwardThreshold > 0) + { + // reuse image pyramids + calcLocalOpticalFlow(currImage, prevImage, currPyramid, prevPyramid, currPoints, refPoints, param); + + filtered_prevPoints.resize(prevPoints.size()); + filtered_currPoints.resize(prevPoints.size()); + float sqrForwardBackwardThreshold = forwardBackwardThreshold * forwardBackwardThreshold; + noPoints = 0; + for (unsigned int r = 0; r < refPoints.size(); r++) + { + Point2f diff = refPoints[r] - prevPoints[r]; + if (diff.x * diff.x + diff.y * diff.y < sqrForwardBackwardThreshold) + { + filtered_prevPoints[noPoints] = prevPoints[r]; + filtered_currPoints[noPoints++] = currPoints[r]; + } + } + + filtered_prevPoints.erase(filtered_prevPoints.begin() + noPoints, filtered_prevPoints.end()); + filtered_currPoints.erase(filtered_currPoints.begin() + 
noPoints, filtered_currPoints.end()); + + } + else + { + filtered_prevPoints = prevPoints; + filtered_currPoints = currPoints; + } + + if (interp_type == InterpolationType::INTERP_EPIC) + { + Ptr gd = ximgproc::createEdgeAwareInterpolator(); + gd->setK(k); + gd->setSigma(sigma); + gd->setLambda(lambda); + gd->setUsePostProcessing(false); + gd->interpolate(prevImage, filtered_prevPoints, currImage, filtered_currPoints, dense_flow); + } + else + { + Mat blurredPrevImage, blurredCurrImage; + GaussianBlur(prevImage, blurredPrevImage, cv::Size(5, 5), -1); + std::vector status(filtered_currPoints.size(), 1); + interpolate_irregular_nn_raster(filtered_prevPoints, filtered_currPoints, status, blurredPrevImage).copyTo(dense_flow); + std::vector vecMats; + std::vector vecMats2(2); + cv::split(dense_flow, vecMats); + cv::bilateralFilter(vecMats[0], vecMats2[0], 5, 2, 20); + cv::bilateralFilter(vecMats[1], vecMats2[1], 5, 2, 20); + cv::merge(vecMats2, dense_flow); + } + if (use_post_proc) + { + ximgproc::fastGlobalSmootherFilter(prevImage, flow, flow, fgs_lambda, fgs_sigma); + } + } + + virtual void collectGarbage() CV_OVERRIDE + { + prevPyramid[0].release(); + prevPyramid[1].release(); + currPyramid[0].release(); + currPyramid[1].release(); + } + + protected: + RLOFOpticalFlowParameter param; + float forwardBackwardThreshold; + Ptr prevPyramid[2]; + Ptr currPyramid[2]; + cv::Size gridStep; + InterpolationType interp_type; + int k; + float sigma; + float lambda; + float fgs_lambda; + float fgs_sigma; + bool use_post_proc; + }; + + Ptr DenseRLOFOpticalFlow::create( + Ptr rlofParam, + float forwardBackwardThreshold, + cv::Size gridStep, + InterpolationType interp_type, + int epicK, + float epicSigma, + float epicLambda, + float fgs_lambda, + float fgs_sigma) + { + Ptr algo = makePtr(); + algo->setRLOFOpticalFlowParameter(rlofParam); + algo->setForwardBackward(forwardBackwardThreshold); + algo->setGridStep(gridStep); + algo->setInterpolation(interp_type); + algo->setEPICK(epicK); 
+ algo->setEPICSigma(epicSigma); + algo->setEPICLambda(epicLambda); + algo->setFgsLambda(fgs_lambda); + algo->setFgsSigma(fgs_sigma); + return algo; + } + + class SparseRLOFOpticalFlowImpl : public SparseRLOFOpticalFlow + { + public: + SparseRLOFOpticalFlowImpl() + : forwardBackwardThreshold(1.f) + { + prevPyramid[0] = cv::Ptr< CImageBuffer>(new CImageBuffer); + prevPyramid[1] = cv::Ptr< CImageBuffer>(new CImageBuffer); + currPyramid[0] = cv::Ptr< CImageBuffer>(new CImageBuffer); + currPyramid[1] = cv::Ptr< CImageBuffer>(new CImageBuffer); + } + virtual void setRLOFOpticalFlowParameter(Ptr val) CV_OVERRIDE { } // param = *val; } + virtual Ptr getRLOFOpticalFlowParameter() const CV_OVERRIDE { return Ptr(); } + + virtual float getForwardBackward() const CV_OVERRIDE { return forwardBackwardThreshold; } + virtual void setForwardBackward(float val) CV_OVERRIDE { forwardBackwardThreshold = val; } + + virtual void calc(InputArray prevImg, InputArray nextImg, + InputArray prevPts, InputOutputArray nextPts, + OutputArray status, + OutputArray err) CV_OVERRIDE + { + CV_Assert(!prevImg.empty() && prevImg.depth() == CV_8U && (prevImg.channels() == 3 || prevImg.channels() == 1)); + CV_Assert(!nextImg.empty() && nextImg.depth() == CV_8U && (nextImg.channels() == 3 || nextImg.channels() == 1)); + CV_Assert(prevImg.sameSize(nextImg)); + + Mat prevImage = prevImg.getMat(); + Mat nextImage = nextImg.getMat(); + Mat prevPtsMat = prevPts.getMat(); + + if (param.useInitialFlow == false) + nextPts.create(prevPtsMat.size(), prevPtsMat.type(), -1, true); + + int npoints = 0; + CV_Assert((npoints = prevPtsMat.checkVector(2, CV_32F, true)) >= 0); + + if (npoints == 0) + { + nextPts.release(); + status.release(); + err.release(); + return; + } + + Mat nextPtsMat = nextPts.getMat(); + CV_Assert(nextPtsMat.checkVector(2, CV_32F, true) == npoints); + std::vector prevPoints(npoints), nextPoints(npoints), refPoints; + prevPtsMat.copyTo(cv::Mat(1, npoints, CV_32FC2, &prevPoints[0])); + if 
(param.useInitialFlow ) + nextPtsMat.copyTo(cv::Mat(1, nextPtsMat.cols, CV_32FC2, &nextPoints[0])); + + cv::Mat statusMat; + cv::Mat errorMat; + if (status.needed() || forwardBackwardThreshold > 0) + { + status.create((int)npoints, 1, CV_8U, -1, true); + statusMat = status.getMat(); + statusMat.setTo(1); + } + + if (err.needed() || forwardBackwardThreshold > 0) + { + err.create((int)npoints, 1, CV_32F, -1, true); + errorMat = err.getMat(); + errorMat.setTo(0); + } + + calcLocalOpticalFlow(prevImage, nextImage, prevPyramid, currPyramid, prevPoints, nextPoints, param); + cv::Mat(1,npoints , CV_32FC2, &nextPoints[0]).copyTo(nextPtsMat); + if (forwardBackwardThreshold > 0) + { + // reuse image pyramids + calcLocalOpticalFlow(nextImage, prevImage, currPyramid, prevPyramid, nextPoints, refPoints, param); + } + for (unsigned int r = 0; r < refPoints.size(); r++) + { + Point2f diff = refPoints[r] - prevPoints[r]; + errorMat.at(r) = sqrt(diff.x * diff.x + diff.y * diff.y); + if (errorMat.at(r) <= forwardBackwardThreshold) + statusMat.at(r) = 0; + } + + } + + protected: + RLOFOpticalFlowParameter param; + float forwardBackwardThreshold; + Ptr prevPyramid[2]; + Ptr currPyramid[2]; + }; + + Ptr SparseRLOFOpticalFlow::create( + Ptr rlofParam, + float forwardBackwardThreshold) + { + Ptr algo = makePtr(); + algo->setRLOFOpticalFlowParameter(rlofParam); + algo->setForwardBackward(forwardBackwardThreshold); + return algo; + } + + void calcOpticalFlowDenseRLOF(InputArray I0, InputArray I1, InputOutputArray flow, + Ptr rlofParam , + float forewardBackwardThreshold, Size gridStep, + InterpolationType interp_type, + int epicK, float epicSigma, + bool use_post_proc, float fgsLambda, float fgsSigma) + { + Ptr algo = DenseRLOFOpticalFlow::create( + rlofParam, forewardBackwardThreshold, gridStep, interp_type, + epicK, epicSigma, use_post_proc, fgsLambda, fgsSigma); + algo->calc(I0, I1, flow); + algo->collectGarbage(); + } + + void calcOpticalFlowSparseRLOF(InputArray prevImg, InputArray 
nextImg, + InputArray prevPts, InputOutputArray nextPts, + OutputArray status, OutputArray err, + Ptr rlofParam, + float forewardBackwardThreshold) + { + Ptr algo = SparseRLOFOpticalFlow::create( + rlofParam, forewardBackwardThreshold); + algo->calc(prevImg, nextImg, prevPts, nextPts, status, err); + } + Ptr createOptFlow_DenseRLOF() + { + return DenseRLOFOpticalFlow::create(); + } + + Ptr createOptFlow_SparseRLOF() + { + return SparseRLOFOpticalFlow::create(); + } +} } \ No newline at end of file diff --git a/modules/optflow/test/test_OF_accuracy.cpp b/modules/optflow/test/test_OF_accuracy.cpp index d2aedf2573a..b517896e045 100644 --- a/modules/optflow/test/test_OF_accuracy.cpp +++ b/modules/optflow/test/test_OF_accuracy.cpp @@ -85,17 +85,17 @@ static float calcRMSE(Mat flow1, Mat flow2) } static float calcRMSE(vector prevPts, vector currPts, Mat flow) { - vector ee; - for (unsigned int n = 0; n < prevPts.size(); n++) - { - Point2f gtFlow = flow.at(prevPts[n]); - if (isFlowCorrect(gtFlow.x) && isFlowCorrect(gtFlow.y)) - { - Point2f diffFlow = (currPts[n] - prevPts[n]) - gtFlow; - ee.push_back(sqrt(diffFlow.x * diffFlow.x + diffFlow.y * diffFlow.y)); - } - } - return mean(ee).val[0]; + vector ee; + for (unsigned int n = 0; n < prevPts.size(); n++) + { + Point2f gtFlow = flow.at(prevPts[n]); + if (isFlowCorrect(gtFlow.x) && isFlowCorrect(gtFlow.y)) + { + Point2f diffFlow = (currPts[n] - prevPts[n]) - gtFlow; + ee.push_back(sqrt(diffFlow.x * diffFlow.x + diffFlow.y * diffFlow.y)); + } + } + return mean(ee).val[0]; } static float calcAvgEPE(vector< pair > corr, Mat flow) { @@ -127,10 +127,10 @@ bool readRubberWhale(Mat &dst_frame_1, Mat &dst_frame_2, Mat &dst_GT) string frame1_path = getRubberWhaleFrame1(); string frame2_path = getRubberWhaleFrame2(); string gt_flow_path = getRubberWhaleGroundTruth(); - // removing space may be an issue on windows machines - frame1_path.erase(std::remove_if(frame1_path.begin(), frame1_path.end(), isspace), frame1_path.end()); - 
frame2_path.erase(std::remove_if(frame2_path.begin(), frame2_path.end(), isspace), frame2_path.end()); - gt_flow_path.erase(std::remove_if(gt_flow_path.begin(), gt_flow_path.end(), isspace), gt_flow_path.end()); + // removing space may be an issue on windows machines + frame1_path.erase(std::remove_if(frame1_path.begin(), frame1_path.end(), isspace), frame1_path.end()); + frame2_path.erase(std::remove_if(frame2_path.begin(), frame2_path.end(), isspace), frame2_path.end()); + gt_flow_path.erase(std::remove_if(gt_flow_path.begin(), gt_flow_path.end(), isspace), gt_flow_path.end()); dst_frame_1 = imread(frame1_path); dst_frame_2 = imread(frame2_path); @@ -177,103 +177,103 @@ TEST(DenseOpticalFlow_DeepFlow, ReferenceAccuracy) TEST(SparseOpticalFlow, ReferenceAccuracy) { - // with the following test each invoker class should be tested once - Mat frame1, frame2, GT; - ASSERT_TRUE(readRubberWhale(frame1, frame2, GT)); - vector prevPts, currPts; - - for (int r = 0; r < frame1.rows; r+=10) - { - for (int c = 0; c < frame1.cols; c+=10) - { - prevPts.push_back(Point2f(static_cast(c), static_cast(r))); - } - } - vector status(prevPts.size()); - vector err(prevPts.size()); - Ptr algo = SparseRLOFOpticalFlow::create(); - algo->setForwardBackward(0.0f); - - RLOFOpticalFlowParameter param; - param.supportRegionType = RLOFOpticalFlowParameter::SR_CROSS; - param.useIlluminationModel = true; - - param.solverType = RLOFOpticalFlowParameter::ST_BILINEAR; - algo->setRLOFOpticalFlowParameter(param); - algo->calc(frame1, frame2, prevPts, currPts, status, err); - EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.22f); - - param.solverType = RLOFOpticalFlowParameter::ST_STANDART; - algo->setRLOFOpticalFlowParameter(param); - algo->calc(frame1, frame2, prevPts, currPts, status, err); - EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.34f); - - param.useIlluminationModel = false; - - param.solverType = RLOFOpticalFlowParameter::ST_BILINEAR; - algo->setRLOFOpticalFlowParameter(param); - 
algo->calc(frame1, frame2, prevPts, currPts, status, err); - EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.24f); - - param.solverType = RLOFOpticalFlowParameter::ST_STANDART; - algo->setRLOFOpticalFlowParameter(param); - algo->calc(frame1, frame2, prevPts, currPts, status, err); - EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.23f); - - param.normSigma0 = numeric_limits::max(); - param.normSigma1 = numeric_limits::max(); - param.useIlluminationModel = true; - - param.solverType = RLOFOpticalFlowParameter::ST_BILINEAR; - algo->setRLOFOpticalFlowParameter(param); - algo->calc(frame1, frame2, prevPts, currPts, status, err); - EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.26f); - - param.solverType = RLOFOpticalFlowParameter::ST_STANDART; - algo->setRLOFOpticalFlowParameter(param); - algo->calc(frame1, frame2, prevPts, currPts, status, err); - EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.26f); - - param.useIlluminationModel = false; - - param.solverType = RLOFOpticalFlowParameter::ST_BILINEAR; - algo->setRLOFOpticalFlowParameter(param); - algo->calc(frame1, frame2, prevPts, currPts, status, err); - EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.25f); - - param.solverType = RLOFOpticalFlowParameter::ST_STANDART; - algo->setRLOFOpticalFlowParameter(param); - algo->calc(frame1, frame2, prevPts, currPts, status, err); - EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.26f); + // with the following test each invoker class should be tested once + Mat frame1, frame2, GT; + ASSERT_TRUE(readRubberWhale(frame1, frame2, GT)); + vector prevPts, currPts; + + for (int r = 0; r < frame1.rows; r+=10) + { + for (int c = 0; c < frame1.cols; c+=10) + { + prevPts.push_back(Point2f(static_cast(c), static_cast(r))); + } + } + vector status(prevPts.size()); + vector err(prevPts.size()); + Ptr algo = SparseRLOFOpticalFlow::create(); + algo->setForwardBackward(0.0f); + + Ptr param; + param->supportRegionType = SR_CROSS; + param->useIlluminationModel = true; + + param->solverType = ST_BILINEAR; + 
algo->setRLOFOpticalFlowParameter(param); + algo->calc(frame1, frame2, prevPts, currPts, status, err); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.22f); + + param->solverType = ST_STANDART; + algo->setRLOFOpticalFlowParameter(param); + algo->calc(frame1, frame2, prevPts, currPts, status, err); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.34f); + + param->useIlluminationModel = false; + + param->solverType = ST_BILINEAR; + algo->setRLOFOpticalFlowParameter(param); + algo->calc(frame1, frame2, prevPts, currPts, status, err); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.24f); + + param->solverType = ST_STANDART; + algo->setRLOFOpticalFlowParameter(param); + algo->calc(frame1, frame2, prevPts, currPts, status, err); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.23f); + + param->normSigma0 = numeric_limits::max(); + param->normSigma1 = numeric_limits::max(); + param->useIlluminationModel = true; + + param->solverType = ST_BILINEAR; + algo->setRLOFOpticalFlowParameter(param); + algo->calc(frame1, frame2, prevPts, currPts, status, err); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.26f); + + param->solverType = ST_STANDART; + algo->setRLOFOpticalFlowParameter(param); + algo->calc(frame1, frame2, prevPts, currPts, status, err); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.26f); + + param->useIlluminationModel = false; + + param->solverType = ST_BILINEAR; + algo->setRLOFOpticalFlowParameter(param); + algo->calc(frame1, frame2, prevPts, currPts, status, err); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.25f); + + param->solverType = ST_STANDART; + algo->setRLOFOpticalFlowParameter(param); + algo->calc(frame1, frame2, prevPts, currPts, status, err); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.26f); } TEST(DenseOpticalFlow_RLOF, ReferenceAccuracy) { - Mat frame1, frame2, GT; - ASSERT_TRUE(readRubberWhale(frame1, frame2, GT)); - - Mat flow; - Ptr algo = DenseRLOFOpticalFlow::create(); - RLOFOpticalFlowParameter param; - param.supportRegionType = 
RLOFOpticalFlowParameter::SR_CROSS; - param.solverType = RLOFOpticalFlowParameter::ST_BILINEAR; - algo->setRLOFOpticalFlowParameter(param); - algo->setForwardBackward(1.0f); - algo->setGridStep(cv::Size(4, 4)); - - algo->setInterpolation(DenseRLOFOpticalFlow::INTERP_EPIC); - algo->calc(frame1, frame2, flow); - - ASSERT_EQ(GT.rows, flow.rows); - ASSERT_EQ(GT.cols, flow.cols); - EXPECT_LE(calcRMSE(GT, flow), 0.36f); - - algo->setInterpolation(DenseRLOFOpticalFlow::INTERP_GEO); - algo->calc(frame1, frame2, flow); - - ASSERT_EQ(GT.rows, flow.rows); - ASSERT_EQ(GT.cols, flow.cols); - EXPECT_LE(calcRMSE(GT, flow), 0.49f); + Mat frame1, frame2, GT; + ASSERT_TRUE(readRubberWhale(frame1, frame2, GT)); + + Mat flow; + Ptr algo = DenseRLOFOpticalFlow::create(); + Ptr param; + param->supportRegionType = SR_CROSS; + param->solverType = ST_BILINEAR; + algo->setRLOFOpticalFlowParameter(param); + algo->setForwardBackward(1.0f); + algo->setGridStep(cv::Size(4, 4)); + + algo->setInterpolation(INTERP_EPIC); + algo->calc(frame1, frame2, flow); + + ASSERT_EQ(GT.rows, flow.rows); + ASSERT_EQ(GT.cols, flow.cols); + EXPECT_LE(calcRMSE(GT, flow), 0.36f); + + algo->setInterpolation(INTERP_GEO); + algo->calc(frame1, frame2, flow); + + ASSERT_EQ(GT.rows, flow.rows); + ASSERT_EQ(GT.cols, flow.cols); + EXPECT_LE(calcRMSE(GT, flow), 0.49f); } From 9b11b49ac5de723dd7812dbd7a7550f07e21a516 Mon Sep 17 00:00:00 2001 From: senst Date: Tue, 11 Dec 2018 01:55:37 +0100 Subject: [PATCH 03/25] introducing precompiler flag RLOD_SSE --- modules/optflow/doc/optflow.bib | 16 +- .../include/opencv2/optflow/rlofflow.hpp | 147 +++++++++--------- modules/optflow/src/rlof/berlof_invoker.hpp | 28 ++-- modules/optflow/src/rlof/plk_invoker.hpp | 29 ++-- modules/optflow/src/rlof/rlof_invoker.hpp | 57 ++++--- modules/optflow/src/rlof/rlof_invokerbase.hpp | 7 + modules/optflow/src/rlof/rlof_localflow.cpp | 4 +- modules/optflow/src/rlof/rlof_localflow.h | 5 +- modules/optflow/src/rlofflow.cpp | 32 ++-- 9 files changed, 
168 insertions(+), 157 deletions(-) diff --git a/modules/optflow/doc/optflow.bib b/modules/optflow/doc/optflow.bib index 746cecb6732..5d4a47c30a9 100644 --- a/modules/optflow/doc/optflow.bib +++ b/modules/optflow/doc/optflow.bib @@ -94,15 +94,15 @@ @inproceedings{Senst2013 pages = {2499--2503}, } -@article{Senst2012, - author = {Tobias Senst and Volker Eiselein and Thomas Sikora}, - title = {Robust Local Optical Flow for Feature Tracking}, +@article{Senst2012, + author = {Tobias Senst and Volker Eiselein and Thomas Sikora}, + title = {Robust Local Optical Flow for Feature Tracking}, journal = {IEEE Transactions on Circuits and Systems for Video Technology}, - year = {2012}, - pages = {1377--1387}, - volume = {22}, - number = {9}, -} + year = {2012}, + pages = {1377--1387}, + volume = {22}, + number = {9}, +} @article{Tibshirani2008, title={Fast computation of the median by successive binning}, diff --git a/modules/optflow/include/opencv2/optflow/rlofflow.hpp b/modules/optflow/include/opencv2/optflow/rlofflow.hpp index d1a3d6f53ad..f088562215e 100644 --- a/modules/optflow/include/opencv2/optflow/rlofflow.hpp +++ b/modules/optflow/include/opencv2/optflow/rlofflow.hpp @@ -14,29 +14,29 @@ namespace optflow //! @addtogroup optflow //! @{ - /** @brief This is used store and set up the parameters of the robust local optical flow (RLOF) algoritm. + /** @brief This is used store and set up the parameters of the robust local optical flow (RLOF) algoritm. * - * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as - * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). - * This RLOF implementation can be seen as an improved pyramidal iterative Lucas-Kanade and includes + * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as + * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). 
+ * This RLOF implementation can be seen as an improved pyramidal iterative Lucas-Kanade and includes * a set of improving modules. The main improvements in respect to the pyramidal iterative Lucas-Kanade * are: - * - A more robust redecending M-estimator framework (see @cite Senst2012) to improve the accuracy at + * - A more robust redecending M-estimator framework (see @cite Senst2012) to improve the accuracy at * motion boundaries and appearing and disappearing pixels. - * - an adaptive support region strategies to improve the accuracy at motion boundaries to reduce the - * corona effect, i.e oversmoothing of the PLK at motion/object boundaries. The cross-based segementation - * strategy (SR_CROSS) proposed in @cite Senst2014 uses a simple segmenation approach to obtain the optimal + * - an adaptive support region strategies to improve the accuracy at motion boundaries to reduce the + * corona effect, i.e oversmoothing of the PLK at motion/object boundaries. The cross-based segementation + * strategy (SR_CROSS) proposed in @cite Senst2014 uses a simple segmenation approach to obtain the optimal * shape of the support region. - * - To deal with illumination changes (outdoor sequences and shadow) the intensity constancy assumption - * based optical flow equation has been adopt with the Gennert and Negahdaripour illumination model + * - To deal with illumination changes (outdoor sequences and shadow) the intensity constancy assumption + * based optical flow equation has been adopt with the Gennert and Negahdaripour illumination model * (see @cite Senst2016). This model can be switched on/off with the useIlluminationModel variable. * - By using a global motion prior initialization (see @cite Senst2016) of the iterative refinement * the accuracy could be significantly improved for large displacements. This initialization can be * switched on and of with useGlobalMotionPrior variable. 
* - * The RLOF can be computed with the SparseOpticalFlow class or function interface to track a set of features - * or with the DenseOpticalFlow class or function interface to compute dense optical flow. - * + * The RLOF can be computed with the SparseOpticalFlow class or function interface to track a set of features + * or with the DenseOpticalFlow class or function interface to compute dense optical flow. + * * @see optflow::DenseRLOFOpticalFlow, optflow::calcOpticalFlowDenseRLOF(), optflow::SparseRLOFOpticalFlow, optflow::calcOpticalFlowSparseRLOF() */ enum SupportRegionType { @@ -54,7 +54,7 @@ namespace optflow enum InterpolationType { - INTERP_GEO = 0, /**< Fast geodesic interpolation, see @cite Geister2016 */ + INTERP_GEO = 0, /**< Fast geodesic interpolation, see @cite Geistert2016 */ INTERP_EPIC = 1, /**< Edge-preserving interpolation, see @cite Revaud2015,Geistert2016. */ }; @@ -86,14 +86,14 @@ namespace optflow */ float normSigma0; - /**< \$ \sigma_0 \$ paramter of the shrinked Hampel norm introduced in @cite Senst2012. If - * \$ \sigma_0 \$ = std::numeric_limist::max() the least-square estimator will be used + /**< &sigma paramter of the shrinked Hampel norm introduced in @cite Senst2012. If + * &sigma = std::numeric_limist::max() the least-square estimator will be used * instead of the M-estimator. Althoug M-estimator is more robust against outlier in the support * region the least-square can be fast in computation. */ float normSigma1; - /**< \$ \sigma_1 \$ paramter of the shrinked Hampel norm introduced in @cite Senst2012. If - * \$ \sigma_1 \$ = std::numeric_limist::max() the least-square estimator will be used + /**< &sigma paramter of the shrinked Hampel norm introduced in @cite Senst2012. If + * &sigma = std::numeric_limist::max() the least-square estimator will be used * instead of the M-estimator. Althoug M-estimator is more robust against outlier in the support * region the least-square can be fast in computation. 
*/ @@ -122,7 +122,7 @@ namespace optflow bool useIlluminationModel; /**< Use the Gennert and Negahdaripour illumination model instead of the intensity brigthness * constraint. (Probosed in @cite Senst2016 ) This model is defined as follow: - * \f[ I(\mathbf{x},t) + m \cdot I(\mathbf{x},t) + c = I(\mathbf{x},t+1) \f[ + * \f[ I(\mathbf{x},t) + m \cdot I(\mathbf{x},t) + c = I(\mathbf{x},t+1) \f] * and contains with m and c a multiplicative and additive term which makes the estimate * more robust against illumination changes. The computational complexity is increased by * enabling the illumination model. @@ -148,15 +148,15 @@ namespace optflow */ }; - /** @brief Fast dense optical flow computation based on robust local optical flow (RLOF) algorithms and sparse-to-dense interpolation + /** @brief Fast dense optical flow computation based on robust local optical flow (RLOF) algorithms and sparse-to-dense interpolation * scheme. * - * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as - * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). + * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as + * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). * * The sparse-to-dense interpolation scheme allows for fast computation of dense optical flow using RLOF (see @cite Geistert2016). * For this scheme the following steps are applied: - * -# motion vector seeded at a regular sampled grid are computed. The sparsity of this grid can be configured with setGridStep + * -# motion vector seeded at a regular sampled grid are computed. The sparsity of this grid can be configured with setGridStep * -# (optinally) errornous motion vectors are filter based on the forward backward confidence. The threshold can be configured * with setForwardBackward. 
The filter is only applied if the threshold >0 but than the runtime is doubled due to the estimation * of the backward flow. @@ -165,7 +165,7 @@ namespace optflow * For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details. * Parameters have been described in @cite Senst2012 @cite Senst2013 @cite Senst2014 and @cite Senst2016. * - * @note If the grid size is set to (1,1) and the forward backward threshold <= 0 that the dense optical flow field is purely. + * @note SIMD parallelization is available compiled with SSE4.1. If the grid size is set to (1,1) and the forward backward threshold <= 0 that the dense optical flow field is purely. * computed with the RLOF. * * @see optflow::calcOpticalFlowDenseRLOF(), optflow::RLOFOpticalFlowParameter @@ -174,43 +174,43 @@ namespace optflow { public: //! @brief Configuration of the RLOF alogrithm. - /** - @see optflow::RLOFOpticalFlowParameter, getRLOFOpticalFlowParameter + /** + @see optflow::RLOFOpticalFlowParameter, getRLOFOpticalFlowParameter */ CV_WRAP virtual void setRLOFOpticalFlowParameter(Ptr val) = 0; - /** @copybrief setRLOFOpticalFlowParameter - @see optflow::RLOFOpticalFlowParameter, setRLOFOpticalFlowParameter + /** @copybrief setRLOFOpticalFlowParameter + @see optflow::RLOFOpticalFlowParameter, setRLOFOpticalFlowParameter */ CV_WRAP virtual Ptr getRLOFOpticalFlowParameter() const = 0; //! @brief Threshold for the forward backward confidence check - /**For each grid point \f$ \mathbf{x} \f$ a motion vector \f$ d_{I0,I1}(\mathbf{x}) \f$ is computed. + /**For each grid point \f$ \mathbf{x} \f$ a motion vector \f$ d_{I0,I1}(\mathbf{x}) \f$ is computed. * If the forward backward error \f[ EP_{FB} = || d_{I0,I1} + d_{I1,I0} || \f] - * is larger than threshold given by this function then the motion vector will not be used by the following + * is larger than threshold given by this function then the motion vector will not be used by the following * vector field interpolation. 
\f$ d_{I1,I0} \f$ denotes the backward flow. Note, the forward backward test * will only be applied if the threshold > 0. This may results into a doubled runtime for the motion estimation. * @see getForwardBackward, setGridStep */ CV_WRAP virtual void setForwardBackward(float val) = 0; /** @copybrief setForwardBackward - @see setForwardBackward + @see setForwardBackward */ CV_WRAP virtual float getForwardBackward() const = 0; //! @brief Size of the grid to spawn the motion vectors. - /** For each grid point a motion vector is computed. Some motion vectors will be removed due to the forwatd backward + /** For each grid point a motion vector is computed. Some motion vectors will be removed due to the forwatd backward * threshold (if set >0). The rest will be the base of the vector field interpolation. * @see getForwardBackward, setGridStep */ CV_WRAP virtual Size getGridStep() const = 0; /** @copybrief getGridStep - * @see getGridStep + * @see getGridStep */ CV_WRAP virtual void setGridStep(Size val) = 0; //! @brief Interpolation used to compute the dense optical flow. /** Two interpolation algorithms have been supported - * - **INTERP_GEO** applies the fast geodesic interpolation, see @cite Geister2016. + * - **INTERP_GEO** applies the fast geodesic interpolation, see @cite Geistert2016. * - **INTERP_EPIC_RESIDUAL** applies the edge-preserving interpolation, see @cite Revaud2015,Geistert2016. - * @see ximgproc::EdgeAwareInterpolator, getInterpolation + * @see ximgproc::EdgeAwareInterpolator, getInterpolation */ CV_WRAP virtual void setInterpolation(InterpolationType val) = 0; /** @copybrief setInterpolation @@ -224,7 +224,7 @@ namespace optflow */ CV_WRAP virtual int getEPICK() const = 0; /** @copybrief getEPICK - * @see ximgproc::EdgeAwareInterpolator, getEPICK + * @see ximgproc::EdgeAwareInterpolator, getEPICK */ CV_WRAP virtual void setEPICK(int val) = 0; //! @brief see ximgproc::EdgeAwareInterpolator() sigma value. 
@@ -235,7 +235,7 @@ namespace optflow */ CV_WRAP virtual float getEPICSigma() const = 0; /** @copybrief getEPICSigma - * @see ximgproc::EdgeAwareInterpolator, getEPICSigma + * @see ximgproc::EdgeAwareInterpolator, getEPICSigma */ CV_WRAP virtual void setEPICSigma(float val) = 0; //! @brief see ximgproc::EdgeAwareInterpolator() lambda value. @@ -245,7 +245,7 @@ namespace optflow */ CV_WRAP virtual float getEPICLambda() const = 0; /** @copybrief getEPICLambda - * @see ximgproc::EdgeAwareInterpolator, getEPICLambda + * @see ximgproc::EdgeAwareInterpolator, getEPICLambda */ CV_WRAP virtual void setEPICLambda(float val) = 0; //! @brief see ximgproc::EdgeAwareInterpolator(). @@ -254,16 +254,16 @@ namespace optflow */ CV_WRAP virtual float getFgsLambda() const = 0; /** @copybrief getFgsLambda - * @see ximgproc::EdgeAwareInterpolator, ximgproc::fastGlobalSmootherFilter, getFgsLambda + * @see ximgproc::EdgeAwareInterpolator, ximgproc::fastGlobalSmootherFilter, getFgsLambda */ CV_WRAP virtual void setFgsLambda(float val) = 0; //! @brief see ximgproc::EdgeAwareInterpolator(). /** Sets the respective fastGlobalSmootherFilter() parameter. - * @see ximgproc::EdgeAwareInterpolator, ximgproc::fastGlobalSmootherFilter, setFgsSigma + * @see ximgproc::EdgeAwareInterpolator, ximgproc::fastGlobalSmootherFilter, setFgsSigma */ CV_WRAP virtual float getFgsSigma() const = 0; /** @copybrief getFgsSigma - * @see ximgproc::EdgeAwareInterpolator, ximgproc::fastGlobalSmootherFilter, getFgsSigma + * @see ximgproc::EdgeAwareInterpolator, ximgproc::fastGlobalSmootherFilter, getFgsSigma */ CV_WRAP virtual void setFgsSigma(float val) = 0; //! @brief enables ximgproc::fastGlobalSmootherFilter @@ -275,7 +275,7 @@ namespace optflow * @see ximgproc::fastGlobalSmootherFilter, setUsePostProc */ CV_WRAP virtual bool getUsePostProc() const = 0; - //! @brief Creates instance of optflow::DenseRLOFOpticalFlow + //! 
@brief Creates instance of optflow::DenseRLOFOpticalFlow /** * @param rlofParam see optflow::RLOFOpticalFlowParameter * @param forwardBackwardThreshold see setForwardBackward @@ -283,6 +283,7 @@ namespace optflow * @param epicK see setEPICK * @param epicSigma see setEPICSigma * @param epicLambda see setEPICLambda + * @param use_post_proc see setUsePostProc * @param fgsLambda see setFgsLambda * @param fgsSigma see setFgsSigma */ @@ -294,19 +295,21 @@ namespace optflow int epicK = 128, float epicSigma = 0.05f, float epicLambda = 999.0f, + bool use_post_proc = true, float fgsLambda = 500.0f, float fgsSigma = 1.5f); }; /** @brief Class used for calculation sparse optical flow and feature tracking with robust local optical flow (RLOF) algorithms. * - * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as - * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). + * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as + * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). * - * For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details. + * For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details. * Parameters have been described in @cite Senst2012, @cite Senst2013, @cite Senst2014 and @cite Senst2016. - @sa optflow::calcOpticalFlowSparseRLOF(), optflow::RLOFOpticalFlowParameter - + * + * @note SIMD parallelization is available compiled with SSE4.1. + *@see optflow::calcOpticalFlowSparseRLOF(), optflow::RLOFOpticalFlowParameter */ class CV_EXPORTS_W SparseRLOFOpticalFlow : public SparseOpticalFlow { @@ -331,8 +334,8 @@ namespace optflow * @see setForwardBackward */ CV_WRAP virtual float getForwardBackward() const = 0; - - //! @brief Creates instance of SparseRLOFOpticalFlow + + //! 
@brief Creates instance of SparseRLOFOpticalFlow /** * @param rlofParam see setRLOFOpticalFlowParameter * @param forwardBackwardThreshold see setForwardBackward @@ -345,8 +348,8 @@ namespace optflow /** @brief Fast dense optical flow computation based on robust local optical flow (RLOF) algorithms and sparse-to-dense interpolation scheme. * - * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as - * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). + * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as + * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). * * The sparse-to-dense interpolation scheme allows for fast computation of dense optical flow using RLOF (see @cite Geistert2016). * For this scheme the following steps are applied: @@ -356,20 +359,20 @@ namespace optflow * of the backward flow. * -# Vector field interpolation is applied to the motion vector set to obtain a dense vector field. * - * @param I0 first 8-bit input image. If The cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType - * = SupportRegionType::SR_CROSS) image has to be a 8-bit 3 channel image. - * @param I1 second 8-bit input image. If The cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType + * @param I0 first 8-bit input image. If The cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType + * = SupportRegionType::SR_CROSS) image has to be a 8-bit 3 channel image. + * @param I1 second 8-bit input image. If The cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType * = SupportRegionType::SR_CROSS) image has to be a 8-bit 3 channel image. * @param flow computed flow image that has the same size as I0 and type CV_32FC2. 
* @param rlofParam see optflow::RLOFOpticalFlowParameter - * @param forwardBackwardThreshold Threshold for the forward backward confidence check. - * For each grid point \f$ \mathbf{x} \f$ a motion vector \f$ d_{I0,I1}(\mathbf{x}) \f$ is computed. + * @param forewardBackwardThreshold Threshold for the forward backward confidence check. + * For each grid point \f$ \mathbf{x} \f$ a motion vector \f$ d_{I0,I1}(\mathbf{x}) \f$ is computed. * If the forward backward error \f[ EP_{FB} = || d_{I0,I1} + d_{I1,I0} || \f] - * is larger than threshold given by this function then the motion vector will not be used by the following + * is larger than threshold given by this function then the motion vector will not be used by the following * vector field interpolation. \f$ d_{I1,I0} \f$ denotes the backward flow. Note, the forward backward test * will only be applied if the threshold > 0. This may results into a doubled runtime for the motion estimation. - * @param gridStep Size of the grid to spawn the motion vectors. For each grid point a motion vector is computed. - * Some motion vectors will be removed due to the forwatd backward threshold (if set >0). The rest will be the + * @param gridStep Size of the grid to spawn the motion vectors. For each grid point a motion vector is computed. + * Some motion vectors will be removed due to the forwatd backward threshold (if set >0). The rest will be the * base of the vector field interpolation. * @param epicK see ximgproc::EdgeAwareInterpolator() sets the respective parameter. * @param epicSigma see ximgproc::EdgeAwareInterpolator() sets the respective parameter. @@ -383,35 +386,39 @@ namespace optflow * @note If the grid size is set to (1,1) and the forward backward threshold <= 0 that the dense optical flow field is purely * computed with the RLOF. * + * @note SIMD parallelization is available compiled with SSE4.1. 
+ * * @sa optflow::DenseRLOFOpticalFlow, optflow::RLOFOpticalFlowParameter */ CV_EXPORTS_W void calcOpticalFlowDenseRLOF(InputArray I0, InputArray I1, InputOutputArray flow, Ptr rlofParam = Ptr(), float forewardBackwardThreshold = 0, Size gridStep = Size(6, 6), InterpolationType interp_type = InterpolationType::INTERP_EPIC, - int epicK = 128, float epicSigma = 0.05f, + int epicK = 128, float epicSigma = 0.05f, float epicLambda = 100.f, bool use_post_proc = true, float fgsLambda = 500.0f, float fgsSigma = 1.5f); - /** @brief Calculates fast optical flow for a sparse feature set using the robust local optical flow (RLOF) similar + /** @brief Calculates fast optical flow for a sparse feature set using the robust local optical flow (RLOF) similar * to optflow::calcOpticalFlowPyrLK(). * * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). * - * @param I0 first 8-bit input image. If The cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType + * @param prevImg first 8-bit input image. If The cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType * = SupportRegionType::SR_CROSS) image has to be a 8-bit 3 channel image. - * @param I1 second 8-bit input image. If The cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType + * @param nextImg second 8-bit input image. If The cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType * = SupportRegionType::SR_CROSS) image has to be a 8-bit 3 channel image. - * @param prevPts vector of 2D points for which the flow needs to be found; point coordinates must be single-precision + * @param prevPts vector of 2D points for which the flow needs to be found; point coordinates must be single-precision * floating-point numbers. 
- * @param nextPts output vector of 2D points (with single-precision floating-point coordinates) containing the calculated - * new positions of input features in the second image; when optflow::RLOFOpticalFlowParameter::useInitialFlow variable is true the vector must + * @param nextPts output vector of 2D points (with single-precision floating-point coordinates) containing the calculated + * new positions of input features in the second image; when optflow::RLOFOpticalFlowParameter::useInitialFlow variable is true the vector must * have the same size as in the input and contain the initialization point correspondences. - * @param status output status vector (of unsigned chars); each element of the vector is set to 1 if the flow for the - * corresponding features has passed the forward backward check. - * @param err output vector of errors; each element of the vector is set to the forward backward error for the corresponding feature. + * @param status output status vector (of unsigned chars); each element of the vector is set to 1 if the flow for the + * corresponding features has passed the forward backward check. + * @param err output vector of errors; each element of the vector is set to the forward backward error for the corresponding feature. * @param rlofParam see optflow::RLOFOpticalFlowParameter - * @param forwardBackwardThreshold Threshold for the forward backward confidence check. If forewardBackwardThreshold <=0 the forward + * @param forewardBackwardThreshold Threshold for the forward backward confidence check. If forewardBackwardThreshold <=0 the forward + * + * @note SIMD parallelization is available compiled with SSE4.1. * * Parameters have been described in @cite Senst2012, @cite Senst2013, @cite Senst2014 and @cite Senst2016. * For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details. 
diff --git a/modules/optflow/src/rlof/berlof_invoker.hpp b/modules/optflow/src/rlof/berlof_invoker.hpp index 437e807d810..93ccb94fd89 100644 --- a/modules/optflow/src/rlof/berlof_invoker.hpp +++ b/modules/optflow/src/rlof/berlof_invoker.hpp @@ -708,7 +708,7 @@ namespace berlof float dI = 0; // I^2 float D = 0; - #ifdef CV_SSE4_1 + #ifdef RLOF_SSE __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); @@ -734,7 +734,7 @@ namespace berlof - #ifdef CV_SSE4_1 + #ifdef RLOF_SSE for( ; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 ) { __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16); @@ -906,7 +906,7 @@ namespace berlof fParam0 = param[0] * MEstimatorScale; fParam1 = param[1] * MEstimatorScale; -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); @@ -952,7 +952,7 @@ namespace berlof const short* dIptr = (const short*)(derivIWinBuf.data + y*derivIWinBuf.step); x = 0; -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE for( ; x <= _winSize.width*cn; x += 8, dIptr += 8*2 ) { __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16); @@ -1269,7 +1269,7 @@ namespace berlof #endif } -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE short etaValues[8]; _mm_storeu_si128((__m128i*)(etaValues), mmEta); MEstimatorScale += eta * (etaValues[0] + etaValues[1] + etaValues[2] + etaValues[3] @@ -1279,7 +1279,7 @@ namespace berlof float CV_DECL_ALIGNED(32) wbuf[4];// if( j == 0 ) { -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE _mm_store_ps(wbuf, mmSumW1); w1 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; _mm_store_ps(wbuf, mmSumW2); @@ -1304,7 +1304,7 @@ namespace berlof dI *= FLT_SCALE; -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];// _mm_store_ps(A11buf, mmAxx); @@ -1362,7 +1362,7 @@ namespace berlof } - #ifdef CV_SSE4_1 
+ #ifdef RLOF_SSE float CV_DECL_ALIGNED(16) bbuf[4]; for(int mmi = 0; mmi < 4; mmi++) { @@ -1658,7 +1658,7 @@ namespace beplk CV_Assert( step == (int)(J.step/J.elemSize1()) ); float A11 = 0, A12 = 0, A22 = 0; - #ifdef CV_SSE4_1 + #ifdef RLOF_SSE __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); __m128i z = _mm_setzero_si128(); @@ -1682,7 +1682,7 @@ namespace beplk x = 0; - #ifdef CV_SSE4_1 + #ifdef RLOF_SSE for( ; x < winSize.width*cn; x += 4, dsrc += 4*2, dIptr += 4*2 ) { __m128i wMask = _mm_set_epi32(MaskSet * maskPtr[x+3], @@ -1763,7 +1763,7 @@ namespace beplk } - #ifdef CV_SSE4_1 + #ifdef RLOF_SSE float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4]; _mm_store_ps(A11buf, qA11); _mm_store_ps(A12buf, qA12); @@ -1834,7 +1834,7 @@ namespace beplk float _b1[4] = {0,0,0,0}; float _b2[4] = {0,0,0,0}; - #ifdef CV_SSE4_1 + #ifdef RLOF_SSE __m128 qbc0[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; __m128 qbc1[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); @@ -1846,7 +1846,7 @@ namespace beplk const short* Iptr = (const short*)(IWinBuf.data + ( y)*IWinBuf.step); const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); x = 0; - #ifdef CV_SSE4_1 + #ifdef RLOF_SSE const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; for( ; x <= winSize.width*cn; x += 8, dIptr += 8*2 ) { @@ -1982,7 +1982,7 @@ namespace beplk } - #ifdef CV_SSE4_1 + #ifdef RLOF_SSE float CV_DECL_ALIGNED(16) bbuf[4]; _mm_store_ps(bbuf, _mm_add_ps(qbc0[0], qbc1[0])); _b1[0] += bbuf[0] + bbuf[2]; diff --git a/modules/optflow/src/rlof/plk_invoker.hpp b/modules/optflow/src/rlof/plk_invoker.hpp index 2637fa7fbe7..072cd3390b0 100644 --- a/modules/optflow/src/rlof/plk_invoker.hpp +++ b/modules/optflow/src/rlof/plk_invoker.hpp @@ -161,7 +161,8 @@ class TrackerInvoker : public cv::ParallelLoopBody float w1 = 0, w2 = 0; // 
-IyI float dI = 0; // I^2 float D = 0; - #ifdef CV_SSE4_1 + #ifdef RLOF_SSE + __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); __m128i z = _mm_setzero_si128(); @@ -183,7 +184,7 @@ class TrackerInvoker : public cv::ParallelLoopBody short* dIptr = (short*)(derivIWinBuf.data + y*derivIWinBuf.step); x = 0; -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE for( ; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 ) { __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16); @@ -296,7 +297,7 @@ class TrackerInvoker : public cv::ParallelLoopBody A12 = 0; A22 = 0; } -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); __m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps(), qb2 = _mm_setzero_ps(); @@ -306,7 +307,7 @@ class TrackerInvoker : public cv::ParallelLoopBody __m128 mmSumIy = _mm_setzero_ps(), mmSumIx = _mm_setzero_ps(); __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps(); float gainVal = gainVec.x > 0 ? gainVec.x : -gainVec.x; - int bitShift = gainVec.x == 0 ? 1 : static_cast(ceil(log(200.f / gainVal) / log(2.f))); // zwei stellen nach den komma + int bitShift = gainVec.x == 0 ? 
1 : static_cast(ceil(log(200.f / gainVal) / log(2.f))); __m128i mmGainValue_epi16 = _mm_set1_epi16(static_cast(gainVec.x * (float)(1 << bitShift))); __m128i mmConstValue_epi16 = _mm_set1_epi16(static_cast(gainVec.y)); #endif @@ -318,7 +319,7 @@ class TrackerInvoker : public cv::ParallelLoopBody const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); x = 0; -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE for( ; x <= winSize.width*cn; x += 8, dIptr += 8*2 ) { @@ -506,7 +507,7 @@ class TrackerInvoker : public cv::ParallelLoopBody float CV_DECL_ALIGNED(16) wbuf[4];// if( j == 0 ) { -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE _mm_store_ps(wbuf, mmSumW1); w1 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; _mm_store_ps(wbuf, mmSumW2); @@ -529,7 +530,7 @@ class TrackerInvoker : public cv::ParallelLoopBody w1 *= -FLT_SCALE; w2 *= -FLT_SCALE; dI *= FLT_SCALE; -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE float CV_DECL_ALIGNED(32) A11buf[4], A12buf[4], A22buf[4];// _mm_store_ps(A11buf, mmAxx); @@ -545,7 +546,7 @@ class TrackerInvoker : public cv::ParallelLoopBody A22 *= FLT_SCALE; } -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE float CV_DECL_ALIGNED(16) bbuf[4]; _mm_store_ps(bbuf, _mm_add_ps(qb0, qb1)); b1 = bbuf[0] + bbuf[2]; @@ -794,7 +795,7 @@ class TrackerInvoker : public cv::ParallelLoopBody CV_Assert( step == (int)(J.step/J.elemSize1()) ); float A11 = 0, A12 = 0, A22 = 0; -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); __m128i z = _mm_setzero_si128(); @@ -818,7 +819,7 @@ class TrackerInvoker : public cv::ParallelLoopBody x = 0; -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE for( ; x <= winSize.width*cn; x += 4, dsrc += 4*2, dIptr += 4*2 ) { __m128i wMask = _mm_set_epi32(MaskSet * maskPtr[x+3], @@ -900,7 +901,7 @@ class TrackerInvoker : public cv::ParallelLoopBody #endif } -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4]; _mm_store_ps(A11buf, qA11); _mm_store_ps(A12buf, qA12); 
@@ -956,7 +957,7 @@ class TrackerInvoker : public cv::ParallelLoopBody iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; float b1 = 0, b2 = 0; -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); __m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps(); @@ -970,7 +971,7 @@ class TrackerInvoker : public cv::ParallelLoopBody const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); x = 0; -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; for( ; x <= winSize.width*cn; x += 8, dIptr += 8*2 ) { @@ -1038,7 +1039,7 @@ class TrackerInvoker : public cv::ParallelLoopBody #endif } -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE float CV_DECL_ALIGNED(16) bbuf[4]; _mm_store_ps(bbuf, _mm_add_ps(qb0, qb1)); b1 += bbuf[0] + bbuf[2]; diff --git a/modules/optflow/src/rlof/rlof_invoker.hpp b/modules/optflow/src/rlof/rlof_invoker.hpp index fb1ad16f353..c810debbdb9 100644 --- a/modules/optflow/src/rlof/rlof_invoker.hpp +++ b/modules/optflow/src/rlof/rlof_invoker.hpp @@ -165,7 +165,7 @@ namespace ica float D = 0; float minEig; -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); __m128i z = _mm_setzero_si128(); @@ -189,7 +189,7 @@ namespace ica x = 0; -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE for (; x <= winSize.width*cn; x += 4, dsrc += 4 * 2, dIptr += 4 * 2) { @@ -326,7 +326,7 @@ namespace ica float fParam1 = param[1] * 32.f; fParam0 = param[0] * MEstimatorScale; fParam1 = param[1] * MEstimatorScale; -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); __m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps(); @@ -334,7 +334,7 @@ namespace ica __m128i mmParam0 = _mm_set1_epi16(MIN(std::numeric_limits::max() - 1, static_cast(fParam0))); __m128i mmParam1 = 
_mm_set1_epi16(MIN(std::numeric_limits::max() - 1, static_cast(fParam1))); float s2Val = param[2] > 0 ? param[2] : -param[2]; - int s2bitShift = param[2] == 0 ? 1 : static_cast(ceil(log(200.f / s2Val) / log(2.f))); // zwei stellen nach den komma + int s2bitShift = param[2] == 0 ? 1 : static_cast(ceil(log(200.f / s2Val) / log(2.f))); __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast(param[2] * (float)(1 << s2bitShift))); __m128i mmOness_epi16 = _mm_set1_epi16(1 << s2bitShift); __m128 mmParam2s = _mm_set1_ps(0.01f * param[2]); @@ -354,7 +354,7 @@ namespace ica const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); x = 0; -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE for (; x <= winSize.width*cn; x += 8, dIptr += 8 * 2) { __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16); @@ -532,7 +532,7 @@ namespace ica #endif } -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE short etaValues[8]; _mm_storeu_si128((__m128i*)(etaValues), mmEta); MEstimatorScale += eta * (etaValues[0] + etaValues[1] + etaValues[2] + etaValues[3] @@ -542,7 +542,7 @@ namespace ica #endif if (j == 0) { -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];// _mm_store_ps(A11buf, mmAxx); @@ -576,7 +576,7 @@ namespace ica } } -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE float CV_DECL_ALIGNED(16) bbuf[4]; _mm_store_ps(bbuf, _mm_add_ps(qb0, qb1)); b1 += bbuf[0] + bbuf[2]; @@ -786,7 +786,7 @@ namespace radial float dI = 0; // I^2 float D = 0; -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); __m128i z = _mm_setzero_si128(); @@ -810,7 +810,7 @@ namespace radial x = 0; -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE for (; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4 * 2, dIptr += 4 * 2) { __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16); @@ -971,7 +971,7 @@ 
namespace radial fParam0 = param[0] * MEstimatorScale; fParam1 = param[1] * MEstimatorScale; -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); @@ -986,14 +986,14 @@ namespace radial float s2Val = param[2] > 0 ? param[2] : -param[2]; - int s2bitShift = param[2] == 0 ? 1 : static_cast(ceil(log(200.f / s2Val) / log(2.f))); // zwei stellen nach den komma + int s2bitShift = param[2] == 0 ? 1 : static_cast(ceil(log(200.f / s2Val) / log(2.f))); __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast(param[2] * (float)(1 << s2bitShift))); __m128i mmOness_epi16 = _mm_set1_epi16(1 << s2bitShift); __m128 mmParam2s = _mm_set1_ps(0.01f * param[2]); __m128 mmParam2s2 = _mm_set1_ps(param[2] * param[2]); float gainVal = gainVec.x > 0 ? gainVec.x : -gainVec.x; - int bitShift = gainVec.x == 0 ? 1 : static_cast(ceil(log(200.f / gainVal) / log(2.f))); // zwei stellen nach den komma + int bitShift = gainVec.x == 0 ? 1 : static_cast(ceil(log(200.f / gainVal) / log(2.f))); __m128i mmGainValue_epi16 = _mm_set1_epi16(static_cast(gainVec.x * (float)(1 << bitShift))); __m128i mmConstValue_epi16 = _mm_set1_epi16(static_cast(gainVec.y)); @@ -1011,7 +1011,7 @@ namespace radial const short* dIptr = (const short*)(derivIWinBuf.data + y * derivIWinBuf.step); x = 0; -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE for (; x <= _winSize.width*cn; x += 8, dIptr += 8 * 2) @@ -1057,7 +1057,7 @@ namespace radial mmEta = _mm_add_epi16(mmEta, _mm_add_epi16(_mm_and_si128(scalediffIsPos_epi16, _mm_set1_epi16(2)), _mm_set1_epi16(-1))); - __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... 
+ __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); @@ -1072,7 +1072,7 @@ namespace radial // val = |It| -/+ sigma1 __m128i tmpParam1_epi16 = _mm_add_epi16(_mm_and_si128(diffIsPos_epi16, _mm_sub_epi16(mmDiff_epi16, mmParam1)), _mm_andnot_si128(diffIsPos_epi16, _mm_add_epi16(mmDiff_epi16, mmParam1))); - // It == 0 ? |It| > sigma13 + // It == 0 ? |It| > sigma13 mmDiff_epi16 = _mm_and_si128(bSet2_epi16, mmDiff_epi16); // It == val ? sigma0 < |It| < sigma1 mmDiff_epi16 = _mm_blendv_epi8(mmDiff_epi16, tmpParam1_epi16, bSet1_epi16); @@ -1167,7 +1167,7 @@ namespace radial __m128 I_tale_ps = _mm_mul_ps(I_ps, tale_0_3_ps); mmSumI = _mm_add_ps(mmSumI, I_tale_ps); - // sumW + // sumW mmSumW = _mm_add_ps(mmSumW, tale_0_3_ps); // sumDI @@ -1201,10 +1201,8 @@ namespace radial I_tale_ps = _mm_mul_ps(I_ps, tale_4_7_ps); mmSumI = _mm_add_ps(mmSumI, I_tale_ps); - // sumW mmSumW = _mm_add_ps(mmSumW, tale_4_7_ps); - // sumDI mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps(I_ps, I_tale_ps)); } @@ -1287,17 +1285,17 @@ namespace radial } -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE short etaValues[8]; _mm_storeu_si128((__m128i*)(etaValues), mmEta); MEstimatorScale += eta * (etaValues[0] + etaValues[1] + etaValues[2] + etaValues[3] + etaValues[4] + etaValues[5] + etaValues[6] + etaValues[7]); -#endif +#endif float CV_DECL_ALIGNED(32) wbuf[4];// if (j == 0) { -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE _mm_store_ps(wbuf, mmSumW1); w1 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; _mm_store_ps(wbuf, mmSumW2); @@ -1322,7 +1320,7 @@ namespace radial dI *= FLT_SCALE; -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];// @@ -1335,15 +1333,15 @@ namespace radial A12 = A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; A22 = A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; #endif - A11 *= FLT_SCALE; - A12 *= FLT_SCALE; - A22 *= FLT_SCALE; + A11 *= FLT_SCALE; + A12 *= FLT_SCALE; + A22 *= FLT_SCALE; } 
-#ifdef CV_SSE4_1 +#ifdef RLOF_SSE float CV_DECL_ALIGNED(16) bbuf[4]; _mm_store_ps(bbuf, _mm_add_ps(qb0, qb1)); b1 = bbuf[0] + bbuf[2]; @@ -1413,8 +1411,7 @@ namespace radial nextPts[ptidx] = nextPt + halfWin; gainVecs[ptidx] = gainVec; - - if (// j > 1 && + if ( (std::abs(delta.x - prevDelta.x) < 0.01 && std::abs(delta.y - prevDelta.y) < 0.01) || ((delta.ddot(delta) <= 0.001) && std::abs(prevGain.x - deltaGain.x) < 0.01) ) @@ -1440,7 +1437,7 @@ namespace radial const Point2f* prevPts; Point2f* nextPts; uchar* status; - cv::Point2f* gainVecs; + cv::Point2f* gainVecs; float* err; int maxWinSize; int minWinSize; diff --git a/modules/optflow/src/rlof/rlof_invokerbase.hpp b/modules/optflow/src/rlof/rlof_invokerbase.hpp index cb03f61384b..34b112fa570 100644 --- a/modules/optflow/src/rlof/rlof_invokerbase.hpp +++ b/modules/optflow/src/rlof/rlof_invokerbase.hpp @@ -5,6 +5,13 @@ #define _RLOF_INVOKERBASE_HPP_ +#if CV_CPU_COMPILE_SSE4_1 +#define RLOF_SSE +#elif CV_CPU_COMPILE_SSE4_2 +#define RLOF_SSE +#endif + + #define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n)) #define FLT_RESCALE 1 #ifndef _MSC_VER diff --git a/modules/optflow/src/rlof/rlof_localflow.cpp b/modules/optflow/src/rlof/rlof_localflow.cpp index b34a9c26e30..4d18380ea77 100644 --- a/modules/optflow/src/rlof/rlof_localflow.cpp +++ b/modules/optflow/src/rlof/rlof_localflow.cpp @@ -1,7 +1,6 @@ // This file is part of OpenCV project. // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. 
-#include "precomp.hpp" #include "rlof_localflow.h" #ifdef HAVE_OPENCV_CALIB3D #include "opencv2/calib3d.hpp" @@ -550,5 +549,4 @@ namespace optflow calcLocalOpticalFlowCore(prevPyramids, currPyramids, prevPoints, currPoints, internParam); } } -} - \ No newline at end of file +} \ No newline at end of file diff --git a/modules/optflow/src/rlof/rlof_localflow.h b/modules/optflow/src/rlof/rlof_localflow.h index bd59721f277..98347e9dcb1 100644 --- a/modules/optflow/src/rlof/rlof_localflow.h +++ b/modules/optflow/src/rlof/rlof_localflow.h @@ -33,7 +33,7 @@ T quickselect(const Mat & inp, int k) std::swap(values.at(l), values.at(ir)); return values.at(k); } - else + else { mid = (l + ir) >> 1; std::swap(values.at(mid), values.at(l+1)); @@ -46,7 +46,7 @@ T quickselect(const Mat & inp, int k) i = l + 1; j = ir; a = values.at(l+1); - while (true) + while (true) { do { @@ -91,7 +91,6 @@ class CImageBuffer } void setBlurFromRGB(const cv::Mat & inp) { - //cv::medianBlur(constNextImage, blurNextImg, 7); if(m_Overwrite) cv::GaussianBlur(inp, m_BlurredImage, cv::Size(7,7), -1); } diff --git a/modules/optflow/src/rlofflow.cpp b/modules/optflow/src/rlofflow.cpp index d7309c15210..08e16d9a31d 100644 --- a/modules/optflow/src/rlofflow.cpp +++ b/modules/optflow/src/rlofflow.cpp @@ -6,7 +6,7 @@ #include "rlof/geo_interpolation.hpp" #include "opencv2/ximgproc.hpp" namespace cv -{ +{ namespace optflow { @@ -30,8 +30,8 @@ namespace optflow currPyramid[0] = cv::Ptr< CImageBuffer>(new CImageBuffer); currPyramid[1] = cv::Ptr< CImageBuffer>(new CImageBuffer); } - virtual void setRLOFOpticalFlowParameter(Ptr val) CV_OVERRIDE { }//param = *val; } - virtual Ptr getRLOFOpticalFlowParameter() const CV_OVERRIDE { return Ptr(); } + virtual void setRLOFOpticalFlowParameter(Ptr val) CV_OVERRIDE { param = val; } + virtual Ptr getRLOFOpticalFlowParameter() const CV_OVERRIDE { return param; } virtual float getForwardBackward() const CV_OVERRIDE { return forwardBackwardThreshold; } virtual void 
setForwardBackward(float val) CV_OVERRIDE { forwardBackwardThreshold = val; } @@ -65,7 +65,7 @@ namespace optflow CV_Assert(!I0.empty() && I0.depth() == CV_8U && (I0.channels() == 3 || I0.channels() == 1)); CV_Assert(!I1.empty() && I1.depth() == CV_8U && (I1.channels() == 3 || I1.channels() == 1)); CV_Assert(I0.sameSize(I1)); - if (param.supportRegionType == SR_CROSS) + if (param->supportRegionType == SR_CROSS) CV_Assert( I0.channels() == 3 && I1.channels() == 3); CV_Assert(interp_type == InterpolationType::INTERP_EPIC || interp_type == InterpolationType::INTERP_GEO); @@ -86,7 +86,7 @@ namespace optflow prevPoints.erase(prevPoints.begin() + noPoints, prevPoints.end()); currPoints.resize(prevPoints.size()); - calcLocalOpticalFlow(prevImage, currImage, prevPyramid, currPyramid, prevPoints, currPoints, param); + calcLocalOpticalFlow(prevImage, currImage, prevPyramid, currPyramid, prevPoints, currPoints, *(param.get())); flow.create(prevImage.size(), CV_32FC2); Mat dense_flow = flow.getMat(); @@ -104,7 +104,7 @@ namespace optflow if (forwardBackwardThreshold > 0) { // reuse image pyramids - calcLocalOpticalFlow(currImage, prevImage, currPyramid, prevPyramid, currPoints, refPoints, param); + calcLocalOpticalFlow(currImage, prevImage, currPyramid, prevPyramid, currPoints, refPoints, *(param.get())); filtered_prevPoints.resize(prevPoints.size()); filtered_currPoints.resize(prevPoints.size()); @@ -167,7 +167,7 @@ namespace optflow } protected: - RLOFOpticalFlowParameter param; + Ptr param; float forwardBackwardThreshold; Ptr prevPyramid[2]; Ptr currPyramid[2]; @@ -189,6 +189,7 @@ namespace optflow int epicK, float epicSigma, float epicLambda, + bool use_post_proc, float fgs_lambda, float fgs_sigma) { @@ -200,6 +201,7 @@ namespace optflow algo->setEPICK(epicK); algo->setEPICSigma(epicSigma); algo->setEPICLambda(epicLambda); + algo->setUsePostProc(use_post_proc); algo->setFgsLambda(fgs_lambda); algo->setFgsSigma(fgs_sigma); return algo; @@ -235,7 +237,7 @@ namespace optflow 
Mat nextImage = nextImg.getMat(); Mat prevPtsMat = prevPts.getMat(); - if (param.useInitialFlow == false) + if (param->useInitialFlow == false) nextPts.create(prevPtsMat.size(), prevPtsMat.type(), -1, true); int npoints = 0; @@ -253,7 +255,7 @@ namespace optflow CV_Assert(nextPtsMat.checkVector(2, CV_32F, true) == npoints); std::vector prevPoints(npoints), nextPoints(npoints), refPoints; prevPtsMat.copyTo(cv::Mat(1, npoints, CV_32FC2, &prevPoints[0])); - if (param.useInitialFlow ) + if (param->useInitialFlow ) nextPtsMat.copyTo(cv::Mat(1, nextPtsMat.cols, CV_32FC2, &nextPoints[0])); cv::Mat statusMat; @@ -272,12 +274,12 @@ namespace optflow errorMat.setTo(0); } - calcLocalOpticalFlow(prevImage, nextImage, prevPyramid, currPyramid, prevPoints, nextPoints, param); + calcLocalOpticalFlow(prevImage, nextImage, prevPyramid, currPyramid, prevPoints, nextPoints, *(param.get())); cv::Mat(1,npoints , CV_32FC2, &nextPoints[0]).copyTo(nextPtsMat); if (forwardBackwardThreshold > 0) { // reuse image pyramids - calcLocalOpticalFlow(nextImage, prevImage, currPyramid, prevPyramid, nextPoints, refPoints, param); + calcLocalOpticalFlow(nextImage, prevImage, currPyramid, prevPyramid, nextPoints, refPoints, *(param.get())); } for (unsigned int r = 0; r < refPoints.size(); r++) { @@ -290,7 +292,7 @@ namespace optflow } protected: - RLOFOpticalFlowParameter param; + Ptr param; float forwardBackwardThreshold; Ptr prevPyramid[2]; Ptr currPyramid[2]; @@ -310,12 +312,12 @@ namespace optflow Ptr rlofParam , float forewardBackwardThreshold, Size gridStep, InterpolationType interp_type, - int epicK, float epicSigma, + int epicK, float epicSigma, float epicLambda, bool use_post_proc, float fgsLambda, float fgsSigma) { Ptr algo = DenseRLOFOpticalFlow::create( rlofParam, forewardBackwardThreshold, gridStep, interp_type, - epicK, epicSigma, use_post_proc, fgsLambda, fgsSigma); + epicK, epicSigma, epicLambda, use_post_proc, fgsLambda, fgsSigma); algo->calc(I0, I1, flow); algo->collectGarbage(); } 
@@ -342,4 +344,4 @@ namespace optflow } -} \ No newline at end of file +} From 2b41b05bb6fb63bc72dda494a6b41f694880ac7c Mon Sep 17 00:00:00 2001 From: senst Date: Tue, 11 Dec 2018 10:28:19 +0100 Subject: [PATCH 04/25] remove header that could not be found --- modules/optflow/doc/optflow.bib | 2 +- modules/optflow/perf/perf_rlof.cpp | 5 - modules/optflow/src/rlof/berlof_invoker.hpp | 4 +- .../optflow/src/rlof/geo_interpolation.cpp | 7 +- modules/optflow/src/rlof/plk_invoker.hpp | 290 +++++++++--------- modules/optflow/src/rlof/rlof_invoker.hpp | 110 +++---- modules/optflow/src/rlof/rlof_localflow.h | 6 +- modules/optflow/test/test_OF_accuracy.cpp | 5 - 8 files changed, 197 insertions(+), 232 deletions(-) diff --git a/modules/optflow/doc/optflow.bib b/modules/optflow/doc/optflow.bib index 5d4a47c30a9..5b4869564bd 100644 --- a/modules/optflow/doc/optflow.bib +++ b/modules/optflow/doc/optflow.bib @@ -109,4 +109,4 @@ @article{Tibshirani2008 author={Tibshirani, Ryan J}, journal={arXiv preprint arXiv:0806.3301}, year={2008} -} +} diff --git a/modules/optflow/perf/perf_rlof.cpp b/modules/optflow/perf/perf_rlof.cpp index 4c87f51ef78..910fa58375c 100644 --- a/modules/optflow/perf/perf_rlof.cpp +++ b/modules/optflow/perf/perf_rlof.cpp @@ -1,7 +1,6 @@ #include "perf_precomp.hpp" namespace opencv_test { namespace { - typedef tuple ST_SR_IM_Sparse_t; typedef TestBaseWithParam ST_SR_IM_Sparse; PERF_TEST_P(ST_SR_IM_Sparse, OpticalFlow_SparseRLOF, @@ -67,7 +66,6 @@ namespace opencv_test { Mat frame2 = imread(frame2_path); ASSERT_FALSE(frame1.empty()); ASSERT_FALSE(frame2.empty()); - Ptr param; Ptr< DenseRLOFOpticalFlow> algo = DenseRLOFOpticalFlow::create(); InterpolationType interp_type; @@ -75,13 +73,10 @@ namespace opencv_test { interp_type = INTERP_EPIC; if (get<0>(GetParam()) == "INTERP_GEO") interp_type = INTERP_GEO; - - TEST_CYCLE_N(5) { calcOpticalFlowDenseRLOF(frame1, frame2,flow, param, 1.0f, Size(get<1>(GetParam()), get<1>(GetParam())), interp_type); } - 
SANITY_CHECK_NOTHING(); } } diff --git a/modules/optflow/src/rlof/berlof_invoker.hpp b/modules/optflow/src/rlof/berlof_invoker.hpp index 93ccb94fd89..9c229bf542b 100644 --- a/modules/optflow/src/rlof/berlof_invoker.hpp +++ b/modules/optflow/src/rlof/berlof_invoker.hpp @@ -1275,7 +1275,7 @@ namespace berlof MEstimatorScale += eta * (etaValues[0] + etaValues[1] + etaValues[2] + etaValues[3] + etaValues[4] + etaValues[5] + etaValues[6] + etaValues[7]); -#endif +#endif float CV_DECL_ALIGNED(32) wbuf[4];// if( j == 0 ) { @@ -2128,4 +2128,4 @@ namespace beplk } } } -#endif \ No newline at end of file +#endif diff --git a/modules/optflow/src/rlof/geo_interpolation.cpp b/modules/optflow/src/rlof/geo_interpolation.cpp index 823070bf5e9..f93c0b6093e 100644 --- a/modules/optflow/src/rlof/geo_interpolation.cpp +++ b/modules/optflow/src/rlof/geo_interpolation.cpp @@ -2,7 +2,6 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // This functions have been contributed by Jonas Geisters -#include "precomp.hpp" #include "geo_interpolation.hpp" #include #include @@ -11,7 +10,7 @@ namespace cv namespace optflow { - double K_h1(float x, double h2) + inline double K_h1(float x, double h2) { return exp(-(0.5 / (h2)) * x); } @@ -135,7 +134,7 @@ namespace optflow /* * assign quellknoten ids. 
*/ - for (int i = 0; i < prevPoints.size(); i++) + for (int i = 0; i < static_cast(prevPoints.size()); i++) { int x = (int)prevPoints[i].x; int y = (int)prevPoints[i].y; @@ -484,4 +483,4 @@ namespace optflow } } -} \ No newline at end of file +} diff --git a/modules/optflow/src/rlof/plk_invoker.hpp b/modules/optflow/src/rlof/plk_invoker.hpp index 072cd3390b0..0d7b5ddb481 100644 --- a/modules/optflow/src/rlof/plk_invoker.hpp +++ b/modules/optflow/src/rlof/plk_invoker.hpp @@ -59,7 +59,7 @@ class TrackerInvoker : public cv::ParallelLoopBody useInitialFlow = _useInitialFlow; crossSegmentationThreshold = _crossSegmentationThreshold; } - + TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; void operator()(const cv::Range& range) const { @@ -75,7 +75,6 @@ class TrackerInvoker : public cv::ParallelLoopBody int winMaskwidth = static_cast(ceil(winSize.width / 16.f)) * 16; cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); winMaskMatBuf.setTo(1); - const float FLT_SCALE = (1.f/(1 << 16));//(1.f/(1 << 20)); // 20 int cn = I.channels(), cn2 = cn*2; @@ -120,7 +119,6 @@ class TrackerInvoker : public cv::ParallelLoopBody minWinSize,maxWinSize, crossSegmentationThreshold) == false) continue; - prevPt -= halfWin; iprevPt.x = cvFloor(prevPt.x); iprevPt.y = cvFloor(prevPt.y); @@ -142,12 +140,10 @@ class TrackerInvoker : public cv::ParallelLoopBody float a = prevPt.x - iprevPt.x; float b = prevPt.y - iprevPt.y; const int W_BITS = 14, W_BITS1 = 14; - int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - int dstep = (int)(derivI.step/derivI.elemSize1()); int step = (int)(I.step/I.elemSize1()); int mStep = (int)(winMaskMat.step/winMaskMat.elemSize1()); @@ -179,10 +175,10 @@ class TrackerInvoker : public cv::ParallelLoopBody const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; const uchar* src = 
(const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; - + short* Iptr = (short*)(IWinBuf.data + y*IWinBuf.step); short* dIptr = (short*)(derivIWinBuf.data + y*derivIWinBuf.step); - + x = 0; #ifdef RLOF_SSE for( ; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 ) @@ -194,7 +190,7 @@ class TrackerInvoker : public cv::ParallelLoopBody v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); - + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); @@ -204,13 +200,13 @@ class TrackerInvoker : public cv::ParallelLoopBody } t0 = _mm_and_si128(t0, mask_0_3_epi16); _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0,t0)); - - + + v00 = _mm_loadu_si128((const __m128i*)(dsrc)); v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep)); v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); - + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), @@ -240,7 +236,7 @@ class TrackerInvoker : public cv::ParallelLoopBody dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1); int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 + dsrc[dstep+cn2+1]*iw11, W_BITS1); - + Iptr[x] = (short)ival; dIptr[0] = (short)ixval; dIptr[1] = (short)iyval; @@ -248,7 +244,7 @@ class TrackerInvoker : public cv::ParallelLoopBody #endif } - + cv::Point2f backUpNextPt = nextPt; nextPt -= halfWin; Point2f prevDelta(0,0); //relates to h(t-1) @@ -261,8 +257,8 @@ class TrackerInvoker : 
public cv::ParallelLoopBody status[ptidx] = static_cast(j); inextPt.x = cvFloor(nextPt.x); inextPt.y = cvFloor(nextPt.y); - - + + if( inextPt.x + halfWin.x < 0 || inextPt.x + halfWin.x>= derivI.cols || inextPt.y + halfWin.y < 0 || inextPt.y + halfWin.y >= derivI.rows ) { @@ -282,7 +278,7 @@ class TrackerInvoker : public cv::ParallelLoopBody b2 = 0; b3 = 0; b4 = 0; - + if( j == 0 ) { // tensor @@ -302,7 +298,7 @@ class TrackerInvoker : public cv::ParallelLoopBody qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); __m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps(), qb2 = _mm_setzero_ps(); __m128 qb3 = _mm_setzero_ps(); - __m128 mmSumW1 = _mm_setzero_ps(), mmSumW2 = _mm_setzero_ps(); + __m128 mmSumW1 = _mm_setzero_ps(), mmSumW2 = _mm_setzero_ps(); __m128 mmSumI = _mm_setzero_ps(), mmSumW = _mm_setzero_ps(), mmSumDI = _mm_setzero_ps(); __m128 mmSumIy = _mm_setzero_ps(), mmSumIx = _mm_setzero_ps(); __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps(); @@ -317,60 +313,60 @@ class TrackerInvoker : public cv::ParallelLoopBody const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x )*cn; const short* Iptr = (const short*)(IWinBuf.data + ( y)*IWinBuf.step); const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); - + x = 0; -#ifdef RLOF_SSE - +#ifdef RLOF_SSE + for( ; x <= winSize.width*cn; x += 8, dIptr += 8*2 ) { // iw = iw << 14 (16384 short / 2) // iq0 = iw01 |iw00, iw01 |iw00, iw01 |iw00, iw01 |iw00 16 bit __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16); - + // I [0...8] __m128i diff0, diff1; __m128i I_0_7_epi16 = _mm_loadu_si128((const __m128i*)(Iptr + x)); // element 0 to 7 - + __m128i v00 = _mm_unpacklo_epi8( _mm_loadl_epi64((const __m128i*)(Jptr + x)) // J0, J1, J2, ..., J7, 64 bit nullen - , z); //J0 , 0, J1, 0, J2, 0 ... 
J7,0 - __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); + , z); //J0 , 0, J1, 0, J2, 0 ... J7,0 + __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); - - + + __m128i t0 = _mm_add_epi32 (_mm_madd_epi16( _mm_unpacklo_epi16(v00, v01), //J0, , J1, J1, J2, J2, ... J3 , J4 qw0), _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); - + __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1-5); - - + + __m128i Ilo = _mm_mullo_epi16(mmGainValue_epi16, I_0_7_epi16); - __m128i Ihi = _mm_mulhi_epi16(mmGainValue_epi16, I_0_7_epi16); + __m128i Ihi = _mm_mulhi_epi16(mmGainValue_epi16, I_0_7_epi16); __m128i Igain_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(Ilo, Ihi), bitShift); __m128i Igain_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(Ilo, Ihi), bitShift); __m128i Igain_epi16 = _mm_packs_epi32(Igain_0_3_epi32, Igain_4_7_epi32); - + // diff = J - I + I*gain.x + gain.y - __m128i mmDiff_epi16 = _mm_add_epi16( + __m128i mmDiff_epi16 = _mm_add_epi16( _mm_subs_epi16(_mm_packs_epi32(t0, t1), I_0_7_epi16), // J - I _mm_add_epi16(Igain_epi16, mmConstValue_epi16)); - + mmDiff_epi16 = _mm_and_si128(mmDiff_epi16, mask_0_7_epi16); - - __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... + + __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); - + diff1 = _mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13... 
diff0 = _mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9... @@ -395,7 +391,7 @@ class TrackerInvoker : public cv::ParallelLoopBody v10 = _mm_unpackhi_epi16(Ilo, Ihi); qb2 = _mm_add_ps(qb2, _mm_cvtepi32_ps(v00)); qb2 = _mm_add_ps(qb2, _mm_cvtepi32_ps(v10)); - + // b4+= diff __m128 mmDiff_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mmDiff_epi16)); // (_mm_srai_epi32(_mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16),16)); __m128 mmDiff_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16),16))); @@ -405,7 +401,7 @@ class TrackerInvoker : public cv::ParallelLoopBody { __m128 mask_0_4_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mask_0_7_epi16)); __m128 mask_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mask_0_7_epi16, mask_0_7_epi16), 16))); - + t0 = _mm_srai_epi32(Ixy_0, 16); // Iy0 Iy1 Iy2 Iy3 t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16); // Ix0 Ix1 Ix2 Ix3 @@ -414,81 +410,81 @@ class TrackerInvoker : public cv::ParallelLoopBody // 0 ... 3 __m128 I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(I_0_7_epi16, I_0_7_epi16), 16)); - + // A11 - A22 mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fy)); mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fy)); mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fx)); - + // sumIx und sumIy mmSumIx = _mm_add_ps(mmSumIx, fx); mmSumIy = _mm_add_ps(mmSumIy, fy); - + mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fx)); mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fy)); - + // sumI I_ps = _mm_blendv_ps(_mm_set1_ps(0), I_ps, mask_0_4_ps); mmSumI = _mm_add_ps(mmSumI,I_ps); - + // sumDI mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps( I_ps, I_ps)); - + t0 = _mm_srai_epi32(Ixy_1, 16); // Iy8 Iy9 Iy10 Iy11 t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_1, 16), 16); // Ix0 Ix1 Ix2 Ix3 - + fy = _mm_blendv_ps(_mm_set1_ps(0), _mm_cvtepi32_ps(t0), mask_4_7_ps); fx = _mm_blendv_ps(_mm_set1_ps(0), _mm_cvtepi32_ps(t1), mask_4_7_ps); // 4 ... 
7 I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(I_0_7_epi16, I_0_7_epi16), 16)); - - + + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fy)); mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fy)); mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fx)); - + // sumIx und sumIy mmSumIx = _mm_add_ps(mmSumIx, fx); mmSumIy = _mm_add_ps(mmSumIy, fy); - + mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fx)); mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fy)); - + // sumI I_ps = _mm_blendv_ps(_mm_set1_ps(0), I_ps, mask_4_7_ps); mmSumI = _mm_add_ps(mmSumI, I_ps); - - // sumW + + // sumW // sumDI mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps( I_ps, I_ps)); } - + } #else for( ; x < winSize.width*cn; x++, dIptr += 2 ) { - if( maskPtr[x] == 0) + if( maskPtr[x] == 0) continue; int J = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, W_BITS1-5); - int diff = J - Iptr[x] + static_cast(Iptr[x]) * gainVec.x + gainVec.y; - + int diff = J - Iptr[x] + static_cast(Iptr[x]) * gainVec.x + gainVec.y; + b1 += (float)(diff*dIptr[0]) * FLT_RESCALE; b2 += (float)(diff*dIptr[1]) * FLT_RESCALE; b3 += (float)(diff) * Iptr[x] * FLT_RESCALE; b4 += (float)(diff); - + // compute the Gradient Matrice if( j == 0 ) { A11 += (float)(dIptr[0]*dIptr[0]); A12 += (float)(dIptr[0]*dIptr[1]); A22 += (float)(dIptr[1]*dIptr[1]); - - + + dI += Iptr[x] * Iptr[x] * FLT_RESCALE; float dx = static_cast(dIptr[0]) * FLT_RESCALE; float dy = static_cast(dIptr[1]) * FLT_RESCALE; @@ -498,12 +494,12 @@ class TrackerInvoker : public cv::ParallelLoopBody w2 += dy * Iptr[x]; sumI += Iptr[x] * FLT_RESCALE; //sumW += FLT_RESCALE; - + } } #endif } - + float CV_DECL_ALIGNED(16) wbuf[4];// if( j == 0 ) { @@ -531,19 +527,19 @@ class TrackerInvoker : public cv::ParallelLoopBody w2 *= -FLT_SCALE; dI *= FLT_SCALE; #ifdef RLOF_SSE - float CV_DECL_ALIGNED(32) A11buf[4], A12buf[4], A22buf[4];// - + float CV_DECL_ALIGNED(32) A11buf[4], A12buf[4], A22buf[4];// + _mm_store_ps(A11buf, mmAxx); _mm_store_ps(A12buf, mmAxy); 
_mm_store_ps(A22buf, mmAyy); - + A11 = A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; A12 = A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; A22 = A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; #endif - A11 *= FLT_SCALE; - A12 *= FLT_SCALE; - A22 *= FLT_SCALE; + A11 *= FLT_SCALE; + A12 *= FLT_SCALE; + A22 *= FLT_SCALE; } #ifdef RLOF_SSE @@ -561,12 +557,12 @@ class TrackerInvoker : public cv::ParallelLoopBody mismatchMat.at(1,0) = b2 * FLT_SCALE; mismatchMat.at(2,0) = -b3 * FLT_SCALE; mismatchMat.at(3,0) = -b4 * FLT_SCALE; - - D = - A12*A12*sumI*sumI + dI*sumW*A12*A12 + 2*A12*sumI*sumIx*w2 + 2*A12*sumI*sumIy*w1 - - 2*dI*A12*sumIx*sumIy - 2*sumW*A12*w1*w2 + A11*A22*sumI*sumI - 2*A22*sumI*sumIx*w1 - - 2*A11*sumI*sumIy*w2 - sumIx*sumIx*w2*w2 + A22*dI*sumIx*sumIx + 2*sumIx*sumIy*w1*w2 + + D = - A12*A12*sumI*sumI + dI*sumW*A12*A12 + 2*A12*sumI*sumIx*w2 + 2*A12*sumI*sumIy*w1 + - 2*dI*A12*sumIx*sumIy - 2*sumW*A12*w1*w2 + A11*A22*sumI*sumI - 2*A22*sumI*sumIx*w1 + - 2*A11*sumI*sumIy*w2 - sumIx*sumIx*w2*w2 + A22*dI*sumIx*sumIx + 2*sumIx*sumIy*w1*w2 - sumIy*sumIy*w1*w1 + A11*dI*sumIy*sumIy + A22*sumW*w1*w1 + A11*sumW*w2*w2 - A11*A22*dI*sumW; - + float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + 4.f*A12*A12))/(2*winArea); @@ -581,10 +577,10 @@ class TrackerInvoker : public cv::ParallelLoopBody } break; } - - - D = (1.f / D); + + + D = (1.f / D); invTensorMat.at(0,0) = (A22*sumI*sumI - 2*sumI*sumIy*w2 + dI*sumIy*sumIy + sumW*w2*w2 - A22*dI*sumW)* D; invTensorMat.at(0,1) = (A12*dI*sumW - A12*sumI * sumI - dI*sumIx*sumIy + sumI*sumIx*w2 + sumI*sumIy*w1 - sumW*w1*w2)* D; invTensorMat.at(0,2) = (A12*sumI*sumIy - sumIy*sumIy*w1 - A22*sumI*sumIx - A12*sumW*w2 + A22*sumW*w1 + sumIx*sumIy*w2)* D; invTensorMat.at(0,3) = (A22*dI*sumIx - A12*dI*sumIy - sumIx*w2*w2 + A12*sumI*w2 - A22*sumI*w1 + sumIy*w1*w2) * D; @@ -606,18 +602,18 @@ class TrackerInvoker : public cv::ParallelLoopBody // 0.08 -12.10 Point2f delta(-resultMat.at(0), -resultMat.at(1)); Point2f 
deltaGain(-resultMat.at(2), -resultMat.at(3)); - - - if( j == 0) + + + if( j == 0) prevGain = deltaGain; nextPt += delta; nextPts[ptidx] = nextPt + halfWin; - gainVecs[ptidx]= gainVec; + gainVecs[ptidx]= gainVec; if( delta.ddot(delta) <= criteria.epsilon) break; - if( ( std::abs(delta.x - prevDelta.x) < 0.01 - && std::abs(delta.y - prevDelta.y) < 0.01) - || ((delta.ddot(delta) <= 0.001) + if( ( std::abs(delta.x - prevDelta.x) < 0.01 + && std::abs(delta.y - prevDelta.y) < 0.01) + || ((delta.ddot(delta) <= 0.001) && std::abs(prevGain.x - deltaGain.x) < 0.01) ) { @@ -626,23 +622,23 @@ class TrackerInvoker : public cv::ParallelLoopBody break; } - if(j > 0 && std::abs(delta.x - prevDelta.x) < 0.01 && + if(j > 0 && std::abs(delta.x - prevDelta.x) < 0.01 && std::abs(delta.y - prevDelta.y) < 0.01) { nextPts[ptidx] -= delta*0.5f; break; } - + prevDelta = delta; prevGain = deltaGain; - + } - - + + } } - + const Mat* prevImg; const Mat* nextImg; const Mat* prevDeriv; @@ -706,7 +702,7 @@ class TrackerInvoker : public cv::ParallelLoopBody useInitialFlow = _useInitialFlow; crossSegmentationThreshold = _crossSegmentationThreshold; } - + TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; void operator()(const cv::Range& range) const { @@ -722,8 +718,8 @@ class TrackerInvoker : public cv::ParallelLoopBody int winMaskwidth = static_cast(ceil(winSize.width / 8.f)) * 16; cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); winMaskMatBuf.setTo(1); - - + + const float FLT_SCALE = (1.f/(1 << 20)); // 20 @@ -761,7 +757,7 @@ class TrackerInvoker : public cv::ParallelLoopBody minWinSize,maxWinSize, crossSegmentationThreshold) == false) continue; - + prevPt -= halfWin; iprevPt.x = cvFloor(prevPt.x); iprevPt.y = cvFloor(prevPt.y); @@ -783,18 +779,18 @@ class TrackerInvoker : public cv::ParallelLoopBody float a = prevPt.x - iprevPt.x; float b = prevPt.y - iprevPt.y; const int W_BITS = 14, W_BITS1 = 14; - + int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); int 
iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - + int dstep = (int)(derivI.step/derivI.elemSize1()); int step = (int)(I.step/I.elemSize1()); int mStep = (int)(winMaskMat.step/winMaskMat.elemSize1()); CV_Assert( step == (int)(J.step/J.elemSize1()) ); float A11 = 0, A12 = 0, A22 = 0; - + #ifdef RLOF_SSE __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); @@ -805,7 +801,7 @@ class TrackerInvoker : public cv::ParallelLoopBody __m128i mmMask4_epi32; get4BitMask(winSize.width, mmMask4_epi32); #endif - + // extract the patch from the first image, compute covariation matrix of derivatives int x, y; for( y = 0; y < winSize.height; y++ ) @@ -813,41 +809,41 @@ class TrackerInvoker : public cv::ParallelLoopBody const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; - + short* Iptr = (short*)(IWinBuf.data + y*IWinBuf.step); short* dIptr = (short*)(derivIWinBuf.data + y*derivIWinBuf.step); - + x = 0; - + #ifdef RLOF_SSE for( ; x <= winSize.width*cn; x += 4, dsrc += 4*2, dIptr += 4*2 ) { - __m128i wMask = _mm_set_epi32(MaskSet * maskPtr[x+3], - MaskSet * maskPtr[x+2], - MaskSet * maskPtr[x+1], + __m128i wMask = _mm_set_epi32(MaskSet * maskPtr[x+3], + MaskSet * maskPtr[x+2], + MaskSet * maskPtr[x+1], MaskSet * maskPtr[x]); __m128i v00, v01, v10, v11, t0, t1; v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); - + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), 
_mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); - - + + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0,t0)); - - + + v00 = _mm_loadu_si128((const __m128i*)(dsrc)); v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep)); v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); - + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), @@ -855,7 +851,7 @@ class TrackerInvoker : public cv::ParallelLoopBody t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... - + if( x + 4 > winSize.width) { v00 = _mm_and_si128(v00, mmMask4_epi32); @@ -865,17 +861,17 @@ class TrackerInvoker : public cv::ParallelLoopBody _mm_storeu_si128((__m128i*)dIptr, v00); t0 = _mm_srai_epi32(v00, 16); // Iy0 Iy1 Iy2 Iy3 t1 = _mm_srai_epi32(_mm_slli_epi32(v00, 16), 16); // Ix0 Ix1 Ix2 Ix3 - + __m128 fy = _mm_cvtepi32_ps(t0); __m128 fx = _mm_cvtepi32_ps(t1); - - + + qA22 = _mm_add_ps(qA22, _mm_mul_ps(fy, fy)); qA12 = _mm_add_ps(qA12, _mm_mul_ps(fx, fy)); qA11 = _mm_add_ps(qA11, _mm_mul_ps(fx, fx)); } #else - + for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2 ) { if( maskPtr[x] == 0) @@ -890,7 +886,7 @@ class TrackerInvoker : public cv::ParallelLoopBody dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1); int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 + dsrc[dstep+cn2+1]*iw11, W_BITS1); - + Iptr[x] = (short)ival; dIptr[0] = (short)ixval; dIptr[1] = (short)iyval; @@ -900,7 +896,7 @@ class TrackerInvoker : public cv::ParallelLoopBody } #endif } - + #ifdef RLOF_SSE float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4]; _mm_store_ps(A11buf, qA11); @@ -910,25 +906,25 @@ class TrackerInvoker 
: public cv::ParallelLoopBody A12 += A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; A22 += A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; #endif - + A11 *= FLT_SCALE; A12 *= FLT_SCALE; A22 *= FLT_SCALE; - + float D = A11*A22 - A12*A12; float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + 4.f*A12*A12))/(2 * winArea); - + if( err ) err[ptidx] = (float)minEig; - + if( minEig < minEigThreshold || D < FLT_EPSILON ) { if( level == 0 && status ) status[ptidx] = 0; continue; } - + D = 1.f/D; nextPt -= halfWin; @@ -940,8 +936,8 @@ class TrackerInvoker : public cv::ParallelLoopBody status[ptidx] = static_cast(j); inextPt.x = cvFloor(nextPt.x); inextPt.y = cvFloor(nextPt.y); - - + + if( inextPt.x + halfWin.x < 0 || inextPt.x + halfWin.x>= derivI.cols || inextPt.y + halfWin.y < 0 || inextPt.y + halfWin.y >= derivI.rows ) { @@ -969,9 +965,9 @@ class TrackerInvoker : public cv::ParallelLoopBody const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x )*cn; const short* Iptr = (const short*)(IWinBuf.data + ( y)*IWinBuf.step); const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); - + x = 0; -#ifdef RLOF_SSE +#ifdef RLOF_SSE const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; for( ; x <= winSize.width*cn; x += 8, dIptr += 8*2 ) { @@ -983,29 +979,29 @@ class TrackerInvoker : public cv::ParallelLoopBody __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); - + __m128i t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); t1 = 
_mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1-5); - + __m128i mmDiff_epi16 = _mm_subs_epi16(_mm_packs_epi32(t0, t1), diff0); - - __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... + + __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); - + if( x > winSize.width*cn - 8) { Ixy_0 = _mm_and_si128(Ixy_0, mmMask0); Ixy_1 = _mm_and_si128(Ixy_1, mmMask1); } - - + + diff1 = _mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13... diff0 = _mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9... - + v10 = _mm_mullo_epi16(Ixy_0, diff0); v11 = _mm_mulhi_epi16(Ixy_0, diff0); @@ -1013,10 +1009,10 @@ class TrackerInvoker : public cv::ParallelLoopBody v10 = _mm_unpackhi_epi16(v10, v11); qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); - + // It * Ix It * Iy [4 ... 
7] // for set 1 hi sigma 1 - + v10 = _mm_mullo_epi16(Ixy_1, diff1); v11 = _mm_mulhi_epi16(Ixy_1, diff1); v00 = _mm_unpacklo_epi16(v10, v11); @@ -1027,7 +1023,7 @@ class TrackerInvoker : public cv::ParallelLoopBody #else for( ; x < winSize.width*cn; x++, dIptr += 2 ) { - if( dIptr[0] == 0 && dIptr[1] == 0) + if( dIptr[0] == 0 && dIptr[1] == 0) continue; int diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, @@ -1038,46 +1034,46 @@ class TrackerInvoker : public cv::ParallelLoopBody } #endif } - + #ifdef RLOF_SSE float CV_DECL_ALIGNED(16) bbuf[4]; _mm_store_ps(bbuf, _mm_add_ps(qb0, qb1)); b1 += bbuf[0] + bbuf[2]; b2 += bbuf[1] + bbuf[3]; #endif - + b1 *= FLT_SCALE; b2 *= FLT_SCALE; Point2f delta( (float)((A12*b2 - A22*b1) * D), (float)((A12*b1 - A11*b2) * D)); - + nextPt += delta; nextPts[ptidx] = nextPt + halfWin; if( delta.ddot(delta) <= criteria.epsilon) break; - if(j > 0 && std::abs(delta.x - prevDelta.x) < 0.01 && + if(j > 0 && std::abs(delta.x - prevDelta.x) < 0.01 && std::abs(delta.y - prevDelta.y) < 0.01) { nextPts[ptidx] -= delta*0.5f; break; } - + prevDelta = delta; - + } - - - - + + + + } } - + const Mat* prevImg; const Mat* nextImg; const Mat* prevDeriv; @@ -1100,4 +1096,4 @@ class TrackerInvoker : public cv::ParallelLoopBody } } }} -#endif \ No newline at end of file +#endif diff --git a/modules/optflow/src/rlof/rlof_invoker.hpp b/modules/optflow/src/rlof/rlof_invoker.hpp index c810debbdb9..2a4d39b1046 100644 --- a/modules/optflow/src/rlof/rlof_invoker.hpp +++ b/modules/optflow/src/rlof/rlof_invoker.hpp @@ -190,7 +190,7 @@ namespace ica x = 0; #ifdef RLOF_SSE - + for (; x <= winSize.width*cn; x += 4, dsrc += 4 * 2, dIptr += 4 * 2) { __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16); @@ -281,7 +281,7 @@ namespace ica break; } - + a = nextPt.x - inextPt.x; b = nextPt.y - inextPt.y; iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); @@ -344,7 
+344,7 @@ namespace ica __m128i mmScale = _mm_set1_epi16(static_cast(MEstimatorScale)); #endif - + buffIdx = 0; for (y = 0; y < winSize.height; y++) { @@ -354,7 +354,7 @@ namespace ica const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); x = 0; -#ifdef RLOF_SSE +#ifdef RLOF_SSE for (; x <= winSize.width*cn; x += 8, dIptr += 8 * 2) { __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16); @@ -377,12 +377,12 @@ namespace ica mmDiff_epi16 = _mm_and_si128(mmDiff_epi16, mask_0_7_epi16); - + __m128i scalediffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, mmScale); mmEta = _mm_add_epi16(mmEta, _mm_add_epi16(_mm_and_si128(scalediffIsPos_epi16, _mm_set1_epi16(2)), _mm_set1_epi16(-1))); - - __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... + + __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); __m128i abs_epi16 = _mm_abs_epi16(mmDiff_epi16); @@ -396,20 +396,20 @@ namespace ica // val = |It| -/+ sigma1 __m128i tmpParam1_epi16 = _mm_add_epi16(_mm_and_si128(diffIsPos_epi16, _mm_sub_epi16(mmDiff_epi16, mmParam1)), _mm_andnot_si128(diffIsPos_epi16, _mm_add_epi16(mmDiff_epi16, mmParam1))); - // It == 0 ? |It| > sigma13 + // It == 0 ? |It| > sigma13 mmDiff_epi16 = _mm_and_si128(bSet2_epi16, mmDiff_epi16); // It == val ? 
sigma0 < |It| < sigma1 mmDiff_epi16 = _mm_blendv_epi8(mmDiff_epi16, tmpParam1_epi16, bSet1_epi16); __m128i tale_epi16_ = _mm_blendv_epi8(mmOness_epi16, mmParam2_epi16, bSet1_epi16); // mask for 0 - 3 - // diff = diff * sigma2 + // diff = diff * sigma2 __m128i lo = _mm_mullo_epi16(tale_epi16_, mmDiff_epi16); __m128i hi = _mm_mulhi_epi16(tale_epi16_, mmDiff_epi16); - __m128i diff_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), s2bitShift); //diff 0_3_epi32 + __m128i diff_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), s2bitShift); //diff 0_3_epi32 __m128i diff_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), s2bitShift); // diff 4_7_epi32 - mmDiff_epi16 = _mm_packs_epi32(diff_0_3_epi32, diff_4_7_epi32); + mmDiff_epi16 = _mm_packs_epi32(diff_0_3_epi32, diff_4_7_epi32); diff1 = _mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13... diff0 = _mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9... 
@@ -449,7 +449,7 @@ namespace ica tale_0_3_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_0_3_ps, mask_0_4_ps); tale_4_7_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_4_7_ps, mask_4_7_ps); - + t0 = _mm_srai_epi32(Ixy_0, 16); // Iy0 Iy1 Iy2 Iy3 t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16); // Ix0 Ix1 Ix2 Ix3 @@ -488,18 +488,18 @@ namespace ica Jptr[x + step] * iw10 + Jptr[x + step + cn] * iw11, W_BITS1 - 5) - Iptr[x]; - - + + if (diff > MEstimatorScale) MEstimatorScale += eta; if (diff < MEstimatorScale) MEstimatorScale -= eta; - + calc_diff(diff, fParam0, fParam1, param[2]); - - + + float ixval = (float)(dIptr[0]); float iyval = (float)(dIptr[1]); b1 += (float)(diff*ixval); @@ -531,19 +531,19 @@ namespace ica } #endif } - + #ifdef RLOF_SSE short etaValues[8]; _mm_storeu_si128((__m128i*)(etaValues), mmEta); MEstimatorScale += eta * (etaValues[0] + etaValues[1] + etaValues[2] + etaValues[3] + etaValues[4] + etaValues[5] + etaValues[6] + etaValues[7]); - + #endif if (j == 0) { #ifdef RLOF_SSE - float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];// + float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];// _mm_store_ps(A11buf, mmAxx); _mm_store_ps(A12buf, mmAxy); @@ -554,8 +554,8 @@ namespace ica A12 = A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; A22 = A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; #endif - A11 *= FLT_SCALE; - A12 *= FLT_SCALE; + A11 *= FLT_SCALE; + A12 *= FLT_SCALE; A22 *= FLT_SCALE; @@ -683,7 +683,7 @@ namespace radial TrackerInvoker & operator=(const TrackerInvoker &) { return *this; }; void operator()(const cv::Range& range) const { - Point2f halfWin; + Point2f halfWin; cv::Size winSize; const Mat& I = *prevImg; const Mat& J = *nextImg; @@ -729,7 +729,7 @@ namespace radial else { nextPt = nextPts[ptidx] * 2.f; - + } nextPts[ptidx] = nextPt; @@ -895,7 +895,7 @@ namespace radial for (j = 0; j < criteria.maxCount; j++) { - + inextPt.x = cvFloor(nextPt.x); inextPt.y = cvFloor(nextPt.y); @@ -952,9 +952,9 @@ namespace radial { if (dIptr[0] == 0 && 
dIptr[1] == 0) continue; - int diff = static_cast(CV_DESCALE( Jptr[x] * iw00 + - Jptr[x + cn] * iw01 + - Jptr[x + step] * iw10 + + int diff = static_cast(CV_DESCALE( Jptr[x] * iw00 + + Jptr[x + cn] * iw01 + + Jptr[x + step] * iw10 + Jptr[x + step + cn] * iw11, W_BITS1 - 5) - Iptr[x] + Iptr[x] * gainVec.x + gainVec.y); residualMat.at(buffIdx++) = static_cast(diff); @@ -970,7 +970,7 @@ namespace radial float fParam1 = param[1] * 32.f; fParam0 = param[0] * MEstimatorScale; fParam1 = param[1] * MEstimatorScale; - + #ifdef RLOF_SSE qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); @@ -1011,7 +1011,7 @@ namespace radial const short* dIptr = (const short*)(derivIWinBuf.data + y * derivIWinBuf.step); x = 0; -#ifdef RLOF_SSE +#ifdef RLOF_SSE for (; x <= _winSize.width*cn; x += 8, dIptr += 8 * 2) @@ -1020,7 +1020,7 @@ namespace radial __m128i diff0, diff1; __m128i I_0_7_epi16 = _mm_loadu_si128((const __m128i*)(Iptr + x)); // von element 0 bis 7 __m128i v00 = _mm_unpacklo_epi8( - _mm_loadl_epi64((const __m128i*)(Jptr + x)) , z); //J0 , 0, J1, 0, J2, 0 ... J7,0 + _mm_loadl_epi64((const __m128i*)(Jptr + x)) , z); //J0 , 0, J1, 0, J2, 0 ... 
J7,0 __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); @@ -1036,7 +1036,7 @@ namespace radial t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1 - 5); t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1 - 5); - + __m128i lo = _mm_mullo_epi16(mmGainValue_epi16, I_0_7_epi16); __m128i hi = _mm_mulhi_epi16(mmGainValue_epi16, I_0_7_epi16); @@ -1049,14 +1049,14 @@ namespace radial _mm_subs_epi16(_mm_packs_epi32(t0, t1), I_0_7_epi16), // J - I _mm_add_epi16(Igain_epi16, mmConstValue_epi16)); - + mmDiff_epi16 = _mm_and_si128(mmDiff_epi16, mask_0_7_epi16); - + __m128i scalediffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, mmScale); mmEta = _mm_add_epi16(mmEta, _mm_add_epi16(_mm_and_si128(scalediffIsPos_epi16, _mm_set1_epi16(2)), _mm_set1_epi16(-1))); - - + + __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); @@ -1079,10 +1079,10 @@ namespace radial __m128i tale_epi16_ = _mm_blendv_epi8(mmOness_epi16, mmParam2_epi16, bSet1_epi16); // mask for 0 - 3 - // diff = diff * sigma2 + // diff = diff * sigma2 lo = _mm_mullo_epi16(tale_epi16_, mmDiff_epi16); hi = _mm_mulhi_epi16(tale_epi16_, mmDiff_epi16); - __m128i diff_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), s2bitShift); //diff 0_3_epi32 + __m128i diff_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), s2bitShift); //diff 0_3_epi32 __m128i diff_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), s2bitShift); // diff 4_7_epi32 mmDiff_epi16 = _mm_packs_epi32(diff_0_3_epi32, diff_4_7_epi32); // ,da das ergebniss kleiner als 16 bit sein sollte @@ -1120,7 +1120,7 @@ namespace radial qb3 = _mm_add_ps(qb3, _mm_cvtepi32_ps(diff_0_3_epi32)); qb3 = _mm_add_ps(qb3, _mm_cvtepi32_ps(diff_4_7_epi32)); - + if (j == 0) { 
__m128 bSet1_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet1_epi16)); @@ -1138,9 +1138,8 @@ namespace radial tale_0_3_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_0_3_ps, mask_0_4_ps); tale_4_7_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_4_7_ps, mask_4_7_ps); - - t0 = _mm_srai_epi32(Ixy_0, 16); // Iy0 Iy1 Iy2 Iy3 - t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16); // Ix0 Ix1 Ix2 Ix3 + t0 = _mm_srai_epi32(Ixy_0, 16); + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16); __m128 fy = _mm_cvtepi32_ps(t0); __m128 fx = _mm_cvtepi32_ps(t1); @@ -1212,9 +1211,7 @@ namespace radial { if (maskPtr[x] == 0) continue; - - float J = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + - Jptr[x + step] * iw10 + Jptr[x + step + cn] * iw11, + float J = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + Jptr[x + step] * iw10 + Jptr[x + step + cn] * iw11, W_BITS1 - 5); short ixval = static_cast(dIptr[0]); short iyval = static_cast(dIptr[1]); @@ -1224,8 +1221,6 @@ namespace radial MEstimatorScale += eta; if (diff < MEstimatorScale) MEstimatorScale -= eta; - // compute the missmatch vector - if (abss > fParam1) { diff = 0.f; @@ -1238,8 +1233,6 @@ namespace radial { diff = param[2] * (diff + fParam1); } - - b1 += (float)(diff*ixval); b2 += (float)(diff*iyval); ; b3 += (float)(diff)* Iptr[x]; @@ -1283,8 +1276,6 @@ namespace radial } #endif } - - #ifdef RLOF_SSE short etaValues[8]; _mm_storeu_si128((__m128i*)(etaValues), mmEta); @@ -1318,12 +1309,8 @@ namespace radial w1 *= -FLT_SCALE; w2 *= -FLT_SCALE; dI *= FLT_SCALE; - - #ifdef RLOF_SSE - - float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];// - + float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4]; _mm_store_ps(A11buf, mmAxx); _mm_store_ps(A12buf, mmAxy); _mm_store_ps(A22buf, mmAyy); @@ -1336,11 +1323,7 @@ namespace radial A11 *= FLT_SCALE; A12 *= FLT_SCALE; A22 *= FLT_SCALE; - - } - - #ifdef RLOF_SSE float CV_DECL_ALIGNED(16) bbuf[4]; _mm_store_ps(bbuf, _mm_add_ps(qb0, qb1)); @@ -1350,9 +1333,7 @@ namespace radial b3 = bbuf[0] + 
bbuf[1] + bbuf[2] + bbuf[3]; _mm_store_ps(bbuf, qb3); b4 = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; - #endif - mismatchMat.at(0, 0) = b1 * FLT_SCALE; mismatchMat.at(1, 0) = b2 * FLT_SCALE; mismatchMat.at(2, 0) = -b3 * FLT_SCALE; @@ -1397,13 +1378,12 @@ namespace radial resultMat = invTensorMat * mismatchMat; - Point2f delta(-resultMat.at(0), -resultMat.at(1)); Point2f deltaGain(resultMat.at(2), resultMat.at(3)); - + if (j == 0) prevGain = deltaGain; gainVec += deltaGain * 0.8; @@ -1454,4 +1434,4 @@ namespace radial } } } -#endif \ No newline at end of file +#endif diff --git a/modules/optflow/src/rlof/rlof_localflow.h b/modules/optflow/src/rlof/rlof_localflow.h index 98347e9dcb1..89ac2f8db21 100644 --- a/modules/optflow/src/rlof/rlof_localflow.h +++ b/modules/optflow/src/rlof/rlof_localflow.h @@ -25,9 +25,9 @@ T quickselect(const Mat & inp, int k) l = 0; ir = MAX(values.rows, values.cols) - 1; - while (true) + while(true) { - if (ir <= l + 1) + if (ir <= l + 1) { if (ir == l + 1 && values.at(ir) < values.at(l)) std::swap(values.at(l), values.at(ir)); @@ -116,4 +116,4 @@ void calcLocalOpticalFlow( const RLOFOpticalFlowParameter & param); } } -#endif \ No newline at end of file +#endif diff --git a/modules/optflow/test/test_OF_accuracy.cpp b/modules/optflow/test/test_OF_accuracy.cpp index b517896e045..e99a44c02c9 100644 --- a/modules/optflow/test/test_OF_accuracy.cpp +++ b/modules/optflow/test/test_OF_accuracy.cpp @@ -181,7 +181,6 @@ TEST(SparseOpticalFlow, ReferenceAccuracy) Mat frame1, frame2, GT; ASSERT_TRUE(readRubberWhale(frame1, frame2, GT)); vector prevPts, currPts; - for (int r = 0; r < frame1.rows; r+=10) { for (int c = 0; c < frame1.cols; c+=10) @@ -197,7 +196,6 @@ TEST(SparseOpticalFlow, ReferenceAccuracy) Ptr param; param->supportRegionType = SR_CROSS; param->useIlluminationModel = true; - param->solverType = ST_BILINEAR; algo->setRLOFOpticalFlowParameter(param); algo->calc(frame1, frame2, prevPts, currPts, status, err); @@ -209,7 +207,6 @@ 
TEST(SparseOpticalFlow, ReferenceAccuracy) EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.34f); param->useIlluminationModel = false; - param->solverType = ST_BILINEAR; algo->setRLOFOpticalFlowParameter(param); algo->calc(frame1, frame2, prevPts, currPts, status, err); @@ -251,7 +248,6 @@ TEST(DenseOpticalFlow_RLOF, ReferenceAccuracy) { Mat frame1, frame2, GT; ASSERT_TRUE(readRubberWhale(frame1, frame2, GT)); - Mat flow; Ptr algo = DenseRLOFOpticalFlow::create(); Ptr param; @@ -260,7 +256,6 @@ TEST(DenseOpticalFlow_RLOF, ReferenceAccuracy) algo->setRLOFOpticalFlowParameter(param); algo->setForwardBackward(1.0f); algo->setGridStep(cv::Size(4, 4)); - algo->setInterpolation(INTERP_EPIC); algo->calc(frame1, frame2, flow); From 4956de9010c0df00cea94e3007ae9b59ae41e7fa Mon Sep 17 00:00:00 2001 From: senst Date: Tue, 11 Dec 2018 12:26:52 +0100 Subject: [PATCH 05/25] remove whitespaces fix perf and accuracy tests --- .../include/opencv2/optflow/rlofflow.hpp | 32 ++++++---- modules/optflow/perf/perf_rlof.cpp | 24 +++----- modules/optflow/src/rlof/berlof_invoker.hpp | 28 +++++---- modules/optflow/src/rlof/plk_invoker.hpp | 19 ++++-- modules/optflow/src/rlof/rlof_invoker.hpp | 59 +++++++++---------- modules/optflow/src/rlof/rlof_invokerbase.hpp | 9 ++- modules/optflow/src/rlofflow.cpp | 12 ++-- modules/optflow/test/test_OF_accuracy.cpp | 19 +++--- 8 files changed, 103 insertions(+), 99 deletions(-) diff --git a/modules/optflow/include/opencv2/optflow/rlofflow.hpp b/modules/optflow/include/opencv2/optflow/rlofflow.hpp index f088562215e..4911c53b947 100644 --- a/modules/optflow/include/opencv2/optflow/rlofflow.hpp +++ b/modules/optflow/include/opencv2/optflow/rlofflow.hpp @@ -21,7 +21,7 @@ namespace optflow * This RLOF implementation can be seen as an improved pyramidal iterative Lucas-Kanade and includes * a set of improving modules. 
The main improvements in respect to the pyramidal iterative Lucas-Kanade * are: - * - A more robust redecending M-estimator framework (see @cite Senst2012) to improve the accuracy at + * - A more robust redecending M-estimator framework (see @cite Senst2012) to improve the accuracy at * motion boundaries and appearing and disappearing pixels. * - an adaptive support region strategies to improve the accuracy at motion boundaries to reduce the * corona effect, i.e oversmoothing of the PLK at motion/object boundaries. The cross-based segementation @@ -143,8 +143,8 @@ namespace optflow float globalMotionRansacThreshold; /**< To apply the global motion prior motion vectors will be computed on a regulary sampled which * are the basis for Homography estimation using RANSAC. The reprojection threshold is based on - * n-th percentil (given by this value [0 ... 100]) of the forward backward errors relating to - * the motion vectors. See @cite Senst2016 for more details. + * n-th percentil (given by this value [0 ... 100]) of the motion vectors magnitude. + * See @cite Senst2016 for more details. */ }; @@ -165,7 +165,8 @@ namespace optflow * For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details. * Parameters have been described in @cite Senst2012 @cite Senst2013 @cite Senst2014 and @cite Senst2016. * - * @note SIMD parallelization is available compiled with SSE4.1. If the grid size is set to (1,1) and the forward backward threshold <= 0 that the dense optical flow field is purely. + * @note SIMD parallelization is only available when compiling with SSE4.1. If the grid size is set to (1,1) and the + * forward backward threshold <= 0 that the dense optical flow field is purely. * computed with the RLOF. * * @see optflow::calcOpticalFlowDenseRLOF(), optflow::RLOFOpticalFlowParameter @@ -207,7 +208,7 @@ namespace optflow CV_WRAP virtual void setGridStep(Size val) = 0; //! @brief Interpolation used to compute the dense optical flow. 
- /** Two interpolation algorithms have been supported + /** Two interpolation algorithms are supported * - **INTERP_GEO** applies the fast geodesic interpolation, see @cite Geistert2016. * - **INTERP_EPIC_RESIDUAL** applies the edge-preserving interpolation, see @cite Revaud2015,Geistert2016. * @see ximgproc::EdgeAwareInterpolator, getInterpolation @@ -280,6 +281,7 @@ namespace optflow * @param rlofParam see optflow::RLOFOpticalFlowParameter * @param forwardBackwardThreshold see setForwardBackward * @param gridStep see setGridStep + * @param interp_type see setInterpolation * @param epicK see setEPICK * @param epicSigma see setEPICSigma * @param epicLambda see setEPICLambda @@ -308,8 +310,8 @@ namespace optflow * For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details. * Parameters have been described in @cite Senst2012, @cite Senst2013, @cite Senst2014 and @cite Senst2016. * - * @note SIMD parallelization is available compiled with SSE4.1. - *@see optflow::calcOpticalFlowSparseRLOF(), optflow::RLOFOpticalFlowParameter + * @note SIMD parallelization is only available when compiling with SSE4.1. + * @see optflow::calcOpticalFlowSparseRLOF(), optflow::RLOFOpticalFlowParameter */ class CV_EXPORTS_W SparseRLOFOpticalFlow : public SparseOpticalFlow { @@ -365,7 +367,7 @@ namespace optflow * = SupportRegionType::SR_CROSS) image has to be a 8-bit 3 channel image. * @param flow computed flow image that has the same size as I0 and type CV_32FC2. * @param rlofParam see optflow::RLOFOpticalFlowParameter - * @param forewardBackwardThreshold Threshold for the forward backward confidence check. + * @param forwardBackwardThreshold Threshold for the forward backward confidence check. * For each grid point \f$ \mathbf{x} \f$ a motion vector \f$ d_{I0,I1}(\mathbf{x}) \f$ is computed. 
* If the forward backward error \f[ EP_{FB} = || d_{I0,I1} + d_{I1,I0} || \f] * is larger than threshold given by this function then the motion vector will not be used by the following @@ -374,6 +376,10 @@ namespace optflow * @param gridStep Size of the grid to spawn the motion vectors. For each grid point a motion vector is computed. * Some motion vectors will be removed due to the forwatd backward threshold (if set >0). The rest will be the * base of the vector field interpolation. + * @param interp_type interpolation method used to compute the dense optical flow. Two interpolation algorithms are + * supported: + * - **INTERP_GEO** applies the fast geodesic interpolation, see @cite Geistert2016. + * - **INTERP_EPIC_RESIDUAL** applies the edge-preserving interpolation, see @cite Revaud2015,Geistert2016. * @param epicK see ximgproc::EdgeAwareInterpolator() sets the respective parameter. * @param epicSigma see ximgproc::EdgeAwareInterpolator() sets the respective parameter. * @param epicLambda see ximgproc::EdgeAwareInterpolator() sets the respective parameter. @@ -386,13 +392,13 @@ namespace optflow * @note If the grid size is set to (1,1) and the forward backward threshold <= 0 that the dense optical flow field is purely * computed with the RLOF. * - * @note SIMD parallelization is available compiled with SSE4.1. + * @note SIMD parallelization is only available when compiling with SSE4.1. 
* * @sa optflow::DenseRLOFOpticalFlow, optflow::RLOFOpticalFlowParameter */ CV_EXPORTS_W void calcOpticalFlowDenseRLOF(InputArray I0, InputArray I1, InputOutputArray flow, Ptr rlofParam = Ptr(), - float forewardBackwardThreshold = 0, Size gridStep = Size(6, 6), + float forwardBackwardThreshold = 0, Size gridStep = Size(6, 6), InterpolationType interp_type = InterpolationType::INTERP_EPIC, int epicK = 128, float epicSigma = 0.05f, float epicLambda = 100.f, bool use_post_proc = true, float fgsLambda = 500.0f, float fgsSigma = 1.5f); @@ -416,9 +422,9 @@ namespace optflow * corresponding features has passed the forward backward check. * @param err output vector of errors; each element of the vector is set to the forward backward error for the corresponding feature. * @param rlofParam see optflow::RLOFOpticalFlowParameter - * @param forewardBackwardThreshold Threshold for the forward backward confidence check. If forewardBackwardThreshold <=0 the forward + * @param forwardBackwardThreshold Threshold for the forward backward confidence check. If forewardBackwardThreshold <=0 the forward * - * @note SIMD parallelization is available compiled with SSE4.1. + * @note SIMD parallelization is only available when compiling with SSE4.1. * * Parameters have been described in @cite Senst2012, @cite Senst2013, @cite Senst2014 and @cite Senst2016. * For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details. @@ -427,7 +433,7 @@ namespace optflow InputArray prevPts, InputOutputArray nextPts, OutputArray status, OutputArray err, Ptr rlofParam = Ptr(), - float forewardBackwardThreshold = 0); + float forwardBackwardThreshold = 0); //! 
Additional interface to the Dense RLOF algorithm - optflow::calcOpticalFlowDenseRLOF() CV_EXPORTS_W Ptr createOptFlow_DenseRLOF(); diff --git a/modules/optflow/perf/perf_rlof.cpp b/modules/optflow/perf/perf_rlof.cpp index 910fa58375c..dda15a4b94e 100644 --- a/modules/optflow/perf/perf_rlof.cpp +++ b/modules/optflow/perf/perf_rlof.cpp @@ -1,6 +1,5 @@ #include "perf_precomp.hpp" -namespace opencv_test { - namespace { +namespace opencv_test { namespace { typedef tuple ST_SR_IM_Sparse_t; typedef TestBaseWithParam ST_SR_IM_Sparse; PERF_TEST_P(ST_SR_IM_Sparse, OpticalFlow_SparseRLOF, @@ -10,12 +9,8 @@ namespace opencv_test { testing::Values(true, false)) ) { - string frame1_path = TS::ptr()->get_data_path() + "/cv/optflow/RubberWhale1.png"; - string frame2_path = TS::ptr()->get_data_path() + "/cv/optflow/RubberWhale2.png"; - frame1_path.erase(std::remove_if(frame1_path.begin(), frame1_path.end(), isspace), frame1_path.end()); - frame2_path.erase(std::remove_if(frame2_path.begin(), frame2_path.end(), isspace), frame2_path.end()); - Mat frame1 = imread(frame1_path); - Mat frame2 = imread(frame2_path); + Mat frame1 = imread(getDataPath("cv/optflow/RubberWhale1.png")); + Mat frame2 = imread(getDataPath("cv/optflow/RubberWhale2.png")); ASSERT_FALSE(frame1.empty()); ASSERT_FALSE(frame2.empty()); vector prevPts, currPts; @@ -29,7 +24,7 @@ namespace opencv_test { vector status(prevPts.size()); vector err(prevPts.size()); - Ptr param; + Ptr param = Ptr(new RLOFOpticalFlowParameter); if (get<0>(GetParam()) == "ST_BILINEAR") param->solverType = ST_BILINEAR; if (get<0>(GetParam()) == "ST_STANDART") @@ -57,16 +52,11 @@ namespace opencv_test { ) { Mat flow; - string frame1_path = TS::ptr()->get_data_path() + "/cv/optflow/RubberWhale1.png"; - string frame2_path = TS::ptr()->get_data_path() + "/cv/optflow/RubberWhale2.png"; - // removing space may be an issue on windows machines - frame1_path.erase(std::remove_if(frame1_path.begin(), frame1_path.end(), isspace), frame1_path.end()); - 
frame2_path.erase(std::remove_if(frame2_path.begin(), frame2_path.end(), isspace), frame2_path.end()); - Mat frame1 = imread(frame1_path); - Mat frame2 = imread(frame2_path); + Mat frame1 = imread(getDataPath("cv/optflow/RubberWhale1.png")); + Mat frame2 = imread(getDataPath("cv/optflow/RubberWhale1.png")); ASSERT_FALSE(frame1.empty()); ASSERT_FALSE(frame2.empty()); - Ptr param; + Ptr param = Ptr(new RLOFOpticalFlowParameter);; Ptr< DenseRLOFOpticalFlow> algo = DenseRLOFOpticalFlow::create(); InterpolationType interp_type; if (get<0>(GetParam()) == "INTERP_EPIC") diff --git a/modules/optflow/src/rlof/berlof_invoker.hpp b/modules/optflow/src/rlof/berlof_invoker.hpp index 9c229bf542b..3dfed1ce6bd 100644 --- a/modules/optflow/src/rlof/berlof_invoker.hpp +++ b/modules/optflow/src/rlof/berlof_invoker.hpp @@ -95,9 +95,11 @@ namespace berlof } TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; - void operator()(const cv::Range& range) const + void operator()(const cv::Range& range) const CV_OVERRIDE { - +#ifdef DEBUG_INVOKER + printf("berlof::ica");fflush(stdout); +#endif Point2f halfWin; cv::Size winSize; const Mat& I = *prevImg; @@ -610,8 +612,11 @@ namespace berlof } TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; - void operator()(const cv::Range& range) const + void operator()(const cv::Range& range) const CV_OVERRIDE { +#ifdef DEBUG_INVOKER + printf("berlof::radial");fflush(stdout); +#endif Point2f halfWin; cv::Size winSize; const Mat& I = *prevImg; @@ -725,16 +730,14 @@ namespace berlof for( y = 0; y < winSize.height; y++ ) { x = 0; - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; short* Iptr = (short*)(IWinBuf.data + y*IWinBuf.step); short* dIptr = (short*)(derivIWinBuf.data + y*derivIWinBuf.step); - - - #ifdef RLOF_SSE + const 
tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; for( ; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 ) { __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16); @@ -812,9 +815,11 @@ namespace berlof cv::Point2f backUpGain = gainVec; cv::Size _winSize = winSize; int j; + #ifdef RLOF_SSE __m128i mmMask0, mmMask1, mmMask; getWBitMask(_winSize.width, mmMask0, mmMask1, mmMask); __m128 mmOnes = _mm_set1_ps(1.f ); + #endif float MEstimatorScale = 1; int buffIdx = 0; float c[8]; @@ -1172,11 +1177,11 @@ namespace berlof (Jptr[x] << 5) + illValue}; - float J = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + + float J_val = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, W_BITS1-5); - int diff = J + illValue; + int diff = J_val + illValue; @@ -1274,9 +1279,8 @@ namespace berlof _mm_storeu_si128((__m128i*)(etaValues), mmEta); MEstimatorScale += eta * (etaValues[0] + etaValues[1] + etaValues[2] + etaValues[3] + etaValues[4] + etaValues[5] + etaValues[6] + etaValues[7]); - + float CV_DECL_ALIGNED(32) wbuf[4]; #endif - float CV_DECL_ALIGNED(32) wbuf[4];// if( j == 0 ) { #ifdef RLOF_SSE @@ -1575,7 +1579,7 @@ namespace beplk TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; - void operator()(const cv::Range& range) const + void operator()(const cv::Range& range) const CV_OVERRIDE { cv::Size winSize; cv::Point2f halfWin; diff --git a/modules/optflow/src/rlof/plk_invoker.hpp b/modules/optflow/src/rlof/plk_invoker.hpp index 0d7b5ddb481..14b6eb78d22 100644 --- a/modules/optflow/src/rlof/plk_invoker.hpp +++ b/modules/optflow/src/rlof/plk_invoker.hpp @@ -61,8 +61,11 @@ class TrackerInvoker : public cv::ParallelLoopBody } TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; - void operator()(const cv::Range& range) const + void operator()(const cv::Range& range) const CV_OVERRIDE { +#ifdef DEBUG_INVOKER + 
printf("plk::radial");fflush(stdout); +#endif cv::Size winSize; cv::Point2f halfWin; @@ -467,10 +470,10 @@ class TrackerInvoker : public cv::ParallelLoopBody { if( maskPtr[x] == 0) continue; - int J = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + + int J_val = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, W_BITS1-5); - int diff = J - Iptr[x] + static_cast(Iptr[x]) * gainVec.x + gainVec.y; + int diff = J_val - Iptr[x] + static_cast(Iptr[x]) * gainVec.x + gainVec.y; b1 += (float)(diff*dIptr[0]) * FLT_RESCALE; b2 += (float)(diff*dIptr[1]) * FLT_RESCALE; @@ -499,8 +502,9 @@ class TrackerInvoker : public cv::ParallelLoopBody } #endif } - - float CV_DECL_ALIGNED(16) wbuf[4];// +#ifdef RLOF_SSE + float CV_DECL_ALIGNED(16) wbuf[4]; +#endif if( j == 0 ) { #ifdef RLOF_SSE @@ -704,8 +708,11 @@ class TrackerInvoker : public cv::ParallelLoopBody } TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; - void operator()(const cv::Range& range) const + void operator()(const cv::Range& range) const CV_OVERRIDE { +#ifdef DEBUG_INVOKER + printf("plk::ica");fflush(stdout); +#endif cv::Size winSize; cv::Point2f halfWin; diff --git a/modules/optflow/src/rlof/rlof_invoker.hpp b/modules/optflow/src/rlof/rlof_invoker.hpp index 2a4d39b1046..7da20aa7d77 100644 --- a/modules/optflow/src/rlof/rlof_invoker.hpp +++ b/modules/optflow/src/rlof/rlof_invoker.hpp @@ -12,26 +12,6 @@ namespace rlof { namespace ica { - inline int calc_diff(int diff,float fParam0,float fParam1,float param_2) - { - int abss = (diff < 0) ? 
-diff : diff; - if (abss > fParam1) - { - diff = 0; - } - else if (abss > fParam0 && diff >= 0) - { - //diff = fParam1 * (diff - fParam1); - diff = static_cast(param_2 * (diff - fParam1)); - } - else if (abss > fParam0 && diff < 0) - { - //diff = fParam1 * (diff + fParam1); - diff = static_cast(param_2 * (diff + fParam1)); - - } - return diff; - } class TrackerInvoker : public cv::ParallelLoopBody { public: @@ -77,8 +57,11 @@ namespace ica crossSegmentationThreshold = _crossSegmentationThreshold; } TrackerInvoker & operator=(const TrackerInvoker &) { return *this; }; - void operator()(const cv::Range& range) const + void operator()(const cv::Range& range) const CV_OVERRIDE { +#ifdef DEBUG_INVOKER + printf("rlof::ica");fflush(stdout); +#endif cv::Size winSize; cv::Point2f halfWin; @@ -497,8 +480,22 @@ namespace ica if (diff < MEstimatorScale) MEstimatorScale -= eta; - calc_diff(diff, fParam0, fParam1, param[2]); + int abss = (diff < 0) ? -diff : diff; + if (abss > fParam1) + { + diff = 0; + } + else if (abss > fParam0 && diff >= 0) + { + //diff = fParam1 * (diff - fParam1); + diff = static_cast(param[2] * (diff - fParam1)); + } + else if (abss > fParam0 && diff < 0) + { + //diff = fParam1 * (diff + fParam1); + diff = static_cast(param[2] * (diff + fParam1)); + } float ixval = (float)(dIptr[0]); float iyval = (float)(dIptr[1]); @@ -507,9 +504,7 @@ namespace ica if ( j == 0) { - int abss = (diff < 0) ? 
-diff : diff; float tale = param[2] * FLT_RESCALE; - //float tale = fParam1 * FLT_RESCALE; if (abss < fParam0) { tale = FLT_RESCALE; @@ -681,8 +676,11 @@ namespace radial } TrackerInvoker & operator=(const TrackerInvoker &) { return *this; }; - void operator()(const cv::Range& range) const + void operator()(const cv::Range& range) const CV_OVERRIDE { +#ifdef DEBUG_INVOKER + printf("rlof::radial");fflush(stdout); +#endif Point2f halfWin; cv::Size winSize; const Mat& I = *prevImg; @@ -801,7 +799,6 @@ namespace radial int x, y; for (y = 0; y < winSize.height; y++) { - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; @@ -811,6 +808,7 @@ namespace radial x = 0; #ifdef RLOF_SSE + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; for (; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4 * 2, dIptr += 4 * 2) { __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16); @@ -886,9 +884,11 @@ namespace radial cv::Point2f backUpGain = gainVec; cv::Size _winSize = winSize; int j; +#ifdef RLOF_SSE __m128i mmMask0, mmMask1, mmMask; getWBitMask(_winSize.width, mmMask0, mmMask1, mmMask); __m128 mmOnes = _mm_set1_ps(1.f); +#endif float MEstimatorScale = 1; int buffIdx = 0; float minEigValue; @@ -1211,11 +1211,11 @@ namespace radial { if (maskPtr[x] == 0) continue; - float J = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + Jptr[x + step] * iw10 + Jptr[x + step + cn] * iw11, + float J_val = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + Jptr[x + step] * iw10 + Jptr[x + step + cn] * iw11, W_BITS1 - 5); short ixval = static_cast(dIptr[0]); short iyval = static_cast(dIptr[1]); - int diff = J - static_cast(Iptr[x]) + Iptr[x] * gainVec.x + gainVec.y; + int diff = J_val - static_cast(Iptr[x]) + Iptr[x] * 
gainVec.x + gainVec.y; int abss = (diff < 0) ? -diff : diff; if (diff > MEstimatorScale) MEstimatorScale += eta; @@ -1281,9 +1281,8 @@ namespace radial _mm_storeu_si128((__m128i*)(etaValues), mmEta); MEstimatorScale += eta * (etaValues[0] + etaValues[1] + etaValues[2] + etaValues[3] + etaValues[4] + etaValues[5] + etaValues[6] + etaValues[7]); - + float CV_DECL_ALIGNED(32) wbuf[4]; #endif - float CV_DECL_ALIGNED(32) wbuf[4];// if (j == 0) { #ifdef RLOF_SSE diff --git a/modules/optflow/src/rlof/rlof_invokerbase.hpp b/modules/optflow/src/rlof/rlof_invokerbase.hpp index 34b112fa570..42275cfb2a7 100644 --- a/modules/optflow/src/rlof/rlof_invokerbase.hpp +++ b/modules/optflow/src/rlof/rlof_invokerbase.hpp @@ -7,10 +7,12 @@ #if CV_CPU_COMPILE_SSE4_1 #define RLOF_SSE -#elif CV_CPU_COMPILE_SSE4_2 +#endif +#if CV_CPU_COMPILE_SSE4_2 #define RLOF_SSE #endif +//#define DEBUG_INVOKER #define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n)) #define FLT_RESCALE 1 @@ -271,8 +273,5 @@ namespace optflow return quickselect(absMat, absMat.rows / 2); } } - - } - -#endif \ No newline at end of file +#endif diff --git a/modules/optflow/src/rlofflow.cpp b/modules/optflow/src/rlofflow.cpp index 08e16d9a31d..3ba5a782f0b 100644 --- a/modules/optflow/src/rlofflow.cpp +++ b/modules/optflow/src/rlofflow.cpp @@ -14,7 +14,8 @@ namespace optflow { public: DenseOpticalFlowRLOFImpl() - : forwardBackwardThreshold(1.f) + : param(Ptr(new RLOFOpticalFlowParameter)) + , forwardBackwardThreshold(1.f) , gridStep(6, 6) , interp_type(InterpolationType::INTERP_GEO) , k(128) @@ -68,7 +69,6 @@ namespace optflow if (param->supportRegionType == SR_CROSS) CV_Assert( I0.channels() == 3 && I1.channels() == 3); CV_Assert(interp_type == InterpolationType::INTERP_EPIC || interp_type == InterpolationType::INTERP_GEO); - Mat prevImage = I0.getMat(); Mat currImage = I1.getMat(); int noPoints = prevImage.cols * prevImage.rows; @@ -85,7 +85,6 @@ namespace optflow } prevPoints.erase(prevPoints.begin() + noPoints, 
prevPoints.end()); currPoints.resize(prevPoints.size()); - calcLocalOpticalFlow(prevImage, currImage, prevPyramid, currPyramid, prevPoints, currPoints, *(param.get())); flow.create(prevImage.size(), CV_32FC2); @@ -211,15 +210,16 @@ namespace optflow { public: SparseRLOFOpticalFlowImpl() - : forwardBackwardThreshold(1.f) + : param(Ptr(new RLOFOpticalFlowParameter)) + , forwardBackwardThreshold(1.f) { prevPyramid[0] = cv::Ptr< CImageBuffer>(new CImageBuffer); prevPyramid[1] = cv::Ptr< CImageBuffer>(new CImageBuffer); currPyramid[0] = cv::Ptr< CImageBuffer>(new CImageBuffer); currPyramid[1] = cv::Ptr< CImageBuffer>(new CImageBuffer); } - virtual void setRLOFOpticalFlowParameter(Ptr val) CV_OVERRIDE { } // param = *val; } - virtual Ptr getRLOFOpticalFlowParameter() const CV_OVERRIDE { return Ptr(); } + virtual void setRLOFOpticalFlowParameter(Ptr val) CV_OVERRIDE { param = val; } + virtual Ptr getRLOFOpticalFlowParameter() const CV_OVERRIDE { return param; } virtual float getForwardBackward() const CV_OVERRIDE { return forwardBackwardThreshold; } virtual void setForwardBackward(float val) CV_OVERRIDE { forwardBackwardThreshold = val; } diff --git a/modules/optflow/test/test_OF_accuracy.cpp b/modules/optflow/test/test_OF_accuracy.cpp index e99a44c02c9..67d58f243b0 100644 --- a/modules/optflow/test/test_OF_accuracy.cpp +++ b/modules/optflow/test/test_OF_accuracy.cpp @@ -192,14 +192,13 @@ TEST(SparseOpticalFlow, ReferenceAccuracy) vector err(prevPts.size()); Ptr algo = SparseRLOFOpticalFlow::create(); algo->setForwardBackward(0.0f); - - Ptr param; + Ptr param = Ptr(new RLOFOpticalFlowParameter); param->supportRegionType = SR_CROSS; param->useIlluminationModel = true; param->solverType = ST_BILINEAR; algo->setRLOFOpticalFlowParameter(param); algo->calc(frame1, frame2, prevPts, currPts, status, err); - EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.22f); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.25f); param->solverType = ST_STANDART; 
algo->setRLOFOpticalFlowParameter(param); @@ -210,12 +209,12 @@ TEST(SparseOpticalFlow, ReferenceAccuracy) param->solverType = ST_BILINEAR; algo->setRLOFOpticalFlowParameter(param); algo->calc(frame1, frame2, prevPts, currPts, status, err); - EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.24f); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.25f); param->solverType = ST_STANDART; algo->setRLOFOpticalFlowParameter(param); algo->calc(frame1, frame2, prevPts, currPts, status, err); - EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.23f); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.25f); param->normSigma0 = numeric_limits::max(); param->normSigma1 = numeric_limits::max(); @@ -224,12 +223,12 @@ TEST(SparseOpticalFlow, ReferenceAccuracy) param->solverType = ST_BILINEAR; algo->setRLOFOpticalFlowParameter(param); algo->calc(frame1, frame2, prevPts, currPts, status, err); - EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.26f); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.28f); param->solverType = ST_STANDART; algo->setRLOFOpticalFlowParameter(param); algo->calc(frame1, frame2, prevPts, currPts, status, err); - EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.26f); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.28f); param->useIlluminationModel = false; @@ -241,7 +240,7 @@ TEST(SparseOpticalFlow, ReferenceAccuracy) param->solverType = ST_STANDART; algo->setRLOFOpticalFlowParameter(param); algo->calc(frame1, frame2, prevPts, currPts, status, err); - EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.26f); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.28f); } TEST(DenseOpticalFlow_RLOF, ReferenceAccuracy) @@ -250,7 +249,7 @@ TEST(DenseOpticalFlow_RLOF, ReferenceAccuracy) ASSERT_TRUE(readRubberWhale(frame1, frame2, GT)); Mat flow; Ptr algo = DenseRLOFOpticalFlow::create(); - Ptr param; + Ptr param = Ptr(new RLOFOpticalFlowParameter); param->supportRegionType = SR_CROSS; param->solverType = ST_BILINEAR; algo->setRLOFOpticalFlowParameter(param); @@ -261,7 +260,7 @@ TEST(DenseOpticalFlow_RLOF, 
ReferenceAccuracy) ASSERT_EQ(GT.rows, flow.rows); ASSERT_EQ(GT.cols, flow.cols); - EXPECT_LE(calcRMSE(GT, flow), 0.36f); + EXPECT_LE(calcRMSE(GT, flow), 0.37f); algo->setInterpolation(INTERP_GEO); algo->calc(frame1, frame2, flow); From a6cc9bb0bc5d2a4362ad2a35a120519789e4a8b5 Mon Sep 17 00:00:00 2001 From: senst Date: Tue, 11 Dec 2018 12:57:24 +0100 Subject: [PATCH 06/25] remove x86intrin.h header --- modules/optflow/src/rlof/rlof_invokerbase.hpp | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/modules/optflow/src/rlof/rlof_invokerbase.hpp b/modules/optflow/src/rlof/rlof_invokerbase.hpp index 42275cfb2a7..adb45f039b4 100644 --- a/modules/optflow/src/rlof/rlof_invokerbase.hpp +++ b/modules/optflow/src/rlof/rlof_invokerbase.hpp @@ -5,10 +5,10 @@ #define _RLOF_INVOKERBASE_HPP_ -#if CV_CPU_COMPILE_SSE4_1 +#ifdef CV_CPU_COMPILE_SSE4_1 #define RLOF_SSE #endif -#if CV_CPU_COMPILE_SSE4_2 +#ifdef CV_CPU_COMPILE_SSE4_2 #define RLOF_SSE #endif @@ -16,9 +16,6 @@ #define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n)) #define FLT_RESCALE 1 -#ifndef _MSC_VER -#include -#endif #include "opencv2/core.hpp" #include "opencv2/video.hpp" From c367b8c55b43452d9b784f23e36f7955a7378b0e Mon Sep 17 00:00:00 2001 From: senst Date: Tue, 11 Dec 2018 14:16:08 +0100 Subject: [PATCH 07/25] fix ios and arm by removing last sse commands --- modules/optflow/src/rlof/berlof_invoker.hpp | 4 ++++ modules/optflow/src/rlof/plk_invoker.hpp | 2 ++ modules/optflow/src/rlof/rlof_invoker.hpp | 2 ++ modules/optflow/src/rlof/rlof_invokerbase.hpp | 3 ++- 4 files changed, 10 insertions(+), 1 deletion(-) diff --git a/modules/optflow/src/rlof/berlof_invoker.hpp b/modules/optflow/src/rlof/berlof_invoker.hpp index 3dfed1ce6bd..d550e67a7b6 100644 --- a/modules/optflow/src/rlof/berlof_invoker.hpp +++ b/modules/optflow/src/rlof/berlof_invoker.hpp @@ -222,8 +222,10 @@ namespace berlof nextPt -= halfWin; Point2f prevDelta(0,0); //denotes h(t-1) cv::Size _winSize = winSize; +#ifdef RLOF_SSE __m128i 
mmMask0, mmMask1, mmMask; getWBitMask(_winSize.width, mmMask0, mmMask1, mmMask); +#endif float MEstimatorScale = 1; int buffIdx = 0; float c[8]; @@ -1803,8 +1805,10 @@ namespace beplk Point2f prevDelta(0,0); float c[8]; +#ifdef RLOF_SSE __m128i mmMask0, mmMask1, mmMask; getWBitMask(winSize.width, mmMask0, mmMask1, mmMask); +#endif for( j = 0; j < criteria.maxCount; j++ ) { cv::Point2f delta; diff --git a/modules/optflow/src/rlof/plk_invoker.hpp b/modules/optflow/src/rlof/plk_invoker.hpp index 14b6eb78d22..39f81fe6a18 100644 --- a/modules/optflow/src/rlof/plk_invoker.hpp +++ b/modules/optflow/src/rlof/plk_invoker.hpp @@ -936,8 +936,10 @@ class TrackerInvoker : public cv::ParallelLoopBody nextPt -= halfWin; Point2f prevDelta(0,0); //relates to h(t-1) +#ifdef RLOF_SSE __m128i mmMask0, mmMask1, mmMask; getWBitMask(winSize.width, mmMask0, mmMask1, mmMask); +#endif for( j = 0; j < criteria.maxCount; j++ ) { status[ptidx] = static_cast(j); diff --git a/modules/optflow/src/rlof/rlof_invoker.hpp b/modules/optflow/src/rlof/rlof_invoker.hpp index 7da20aa7d77..6faf5a6247f 100644 --- a/modules/optflow/src/rlof/rlof_invoker.hpp +++ b/modules/optflow/src/rlof/rlof_invoker.hpp @@ -243,8 +243,10 @@ namespace ica cv::Point2f backUpNextPt = nextPt; nextPt -= halfWin; int j; +#ifdef RLOF_SSE __m128i mmMask0, mmMask1, mmMask; getWBitMask(winSize.width, mmMask0, mmMask1, mmMask); +#endif float MEstimatorScale = 1; int buffIdx = 0; cv::Point2f prevDelta(0, 0); diff --git a/modules/optflow/src/rlof/rlof_invokerbase.hpp b/modules/optflow/src/rlof/rlof_invokerbase.hpp index adb45f039b4..68e2ca28bbc 100644 --- a/modules/optflow/src/rlof/rlof_invokerbase.hpp +++ b/modules/optflow/src/rlof/rlof_invokerbase.hpp @@ -31,6 +31,7 @@ namespace cv namespace optflow { typedef short deriv_type; +#ifdef RLOF_SSE inline void get4BitMask(const int & width, __m128i & mask) { int noBits = width - static_cast(floor(width / 4.f) * 4.f); @@ -54,7 +55,7 @@ namespace optflow t0 = _mm_set_epi16(val[3], val[3], 
val[2], val[2], val[1], val[1], val[0], val[0]); t2 = _mm_set_epi16(val[7], val[6], val[5], val[4], val[3], val[2], val[1], val[0]); } - +#endif typedef uchar tMaskType; #define tCVMaskType CV_8UC1 #define MaskSet 0xffffffff From 085e3c0b82496398403b35761c7d34bb7da60e9a Mon Sep 17 00:00:00 2001 From: senst Date: Tue, 11 Dec 2018 14:28:51 +0100 Subject: [PATCH 08/25] fix warnings for windows compilation --- modules/optflow/perf/perf_rlof.cpp | 2 +- modules/optflow/src/rlof/berlof_invoker.hpp | 28 +++++++++---------- .../optflow/src/rlof/geo_interpolation.cpp | 4 +-- modules/optflow/src/rlof/plk_invoker.hpp | 2 +- modules/optflow/src/rlof/rlof_invoker.hpp | 12 ++++---- modules/optflow/test/test_OF_accuracy.cpp | 2 +- 6 files changed, 25 insertions(+), 25 deletions(-) diff --git a/modules/optflow/perf/perf_rlof.cpp b/modules/optflow/perf/perf_rlof.cpp index dda15a4b94e..e67feec2777 100644 --- a/modules/optflow/perf/perf_rlof.cpp +++ b/modules/optflow/perf/perf_rlof.cpp @@ -58,7 +58,7 @@ namespace opencv_test { namespace { ASSERT_FALSE(frame2.empty()); Ptr param = Ptr(new RLOFOpticalFlowParameter);; Ptr< DenseRLOFOpticalFlow> algo = DenseRLOFOpticalFlow::create(); - InterpolationType interp_type; + InterpolationType interp_type = INTERP_EPIC; if (get<0>(GetParam()) == "INTERP_EPIC") interp_type = INTERP_EPIC; if (get<0>(GetParam()) == "INTERP_GEO") diff --git a/modules/optflow/src/rlof/berlof_invoker.hpp b/modules/optflow/src/rlof/berlof_invoker.hpp index d550e67a7b6..5f917dc12d4 100644 --- a/modules/optflow/src/rlof/berlof_invoker.hpp +++ b/modules/optflow/src/rlof/berlof_invoker.hpp @@ -1169,9 +1169,9 @@ namespace berlof if( maskPtr[x] == 0) continue; - short ixval = static_cast(dIptr[0]); - short iyval = static_cast(dIptr[1]); - int illValue = Iptr[x] * gainVec.x + gainVec.y - Iptr[x]; + short ixval = dIptr[0]; + short iyval = dIptr[1]; + int illValue = static_cast(Iptr[x] * gainVec.x + gainVec.y - Iptr[x]); int It[4] = {(Jptr[x+step+cn]<< 5) + illValue, 
(Jptr[x+cn]<< 5) + illValue, @@ -1179,7 +1179,7 @@ namespace berlof (Jptr[x] << 5) + illValue}; - float J_val = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + + int J_val = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, W_BITS1-5); @@ -1202,19 +1202,19 @@ namespace berlof It[2] = 0; It[3] = 0; } - else if( abss > fParam0 && diff >= 0 ) + else if( abss > static_cast(fParam0) && diff >= 0 ) { - It[0] = param[2] * (It[0] - fParam1); - It[1] = param[2] * (It[1] - fParam1); - It[2] = param[2] * (It[2] - fParam1); - It[3] = param[2] * (It[3] - fParam1); + It[0] = static_cast(param[2] * (It[0] - fParam1)); + It[1] = static_cast(param[2] * (It[1] - fParam1)); + It[2] = static_cast(param[2] * (It[2] - fParam1)); + It[3] = static_cast(param[2] * (It[3] - fParam1)); } - else if( abss > fParam0 && diff < 0 ) + else if( abss > static_cast(fParam0) && diff < 0 ) { - It[0] = param[2] * (It[0] + fParam1); - It[1] = param[2] * (It[1] + fParam1); - It[2] = param[2] * (It[2] + fParam1); - It[3] = param[2] * (It[3] + fParam1); + It[0] = static_cast(param[2] * (It[0] + fParam1)); + It[1] = static_cast(param[2] * (It[1] + fParam1)); + It[2] = static_cast(param[2] * (It[2] + fParam1)); + It[3] = static_cast(param[2] * (It[3] + fParam1)); } } _b0[0] += (float)(It[0]*dIptr[0]) ; diff --git a/modules/optflow/src/rlof/geo_interpolation.cpp b/modules/optflow/src/rlof/geo_interpolation.cpp index f93c0b6093e..f96e3914cff 100644 --- a/modules/optflow/src/rlof/geo_interpolation.cpp +++ b/modules/optflow/src/rlof/geo_interpolation.cpp @@ -363,7 +363,7 @@ namespace optflow } } return ret; - +/* for (int y = 0; y < in.rows; y++) { for (int x = 0; x < in.cols; x++) { int v_id = y * in.cols + x; @@ -389,7 +389,7 @@ namespace optflow } free(graph_helper.mem); return nnFlow; - +*/ } Mat getGraph(const Mat &image, float edge_length) diff --git a/modules/optflow/src/rlof/plk_invoker.hpp b/modules/optflow/src/rlof/plk_invoker.hpp index 39f81fe6a18..dd8ab5e7d1b 100644 --- 
a/modules/optflow/src/rlof/plk_invoker.hpp +++ b/modules/optflow/src/rlof/plk_invoker.hpp @@ -473,7 +473,7 @@ class TrackerInvoker : public cv::ParallelLoopBody int J_val = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, W_BITS1-5); - int diff = J_val - Iptr[x] + static_cast(Iptr[x]) * gainVec.x + gainVec.y; + int diff = static_cast(J_val - Iptr[x] + Iptr[x] * gainVec.x + gainVec.y); b1 += (float)(diff*dIptr[0]) * FLT_RESCALE; b2 += (float)(diff*dIptr[1]) * FLT_RESCALE; diff --git a/modules/optflow/src/rlof/rlof_invoker.hpp b/modules/optflow/src/rlof/rlof_invoker.hpp index 6faf5a6247f..660585cd1a3 100644 --- a/modules/optflow/src/rlof/rlof_invoker.hpp +++ b/modules/optflow/src/rlof/rlof_invoker.hpp @@ -1213,7 +1213,7 @@ namespace radial { if (maskPtr[x] == 0) continue; - float J_val = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + Jptr[x + step] * iw10 + Jptr[x + step + cn] * iw11, + int J_val = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + Jptr[x + step] * iw10 + Jptr[x + step + cn] * iw11, W_BITS1 - 5); short ixval = static_cast(dIptr[0]); short iyval = static_cast(dIptr[1]); @@ -1223,17 +1223,17 @@ namespace radial MEstimatorScale += eta; if (diff < MEstimatorScale) MEstimatorScale -= eta; - if (abss > fParam1) + if (abss > static_cast(fParam1)) { diff = 0.f; } - else if (abss > fParam0 && diff >= 0) + else if (abss > static_cast(fParam0) && diff >= 0) { - diff = param[2] * (diff - fParam1); + diff = static_cast(param[2] * (diff - fParam1)); } - else if (abss > fParam0 && diff < 0) + else if (abss > static_cast(fParam0) && diff < 0) { - diff = param[2] * (diff + fParam1); + diff = static_cast(param[2] * (diff + fParam1)); } b1 += (float)(diff*ixval); b2 += (float)(diff*iyval); ; diff --git a/modules/optflow/test/test_OF_accuracy.cpp b/modules/optflow/test/test_OF_accuracy.cpp index 67d58f243b0..26f6b3668c4 100644 --- a/modules/optflow/test/test_OF_accuracy.cpp +++ b/modules/optflow/test/test_OF_accuracy.cpp @@ -95,7 
+95,7 @@ static float calcRMSE(vector prevPts, vector currPts, Mat flow ee.push_back(sqrt(diffFlow.x * diffFlow.x + diffFlow.y * diffFlow.y)); } } - return mean(ee).val[0]; + return static_cast(mean(ee).val[0]); } static float calcAvgEPE(vector< pair > corr, Mat flow) { From 3f759afa08044a1adba1743a8b1ef79cb6238b9d Mon Sep 17 00:00:00 2001 From: senst Date: Tue, 11 Dec 2018 14:38:33 +0100 Subject: [PATCH 09/25] fix documentation RLOFOpticalFlowParameter --- .../include/opencv2/optflow/rlofflow.hpp | 38 +++++++++---------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/modules/optflow/include/opencv2/optflow/rlofflow.hpp b/modules/optflow/include/opencv2/optflow/rlofflow.hpp index 4911c53b947..cbd352ec406 100644 --- a/modules/optflow/include/opencv2/optflow/rlofflow.hpp +++ b/modules/optflow/include/opencv2/optflow/rlofflow.hpp @@ -14,6 +14,25 @@ namespace optflow //! @addtogroup optflow //! @{ + enum SupportRegionType { + SR_FIXED = 0, /**< Apply a constant support region */ + SR_CROSS = 1 /**< Apply a adaptive support region obtained by cross-based segmentation + * as described in @cite Senst2014 + */ + }; + enum SolverType { + ST_STANDART = 0, /**< Apply standard iterative refinement */ + ST_BILINEAR = 1 /**< Apply optimized iterative refinement based bilinear equation solutions + * as described in @cite Senst2013 + */ + }; + + enum InterpolationType + { + INTERP_GEO = 0, /**< Fast geodesic interpolation, see @cite Geistert2016 */ + INTERP_EPIC = 1, /**< Edge-preserving interpolation, see @cite Revaud2015,Geistert2016. */ + }; + /** @brief This is used store and set up the parameters of the robust local optical flow (RLOF) algoritm. 
* * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as @@ -39,25 +58,6 @@ namespace optflow * * @see optflow::DenseRLOFOpticalFlow, optflow::calcOpticalFlowDenseRLOF(), optflow::SparseRLOFOpticalFlow, optflow::calcOpticalFlowSparseRLOF() */ - enum SupportRegionType { - SR_FIXED = 0, /**< Apply a constant support region */ - SR_CROSS = 1 /**< Apply a adaptive support region obtained by cross-based segmentation - * as described in @cite Senst2014 - */ - }; - enum SolverType { - ST_STANDART = 0, /**< Apply standard iterative refinement */ - ST_BILINEAR = 1 /**< Apply optimized iterative refinement based bilinear equation solutions - * as described in @cite Senst2013 - */ - }; - - enum InterpolationType - { - INTERP_GEO = 0, /**< Fast geodesic interpolation, see @cite Geistert2016 */ - INTERP_EPIC = 1, /**< Edge-preserving interpolation, see @cite Revaud2015,Geistert2016. */ - }; - class CV_EXPORTS_W RLOFOpticalFlowParameter{ public: RLOFOpticalFlowParameter() From fb3133743f7289db86a438f6dd10e7da97c75f5c Mon Sep 17 00:00:00 2001 From: senst Date: Tue, 11 Dec 2018 15:56:17 +0100 Subject: [PATCH 10/25] integrate cast to remove last warnings --- modules/optflow/include/opencv2/optflow/rlofflow.hpp | 2 +- modules/optflow/src/rlof/rlof_invoker.hpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/optflow/include/opencv2/optflow/rlofflow.hpp b/modules/optflow/include/opencv2/optflow/rlofflow.hpp index cbd352ec406..87b1d30266d 100644 --- a/modules/optflow/include/opencv2/optflow/rlofflow.hpp +++ b/modules/optflow/include/opencv2/optflow/rlofflow.hpp @@ -121,7 +121,7 @@ namespace optflow */ bool useIlluminationModel; /**< Use the Gennert and Negahdaripour illumination model instead of the intensity brigthness - * constraint. (Probosed in @cite Senst2016 ) This model is defined as follow: + * constraint. 
(proposed in @cite Senst2016 ) This model is defined as follow: * \f[ I(\mathbf{x},t) + m \cdot I(\mathbf{x},t) + c = I(\mathbf{x},t+1) \f] * and contains with m and c a multiplicative and additive term which makes the estimate * more robust against illumination changes. The computational complexity is increased by diff --git a/modules/optflow/src/rlof/rlof_invoker.hpp b/modules/optflow/src/rlof/rlof_invoker.hpp index 660585cd1a3..444d9114bd3 100644 --- a/modules/optflow/src/rlof/rlof_invoker.hpp +++ b/modules/optflow/src/rlof/rlof_invoker.hpp @@ -1217,7 +1217,7 @@ namespace radial W_BITS1 - 5); short ixval = static_cast(dIptr[0]); short iyval = static_cast(dIptr[1]); - int diff = J_val - static_cast(Iptr[x]) + Iptr[x] * gainVec.x + gainVec.y; + int diff = static_cast(J_val - Iptr[x] + Iptr[x] * gainVec.x + gainVec.y); int abss = (diff < 0) ? -diff : diff; if (diff > MEstimatorScale) MEstimatorScale += eta; @@ -1225,7 +1225,7 @@ namespace radial MEstimatorScale -= eta; if (abss > static_cast(fParam1)) { - diff = 0.f; + diff = 0; } else if (abss > static_cast(fParam0) && diff >= 0) { From d1440c04bfc0215b331edfebdf377288382e53da Mon Sep 17 00:00:00 2001 From: senst Date: Wed, 12 Dec 2018 14:57:15 +0100 Subject: [PATCH 11/25] * add create method and function inferfaces to RLOFOpticalFlowParamter to enable python wrapper interfaces --- .../include/opencv2/optflow/rlofflow.hpp | 46 +++++++++++++++ modules/optflow/src/rlofflow.cpp | 57 ++++++++++++++++++- 2 files changed, 102 insertions(+), 1 deletion(-) diff --git a/modules/optflow/include/opencv2/optflow/rlofflow.hpp b/modules/optflow/include/opencv2/optflow/rlofflow.hpp index 87b1d30266d..f0932abc557 100644 --- a/modules/optflow/include/opencv2/optflow/rlofflow.hpp +++ b/modules/optflow/include/opencv2/optflow/rlofflow.hpp @@ -76,6 +76,7 @@ namespace optflow ,minEigenValue(0.0001f) ,globalMotionRansacThreshold(10) {} + SolverType solverType; /**< Variable specifies the iterative refinement strategy. 
Please consider citing @cite Senst2013 when * using ST_BILINEAR. @@ -146,6 +147,51 @@ namespace optflow * n-th percentil (given by this value [0 ... 100]) of the motion vectors magnitude. * See @cite Senst2016 for more details. */ + + CV_WRAP void setSolverType(SolverType val); + CV_WRAP SolverType getSolverType() const; + + CV_WRAP void setSupportRegionType(SupportRegionType val); + CV_WRAP SupportRegionType getSupportRegionType() const; + + CV_WRAP void setNormSigma0(float val); + CV_WRAP float getNormSigma0() const; + + CV_WRAP void setNormSigma1(float val); + CV_WRAP float getNormSigma1() const; + + CV_WRAP void setSmallWinSize(int val); + CV_WRAP int getSmallWinSize() const; + + CV_WRAP void setLargeWinSize(int val); + CV_WRAP int getLargeWinSize() const; + + CV_WRAP void setCrossSegmentationThreshold(int val); + CV_WRAP int getCrossSegmentationThreshold() const; + + CV_WRAP void setMaxLevel(int val); + CV_WRAP int getMaxLevel() const; + + CV_WRAP void setUseInitialFlow(bool val); + CV_WRAP bool getUseInitialFlow() const; + + CV_WRAP void setUseIlluminationModel(bool val); + CV_WRAP bool getUseIlluminationModel() const; + + CV_WRAP void setUseGlobalMotionPrior(bool val); + CV_WRAP bool getUseGlobalMotionPrior() const; + + CV_WRAP void setMaxIteration(int val); + CV_WRAP int getMaxIteration() const; + + CV_WRAP void setMinEigenValue(float val); + CV_WRAP float getMinEigenValue() const; + + CV_WRAP void setGlobalMotionRansacThreshold(float val); + CV_WRAP float getGlobalMotionRansacThreshold() const; + + //! 
@brief Creates instance of optflow::RLOFOpticalFlowParameter + CV_WRAP static Ptr create(); }; /** @brief Fast dense optical flow computation based on robust local optical flow (RLOF) algorithms and sparse-to-dense interpolation diff --git a/modules/optflow/src/rlofflow.cpp b/modules/optflow/src/rlofflow.cpp index 3ba5a782f0b..ff43fc39820 100644 --- a/modules/optflow/src/rlofflow.cpp +++ b/modules/optflow/src/rlofflow.cpp @@ -10,6 +10,53 @@ namespace cv namespace optflow { + Ptr RLOFOpticalFlowParameter::create() + { + return Ptr(new RLOFOpticalFlowParameter); + } + + void RLOFOpticalFlowParameter::setSolverType(SolverType val){ solverType = val;} + SolverType RLOFOpticalFlowParameter::getSolverType() const { return solverType;} + + void RLOFOpticalFlowParameter::setSupportRegionType(SupportRegionType val){ supportRegionType = val;} + SupportRegionType RLOFOpticalFlowParameter::getSupportRegionType() const { return supportRegionType;} + + void RLOFOpticalFlowParameter::setNormSigma0(float val){ normSigma0 = val;} + float RLOFOpticalFlowParameter::getNormSigma0() const { return normSigma0;} + + void RLOFOpticalFlowParameter::setNormSigma1(float val){ normSigma1 = val;} + float RLOFOpticalFlowParameter::getNormSigma1() const { return normSigma1;} + + void RLOFOpticalFlowParameter::setSmallWinSize(int val){ smallWinSize = val;} + int RLOFOpticalFlowParameter::getSmallWinSize() const { return smallWinSize;} + + void RLOFOpticalFlowParameter::setLargeWinSize(int val){ largeWinSize = val;} + int RLOFOpticalFlowParameter::getLargeWinSize() const { return largeWinSize;} + + void RLOFOpticalFlowParameter::setCrossSegmentationThreshold(int val){ crossSegmentationThreshold = val;} + int RLOFOpticalFlowParameter::getCrossSegmentationThreshold() const { return crossSegmentationThreshold;} + + void RLOFOpticalFlowParameter::setMaxLevel(int val){ maxLevel = val;} + int RLOFOpticalFlowParameter::getMaxLevel() const { return maxLevel;} + + void 
RLOFOpticalFlowParameter::setUseInitialFlow(bool val){ useInitialFlow = val;} + bool RLOFOpticalFlowParameter::getUseInitialFlow() const { return useInitialFlow;} + + void RLOFOpticalFlowParameter::setUseIlluminationModel(bool val){ useIlluminationModel = val;} + bool RLOFOpticalFlowParameter::getUseIlluminationModel() const { return useIlluminationModel;} + + void RLOFOpticalFlowParameter::setUseGlobalMotionPrior(bool val){ useGlobalMotionPrior = val;} + bool RLOFOpticalFlowParameter::getUseGlobalMotionPrior() const { return useGlobalMotionPrior;} + + void RLOFOpticalFlowParameter::setMaxIteration(int val){ maxIteration = val;} + int RLOFOpticalFlowParameter::getMaxIteration() const { return maxIteration;} + + void RLOFOpticalFlowParameter::setMinEigenValue(float val){ minEigenValue = val;} + float RLOFOpticalFlowParameter::getMinEigenValue() const { return minEigenValue;} + + void RLOFOpticalFlowParameter::setGlobalMotionRansacThreshold(float val){ globalMotionRansacThreshold = val;} + float RLOFOpticalFlowParameter::getGlobalMotionRansacThreshold() const { return globalMotionRansacThreshold;} + class DenseOpticalFlowRLOFImpl : public DenseRLOFOpticalFlow { public: @@ -66,9 +113,13 @@ namespace optflow CV_Assert(!I0.empty() && I0.depth() == CV_8U && (I0.channels() == 3 || I0.channels() == 1)); CV_Assert(!I1.empty() && I1.depth() == CV_8U && (I1.channels() == 3 || I1.channels() == 1)); CV_Assert(I0.sameSize(I1)); + if (param.empty()) + param = Ptr(new RLOFOpticalFlowParameter()); if (param->supportRegionType == SR_CROSS) CV_Assert( I0.channels() == 3 && I1.channels() == 3); CV_Assert(interp_type == InterpolationType::INTERP_EPIC || interp_type == InterpolationType::INTERP_GEO); + // if no parameter is used use the default parameter + Mat prevImage = I0.getMat(); Mat currImage = I1.getMat(); int noPoints = prevImage.cols * prevImage.rows; @@ -86,7 +137,6 @@ namespace optflow prevPoints.erase(prevPoints.begin() + noPoints, prevPoints.end()); 
currPoints.resize(prevPoints.size()); calcLocalOpticalFlow(prevImage, currImage, prevPyramid, currPyramid, prevPoints, currPoints, *(param.get())); - flow.create(prevImage.size(), CV_32FC2); Mat dense_flow = flow.getMat(); @@ -237,6 +287,11 @@ namespace optflow Mat nextImage = nextImg.getMat(); Mat prevPtsMat = prevPts.getMat(); + if (param.empty()) + { + param = Ptr(new RLOFOpticalFlowParameter); + } + if (param->useInitialFlow == false) nextPts.create(prevPtsMat.size(), prevPtsMat.type(), -1, true); From 912c1dd29f0574552b3f299a3525a33d4879acd8 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Fri, 14 Dec 2018 13:55:26 +0300 Subject: [PATCH 12/25] white space fixes / coding style --- modules/optflow/include/opencv2/optflow.hpp | 1 - .../include/opencv2/optflow/rlofflow.hpp | 945 ++--- modules/optflow/perf/perf_rlof.cpp | 131 +- modules/optflow/src/rlof/berlof_invoker.hpp | 3599 ++++++++--------- .../optflow/src/rlof/geo_interpolation.cpp | 730 ++-- .../optflow/src/rlof/geo_interpolation.hpp | 55 +- modules/optflow/src/rlof/plk_invoker.hpp | 171 +- modules/optflow/src/rlof/rlof_invoker.hpp | 2362 ++++++----- modules/optflow/src/rlof/rlof_invokerbase.hpp | 421 +- modules/optflow/src/rlof/rlof_localflow.cpp | 833 ++-- modules/optflow/src/rlof/rlof_localflow.h | 25 +- modules/optflow/src/rlofflow.cpp | 651 ++- 12 files changed, 4952 insertions(+), 4972 deletions(-) diff --git a/modules/optflow/include/opencv2/optflow.hpp b/modules/optflow/include/opencv2/optflow.hpp index a4bd3a737e6..093b5fe2c7f 100644 --- a/modules/optflow/include/opencv2/optflow.hpp +++ b/modules/optflow/include/opencv2/optflow.hpp @@ -173,7 +173,6 @@ CV_EXPORTS_W Ptr createOptFlow_Farneback(); //! Additional interface to the SparseToDenseFlow algorithm - calcOpticalFlowSparseToDense() CV_EXPORTS_W Ptr createOptFlow_SparseToDense(); - /** @brief "Dual TV L1" Optical Flow Algorithm. 
The class implements the "Dual TV L1" optical flow algorithm described in @cite Zach2007 and diff --git a/modules/optflow/include/opencv2/optflow/rlofflow.hpp b/modules/optflow/include/opencv2/optflow/rlofflow.hpp index f0932abc557..4d602d2b28f 100644 --- a/modules/optflow/include/opencv2/optflow/rlofflow.hpp +++ b/modules/optflow/include/opencv2/optflow/rlofflow.hpp @@ -11,482 +11,483 @@ namespace cv { namespace optflow { - //! @addtogroup optflow - //! @{ - - enum SupportRegionType { - SR_FIXED = 0, /**< Apply a constant support region */ - SR_CROSS = 1 /**< Apply a adaptive support region obtained by cross-based segmentation - * as described in @cite Senst2014 - */ - }; - enum SolverType { - ST_STANDART = 0, /**< Apply standard iterative refinement */ - ST_BILINEAR = 1 /**< Apply optimized iterative refinement based bilinear equation solutions - * as described in @cite Senst2013 - */ - }; - - enum InterpolationType - { - INTERP_GEO = 0, /**< Fast geodesic interpolation, see @cite Geistert2016 */ - INTERP_EPIC = 1, /**< Edge-preserving interpolation, see @cite Revaud2015,Geistert2016. */ - }; - - /** @brief This is used store and set up the parameters of the robust local optical flow (RLOF) algoritm. - * - * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as - * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). - * This RLOF implementation can be seen as an improved pyramidal iterative Lucas-Kanade and includes - * a set of improving modules. The main improvements in respect to the pyramidal iterative Lucas-Kanade - * are: - * - A more robust redecending M-estimator framework (see @cite Senst2012) to improve the accuracy at - * motion boundaries and appearing and disappearing pixels. - * - an adaptive support region strategies to improve the accuracy at motion boundaries to reduce the - * corona effect, i.e oversmoothing of the PLK at motion/object boundaries. 
The cross-based segementation - * strategy (SR_CROSS) proposed in @cite Senst2014 uses a simple segmenation approach to obtain the optimal - * shape of the support region. - * - To deal with illumination changes (outdoor sequences and shadow) the intensity constancy assumption - * based optical flow equation has been adopt with the Gennert and Negahdaripour illumination model - * (see @cite Senst2016). This model can be switched on/off with the useIlluminationModel variable. - * - By using a global motion prior initialization (see @cite Senst2016) of the iterative refinement - * the accuracy could be significantly improved for large displacements. This initialization can be - * switched on and of with useGlobalMotionPrior variable. - * - * The RLOF can be computed with the SparseOpticalFlow class or function interface to track a set of features - * or with the DenseOpticalFlow class or function interface to compute dense optical flow. - * - * @see optflow::DenseRLOFOpticalFlow, optflow::calcOpticalFlowDenseRLOF(), optflow::SparseRLOFOpticalFlow, optflow::calcOpticalFlowSparseRLOF() - */ - class CV_EXPORTS_W RLOFOpticalFlowParameter{ - public: - RLOFOpticalFlowParameter() - :solverType(ST_BILINEAR) - ,supportRegionType(SR_CROSS) - ,normSigma0(3.2f) - ,normSigma1(7.f) - ,smallWinSize(9) - ,largeWinSize(21) - ,crossSegmentationThreshold(25) - ,maxLevel(3) - ,useInitialFlow(false) - ,useIlluminationModel(true) - ,useGlobalMotionPrior(true) - ,maxIteration(30) - ,minEigenValue(0.0001f) - ,globalMotionRansacThreshold(10) - {} - - SolverType solverType; - /**< Variable specifies the iterative refinement strategy. Please consider citing @cite Senst2013 when - * using ST_BILINEAR. - */ - - SupportRegionType supportRegionType; - /**< Variable specifies the support region shape extraction or shrinking strategy. - */ - - float normSigma0; - /**< &sigma paramter of the shrinked Hampel norm introduced in @cite Senst2012. 
If - * &sigma = std::numeric_limist::max() the least-square estimator will be used - * instead of the M-estimator. Althoug M-estimator is more robust against outlier in the support - * region the least-square can be fast in computation. - */ - float normSigma1; - /**< &sigma paramter of the shrinked Hampel norm introduced in @cite Senst2012. If - * &sigma = std::numeric_limist::max() the least-square estimator will be used - * instead of the M-estimator. Althoug M-estimator is more robust against outlier in the support - * region the least-square can be fast in computation. - */ - int smallWinSize; - /**< Minimal window size of the support region. This parameter is only used if supportRegionType is SR_CROSS. - */ - int largeWinSize; - /**< Maximal window size of the support region. If supportRegionType is SR_FIXED this gives the exact support - * region size. The speed of the RLOF is related to the applied win sizes. The smaller the window size the lower is the runtime, - * but the more sensitive to noise is the method. - */ - int crossSegmentationThreshold; - /**< Color similarity threshold used by cross-based segmentation following @cite Senst2014 . - * (Only used if supportRegionType is SR_CROSS). With the cross-bassed segmentation - * motion boundaries can be computed more accurately. - */ - int maxLevel; - /**< Maximal number of pyramid level used. The large this value is the more likely it is - * to obtain accurate solutions for long-range motions. The runtime is linear related to - * this parameter. - */ - bool useInitialFlow; - /**< Use next point list as initial values. A good intialization can imporve the algortihm - * accuracy and reduce the runtime by a faster convergence of the iteration refinement. - */ - bool useIlluminationModel; - /**< Use the Gennert and Negahdaripour illumination model instead of the intensity brigthness - * constraint. 
(proposed in @cite Senst2016 ) This model is defined as follow: - * \f[ I(\mathbf{x},t) + m \cdot I(\mathbf{x},t) + c = I(\mathbf{x},t+1) \f] - * and contains with m and c a multiplicative and additive term which makes the estimate - * more robust against illumination changes. The computational complexity is increased by - * enabling the illumination model. - */ - bool useGlobalMotionPrior; - /**< Use global motion prior initialisation has been introduced in @cite Senst2016 . It - * allows to be more accurate for long-range motion. The computational complexity is - * slightly increased by enabling the global motion prior initialisation. - */ - int maxIteration; - /**< Number of maximal iterations used for the iterative refinement. Lower values can - * reduce the runtime but also the accuracy. - */ - float minEigenValue; - /**< Threshold for the minimal eigenvalue of the gradient matrix defines when to abort the - * iterative refinement. - */ - float globalMotionRansacThreshold; - /**< To apply the global motion prior motion vectors will be computed on a regulary sampled which - * are the basis for Homography estimation using RANSAC. The reprojection threshold is based on - * n-th percentil (given by this value [0 ... 100]) of the motion vectors magnitude. - * See @cite Senst2016 for more details. 
- */ - - CV_WRAP void setSolverType(SolverType val); - CV_WRAP SolverType getSolverType() const; - - CV_WRAP void setSupportRegionType(SupportRegionType val); - CV_WRAP SupportRegionType getSupportRegionType() const; - - CV_WRAP void setNormSigma0(float val); - CV_WRAP float getNormSigma0() const; - - CV_WRAP void setNormSigma1(float val); - CV_WRAP float getNormSigma1() const; - - CV_WRAP void setSmallWinSize(int val); - CV_WRAP int getSmallWinSize() const; - - CV_WRAP void setLargeWinSize(int val); - CV_WRAP int getLargeWinSize() const; - - CV_WRAP void setCrossSegmentationThreshold(int val); - CV_WRAP int getCrossSegmentationThreshold() const; - - CV_WRAP void setMaxLevel(int val); - CV_WRAP int getMaxLevel() const; - - CV_WRAP void setUseInitialFlow(bool val); - CV_WRAP bool getUseInitialFlow() const; - - CV_WRAP void setUseIlluminationModel(bool val); - CV_WRAP bool getUseIlluminationModel() const; - - CV_WRAP void setUseGlobalMotionPrior(bool val); - CV_WRAP bool getUseGlobalMotionPrior() const; - - CV_WRAP void setMaxIteration(int val); - CV_WRAP int getMaxIteration() const; - - CV_WRAP void setMinEigenValue(float val); - CV_WRAP float getMinEigenValue() const; - - CV_WRAP void setGlobalMotionRansacThreshold(float val); - CV_WRAP float getGlobalMotionRansacThreshold() const; - - //! @brief Creates instance of optflow::RLOFOpticalFlowParameter - CV_WRAP static Ptr create(); - }; - - /** @brief Fast dense optical flow computation based on robust local optical flow (RLOF) algorithms and sparse-to-dense interpolation - * scheme. - * - * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as - * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). - * - * The sparse-to-dense interpolation scheme allows for fast computation of dense optical flow using RLOF (see @cite Geistert2016). 
- * For this scheme the following steps are applied: - * -# motion vector seeded at a regular sampled grid are computed. The sparsity of this grid can be configured with setGridStep - * -# (optinally) errornous motion vectors are filter based on the forward backward confidence. The threshold can be configured - * with setForwardBackward. The filter is only applied if the threshold >0 but than the runtime is doubled due to the estimation - * of the backward flow. - * -# Vector field interpolation is applied to the motion vector set to obtain a dense vector field. - * - * For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details. - * Parameters have been described in @cite Senst2012 @cite Senst2013 @cite Senst2014 and @cite Senst2016. - * - * @note SIMD parallelization is only available when compiling with SSE4.1. If the grid size is set to (1,1) and the - * forward backward threshold <= 0 that the dense optical flow field is purely. - * computed with the RLOF. - * - * @see optflow::calcOpticalFlowDenseRLOF(), optflow::RLOFOpticalFlowParameter - */ - class CV_EXPORTS_W DenseRLOFOpticalFlow : public DenseOpticalFlow - { - public: - //! @brief Configuration of the RLOF alogrithm. - /** - @see optflow::RLOFOpticalFlowParameter, getRLOFOpticalFlowParameter - */ - CV_WRAP virtual void setRLOFOpticalFlowParameter(Ptr val) = 0; - /** @copybrief setRLOFOpticalFlowParameter - @see optflow::RLOFOpticalFlowParameter, setRLOFOpticalFlowParameter - */ - CV_WRAP virtual Ptr getRLOFOpticalFlowParameter() const = 0; - //! @brief Threshold for the forward backward confidence check - /**For each grid point \f$ \mathbf{x} \f$ a motion vector \f$ d_{I0,I1}(\mathbf{x}) \f$ is computed. - * If the forward backward error \f[ EP_{FB} = || d_{I0,I1} + d_{I1,I0} || \f] - * is larger than threshold given by this function then the motion vector will not be used by the following - * vector field interpolation. \f$ d_{I1,I0} \f$ denotes the backward flow. 
Note, the forward backward test - * will only be applied if the threshold > 0. This may results into a doubled runtime for the motion estimation. - * @see getForwardBackward, setGridStep - */ - CV_WRAP virtual void setForwardBackward(float val) = 0; - /** @copybrief setForwardBackward - @see setForwardBackward - */ - CV_WRAP virtual float getForwardBackward() const = 0; - //! @brief Size of the grid to spawn the motion vectors. - /** For each grid point a motion vector is computed. Some motion vectors will be removed due to the forwatd backward - * threshold (if set >0). The rest will be the base of the vector field interpolation. - * @see getForwardBackward, setGridStep - */ - CV_WRAP virtual Size getGridStep() const = 0; - /** @copybrief getGridStep - * @see getGridStep - */ - CV_WRAP virtual void setGridStep(Size val) = 0; - - //! @brief Interpolation used to compute the dense optical flow. - /** Two interpolation algorithms are supported - * - **INTERP_GEO** applies the fast geodesic interpolation, see @cite Geistert2016. - * - **INTERP_EPIC_RESIDUAL** applies the edge-preserving interpolation, see @cite Revaud2015,Geistert2016. - * @see ximgproc::EdgeAwareInterpolator, getInterpolation - */ - CV_WRAP virtual void setInterpolation(InterpolationType val) = 0; - /** @copybrief setInterpolation - * @see ximgproc::EdgeAwareInterpolator, setInterpolation - */ - CV_WRAP virtual InterpolationType getInterpolation() const = 0; - //! @brief see ximgproc::EdgeAwareInterpolator() K value. - /** K is a number of nearest-neighbor matches considered, when fitting a locally affine - * model. Usually it should be around 128. However, lower values would make the interpolation noticeably faster. - * @see ximgproc::EdgeAwareInterpolator, setEPICK - */ - CV_WRAP virtual int getEPICK() const = 0; - /** @copybrief getEPICK - * @see ximgproc::EdgeAwareInterpolator, getEPICK - */ - CV_WRAP virtual void setEPICK(int val) = 0; - //! 
@brief see ximgproc::EdgeAwareInterpolator() sigma value. - /** Sigma is a parameter defining how fast the weights decrease in the locally-weighted affine - * fitting. Higher values can help preserve fine details, lower values can help to get rid of noise in the - * output flow. - * @see ximgproc::EdgeAwareInterpolator, setEPICSigma - */ - CV_WRAP virtual float getEPICSigma() const = 0; - /** @copybrief getEPICSigma - * @see ximgproc::EdgeAwareInterpolator, getEPICSigma - */ - CV_WRAP virtual void setEPICSigma(float val) = 0; - //! @brief see ximgproc::EdgeAwareInterpolator() lambda value. - /** Lambda is a parameter defining the weight of the edge-aware term in geodesic distance, - * should be in the range of 0 to 1000. - * @see ximgproc::EdgeAwareInterpolator, setEPICSigma - */ - CV_WRAP virtual float getEPICLambda() const = 0; - /** @copybrief getEPICLambda - * @see ximgproc::EdgeAwareInterpolator, getEPICLambda - */ - CV_WRAP virtual void setEPICLambda(float val) = 0; - //! @brief see ximgproc::EdgeAwareInterpolator(). - /** Sets the respective fastGlobalSmootherFilter() parameter. - * @see ximgproc::EdgeAwareInterpolator, setFgsLambda - */ - CV_WRAP virtual float getFgsLambda() const = 0; - /** @copybrief getFgsLambda - * @see ximgproc::EdgeAwareInterpolator, ximgproc::fastGlobalSmootherFilter, getFgsLambda - */ - CV_WRAP virtual void setFgsLambda(float val) = 0; - //! @brief see ximgproc::EdgeAwareInterpolator(). - /** Sets the respective fastGlobalSmootherFilter() parameter. - * @see ximgproc::EdgeAwareInterpolator, ximgproc::fastGlobalSmootherFilter, setFgsSigma - */ - CV_WRAP virtual float getFgsSigma() const = 0; - /** @copybrief getFgsSigma - * @see ximgproc::EdgeAwareInterpolator, ximgproc::fastGlobalSmootherFilter, getFgsSigma - */ - CV_WRAP virtual void setFgsSigma(float val) = 0; - //! 
@brief enables ximgproc::fastGlobalSmootherFilter - /** - * @see getUsePostProc - */ - CV_WRAP virtual void setUsePostProc(bool val) = 0; - /** @copybrief setUsePostProc - * @see ximgproc::fastGlobalSmootherFilter, setUsePostProc - */ - CV_WRAP virtual bool getUsePostProc() const = 0; - //! @brief Creates instance of optflow::DenseRLOFOpticalFlow - /** - * @param rlofParam see optflow::RLOFOpticalFlowParameter - * @param forwardBackwardThreshold see setForwardBackward - * @param gridStep see setGridStep - * @param interp_type see setInterpolation - * @param epicK see setEPICK - * @param epicSigma see setEPICSigma - * @param epicLambda see setEPICLambda - * @param use_post_proc see setUsePostProc - * @param fgsLambda see setFgsLambda - * @param fgsSigma see setFgsSigma - */ - CV_WRAP static Ptr create( - Ptr rlofParam = Ptr(), - float forwardBackwardThreshold = 1.f, - Size gridStep = Size(6, 6), - InterpolationType interp_type = InterpolationType::INTERP_EPIC, - int epicK = 128, - float epicSigma = 0.05f, - float epicLambda = 999.0f, - bool use_post_proc = true, - float fgsLambda = 500.0f, - float fgsSigma = 1.5f); - }; - - /** @brief Class used for calculation sparse optical flow and feature tracking with robust local optical flow (RLOF) algorithms. - * - * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as - * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). - * - * For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details. - * Parameters have been described in @cite Senst2012, @cite Senst2013, @cite Senst2014 and @cite Senst2016. - * - * @note SIMD parallelization is only available when compiling with SSE4.1. 
- * @see optflow::calcOpticalFlowSparseRLOF(), optflow::RLOFOpticalFlowParameter - */ - class CV_EXPORTS_W SparseRLOFOpticalFlow : public SparseOpticalFlow - { - public: - /** @copydoc DenseRLOFOpticalFlow::setRLOFOpticalFlowParameter - */ - CV_WRAP virtual void setRLOFOpticalFlowParameter(Ptr val) = 0; - /** @copybrief setRLOFOpticalFlowParameter - * @see setRLOFOpticalFlowParameter - */ - CV_WRAP virtual Ptr getRLOFOpticalFlowParameter() const = 0; - //! @brief Threshold for the forward backward confidence check - /** For each feature point a motion vector \f$ d_{I0,I1}(\mathbf{x}) \f$ is computed. - * If the forward backward error \f[ EP_{FB} = || d_{I0,I1} + d_{I1,I0} || \f] - * is larger than threshold given by this function then the status will not be used by the following - * vector field interpolation. \f$ d_{I1,I0} \f$ denotes the backward flow. Note, the forward backward test - * will only be applied if the threshold > 0. This may results into a doubled runtime for the motion estimation. - * @see setForwardBackward - */ - CV_WRAP virtual void setForwardBackward(float val) = 0; - /** @copybrief setForwardBackward - * @see setForwardBackward - */ - CV_WRAP virtual float getForwardBackward() const = 0; - - //! @brief Creates instance of SparseRLOFOpticalFlow - /** - * @param rlofParam see setRLOFOpticalFlowParameter - * @param forwardBackwardThreshold see setForwardBackward - */ - CV_WRAP static Ptr create( - Ptr rlofParam = Ptr(), - float forwardBackwardThreshold = 1.f); - - }; - - /** @brief Fast dense optical flow computation based on robust local optical flow (RLOF) algorithms and sparse-to-dense interpolation scheme. - * - * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as - * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). 
- * - * The sparse-to-dense interpolation scheme allows for fast computation of dense optical flow using RLOF (see @cite Geistert2016). - * For this scheme the following steps are applied: - * -# motion vector seeded at a regular sampled grid are computed. The sparsity of this grid can be configured with setGridStep - * -# (optinally) errornous motion vectors are filter based on the forward backward confidence. The threshold can be configured - * with setForwardBackward. The filter is only applied if the threshold >0 but than the runtime is doubled due to the estimation - * of the backward flow. - * -# Vector field interpolation is applied to the motion vector set to obtain a dense vector field. - * - * @param I0 first 8-bit input image. If The cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType - * = SupportRegionType::SR_CROSS) image has to be a 8-bit 3 channel image. - * @param I1 second 8-bit input image. If The cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType - * = SupportRegionType::SR_CROSS) image has to be a 8-bit 3 channel image. - * @param flow computed flow image that has the same size as I0 and type CV_32FC2. - * @param rlofParam see optflow::RLOFOpticalFlowParameter - * @param forwardBackwardThreshold Threshold for the forward backward confidence check. - * For each grid point \f$ \mathbf{x} \f$ a motion vector \f$ d_{I0,I1}(\mathbf{x}) \f$ is computed. - * If the forward backward error \f[ EP_{FB} = || d_{I0,I1} + d_{I1,I0} || \f] - * is larger than threshold given by this function then the motion vector will not be used by the following - * vector field interpolation. \f$ d_{I1,I0} \f$ denotes the backward flow. Note, the forward backward test +//! @addtogroup optflow +//! 
@{ + +enum SupportRegionType { + SR_FIXED = 0, /**< Apply a constant support region */ + SR_CROSS = 1 /**< Apply a adaptive support region obtained by cross-based segmentation + * as described in @cite Senst2014 + */ +}; +enum SolverType { + ST_STANDART = 0, /**< Apply standard iterative refinement */ + ST_BILINEAR = 1 /**< Apply optimized iterative refinement based bilinear equation solutions + * as described in @cite Senst2013 + */ +}; + +enum InterpolationType +{ + INTERP_GEO = 0, /**< Fast geodesic interpolation, see @cite Geistert2016 */ + INTERP_EPIC = 1, /**< Edge-preserving interpolation, see @cite Revaud2015,Geistert2016. */ +}; + +/** @brief This is used store and set up the parameters of the robust local optical flow (RLOF) algoritm. + * + * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as + * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). + * This RLOF implementation can be seen as an improved pyramidal iterative Lucas-Kanade and includes + * a set of improving modules. The main improvements in respect to the pyramidal iterative Lucas-Kanade + * are: + * - A more robust redecending M-estimator framework (see @cite Senst2012) to improve the accuracy at + * motion boundaries and appearing and disappearing pixels. + * - an adaptive support region strategies to improve the accuracy at motion boundaries to reduce the + * corona effect, i.e oversmoothing of the PLK at motion/object boundaries. The cross-based segementation + * strategy (SR_CROSS) proposed in @cite Senst2014 uses a simple segmenation approach to obtain the optimal + * shape of the support region. + * - To deal with illumination changes (outdoor sequences and shadow) the intensity constancy assumption + * based optical flow equation has been adopt with the Gennert and Negahdaripour illumination model + * (see @cite Senst2016). 
This model can be switched on/off with the useIlluminationModel variable. + * - By using a global motion prior initialization (see @cite Senst2016) of the iterative refinement + * the accuracy could be significantly improved for large displacements. This initialization can be + * switched on and of with useGlobalMotionPrior variable. + * + * The RLOF can be computed with the SparseOpticalFlow class or function interface to track a set of features + * or with the DenseOpticalFlow class or function interface to compute dense optical flow. + * + * @see optflow::DenseRLOFOpticalFlow, optflow::calcOpticalFlowDenseRLOF(), optflow::SparseRLOFOpticalFlow, optflow::calcOpticalFlowSparseRLOF() + */ +class CV_EXPORTS_W RLOFOpticalFlowParameter{ +public: + RLOFOpticalFlowParameter() + :solverType(ST_BILINEAR) + ,supportRegionType(SR_CROSS) + ,normSigma0(3.2f) + ,normSigma1(7.f) + ,smallWinSize(9) + ,largeWinSize(21) + ,crossSegmentationThreshold(25) + ,maxLevel(3) + ,useInitialFlow(false) + ,useIlluminationModel(true) + ,useGlobalMotionPrior(true) + ,maxIteration(30) + ,minEigenValue(0.0001f) + ,globalMotionRansacThreshold(10) + {} + + SolverType solverType; + /**< Variable specifies the iterative refinement strategy. Please consider citing @cite Senst2013 when + * using ST_BILINEAR. + */ + + SupportRegionType supportRegionType; + /**< Variable specifies the support region shape extraction or shrinking strategy. + */ + + float normSigma0; + /**< &sigma paramter of the shrinked Hampel norm introduced in @cite Senst2012. If + * &sigma = std::numeric_limist::max() the least-square estimator will be used + * instead of the M-estimator. Althoug M-estimator is more robust against outlier in the support + * region the least-square can be fast in computation. + */ + float normSigma1; + /**< &sigma paramter of the shrinked Hampel norm introduced in @cite Senst2012. If + * &sigma = std::numeric_limist::max() the least-square estimator will be used + * instead of the M-estimator. 
Althoug M-estimator is more robust against outlier in the support + * region the least-square can be fast in computation. + */ + int smallWinSize; + /**< Minimal window size of the support region. This parameter is only used if supportRegionType is SR_CROSS. + */ + int largeWinSize; + /**< Maximal window size of the support region. If supportRegionType is SR_FIXED this gives the exact support + * region size. The speed of the RLOF is related to the applied win sizes. The smaller the window size the lower is the runtime, + * but the more sensitive to noise is the method. + */ + int crossSegmentationThreshold; + /**< Color similarity threshold used by cross-based segmentation following @cite Senst2014 . + * (Only used if supportRegionType is SR_CROSS). With the cross-bassed segmentation + * motion boundaries can be computed more accurately. + */ + int maxLevel; + /**< Maximal number of pyramid level used. The large this value is the more likely it is + * to obtain accurate solutions for long-range motions. The runtime is linear related to + * this parameter. + */ + bool useInitialFlow; + /**< Use next point list as initial values. A good intialization can imporve the algortihm + * accuracy and reduce the runtime by a faster convergence of the iteration refinement. + */ + bool useIlluminationModel; + /**< Use the Gennert and Negahdaripour illumination model instead of the intensity brigthness + * constraint. (proposed in @cite Senst2016 ) This model is defined as follow: + * \f[ I(\mathbf{x},t) + m \cdot I(\mathbf{x},t) + c = I(\mathbf{x},t+1) \f] + * and contains with m and c a multiplicative and additive term which makes the estimate + * more robust against illumination changes. The computational complexity is increased by + * enabling the illumination model. + */ + bool useGlobalMotionPrior; + /**< Use global motion prior initialisation has been introduced in @cite Senst2016 . It + * allows to be more accurate for long-range motion. 
The computational complexity is + * slightly increased by enabling the global motion prior initialisation. + */ + int maxIteration; + /**< Number of maximal iterations used for the iterative refinement. Lower values can + * reduce the runtime but also the accuracy. + */ + float minEigenValue; + /**< Threshold for the minimal eigenvalue of the gradient matrix defines when to abort the + * iterative refinement. + */ + float globalMotionRansacThreshold; + /**< To apply the global motion prior motion vectors will be computed on a regulary sampled which + * are the basis for Homography estimation using RANSAC. The reprojection threshold is based on + * n-th percentil (given by this value [0 ... 100]) of the motion vectors magnitude. + * See @cite Senst2016 for more details. + */ + + CV_WRAP void setSolverType(SolverType val); + CV_WRAP SolverType getSolverType() const; + + CV_WRAP void setSupportRegionType(SupportRegionType val); + CV_WRAP SupportRegionType getSupportRegionType() const; + + CV_WRAP void setNormSigma0(float val); + CV_WRAP float getNormSigma0() const; + + CV_WRAP void setNormSigma1(float val); + CV_WRAP float getNormSigma1() const; + + CV_WRAP void setSmallWinSize(int val); + CV_WRAP int getSmallWinSize() const; + + CV_WRAP void setLargeWinSize(int val); + CV_WRAP int getLargeWinSize() const; + + CV_WRAP void setCrossSegmentationThreshold(int val); + CV_WRAP int getCrossSegmentationThreshold() const; + + CV_WRAP void setMaxLevel(int val); + CV_WRAP int getMaxLevel() const; + + CV_WRAP void setUseInitialFlow(bool val); + CV_WRAP bool getUseInitialFlow() const; + + CV_WRAP void setUseIlluminationModel(bool val); + CV_WRAP bool getUseIlluminationModel() const; + + CV_WRAP void setUseGlobalMotionPrior(bool val); + CV_WRAP bool getUseGlobalMotionPrior() const; + + CV_WRAP void setMaxIteration(int val); + CV_WRAP int getMaxIteration() const; + + CV_WRAP void setMinEigenValue(float val); + CV_WRAP float getMinEigenValue() const; + + CV_WRAP void 
setGlobalMotionRansacThreshold(float val); + CV_WRAP float getGlobalMotionRansacThreshold() const; + + //! @brief Creates instance of optflow::RLOFOpticalFlowParameter + CV_WRAP static Ptr create(); +}; + +/** @brief Fast dense optical flow computation based on robust local optical flow (RLOF) algorithms and sparse-to-dense interpolation + * scheme. + * + * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as + * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). + * + * The sparse-to-dense interpolation scheme allows for fast computation of dense optical flow using RLOF (see @cite Geistert2016). + * For this scheme the following steps are applied: + * -# motion vector seeded at a regular sampled grid are computed. The sparsity of this grid can be configured with setGridStep + * -# (optinally) errornous motion vectors are filter based on the forward backward confidence. The threshold can be configured + * with setForwardBackward. The filter is only applied if the threshold >0 but than the runtime is doubled due to the estimation + * of the backward flow. + * -# Vector field interpolation is applied to the motion vector set to obtain a dense vector field. + * + * For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details. + * Parameters have been described in @cite Senst2012 @cite Senst2013 @cite Senst2014 and @cite Senst2016. + * + * @note SIMD parallelization is only available when compiling with SSE4.1. If the grid size is set to (1,1) and the + * forward backward threshold <= 0 that the dense optical flow field is purely. + * computed with the RLOF. + * + * @see optflow::calcOpticalFlowDenseRLOF(), optflow::RLOFOpticalFlowParameter +*/ +class CV_EXPORTS_W DenseRLOFOpticalFlow : public DenseOpticalFlow +{ +public: + //! @brief Configuration of the RLOF alogrithm. 
+ /** + @see optflow::RLOFOpticalFlowParameter, getRLOFOpticalFlowParameter + */ + CV_WRAP virtual void setRLOFOpticalFlowParameter(Ptr val) = 0; + /** @copybrief setRLOFOpticalFlowParameter + @see optflow::RLOFOpticalFlowParameter, setRLOFOpticalFlowParameter + */ + CV_WRAP virtual Ptr getRLOFOpticalFlowParameter() const = 0; + //! @brief Threshold for the forward backward confidence check + /**For each grid point \f$ \mathbf{x} \f$ a motion vector \f$ d_{I0,I1}(\mathbf{x}) \f$ is computed. + * If the forward backward error \f[ EP_{FB} = || d_{I0,I1} + d_{I1,I0} || \f] + * is larger than threshold given by this function then the motion vector will not be used by the following + * vector field interpolation. \f$ d_{I1,I0} \f$ denotes the backward flow. Note, the forward backward test * will only be applied if the threshold > 0. This may results into a doubled runtime for the motion estimation. - * @param gridStep Size of the grid to spawn the motion vectors. For each grid point a motion vector is computed. - * Some motion vectors will be removed due to the forwatd backward threshold (if set >0). The rest will be the - * base of the vector field interpolation. - * @param interp_type interpolation method used to compute the dense optical flow. Two interpolation algorithms are - * supported: + * @see getForwardBackward, setGridStep + */ + CV_WRAP virtual void setForwardBackward(float val) = 0; + /** @copybrief setForwardBackward + @see setForwardBackward + */ + CV_WRAP virtual float getForwardBackward() const = 0; + //! @brief Size of the grid to spawn the motion vectors. + /** For each grid point a motion vector is computed. Some motion vectors will be removed due to the forwatd backward + * threshold (if set >0). The rest will be the base of the vector field interpolation. 
+ * @see getForwardBackward, setGridStep + */ + CV_WRAP virtual Size getGridStep() const = 0; + /** @copybrief getGridStep + * @see getGridStep + */ + CV_WRAP virtual void setGridStep(Size val) = 0; + + //! @brief Interpolation used to compute the dense optical flow. + /** Two interpolation algorithms are supported * - **INTERP_GEO** applies the fast geodesic interpolation, see @cite Geistert2016. * - **INTERP_EPIC_RESIDUAL** applies the edge-preserving interpolation, see @cite Revaud2015,Geistert2016. - * @param epicK see ximgproc::EdgeAwareInterpolator() sets the respective parameter. - * @param epicSigma see ximgproc::EdgeAwareInterpolator() sets the respective parameter. - * @param epicLambda see ximgproc::EdgeAwareInterpolator() sets the respective parameter. - * @param use_post_proc enables ximgproc::fastGlobalSmootherFilter() parameter. - * @param fgsLambda sets the respective ximgproc::fastGlobalSmootherFilter() parameter. - * @param fgsSigma sets the respective ximgproc::fastGlobalSmootherFilter() parameter. - * - * Parameters have been described in @cite Senst2012, @cite Senst2013, @cite Senst2014, @cite Senst2016. - * For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details. - * @note If the grid size is set to (1,1) and the forward backward threshold <= 0 that the dense optical flow field is purely - * computed with the RLOF. - * - * @note SIMD parallelization is only available when compiling with SSE4.1. - * - * @sa optflow::DenseRLOFOpticalFlow, optflow::RLOFOpticalFlowParameter - */ - CV_EXPORTS_W void calcOpticalFlowDenseRLOF(InputArray I0, InputArray I1, InputOutputArray flow, + * @see ximgproc::EdgeAwareInterpolator, getInterpolation + */ + CV_WRAP virtual void setInterpolation(InterpolationType val) = 0; + /** @copybrief setInterpolation + * @see ximgproc::EdgeAwareInterpolator, setInterpolation + */ + CV_WRAP virtual InterpolationType getInterpolation() const = 0; + //! 
@brief see ximgproc::EdgeAwareInterpolator() K value. + /** K is a number of nearest-neighbor matches considered, when fitting a locally affine + * model. Usually it should be around 128. However, lower values would make the interpolation noticeably faster. + * @see ximgproc::EdgeAwareInterpolator, setEPICK + */ + CV_WRAP virtual int getEPICK() const = 0; + /** @copybrief getEPICK + * @see ximgproc::EdgeAwareInterpolator, getEPICK + */ + CV_WRAP virtual void setEPICK(int val) = 0; + //! @brief see ximgproc::EdgeAwareInterpolator() sigma value. + /** Sigma is a parameter defining how fast the weights decrease in the locally-weighted affine + * fitting. Higher values can help preserve fine details, lower values can help to get rid of noise in the + * output flow. + * @see ximgproc::EdgeAwareInterpolator, setEPICSigma + */ + CV_WRAP virtual float getEPICSigma() const = 0; + /** @copybrief getEPICSigma + * @see ximgproc::EdgeAwareInterpolator, getEPICSigma + */ + CV_WRAP virtual void setEPICSigma(float val) = 0; + //! @brief see ximgproc::EdgeAwareInterpolator() lambda value. + /** Lambda is a parameter defining the weight of the edge-aware term in geodesic distance, + * should be in the range of 0 to 1000. + * @see ximgproc::EdgeAwareInterpolator, setEPICSigma + */ + CV_WRAP virtual float getEPICLambda() const = 0; + /** @copybrief getEPICLambda + * @see ximgproc::EdgeAwareInterpolator, getEPICLambda + */ + CV_WRAP virtual void setEPICLambda(float val) = 0; + //! @brief see ximgproc::EdgeAwareInterpolator(). + /** Sets the respective fastGlobalSmootherFilter() parameter. + * @see ximgproc::EdgeAwareInterpolator, setFgsLambda + */ + CV_WRAP virtual float getFgsLambda() const = 0; + /** @copybrief getFgsLambda + * @see ximgproc::EdgeAwareInterpolator, ximgproc::fastGlobalSmootherFilter, getFgsLambda + */ + CV_WRAP virtual void setFgsLambda(float val) = 0; + //! @brief see ximgproc::EdgeAwareInterpolator(). + /** Sets the respective fastGlobalSmootherFilter() parameter. 
+ * @see ximgproc::EdgeAwareInterpolator, ximgproc::fastGlobalSmootherFilter, setFgsSigma + */ + CV_WRAP virtual float getFgsSigma() const = 0; + /** @copybrief getFgsSigma + * @see ximgproc::EdgeAwareInterpolator, ximgproc::fastGlobalSmootherFilter, getFgsSigma + */ + CV_WRAP virtual void setFgsSigma(float val) = 0; + //! @brief enables ximgproc::fastGlobalSmootherFilter + /** + * @see getUsePostProc + */ + CV_WRAP virtual void setUsePostProc(bool val) = 0; + /** @copybrief setUsePostProc + * @see ximgproc::fastGlobalSmootherFilter, setUsePostProc + */ + CV_WRAP virtual bool getUsePostProc() const = 0; + //! @brief Creates instance of optflow::DenseRLOFOpticalFlow + /** + * @param rlofParam see optflow::RLOFOpticalFlowParameter + * @param forwardBackwardThreshold see setForwardBackward + * @param gridStep see setGridStep + * @param interp_type see setInterpolation + * @param epicK see setEPICK + * @param epicSigma see setEPICSigma + * @param epicLambda see setEPICLambda + * @param use_post_proc see setUsePostProc + * @param fgsLambda see setFgsLambda + * @param fgsSigma see setFgsSigma + */ + CV_WRAP static Ptr create( Ptr rlofParam = Ptr(), - float forwardBackwardThreshold = 0, Size gridStep = Size(6, 6), + float forwardBackwardThreshold = 1.f, + Size gridStep = Size(6, 6), InterpolationType interp_type = InterpolationType::INTERP_EPIC, - int epicK = 128, float epicSigma = 0.05f, float epicLambda = 100.f, - bool use_post_proc = true, float fgsLambda = 500.0f, float fgsSigma = 1.5f); - - /** @brief Calculates fast optical flow for a sparse feature set using the robust local optical flow (RLOF) similar - * to optflow::calcOpticalFlowPyrLK(). - * - * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as - * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). - * - * @param prevImg first 8-bit input image. 
If The cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType - * = SupportRegionType::SR_CROSS) image has to be a 8-bit 3 channel image. - * @param nextImg second 8-bit input image. If The cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType - * = SupportRegionType::SR_CROSS) image has to be a 8-bit 3 channel image. - * @param prevPts vector of 2D points for which the flow needs to be found; point coordinates must be single-precision - * floating-point numbers. - * @param nextPts output vector of 2D points (with single-precision floating-point coordinates) containing the calculated - * new positions of input features in the second image; when optflow::RLOFOpticalFlowParameter::useInitialFlow variable is true the vector must - * have the same size as in the input and contain the initialization point correspondences. - * @param status output status vector (of unsigned chars); each element of the vector is set to 1 if the flow for the - * corresponding features has passed the forward backward check. - * @param err output vector of errors; each element of the vector is set to the forward backward error for the corresponding feature. - * @param rlofParam see optflow::RLOFOpticalFlowParameter - * @param forwardBackwardThreshold Threshold for the forward backward confidence check. If forewardBackwardThreshold <=0 the forward - * - * @note SIMD parallelization is only available when compiling with SSE4.1. - * - * Parameters have been described in @cite Senst2012, @cite Senst2013, @cite Senst2014 and @cite Senst2016. - * For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details. - */ - CV_EXPORTS_W void calcOpticalFlowSparseRLOF(InputArray prevImg, InputArray nextImg, - InputArray prevPts, InputOutputArray nextPts, - OutputArray status, OutputArray err, - Ptr rlofParam = Ptr(), - float forwardBackwardThreshold = 0); - - //! 
Additional interface to the Dense RLOF algorithm - optflow::calcOpticalFlowDenseRLOF() - CV_EXPORTS_W Ptr createOptFlow_DenseRLOF(); + int epicK = 128, + float epicSigma = 0.05f, + float epicLambda = 999.0f, + bool use_post_proc = true, + float fgsLambda = 500.0f, + float fgsSigma = 1.5f); +}; + +/** @brief Class used for calculation sparse optical flow and feature tracking with robust local optical flow (RLOF) algorithms. +* +* The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as +* proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). +* +* For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details. +* Parameters have been described in @cite Senst2012, @cite Senst2013, @cite Senst2014 and @cite Senst2016. +* +* @note SIMD parallelization is only available when compiling with SSE4.1. +* @see optflow::calcOpticalFlowSparseRLOF(), optflow::RLOFOpticalFlowParameter +*/ +class CV_EXPORTS_W SparseRLOFOpticalFlow : public SparseOpticalFlow +{ +public: + /** @copydoc DenseRLOFOpticalFlow::setRLOFOpticalFlowParameter + */ + CV_WRAP virtual void setRLOFOpticalFlowParameter(Ptr val) = 0; + /** @copybrief setRLOFOpticalFlowParameter + * @see setRLOFOpticalFlowParameter + */ + CV_WRAP virtual Ptr getRLOFOpticalFlowParameter() const = 0; + //! @brief Threshold for the forward backward confidence check + /** For each feature point a motion vector \f$ d_{I0,I1}(\mathbf{x}) \f$ is computed. + * If the forward backward error \f[ EP_{FB} = || d_{I0,I1} + d_{I1,I0} || \f] + * is larger than threshold given by this function then the status will not be used by the following + * vector field interpolation. \f$ d_{I1,I0} \f$ denotes the backward flow. Note, the forward backward test + * will only be applied if the threshold > 0. This may results into a doubled runtime for the motion estimation. 
+ * @see setForwardBackward + */ + CV_WRAP virtual void setForwardBackward(float val) = 0; + /** @copybrief setForwardBackward + * @see setForwardBackward + */ + CV_WRAP virtual float getForwardBackward() const = 0; - //! Additional interface to the Sparse RLOF algorithm - optflow::calcOpticalFlowSparseRLOF() - CV_EXPORTS_W Ptr createOptFlow_SparseRLOF(); - //! @} -} -} + //! @brief Creates instance of SparseRLOFOpticalFlow + /** + * @param rlofParam see setRLOFOpticalFlowParameter + * @param forwardBackwardThreshold see setForwardBackward + */ + CV_WRAP static Ptr create( + Ptr rlofParam = Ptr(), + float forwardBackwardThreshold = 1.f); + +}; + +/** @brief Fast dense optical flow computation based on robust local optical flow (RLOF) algorithms and sparse-to-dense interpolation scheme. + * + * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as + * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). + * + * The sparse-to-dense interpolation scheme allows for fast computation of dense optical flow using RLOF (see @cite Geistert2016). + * For this scheme the following steps are applied: + * -# motion vector seeded at a regular sampled grid are computed. The sparsity of this grid can be configured with setGridStep + * -# (optinally) errornous motion vectors are filter based on the forward backward confidence. The threshold can be configured + * with setForwardBackward. The filter is only applied if the threshold >0 but than the runtime is doubled due to the estimation + * of the backward flow. + * -# Vector field interpolation is applied to the motion vector set to obtain a dense vector field. + * + * @param I0 first 8-bit input image. If The cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType + * = SupportRegionType::SR_CROSS) image has to be a 8-bit 3 channel image. + * @param I1 second 8-bit input image. 
If The cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType + * = SupportRegionType::SR_CROSS) image has to be a 8-bit 3 channel image. + * @param flow computed flow image that has the same size as I0 and type CV_32FC2. + * @param rlofParam see optflow::RLOFOpticalFlowParameter + * @param forwardBackwardThreshold Threshold for the forward backward confidence check. + * For each grid point \f$ \mathbf{x} \f$ a motion vector \f$ d_{I0,I1}(\mathbf{x}) \f$ is computed. + * If the forward backward error \f[ EP_{FB} = || d_{I0,I1} + d_{I1,I0} || \f] + * is larger than threshold given by this function then the motion vector will not be used by the following + * vector field interpolation. \f$ d_{I1,I0} \f$ denotes the backward flow. Note, the forward backward test + * will only be applied if the threshold > 0. This may results into a doubled runtime for the motion estimation. + * @param gridStep Size of the grid to spawn the motion vectors. For each grid point a motion vector is computed. + * Some motion vectors will be removed due to the forwatd backward threshold (if set >0). The rest will be the + * base of the vector field interpolation. + * @param interp_type interpolation method used to compute the dense optical flow. Two interpolation algorithms are + * supported: + * - **INTERP_GEO** applies the fast geodesic interpolation, see @cite Geistert2016. + * - **INTERP_EPIC_RESIDUAL** applies the edge-preserving interpolation, see @cite Revaud2015,Geistert2016. + * @param epicK see ximgproc::EdgeAwareInterpolator() sets the respective parameter. + * @param epicSigma see ximgproc::EdgeAwareInterpolator() sets the respective parameter. + * @param epicLambda see ximgproc::EdgeAwareInterpolator() sets the respective parameter. + * @param use_post_proc enables ximgproc::fastGlobalSmootherFilter() parameter. + * @param fgsLambda sets the respective ximgproc::fastGlobalSmootherFilter() parameter. 
+ * @param fgsSigma sets the respective ximgproc::fastGlobalSmootherFilter() parameter. + * + * Parameters have been described in @cite Senst2012, @cite Senst2013, @cite Senst2014, @cite Senst2016. + * For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details. + * @note If the grid size is set to (1,1) and the forward backward threshold <= 0 that the dense optical flow field is purely + * computed with the RLOF. + * + * @note SIMD parallelization is only available when compiling with SSE4.1. + * + * @sa optflow::DenseRLOFOpticalFlow, optflow::RLOFOpticalFlowParameter +*/ +CV_EXPORTS_W void calcOpticalFlowDenseRLOF(InputArray I0, InputArray I1, InputOutputArray flow, + Ptr rlofParam = Ptr(), + float forwardBackwardThreshold = 0, Size gridStep = Size(6, 6), + InterpolationType interp_type = InterpolationType::INTERP_EPIC, + int epicK = 128, float epicSigma = 0.05f, float epicLambda = 100.f, + bool use_post_proc = true, float fgsLambda = 500.0f, float fgsSigma = 1.5f); + +/** @brief Calculates fast optical flow for a sparse feature set using the robust local optical flow (RLOF) similar +* to optflow::calcOpticalFlowPyrLK(). +* +* The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as +* proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). +* +* @param prevImg first 8-bit input image. If The cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType +* = SupportRegionType::SR_CROSS) image has to be a 8-bit 3 channel image. +* @param nextImg second 8-bit input image. If The cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType +* = SupportRegionType::SR_CROSS) image has to be a 8-bit 3 channel image. +* @param prevPts vector of 2D points for which the flow needs to be found; point coordinates must be single-precision +* floating-point numbers. 
+* @param nextPts output vector of 2D points (with single-precision floating-point coordinates) containing the calculated +* new positions of input features in the second image; when optflow::RLOFOpticalFlowParameter::useInitialFlow variable is true the vector must +* have the same size as in the input and contain the initialization point correspondences. +* @param status output status vector (of unsigned chars); each element of the vector is set to 1 if the flow for the +* corresponding features has passed the forward backward check. +* @param err output vector of errors; each element of the vector is set to the forward backward error for the corresponding feature. +* @param rlofParam see optflow::RLOFOpticalFlowParameter +* @param forwardBackwardThreshold Threshold for the forward backward confidence check. If forewardBackwardThreshold <=0 the forward +* +* @note SIMD parallelization is only available when compiling with SSE4.1. +* +* Parameters have been described in @cite Senst2012, @cite Senst2013, @cite Senst2014 and @cite Senst2016. +* For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details. +*/ +CV_EXPORTS_W void calcOpticalFlowSparseRLOF(InputArray prevImg, InputArray nextImg, + InputArray prevPts, InputOutputArray nextPts, + OutputArray status, OutputArray err, + Ptr rlofParam = Ptr(), + float forwardBackwardThreshold = 0); + +//! Additional interface to the Dense RLOF algorithm - optflow::calcOpticalFlowDenseRLOF() +CV_EXPORTS_W Ptr createOptFlow_DenseRLOF(); + +//! Additional interface to the Sparse RLOF algorithm - optflow::calcOpticalFlowSparseRLOF() +CV_EXPORTS_W Ptr createOptFlow_SparseRLOF(); +//! 
@} + +} // namespace +} // namespace #endif diff --git a/modules/optflow/perf/perf_rlof.cpp b/modules/optflow/perf/perf_rlof.cpp index e67feec2777..d8fb64eb40a 100644 --- a/modules/optflow/perf/perf_rlof.cpp +++ b/modules/optflow/perf/perf_rlof.cpp @@ -1,73 +1,74 @@ #include "perf_precomp.hpp" namespace opencv_test { namespace { - typedef tuple ST_SR_IM_Sparse_t; - typedef TestBaseWithParam ST_SR_IM_Sparse; - PERF_TEST_P(ST_SR_IM_Sparse, OpticalFlow_SparseRLOF, - testing::Combine( - testing::Values("ST_BILINEAR", "ST_STANDART"), - testing::Values("SR_CROSS", "SR_FIXED"), - testing::Values(true, false)) - ) + +typedef tuple ST_SR_IM_Sparse_t; +typedef TestBaseWithParam ST_SR_IM_Sparse; +PERF_TEST_P(ST_SR_IM_Sparse, OpticalFlow_SparseRLOF, + testing::Combine( + testing::Values("ST_BILINEAR", "ST_STANDART"), + testing::Values("SR_CROSS", "SR_FIXED"), + testing::Values(true, false)) +) +{ + Mat frame1 = imread(getDataPath("cv/optflow/RubberWhale1.png")); + Mat frame2 = imread(getDataPath("cv/optflow/RubberWhale2.png")); + ASSERT_FALSE(frame1.empty()); + ASSERT_FALSE(frame2.empty()); + vector prevPts, currPts; + for (int r = 0; r < frame1.rows; r += 10) + { + for (int c = 0; c < frame1.cols; c += 10) { - Mat frame1 = imread(getDataPath("cv/optflow/RubberWhale1.png")); - Mat frame2 = imread(getDataPath("cv/optflow/RubberWhale2.png")); - ASSERT_FALSE(frame1.empty()); - ASSERT_FALSE(frame2.empty()); - vector prevPts, currPts; - for (int r = 0; r < frame1.rows; r += 10) - { - for (int c = 0; c < frame1.cols; c += 10) - { - prevPts.push_back(Point2f(static_cast(c), static_cast(r))); - } - } - vector status(prevPts.size()); - vector err(prevPts.size()); + prevPts.push_back(Point2f(static_cast(c), static_cast(r))); + } + } + vector status(prevPts.size()); + vector err(prevPts.size()); - Ptr param = Ptr(new RLOFOpticalFlowParameter); - if (get<0>(GetParam()) == "ST_BILINEAR") - param->solverType = ST_BILINEAR; - if (get<0>(GetParam()) == "ST_STANDART") - param->solverType = 
ST_STANDART; - if (get<1>(GetParam()) == "SR_CROSS") - param->supportRegionType = SR_CROSS; - if (get<1>(GetParam()) == "SR_FIXED") - param->supportRegionType = SR_FIXED; - param->useIlluminationModel = get<2>(GetParam()); + Ptr param = Ptr(new RLOFOpticalFlowParameter); + if (get<0>(GetParam()) == "ST_BILINEAR") + param->solverType = ST_BILINEAR; + if (get<0>(GetParam()) == "ST_STANDART") + param->solverType = ST_STANDART; + if (get<1>(GetParam()) == "SR_CROSS") + param->supportRegionType = SR_CROSS; + if (get<1>(GetParam()) == "SR_FIXED") + param->supportRegionType = SR_FIXED; + param->useIlluminationModel = get<2>(GetParam()); - TEST_CYCLE_N(1) - { - calcOpticalFlowSparseRLOF(frame1, frame2, prevPts, currPts, status, err, param, 1.f); - } + TEST_CYCLE_N(1) + { + calcOpticalFlowSparseRLOF(frame1, frame2, prevPts, currPts, status, err, param, 1.f); + } - SANITY_CHECK_NOTHING(); - } + SANITY_CHECK_NOTHING(); +} - typedef tuple INTERP_GRID_Dense_t; - typedef TestBaseWithParam INTERP_GRID_Dense; - PERF_TEST_P(INTERP_GRID_Dense, OpticalFlow_DenseRLOF, - testing::Combine( - testing::Values("INTERP_EPIC", "INTERP_GEO"), - testing::Values(4,10,50)) - ) - { - Mat flow; - Mat frame1 = imread(getDataPath("cv/optflow/RubberWhale1.png")); - Mat frame2 = imread(getDataPath("cv/optflow/RubberWhale1.png")); - ASSERT_FALSE(frame1.empty()); - ASSERT_FALSE(frame2.empty()); - Ptr param = Ptr(new RLOFOpticalFlowParameter);; - Ptr< DenseRLOFOpticalFlow> algo = DenseRLOFOpticalFlow::create(); - InterpolationType interp_type = INTERP_EPIC; - if (get<0>(GetParam()) == "INTERP_EPIC") - interp_type = INTERP_EPIC; - if (get<0>(GetParam()) == "INTERP_GEO") - interp_type = INTERP_GEO; - TEST_CYCLE_N(5) - { - calcOpticalFlowDenseRLOF(frame1, frame2,flow, param, 1.0f, Size(get<1>(GetParam()), get<1>(GetParam())), interp_type); - } - SANITY_CHECK_NOTHING(); - } +typedef tuple INTERP_GRID_Dense_t; +typedef TestBaseWithParam INTERP_GRID_Dense; +PERF_TEST_P(INTERP_GRID_Dense, OpticalFlow_DenseRLOF, 
+ testing::Combine( + testing::Values("INTERP_EPIC", "INTERP_GEO"), + testing::Values(4,10,50)) +) +{ + Mat flow; + Mat frame1 = imread(getDataPath("cv/optflow/RubberWhale1.png")); + Mat frame2 = imread(getDataPath("cv/optflow/RubberWhale1.png")); + ASSERT_FALSE(frame1.empty()); + ASSERT_FALSE(frame2.empty()); + Ptr param = Ptr(new RLOFOpticalFlowParameter);; + Ptr< DenseRLOFOpticalFlow> algo = DenseRLOFOpticalFlow::create(); + InterpolationType interp_type = INTERP_EPIC; + if (get<0>(GetParam()) == "INTERP_EPIC") + interp_type = INTERP_EPIC; + if (get<0>(GetParam()) == "INTERP_GEO") + interp_type = INTERP_GEO; + TEST_CYCLE_N(5) + { + calcOpticalFlowDenseRLOF(frame1, frame2,flow, param, 1.0f, Size(get<1>(GetParam()), get<1>(GetParam())), interp_type); } -} // namespace + SANITY_CHECK_NOTHING(); +} + +}} // namespace diff --git a/modules/optflow/src/rlof/berlof_invoker.hpp b/modules/optflow/src/rlof/berlof_invoker.hpp index 5f917dc12d4..e5cac1fa87c 100644 --- a/modules/optflow/src/rlof/berlof_invoker.hpp +++ b/modules/optflow/src/rlof/berlof_invoker.hpp @@ -43,2097 +43,2092 @@ inline void est_Result( const cv::Mat & src, const cv::Vec4f & val, cv::Point2f src.at(3,0) * val[0] + src.at(3,1) * val[1] + src.at(3,2) * val[2] + src.at(3,3) * val[3]); } -namespace berlof -{ +namespace berlof { - namespace ica - { +namespace ica { - class TrackerInvoker : public cv::ParallelLoopBody - { - public: - TrackerInvoker( - const Mat& _prevImg, - const Mat& _prevDeriv, - const Mat& _nextImg, - const Mat& _rgbPrevImg, - const Mat& _rgbNextImg, - const Point2f* _prevPts, - Point2f* _nextPts, - uchar* _status, - float* _err, - int _level, - int _maxLevel, - int _winSize[2], - int _maxIteration, - bool _useInitialFlow, - int _supportRegionType, - int _crossSegmentationThreshold, - std::vector _normSigmas, - float _minEigenValue) - { - prevImg = &_prevImg; - prevDeriv = &_prevDeriv; - nextImg = &_nextImg; - rgbPrevImg = &_rgbPrevImg; - rgbNextImg = &_rgbNextImg; - prevPts = _prevPts; - 
nextPts = _nextPts; - status = _status; - err = _err; - minWinSize = _winSize[0]; - maxWinSize = _winSize[1]; - criteria.maxCount = _maxIteration; - criteria.epsilon = 0.01; - level = _level; - maxLevel = _maxLevel; - windowType = _supportRegionType; - minEigThreshold = _minEigenValue; - paramReset = _normSigmas; - useInitialFlow = _useInitialFlow; - crossSegmentationThreshold = _crossSegmentationThreshold; - } +class TrackerInvoker : public cv::ParallelLoopBody +{ +public: + TrackerInvoker( + const Mat& _prevImg, + const Mat& _prevDeriv, + const Mat& _nextImg, + const Mat& _rgbPrevImg, + const Mat& _rgbNextImg, + const Point2f* _prevPts, + Point2f* _nextPts, + uchar* _status, + float* _err, + int _level, + int _maxLevel, + int _winSize[2], + int _maxIteration, + bool _useInitialFlow, + int _supportRegionType, + int _crossSegmentationThreshold, + std::vector _normSigmas, + float _minEigenValue) + { + prevImg = &_prevImg; + prevDeriv = &_prevDeriv; + nextImg = &_nextImg; + rgbPrevImg = &_rgbPrevImg; + rgbNextImg = &_rgbNextImg; + prevPts = _prevPts; + nextPts = _nextPts; + status = _status; + err = _err; + minWinSize = _winSize[0]; + maxWinSize = _winSize[1]; + criteria.maxCount = _maxIteration; + criteria.epsilon = 0.01; + level = _level; + maxLevel = _maxLevel; + windowType = _supportRegionType; + minEigThreshold = _minEigenValue; + paramReset = _normSigmas; + useInitialFlow = _useInitialFlow; + crossSegmentationThreshold = _crossSegmentationThreshold; + } - TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; - void operator()(const cv::Range& range) const CV_OVERRIDE - { + TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; + void operator()(const cv::Range& range) const CV_OVERRIDE + { #ifdef DEBUG_INVOKER - printf("berlof::ica");fflush(stdout); + printf("berlof::ica");fflush(stdout); #endif - Point2f halfWin; - cv::Size winSize; - const Mat& I = *prevImg; - const Mat& J = *nextImg; - const Mat& derivI = *prevDeriv; - const Mat& 
BI = *rgbPrevImg; + Point2f halfWin; + cv::Size winSize; + const Mat& I = *prevImg; + const Mat& J = *nextImg; + const Mat& derivI = *prevDeriv; + const Mat& BI = *rgbPrevImg; - const float FLT_SCALE = (1.f/(1 << 16)); + const float FLT_SCALE = (1.f/(1 << 16)); - winSize = cv::Size(maxWinSize,maxWinSize); - int winMaskwidth = static_cast(ceil(winSize.width / 16.f)) * 16; - cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); - winMaskMatBuf.setTo(1); + winSize = cv::Size(maxWinSize,maxWinSize); + int winMaskwidth = static_cast(ceil(winSize.width / 16.f)) * 16; + cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); + winMaskMatBuf.setTo(1); - std::vector param = paramReset; - int j, cn = I.channels(), cn2 = cn*2; - int winbufwidth = static_cast(ceil(winSize.width / 16.f)) * 16; - cv::Size winBufSize(winbufwidth,winbufwidth); + std::vector param = paramReset; + int j, cn = I.channels(), cn2 = cn*2; + int winbufwidth = static_cast(ceil(winSize.width / 16.f)) * 16; + cv::Size winBufSize(winbufwidth,winbufwidth); - cv::AutoBuffer _buf(winBufSize.area()*(cn + cn2)); - int derivDepth = DataType::depth; + cv::AutoBuffer _buf(winBufSize.area()*(cn + cn2)); + int derivDepth = DataType::depth; - Mat IWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn), (deriv_type*)_buf.data()); - Mat derivIWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn2), (deriv_type*)_buf.data() + winBufSize.area()*cn); + Mat IWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn), (deriv_type*)_buf.data()); + Mat derivIWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn2), (deriv_type*)_buf.data() + winBufSize.area()*cn); - for( int ptidx = range.start; ptidx < range.end; ptidx++ ) + for( int ptidx = range.start; ptidx < range.end; ptidx++ ) + { + Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level)); + Point2f nextPt; + if( level == maxLevel ) + { + if( useInitialFlow ) + nextPt = nextPts[ptidx]*(float)(1./(1 << level)); + else + nextPt = prevPt; + } + else + nextPt = nextPts[ptidx]*2.f; + 
nextPts[ptidx] = nextPt; + + Point2i iprevPt, inextPt; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + int winArea = maxWinSize * maxWinSize; + cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0,0, maxWinSize,maxWinSize)); + winMaskMatBuf.setTo(0); + if( calcWinMaskMat(BI, windowType, iprevPt, + winMaskMat,winSize,halfWin,winArea, + minWinSize,maxWinSize, crossSegmentationThreshold) == false) + continue; + + prevPt -= halfWin; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + + if( iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols || + iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows ) + { + if( level == 0 ) { - Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level)); - Point2f nextPt; - if( level == maxLevel ) - { - if( useInitialFlow ) - nextPt = nextPts[ptidx]*(float)(1./(1 << level)); - else - nextPt = prevPt; - } - else - nextPt = nextPts[ptidx]*2.f; - nextPts[ptidx] = nextPt; - - Point2i iprevPt, inextPt; - iprevPt.x = cvFloor(prevPt.x); - iprevPt.y = cvFloor(prevPt.y); - int winArea = maxWinSize * maxWinSize; - cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0,0, maxWinSize,maxWinSize)); - winMaskMatBuf.setTo(0); - if( calcWinMaskMat(BI, windowType, iprevPt, - winMaskMat,winSize,halfWin,winArea, - minWinSize,maxWinSize, crossSegmentationThreshold) == false) - continue; - - prevPt -= halfWin; - iprevPt.x = cvFloor(prevPt.x); - iprevPt.y = cvFloor(prevPt.y); + if( status ) + status[ptidx] = 3; + if( err ) + err[ptidx] = 0; + } + continue; + } - if( iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols || - iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows ) - { - if( level == 0 ) - { - if( status ) - status[ptidx] = 3; - if( err ) - err[ptidx] = 0; - } - continue; - } + float a = prevPt.x - iprevPt.x; + float b = prevPt.y - iprevPt.y; + const int W_BITS = 14, W_BITS1 = 14; - float a = prevPt.x - iprevPt.x; - float b = prevPt.y - iprevPt.y; - const int W_BITS = 14, W_BITS1 = 14; + int iw00 = cvRound((1.f - a)*(1.f 
- b)*(1 << W_BITS)); + int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); - int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); - int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); - int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + int dstep = (int)(derivI.step/derivI.elemSize1()); + int step = (int)(I.step/I.elemSize1()); + int mStep = (int)(winMaskMat.step/winMaskMat.elemSize1()); + CV_Assert( step == (int)(J.step/J.elemSize1()) ); - int dstep = (int)(derivI.step/derivI.elemSize1()); - int step = (int)(I.step/I.elemSize1()); - int mStep = (int)(winMaskMat.step/winMaskMat.elemSize1()); - CV_Assert( step == (int)(J.step/J.elemSize1()) ); + float A11 = 0, A12 = 0, A22 = 0; + float D = 0; - float A11 = 0, A12 = 0, A22 = 0; - float D = 0; + // extract the patch from the first image, compute covariation matrix of derivatives + int x, y; + for( y = 0; y < winSize.height; y++ ) + { + const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; + const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; - // extract the patch from the first image, compute covariation matrix of derivatives - int x, y; - for( y = 0; y < winSize.height; y++ ) + short* Iptr = (short*)(IWinBuf.data + y*IWinBuf.step); + short* dIptr = (short*)(derivIWinBuf.data + y*derivIWinBuf.step); + x = 0; + for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2 ) + { + if( winMaskMat.at(y,x) == 0) { - const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; - const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; - - short* Iptr = (short*)(IWinBuf.data + y*IWinBuf.step); - short* dIptr = (short*)(derivIWinBuf.data + y*derivIWinBuf.step); - x = 0; - for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2 ) - { - if( winMaskMat.at(y,x) == 0) - { - dIptr[0] = 0; - 
dIptr[1] = 0; - continue; - } - int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + - src[x+step]*iw10 + src[x+step+cn]*iw11, W_BITS1-5); - int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + - dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1); - int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 + - dsrc[dstep+cn2+1]*iw11, W_BITS1); - Iptr[x] = (short)ival; - dIptr[0] = (short)ixval; - dIptr[1] = (short)iyval; - } + dIptr[0] = 0; + dIptr[1] = 0; + continue; } + int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + + src[x+step]*iw10 + src[x+step+cn]*iw11, W_BITS1-5); + int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + + dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 + + dsrc[dstep+cn2+1]*iw11, W_BITS1); + Iptr[x] = (short)ival; + dIptr[0] = (short)ixval; + dIptr[1] = (short)iyval; + } + } - cv::Mat residualMat = cv::Mat::zeros(winSize.height * (winSize.width + 8) * cn, 1, CV_16SC1); - cv::Point2f backUpNextPt = nextPt; - nextPt -= halfWin; - Point2f prevDelta(0,0); //denotes h(t-1) - cv::Size _winSize = winSize; + cv::Mat residualMat = cv::Mat::zeros(winSize.height * (winSize.width + 8) * cn, 1, CV_16SC1); + cv::Point2f backUpNextPt = nextPt; + nextPt -= halfWin; + Point2f prevDelta(0,0); //denotes h(t-1) + cv::Size _winSize = winSize; #ifdef RLOF_SSE - __m128i mmMask0, mmMask1, mmMask; - getWBitMask(_winSize.width, mmMask0, mmMask1, mmMask); + __m128i mmMask0, mmMask1, mmMask; + getWBitMask(_winSize.width, mmMask0, mmMask1, mmMask); #endif - float MEstimatorScale = 1; - int buffIdx = 0; - float c[8]; - cv::Mat GMc0, GMc1, GMc2, GMc3; - cv::Vec2f Mc0, Mc1, Mc2, Mc3; - int noIteration = 0; - int noReusedIteration = 0; - int noSolvedIteration = 0; - for( j = 0; j < criteria.maxCount; j++ ) - { - cv::Point2f delta(0,0); - cv::Point2f deltaGain(0,0); - bool hasSolved = false; - a = nextPt.x - inextPt.x; - b = nextPt.y - inextPt.y; - float ab = a * b; - 
if( j == 0 - || ( inextPt.x != cvFloor(nextPt.x) || inextPt.y != cvFloor(nextPt.y) || j % 2 != 0 )) - { - inextPt.x = cvFloor(nextPt.x); - inextPt.y = cvFloor(nextPt.y); + float MEstimatorScale = 1; + int buffIdx = 0; + float c[8]; + cv::Mat GMc0, GMc1, GMc2, GMc3; + cv::Vec2f Mc0, Mc1, Mc2, Mc3; + int noIteration = 0; + int noReusedIteration = 0; + int noSolvedIteration = 0; + for( j = 0; j < criteria.maxCount; j++ ) + { + cv::Point2f delta(0,0); + cv::Point2f deltaGain(0,0); + bool hasSolved = false; + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + float ab = a * b; + if( j == 0 + || ( inextPt.x != cvFloor(nextPt.x) || inextPt.y != cvFloor(nextPt.y) || j % 2 != 0 )) + { + inextPt.x = cvFloor(nextPt.x); + inextPt.y = cvFloor(nextPt.y); - if( inextPt.x < -winSize.width || inextPt.x >= J.cols || - inextPt.y < -winSize.height || inextPt.y >= J.rows ) - { - if( level == 0 && status ) - status[ptidx] = 3; - break; - } + if( inextPt.x < -winSize.width || inextPt.x >= J.cols || + inextPt.y < -winSize.height || inextPt.y >= J.rows ) + { + if( level == 0 && status ) + status[ptidx] = 3; + break; + } - a = nextPt.x - inextPt.x; - b = nextPt.y - inextPt.y; - ab = a * b; - iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); - iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); - iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); - iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - // mismatch - if( j == 0 ) - { - A11 = 0; - A12 = 0; - A22 = 0; - } + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + ab = a * b; + iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + // mismatch + if( j == 0 ) + { + A11 = 0; + A12 = 0; + A22 = 0; + } - if ( j == 0 ) + if ( j == 0 ) + { + buffIdx = 0; + for( y = 0; y < winSize.height; y++ ) { - buffIdx = 0; - for( y = 0; y < winSize.height; y++ ) + const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y )*step + 
(inextPt.x )*cn; + const short* Iptr = (const short*)(IWinBuf.data + (y)*IWinBuf.step); + const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + x = 0; + for( ; x < winSize.width*cn; x++, dIptr += 2) { - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y )*step + (inextPt.x )*cn; - const short* Iptr = (const short*)(IWinBuf.data + (y)*IWinBuf.step); - const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - x = 0; - for( ; x < winSize.width*cn; x++, dIptr += 2) - { - if( maskPtr[x] == 0) - continue; - int diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, W_BITS1-5) - - Iptr[x]; - residualMat.at(buffIdx++) = static_cast(diff); - } + if( maskPtr[x] == 0) + continue; + int diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, W_BITS1-5) + - Iptr[x]; + residualMat.at(buffIdx++) = static_cast(diff); } - /*! Estimation for the residual */ - cv::Mat residualMatRoi(residualMat, cv::Rect(0,0,1, buffIdx)); - MEstimatorScale = (buffIdx == 0) ? 1.f : estimateScale(residualMatRoi); } + /*! Estimation for the residual */ + cv::Mat residualMatRoi(residualMat, cv::Rect(0,0,1, buffIdx)); + MEstimatorScale = (buffIdx == 0) ? 
1.f : estimateScale(residualMatRoi); + } - float eta = 1.f / winArea; - float fParam0 = param[0] * 32.f; - float fParam1 = param[1] * 32.f; - fParam0 = param[0] * MEstimatorScale; - fParam1 = param[1] * MEstimatorScale; + float eta = 1.f / winArea; + float fParam0 = param[0] * 32.f; + float fParam1 = param[1] * 32.f; + fParam0 = param[0] * MEstimatorScale; + fParam1 = param[1] * MEstimatorScale; - buffIdx = 0; - float _b0[4] = {0,0,0,0}; - float _b1[4] = {0,0,0,0}; + buffIdx = 0; + float _b0[4] = {0,0,0,0}; + float _b1[4] = {0,0,0,0}; - /* - */ - for( y = 0; y < _winSize.height; y++ ) - { - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x )*cn; - const short* Iptr = (const short*)(IWinBuf.data + y*IWinBuf.step); - const short* dIptr = (const short*)(derivIWinBuf.data + y*derivIWinBuf.step); + /* + */ + for( y = 0; y < _winSize.height; y++ ) + { + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x )*cn; + const short* Iptr = (const short*)(IWinBuf.data + y*IWinBuf.step); + const short* dIptr = (const short*)(derivIWinBuf.data + y*derivIWinBuf.step); - x = 0; - for( ; x < _winSize.width*cn; x++, dIptr += 2 ) - { - if( maskPtr[x] == 0) - continue; - int illValue = - Iptr[x]; + x = 0; + for( ; x < _winSize.width*cn; x++, dIptr += 2 ) + { + if( maskPtr[x] == 0) + continue; + int illValue = - Iptr[x]; - float It[4] = {static_cast((Jptr[x+step+cn]<< 5) + illValue), - static_cast((Jptr[x+cn]<< 5) + illValue), - static_cast((Jptr[x+step]<< 5) + illValue), - static_cast((Jptr[x] << 5) + illValue)}; + float It[4] = {static_cast((Jptr[x+step+cn]<< 5) + illValue), + static_cast((Jptr[x+cn]<< 5) + illValue), + static_cast((Jptr[x+step]<< 5) + illValue), + static_cast((Jptr[x] << 5) + illValue)}; - int J_val = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + - Jptr[x+step]*iw10 + 
Jptr[x+step+cn]*iw11, - W_BITS1-5); + int J_val = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + + Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, + W_BITS1-5); - int diff = J_val + illValue; + int diff = J_val + illValue; - MEstimatorScale += (diff < MEstimatorScale) ? -eta : eta; + MEstimatorScale += (diff < MEstimatorScale) ? -eta : eta; - int abss = (diff < 0) ? -diff : diff; + int abss = (diff < 0) ? -diff : diff; - // compute the missmatch vector - if( j >= 0) + // compute the missmatch vector + if( j >= 0) + { + if( abss > fParam1) + { + It[0] = 0; + It[1] = 0; + It[2] = 0; + It[3] = 0; + } + else if( abss > fParam0 && diff >= 0 ) + { + It[0] = param[2] * (It[0] - fParam1); + It[1] = param[2] * (It[1] - fParam1); + It[2] = param[2] * (It[2] - fParam1); + It[3] = param[2] * (It[3] - fParam1); + } + else if( abss > fParam0 && diff < 0 ) { - if( abss > fParam1) - { - It[0] = 0; - It[1] = 0; - It[2] = 0; - It[3] = 0; - } - else if( abss > fParam0 && diff >= 0 ) - { - It[0] = param[2] * (It[0] - fParam1); - It[1] = param[2] * (It[1] - fParam1); - It[2] = param[2] * (It[2] - fParam1); - It[3] = param[2] * (It[3] - fParam1); - } - else if( abss > fParam0 && diff < 0 ) - { - It[0] = param[2] * (It[0] + fParam1); - It[1] = param[2] * (It[1] + fParam1); - It[2] = param[2] * (It[2] + fParam1); - It[3] = param[2] * (It[3] + fParam1); - } + It[0] = param[2] * (It[0] + fParam1); + It[1] = param[2] * (It[1] + fParam1); + It[2] = param[2] * (It[2] + fParam1); + It[3] = param[2] * (It[3] + fParam1); } + } - float It0 = It[0]; - float It1 = It[1]; - float It2 = It[2]; - float It3 = It[3]; + float It0 = It[0]; + float It1 = It[1]; + float It2 = It[2]; + float It3 = It[3]; - float ixval = static_cast(dIptr[0]); - float iyval = static_cast(dIptr[1]); - _b0[0] += It0 * ixval; - _b0[1] += It1 * ixval; - _b0[2] += It2 * ixval; - _b0[3] += It3 * ixval; + float ixval = static_cast(dIptr[0]); + float iyval = static_cast(dIptr[1]); + _b0[0] += It0 * ixval; + _b0[1] += It1 * ixval; + _b0[2] += 
It2 * ixval; + _b0[3] += It3 * ixval; - _b1[0] += It0*iyval; - _b1[1] += It1*iyval; - _b1[2] += It2*iyval; - _b1[3] += It3*iyval; + _b1[0] += It0*iyval; + _b1[1] += It1*iyval; + _b1[2] += It2*iyval; + _b1[3] += It3*iyval; - // compute the Gradient Matrice - if( j == 0) + // compute the Gradient Matrice + if( j == 0) + { + float tale = param[2] * FLT_RESCALE; + if( abss < fParam0 || j < 0) { - float tale = param[2] * FLT_RESCALE; - if( abss < fParam0 || j < 0) - { - tale = FLT_RESCALE; - } - else if( abss > fParam1) - { - tale *= 0.01f; - } - else - { - tale *= param[2]; - } - - A11 += (float)(ixval*ixval)*tale; - A12 += (float)(ixval*iyval)*tale; - A22 += (float)(iyval*iyval)*tale; - + tale = FLT_RESCALE; + } + else if( abss > fParam1) + { + tale *= 0.01f; + } + else + { + tale *= param[2]; } + + A11 += (float)(ixval*ixval)*tale; + A12 += (float)(ixval*iyval)*tale; + A22 += (float)(iyval*iyval)*tale; + } } + } - if( j == 0 ) - { + if( j == 0 ) + { - A11 *= FLT_SCALE; // 54866744. - A12 *= FLT_SCALE; // -628764.00 - A22 *= FLT_SCALE; // 19730.000 + A11 *= FLT_SCALE; // 54866744. 
+ A12 *= FLT_SCALE; // -628764.00 + A22 *= FLT_SCALE; // 19730.000 - D = A11 * A22 - A12 * A12; - float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + - 4.f*A12*A12))/(2*winArea); + D = A11 * A22 - A12 * A12; + float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + + 4.f*A12*A12))/(2*winArea); - if( minEig < minEigThreshold || std::abs(D) < FLT_EPSILON) + if( minEig < minEigThreshold || std::abs(D) < FLT_EPSILON) + { + if( level == 0 && status ) + status[ptidx] = 0; + if( level > 0) { - if( level == 0 && status ) - status[ptidx] = 0; - if( level > 0) - { - nextPts[ptidx] = backUpNextPt; - } - noIteration++; - break; + nextPts[ptidx] = backUpNextPt; } - - D = (1.f / D); - + noIteration++; + break; } - _b0[0] *= FLT_SCALE;_b0[1] *= FLT_SCALE;_b0[2] *= FLT_SCALE;_b0[3] *= FLT_SCALE; - _b1[0] *= FLT_SCALE;_b1[1] *= FLT_SCALE;_b1[2] *= FLT_SCALE;_b1[3] *= FLT_SCALE; + D = (1.f / D); + } - Mc0[0] = _b0[0] - _b0[1] - _b0[2] + _b0[3]; - Mc0[1] = _b1[0] - _b1[1] - _b1[2] + _b1[3]; + _b0[0] *= FLT_SCALE;_b0[1] *= FLT_SCALE;_b0[2] *= FLT_SCALE;_b0[3] *= FLT_SCALE; + _b1[0] *= FLT_SCALE;_b1[1] *= FLT_SCALE;_b1[2] *= FLT_SCALE;_b1[3] *= FLT_SCALE; - Mc1[0] = _b0[1] - _b0[3]; - Mc1[1] = _b1[1] - _b1[3]; + Mc0[0] = _b0[0] - _b0[1] - _b0[2] + _b0[3]; + Mc0[1] = _b1[0] - _b1[1] - _b1[2] + _b1[3]; - Mc2[0] = _b0[2] - _b0[3]; - Mc2[1] = _b1[2] - _b1[3]; + Mc1[0] = _b0[1] - _b0[3]; + Mc1[1] = _b1[1] - _b1[3]; - Mc3[0] = _b0[3]; - Mc3[1] = _b1[3]; + Mc2[0] = _b0[2] - _b0[3]; + Mc2[1] = _b1[2] - _b1[3]; - c[0] = -Mc0[0]; - c[1] = -Mc1[0]; - c[2] = -Mc2[0]; - c[3] = -Mc3[0]; - c[4] = -Mc0[1]; - c[5] = -Mc1[1]; - c[6] = -Mc2[1]; - c[7] = -Mc3[1]; - float e0 = 1.f / (c[6] * c[0] - c[4] * c[2]); - float e1 = e0 * 0.5f * (c[6] * c[1] + c[7] * c[0] - c[5] * c[2] - c[4] * c[3]); - float e2 = e0 * (c[1] * c[7] -c[3] * c[5]); - e0 = e1 * e1 - e2; - hasSolved = false; - if ( e0 > 0) - { - e0 = sqrt(e0); - float _y[2] = {-e1 - e0, e0 - e1}; - float c0yc1[2] = {c[0] * _y[0] + c[1], - 
c[0] * _y[1] + c[1]}; - float _x[2] = {-(c[2] * _y[0] + c[3]) / c0yc1[0], - -(c[2] * _y[1] + c[3]) / c0yc1[1]}; - bool isIn1 = (_x[0] >=0 && _x[0] <=1 && _y[0] >= 0 && _y[0] <= 1); - bool isIn2 = (_x[1] >=0 && _x[1] <=1 && _y[1] >= 0 && _y[1] <= 1); - - bool isSolution1 = checkSolution(_x[0], _y[0], c ); - bool isSolution2 = checkSolution(_x[1], _y[1], c ); - bool isSink1 = isIn1 && isSolution1; - bool isSink2 = isIn2 && isSolution2; - - if ( isSink1 != isSink2) - { - a = isSink1 ? _x[0] : _x[1]; - b = isSink1 ? _y[0] : _y[1]; - ab = a * b; - hasSolved = true; - delta.x = inextPt.x + a - nextPt.x; - delta.y = inextPt.y + b - nextPt.y; - } // isIn1 != isIn2 - } - if( hasSolved == false) - noIteration++; - } - else - { - hasSolved = false; - noReusedIteration++; - } - if( hasSolved == false ) - { + Mc3[0] = _b0[3]; + Mc3[1] = _b1[3]; - cv::Vec2f mismatchVec = ab * Mc0 + Mc1 *a + Mc2 * b + Mc3; - delta.x = (A12 * mismatchVec.val[1] - A22 * mismatchVec.val[0]) * D; - delta.y = (A12 * mismatchVec.val[0] - A11 * mismatchVec.val[1]) * D; - delta.x = MAX(-1.f, MIN(1.f, delta.x)); - delta.y = MAX(-1.f, MIN(1.f, delta.y)); - nextPt += delta; - nextPts[ptidx] = nextPt + halfWin; - } - else - { - nextPt += delta; - nextPts[ptidx] = nextPt + halfWin; - noSolvedIteration++; - break; - } + c[0] = -Mc0[0]; + c[1] = -Mc1[0]; + c[2] = -Mc2[0]; + c[3] = -Mc3[0]; + c[4] = -Mc0[1]; + c[5] = -Mc1[1]; + c[6] = -Mc2[1]; + c[7] = -Mc3[1]; - delta.x = ( delta.x != delta.x) ? 0 : delta.x; - delta.y = ( delta.y != delta.y) ? 
0 : delta.y; + float e0 = 1.f / (c[6] * c[0] - c[4] * c[2]); + float e1 = e0 * 0.5f * (c[6] * c[1] + c[7] * c[0] - c[5] * c[2] - c[4] * c[3]); + float e2 = e0 * (c[1] * c[7] -c[3] * c[5]); + e0 = e1 * e1 - e2; + hasSolved = false; + if ( e0 > 0) + { + e0 = sqrt(e0); + float _y[2] = {-e1 - e0, e0 - e1}; + float c0yc1[2] = {c[0] * _y[0] + c[1], + c[0] * _y[1] + c[1]}; + float _x[2] = {-(c[2] * _y[0] + c[3]) / c0yc1[0], + -(c[2] * _y[1] + c[3]) / c0yc1[1]}; + bool isIn1 = (_x[0] >=0 && _x[0] <=1 && _y[0] >= 0 && _y[0] <= 1); + bool isIn2 = (_x[1] >=0 && _x[1] <=1 && _y[1] >= 0 && _y[1] <= 1); + + bool isSolution1 = checkSolution(_x[0], _y[0], c ); + bool isSolution2 = checkSolution(_x[1], _y[1], c ); + bool isSink1 = isIn1 && isSolution1; + bool isSink2 = isIn2 && isSolution2; - if(j > 0 && ( - (std::abs(delta.x - prevDelta.x) < 0.01 && std::abs(delta.y - prevDelta.y) < 0.01))) + if ( isSink1 != isSink2) { - nextPts[ptidx] -= delta*0.5f; - break; - } + a = isSink1 ? _x[0] : _x[1]; + b = isSink1 ? _y[0] : _y[1]; + ab = a * b; + hasSolved = true; + delta.x = inextPt.x + a - nextPt.x; + delta.y = inextPt.y + b - nextPt.y; + } // isIn1 != isIn2 + } + if( hasSolved == false) + noIteration++; + } + else + { + hasSolved = false; + noReusedIteration++; + } + if( hasSolved == false ) + { + cv::Vec2f mismatchVec = ab * Mc0 + Mc1 *a + Mc2 * b + Mc3; + delta.x = (A12 * mismatchVec.val[1] - A22 * mismatchVec.val[0]) * D; + delta.y = (A12 * mismatchVec.val[0] - A11 * mismatchVec.val[1]) * D; + delta.x = MAX(-1.f, MIN(1.f, delta.x)); + delta.y = MAX(-1.f, MIN(1.f, delta.y)); + nextPt += delta; + nextPts[ptidx] = nextPt + halfWin; + } + else + { + nextPt += delta; + nextPts[ptidx] = nextPt + halfWin; + noSolvedIteration++; + break; + } - prevDelta = delta; - } + delta.x = ( delta.x != delta.x) ? 0 : delta.x; + delta.y = ( delta.y != delta.y) ? 
0 : delta.y; + if(j > 0 && ( + (std::abs(delta.x - prevDelta.x) < 0.01 && std::abs(delta.y - prevDelta.y) < 0.01))) + { + nextPts[ptidx] -= delta*0.5f; + break; } + prevDelta = delta; } - const Mat* prevImg; - const Mat* nextImg; - const Mat* prevDeriv; - const Mat* rgbPrevImg; - const Mat* rgbNextImg; - const Point2f* prevPts; - Point2f* nextPts; - uchar* status; - float* err; - int maxWinSize; - int minWinSize; - TermCriteria criteria; - int level; - int maxLevel; - int windowType; - float minEigThreshold; - bool useInitialFlow; - std::vector paramReset; - int crossSegmentationThreshold; - - }; + } + + } + + const Mat* prevImg; + const Mat* nextImg; + const Mat* prevDeriv; + const Mat* rgbPrevImg; + const Mat* rgbNextImg; + const Point2f* prevPts; + Point2f* nextPts; + uchar* status; + float* err; + int maxWinSize; + int minWinSize; + TermCriteria criteria; + int level; + int maxLevel; + int windowType; + float minEigThreshold; + bool useInitialFlow; + std::vector paramReset; + int crossSegmentationThreshold; + +}; + +} // namespace +namespace radial { +class TrackerInvoker : public cv::ParallelLoopBody +{ +public: + TrackerInvoker( + const Mat& _prevImg, + const Mat& _prevDeriv, + const Mat& _nextImg, + const Mat& _rgbPrevImg, + const Mat& _rgbNextImg, + const Point2f* _prevPts, + Point2f* _nextPts, + uchar* _status, + float* _err, + Point2f* _gainVecs, + int _level, + int _maxLevel, + int _winSize[2], + int _maxIteration, + bool _useInitialFlow, + int _supportRegionType, + int _crossSegmentationThreshold, + std::vector _normSigmas, + float _minEigenValue) + { + prevImg = &_prevImg; + prevDeriv = &_prevDeriv; + nextImg = &_nextImg; + rgbPrevImg = &_rgbPrevImg; + rgbNextImg = &_rgbNextImg; + prevPts = _prevPts; + nextPts = _nextPts; + status = _status; + err = _err; + gainVecs = _gainVecs; + minWinSize = _winSize[0]; + maxWinSize = _winSize[1]; + criteria.maxCount = _maxIteration; + criteria.epsilon = 0.01; + level = _level; + maxLevel = _maxLevel; + windowType = 
_supportRegionType; + minEigThreshold = _minEigenValue; + paramReset = _normSigmas; + useInitialFlow = _useInitialFlow; + crossSegmentationThreshold = _crossSegmentationThreshold; } - namespace radial + + TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; + void operator()(const cv::Range& range) const CV_OVERRIDE { - class TrackerInvoker : public cv::ParallelLoopBody +#ifdef DEBUG_INVOKER + printf("berlof::radial");fflush(stdout); +#endif + Point2f halfWin; + cv::Size winSize; + const Mat& I = *prevImg; + const Mat& J = *nextImg; + const Mat& derivI = *prevDeriv; + const Mat& BI = *rgbPrevImg; + const float FLT_SCALE = (1.f/(1 << 16));//(1.f/(1 << 20)); // 20 + winSize = cv::Size(maxWinSize,maxWinSize); + int winMaskwidth = static_cast(ceil(winSize.width / 16.f)) * 16; + cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); + winMaskMatBuf.setTo(1); + + std::vector param = paramReset; + int cn = I.channels(), cn2 = cn*2; + int winbufwidth = static_cast(ceil(winSize.width / 16.f)) * 16; + cv::Size winBufSize(winbufwidth,winbufwidth); + + + cv::Mat invTensorMat(4,4, CV_32FC1); + cv::Mat mismatchMat(4, 1, CV_32FC1); + cv::Mat resultMat(4, 1, CV_32FC1); + + cv::AutoBuffer _buf(winBufSize.area()*(cn + cn2)); + int derivDepth = DataType::depth; + + Mat IWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn), (deriv_type*)_buf.data()); + Mat derivIWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn2), (deriv_type*)_buf.data() + winBufSize.area()*cn); + + for( int ptidx = range.start; ptidx < range.end; ptidx++ ) { - public: - TrackerInvoker( - const Mat& _prevImg, - const Mat& _prevDeriv, - const Mat& _nextImg, - const Mat& _rgbPrevImg, - const Mat& _rgbNextImg, - const Point2f* _prevPts, - Point2f* _nextPts, - uchar* _status, - float* _err, - Point2f* _gainVecs, - int _level, - int _maxLevel, - int _winSize[2], - int _maxIteration, - bool _useInitialFlow, - int _supportRegionType, - int _crossSegmentationThreshold, - std::vector _normSigmas, - float 
_minEigenValue) + Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level)); + Point2f nextPt; + if( level == maxLevel ) { - prevImg = &_prevImg; - prevDeriv = &_prevDeriv; - nextImg = &_nextImg; - rgbPrevImg = &_rgbPrevImg; - rgbNextImg = &_rgbNextImg; - prevPts = _prevPts; - nextPts = _nextPts; - status = _status; - err = _err; - gainVecs = _gainVecs; - minWinSize = _winSize[0]; - maxWinSize = _winSize[1]; - criteria.maxCount = _maxIteration; - criteria.epsilon = 0.01; - level = _level; - maxLevel = _maxLevel; - windowType = _supportRegionType; - minEigThreshold = _minEigenValue; - paramReset = _normSigmas; - useInitialFlow = _useInitialFlow; - crossSegmentationThreshold = _crossSegmentationThreshold; + if( useInitialFlow ) + nextPt = nextPts[ptidx]*(float)(1./(1 << level)); + else + nextPt = prevPt; } - - TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; - void operator()(const cv::Range& range) const CV_OVERRIDE + else + nextPt = nextPts[ptidx]*2.f; + nextPts[ptidx] = nextPt; + + Point2i iprevPt, inextPt; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + int winArea = maxWinSize * maxWinSize; + cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0,0, maxWinSize,maxWinSize)); + winMaskMatBuf.setTo(0); + if( calcWinMaskMat(BI, windowType, iprevPt, + winMaskMat,winSize,halfWin,winArea, + minWinSize,maxWinSize, crossSegmentationThreshold) == false) + continue; + + prevPt -= halfWin; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + + if( iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols || + iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows ) { -#ifdef DEBUG_INVOKER - printf("berlof::radial");fflush(stdout); + if( level == 0 ) + { + if( status ) + status[ptidx] = 3; + if( err ) + err[ptidx] = 0; + } + continue; + } + + float a = prevPt.x - iprevPt.x; + float b = prevPt.y - iprevPt.y; + const int W_BITS = 14, W_BITS1 = 14; + + int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + int iw01 = cvRound(a*(1.f - 
b)*(1 << W_BITS)); + int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + + int dstep = (int)(derivI.step/derivI.elemSize1()); + int step = (int)(I.step/I.elemSize1()); + int mStep = (int)(winMaskMat.step/winMaskMat.elemSize1()); + CV_Assert( step == (int)(J.step/J.elemSize1()) ); + + float A11 = 0, A12 = 0, A22 = 0; + + // tensor + float sumIx = 0; + float sumIy = 0; + float sumI = 0; + float sumW = 0; + float w1 = 0, w2 = 0; // -IyI + float dI = 0; // I^2 + float D = 0; + +#ifdef RLOF_SSE + + __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128i z = _mm_setzero_si128(); + __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1-1)); + __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1-5-1)); + __m128i mmMask4_epi32; + __m128i mmMaskSet_epi16 = _mm_set1_epi16(std::numeric_limits::max()); + get4BitMask(winSize.width, mmMask4_epi32); #endif - Point2f halfWin; - cv::Size winSize; - const Mat& I = *prevImg; - const Mat& J = *nextImg; - const Mat& derivI = *prevDeriv; - const Mat& BI = *rgbPrevImg; - const float FLT_SCALE = (1.f/(1 << 16));//(1.f/(1 << 20)); // 20 - winSize = cv::Size(maxWinSize,maxWinSize); - int winMaskwidth = static_cast(ceil(winSize.width / 16.f)) * 16; - cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); - winMaskMatBuf.setTo(1); - - std::vector param = paramReset; - int cn = I.channels(), cn2 = cn*2; - int winbufwidth = static_cast(ceil(winSize.width / 16.f)) * 16; - cv::Size winBufSize(winbufwidth,winbufwidth); - - - cv::Mat invTensorMat(4,4, CV_32FC1); - cv::Mat mismatchMat(4, 1, CV_32FC1); - cv::Mat resultMat(4, 1, CV_32FC1); - - cv::AutoBuffer _buf(winBufSize.area()*(cn + cn2)); - int derivDepth = DataType::depth; - - Mat IWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn), (deriv_type*)_buf.data()); - Mat derivIWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn2), (deriv_type*)_buf.data() + winBufSize.area()*cn); - - for( int ptidx = range.start; 
ptidx < range.end; ptidx++ ) + + // extract the patch from the first image, compute covariation matrix of derivatives + int x, y; + for( y = 0; y < winSize.height; y++ ) + { + x = 0; + + const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; + const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; + + short* Iptr = (short*)(IWinBuf.data + y*IWinBuf.step); + short* dIptr = (short*)(derivIWinBuf.data + y*derivIWinBuf.step); +#ifdef RLOF_SSE + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + for( ; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 ) { - Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level)); - Point2f nextPt; - if( level == maxLevel ) + __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16); + __m128i mask_0_3_epi16 = _mm_unpacklo_epi16(mask_0_7_epi16, mask_0_7_epi16); + + + __m128i v00, v01, v10, v11, t0, t1; + v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); + v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); + v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); + v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); + if( x + 4 > winSize.width) { - if( useInitialFlow ) - nextPt = nextPts[ptidx]*(float)(1./(1 << level)); - else - nextPt = prevPt; + t0 = _mm_and_si128(t0, mmMask4_epi32); } - else - nextPt = nextPts[ptidx]*2.f; - nextPts[ptidx] = nextPt; - - Point2i iprevPt, inextPt; - iprevPt.x = cvFloor(prevPt.x); - iprevPt.y = cvFloor(prevPt.y); - int winArea = maxWinSize * maxWinSize; - cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0,0, maxWinSize,maxWinSize)); - winMaskMatBuf.setTo(0); - if( 
calcWinMaskMat(BI, windowType, iprevPt, - winMaskMat,winSize,halfWin,winArea, - minWinSize,maxWinSize, crossSegmentationThreshold) == false) - continue; - - prevPt -= halfWin; - iprevPt.x = cvFloor(prevPt.x); - iprevPt.y = cvFloor(prevPt.y); + t0 = _mm_and_si128(t0, mask_0_3_epi16); + _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0,t0)); + + + v00 = _mm_loadu_si128((const __m128i*)(dsrc)); + v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); + v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep)); + v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); + v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... 
+ if( x + 4 > winSize.width) + { + v00 = _mm_and_si128(v00, mmMask4_epi32); + } + v00 = _mm_and_si128(v00, mask_0_3_epi16); + _mm_storeu_si128((__m128i*)dIptr, v00); + } +#else - if( iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols || - iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows ) + for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2 ) + { + if( winMaskMat.at(y,x) == 0) { - if( level == 0 ) - { - if( status ) - status[ptidx] = 3; - if( err ) - err[ptidx] = 0; - } + dIptr[0] = 0; + dIptr[1] = 0; continue; } + int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + + src[x+step]*iw10 + src[x+step+cn]*iw11, W_BITS1-5); + int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + + dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 + + dsrc[dstep+cn2+1]*iw11, W_BITS1); + + Iptr[x] = (short)ival; + dIptr[0] = (short)ixval; + dIptr[1] = (short)iyval; + + } +#endif + } - float a = prevPt.x - iprevPt.x; - float b = prevPt.y - iprevPt.y; - const int W_BITS = 14, W_BITS1 = 14; + cv::Mat residualMat = cv::Mat::zeros(winSize.height * (winSize.width + 8) * cn, 1, CV_16SC1); - int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); - int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); - int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); - int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + cv::Point2f backUpNextPt = nextPt; + nextPt -= halfWin; + Point2f prevDelta(0,0); //relates to h(t-1) + Point2f prevGain(1,0); + cv::Point2f gainVec = gainVecs[ptidx]; + cv::Point2f backUpGain = gainVec; + cv::Size _winSize = winSize; + int j; +#ifdef RLOF_SSE + __m128i mmMask0, mmMask1, mmMask; + getWBitMask(_winSize.width, mmMask0, mmMask1, mmMask); + __m128 mmOnes = _mm_set1_ps(1.f ); +#endif + float MEstimatorScale = 1; + int buffIdx = 0; + float c[8]; + cv::Mat GMc0, GMc1, GMc2, GMc3; + cv::Vec4f Mc0, Mc1, Mc2, Mc3; + int noIteration = 0; + int noReusedIteration = 0; + int noSolvedIteration = 0; + for( j = 0; 
j < criteria.maxCount; j++ ) + { + cv::Point2f delta(0,0); + cv::Point2f deltaGain(0,0); + bool hasSolved = false; + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + float ab = a * b; + if( j == 0 + || (inextPt.x != cvFloor(nextPt.x) || inextPt.y != cvFloor(nextPt.y) || j % 2 != 0) ) + { + inextPt.x = cvFloor(nextPt.x); + inextPt.y = cvFloor(nextPt.y); + + if( inextPt.x < -winSize.width || inextPt.x >= J.cols || + inextPt.y < -winSize.height || inextPt.y >= J.rows ) + { + if( level == 0 && status ) + status[ptidx] = 3; + if (level > 0) + { + nextPts[ptidx] = backUpNextPt; + gainVecs[ptidx] = backUpGain; + } + noIteration++; + break; + } - int dstep = (int)(derivI.step/derivI.elemSize1()); - int step = (int)(I.step/I.elemSize1()); - int mStep = (int)(winMaskMat.step/winMaskMat.elemSize1()); - CV_Assert( step == (int)(J.step/J.elemSize1()) ); - float A11 = 0, A12 = 0, A22 = 0; + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + ab = a * b; + iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + // mismatch + if( j == 0) + { // tensor - float sumIx = 0; - float sumIy = 0; - float sumI = 0; - float sumW = 0; - float w1 = 0, w2 = 0; // -IyI - float dI = 0; // I^2 - float D = 0; - - #ifdef RLOF_SSE - - __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); - __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); - __m128i z = _mm_setzero_si128(); - __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1-1)); - __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1-5-1)); - __m128i mmMask4_epi32; - __m128i mmMaskSet_epi16 = _mm_set1_epi16(std::numeric_limits::max()); - get4BitMask(winSize.width, mmMask4_epi32); - #endif - - // extract the patch from the first image, compute covariation matrix of derivatives - int x, y; + w1 = 0; // -IxI + w2 = 0; // -IyI + dI = 0; // I^2 + sumIx = 0; + sumIy = 0; + sumI = 0; + sumW = 0; + A11 = 0; + A12 = 0; + A22 = 0; 
+ } + + if ( j == 0 ) + { + buffIdx = 0; for( y = 0; y < winSize.height; y++ ) { + const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y )*step + (inextPt.x )*cn; + const short* Iptr = (const short*)(IWinBuf.data + (y)*IWinBuf.step); + const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; x = 0; + for( ; x < winSize.width*cn; x++, dIptr += 2) + { + if( maskPtr[x] == 0) + continue; + int diff = static_cast(CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, W_BITS1-5) + - Iptr[x] + Iptr[x] * gainVec.x +gainVec.y); + residualMat.at(buffIdx++) = static_cast(diff); + } + } + /*! Estimation for the residual */ + cv::Mat residualMatRoi(residualMat, cv::Rect(0,0,1, buffIdx)); + MEstimatorScale = (buffIdx == 0) ? 1.f : estimateScale(residualMatRoi); + } - const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; - const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; + float eta = 1.f / winArea; + float fParam0 = param[0] * 32.f; + float fParam1 = param[1] * 32.f; + fParam0 = param[0] * MEstimatorScale; + fParam1 = param[1] * MEstimatorScale; - short* Iptr = (short*)(IWinBuf.data + y*IWinBuf.step); - short* dIptr = (short*)(derivIWinBuf.data + y*derivIWinBuf.step); - #ifdef RLOF_SSE - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - for( ; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 ) - { - __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16); - __m128i mask_0_3_epi16 = _mm_unpacklo_epi16(mask_0_7_epi16, mask_0_7_epi16); +#ifdef RLOF_SSE + qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128 qb0[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; + __m128 qb1[4] = 
{_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; + __m128 qb2[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; + __m128 qb3[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; + __m128 mmSumW1 = _mm_setzero_ps(), mmSumW2 = _mm_setzero_ps(); + __m128 mmSumI = _mm_setzero_ps(), mmSumW = _mm_setzero_ps(), mmSumDI = _mm_setzero_ps(); + __m128 mmSumIy = _mm_setzero_ps(), mmSumIx = _mm_setzero_ps(); + __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps(); + __m128i mmParam0 = _mm_set1_epi16(MIN(std::numeric_limits::max() -1, static_cast(fParam0))); + __m128i mmParam1 = _mm_set1_epi16(MIN(std::numeric_limits::max()- 1, static_cast(fParam1))); + + + float s2Val = param[2] > 0 ? param[2] : -param[2]; + int s2bitShift = param[2] == 0 ? 1 : static_cast(ceil(log(200.f / s2Val) / log(2.f))); + __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast(param[2] * (float)(1 << s2bitShift))); + __m128i mmOness_epi16 = _mm_set1_epi16(1 << s2bitShift); + __m128 mmParam2s = _mm_set1_ps(0.01f * param[2]); + __m128 mmParam2s2 = _mm_set1_ps(param[2] * param[2]); + float gainVal = gainVec.x > 0 ? gainVec.x : -gainVec.x; + int bitShift = gainVec.x == 0 ? 
1 : static_cast(ceil(log(200.f / gainVal) / log(2.f))); + __m128i mmGainValue_epi16 = _mm_set1_epi16(static_cast(gainVec.x * (float)(1 << bitShift))); + __m128i mmConstValue_epi16 = _mm_set1_epi16(static_cast(gainVec.y)); + __m128i mmEta = _mm_setzero_si128(); + __m128i mmScale = _mm_set1_epi16(static_cast(MEstimatorScale)); - __m128i v00, v01, v10, v11, t0, t1; - v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); - v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); - v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); - v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); +#endif - t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); - t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); - if( x + 4 > winSize.width) - { - t0 = _mm_and_si128(t0, mmMask4_epi32); - } - t0 = _mm_and_si128(t0, mask_0_3_epi16); - _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0,t0)); - - - v00 = _mm_loadu_si128((const __m128i*)(dsrc)); - v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); - v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep)); - v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); - - t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); - t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); - t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); - t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); - v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... 
- if( x + 4 > winSize.width) - { - v00 = _mm_and_si128(v00, mmMask4_epi32); - } - v00 = _mm_and_si128(v00, mask_0_3_epi16); - _mm_storeu_si128((__m128i*)dIptr, v00); - } - #else + buffIdx = 0; + float _b0[4] = {0,0,0,0}; + float _b1[4] = {0,0,0,0}; + float _b2[4] = {0,0,0,0}; + float _b3[4] = {0,0,0,0}; + /* + */ + for( y = 0; y < _winSize.height; y++ ) + { + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x )*cn; + const short* Iptr = (const short*)(IWinBuf.data + y*IWinBuf.step); + const short* dIptr = (const short*)(derivIWinBuf.data + y*derivIWinBuf.step); - for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2 ) - { - if( winMaskMat.at(y,x) == 0) - { - dIptr[0] = 0; - dIptr[1] = 0; - continue; - } - int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + - src[x+step]*iw10 + src[x+step+cn]*iw11, W_BITS1-5); - int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + - dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1); - int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 + - dsrc[dstep+cn2+1]*iw11, W_BITS1); + x = 0; +#ifdef RLOF_SSE + for( ; x <= _winSize.width*cn; x += 8, dIptr += 8*2 ) + { + __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16); + __m128i I_0_7_epi16 = _mm_loadu_si128((const __m128i*)(Iptr + x)); + + __m128i v00 = _mm_unpacklo_epi8( + _mm_loadl_epi64((const __m128i*)(Jptr + x)), z); + __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); + __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); + __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); + + __m128i t0 = _mm_add_epi32 + (_mm_madd_epi16( + _mm_unpacklo_epi16(v00, v01), + qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + __m128i t1 = 
_mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1-5); - Iptr[x] = (short)ival; - dIptr[0] = (short)ixval; - dIptr[1] = (short)iyval; + __m128i lo = _mm_mullo_epi16(mmGainValue_epi16, I_0_7_epi16); + __m128i hi = _mm_mulhi_epi16(mmGainValue_epi16, I_0_7_epi16); - } - #endif - } + __m128i Igain_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), bitShift); + __m128i Igain_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), bitShift); + __m128i Igain_epi16 = _mm_packs_epi32(Igain_0_3_epi32, Igain_4_7_epi32); - cv::Mat residualMat = cv::Mat::zeros(winSize.height * (winSize.width + 8) * cn, 1, CV_16SC1); - - cv::Point2f backUpNextPt = nextPt; - nextPt -= halfWin; - Point2f prevDelta(0,0); //relates to h(t-1) - Point2f prevGain(1,0); - cv::Point2f gainVec = gainVecs[ptidx]; - cv::Point2f backUpGain = gainVec; - cv::Size _winSize = winSize; - int j; - #ifdef RLOF_SSE - __m128i mmMask0, mmMask1, mmMask; - getWBitMask(_winSize.width, mmMask0, mmMask1, mmMask); - __m128 mmOnes = _mm_set1_ps(1.f ); - #endif - float MEstimatorScale = 1; - int buffIdx = 0; - float c[8]; - cv::Mat GMc0, GMc1, GMc2, GMc3; - cv::Vec4f Mc0, Mc1, Mc2, Mc3; - int noIteration = 0; - int noReusedIteration = 0; - int noSolvedIteration = 0; - for( j = 0; j < criteria.maxCount; j++ ) - { - cv::Point2f delta(0,0); - cv::Point2f deltaGain(0,0); - bool hasSolved = false; - a = nextPt.x - inextPt.x; - b = nextPt.y - inextPt.y; - float ab = a * b; - if( j == 0 - || (inextPt.x != cvFloor(nextPt.x) || inextPt.y != cvFloor(nextPt.y) || j % 2 != 0) ) - { - inextPt.x = cvFloor(nextPt.x); - inextPt.y = cvFloor(nextPt.y); - if( inextPt.x < -winSize.width || inextPt.x >= J.cols || - inextPt.y < -winSize.height || inextPt.y >= J.rows ) + __m128i diffValue = _mm_subs_epi16(_mm_add_epi16(Igain_epi16, mmConstValue_epi16), 
I_0_7_epi16); + __m128i mmDiffc_epi16[4] = { - if( level == 0 && status ) - status[ptidx] = 3; - if (level > 0) - { - nextPts[ptidx] = backUpNextPt; - gainVecs[ptidx] = backUpGain; - } - noIteration++; - break; - } + _mm_add_epi16(_mm_slli_epi16(v11, 5), diffValue), + _mm_add_epi16(_mm_slli_epi16(v01, 5), diffValue), + _mm_add_epi16(_mm_slli_epi16(v10, 5), diffValue), + _mm_add_epi16(_mm_slli_epi16(v00, 5), diffValue) + }; + __m128i mmDiff_epi16 = _mm_add_epi16( _mm_packs_epi32(t0, t1), diffValue); - a = nextPt.x - inextPt.x; - b = nextPt.y - inextPt.y; - ab = a * b; - iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); - iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); - iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); - iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - // mismatch - if( j == 0) - { - // tensor - w1 = 0; // -IxI - w2 = 0; // -IyI - dI = 0; // I^2 - sumIx = 0; - sumIy = 0; - sumI = 0; - sumW = 0; - A11 = 0; - A12 = 0; - A22 = 0; - } + mmDiff_epi16 = _mm_and_si128(mmDiff_epi16, mask_0_7_epi16); - if ( j == 0 ) - { - buffIdx = 0; - for( y = 0; y < winSize.height; y++ ) - { - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y )*step + (inextPt.x )*cn; - const short* Iptr = (const short*)(IWinBuf.data + (y)*IWinBuf.step); - const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - x = 0; - for( ; x < winSize.width*cn; x++, dIptr += 2) - { - if( maskPtr[x] == 0) - continue; - int diff = static_cast(CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, W_BITS1-5) - - Iptr[x] + Iptr[x] * gainVec.x +gainVec.y); - residualMat.at(buffIdx++) = static_cast(diff); - } - } - /*! Estimation for the residual */ - cv::Mat residualMatRoi(residualMat, cv::Rect(0,0,1, buffIdx)); - MEstimatorScale = (buffIdx == 0) ? 
1.f : estimateScale(residualMatRoi); - } + __m128i scalediffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, mmScale); + mmEta = _mm_add_epi16(mmEta, _mm_add_epi16(_mm_and_si128(scalediffIsPos_epi16, _mm_set1_epi16(2)), _mm_set1_epi16(-1))); - float eta = 1.f / winArea; - float fParam0 = param[0] * 32.f; - float fParam1 = param[1] * 32.f; - fParam0 = param[0] * MEstimatorScale; - fParam1 = param[1] * MEstimatorScale; -#ifdef RLOF_SSE + __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); + __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); - qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); - qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); - __m128 qb0[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; - __m128 qb1[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; - __m128 qb2[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; - __m128 qb3[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; - __m128 mmSumW1 = _mm_setzero_ps(), mmSumW2 = _mm_setzero_ps(); - __m128 mmSumI = _mm_setzero_ps(), mmSumW = _mm_setzero_ps(), mmSumDI = _mm_setzero_ps(); - __m128 mmSumIy = _mm_setzero_ps(), mmSumIx = _mm_setzero_ps(); - __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps(); - __m128i mmParam0 = _mm_set1_epi16(MIN(std::numeric_limits::max() -1, static_cast(fParam0))); - __m128i mmParam1 = _mm_set1_epi16(MIN(std::numeric_limits::max()- 1, static_cast(fParam1))); - - - float s2Val = param[2] > 0 ? param[2] : -param[2]; - int s2bitShift = param[2] == 0 ? 1 : static_cast(ceil(log(200.f / s2Val) / log(2.f))); - __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast(param[2] * (float)(1 << s2bitShift))); - __m128i mmOness_epi16 = _mm_set1_epi16(1 << s2bitShift); - __m128 mmParam2s = _mm_set1_ps(0.01f * param[2]); - __m128 mmParam2s2 = _mm_set1_ps(param[2] * param[2]); - float gainVal = gainVec.x > 0 ? gainVec.x : -gainVec.x; - int bitShift = gainVec.x == 0 ? 
1 : static_cast(ceil(log(200.f / gainVal) / log(2.f))); - __m128i mmGainValue_epi16 = _mm_set1_epi16(static_cast(gainVec.x * (float)(1 << bitShift))); - __m128i mmConstValue_epi16 = _mm_set1_epi16(static_cast(gainVec.y)); - __m128i mmEta = _mm_setzero_si128(); - __m128i mmScale = _mm_set1_epi16(static_cast(MEstimatorScale)); -#endif - - buffIdx = 0; - float _b0[4] = {0,0,0,0}; - float _b1[4] = {0,0,0,0}; - float _b2[4] = {0,0,0,0}; - float _b3[4] = {0,0,0,0}; - /* - */ - for( y = 0; y < _winSize.height; y++ ) - { - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x )*cn; - const short* Iptr = (const short*)(IWinBuf.data + y*IWinBuf.step); - const short* dIptr = (const short*)(derivIWinBuf.data + y*derivIWinBuf.step); + __m128i abs_epi16 = _mm_abs_epi16(mmDiff_epi16); + __m128i bSet2_epi16, bSet1_epi16; + // |It| < sigma1 ? + bSet2_epi16 = _mm_cmplt_epi16(abs_epi16, mmParam1); + // It > 0 ? + __m128i diffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, _mm_setzero_si128()); + // sigma0 < |It| < sigma1 ? + bSet1_epi16 = _mm_and_si128(bSet2_epi16, _mm_cmpgt_epi16(abs_epi16, mmParam0)); + // val = |It| -/+ sigma1 + __m128i tmpParam1_epi16 = _mm_add_epi16(_mm_and_si128(diffIsPos_epi16, _mm_sub_epi16(mmDiff_epi16, mmParam1)), + _mm_andnot_si128(diffIsPos_epi16, _mm_add_epi16(mmDiff_epi16, mmParam1))); + // It == 0 ? 
|It| > sigma13 + mmDiff_epi16 = _mm_and_si128(bSet2_epi16, mmDiff_epi16); - x = 0; -#ifdef RLOF_SSE - for( ; x <= _winSize.width*cn; x += 8, dIptr += 8*2 ) + for( unsigned int mmi = 0; mmi < 4; mmi++) { - __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16); - __m128i I_0_7_epi16 = _mm_loadu_si128((const __m128i*)(Iptr + x)); - - __m128i v00 = _mm_unpacklo_epi8( - _mm_loadl_epi64((const __m128i*)(Jptr + x)), z); - __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); - __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); - __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); - - __m128i t0 = _mm_add_epi32 - (_mm_madd_epi16( - _mm_unpacklo_epi16(v00, v01), - qw0), - _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); - __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + __m128i mmDiffc_epi16_t = _mm_and_si128(mmDiffc_epi16[mmi], mask_0_7_epi16); + mmDiffc_epi16_t = _mm_and_si128(bSet2_epi16, mmDiffc_epi16_t); + + // It == val ? sigma0 < |It| < sigma1 + mmDiffc_epi16_t = _mm_blendv_epi8(mmDiffc_epi16_t, tmpParam1_epi16, bSet1_epi16); + __m128i tale_epi16_ = _mm_blendv_epi8(mmOness_epi16, mmParam2_epi16, bSet1_epi16); // mask for 0 - 3 + // diff = diff * sigma2 + lo = _mm_mullo_epi16(tale_epi16_, mmDiffc_epi16_t); + hi = _mm_mulhi_epi16(tale_epi16_, mmDiffc_epi16_t); + __m128i diff_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), s2bitShift); + __m128i diff_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), s2bitShift); + + mmDiffc_epi16_t = _mm_packs_epi32(diff_0_3_epi32, diff_4_7_epi32); + __m128i diff1 = _mm_unpackhi_epi16(mmDiffc_epi16_t, mmDiffc_epi16_t); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13... 
+ __m128i diff0 = _mm_unpacklo_epi16(mmDiffc_epi16_t, mmDiffc_epi16_t); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9... + + // Ix * diff / Iy * diff + v10 = _mm_mullo_epi16(Ixy_0, diff0); + v11 = _mm_mulhi_epi16(Ixy_0, diff0); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); - t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); - t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1-5); + qb0[mmi] = _mm_add_ps(qb0[mmi], _mm_cvtepi32_ps(v00)); + qb1[mmi] = _mm_add_ps(qb1[mmi], _mm_cvtepi32_ps(v10)); + // It * Ix It * Iy [4 ... 7] + // for set 1 hi sigma 1 + v10 = _mm_mullo_epi16(Ixy_1, diff1); + v11 = _mm_mulhi_epi16(Ixy_1, diff1); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb0[mmi] = _mm_add_ps(qb0[mmi], _mm_cvtepi32_ps(v00)); + qb1[mmi] = _mm_add_ps(qb1[mmi], _mm_cvtepi32_ps(v10)); + // diff * J [0 ... 7] + // for set 1 sigma 1 + // b3 += diff * Iptr[x] + v10 = _mm_mullo_epi16(mmDiffc_epi16_t, I_0_7_epi16); + v11 = _mm_mulhi_epi16(mmDiffc_epi16_t, I_0_7_epi16); + v00 = _mm_unpacklo_epi16(v10, v11); - __m128i lo = _mm_mullo_epi16(mmGainValue_epi16, I_0_7_epi16); - __m128i hi = _mm_mulhi_epi16(mmGainValue_epi16, I_0_7_epi16); + v10 = _mm_unpackhi_epi16(v10, v11); + qb2[mmi] = _mm_add_ps(qb2[mmi], _mm_cvtepi32_ps(v00)); + qb2[mmi] = _mm_add_ps(qb2[mmi], _mm_cvtepi32_ps(v10)); + qb3[mmi] = _mm_add_ps(qb3[mmi], _mm_cvtepi32_ps(diff_0_3_epi32)); + qb3[mmi] = _mm_add_ps(qb3[mmi], _mm_cvtepi32_ps(diff_4_7_epi32)); + } - __m128i Igain_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), bitShift); - __m128i Igain_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), bitShift); - __m128i Igain_epi16 = _mm_packs_epi32(Igain_0_3_epi32, Igain_4_7_epi32); + if( j == 0 ) + { + __m128 bSet1_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet1_epi16)); + __m128 bSet1_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet1_epi16,bSet1_epi16), 16)); + __m128 mask_0_4_ps = 
_mm_cvtepi32_ps(_mm_cvtepi16_epi32(mask_0_7_epi16)); + __m128 mask_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mask_0_7_epi16, mask_0_7_epi16),16))); + __m128 bSet2_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet2_epi16)); + __m128 bSet2_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet2_epi16, bSet2_epi16),16)); - __m128i diffValue = _mm_subs_epi16(_mm_add_epi16(Igain_epi16, mmConstValue_epi16), I_0_7_epi16); - __m128i mmDiffc_epi16[4] = - { - _mm_add_epi16(_mm_slli_epi16(v11, 5), diffValue), - _mm_add_epi16(_mm_slli_epi16(v01, 5), diffValue), - _mm_add_epi16(_mm_slli_epi16(v10, 5), diffValue), - _mm_add_epi16(_mm_slli_epi16(v00, 5), diffValue) - }; + __m128 tale_0_3_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_0_3_ps); + __m128 tale_4_7_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_4_7_ps); + tale_0_3_ps = _mm_blendv_ps(mmParam2s, tale_0_3_ps, bSet2_0_3_ps); + tale_4_7_ps = _mm_blendv_ps(mmParam2s, tale_4_7_ps, bSet2_4_7_ps); - __m128i mmDiff_epi16 = _mm_add_epi16( _mm_packs_epi32(t0, t1), diffValue); + tale_0_3_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_0_3_ps, mask_0_4_ps); + tale_4_7_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_4_7_ps, mask_4_7_ps); + t0 = _mm_srai_epi32(Ixy_0, 16); // Iy0 Iy1 Iy2 Iy3 + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16); // Ix0 Ix1 Ix2 Ix3 - mmDiff_epi16 = _mm_and_si128(mmDiff_epi16, mask_0_7_epi16); + __m128 fy = _mm_cvtepi32_ps(t0); + __m128 fx = _mm_cvtepi32_ps(t1); - __m128i scalediffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, mmScale); - mmEta = _mm_add_epi16(mmEta, _mm_add_epi16(_mm_and_si128(scalediffIsPos_epi16, _mm_set1_epi16(2)), _mm_set1_epi16(-1))); + // 0 ... 
3 + __m128 I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(I_0_7_epi16, I_0_7_epi16), 16)); + // A11 - A22 + __m128 fxtale = _mm_mul_ps(fx, tale_0_3_ps); + __m128 fytale = _mm_mul_ps(fy, tale_0_3_ps); - __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); - __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); + mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); + mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); + // sumIx und sumIy + mmSumIx = _mm_add_ps(mmSumIx, fxtale); + mmSumIy = _mm_add_ps(mmSumIy, fytale); - __m128i abs_epi16 = _mm_abs_epi16(mmDiff_epi16); - __m128i bSet2_epi16, bSet1_epi16; - // |It| < sigma1 ? - bSet2_epi16 = _mm_cmplt_epi16(abs_epi16, mmParam1); - // It > 0 ? - __m128i diffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, _mm_setzero_si128()); - // sigma0 < |It| < sigma1 ? - bSet1_epi16 = _mm_and_si128(bSet2_epi16, _mm_cmpgt_epi16(abs_epi16, mmParam0)); - // val = |It| -/+ sigma1 - __m128i tmpParam1_epi16 = _mm_add_epi16(_mm_and_si128(diffIsPos_epi16, _mm_sub_epi16(mmDiff_epi16, mmParam1)), - _mm_andnot_si128(diffIsPos_epi16, _mm_add_epi16(mmDiff_epi16, mmParam1))); - // It == 0 ? |It| > sigma13 - mmDiff_epi16 = _mm_and_si128(bSet2_epi16, mmDiff_epi16); + mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fxtale)); + mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fytale)); - for( unsigned int mmi = 0; mmi < 4; mmi++) - { - __m128i mmDiffc_epi16_t = _mm_and_si128(mmDiffc_epi16[mmi], mask_0_7_epi16); - mmDiffc_epi16_t = _mm_and_si128(bSet2_epi16, mmDiffc_epi16_t); - - // It == val ? 
sigma0 < |It| < sigma1 - mmDiffc_epi16_t = _mm_blendv_epi8(mmDiffc_epi16_t, tmpParam1_epi16, bSet1_epi16); - __m128i tale_epi16_ = _mm_blendv_epi8(mmOness_epi16, mmParam2_epi16, bSet1_epi16); // mask for 0 - 3 - // diff = diff * sigma2 - lo = _mm_mullo_epi16(tale_epi16_, mmDiffc_epi16_t); - hi = _mm_mulhi_epi16(tale_epi16_, mmDiffc_epi16_t); - __m128i diff_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), s2bitShift); - __m128i diff_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), s2bitShift); - - mmDiffc_epi16_t = _mm_packs_epi32(diff_0_3_epi32, diff_4_7_epi32); - __m128i diff1 = _mm_unpackhi_epi16(mmDiffc_epi16_t, mmDiffc_epi16_t); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13... - __m128i diff0 = _mm_unpacklo_epi16(mmDiffc_epi16_t, mmDiffc_epi16_t); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9... - - // Ix * diff / Iy * diff - v10 = _mm_mullo_epi16(Ixy_0, diff0); - v11 = _mm_mulhi_epi16(Ixy_0, diff0); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - - qb0[mmi] = _mm_add_ps(qb0[mmi], _mm_cvtepi32_ps(v00)); - qb1[mmi] = _mm_add_ps(qb1[mmi], _mm_cvtepi32_ps(v10)); - // It * Ix It * Iy [4 ... 7] - // for set 1 hi sigma 1 - v10 = _mm_mullo_epi16(Ixy_1, diff1); - v11 = _mm_mulhi_epi16(Ixy_1, diff1); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qb0[mmi] = _mm_add_ps(qb0[mmi], _mm_cvtepi32_ps(v00)); - qb1[mmi] = _mm_add_ps(qb1[mmi], _mm_cvtepi32_ps(v10)); - // diff * J [0 ... 
7] - // for set 1 sigma 1 - // b3 += diff * Iptr[x] - v10 = _mm_mullo_epi16(mmDiffc_epi16_t, I_0_7_epi16); - v11 = _mm_mulhi_epi16(mmDiffc_epi16_t, I_0_7_epi16); - v00 = _mm_unpacklo_epi16(v10, v11); - - v10 = _mm_unpackhi_epi16(v10, v11); - qb2[mmi] = _mm_add_ps(qb2[mmi], _mm_cvtepi32_ps(v00)); - qb2[mmi] = _mm_add_ps(qb2[mmi], _mm_cvtepi32_ps(v10)); - qb3[mmi] = _mm_add_ps(qb3[mmi], _mm_cvtepi32_ps(diff_0_3_epi32)); - qb3[mmi] = _mm_add_ps(qb3[mmi], _mm_cvtepi32_ps(diff_4_7_epi32)); - } + // sumI + __m128 I_tale_ps = _mm_mul_ps(I_ps, tale_0_3_ps); + mmSumI = _mm_add_ps(mmSumI,I_tale_ps); - if( j == 0 ) - { - __m128 bSet1_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet1_epi16)); - __m128 bSet1_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet1_epi16,bSet1_epi16), 16)); - __m128 mask_0_4_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mask_0_7_epi16)); - __m128 mask_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mask_0_7_epi16, mask_0_7_epi16),16))); + // sumW + mmSumW = _mm_add_ps(mmSumW, tale_0_3_ps); - __m128 bSet2_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet2_epi16)); - __m128 bSet2_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet2_epi16, bSet2_epi16),16)); + // sumDI + mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps( I_ps, I_tale_ps)); - __m128 tale_0_3_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_0_3_ps); - __m128 tale_4_7_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_4_7_ps); - tale_0_3_ps = _mm_blendv_ps(mmParam2s, tale_0_3_ps, bSet2_0_3_ps); - tale_4_7_ps = _mm_blendv_ps(mmParam2s, tale_4_7_ps, bSet2_4_7_ps); - tale_0_3_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_0_3_ps, mask_0_4_ps); - tale_4_7_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_4_7_ps, mask_4_7_ps); + t0 = _mm_srai_epi32(Ixy_1, 16); // Iy8 Iy9 Iy10 Iy11 + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_1, 16), 16); // Ix0 Ix1 Ix2 Ix3 - t0 = _mm_srai_epi32(Ixy_0, 16); // Iy0 Iy1 Iy2 Iy3 - t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16); // Ix0 Ix1 Ix2 Ix3 + fy = 
_mm_cvtepi32_ps(t0); + fx = _mm_cvtepi32_ps(t1); - __m128 fy = _mm_cvtepi32_ps(t0); - __m128 fx = _mm_cvtepi32_ps(t1); + // 4 ... 7 + I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(I_0_7_epi16, I_0_7_epi16), 16)); - // 0 ... 3 - __m128 I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(I_0_7_epi16, I_0_7_epi16), 16)); + // A11 - A22 + fxtale = _mm_mul_ps(fx, tale_4_7_ps); + fytale = _mm_mul_ps(fy, tale_4_7_ps); - // A11 - A22 - __m128 fxtale = _mm_mul_ps(fx, tale_0_3_ps); - __m128 fytale = _mm_mul_ps(fy, tale_0_3_ps); + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); + mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); + mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); - mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); - mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); - mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); + // sumIx und sumIy + mmSumIx = _mm_add_ps(mmSumIx, fxtale); + mmSumIy = _mm_add_ps(mmSumIy, fytale); - // sumIx und sumIy - mmSumIx = _mm_add_ps(mmSumIx, fxtale); - mmSumIy = _mm_add_ps(mmSumIy, fytale); + mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fxtale)); + mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fytale)); - mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fxtale)); - mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fytale)); + // sumI + I_tale_ps = _mm_mul_ps(I_ps, tale_4_7_ps); + mmSumI = _mm_add_ps(mmSumI, I_tale_ps); - // sumI - __m128 I_tale_ps = _mm_mul_ps(I_ps, tale_0_3_ps); - mmSumI = _mm_add_ps(mmSumI,I_tale_ps); + // sumW + mmSumW = _mm_add_ps(mmSumW, tale_4_7_ps); - // sumW - mmSumW = _mm_add_ps(mmSumW, tale_0_3_ps); + // sumDI + mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps( I_ps, I_tale_ps)); + } - // sumDI - mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps( I_ps, I_tale_ps)); + } +#else + for( ; x < _winSize.width*cn; x++, dIptr += 2 ) + { + if( maskPtr[x] == 0) + continue; + short ixval = dIptr[0]; + short iyval = dIptr[1]; + int illValue = static_cast(Iptr[x] * gainVec.x + gainVec.y - Iptr[x]); - t0 = 
_mm_srai_epi32(Ixy_1, 16); // Iy8 Iy9 Iy10 Iy11 - t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_1, 16), 16); // Ix0 Ix1 Ix2 Ix3 + int It[4] = {(Jptr[x+step+cn]<< 5) + illValue, + (Jptr[x+cn]<< 5) + illValue, + (Jptr[x+step]<< 5) + illValue, + (Jptr[x] << 5) + illValue}; - fy = _mm_cvtepi32_ps(t0); - fx = _mm_cvtepi32_ps(t1); - // 4 ... 7 - I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(I_0_7_epi16, I_0_7_epi16), 16)); + int J_val = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + + Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, + W_BITS1-5); - // A11 - A22 - fxtale = _mm_mul_ps(fx, tale_4_7_ps); - fytale = _mm_mul_ps(fy, tale_4_7_ps); + int diff = J_val + illValue; - mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); - mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); - mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); - // sumIx und sumIy - mmSumIx = _mm_add_ps(mmSumIx, fxtale); - mmSumIy = _mm_add_ps(mmSumIy, fytale); - mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fxtale)); - mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fytale)); + MEstimatorScale += (diff < MEstimatorScale) ? -eta : eta; - // sumI - I_tale_ps = _mm_mul_ps(I_ps, tale_4_7_ps); - mmSumI = _mm_add_ps(mmSumI, I_tale_ps); + int abss = (diff < 0) ? 
-diff : diff; - // sumW - mmSumW = _mm_add_ps(mmSumW, tale_4_7_ps); - // sumDI - mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps( I_ps, I_tale_ps)); + // compute the missmatch vector + if( j >= 0) + { + if( abss > fParam1) + { + It[0] = 0; + It[1] = 0; + It[2] = 0; + It[3] = 0; } - - } -#else - for( ; x < _winSize.width*cn; x++, dIptr += 2 ) + else if( abss > static_cast(fParam0) && diff >= 0 ) { - if( maskPtr[x] == 0) - continue; - - short ixval = dIptr[0]; - short iyval = dIptr[1]; - int illValue = static_cast(Iptr[x] * gainVec.x + gainVec.y - Iptr[x]); - - int It[4] = {(Jptr[x+step+cn]<< 5) + illValue, - (Jptr[x+cn]<< 5) + illValue, - (Jptr[x+step]<< 5) + illValue, - (Jptr[x] << 5) + illValue}; - - - int J_val = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + - Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, - W_BITS1-5); - - int diff = J_val + illValue; - - - - MEstimatorScale += (diff < MEstimatorScale) ? -eta : eta; + It[0] = static_cast(param[2] * (It[0] - fParam1)); + It[1] = static_cast(param[2] * (It[1] - fParam1)); + It[2] = static_cast(param[2] * (It[2] - fParam1)); + It[3] = static_cast(param[2] * (It[3] - fParam1)); + } + else if( abss > static_cast(fParam0) && diff < 0 ) + { + It[0] = static_cast(param[2] * (It[0] + fParam1)); + It[1] = static_cast(param[2] * (It[1] + fParam1)); + It[2] = static_cast(param[2] * (It[2] + fParam1)); + It[3] = static_cast(param[2] * (It[3] + fParam1)); + } + } + _b0[0] += (float)(It[0]*dIptr[0]) ; + _b0[1] += (float)(It[1]*dIptr[0]) ; + _b0[2] += (float)(It[2]*dIptr[0]) ; + _b0[3] += (float)(It[3]*dIptr[0]) ; - int abss = (diff < 0) ? 
-diff : diff; + _b1[0] += (float)(It[0]*dIptr[1]) ; + _b1[1] += (float)(It[1]*dIptr[1]) ; + _b1[2] += (float)(It[2]*dIptr[1]) ; + _b1[3] += (float)(It[3]*dIptr[1]) ; - // compute the missmatch vector - if( j >= 0) - { - if( abss > fParam1) - { - It[0] = 0; - It[1] = 0; - It[2] = 0; - It[3] = 0; - } - else if( abss > static_cast(fParam0) && diff >= 0 ) - { - It[0] = static_cast(param[2] * (It[0] - fParam1)); - It[1] = static_cast(param[2] * (It[1] - fParam1)); - It[2] = static_cast(param[2] * (It[2] - fParam1)); - It[3] = static_cast(param[2] * (It[3] - fParam1)); - } - else if( abss > static_cast(fParam0) && diff < 0 ) - { - It[0] = static_cast(param[2] * (It[0] + fParam1)); - It[1] = static_cast(param[2] * (It[1] + fParam1)); - It[2] = static_cast(param[2] * (It[2] + fParam1)); - It[3] = static_cast(param[2] * (It[3] + fParam1)); - } - } - _b0[0] += (float)(It[0]*dIptr[0]) ; - _b0[1] += (float)(It[1]*dIptr[0]) ; - _b0[2] += (float)(It[2]*dIptr[0]) ; - _b0[3] += (float)(It[3]*dIptr[0]) ; + _b2[0] += (float)(It[0])*Iptr[x] ; + _b2[1] += (float)(It[1])*Iptr[x] ; + _b2[2] += (float)(It[2])*Iptr[x] ; + _b2[3] += (float)(It[3])*Iptr[x] ; + _b3[0] += (float)(It[0]); + _b3[1] += (float)(It[1]); + _b3[2] += (float)(It[2]); + _b3[3] += (float)(It[3]); - _b1[0] += (float)(It[0]*dIptr[1]) ; - _b1[1] += (float)(It[1]*dIptr[1]) ; - _b1[2] += (float)(It[2]*dIptr[1]) ; - _b1[3] += (float)(It[3]*dIptr[1]) ; - - _b2[0] += (float)(It[0])*Iptr[x] ; - _b2[1] += (float)(It[1])*Iptr[x] ; - _b2[2] += (float)(It[2])*Iptr[x] ; - _b2[3] += (float)(It[3])*Iptr[x] ; - - _b3[0] += (float)(It[0]); - _b3[1] += (float)(It[1]); - _b3[2] += (float)(It[2]); - _b3[3] += (float)(It[3]); + // compute the Gradient Matrice + if( j == 0) + { + float tale = param[2] * FLT_RESCALE; + if( abss < fParam0 || j < 0) + { + tale = FLT_RESCALE; + } + else if( abss > fParam1) + { + tale *= 0.01f; + } + else + { + tale *= param[2]; + } + if( j == 0 ) + { + A11 += (float)(ixval*ixval)*tale; + A12 += 
(float)(ixval*iyval)*tale; + A22 += (float)(iyval*iyval)*tale; + } - // compute the Gradient Matrice - if( j == 0) - { - float tale = param[2] * FLT_RESCALE; - if( abss < fParam0 || j < 0) - { - tale = FLT_RESCALE; - } - else if( abss > fParam1) - { - tale *= 0.01f; - } - else - { - tale *= param[2]; - } - if( j == 0 ) - { - A11 += (float)(ixval*ixval)*tale; - A12 += (float)(ixval*iyval)*tale; - A22 += (float)(iyval*iyval)*tale; - } - - dI += Iptr[x] * Iptr[x] * tale; - float dx = static_cast(dIptr[0]) * tale; - float dy = static_cast(dIptr[1]) * tale; - sumIx += dx; - sumIy += dy; - w1 += dx * Iptr[x]; - w2 += dy * Iptr[x]; - sumI += Iptr[x] * tale; - sumW += tale; - } + dI += Iptr[x] * Iptr[x] * tale; + float dx = static_cast(dIptr[0]) * tale; + float dy = static_cast(dIptr[1]) * tale; + sumIx += dx; + sumIy += dy; + w1 += dx * Iptr[x]; + w2 += dy * Iptr[x]; + sumI += Iptr[x] * tale; + sumW += tale; + } - } + } #endif - } + } #ifdef RLOF_SSE - short etaValues[8]; - _mm_storeu_si128((__m128i*)(etaValues), mmEta); - MEstimatorScale += eta * (etaValues[0] + etaValues[1] + etaValues[2] + etaValues[3] - + etaValues[4] + etaValues[5] + etaValues[6] + etaValues[7]); - float CV_DECL_ALIGNED(32) wbuf[4]; + short etaValues[8]; + _mm_storeu_si128((__m128i*)(etaValues), mmEta); + MEstimatorScale += eta * (etaValues[0] + etaValues[1] + etaValues[2] + etaValues[3] + + etaValues[4] + etaValues[5] + etaValues[6] + etaValues[7]); + float CV_DECL_ALIGNED(32) wbuf[4]; #endif - if( j == 0 ) - { + if( j == 0 ) + { #ifdef RLOF_SSE - _mm_store_ps(wbuf, mmSumW1); - w1 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumW2); - w2 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumDI); - dI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumI); - sumI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumIx); - sumIx = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumIy); - sumIy = wbuf[0] + wbuf[1] + wbuf[2] + 
wbuf[3]; - _mm_store_ps(wbuf, mmSumW); - sumW = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumW1); + w1 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumW2); + w2 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumDI); + dI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumI); + sumI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumIx); + sumIx = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumIy); + sumIy = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumW); + sumW = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; #endif - sumIx *= -FLT_SCALE; - sumIy *= -FLT_SCALE; - sumI *=FLT_SCALE; - sumW *= FLT_SCALE; - w1 *= -FLT_SCALE; - w2 *= -FLT_SCALE; - dI *= FLT_SCALE; + sumIx *= -FLT_SCALE; + sumIy *= -FLT_SCALE; + sumI *=FLT_SCALE; + sumW *= FLT_SCALE; + w1 *= -FLT_SCALE; + w2 *= -FLT_SCALE; + dI *= FLT_SCALE; #ifdef RLOF_SSE - float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];// + float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];// - _mm_store_ps(A11buf, mmAxx); - _mm_store_ps(A12buf, mmAxy); - _mm_store_ps(A22buf, mmAyy); + _mm_store_ps(A11buf, mmAxx); + _mm_store_ps(A12buf, mmAxy); + _mm_store_ps(A22buf, mmAyy); - A11 = A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; - A12 = A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; - A22 = A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; + A11 = A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; + A12 = A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; + A22 = A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; #endif - A11 *= FLT_SCALE; // 54866744. 
- A12 *= FLT_SCALE; // -628764.00 - A22 *= FLT_SCALE; // 19730.000 - - D = - A12*A12*sumI*sumI + dI*sumW*A12*A12 + 2*A12*sumI*sumIx*w2 + 2*A12*sumI*sumIy*w1 - - 2*dI*A12*sumIx*sumIy - 2*sumW*A12*w1*w2 + A11*A22*sumI*sumI - 2*A22*sumI*sumIx*w1 - - 2*A11*sumI*sumIy*w2 - sumIx*sumIx*w2*w2 + A22*dI*sumIx*sumIx + 2*sumIx*sumIy*w1*w2 - - sumIy*sumIy*w1*w1 + A11*dI*sumIy*sumIy + A22*sumW*w1*w1 + A11*sumW*w2*w2 - A11*A22*dI*sumW; - - float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + - 4.f*A12*A12))/(2*winArea); - if( minEig < minEigThreshold || std::abs(D) < FLT_EPSILON ) - { - if( level == 0 && status ) - status[ptidx] = 0; - if( level > 0) - { - nextPts[ptidx] = backUpNextPt; - gainVecs[ptidx] = backUpGain; - } - noIteration++; - break; - } - - - D = (1.f / D); - - invTensorMat.at(0,0) = (A22*sumI*sumI - 2*sumI*sumIy*w2 + dI*sumIy*sumIy + sumW*w2*w2 - A22*dI*sumW)* D; - invTensorMat.at(0,1) = (A12*dI*sumW - A12*sumI * sumI - dI*sumIx*sumIy + sumI*sumIx*w2 + sumI*sumIy*w1 - sumW*w1*w2)* D; - invTensorMat.at(0,2) = (A12*sumI*sumIy - sumIy*sumIy*w1 - A22*sumI*sumIx - A12*sumW*w2 + A22*sumW*w1 + sumIx*sumIy*w2)* D; - invTensorMat.at(0,3) = (A22*dI*sumIx - A12*dI*sumIy - sumIx*w2*w2 + A12*sumI*w2 - A22*sumI*w1 + sumIy*w1*w2) * D; - invTensorMat.at(1,0) = invTensorMat.at(0,1); - invTensorMat.at(1,1) = (A11*sumI * sumI - 2*sumI*sumIx*w1 + dI*sumIx * sumIx + sumW*w1*w1 - A11*dI*sumW) * D; - invTensorMat.at(1,2) = (A12*sumI*sumIx - A11*sumI*sumIy - sumIx * sumIx*w2 + A11*sumW*w2 - A12*sumW*w1 + sumIx*sumIy*w1) * D; - invTensorMat.at(1,3) = (A11*dI*sumIy - sumIy*w1*w1 - A12*dI*sumIx - A11*sumI*w2 + A12*sumI*w1 + sumIx*w1*w2)* D; - invTensorMat.at(2,0) = invTensorMat.at(0,2); - invTensorMat.at(2,1) = invTensorMat.at(1,2); - invTensorMat.at(2,2) = (sumW*A12*A12 - 2*A12*sumIx*sumIy + A22*sumIx*sumIx + A11*sumIy*sumIy - A11*A22*sumW)* D; - invTensorMat.at(2,3) = (A11*A22*sumI - A12*A12*sumI - A11*sumIy*w2 + A12*sumIx*w2 + A12*sumIy*w1 - A22*sumIx*w1)* D; - 
invTensorMat.at(3,0) = invTensorMat.at(0,3); - invTensorMat.at(3,1) = invTensorMat.at(1,3); - invTensorMat.at(3,2) = invTensorMat.at(2,3); - invTensorMat.at(3,3) = (dI*A12*A12 - 2*A12*w1*w2 + A22*w1*w1 + A11*w2*w2 - A11*A22*dI)* D; + A11 *= FLT_SCALE; // 54866744. + A12 *= FLT_SCALE; // -628764.00 + A22 *= FLT_SCALE; // 19730.000 + + D = - A12*A12*sumI*sumI + dI*sumW*A12*A12 + 2*A12*sumI*sumIx*w2 + 2*A12*sumI*sumIy*w1 + - 2*dI*A12*sumIx*sumIy - 2*sumW*A12*w1*w2 + A11*A22*sumI*sumI - 2*A22*sumI*sumIx*w1 + - 2*A11*sumI*sumIy*w2 - sumIx*sumIx*w2*w2 + A22*dI*sumIx*sumIx + 2*sumIx*sumIy*w1*w2 + - sumIy*sumIy*w1*w1 + A11*dI*sumIy*sumIy + A22*sumW*w1*w1 + A11*sumW*w2*w2 - A11*A22*dI*sumW; + + float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + + 4.f*A12*A12))/(2*winArea); + if( minEig < minEigThreshold || std::abs(D) < FLT_EPSILON ) + { + if( level == 0 && status ) + status[ptidx] = 0; + if( level > 0) + { + nextPts[ptidx] = backUpNextPt; + gainVecs[ptidx] = backUpGain; } + noIteration++; + break; + } - #ifdef RLOF_SSE - float CV_DECL_ALIGNED(16) bbuf[4]; - for(int mmi = 0; mmi < 4; mmi++) - { + D = (1.f / D); + + invTensorMat.at(0,0) = (A22*sumI*sumI - 2*sumI*sumIy*w2 + dI*sumIy*sumIy + sumW*w2*w2 - A22*dI*sumW)* D; + invTensorMat.at(0,1) = (A12*dI*sumW - A12*sumI * sumI - dI*sumIx*sumIy + sumI*sumIx*w2 + sumI*sumIy*w1 - sumW*w1*w2)* D; + invTensorMat.at(0,2) = (A12*sumI*sumIy - sumIy*sumIy*w1 - A22*sumI*sumIx - A12*sumW*w2 + A22*sumW*w1 + sumIx*sumIy*w2)* D; + invTensorMat.at(0,3) = (A22*dI*sumIx - A12*dI*sumIy - sumIx*w2*w2 + A12*sumI*w2 - A22*sumI*w1 + sumIy*w1*w2) * D; + invTensorMat.at(1,0) = invTensorMat.at(0,1); + invTensorMat.at(1,1) = (A11*sumI * sumI - 2*sumI*sumIx*w1 + dI*sumIx * sumIx + sumW*w1*w1 - A11*dI*sumW) * D; + invTensorMat.at(1,2) = (A12*sumI*sumIx - A11*sumI*sumIy - sumIx * sumIx*w2 + A11*sumW*w2 - A12*sumW*w1 + sumIx*sumIy*w1) * D; + invTensorMat.at(1,3) = (A11*dI*sumIy - sumIy*w1*w1 - A12*dI*sumIx - A11*sumI*w2 + A12*sumI*w1 + 
sumIx*w1*w2)* D; + invTensorMat.at(2,0) = invTensorMat.at(0,2); + invTensorMat.at(2,1) = invTensorMat.at(1,2); + invTensorMat.at(2,2) = (sumW*A12*A12 - 2*A12*sumIx*sumIy + A22*sumIx*sumIx + A11*sumIy*sumIy - A11*A22*sumW)* D; + invTensorMat.at(2,3) = (A11*A22*sumI - A12*A12*sumI - A11*sumIy*w2 + A12*sumIx*w2 + A12*sumIy*w1 - A22*sumIx*w1)* D; + invTensorMat.at(3,0) = invTensorMat.at(0,3); + invTensorMat.at(3,1) = invTensorMat.at(1,3); + invTensorMat.at(3,2) = invTensorMat.at(2,3); + invTensorMat.at(3,3) = (dI*A12*A12 - 2*A12*w1*w2 + A22*w1*w1 + A11*w2*w2 - A11*A22*dI)* D; + } - _mm_store_ps(bbuf, _mm_add_ps(qb0[mmi], qb1[mmi])); - _b0[mmi] = bbuf[0] + bbuf[2]; - _b1[mmi] = bbuf[1] + bbuf[3]; - _mm_store_ps(bbuf, qb2[mmi]); - _b2[mmi] = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; - _mm_store_ps(bbuf, qb3[mmi]); - _b3[mmi] = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; - } - #endif - - _b0[0] *= FLT_SCALE;_b0[1] *= FLT_SCALE;_b0[2] *= FLT_SCALE;_b0[3] *= FLT_SCALE; - _b1[0] *= FLT_SCALE;_b1[1] *= FLT_SCALE;_b1[2] *= FLT_SCALE;_b1[3] *= FLT_SCALE; - _b2[0] *= FLT_SCALE;_b2[1] *= FLT_SCALE;_b2[2] *= FLT_SCALE;_b2[3] *= FLT_SCALE; - _b3[0] *= FLT_SCALE;_b3[1] *= FLT_SCALE;_b3[2] *= FLT_SCALE;_b3[3] *= FLT_SCALE; - - - Mc0[0] = _b0[0] - _b0[1] - _b0[2] + _b0[3]; - Mc0[1] = _b1[0] - _b1[1] - _b1[2] + _b1[3]; - Mc0[2] = -(_b2[0] - _b2[1] - _b2[2] + _b2[3]); - Mc0[3] = -(_b3[0] - _b3[1] - _b3[2] + _b3[3]); - - Mc1[0] = _b0[1] - _b0[3]; - Mc1[1] = _b1[1] - _b1[3]; - Mc1[2] = -(_b2[1] - _b2[3]); - Mc1[3] = -(_b3[1] - _b3[3]); - - - Mc2[0] = _b0[2] - _b0[3]; - Mc2[1] = _b1[2] - _b1[3]; - Mc2[2] = -(_b2[2] - _b2[3]); - Mc2[3] = -(_b3[2] - _b3[3]); - - - Mc3[0] = _b0[3]; - Mc3[1] = _b1[3]; - Mc3[2] = -_b2[3]; - Mc3[3] = -_b3[3]; - - // - c[0] = -Mc0[0]; - c[1] = -Mc1[0]; - c[2] = -Mc2[0]; - c[3] = -Mc3[0]; - c[4] = -Mc0[1]; - c[5] = -Mc1[1]; - c[6] = -Mc2[1]; - c[7] = -Mc3[1]; - - float e0 = 1.f / (c[6] * c[0] - c[4] * c[2]); - float e1 = e0 * 0.5f * (c[6] * c[1] + c[7] * c[0] - c[5] * 
c[2] - c[4] * c[3]); - float e2 = e0 * (c[1] * c[7] -c[3] * c[5]); - e0 = e1 * e1 - e2; - hasSolved = false; - if ( e0 > 0) - { - e0 = sqrt(e0); - float _y[2] = {-e1 - e0, e0 - e1}; - float c0yc1[2] = {c[0] * _y[0] + c[1], - c[0] * _y[1] + c[1]}; - float _x[2] = {-(c[2] * _y[0] + c[3]) / c0yc1[0], - -(c[2] * _y[1] + c[3]) / c0yc1[1]}; - bool isIn1 = (_x[0] >=0 && _x[0] <=1 && _y[0] >= 0 && _y[0] <= 1); - bool isIn2 = (_x[1] >=0 && _x[1] <=1 && _y[1] >= 0 && _y[1] <= 1); - - bool isSolution1 = checkSolution(_x[0], _y[0], c ); - bool isSolution2 = checkSolution(_x[1], _y[1], c ); - bool isSink1 = isIn1 && isSolution1; - bool isSink2 = isIn2 && isSolution2; - - if ( isSink1 != isSink2) - { - a = isSink1 ? _x[0] : _x[1]; - b = isSink1 ? _y[0] : _y[1]; - ab = a * b; - hasSolved = true; - delta.x = inextPt.x + a - nextPt.x; - delta.y = inextPt.y + b - nextPt.y; - - cv::Vec4f mismatchVec = ab * Mc0 + Mc1 *a + Mc2 * b + Mc3; - deltaGain = est_DeltaGain(invTensorMat, mismatchVec); +#ifdef RLOF_SSE + float CV_DECL_ALIGNED(16) bbuf[4]; + for(int mmi = 0; mmi < 4; mmi++) + { - } // isIn1 != isIn2 - } - if( hasSolved == false) - noIteration++; - } - else - { - hasSolved = false; - noReusedIteration++; - } - if( hasSolved == false ) - { + _mm_store_ps(bbuf, _mm_add_ps(qb0[mmi], qb1[mmi])); + _b0[mmi] = bbuf[0] + bbuf[2]; + _b1[mmi] = bbuf[1] + bbuf[3]; + _mm_store_ps(bbuf, qb2[mmi]); + _b2[mmi] = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; + _mm_store_ps(bbuf, qb3[mmi]); + _b3[mmi] = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; - cv::Vec4f mismatchVec = ab * Mc0 + Mc1 *a + Mc2 * b + Mc3; - est_Result(invTensorMat, mismatchVec, delta, deltaGain); + } +#endif - delta.x = MAX(-1.f, MIN( 1.f , delta.x)); - delta.y = MAX(-1.f, MIN( 1.f , delta.y)); + _b0[0] *= FLT_SCALE;_b0[1] *= FLT_SCALE;_b0[2] *= FLT_SCALE;_b0[3] *= FLT_SCALE; + _b1[0] *= FLT_SCALE;_b1[1] *= FLT_SCALE;_b1[2] *= FLT_SCALE;_b1[3] *= FLT_SCALE; + _b2[0] *= FLT_SCALE;_b2[1] *= FLT_SCALE;_b2[2] *= FLT_SCALE;_b2[3] *= FLT_SCALE; + 
_b3[0] *= FLT_SCALE;_b3[1] *= FLT_SCALE;_b3[2] *= FLT_SCALE;_b3[3] *= FLT_SCALE; + + + Mc0[0] = _b0[0] - _b0[1] - _b0[2] + _b0[3]; + Mc0[1] = _b1[0] - _b1[1] - _b1[2] + _b1[3]; + Mc0[2] = -(_b2[0] - _b2[1] - _b2[2] + _b2[3]); + Mc0[3] = -(_b3[0] - _b3[1] - _b3[2] + _b3[3]); + + Mc1[0] = _b0[1] - _b0[3]; + Mc1[1] = _b1[1] - _b1[3]; + Mc1[2] = -(_b2[1] - _b2[3]); + Mc1[3] = -(_b3[1] - _b3[3]); + + + Mc2[0] = _b0[2] - _b0[3]; + Mc2[1] = _b1[2] - _b1[3]; + Mc2[2] = -(_b2[2] - _b2[3]); + Mc2[3] = -(_b3[2] - _b3[3]); + + + Mc3[0] = _b0[3]; + Mc3[1] = _b1[3]; + Mc3[2] = -_b2[3]; + Mc3[3] = -_b3[3]; + + // + c[0] = -Mc0[0]; + c[1] = -Mc1[0]; + c[2] = -Mc2[0]; + c[3] = -Mc3[0]; + c[4] = -Mc0[1]; + c[5] = -Mc1[1]; + c[6] = -Mc2[1]; + c[7] = -Mc3[1]; + + float e0 = 1.f / (c[6] * c[0] - c[4] * c[2]); + float e1 = e0 * 0.5f * (c[6] * c[1] + c[7] * c[0] - c[5] * c[2] - c[4] * c[3]); + float e2 = e0 * (c[1] * c[7] -c[3] * c[5]); + e0 = e1 * e1 - e2; + hasSolved = false; + if ( e0 > 0) + { + e0 = sqrt(e0); + float _y[2] = {-e1 - e0, e0 - e1}; + float c0yc1[2] = {c[0] * _y[0] + c[1], + c[0] * _y[1] + c[1]}; + float _x[2] = {-(c[2] * _y[0] + c[3]) / c0yc1[0], + -(c[2] * _y[1] + c[3]) / c0yc1[1]}; + bool isIn1 = (_x[0] >=0 && _x[0] <=1 && _y[0] >= 0 && _y[0] <= 1); + bool isIn2 = (_x[1] >=0 && _x[1] <=1 && _y[1] >= 0 && _y[1] <= 1); + + bool isSolution1 = checkSolution(_x[0], _y[0], c ); + bool isSolution2 = checkSolution(_x[1], _y[1], c ); + bool isSink1 = isIn1 && isSolution1; + bool isSink2 = isIn2 && isSolution2; + + if ( isSink1 != isSink2) + { + a = isSink1 ? _x[0] : _x[1]; + b = isSink1 ? 
_y[0] : _y[1]; + ab = a * b; + hasSolved = true; + delta.x = inextPt.x + a - nextPt.x; + delta.y = inextPt.y + b - nextPt.y; + cv::Vec4f mismatchVec = ab * Mc0 + Mc1 *a + Mc2 * b + Mc3; + deltaGain = est_DeltaGain(invTensorMat, mismatchVec); - if( j == 0) - prevGain = deltaGain; - gainVec += deltaGain; - nextPt += delta ; - nextPts[ptidx] = nextPt + halfWin; - gainVecs[ptidx]= gainVec; + } // isIn1 != isIn2 + } + if( hasSolved == false) + noIteration++; + } + else + { + hasSolved = false; + noReusedIteration++; + } + if( hasSolved == false ) + { - } - else - { - nextPt += delta; - nextPts[ptidx] = nextPt + halfWin; - gainVecs[ptidx]= gainVec + deltaGain; - noSolvedIteration++; - break; - } + cv::Vec4f mismatchVec = ab * Mc0 + Mc1 *a + Mc2 * b + Mc3; + est_Result(invTensorMat, mismatchVec, delta, deltaGain); - if(j > 0 && ( - (std::abs(delta.x - prevDelta.x) < 0.01 && std::abs(delta.y - prevDelta.y) < 0.01) - || ((delta.ddot(delta) <= 0.001) && std::abs(prevGain.x - deltaGain.x) < 0.01))) - { - nextPts[ptidx] -= delta*0.5f; - gainVecs[ptidx] -= deltaGain* 0.5f; - break; - } + delta.x = MAX(-1.f, MIN( 1.f , delta.x)); + delta.y = MAX(-1.f, MIN( 1.f , delta.y)); - prevDelta = delta; + if( j == 0) prevGain = deltaGain; - } + gainVec += deltaGain; + nextPt += delta ; + nextPts[ptidx] = nextPt + halfWin; + gainVecs[ptidx]= gainVec; } + else + { + nextPt += delta; + nextPts[ptidx] = nextPt + halfWin; + gainVecs[ptidx]= gainVec + deltaGain; + noSolvedIteration++; + break; + } + if(j > 0 && ( + (std::abs(delta.x - prevDelta.x) < 0.01 && std::abs(delta.y - prevDelta.y) < 0.01) + || ((delta.ddot(delta) <= 0.001) && std::abs(prevGain.x - deltaGain.x) < 0.01))) + { + nextPts[ptidx] -= delta*0.5f; + gainVecs[ptidx] -= deltaGain* 0.5f; + break; + } + + + prevDelta = delta; + prevGain = deltaGain; } - const Mat* prevImg; - const Mat* nextImg; - const Mat* prevDeriv; - const Mat* rgbPrevImg; - const Mat* rgbNextImg; - const Point2f* prevPts; - Point2f* nextPts; - uchar* status; - 
cv::Point2f* gainVecs; // gain vector x -> multiplier y -> offset - float* err; - int maxWinSize; - int minWinSize; - TermCriteria criteria; - int level; - int maxLevel; - int windowType; - float minEigThreshold; - bool useInitialFlow; - std::vector paramReset; - int crossSegmentationThreshold; - - }; + } + } -} -namespace beplk + + const Mat* prevImg; + const Mat* nextImg; + const Mat* prevDeriv; + const Mat* rgbPrevImg; + const Mat* rgbNextImg; + const Point2f* prevPts; + Point2f* nextPts; + uchar* status; + cv::Point2f* gainVecs; // gain vector x -> multiplier y -> offset + float* err; + int maxWinSize; + int minWinSize; + TermCriteria criteria; + int level; + int maxLevel; + int windowType; + float minEigThreshold; + bool useInitialFlow; + std::vector paramReset; + int crossSegmentationThreshold; +}; + +}} // namespace +namespace beplk { +namespace ica { + +class TrackerInvoker : public cv::ParallelLoopBody { - namespace ica{ - class TrackerInvoker : public cv::ParallelLoopBody +public: + TrackerInvoker( + const Mat& _prevImg, + const Mat& _prevDeriv, + const Mat& _nextImg, + const Mat& _rgbPrevImg, + const Mat& _rgbNextImg, + const Point2f* _prevPts, + Point2f* _nextPts, + uchar* _status, + float* _err, + int _level, + int _maxLevel, + int _winSize[2], + int _maxIteration, + bool _useInitialFlow, + int _supportRegionType, + int _crossSegmentationThreshold, + float _minEigenValue) { - public: - TrackerInvoker(const Mat& _prevImg, - const Mat& _prevDeriv, - const Mat& _nextImg, - const Mat& _rgbPrevImg, - const Mat& _rgbNextImg, - const Point2f* _prevPts, - Point2f* _nextPts, - uchar* _status, - float* _err, - int _level, - int _maxLevel, - int _winSize[2], - int _maxIteration, - bool _useInitialFlow, - int _supportRegionType, - int _crossSegmentationThreshold, - float _minEigenValue) - { - prevImg = &_prevImg; - prevDeriv = &_prevDeriv; - nextImg = &_nextImg; - rgbPrevImg = &_rgbPrevImg; - rgbNextImg = &_rgbNextImg; - prevPts = _prevPts; - nextPts = _nextPts; - 
status = _status; - err = _err; - minWinSize = _winSize[0]; - maxWinSize = _winSize[1]; - criteria.maxCount = _maxIteration; - criteria.epsilon = 0.01; - level = _level; - maxLevel = _maxLevel; - windowType = _supportRegionType; - minEigThreshold = _minEigenValue; - useInitialFlow = _useInitialFlow; - crossSegmentationThreshold = _crossSegmentationThreshold; - } + prevImg = &_prevImg; + prevDeriv = &_prevDeriv; + nextImg = &_nextImg; + rgbPrevImg = &_rgbPrevImg; + rgbNextImg = &_rgbNextImg; + prevPts = _prevPts; + nextPts = _nextPts; + status = _status; + err = _err; + minWinSize = _winSize[0]; + maxWinSize = _winSize[1]; + criteria.maxCount = _maxIteration; + criteria.epsilon = 0.01; + level = _level; + maxLevel = _maxLevel; + windowType = _supportRegionType; + minEigThreshold = _minEigenValue; + useInitialFlow = _useInitialFlow; + crossSegmentationThreshold = _crossSegmentationThreshold; + } - TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; + TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; - void operator()(const cv::Range& range) const CV_OVERRIDE - { - cv::Size winSize; - cv::Point2f halfWin; + void operator()(const cv::Range& range) const CV_OVERRIDE + { + cv::Size winSize; + cv::Point2f halfWin; - const Mat& I = *prevImg; - const Mat& J = *nextImg; - const Mat& derivI = *prevDeriv; - const Mat& BI = *rgbPrevImg; + const Mat& I = *prevImg; + const Mat& J = *nextImg; + const Mat& derivI = *prevDeriv; + const Mat& BI = *rgbPrevImg; - winSize = cv::Size(maxWinSize,maxWinSize); - int winMaskwidth = static_cast(ceil(winSize.width / 8.f)) * 16; - cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); - winMaskMatBuf.setTo(1); + winSize = cv::Size(maxWinSize,maxWinSize); + int winMaskwidth = static_cast(ceil(winSize.width / 8.f)) * 16; + cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); + winMaskMatBuf.setTo(1); - const float FLT_SCALE = (1.f/(1 << 20)); // 20 + const float FLT_SCALE = (1.f/(1 << 20)); // 
20 - int j, cn = I.channels(), cn2 = cn*2; - int winbufwidth = static_cast(ceil(winSize.width / 8.f)) * 8; - cv::Size winBufSize(winbufwidth,winbufwidth); + int j, cn = I.channels(), cn2 = cn*2; + int winbufwidth = static_cast(ceil(winSize.width / 8.f)) * 8; + cv::Size winBufSize(winbufwidth,winbufwidth); - std::vector _buf(winBufSize.area()*(cn + cn2)); - Mat IWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn), &_buf[0]); - Mat derivIWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn2), &_buf[winBufSize.area()*cn]); + std::vector _buf(winBufSize.area()*(cn + cn2)); + Mat IWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn), &_buf[0]); + Mat derivIWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn2), &_buf[winBufSize.area()*cn]); - for( int ptidx = range.start; ptidx < range.end; ptidx++ ) + for( int ptidx = range.start; ptidx < range.end; ptidx++ ) + { + Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level)); + Point2f nextPt; + if( level == maxLevel ) { - Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level)); - Point2f nextPt; - if( level == maxLevel ) - { - if( useInitialFlow ) - nextPt = nextPts[ptidx]*(float)(1./(1 << level)); - else - nextPt = prevPt; - } + if( useInitialFlow ) + nextPt = nextPts[ptidx]*(float)(1./(1 << level)); else - nextPt = nextPts[ptidx]*2.f; - nextPts[ptidx] = nextPt; - - Point2i iprevPt, inextPt; - iprevPt.x = cvFloor(prevPt.x); - iprevPt.y = cvFloor(prevPt.y); - int winArea = maxWinSize * maxWinSize; - cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0,0, maxWinSize,maxWinSize)); - - if( calcWinMaskMat(BI, windowType, iprevPt, - winMaskMat,winSize,halfWin,winArea, - minWinSize,maxWinSize, crossSegmentationThreshold) == false) - continue; - - prevPt -= halfWin; - iprevPt.x = cvFloor(prevPt.x); - iprevPt.y = cvFloor(prevPt.y); - - if( iprevPt.x + halfWin.x < 0 || iprevPt.x + halfWin.x>= derivI.cols || - iprevPt.y + halfWin.y < 0 || iprevPt.y + halfWin.y >= derivI.rows ) + nextPt = prevPt; + } + else + nextPt = nextPts[ptidx]*2.f; + nextPts[ptidx] = nextPt; + + Point2i 
iprevPt, inextPt; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + int winArea = maxWinSize * maxWinSize; + cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0,0, maxWinSize,maxWinSize)); + + if( calcWinMaskMat(BI, windowType, iprevPt, + winMaskMat,winSize,halfWin,winArea, + minWinSize,maxWinSize, crossSegmentationThreshold) == false) + continue; + + prevPt -= halfWin; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + + if( iprevPt.x + halfWin.x < 0 || iprevPt.x + halfWin.x>= derivI.cols || + iprevPt.y + halfWin.y < 0 || iprevPt.y + halfWin.y >= derivI.rows ) + { + if( level == 0 ) { - if( level == 0 ) - { - if( status ) - status[ptidx] = 3; - if( err ) - err[ptidx] = 0; - } - continue; + if( status ) + status[ptidx] = 3; + if( err ) + err[ptidx] = 0; } + continue; + } - float a = prevPt.x - iprevPt.x; - float b = prevPt.y - iprevPt.y; - const int W_BITS = 14, W_BITS1 = 14; - - int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); - int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); - int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); - int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - - int dstep = (int)(derivI.step/derivI.elemSize1()); - int step = (int)(I.step/I.elemSize1()); - int mStep = (int)(winMaskMat.step/winMaskMat.elemSize1()); - CV_Assert( step == (int)(J.step/J.elemSize1()) ); - float A11 = 0, A12 = 0, A22 = 0; - - #ifdef RLOF_SSE - __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); - __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); - __m128i z = _mm_setzero_si128(); - __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1-1)); - __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1-5-1)); - __m128 qA11 = _mm_setzero_ps(), qA12 = _mm_setzero_ps(), qA22 = _mm_setzero_ps(); - __m128i mmMask4_epi32; - get4BitMask(winSize.width, mmMask4_epi32); - #endif - - - int x, y; - for( y = 0; y < winSize.height; y++ ) - { - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - const uchar* src = (const uchar*)I.data + (y + 
iprevPt.y)*step + iprevPt.x*cn; - const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; + float a = prevPt.x - iprevPt.x; + float b = prevPt.y - iprevPt.y; + const int W_BITS = 14, W_BITS1 = 14; - short* Iptr = (short*)(IWinBuf.data + y*IWinBuf.step); - short* dIptr = (short*)(derivIWinBuf.data + y*derivIWinBuf.step); + int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - x = 0; + int dstep = (int)(derivI.step/derivI.elemSize1()); + int step = (int)(I.step/I.elemSize1()); + int mStep = (int)(winMaskMat.step/winMaskMat.elemSize1()); + CV_Assert( step == (int)(J.step/J.elemSize1()) ); + float A11 = 0, A12 = 0, A22 = 0; - #ifdef RLOF_SSE - for( ; x < winSize.width*cn; x += 4, dsrc += 4*2, dIptr += 4*2 ) - { - __m128i wMask = _mm_set_epi32(MaskSet * maskPtr[x+3], - MaskSet * maskPtr[x+2], - MaskSet * maskPtr[x+1], - MaskSet * maskPtr[x]); - __m128i v00, v01, v10, v11, t0, t1; - v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); - v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); - v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); - v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); - - t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); +#ifdef RLOF_SSE + __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128i z = _mm_setzero_si128(); + __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1-1)); + __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1-5-1)); + __m128 qA11 = _mm_setzero_ps(), qA12 = _mm_setzero_ps(), qA22 = _mm_setzero_ps(); + __m128i mmMask4_epi32; + get4BitMask(winSize.width, mmMask4_epi32); +#endif - t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), 
W_BITS1-5); - _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0,t0)); - - v00 = _mm_loadu_si128((const __m128i*)(dsrc)); - v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); - v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep)); - v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); - - t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); - t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); - t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); - t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); - v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... - - if( x + 4 > winSize.width) - { - v00 = _mm_and_si128(v00, mmMask4_epi32); - } - v00 = _mm_and_si128(v00, wMask); - _mm_storeu_si128((__m128i*)dIptr, v00); - t0 = _mm_srai_epi32(v00, 16); // Iy0 Iy1 Iy2 Iy3 - t1 = _mm_srai_epi32(_mm_slli_epi32(v00, 16), 16); // Ix0 Ix1 Ix2 Ix3 + int x, y; + for( y = 0; y < winSize.height; y++ ) + { + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; + const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; - __m128 fy = _mm_cvtepi32_ps(t0); - __m128 fx = _mm_cvtepi32_ps(t1); + short* Iptr = (short*)(IWinBuf.data + y*IWinBuf.step); + short* dIptr = (short*)(derivIWinBuf.data + y*derivIWinBuf.step); - qA22 = _mm_add_ps(qA22, _mm_mul_ps(fy, fy)); - qA12 = _mm_add_ps(qA12, _mm_mul_ps(fx, fy)); - qA11 = _mm_add_ps(qA11, _mm_mul_ps(fx, fx)); - } - #else + x = 0; - for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2) +#ifdef RLOF_SSE + for( ; x < winSize.width*cn; x += 4, dsrc += 4*2, dIptr += 4*2 ) + { + __m128i wMask = _mm_set_epi32(MaskSet * maskPtr[x+3], + MaskSet * maskPtr[x+2], + MaskSet * maskPtr[x+1], + MaskSet * maskPtr[x]); + __m128i v00, v01, v10, v11, t0, t1; + v00 
= _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); + v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); + v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); + v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); + _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0,t0)); + + v00 = _mm_loadu_si128((const __m128i*)(dsrc)); + v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); + v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep)); + v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); + v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... 
+ + if( x + 4 > winSize.width) { - if( maskPtr[x] == 0) - { - dIptr[0] = (short)0; - dIptr[1] = (short)0; - - continue; - } - int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + - src[x+step]*iw10 + src[x+step+cn]*iw11, W_BITS1-5); - int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + - dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1); - int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 + - dsrc[dstep+cn2+1]*iw11, W_BITS1); - - Iptr[x] = (short)ival; - dIptr[0] = (short)ixval; - dIptr[1] = (short)iyval; - A11 += (float)(ixval*ixval); - A12 += (float)(ixval*iyval); - A22 += (float)(iyval*iyval); - + v00 = _mm_and_si128(v00, mmMask4_epi32); } + v00 = _mm_and_si128(v00, wMask); + _mm_storeu_si128((__m128i*)dIptr, v00); + t0 = _mm_srai_epi32(v00, 16); // Iy0 Iy1 Iy2 Iy3 + t1 = _mm_srai_epi32(_mm_slli_epi32(v00, 16), 16); // Ix0 Ix1 Ix2 Ix3 - #endif + __m128 fy = _mm_cvtepi32_ps(t0); + __m128 fx = _mm_cvtepi32_ps(t1); + qA22 = _mm_add_ps(qA22, _mm_mul_ps(fy, fy)); + qA12 = _mm_add_ps(qA12, _mm_mul_ps(fx, fy)); + qA11 = _mm_add_ps(qA11, _mm_mul_ps(fx, fx)); } +#else - #ifdef RLOF_SSE - float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4]; - _mm_store_ps(A11buf, qA11); - _mm_store_ps(A12buf, qA12); - _mm_store_ps(A22buf, qA22); - A11 += A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; - A12 += A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; - A22 += A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; - #endif - - A11 *= FLT_SCALE; - A12 *= FLT_SCALE; - A22 *= FLT_SCALE; - - - float D = A11*A22 - A12*A12; - float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + - 4.f*A12*A12))/(2 * winArea); + for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2) + { + if( maskPtr[x] == 0) + { + dIptr[0] = (short)0; + dIptr[1] = (short)0; - if( err ) - err[ptidx] = (float)minEig; + continue; + } + int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + + src[x+step]*iw10 + src[x+step+cn]*iw11, W_BITS1-5); + int ixval = CV_DESCALE(dsrc[0]*iw00 + 
dsrc[cn2]*iw01 + + dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 + + dsrc[dstep+cn2+1]*iw11, W_BITS1); + + Iptr[x] = (short)ival; + dIptr[0] = (short)ixval; + dIptr[1] = (short)iyval; + A11 += (float)(ixval*ixval); + A12 += (float)(ixval*iyval); + A22 += (float)(iyval*iyval); - if( D < FLT_EPSILON ) - { - if( level == 0 && status ) - status[ptidx] = 0; - continue; } - D = 1.f/D; - - cv::Point2f backUpNextPt = nextPt; - nextPt -= halfWin; - Point2f prevDelta(0,0); - float c[8]; -#ifdef RLOF_SSE - __m128i mmMask0, mmMask1, mmMask; - getWBitMask(winSize.width, mmMask0, mmMask1, mmMask); #endif - for( j = 0; j < criteria.maxCount; j++ ) - { - cv::Point2f delta; - bool hasSolved = false; - a = nextPt.x - cvFloor(nextPt.x); - b = nextPt.y - cvFloor(nextPt.y); - float ab = a * b; - - - if( (inextPt.x != cvFloor(nextPt.x) || inextPt.y != cvFloor(nextPt.y) || j == 0)) - { - inextPt.x = cvFloor(nextPt.x); - inextPt.y = cvFloor(nextPt.y); - - - if( inextPt.x < -winSize.width || inextPt.x >= J.cols || - inextPt.y < -winSize.height || inextPt.y >= J.rows ) - { - if( level == 0 && status ) - status[ptidx] = 3; - if (level > 0) - nextPts[ptidx] = backUpNextPt; - break; - } + } - iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); - iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); - iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); - iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; +#ifdef RLOF_SSE + float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4]; + _mm_store_ps(A11buf, qA11); + _mm_store_ps(A12buf, qA12); + _mm_store_ps(A22buf, qA22); + A11 += A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; + A12 += A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; + A22 += A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; +#endif - float _b1[4] = {0,0,0,0}; - float _b2[4] = {0,0,0,0}; - #ifdef RLOF_SSE - __m128 qbc0[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; - __m128 qbc1[4] = 
{_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; - qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); - qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); - #endif - for( y = 0; y < winSize.height; y++ ) - { - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x )*cn; - const short* Iptr = (const short*)(IWinBuf.data + ( y)*IWinBuf.step); - const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); - x = 0; - #ifdef RLOF_SSE - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - for( ; x <= winSize.width*cn; x += 8, dIptr += 8*2 ) - { - if( maskPtr[x ] == 0 && maskPtr[x+1] == 0 && maskPtr[x+2] == 0 && maskPtr[x+3] == 0 - && maskPtr[x+4] == 0 && maskPtr[x+5] == 0 && maskPtr[x+6] == 0 && maskPtr[x+7] == 0) - continue; - __m128i diff0 = _mm_loadu_si128((const __m128i*)(Iptr + x)); - __m128i v00 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x)), z); - __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); - __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); - __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); - - __m128i mmDiffc_epi16[4] = - { _mm_subs_epi16(_mm_slli_epi16(v00, 5), diff0), - _mm_subs_epi16(_mm_slli_epi16(v01, 5), diff0), - _mm_subs_epi16(_mm_slli_epi16(v10, 5), diff0), - _mm_subs_epi16(_mm_slli_epi16(v11, 5), diff0) - }; - - __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... 
- __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); - - if( x > winSize.width*cn - 8) - { - Ixy_0 = _mm_and_si128(Ixy_0, mmMask0); - Ixy_1 = _mm_and_si128(Ixy_1, mmMask1); - } + A11 *= FLT_SCALE; + A12 *= FLT_SCALE; + A22 *= FLT_SCALE; - __m128i diffc1[4] = - {_mm_unpackhi_epi16(mmDiffc_epi16[0],mmDiffc_epi16[0]), - _mm_unpackhi_epi16(mmDiffc_epi16[1],mmDiffc_epi16[1]), - _mm_unpackhi_epi16(mmDiffc_epi16[2],mmDiffc_epi16[2]), - _mm_unpackhi_epi16(mmDiffc_epi16[3],mmDiffc_epi16[3]) - }; - __m128i diffc0[4] = - {_mm_unpacklo_epi16(mmDiffc_epi16[0],mmDiffc_epi16[0]), - _mm_unpacklo_epi16(mmDiffc_epi16[1],mmDiffc_epi16[1]), - _mm_unpacklo_epi16(mmDiffc_epi16[2],mmDiffc_epi16[2]), - _mm_unpacklo_epi16(mmDiffc_epi16[3],mmDiffc_epi16[3]) - }; + float D = A11*A22 - A12*A12; + float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + + 4.f*A12*A12))/(2 * winArea); + if( err ) + err[ptidx] = (float)minEig; - // It * Ix It * Iy [0 ... 3] - //mask split for multiplication - // for set 1 lo sigma 1 + if( D < FLT_EPSILON ) + { + if( level == 0 && status ) + status[ptidx] = 0; + continue; + } + D = 1.f/D; - v10 = _mm_mullo_epi16(Ixy_0, diffc0[0]); - v11 = _mm_mulhi_epi16(Ixy_0, diffc0[0]); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qbc0[0] = _mm_add_ps(qbc0[0], _mm_cvtepi32_ps(v00)); - qbc1[0] = _mm_add_ps(qbc1[0], _mm_cvtepi32_ps(v10)); + cv::Point2f backUpNextPt = nextPt; + nextPt -= halfWin; + Point2f prevDelta(0,0); - v10 = _mm_mullo_epi16(Ixy_0, diffc0[1]); - v11 = _mm_mulhi_epi16(Ixy_0, diffc0[1]); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qbc0[1] = _mm_add_ps(qbc0[1], _mm_cvtepi32_ps(v00)); - qbc1[1] = _mm_add_ps(qbc1[1], _mm_cvtepi32_ps(v10)); + float c[8]; +#ifdef RLOF_SSE + __m128i mmMask0, mmMask1, mmMask; + getWBitMask(winSize.width, mmMask0, mmMask1, mmMask); +#endif + for( j = 0; j < criteria.maxCount; j++ ) + { + cv::Point2f delta; + bool hasSolved = false; + a = nextPt.x - 
cvFloor(nextPt.x); + b = nextPt.y - cvFloor(nextPt.y); + float ab = a * b; - v10 = _mm_mullo_epi16(Ixy_0, diffc0[2]); - v11 = _mm_mulhi_epi16(Ixy_0, diffc0[2]); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qbc0[2] = _mm_add_ps(qbc0[2], _mm_cvtepi32_ps(v00)); - qbc1[2] = _mm_add_ps(qbc1[2], _mm_cvtepi32_ps(v10)); - v10 = _mm_mullo_epi16(Ixy_0, diffc0[3]); - v11 = _mm_mulhi_epi16(Ixy_0, diffc0[3]); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qbc0[3] = _mm_add_ps(qbc0[3], _mm_cvtepi32_ps(v00)); - qbc1[3] = _mm_add_ps(qbc1[3], _mm_cvtepi32_ps(v10)); - // It * Ix It * Iy [4 ... 7] - // for set 1 hi sigma 1 + if( (inextPt.x != cvFloor(nextPt.x) || inextPt.y != cvFloor(nextPt.y) || j == 0)) + { + inextPt.x = cvFloor(nextPt.x); + inextPt.y = cvFloor(nextPt.y); - v10 = _mm_mullo_epi16(Ixy_1, diffc1[0]); - v11 = _mm_mulhi_epi16(Ixy_1, diffc1[0]); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qbc0[0] = _mm_add_ps(qbc0[0], _mm_cvtepi32_ps(v00)); - qbc1[0] = _mm_add_ps(qbc1[0], _mm_cvtepi32_ps(v10)); - v10 = _mm_mullo_epi16(Ixy_1, diffc1[1]); - v11 = _mm_mulhi_epi16(Ixy_1, diffc1[1]); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qbc0[1] = _mm_add_ps(qbc0[1], _mm_cvtepi32_ps(v00)); - qbc1[1] = _mm_add_ps(qbc1[1], _mm_cvtepi32_ps(v10)); + if( inextPt.x < -winSize.width || inextPt.x >= J.cols || + inextPt.y < -winSize.height || inextPt.y >= J.rows ) + { + if( level == 0 && status ) + status[ptidx] = 3; + if (level > 0) + nextPts[ptidx] = backUpNextPt; + break; + } - v10 = _mm_mullo_epi16(Ixy_1, diffc1[2]); - v11 = _mm_mulhi_epi16(Ixy_1, diffc1[2]); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qbc0[2] = _mm_add_ps(qbc0[2], _mm_cvtepi32_ps(v00)); - qbc1[2] = _mm_add_ps(qbc1[2], _mm_cvtepi32_ps(v10)); - v10 = _mm_mullo_epi16(Ixy_1, diffc1[3]); - v11 = _mm_mulhi_epi16(Ixy_1, diffc1[3]); - v00 = _mm_unpacklo_epi16(v10, 
v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qbc0[3] = _mm_add_ps(qbc0[3], _mm_cvtepi32_ps(v00)); - qbc1[3] = _mm_add_ps(qbc1[3], _mm_cvtepi32_ps(v10)); + iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - } - #else - for( ; x < winSize.width*cn; x++, dIptr += 2 ) + float _b1[4] = {0,0,0,0}; + float _b2[4] = {0,0,0,0}; +#ifdef RLOF_SSE + __m128 qbc0[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; + __m128 qbc1[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; + qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); +#endif + for( y = 0; y < winSize.height; y++ ) + { + const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x )*cn; + const short* Iptr = (const short*)(IWinBuf.data + ( y)*IWinBuf.step); + const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); + x = 0; +#ifdef RLOF_SSE + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + for( ; x <= winSize.width*cn; x += 8, dIptr += 8*2 ) + { + if( maskPtr[x ] == 0 && maskPtr[x+1] == 0 && maskPtr[x+2] == 0 && maskPtr[x+3] == 0 + && maskPtr[x+4] == 0 && maskPtr[x+5] == 0 && maskPtr[x+6] == 0 && maskPtr[x+7] == 0) + continue; + __m128i diff0 = _mm_loadu_si128((const __m128i*)(Iptr + x)); + __m128i v00 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x)), z); + __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); + __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); + __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); + + __m128i mmDiffc_epi16[4] = + { _mm_subs_epi16(_mm_slli_epi16(v00, 5), diff0), + _mm_subs_epi16(_mm_slli_epi16(v01, 5), diff0), + _mm_subs_epi16(_mm_slli_epi16(v10, 5), diff0), + 
_mm_subs_epi16(_mm_slli_epi16(v11, 5), diff0) + }; + + __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... + __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); + + if( x > winSize.width*cn - 8) { - if( dIptr[0] == 0 && dIptr[1] == 0) - continue; - /*short It[4] = {CV_DESCALE((Jptr[x]) *(1 << W_BITS),W_BITS1-5) - Iptr[x], - CV_DESCALE((Jptr[x+cn]) *(1 << W_BITS),W_BITS1-5) - Iptr[x], - CV_DESCALE((Jptr[x+step]) *(1 << W_BITS),W_BITS1-5) - Iptr[x], - CV_DESCALE((Jptr[x+step+cn])*(1 << W_BITS),W_BITS1-5) - Iptr[x]};*/ - short It[4] = {(Jptr[x] << 5) - Iptr[x], - (Jptr[x+cn]<< 5) - Iptr[x], - (Jptr[x+step]<< 5) - Iptr[x], - (Jptr[x+step+cn]<< 5) - Iptr[x]}; - - _b1[0] += (float)(It[0]*dIptr[0]); - _b1[1] += (float)(It[1]*dIptr[0]); - _b1[2] += (float)(It[2]*dIptr[0]); - _b1[3] += (float)(It[3]*dIptr[0]); - - _b2[0] += (float)(It[0]*dIptr[1]); - _b2[1] += (float)(It[1]*dIptr[1]); - _b2[2] += (float)(It[2]*dIptr[1]); - _b2[3] += (float)(It[3]*dIptr[1]); + Ixy_0 = _mm_and_si128(Ixy_0, mmMask0); + Ixy_1 = _mm_and_si128(Ixy_1, mmMask1); } - #endif + __m128i diffc1[4] = + {_mm_unpackhi_epi16(mmDiffc_epi16[0],mmDiffc_epi16[0]), + _mm_unpackhi_epi16(mmDiffc_epi16[1],mmDiffc_epi16[1]), + _mm_unpackhi_epi16(mmDiffc_epi16[2],mmDiffc_epi16[2]), + _mm_unpackhi_epi16(mmDiffc_epi16[3],mmDiffc_epi16[3]) + }; + + __m128i diffc0[4] = + {_mm_unpacklo_epi16(mmDiffc_epi16[0],mmDiffc_epi16[0]), + _mm_unpacklo_epi16(mmDiffc_epi16[1],mmDiffc_epi16[1]), + _mm_unpacklo_epi16(mmDiffc_epi16[2],mmDiffc_epi16[2]), + _mm_unpacklo_epi16(mmDiffc_epi16[3],mmDiffc_epi16[3]) + }; + + + // It * Ix It * Iy [0 ... 
3] + //mask split for multiplication + // for set 1 lo sigma 1 + + + v10 = _mm_mullo_epi16(Ixy_0, diffc0[0]); + v11 = _mm_mulhi_epi16(Ixy_0, diffc0[0]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[0] = _mm_add_ps(qbc0[0], _mm_cvtepi32_ps(v00)); + qbc1[0] = _mm_add_ps(qbc1[0], _mm_cvtepi32_ps(v10)); + + v10 = _mm_mullo_epi16(Ixy_0, diffc0[1]); + v11 = _mm_mulhi_epi16(Ixy_0, diffc0[1]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[1] = _mm_add_ps(qbc0[1], _mm_cvtepi32_ps(v00)); + qbc1[1] = _mm_add_ps(qbc1[1], _mm_cvtepi32_ps(v10)); + + v10 = _mm_mullo_epi16(Ixy_0, diffc0[2]); + v11 = _mm_mulhi_epi16(Ixy_0, diffc0[2]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[2] = _mm_add_ps(qbc0[2], _mm_cvtepi32_ps(v00)); + qbc1[2] = _mm_add_ps(qbc1[2], _mm_cvtepi32_ps(v10)); + + v10 = _mm_mullo_epi16(Ixy_0, diffc0[3]); + v11 = _mm_mulhi_epi16(Ixy_0, diffc0[3]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[3] = _mm_add_ps(qbc0[3], _mm_cvtepi32_ps(v00)); + qbc1[3] = _mm_add_ps(qbc1[3], _mm_cvtepi32_ps(v10)); + // It * Ix It * Iy [4 ... 
7] + // for set 1 hi sigma 1 + + v10 = _mm_mullo_epi16(Ixy_1, diffc1[0]); + v11 = _mm_mulhi_epi16(Ixy_1, diffc1[0]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[0] = _mm_add_ps(qbc0[0], _mm_cvtepi32_ps(v00)); + qbc1[0] = _mm_add_ps(qbc1[0], _mm_cvtepi32_ps(v10)); + + v10 = _mm_mullo_epi16(Ixy_1, diffc1[1]); + v11 = _mm_mulhi_epi16(Ixy_1, diffc1[1]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[1] = _mm_add_ps(qbc0[1], _mm_cvtepi32_ps(v00)); + qbc1[1] = _mm_add_ps(qbc1[1], _mm_cvtepi32_ps(v10)); + + v10 = _mm_mullo_epi16(Ixy_1, diffc1[2]); + v11 = _mm_mulhi_epi16(Ixy_1, diffc1[2]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[2] = _mm_add_ps(qbc0[2], _mm_cvtepi32_ps(v00)); + qbc1[2] = _mm_add_ps(qbc1[2], _mm_cvtepi32_ps(v10)); + + v10 = _mm_mullo_epi16(Ixy_1, diffc1[3]); + v11 = _mm_mulhi_epi16(Ixy_1, diffc1[3]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[3] = _mm_add_ps(qbc0[3], _mm_cvtepi32_ps(v00)); + qbc1[3] = _mm_add_ps(qbc1[3], _mm_cvtepi32_ps(v10)); + + } +#else + for( ; x < winSize.width*cn; x++, dIptr += 2 ) + { + if( dIptr[0] == 0 && dIptr[1] == 0) + continue; + /*short It[4] = {CV_DESCALE((Jptr[x]) *(1 << W_BITS),W_BITS1-5) - Iptr[x], + CV_DESCALE((Jptr[x+cn]) *(1 << W_BITS),W_BITS1-5) - Iptr[x], + CV_DESCALE((Jptr[x+step]) *(1 << W_BITS),W_BITS1-5) - Iptr[x], + CV_DESCALE((Jptr[x+step+cn])*(1 << W_BITS),W_BITS1-5) - Iptr[x]};*/ + short It[4] = {(Jptr[x] << 5) - Iptr[x], + (Jptr[x+cn]<< 5) - Iptr[x], + (Jptr[x+step]<< 5) - Iptr[x], + (Jptr[x+step+cn]<< 5) - Iptr[x]}; + + _b1[0] += (float)(It[0]*dIptr[0]); + _b1[1] += (float)(It[1]*dIptr[0]); + _b1[2] += (float)(It[2]*dIptr[0]); + _b1[3] += (float)(It[3]*dIptr[0]); + + _b2[0] += (float)(It[0]*dIptr[1]); + _b2[1] += (float)(It[1]*dIptr[1]); + _b2[2] += (float)(It[2]*dIptr[1]); + _b2[3] += (float)(It[3]*dIptr[1]); } +#endif - #ifdef RLOF_SSE - float 
CV_DECL_ALIGNED(16) bbuf[4]; - _mm_store_ps(bbuf, _mm_add_ps(qbc0[0], qbc1[0])); - _b1[0] += bbuf[0] + bbuf[2]; - _b2[0] += bbuf[1] + bbuf[3]; - - _mm_store_ps(bbuf, _mm_add_ps(qbc0[1], qbc1[1])); - _b1[1] += bbuf[0] + bbuf[2]; - _b2[1] += bbuf[1] + bbuf[3]; - - _mm_store_ps(bbuf, _mm_add_ps(qbc0[2], qbc1[2])); - _b1[2] += bbuf[0] + bbuf[2]; - _b2[2] += bbuf[1] + bbuf[3]; - - _mm_store_ps(bbuf, _mm_add_ps(qbc0[3], qbc1[3])); - _b1[3] += bbuf[0] + bbuf[2]; - _b2[3] += bbuf[1] + bbuf[3]; - - #endif - _b1[0] *= FLT_SCALE; - _b1[1] *= FLT_SCALE; - _b1[2] *= FLT_SCALE; - _b1[3] *= FLT_SCALE; - _b2[0] *= FLT_SCALE; - _b2[1] *= FLT_SCALE; - _b2[2] *= FLT_SCALE; - _b2[3] *= FLT_SCALE; - - float c0[4] = { _b1[3] + _b1[0] - _b1[2] - _b1[1], - _b1[1] - _b1[0], - _b1[2] - _b1[0], - _b1[0]}; - float c1[4] = { _b2[3] + _b2[0] - _b2[2] - _b2[1], - _b2[1] - _b2[0], - _b2[2] - _b2[0], - _b2[0]}; - - float DA12 = A12 * D ; - float DA22 = A22 * D ; - float DA11 = A11 * D ; - c[0] = DA12 * c1[0] - DA22 * c0[0]; - c[1] = DA12 * c1[1] - DA22 * c0[1]; - c[2] = DA12 * c1[2] - DA22 * c0[2]; - c[3] = DA12 * c1[3] - DA22 * c0[3]; - c[4] = DA12 * c0[0] - DA11 * c1[0]; - c[5] = DA12 * c0[1] - DA11 * c1[1]; - c[6] = DA12 * c0[2] - DA11 * c1[2]; - c[7] = DA12 * c0[3] - DA11 * c1[3]; + } - float e0 = 1.f / (c[6] * c[0] - c[4] * c[2]); - float e1 = e0 * 0.5f * (c[6] * c[1] + c[7] * c[0] - c[5] * c[2] - c[4] * c[3]); - float e2 = e0 * (c[1] * c[7] -c[3] * c[5]); - e0 = e1 * e1 - e2; - if ( e0 >= 0) - { - e0 = sqrt(e0); - float _y[2] = {-e1 - e0, e0 - e1}; - float c0yc1[2] = {c[0] * _y[0] + c[1], - c[0] * _y[1] + c[1]}; +#ifdef RLOF_SSE + float CV_DECL_ALIGNED(16) bbuf[4]; + _mm_store_ps(bbuf, _mm_add_ps(qbc0[0], qbc1[0])); + _b1[0] += bbuf[0] + bbuf[2]; + _b2[0] += bbuf[1] + bbuf[3]; - float _x[2] = {-(c[2] * _y[0] + c[3]) / c0yc1[0], - -(c[2] * _y[1] + c[3]) / c0yc1[1]}; + _mm_store_ps(bbuf, _mm_add_ps(qbc0[1], qbc1[1])); + _b1[1] += bbuf[0] + bbuf[2]; + _b2[1] += bbuf[1] + bbuf[3]; - bool isIn1 = 
(_x[0] >=0 && _x[0] <=1 && _y[0] >= 0 && _y[0] <= 1); - bool isIn2 = (_x[1] >=0 && _x[1] <=1 && _y[1] >= 0 && _y[1] <= 1); + _mm_store_ps(bbuf, _mm_add_ps(qbc0[2], qbc1[2])); + _b1[2] += bbuf[0] + bbuf[2]; + _b2[2] += bbuf[1] + bbuf[3]; + _mm_store_ps(bbuf, _mm_add_ps(qbc0[3], qbc1[3])); + _b1[3] += bbuf[0] + bbuf[2]; + _b2[3] += bbuf[1] + bbuf[3]; - bool isSink1 = isIn1 && checkSolution(_x[0], _y[0], c ); - bool isSink2 = isIn2 && checkSolution(_x[1], _y[1], c ); +#endif + _b1[0] *= FLT_SCALE; + _b1[1] *= FLT_SCALE; + _b1[2] *= FLT_SCALE; + _b1[3] *= FLT_SCALE; + _b2[0] *= FLT_SCALE; + _b2[1] *= FLT_SCALE; + _b2[2] *= FLT_SCALE; + _b2[3] *= FLT_SCALE; + + float c0[4] = { _b1[3] + _b1[0] - _b1[2] - _b1[1], + _b1[1] - _b1[0], + _b1[2] - _b1[0], + _b1[0]}; + float c1[4] = { _b2[3] + _b2[0] - _b2[2] - _b2[1], + _b2[1] - _b2[0], + _b2[2] - _b2[0], + _b2[0]}; + + float DA12 = A12 * D ; + float DA22 = A22 * D ; + float DA11 = A11 * D ; + c[0] = DA12 * c1[0] - DA22 * c0[0]; + c[1] = DA12 * c1[1] - DA22 * c0[1]; + c[2] = DA12 * c1[2] - DA22 * c0[2]; + c[3] = DA12 * c1[3] - DA22 * c0[3]; + c[4] = DA12 * c0[0] - DA11 * c1[0]; + c[5] = DA12 * c0[1] - DA11 * c1[1]; + c[6] = DA12 * c0[2] - DA11 * c1[2]; + c[7] = DA12 * c0[3] - DA11 * c1[3]; + + float e0 = 1.f / (c[6] * c[0] - c[4] * c[2]); + float e1 = e0 * 0.5f * (c[6] * c[1] + c[7] * c[0] - c[5] * c[2] - c[4] * c[3]); + float e2 = e0 * (c[1] * c[7] -c[3] * c[5]); + e0 = e1 * e1 - e2; + if ( e0 >= 0) + { + e0 = sqrt(e0); + float _y[2] = {-e1 - e0, e0 - e1}; + float c0yc1[2] = {c[0] * _y[0] + c[1], + c[0] * _y[1] + c[1]}; + float _x[2] = {-(c[2] * _y[0] + c[3]) / c0yc1[0], + -(c[2] * _y[1] + c[3]) / c0yc1[1]}; - //if( isSink1 != isSink2 ) - { - if( isSink1 ) - { - hasSolved = true; - delta.x = inextPt.x + _x[0] - nextPt.x; - delta.y = inextPt.y + _y[0] - nextPt.y; - } - if (isSink2 ) - { - hasSolved = true; - delta.x = inextPt.x + _x[1] - nextPt.x; - delta.y = inextPt.y + _y[1] - nextPt.y; - } - } // isIn1 != isIn2 - } // e0 >= 
0 - } - else - hasSolved = false; - if(hasSolved == false) - { - delta = Point2f( c[0] * ab + c[1] * a + c[2] * b + c[3], - c[4] * ab + c[5] * a + c[6] * b + c[7]); - nextPt += 0.7 * delta; - nextPts[ptidx] = nextPt + halfWin; - } - else - { - nextPt += delta; - nextPts[ptidx] = nextPt + halfWin; - break; - } + bool isIn1 = (_x[0] >=0 && _x[0] <=1 && _y[0] >= 0 && _y[0] <= 1); + bool isIn2 = (_x[1] >=0 && _x[1] <=1 && _y[1] >= 0 && _y[1] <= 1); - if( delta.ddot(delta) <= criteria.epsilon) - break; + bool isSink1 = isIn1 && checkSolution(_x[0], _y[0], c ); + bool isSink2 = isIn2 && checkSolution(_x[1], _y[1], c ); - if(j > 0 && std::abs(delta.x - prevDelta.x) < 0.01 && - std::abs(delta.y - prevDelta.y) < 0.01) + + //if( isSink1 != isSink2 ) { - nextPts[ptidx] -= delta*0.35f; - break; - } + if( isSink1 ) + { + hasSolved = true; + delta.x = inextPt.x + _x[0] - nextPt.x; + delta.y = inextPt.y + _y[0] - nextPt.y; + } + if (isSink2 ) + { + hasSolved = true; + delta.x = inextPt.x + _x[1] - nextPt.x; + delta.y = inextPt.y + _y[1] - nextPt.y; + } + } // isIn1 != isIn2 + } // e0 >= 0 + } + else + hasSolved = false; + if(hasSolved == false) + { + delta = Point2f( c[0] * ab + c[1] * a + c[2] * b + c[3], + c[4] * ab + c[5] * a + c[6] * b + c[7]); + nextPt += 0.7 * delta; + nextPts[ptidx] = nextPt + halfWin; + } + else + { + nextPt += delta; + nextPts[ptidx] = nextPt + halfWin; + break; + } - prevDelta = delta; + if( delta.ddot(delta) <= criteria.epsilon) + break; + + if(j > 0 && std::abs(delta.x - prevDelta.x) < 0.01 && + std::abs(delta.y - prevDelta.y) < 0.01) + { + nextPts[ptidx] -= delta*0.35f; + break; } + prevDelta = delta; + } + } + } - const Mat* prevImg; - const Mat* nextImg; - const Mat* prevDeriv; - const Mat* rgbPrevImg; - const Mat* rgbNextImg; - const Point2f* prevPts; - Point2f* nextPts; - uchar* status; - float* err; - int maxWinSize; - int minWinSize; - TermCriteria criteria; - int level; - int maxLevel; - int windowType; - float minEigThreshold; - bool 
useInitialFlow; - int crossSegmentationThreshold; - }; -} -} -} -} + const Mat* prevImg; + const Mat* nextImg; + const Mat* prevDeriv; + const Mat* rgbPrevImg; + const Mat* rgbNextImg; + const Point2f* prevPts; + Point2f* nextPts; + uchar* status; + float* err; + int maxWinSize; + int minWinSize; + TermCriteria criteria; + int level; + int maxLevel; + int windowType; + float minEigThreshold; + bool useInitialFlow; + int crossSegmentationThreshold; +}; + +}}}} // namespace #endif diff --git a/modules/optflow/src/rlof/geo_interpolation.cpp b/modules/optflow/src/rlof/geo_interpolation.cpp index f96e3914cff..048f65812d0 100644 --- a/modules/optflow/src/rlof/geo_interpolation.cpp +++ b/modules/optflow/src/rlof/geo_interpolation.cpp @@ -5,482 +5,480 @@ #include "geo_interpolation.hpp" #include #include -namespace cv -{ -namespace optflow +namespace cv { +namespace optflow { + +inline double K_h1(float x, double h2) { + return exp(-(0.5 / (h2)) * x); +} - inline double K_h1(float x, double h2) - { - return exp(-(0.5 / (h2)) * x); +struct Graph_helper { + int * mem; + int e_size; + Graph_helper(int k, int num_nodes) { + e_size = (2 * k + 1); + mem = (int*)malloc(sizeof(int) * e_size * num_nodes); + memset(mem, 0, sizeof(int) * e_size * num_nodes); } - - struct Graph_helper { - int * mem; - int e_size; - Graph_helper(int k, int num_nodes) { - e_size = (2 * k + 1); - mem = (int*)malloc(sizeof(int) * e_size * num_nodes); - memset(mem, 0, sizeof(int) * e_size * num_nodes); - } - inline int size(int id) { - int r_addr = id * (e_size); - return ((int*)mem)[r_addr]; - } - inline int * data(int id) { - int r_addr = id * (e_size)+1; - return mem + r_addr; - } - inline void add(int id, std::pair data) { - int r_addr = id * (e_size); - int size = ++((int*)mem)[r_addr]; - r_addr += 2 * size - 1;//== 1 + 2*(size-1); - ((float*)mem)[r_addr] = data.first; - ((int*)mem)[r_addr + 1] = data.second; - } - inline bool color_in_target(int id, int color) { - int r_addr = id * (e_size); - int 
size = (((int*)mem)[r_addr]); - r_addr += 2; - for (int i = 0; i < size; i++) { - if (((int*)mem)[r_addr] == color) { - return true; - } - r_addr += 2; + inline int size(int id) { + int r_addr = id * (e_size); + return ((int*)mem)[r_addr]; + } + inline int * data(int id) { + int r_addr = id * (e_size)+1; + return mem + r_addr; + } + inline void add(int id, std::pair data) { + int r_addr = id * (e_size); + int size = ++((int*)mem)[r_addr]; + r_addr += 2 * size - 1;//== 1 + 2*(size-1); + ((float*)mem)[r_addr] = data.first; + ((int*)mem)[r_addr + 1] = data.second; + } + inline bool color_in_target(int id, int color) { + int r_addr = id * (e_size); + int size = (((int*)mem)[r_addr]); + r_addr += 2; + for (int i = 0; i < size; i++) { + if (((int*)mem)[r_addr] == color) { + return true; } - return false; + r_addr += 2; } + return false; + } + +}; + +Mat sgeo_dist(const Mat& gra, int y, int x, float max, Mat &prev) +{ + std::vector points; + points.push_back(Point2f(static_cast(x), static_cast(y))); + return sgeo_dist(gra, points, max, prev); +} +Mat sgeo_dist(const Mat& gra, const std::vector & points, float max, Mat &prev) +{ + int Dx[] = { -1,0,1,-1,1,-1,0,1 }; + int Dy[] = { -1,-1,-1,0,0,1,1,1 }; + Mat dm(gra.rows, gra.cols, CV_32F, Scalar(max)); + prev = Mat(gra.rows, gra.cols, CV_8U, Scalar(255)); - }; + std::multimap not_visited_with_value; - Mat sgeo_dist(const Mat& gra, int y, int x, float max, Mat &prev) + for (auto i = points.begin(); i != points.end(); i++) { - std::vector points; - points.push_back(Point2f(static_cast(x), static_cast(y))); - return sgeo_dist(gra, points, max, prev); + int y = static_cast(i->y); + int x = static_cast(i->x); + not_visited_with_value.insert(std::pair(0.f, Vec2i(y, x))); + dm.at(y, x) = 0; } - Mat sgeo_dist(const Mat& gra, const std::vector & points, float max, Mat &prev) + + bool done = false; + while (!done) { - int Dx[] = { -1,0,1,-1,1,-1,0,1 }; - int Dy[] = { -1,-1,-1,0,0,1,1,1 }; - Mat dm(gra.rows, gra.cols, CV_32F, 
Scalar(max)); - prev = Mat(gra.rows, gra.cols, CV_8U, Scalar(255)); + if (not_visited_with_value.begin() == not_visited_with_value.end()) { + done = true; + break; + } + std::multimap::iterator current_it = not_visited_with_value.begin(); + std::pair current_p = *current_it; + not_visited_with_value.erase(current_it); - std::multimap not_visited_with_value; + int y = current_p.second[0]; + int x = current_p.second[1]; + float cur_d = current_p.first; - for (auto i = points.begin(); i != points.end(); i++) - { - int y = static_cast(i->y); - int x = static_cast(i->x); - not_visited_with_value.insert(std::pair(0.f, Vec2i(y, x))); - dm.at(y, x) = 0; + if (dm.at(y, x) != cur_d) { + continue; } - bool done = false; - while (!done) - { - if (not_visited_with_value.begin() == not_visited_with_value.end()) { - done = true; - break; - } - std::multimap::iterator current_it = not_visited_with_value.begin(); - std::pair current_p = *current_it; - not_visited_with_value.erase(current_it); - - int y = current_p.second[0]; - int x = current_p.second[1]; - float cur_d = current_p.first; + Vec8f gra_e = gra.at(y, x); - if (dm.at(y, x) != cur_d) { + for (int i = 0; i < 8; i++) { + if (gra_e[i] < 0) { continue; } + int dx = Dx[i]; + int dy = Dy[i]; - Vec8f gra_e = gra.at(y, x); - - for (int i = 0; i < 8; i++) { - if (gra_e[i] < 0) { - continue; - } - int dx = Dx[i]; - int dy = Dy[i]; - - if (dm.at(y + dy, x + dx) > cur_d + gra_e[i]) { - dm.at(y + dy, x + dx) = cur_d + gra_e[i]; - prev.at(y + dy, x + dx) = static_cast(7 - i); - not_visited_with_value.insert(std::pair(cur_d + gra_e[i], Vec2i(y + dy, x + dx))); - } - + if (dm.at(y + dy, x + dx) > cur_d + gra_e[i]) { + dm.at(y + dy, x + dx) = cur_d + gra_e[i]; + prev.at(y + dy, x + dx) = static_cast(7 - i); + not_visited_with_value.insert(std::pair(cur_d + gra_e[i], Vec2i(y + dy, x + dx))); } + } + } - return dm; - } + return dm; +} - Mat interpolate_irregular_nn_raster(const std::vector & prevPoints, - const std::vector & nextPoints, - 
const std::vector & status, - const Mat & i1) +Mat interpolate_irregular_nn_raster(const std::vector & prevPoints, + const std::vector & nextPoints, + const std::vector & status, + const Mat & i1) +{ + Mat gra = getGraph(i1, 0.1f); + int Dx[] = { -1,0,1,-1,1,-1,0,1 }; + int Dy[] = { -1,-1,-1,0,0,1,1,1 }; + int max_rounds = 10; + Mat dirt = Mat(gra.rows, gra.cols, CV_8U, Scalar(0)); + Mat quellknoten = Mat(gra.rows, gra.cols, CV_32S, Scalar(-1)); + Mat dist = Mat(gra.rows, gra.cols, CV_32F, Scalar(std::numeric_limits::max())); + /* + * assign quellknoten ids. + */ + for (int i = 0; i < static_cast(prevPoints.size()); i++) { - Mat gra = getGraph(i1, 0.1f); - int Dx[] = { -1,0,1,-1,1,-1,0,1 }; - int Dy[] = { -1,-1,-1,0,0,1,1,1 }; - int max_rounds = 10; - Mat dirt = Mat(gra.rows, gra.cols, CV_8U, Scalar(0)); - Mat quellknoten = Mat(gra.rows, gra.cols, CV_32S, Scalar(-1)); - Mat dist = Mat(gra.rows, gra.cols, CV_32F, Scalar(std::numeric_limits::max())); + int x = (int)prevPoints[i].x; + int y = (int)prevPoints[i].y; + if (status[i] == 0) + continue; + dirt.at(y, x) = 1; + dist.at(y, x) = 0; + quellknoten.at(y, x) = i; + } + + bool clean = true; + bool done = false; + int x = 0; + int y = 0; + int rounds = 0; + while (!done) { /* - * assign quellknoten ids. 
+ * Update x and y + * on even rounds go rasterscanorder , on odd round inverse rasterscanorder */ - for (int i = 0; i < static_cast(prevPoints.size()); i++) - { - int x = (int)prevPoints[i].x; - int y = (int)prevPoints[i].y; - if (status[i] == 0) - continue; - dirt.at(y, x) = 1; - dist.at(y, x) = 0; - quellknoten.at(y, x) = i; - } - - bool clean = true; - bool done = false; - int x = 0; - int y = 0; - int rounds = 0; - while (!done) { - /* - * Update x and y - * on even rounds go rasterscanorder , on odd round inverse rasterscanorder - */ - if (rounds % 2 == 0) { - x++; - if (x >= gra.cols) { - x = 0; - y++; - if (y >= gra.rows) { - y = 0; - rounds++; - y = gra.rows - 1; - x = gra.cols - 1; - if (rounds >= max_rounds || clean) { - done = true; - break; - } + if (rounds % 2 == 0) { + x++; + if (x >= gra.cols) { + x = 0; + y++; + if (y >= gra.rows) { + y = 0; + rounds++; + y = gra.rows - 1; + x = gra.cols - 1; + if (rounds >= max_rounds || clean) { + done = true; + break; } } } - else { - x--; - if (x < 0) { - x = gra.cols - 1; - y--; - if (y < 0) { - y = gra.rows - 1; - rounds++; - y = 0; - x = 0; - if (rounds >= max_rounds || clean) { - done = true; - break; - } + } + else { + x--; + if (x < 0) { + x = gra.cols - 1; + y--; + if (y < 0) { + y = gra.rows - 1; + rounds++; + y = 0; + x = 0; + if (rounds >= max_rounds || clean) { + done = true; + break; } } } - if (dirt.at(y, x) == 0) { - continue; - } - dirt.at(y, x) = 0; + } + if (dirt.at(y, x) == 0) { + continue; + } + dirt.at(y, x) = 0; - float c_dist = dist.at(y, x); - Vec8f gra_e = gra.at(y, x); + float c_dist = dist.at(y, x); + Vec8f gra_e = gra.at(y, x); - for (int i = 0; i < 8; i++) { - int tx = Dx[i]; - int ty = Dy[i]; - if (ty == 0 && tx == 0) { - continue; - } - if (x + tx < 0 || x + tx >= gra.cols) { - continue; - } - if (y + ty < 0 || y + ty >= gra.rows) { - continue; - } - if (c_dist > dist.at(y + ty, x + tx)) { - if (c_dist > dist.at(y + ty, x + tx) + gra_e[i]) { - quellknoten.at(y, x) = 
quellknoten.at(y + ty, x + tx); - dist.at(y, x) = dist.at(y + ty, x + tx) + gra_e[i]; - dirt.at(y, x) = 1; - clean = false; - } + for (int i = 0; i < 8; i++) { + int tx = Dx[i]; + int ty = Dy[i]; + if (ty == 0 && tx == 0) { + continue; + } + if (x + tx < 0 || x + tx >= gra.cols) { + continue; + } + if (y + ty < 0 || y + ty >= gra.rows) { + continue; + } + if (c_dist > dist.at(y + ty, x + tx)) { + if (c_dist > dist.at(y + ty, x + tx) + gra_e[i]) { + quellknoten.at(y, x) = quellknoten.at(y + ty, x + tx); + dist.at(y, x) = dist.at(y + ty, x + tx) + gra_e[i]; + dirt.at(y, x) = 1; + clean = false; } - else { - if (c_dist + gra_e[i] < dist.at(y + ty, x + tx)) { - quellknoten.at(y + ty, x + tx) = quellknoten.at(y, x); - dist.at(y + ty, x + tx) = dist.at(y, x) + gra_e[i]; - dirt.at(y + ty, x + tx) = 1; - clean = false; - } + } + else { + if (c_dist + gra_e[i] < dist.at(y + ty, x + tx)) { + quellknoten.at(y + ty, x + tx) = quellknoten.at(y, x); + dist.at(y + ty, x + tx) = dist.at(y, x) + gra_e[i]; + dirt.at(y + ty, x + tx) = 1; + clean = false; } } + } - } + } - Mat nnFlow(i1.rows, i1.cols, CV_32FC2, Scalar(0)); - for (y = 0; y < i1.rows; y++) { - for (x = 0; x < i1.cols; x++) { + Mat nnFlow(i1.rows, i1.cols, CV_32FC2, Scalar(0)); + for (y = 0; y < i1.rows; y++) { + for (x = 0; x < i1.cols; x++) { - int id = quellknoten.at(y, x); - if (id != -1) - { - nnFlow.at(y, x) = nextPoints[id] - prevPoints[id]; - } + int id = quellknoten.at(y, x); + if (id != -1) + { + nnFlow.at(y, x) = nextPoints[id] - prevPoints[id]; } } - return nnFlow; } + return nnFlow; +} - Mat interpolate_irregular_knn( - const std::vector & _prevPoints, - const std::vector & _nextPoints, - const std::vector & status, - const Mat &color_img, - int k, - float pixeldistance) - { - Mat in(color_img.rows, color_img.cols, CV_32FC2); - Mat mask = Mat::zeros(color_img.rows, color_img.cols, CV_8UC1); +Mat interpolate_irregular_knn( + const std::vector & _prevPoints, + const std::vector & _nextPoints, + const 
std::vector & status, + const Mat &color_img, + int k, + float pixeldistance) +{ + Mat in(color_img.rows, color_img.cols, CV_32FC2); + Mat mask = Mat::zeros(color_img.rows, color_img.cols, CV_8UC1); - for (unsigned n = 0; n < _prevPoints.size(); n++) + for (unsigned n = 0; n < _prevPoints.size(); n++) + { + if (_prevPoints[n].x >= 0 && _prevPoints[n].y >= 0 && _prevPoints[n].x < color_img.cols && _prevPoints[n].y < color_img.rows) { - if (_prevPoints[n].x >= 0 && _prevPoints[n].y >= 0 && _prevPoints[n].x < color_img.cols && _prevPoints[n].y < color_img.rows) - { - in.at(_prevPoints[n]) = _nextPoints[n] - _prevPoints[n]; - mask.at(_prevPoints[n]) = status[n]; - } - + in.at(_prevPoints[n]) = _nextPoints[n] - _prevPoints[n]; + mask.at(_prevPoints[n]) = status[n]; } - int Dx[] = { -1,0,1,-1,1,-1,0,1 }; - int Dy[] = { -1,-1,-1,0,0,1,1,1 }; - Mat gra = getGraph(color_img, pixeldistance); - Mat nnFlow(in.rows, in.cols, CV_32FC2, Scalar(0)); - std::multimap my_agents; // > - Graph_helper graph_helper(k, in.rows*in.cols); //< arrivaltime, color> + } + int Dx[] = { -1,0,1,-1,1,-1,0,1 }; + int Dy[] = { -1,-1,-1,0,0,1,1,1 }; + Mat gra = getGraph(color_img, pixeldistance); + Mat nnFlow(in.rows, in.cols, CV_32FC2, Scalar(0)); + std::multimap my_agents; // > + Graph_helper graph_helper(k, in.rows*in.cols); //< arrivaltime, color> - int color = 0; - std::vector flow_point_list; - for (int y = 0; y < in.rows; y++) { - for (int x = 0; x < in.cols; x++) { - if (mask.at(y, x) > 0) { - flow_point_list.push_back(Vec2i(y, x)); - nnFlow.at(y, x) = in.at(y, x); - int v_id = (y * in.cols + x); - graph_helper.add(v_id, std::pair(0.f, color)); + int color = 0; + std::vector flow_point_list; + for (int y = 0; y < in.rows; y++) { + for (int x = 0; x < in.cols; x++) { + if (mask.at(y, x) > 0) { + flow_point_list.push_back(Vec2i(y, x)); + nnFlow.at(y, x) = in.at(y, x); + int v_id = (y * in.cols + x); + graph_helper.add(v_id, std::pair(0.f, color)); - Vec8f gra_e = gra.at(y, x); - for (int i = 0; 
i < 8; i++) { - if (gra_e[i] < 0) - continue; - int dx = Dx[i]; - int dy = Dy[i]; - int target = (y + dy) * in.cols + (x + dx); - Vec2i agent(target, color); - my_agents.insert(std::pair(gra_e[i], agent)); - } - color++; + Vec8f gra_e = gra.at(y, x); + for (int i = 0; i < 8; i++) { + if (gra_e[i] < 0) + continue; + int dx = Dx[i]; + int dy = Dy[i]; + int target = (y + dy) * in.cols + (x + dx); + Vec2i agent(target, color); + my_agents.insert(std::pair(gra_e[i], agent)); + } + color++; } } + } - int global_time = 0; + int global_time = 0; - bool done = false; - while (!done) { - if (my_agents.size() == 0) { - done = true; - break; - } - global_time++; + bool done = false; + while (!done) { + if (my_agents.size() == 0) { + done = true; + break; + } + global_time++; - std::multimap::iterator current_it = my_agents.begin(); - std::pair current_p = *current_it; - my_agents.erase(current_it); + std::multimap::iterator current_it = my_agents.begin(); + std::pair current_p = *current_it; + my_agents.erase(current_it); - int target = current_p.second[0]; - color = current_p.second[1]; - float arriv_time = current_p.first; + int target = current_p.second[0]; + color = current_p.second[1]; + float arriv_time = current_p.first; + + Vec8f gra_e = gra.at(target);// (y*cols+x) + if (graph_helper.size(target) >= k) { + continue; + } - Vec8f gra_e = gra.at(target);// (y*cols+x) - if (graph_helper.size(target) >= k) { + bool color_found_in_target = graph_helper.color_in_target(target, color); + if (color_found_in_target) { + continue; + } + graph_helper.add(target, std::pair(arriv_time, color)); + + + for (int i = 0; i < 8; i++) { + if (gra_e[i] < 0) continue; - } - bool color_found_in_target = graph_helper.color_in_target(target, color); + int dx = Dx[i]; + int dy = Dy[i]; + int new_target = target + dx + (dy*in.cols); + if (graph_helper.size(new_target) >= k) { + continue; + } + color_found_in_target = graph_helper.color_in_target(new_target, color); if (color_found_in_target) { 
continue; } - graph_helper.add(target, std::pair(arriv_time, color)); - - - for (int i = 0; i < 8; i++) { - if (gra_e[i] < 0) - continue; + Vec2i new_agent(new_target, color); + my_agents.insert(std::pair(arriv_time + gra_e[i], new_agent)); - int dx = Dx[i]; - int dy = Dy[i]; - int new_target = target + dx + (dy*in.cols); - if (graph_helper.size(new_target) >= k) { - continue; - } - color_found_in_target = graph_helper.color_in_target(new_target, color); - if (color_found_in_target) { - continue; - } - Vec2i new_agent(new_target, color); - my_agents.insert(std::pair(arriv_time + gra_e[i], new_agent)); - - } } + } - Mat ret(in.rows, in.cols*k, CV_32FC2); - for (int y = 0; y < in.rows; y++) { - for (int x = 0; x < in.cols; x++) { - for (int i = 0; i < k; i++) { - float dist = *((float*)(graph_helper.data(y*in.cols + x) + 2 * i)); - float id = *((float*)(graph_helper.data(y*in.cols + x) + 2 * i + 1)); - ret.at(y, k*x + i) = Vec2f(dist, id); - } + Mat ret(in.rows, in.cols*k, CV_32FC2); + for (int y = 0; y < in.rows; y++) { + for (int x = 0; x < in.cols; x++) { + for (int i = 0; i < k; i++) { + float dist = *((float*)(graph_helper.data(y*in.cols + x) + 2 * i)); + float id = *((float*)(graph_helper.data(y*in.cols + x) + 2 * i + 1)); + ret.at(y, k*x + i) = Vec2f(dist, id); } } - return ret; + } + return ret; /* - for (int y = 0; y < in.rows; y++) { - for (int x = 0; x < in.cols; x++) { - int v_id = y * in.cols + x; - std::vector > flow_s(k); + for (int y = 0; y < in.rows; y++) { + for (int x = 0; x < in.cols; x++) { + int v_id = y * in.cols + x; + std::vector > flow_s(k); - for (int i = 0; i < k; i++) { - color = *((int*)(graph_helper.data(v_id) + 1 + 2 * i)); - int cy = flow_point_list[color][0]; - int cx = flow_point_list[color][1]; - Vec2f flow = in.at(cy, cx); - flow_s[i] = std::make_pair(static_cast(norm(flow)), flow); + for (int i = 0; i < k; i++) { + color = *((int*)(graph_helper.data(v_id) + 1 + 2 * i)); + int cy = flow_point_list[color][0]; + int cx = 
flow_point_list[color][1]; + Vec2f flow = in.at(cy, cx); + flow_s[i] = std::make_pair(static_cast(norm(flow)), flow); - } + } - nnFlow.at(y, x) = std::min_element(flow_s.begin(), flow_s.end(), [](const std::pair &left, const std::pair &right) { - return left.first < right.first; - })->second; + nnFlow.at(y, x) = std::min_element(flow_s.begin(), flow_s.end(), [](const std::pair &left, const std::pair &right) { + return left.first < right.first; + })->second; - } } - free(graph_helper.mem); - return nnFlow; + } + free(graph_helper.mem); + return nnFlow; */ - } - Mat getGraph(const Mat &image, float edge_length) - { +} - int Dx[] = { -1,0,1,-1,1,-1,0,1 }; - int Dy[] = { -1,-1,-1,0,0,1,1,1 }; - Mat gra(image.rows, image.cols, CV_32FC(8)); +Mat getGraph(const Mat &image, float edge_length) +{ - for (int y = 0; y < gra.rows; y++) { - for (int x = 0; x < gra.cols; x++) { + int Dx[] = { -1,0,1,-1,1,-1,0,1 }; + int Dy[] = { -1,-1,-1,0,0,1,1,1 }; + Mat gra(image.rows, image.cols, CV_32FC(8)); - for (int i = 0; i < 8; i++) { - int dx = Dx[i]; - int dy = Dy[i]; - gra.at(y, x)[i] = -1; + for (int y = 0; y < gra.rows; y++) { + for (int x = 0; x < gra.cols; x++) { - if (x + dx < 0 || y + dy < 0 || x + dx >= gra.cols || y + dy >= gra.rows) { - continue; - } + for (int i = 0; i < 8; i++) { + int dx = Dx[i]; + int dy = Dy[i]; + gra.at(y, x)[i] = -1; - if (i < 4) { - int si = 7 - i; - gra.at(y, x)[i] = gra.at(y + dy, x + dx)[si]; - } - else { - float p1 = dx * dx*edge_length*edge_length + dy * dy*edge_length*edge_length; - float p2 = static_cast(image.at(y, x)[0] - image.at(y + dy, x + dx)[0]); - float p3 = static_cast(image.at(y, x)[1] - image.at(y + dy, x + dx)[1]); - float p4 = static_cast(image.at(y, x)[2] - image.at(y + dy, x + dx)[2]); - gra.at(y, x)[i] = sqrt(p1 + p2 * p2 + p3 * p3 + p4 * p4); - } + if (x + dx < 0 || y + dy < 0 || x + dx >= gra.cols || y + dy >= gra.rows) { + continue; + } + if (i < 4) { + int si = 7 - i; + gra.at(y, x)[i] = gra.at(y + dy, x + dx)[si]; + } + 
else { + float p1 = dx * dx*edge_length*edge_length + dy * dy*edge_length*edge_length; + float p2 = static_cast(image.at(y, x)[0] - image.at(y + dy, x + dx)[0]); + float p3 = static_cast(image.at(y, x)[1] - image.at(y + dy, x + dx)[1]); + float p4 = static_cast(image.at(y, x)[2] - image.at(y + dy, x + dx)[2]); + gra.at(y, x)[i] = sqrt(p1 + p2 * p2 + p3 * p3 + p4 * p4); } } - } - return gra; + } } + return gra; +} + - Mat interpolate_irregular_nn( - const std::vector & _prevPoints, - const std::vector & _nextPoints, - const std::vector & status, - const Mat &color_img, - float pixeldistance) +Mat interpolate_irregular_nn( + const std::vector & _prevPoints, + const std::vector & _nextPoints, + const std::vector & status, + const Mat &color_img, + float pixeldistance) +{ + int Dx[] = { -1,0,1,-1,1,-1,0,1 }; + int Dy[] = { -1,-1,-1,0,0,1,1,1 }; + std::vector prevPoints, nextPoints; + std::map, std::pair> flowMap; + for (unsigned n = 0; n < _prevPoints.size(); n++) { - int Dx[] = { -1,0,1,-1,1,-1,0,1 }; - int Dy[] = { -1,-1,-1,0,0,1,1,1 }; - std::vector prevPoints, nextPoints; - std::map, std::pair> flowMap; - for (unsigned n = 0; n < _prevPoints.size(); n++) + if (status[n] != 0) { - if (status[n] != 0) - { - flowMap.insert(std::make_pair( - std::make_pair(_prevPoints[n].x, _prevPoints[n].y), - std::make_pair(_nextPoints[n].x, _nextPoints[n].y))); - prevPoints.push_back(_prevPoints[n]); - nextPoints.push_back(_nextPoints[n]); - } - + flowMap.insert(std::make_pair( + std::make_pair(_prevPoints[n].x, _prevPoints[n].y), + std::make_pair(_nextPoints[n].x, _nextPoints[n].y))); + prevPoints.push_back(_prevPoints[n]); + nextPoints.push_back(_nextPoints[n]); } - Mat gra = getGraph(color_img, pixeldistance); + } - Mat prev; - Mat geo_dist = sgeo_dist(gra, prevPoints, std::numeric_limits::max(), prev); + Mat gra = getGraph(color_img, pixeldistance); + Mat prev; + Mat geo_dist = sgeo_dist(gra, prevPoints, std::numeric_limits::max(), prev); - Mat nnFlow = 
Mat::zeros(color_img.size(), CV_32FC2); - for (int y = 0; y < nnFlow.rows; y++) + + Mat nnFlow = Mat::zeros(color_img.size(), CV_32FC2); + for (int y = 0; y < nnFlow.rows; y++) + { + for (int x = 0; x < nnFlow.cols; x++) { - for (int x = 0; x < nnFlow.cols; x++) + int cx = x; + int cy = y; + while (prev.at(cy, cx) != 255) { - int cx = x; - int cy = y; - while (prev.at(cy, cx) != 255) - { - int i = prev.at(cy, cx); - cx += Dx[i]; - cy += Dy[i]; - } - auto val = flowMap[std::make_pair(static_cast(cx), static_cast(cy))]; - nnFlow.at(y, x) = Vec2f(val.first - cx, val.second - cy); + int i = prev.at(cy, cx); + cx += Dx[i]; + cy += Dy[i]; } + auto val = flowMap[std::make_pair(static_cast(cx), static_cast(cy))]; + nnFlow.at(y, x) = Vec2f(val.first - cx, val.second - cy); } - return nnFlow; } - -} + return nnFlow; } + +}} // namespace diff --git a/modules/optflow/src/rlof/geo_interpolation.hpp b/modules/optflow/src/rlof/geo_interpolation.hpp index 60aef39bedc..0e13b44c493 100644 --- a/modules/optflow/src/rlof/geo_interpolation.hpp +++ b/modules/optflow/src/rlof/geo_interpolation.hpp @@ -4,34 +4,33 @@ #ifndef _GEO_INTERPOLATION_HPP_ #define _GEO_INTERPOLATION_HPP_ #include -namespace cv -{ -namespace optflow -{ - typedef Vec Vec8f; - Mat getGraph(const Mat & image, float edge_length); - Mat sgeo_dist(const Mat& gra, int y, int x, float max, Mat &prev); - Mat sgeo_dist(const Mat& gra, const std::vector & points, float max, Mat &prev); - Mat interpolate_irregular_nw(const Mat &in, const Mat &mask, const Mat &color_img, float max_d, float bandwidth, float pixeldistance); - Mat interpolate_irregular_nn( - const std::vector & prevPoints, - const std::vector & nextPoints, - const std::vector & status, - const Mat &color_img, - float pixeldistance); - Mat interpolate_irregular_knn( - const std::vector & _prevPoints, - const std::vector & _nextPoints, - const std::vector & status, - const Mat &color_img, - int k, - float pixeldistance); - Mat interpolate_irregular_nn_raster(const 
std::vector & prevPoints, - const std::vector & nextPoints, - const std::vector & status, - const Mat & i1); +namespace cv { +namespace optflow { -} -} +typedef Vec Vec8f; +Mat getGraph(const Mat & image, float edge_length); +Mat sgeo_dist(const Mat& gra, int y, int x, float max, Mat &prev); +Mat sgeo_dist(const Mat& gra, const std::vector & points, float max, Mat &prev); +Mat interpolate_irregular_nw(const Mat &in, const Mat &mask, const Mat &color_img, float max_d, float bandwidth, float pixeldistance); +Mat interpolate_irregular_nn( + const std::vector & prevPoints, + const std::vector & nextPoints, + const std::vector & status, + const Mat &color_img, + float pixeldistance); +Mat interpolate_irregular_knn( + const std::vector & _prevPoints, + const std::vector & _nextPoints, + const std::vector & status, + const Mat &color_img, + int k, + float pixeldistance); + +Mat interpolate_irregular_nn_raster(const std::vector & prevPoints, + const std::vector & nextPoints, + const std::vector & status, + const Mat & i1); + +}} // namespace #endif diff --git a/modules/optflow/src/rlof/plk_invoker.hpp b/modules/optflow/src/rlof/plk_invoker.hpp index dd8ab5e7d1b..b7311edb86c 100644 --- a/modules/optflow/src/rlof/plk_invoker.hpp +++ b/modules/optflow/src/rlof/plk_invoker.hpp @@ -6,37 +6,36 @@ #include "rlof_invokerbase.hpp" -namespace cv{ -namespace optflow{ -namespace plk -{ +namespace cv { +namespace optflow { +namespace plk { // implementierung ohne SSE -namespace radial -{ +namespace radial { + class TrackerInvoker : public cv::ParallelLoopBody { public: TrackerInvoker( - const Mat& _prevImg, - const Mat& _prevDeriv, - const Mat& _nextImg, - const Mat& _rgbPrevImg, - const Mat& _rgbNextImg, - const Point2f* _prevPts, + const Mat& _prevImg, + const Mat& _prevDeriv, + const Mat& _nextImg, + const Mat& _rgbPrevImg, + const Mat& _rgbNextImg, + const Point2f* _prevPts, Point2f* _nextPts, - uchar* _status, - float* _err, + uchar* _status, + float* _err, Point2f* _gainVecs, - 
int _level, - int _maxLevel, - int _winSize[2], - int _maxIteration, + int _level, + int _maxLevel, + int _winSize[2], + int _maxIteration, bool _useInitialFlow, - int _supportRegionType, + int _supportRegionType, float _minEigenValue, - int _crossSegmentationThreshold) + int _crossSegmentationThreshold) { prevImg = &_prevImg; prevDeriv = &_prevDeriv; @@ -615,11 +614,13 @@ class TrackerInvoker : public cv::ParallelLoopBody gainVecs[ptidx]= gainVec; if( delta.ddot(delta) <= criteria.epsilon) break; - if( ( std::abs(delta.x - prevDelta.x) < 0.01 - && std::abs(delta.y - prevDelta.y) < 0.01) - || ((delta.ddot(delta) <= 0.001) - && std::abs(prevGain.x - deltaGain.x) < 0.01) - ) + if (( + std::abs(delta.x - prevDelta.x) < 0.01 && + std::abs(delta.y - prevDelta.y) < 0.01 + ) || ( + delta.ddot(delta) <= 0.001 && + std::abs(prevGain.x - deltaGain.x) < 0.01 + )) { nextPts[ptidx] -= delta*0.5f; gainVecs[ptidx] -= deltaGain* 0.5f; @@ -643,47 +644,50 @@ class TrackerInvoker : public cv::ParallelLoopBody } - const Mat* prevImg; - const Mat* nextImg; - const Mat* prevDeriv; - const Mat* rgbPrevImg; - const Mat* rgbNextImg; - const Point2f* prevPts; - Point2f* nextPts; - uchar* status; - cv::Point2f* gainVecs; // gain vector x -> multiplier y -> offset - float* err; - int maxWinSize; - int minWinSize; - TermCriteria criteria; - int level; - int maxLevel; - int windowType; - float minEigThreshold; - bool useInitialFlow; - int crossSegmentationThreshold; + const Mat* prevImg; + const Mat* nextImg; + const Mat* prevDeriv; + const Mat* rgbPrevImg; + const Mat* rgbNextImg; + const Point2f* prevPts; + Point2f* nextPts; + uchar* status; + cv::Point2f* gainVecs; // gain vector x -> multiplier y -> offset + float* err; + int maxWinSize; + int minWinSize; + TermCriteria criteria; + int level; + int maxLevel; + int windowType; + float minEigThreshold; + bool useInitialFlow; + int crossSegmentationThreshold; }; -} -namespace ica{ + +} // namespace +namespace ica { + class TrackerInvoker : 
public cv::ParallelLoopBody { public: - TrackerInvoker(const Mat& _prevImg, - const Mat& _prevDeriv, - const Mat& _nextImg, - const Mat& _rgbPrevImg, - const Mat& _rgbNextImg, - const Point2f* _prevPts, + TrackerInvoker( + const Mat& _prevImg, + const Mat& _prevDeriv, + const Mat& _nextImg, + const Mat& _rgbPrevImg, + const Mat& _rgbNextImg, + const Point2f* _prevPts, Point2f* _nextPts, - uchar* _status, - float* _err, - int _level, - int _maxLevel, - int _winSize[2], - int _maxIteration, + uchar* _status, + float* _err, + int _level, + int _maxLevel, + int _winSize[2], + int _maxIteration, bool _useInitialFlow, - int _supportRegionType, - int _crossSegmentationThreshold, + int _supportRegionType, + int _crossSegmentationThreshold, float _minEigenValue) { prevImg = &_prevImg; @@ -1062,7 +1066,7 @@ class TrackerInvoker : public cv::ParallelLoopBody nextPt += delta; nextPts[ptidx] = nextPt + halfWin; - if( delta.ddot(delta) <= criteria.epsilon) + if( delta.ddot(delta) <= criteria.epsilon) break; if(j > 0 && std::abs(delta.x - prevDelta.x) < 0.01 && std::abs(delta.y - prevDelta.y) < 0.01) @@ -1075,34 +1079,29 @@ class TrackerInvoker : public cv::ParallelLoopBody } - - - - } } - const Mat* prevImg; - const Mat* nextImg; - const Mat* prevDeriv; - const Mat* rgbPrevImg; - const Mat* rgbNextImg; - const Point2f* prevPts; - Point2f* nextPts; - uchar* status; - float* err; - int maxWinSize; - int minWinSize; - TermCriteria criteria; - int level; - int maxLevel; - int windowType; - float minEigThreshold; - bool useInitialFlow; - int crossSegmentationThreshold; + const Mat* prevImg; + const Mat* nextImg; + const Mat* prevDeriv; + const Mat* rgbPrevImg; + const Mat* rgbNextImg; + const Point2f* prevPts; + Point2f* nextPts; + uchar* status; + float* err; + int maxWinSize; + int minWinSize; + TermCriteria criteria; + int level; + int maxLevel; + int windowType; + float minEigThreshold; + bool useInitialFlow; + int crossSegmentationThreshold; }; -} -} -}} + +}}}} // namespace 
#endif diff --git a/modules/optflow/src/rlof/rlof_invoker.hpp b/modules/optflow/src/rlof/rlof_invoker.hpp index 444d9114bd3..14eabe4b948 100644 --- a/modules/optflow/src/rlof/rlof_invoker.hpp +++ b/modules/optflow/src/rlof/rlof_invoker.hpp @@ -4,1435 +4,1431 @@ #ifndef _RLOF_INVOKER_HPP_ #define _RLOF_INVOKER_HPP_ #include "rlof_invokerbase.hpp" -namespace cv -{ -namespace optflow -{ -namespace rlof -{ -namespace ica +namespace cv { +namespace optflow { +namespace rlof { +namespace ica { + +class TrackerInvoker : public cv::ParallelLoopBody { - class TrackerInvoker : public cv::ParallelLoopBody +public: + TrackerInvoker( + const Mat& _prevImg, + const Mat& _prevDeriv, + const Mat& _nextImg, + const Mat& _rgbPrevImg, + const Mat& _rgbNextImg, + const Point2f* _prevPts, + Point2f* _nextPts, + uchar* _status, + float* _err, + int _level, + int _maxLevel, + int _winSize[2], + int _maxIteration, + bool _useInitialFlow, + int _supportRegionType, + std::vector _normSigmas, + float _minEigenValue, + int _crossSegmentationThreshold) + { + prevImg = &_prevImg; + prevDeriv = &_prevDeriv; + nextImg = &_nextImg; + rgbPrevImg = &_rgbPrevImg; + rgbNextImg = &_rgbNextImg; + prevPts = _prevPts; + nextPts = _nextPts; + status = _status; + err = _err; + minWinSize = _winSize[0]; + maxWinSize = _winSize[1]; + criteria.maxCount = _maxIteration; + criteria.epsilon = 0.01; + level = _level; + maxLevel = _maxLevel; + windowType = _supportRegionType; + minEigThreshold = _minEigenValue; + paramReset = _normSigmas; + useInitialFlow = _useInitialFlow; + crossSegmentationThreshold = _crossSegmentationThreshold; + } + TrackerInvoker & operator=(const TrackerInvoker &) { return *this; }; + void operator()(const cv::Range& range) const CV_OVERRIDE { - public: - TrackerInvoker( - const Mat& _prevImg, - const Mat& _prevDeriv, - const Mat& _nextImg, - const Mat& _rgbPrevImg, - const Mat& _rgbNextImg, - const Point2f* _prevPts, - Point2f* _nextPts, - uchar* _status, - float* _err, - int _level, - int 
_maxLevel, - int _winSize[2], - int _maxIteration, - bool _useInitialFlow, - int _supportRegionType, - std::vector _normSigmas, - float _minEigenValue, - int _crossSegmentationThreshold) - { - prevImg = &_prevImg; - prevDeriv = &_prevDeriv; - nextImg = &_nextImg; - rgbPrevImg = &_rgbPrevImg; - rgbNextImg = &_rgbNextImg; - prevPts = _prevPts; - nextPts = _nextPts; - status = _status; - err = _err; - minWinSize = _winSize[0]; - maxWinSize = _winSize[1]; - criteria.maxCount = _maxIteration; - criteria.epsilon = 0.01; - level = _level; - maxLevel = _maxLevel; - windowType = _supportRegionType; - minEigThreshold = _minEigenValue; - paramReset = _normSigmas; - useInitialFlow = _useInitialFlow; - crossSegmentationThreshold = _crossSegmentationThreshold; - } - TrackerInvoker & operator=(const TrackerInvoker &) { return *this; }; - void operator()(const cv::Range& range) const CV_OVERRIDE - { #ifdef DEBUG_INVOKER - printf("rlof::ica");fflush(stdout); + printf("rlof::ica");fflush(stdout); #endif - cv::Size winSize; - cv::Point2f halfWin; + cv::Size winSize; + cv::Point2f halfWin; - const Mat& I = *prevImg; - const Mat& J = *nextImg; - const Mat& derivI = *prevDeriv; - const Mat& BI = *rgbPrevImg; + const Mat& I = *prevImg; + const Mat& J = *nextImg; + const Mat& derivI = *prevDeriv; + const Mat& BI = *rgbPrevImg; - winSize = cv::Size(maxWinSize, maxWinSize); - int winMaskwidth = static_cast(ceil(winSize.width / 8.f)) * 16; - cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); - winMaskMatBuf.setTo(1); + winSize = cv::Size(maxWinSize, maxWinSize); + int winMaskwidth = static_cast(ceil(winSize.width / 8.f)) * 16; + cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); + winMaskMatBuf.setTo(1); - const float FLT_SCALE = (1.f / (1 << 20)); // 20 + const float FLT_SCALE = (1.f / (1 << 20)); // 20 - std::vector param = paramReset; - int cn = I.channels(), cn2 = cn * 2; - int winbufwidth = static_cast(ceil(winSize.width / 8.f)) * 8; - cv::Size 
winBufSize(winbufwidth, winbufwidth); + std::vector param = paramReset; + int cn = I.channels(), cn2 = cn * 2; + int winbufwidth = static_cast(ceil(winSize.width / 8.f)) * 8; + cv::Size winBufSize(winbufwidth, winbufwidth); - std::vector _buf(winBufSize.area()*(cn + cn2)); - Mat IWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn), &_buf[0]); - Mat derivIWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn2), &_buf[winBufSize.area()*cn]); + std::vector _buf(winBufSize.area()*(cn + cn2)); + Mat IWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn), &_buf[0]); + Mat derivIWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn2), &_buf[winBufSize.area()*cn]); - for (int ptidx = range.start; ptidx < range.end; ptidx++) + for (int ptidx = range.start; ptidx < range.end; ptidx++) + { + Point2f prevPt = prevPts[ptidx] * (float)(1. / (1 << level)); + Point2f nextPt; + if (level == maxLevel) { - Point2f prevPt = prevPts[ptidx] * (float)(1. / (1 << level)); - Point2f nextPt; - if (level == maxLevel) - { - if ( useInitialFlow) - nextPt = nextPts[ptidx] * (float)(1. / (1 << level)); - else - nextPt = prevPt; - } + if ( useInitialFlow) + nextPt = nextPts[ptidx] * (float)(1. 
/ (1 << level)); else - nextPt = nextPts[ptidx] * 2.f; - nextPts[ptidx] = nextPt; + nextPt = prevPt; + } + else + nextPt = nextPts[ptidx] * 2.f; + nextPts[ptidx] = nextPt; - Point2i iprevPt, inextPt; - iprevPt.x = cvFloor(prevPt.x); - iprevPt.y = cvFloor(prevPt.y); - int winArea = maxWinSize * maxWinSize; - cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0, 0, maxWinSize, maxWinSize)); + Point2i iprevPt, inextPt; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + int winArea = maxWinSize * maxWinSize; + cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0, 0, maxWinSize, maxWinSize)); - if (calcWinMaskMat(BI, windowType, iprevPt, - winMaskMat, winSize, halfWin, winArea, - minWinSize, maxWinSize, crossSegmentationThreshold) == false) - continue; + if (calcWinMaskMat(BI, windowType, iprevPt, + winMaskMat, winSize, halfWin, winArea, + minWinSize, maxWinSize, crossSegmentationThreshold) == false) + continue; - prevPt -= halfWin; - iprevPt.x = cvFloor(prevPt.x); - iprevPt.y = cvFloor(prevPt.y); + prevPt -= halfWin; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); - if (iprevPt.x + halfWin.x < 0 || iprevPt.x + halfWin.x >= derivI.cols || - iprevPt.y + halfWin.y < 0 || iprevPt.y + halfWin.y >= derivI.rows) + if (iprevPt.x + halfWin.x < 0 || iprevPt.x + halfWin.x >= derivI.cols || + iprevPt.y + halfWin.y < 0 || iprevPt.y + halfWin.y >= derivI.rows) + { + if (level == 0) { - if (level == 0) - { - if (status) - status[ptidx] = 3; - if (err) - err[ptidx] = 0; - } - continue; + if (status) + status[ptidx] = 3; + if (err) + err[ptidx] = 0; } + continue; + } - float a = prevPt.x - iprevPt.x; - float b = prevPt.y - iprevPt.y; - const int W_BITS = 14, W_BITS1 = 14; + float a = prevPt.x - iprevPt.x; + float b = prevPt.y - iprevPt.y; + const int W_BITS = 14, W_BITS1 = 14; - int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); - int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); - int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); - int iw11 = (1 << W_BITS) - iw00 - 
iw01 - iw10; + int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - int dstep = (int)(derivI.step / derivI.elemSize1()); - int step = (int)(I.step / I.elemSize1()); - int mStep = (int)(winMaskMat.step / winMaskMat.elemSize1()); - CV_Assert(step == (int)(J.step / J.elemSize1())); - float A11 = 0, A12 = 0, A22 = 0; - float b1 = 0, b2 = 0; - float D = 0; - float minEig; + int dstep = (int)(derivI.step / derivI.elemSize1()); + int step = (int)(I.step / I.elemSize1()); + int mStep = (int)(winMaskMat.step / winMaskMat.elemSize1()); + CV_Assert(step == (int)(J.step / J.elemSize1())); + float A11 = 0, A12 = 0, A22 = 0; + float b1 = 0, b2 = 0; + float D = 0; + float minEig; #ifdef RLOF_SSE - __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); - __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); - __m128i z = _mm_setzero_si128(); - __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1 - 1)); - __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1 - 5 - 1)); - __m128i mmMask4_epi32; - __m128i mmMaskSet_epi16 = _mm_set1_epi16(std::numeric_limits::max()); - get4BitMask(winSize.width, mmMask4_epi32); + __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128i z = _mm_setzero_si128(); + __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1 - 1)); + __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1 - 5 - 1)); + __m128i mmMask4_epi32; + __m128i mmMaskSet_epi16 = _mm_set1_epi16(std::numeric_limits::max()); + get4BitMask(winSize.width, mmMask4_epi32); #endif - // extract the patch from the first image, compute covariation matrix of derivatives - int x, y; - for (y = 0; y < winSize.height; y++) - { - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; - const short* dsrc = (const short*)derivI.data + (y + 
iprevPt.y)*dstep + iprevPt.x*cn2; + // extract the patch from the first image, compute covariation matrix of derivatives + int x, y; + for (y = 0; y < winSize.height; y++) + { + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; + const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; - short* Iptr = (short*)(IWinBuf.data + y * IWinBuf.step); - short* dIptr = (short*)(derivIWinBuf.data + y * derivIWinBuf.step); + short* Iptr = (short*)(IWinBuf.data + y * IWinBuf.step); + short* dIptr = (short*)(derivIWinBuf.data + y * derivIWinBuf.step); - x = 0; + x = 0; #ifdef RLOF_SSE - for (; x <= winSize.width*cn; x += 4, dsrc += 4 * 2, dIptr += 4 * 2) + for (; x <= winSize.width*cn; x += 4, dsrc += 4 * 2, dIptr += 4 * 2) + { + __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16); + __m128i mask_0_3_epi16 = _mm_unpacklo_epi16(mask_0_7_epi16, mask_0_7_epi16); + + __m128i v00, v01, v10, v11, t0, t1; + v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); + v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); + v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); + v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1 - 5); + if (x + 4 > winSize.width) { - __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16); - __m128i mask_0_3_epi16 = _mm_unpacklo_epi16(mask_0_7_epi16, mask_0_7_epi16); - - __m128i v00, v01, v10, v11, t0, t1; - v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); - v01 = 
_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); - v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); - v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); - - t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); - t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1 - 5); - if (x + 4 > winSize.width) - { - t0 = _mm_and_si128(t0, mmMask4_epi32); - } - t0 = _mm_and_si128(t0, mask_0_3_epi16); - _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0, t0)); - - v00 = _mm_loadu_si128((const __m128i*)(dsrc)); - v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); - v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep)); - v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); - - t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); - t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); - t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); - t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); - v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... 
- if (x + 4 > winSize.width) - { - v00 = _mm_and_si128(v00, mmMask4_epi32); - } - v00 = _mm_and_si128(v00, mask_0_3_epi16); - _mm_storeu_si128((__m128i*)dIptr, v00); + t0 = _mm_and_si128(t0, mmMask4_epi32); } + t0 = _mm_and_si128(t0, mask_0_3_epi16); + _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0, t0)); + + v00 = _mm_loadu_si128((const __m128i*)(dsrc)); + v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); + v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep)); + v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); + v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... + if (x + 4 > winSize.width) + { + v00 = _mm_and_si128(v00, mmMask4_epi32); + } + v00 = _mm_and_si128(v00, mask_0_3_epi16); + _mm_storeu_si128((__m128i*)dIptr, v00); + } #else - for (; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2) + for (; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2) + { + if (maskPtr[x] == 0) { - if (maskPtr[x] == 0) - { - dIptr[0] = 0; - dIptr[1] = 0; - continue; - } - int ival = CV_DESCALE(src[x] * iw00 + src[x + cn] * iw01 + - src[x + step] * iw10 + src[x + step + cn] * iw11, W_BITS1 - 5); - int ixval = CV_DESCALE(dsrc[0] * iw00 + dsrc[cn2] * iw01 + - dsrc[dstep] * iw10 + dsrc[dstep + cn2] * iw11, W_BITS1); - int iyval = CV_DESCALE(dsrc[1] * iw00 + dsrc[cn2 + 1] * iw01 + dsrc[dstep + 1] * iw10 + - dsrc[dstep + cn2 + 1] * iw11, W_BITS1); - - Iptr[x] = (short)ival; - dIptr[0] = (short)ixval; - dIptr[1] = (short)iyval; + dIptr[0] = 0; + dIptr[1] = 0; + continue; } -#endif + int ival = CV_DESCALE(src[x] * iw00 + src[x + cn] * iw01 + + src[x + step] * iw10 + src[x + step + cn] * 
iw11, W_BITS1 - 5); + int ixval = CV_DESCALE(dsrc[0] * iw00 + dsrc[cn2] * iw01 + + dsrc[dstep] * iw10 + dsrc[dstep + cn2] * iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1] * iw00 + dsrc[cn2 + 1] * iw01 + dsrc[dstep + 1] * iw10 + + dsrc[dstep + cn2 + 1] * iw11, W_BITS1); + + Iptr[x] = (short)ival; + dIptr[0] = (short)ixval; + dIptr[1] = (short)iyval; } +#endif + } - cv::Mat residualMat = cv::Mat::zeros(winSize.height * (winSize.width + 8) * cn, 1, CV_16SC1); + cv::Mat residualMat = cv::Mat::zeros(winSize.height * (winSize.width + 8) * cn, 1, CV_16SC1); - cv::Point2f backUpNextPt = nextPt; - nextPt -= halfWin; - int j; + cv::Point2f backUpNextPt = nextPt; + nextPt -= halfWin; + int j; #ifdef RLOF_SSE - __m128i mmMask0, mmMask1, mmMask; - getWBitMask(winSize.width, mmMask0, mmMask1, mmMask); + __m128i mmMask0, mmMask1, mmMask; + getWBitMask(winSize.width, mmMask0, mmMask1, mmMask); #endif - float MEstimatorScale = 1; - int buffIdx = 0; - cv::Point2f prevDelta(0, 0); + float MEstimatorScale = 1; + int buffIdx = 0; + cv::Point2f prevDelta(0, 0); - for (j = 0; j < criteria.maxCount; j++) - { - inextPt.x = cvFloor(nextPt.x); - inextPt.y = cvFloor(nextPt.y); + for (j = 0; j < criteria.maxCount; j++) + { + inextPt.x = cvFloor(nextPt.x); + inextPt.y = cvFloor(nextPt.y); - if (inextPt.x < -winSize.width || inextPt.x >= J.cols || - inextPt.y < -winSize.height || inextPt.y >= J.rows) - { - if (level == 0 && status) - status[ptidx] = 3; - if (level > 0) - nextPts[ptidx] = backUpNextPt; - break; - } + if (inextPt.x < -winSize.width || inextPt.x >= J.cols || + inextPt.y < -winSize.height || inextPt.y >= J.rows) + { + if (level == 0 && status) + status[ptidx] = 3; + if (level > 0) + nextPts[ptidx] = backUpNextPt; + break; + } - a = nextPt.x - inextPt.x; - b = nextPt.y - inextPt.y; - iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); - iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); - iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); - iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - b1 = 0; - b2 
= 0; + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + b1 = 0; + b2 = 0; - if (j == 0 ) - { - A11 = 0; - A12 = 0; - A22 = 0; - } + if (j == 0 ) + { + A11 = 0; + A12 = 0; + A22 = 0; + } - if (j == 0 ) + if (j == 0 ) + { + buffIdx = 0; + for (y = 0; y < winSize.height; y++) { - buffIdx = 0; - for (y = 0; y < winSize.height; y++) + const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x)*cn; + const short* Iptr = (const short*)(IWinBuf.data + (y)*IWinBuf.step); + const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + x = 0; + for (; x < winSize.width*cn; x++, dIptr += 2) { - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x)*cn; - const short* Iptr = (const short*)(IWinBuf.data + (y)*IWinBuf.step); - const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - x = 0; - for (; x < winSize.width*cn; x++, dIptr += 2) - { - if (maskPtr[x] == 0) - continue; - int diff = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + Jptr[x + step] * iw10 + Jptr[x + step + cn] * iw11, W_BITS1 - 5) - Iptr[x]; - residualMat.at(buffIdx++) = static_cast(diff); - } + if (maskPtr[x] == 0) + continue; + int diff = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + Jptr[x + step] * iw10 + Jptr[x + step + cn] * iw11, W_BITS1 - 5) - Iptr[x]; + residualMat.at(buffIdx++) = static_cast(diff); } - /*! Estimation for the residual */ - cv::Mat residualMatRoi(residualMat, cv::Rect(0, 0, 1, buffIdx)); - MEstimatorScale = (buffIdx == 0) ? 1.f : estimateScale(residualMatRoi); } + /*! 
Estimation for the residual */ + cv::Mat residualMatRoi(residualMat, cv::Rect(0, 0, 1, buffIdx)); + MEstimatorScale = (buffIdx == 0) ? 1.f : estimateScale(residualMatRoi); + } - float eta = 1.f / winArea; - float fParam0 = param[0] * 32.f; - float fParam1 = param[1] * 32.f; - fParam0 = param[0] * MEstimatorScale; - fParam1 = param[1] * MEstimatorScale; + float eta = 1.f / winArea; + float fParam0 = param[0] * 32.f; + float fParam1 = param[1] * 32.f; + fParam0 = param[0] * MEstimatorScale; + fParam1 = param[1] * MEstimatorScale; #ifdef RLOF_SSE - qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); - qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); - __m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps(); - __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps(); - __m128i mmParam0 = _mm_set1_epi16(MIN(std::numeric_limits::max() - 1, static_cast(fParam0))); - __m128i mmParam1 = _mm_set1_epi16(MIN(std::numeric_limits::max() - 1, static_cast(fParam1))); - float s2Val = param[2] > 0 ? param[2] : -param[2]; - int s2bitShift = param[2] == 0 ? 1 : static_cast(ceil(log(200.f / s2Val) / log(2.f))); - __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast(param[2] * (float)(1 << s2bitShift))); - __m128i mmOness_epi16 = _mm_set1_epi16(1 << s2bitShift); - __m128 mmParam2s = _mm_set1_ps(0.01f * param[2]); - __m128 mmParam2s2 = _mm_set1_ps(param[2] * param[2]); - __m128 mmOnes = _mm_set1_ps(1.f); - __m128i mmEta = _mm_setzero_si128(); - __m128i mmScale = _mm_set1_epi16(static_cast(MEstimatorScale)); + qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps(); + __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps(); + __m128i mmParam0 = _mm_set1_epi16(MIN(std::numeric_limits::max() - 1, static_cast(fParam0))); + __m128i mmParam1 = _mm_set1_epi16(MIN(std::numeric_limits::max() - 1, static_cast(fParam1))); + float s2Val = param[2] > 0 ? 
param[2] : -param[2]; + int s2bitShift = param[2] == 0 ? 1 : static_cast(ceil(log(200.f / s2Val) / log(2.f))); + __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast(param[2] * (float)(1 << s2bitShift))); + __m128i mmOness_epi16 = _mm_set1_epi16(1 << s2bitShift); + __m128 mmParam2s = _mm_set1_ps(0.01f * param[2]); + __m128 mmParam2s2 = _mm_set1_ps(param[2] * param[2]); + __m128 mmOnes = _mm_set1_ps(1.f); + __m128i mmEta = _mm_setzero_si128(); + __m128i mmScale = _mm_set1_epi16(static_cast(MEstimatorScale)); #endif - buffIdx = 0; - for (y = 0; y < winSize.height; y++) - { - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x)*cn; - const short* Iptr = (const short*)(IWinBuf.data + (y)*IWinBuf.step); - const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); + buffIdx = 0; + for (y = 0; y < winSize.height; y++) + { + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x)*cn; + const short* Iptr = (const short*)(IWinBuf.data + (y)*IWinBuf.step); + const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); - x = 0; + x = 0; #ifdef RLOF_SSE - for (; x <= winSize.width*cn; x += 8, dIptr += 8 * 2) - { - __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16); - __m128i diff0, diff1; - __m128i I_0_7_epi16 = _mm_loadu_si128((const __m128i*)(Iptr + x)); // von element 0 bis 7 - - __m128i v00 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x)), z); - __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); - __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); - __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); - - __m128i t0 = 
_mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); - __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); - t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1 - 5); - t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1 - 5); - - __m128i mmDiff_epi16 = _mm_subs_epi16(_mm_packs_epi32(t0, t1), I_0_7_epi16); - - mmDiff_epi16 = _mm_and_si128(mmDiff_epi16, mask_0_7_epi16); - - - __m128i scalediffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, mmScale); - mmEta = _mm_add_epi16(mmEta, _mm_add_epi16(_mm_and_si128(scalediffIsPos_epi16, _mm_set1_epi16(2)), _mm_set1_epi16(-1))); - - - __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... - __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); - - __m128i abs_epi16 = _mm_abs_epi16(mmDiff_epi16); - __m128i bSet2_epi16, bSet1_epi16; - // |It| < sigma1 ? - bSet2_epi16 = _mm_cmplt_epi16(abs_epi16, mmParam1); - // It > 0 ? - __m128i diffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, _mm_setzero_si128()); - // sigma0 < |It| < sigma1 ? - bSet1_epi16 = _mm_and_si128(bSet2_epi16, _mm_cmpgt_epi16(abs_epi16, mmParam0)); - // val = |It| -/+ sigma1 - __m128i tmpParam1_epi16 = _mm_add_epi16(_mm_and_si128(diffIsPos_epi16, _mm_sub_epi16(mmDiff_epi16, mmParam1)), - _mm_andnot_si128(diffIsPos_epi16, _mm_add_epi16(mmDiff_epi16, mmParam1))); - // It == 0 ? |It| > sigma13 - mmDiff_epi16 = _mm_and_si128(bSet2_epi16, mmDiff_epi16); - // It == val ? 
sigma0 < |It| < sigma1 - mmDiff_epi16 = _mm_blendv_epi8(mmDiff_epi16, tmpParam1_epi16, bSet1_epi16); - - - __m128i tale_epi16_ = _mm_blendv_epi8(mmOness_epi16, mmParam2_epi16, bSet1_epi16); // mask for 0 - 3 - // diff = diff * sigma2 - __m128i lo = _mm_mullo_epi16(tale_epi16_, mmDiff_epi16); - __m128i hi = _mm_mulhi_epi16(tale_epi16_, mmDiff_epi16); - __m128i diff_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), s2bitShift); //diff 0_3_epi32 - __m128i diff_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), s2bitShift); // diff 4_7_epi32 - - mmDiff_epi16 = _mm_packs_epi32(diff_0_3_epi32, diff_4_7_epi32); - diff1 = _mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13... - diff0 = _mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9... - - // Ix * diff / Iy * diff - v10 = _mm_mullo_epi16(Ixy_0, diff0); - v11 = _mm_mulhi_epi16(Ixy_0, diff0); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); - qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); - - // It * Ix It * Iy [4 ... 
7] - // for set 1 hi sigma 1 - - v10 = _mm_mullo_epi16(Ixy_1, diff1); - v11 = _mm_mulhi_epi16(Ixy_1, diff1); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); - qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); + for (; x <= winSize.width*cn; x += 8, dIptr += 8 * 2) + { + __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16); + __m128i diff0, diff1; + __m128i I_0_7_epi16 = _mm_loadu_si128((const __m128i*)(Iptr + x)); // von element 0 bis 7 + __m128i v00 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x)), z); + __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); + __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); + __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); - if (j == 0) - { - __m128 bSet1_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet1_epi16)); - __m128 bSet1_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet1_epi16, bSet1_epi16), 16)); - __m128 mask_0_4_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mask_0_7_epi16)); - __m128 mask_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mask_0_7_epi16, mask_0_7_epi16), 16))); + __m128i t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1 - 5); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1 - 5); - __m128 bSet2_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet2_epi16)); - __m128 bSet2_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet2_epi16, bSet2_epi16), 16)); + __m128i mmDiff_epi16 = _mm_subs_epi16(_mm_packs_epi32(t0, t1), I_0_7_epi16); - __m128 
tale_0_3_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_0_3_ps); - __m128 tale_4_7_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_4_7_ps); - tale_0_3_ps = _mm_blendv_ps(mmParam2s, tale_0_3_ps, bSet2_0_3_ps); - tale_4_7_ps = _mm_blendv_ps(mmParam2s, tale_4_7_ps, bSet2_4_7_ps); + mmDiff_epi16 = _mm_and_si128(mmDiff_epi16, mask_0_7_epi16); - tale_0_3_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_0_3_ps, mask_0_4_ps); - tale_4_7_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_4_7_ps, mask_4_7_ps); - t0 = _mm_srai_epi32(Ixy_0, 16); // Iy0 Iy1 Iy2 Iy3 - t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16); // Ix0 Ix1 Ix2 Ix3 + __m128i scalediffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, mmScale); + mmEta = _mm_add_epi16(mmEta, _mm_add_epi16(_mm_and_si128(scalediffIsPos_epi16, _mm_set1_epi16(2)), _mm_set1_epi16(-1))); - __m128 fy = _mm_cvtepi32_ps(t0); - __m128 fx = _mm_cvtepi32_ps(t1); - // A11 - A22 - __m128 fxtale = _mm_mul_ps(fx, tale_0_3_ps); - __m128 fytale = _mm_mul_ps(fy, tale_0_3_ps); + __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... + __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); - mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); - mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); - mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); + __m128i abs_epi16 = _mm_abs_epi16(mmDiff_epi16); + __m128i bSet2_epi16, bSet1_epi16; + // |It| < sigma1 ? + bSet2_epi16 = _mm_cmplt_epi16(abs_epi16, mmParam1); + // It > 0 ? + __m128i diffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, _mm_setzero_si128()); + // sigma0 < |It| < sigma1 ? + bSet1_epi16 = _mm_and_si128(bSet2_epi16, _mm_cmpgt_epi16(abs_epi16, mmParam0)); + // val = |It| -/+ sigma1 + __m128i tmpParam1_epi16 = _mm_add_epi16(_mm_and_si128(diffIsPos_epi16, _mm_sub_epi16(mmDiff_epi16, mmParam1)), + _mm_andnot_si128(diffIsPos_epi16, _mm_add_epi16(mmDiff_epi16, mmParam1))); + // It == 0 ? |It| > sigma13 + mmDiff_epi16 = _mm_and_si128(bSet2_epi16, mmDiff_epi16); + // It == val ? 
sigma0 < |It| < sigma1 + mmDiff_epi16 = _mm_blendv_epi8(mmDiff_epi16, tmpParam1_epi16, bSet1_epi16); - t0 = _mm_srai_epi32(Ixy_1, 16); // Iy8 Iy9 Iy10 Iy11 - t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_1, 16), 16); // Ix0 Ix1 Ix2 Ix3 - fy = _mm_cvtepi32_ps(t0); - fx = _mm_cvtepi32_ps(t1); + __m128i tale_epi16_ = _mm_blendv_epi8(mmOness_epi16, mmParam2_epi16, bSet1_epi16); // mask for 0 - 3 + // diff = diff * sigma2 + __m128i lo = _mm_mullo_epi16(tale_epi16_, mmDiff_epi16); + __m128i hi = _mm_mulhi_epi16(tale_epi16_, mmDiff_epi16); + __m128i diff_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), s2bitShift); //diff 0_3_epi32 + __m128i diff_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), s2bitShift); // diff 4_7_epi32 - // A11 - A22 - fxtale = _mm_mul_ps(fx, tale_4_7_ps); - fytale = _mm_mul_ps(fy, tale_4_7_ps); + mmDiff_epi16 = _mm_packs_epi32(diff_0_3_epi32, diff_4_7_epi32); + diff1 = _mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13... + diff0 = _mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9... - mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); - mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); - mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); - } - } -#else - for (; x < winSize.width*cn; x++, dIptr += 2) + // Ix * diff / Iy * diff + v10 = _mm_mullo_epi16(Ixy_0, diff0); + v11 = _mm_mulhi_epi16(Ixy_0, diff0); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); + qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); + + // It * Ix It * Iy [4 ... 
7] + // for set 1 hi sigma 1 + + v10 = _mm_mullo_epi16(Ixy_1, diff1); + v11 = _mm_mulhi_epi16(Ixy_1, diff1); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); + qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); + + + if (j == 0) { - if (maskPtr[x] == 0) - continue; - int diff = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + - Jptr[x + step] * iw10 + Jptr[x + step + cn] * iw11, - W_BITS1 - 5) - Iptr[x]; + __m128 bSet1_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet1_epi16)); + __m128 bSet1_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet1_epi16, bSet1_epi16), 16)); + __m128 mask_0_4_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mask_0_7_epi16)); + __m128 mask_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mask_0_7_epi16, mask_0_7_epi16), 16))); + __m128 bSet2_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet2_epi16)); + __m128 bSet2_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet2_epi16, bSet2_epi16), 16)); + __m128 tale_0_3_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_0_3_ps); + __m128 tale_4_7_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_4_7_ps); + tale_0_3_ps = _mm_blendv_ps(mmParam2s, tale_0_3_ps, bSet2_0_3_ps); + tale_4_7_ps = _mm_blendv_ps(mmParam2s, tale_4_7_ps, bSet2_4_7_ps); + tale_0_3_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_0_3_ps, mask_0_4_ps); + tale_4_7_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_4_7_ps, mask_4_7_ps); + t0 = _mm_srai_epi32(Ixy_0, 16); // Iy0 Iy1 Iy2 Iy3 + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16); // Ix0 Ix1 Ix2 Ix3 - if (diff > MEstimatorScale) - MEstimatorScale += eta; - if (diff < MEstimatorScale) - MEstimatorScale -= eta; + __m128 fy = _mm_cvtepi32_ps(t0); + __m128 fx = _mm_cvtepi32_ps(t1); - int abss = (diff < 0) ? 
-diff : diff; - if (abss > fParam1) - { - diff = 0; - } - else if (abss > fParam0 && diff >= 0) - { - //diff = fParam1 * (diff - fParam1); - diff = static_cast(param[2] * (diff - fParam1)); - } - else if (abss > fParam0 && diff < 0) - { - //diff = fParam1 * (diff + fParam1); - diff = static_cast(param[2] * (diff + fParam1)); + // A11 - A22 + __m128 fxtale = _mm_mul_ps(fx, tale_0_3_ps); + __m128 fytale = _mm_mul_ps(fy, tale_0_3_ps); - } + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); + mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); + mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); - float ixval = (float)(dIptr[0]); - float iyval = (float)(dIptr[1]); - b1 += (float)(diff*ixval); - b2 += (float)(diff*iyval); + t0 = _mm_srai_epi32(Ixy_1, 16); // Iy8 Iy9 Iy10 Iy11 + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_1, 16), 16); // Ix0 Ix1 Ix2 Ix3 - if ( j == 0) - { - float tale = param[2] * FLT_RESCALE; - if (abss < fParam0) - { - tale = FLT_RESCALE; - } - else if (abss > fParam1) - { - tale *= 0.01f; - } - else - { - tale *= param[2]; - } - A11 += (float)(ixval*ixval*tale); - A12 += (float)(ixval*iyval*tale); - A22 += (float)(iyval*iyval*tale); - } + fy = _mm_cvtepi32_ps(t0); + fx = _mm_cvtepi32_ps(t1); + // A11 - A22 + fxtale = _mm_mul_ps(fx, tale_4_7_ps); + fytale = _mm_mul_ps(fy, tale_4_7_ps); + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); + mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); + mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); } -#endif } +#else + for (; x < winSize.width*cn; x++, dIptr += 2) + { + if (maskPtr[x] == 0) + continue; + int diff = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + + Jptr[x + step] * iw10 + Jptr[x + step + cn] * iw11, + W_BITS1 - 5) - Iptr[x]; -#ifdef RLOF_SSE - short etaValues[8]; - _mm_storeu_si128((__m128i*)(etaValues), mmEta); - MEstimatorScale += eta * (etaValues[0] + etaValues[1] + etaValues[2] + etaValues[3] - + etaValues[4] + etaValues[5] + etaValues[6] + etaValues[7]); -#endif - if (j == 0) - { -#ifdef 
RLOF_SSE - float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];// - _mm_store_ps(A11buf, mmAxx); - _mm_store_ps(A12buf, mmAxy); - _mm_store_ps(A22buf, mmAyy); + if (diff > MEstimatorScale) + MEstimatorScale += eta; + if (diff < MEstimatorScale) + MEstimatorScale -= eta; - A11 = A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; - A12 = A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; - A22 = A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; -#endif - A11 *= FLT_SCALE; - A12 *= FLT_SCALE; - A22 *= FLT_SCALE; + int abss = (diff < 0) ? -diff : diff; + if (abss > fParam1) + { + diff = 0; + } + else if (abss > fParam0 && diff >= 0) + { + //diff = fParam1 * (diff - fParam1); + diff = static_cast(param[2] * (diff - fParam1)); + } + else if (abss > fParam0 && diff < 0) + { + //diff = fParam1 * (diff + fParam1); + diff = static_cast(param[2] * (diff + fParam1)); + } - D = A11 * A22 - A12 * A12; - minEig = (A22 + A11 - std::sqrt((A11 - A22)*(A11 - A22) + - 4.f*A12*A12)) / (2 * winArea); - D = 1.f / D; + float ixval = (float)(dIptr[0]); + float iyval = (float)(dIptr[1]); + b1 += (float)(diff*ixval); + b2 += (float)(diff*iyval); - if (minEig < minEigThreshold || std::abs(D) < FLT_EPSILON) + if ( j == 0) { - if (level == 0 && status) - status[ptidx] = 0; - if (level > 0) + float tale = param[2] * FLT_RESCALE; + if (abss < fParam0) + { + tale = FLT_RESCALE; + } + else if (abss > fParam1) + { + tale *= 0.01f; + } + else { - nextPts[ptidx] = backUpNextPt; + tale *= param[2]; } - break; + A11 += (float)(ixval*ixval*tale); + A12 += (float)(ixval*iyval*tale); + A22 += (float)(iyval*iyval*tale); } + + } +#endif + } #ifdef RLOF_SSE - float CV_DECL_ALIGNED(16) bbuf[4]; - _mm_store_ps(bbuf, _mm_add_ps(qb0, qb1)); - b1 += bbuf[0] + bbuf[2]; - b2 += bbuf[1] + bbuf[3]; + short etaValues[8]; + _mm_storeu_si128((__m128i*)(etaValues), mmEta); + MEstimatorScale += eta * (etaValues[0] + etaValues[1] + etaValues[2] + etaValues[3] + + etaValues[4] + etaValues[5] + etaValues[6] + etaValues[7]); + + 
#endif + if (j == 0) + { +#ifdef RLOF_SSE + float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];// - b1 *= FLT_SCALE; - b2 *= FLT_SCALE; + _mm_store_ps(A11buf, mmAxx); + _mm_store_ps(A12buf, mmAxy); + _mm_store_ps(A22buf, mmAyy); - Point2f delta((float)((A12*b2 - A22 * b1) * D), (float)((A12*b1 - A11 * b2) * D)); + A11 = A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; + A12 = A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; + A22 = A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; +#endif + A11 *= FLT_SCALE; + A12 *= FLT_SCALE; + A22 *= FLT_SCALE; - delta.x = (delta.x != delta.x) ? 0 : delta.x; - delta.y = (delta.y != delta.y) ? 0 : delta.y; - nextPt += delta * 0.7; - nextPts[ptidx] = nextPt + halfWin; + D = A11 * A22 - A12 * A12; + minEig = (A22 + A11 - std::sqrt((A11 - A22)*(A11 - A22) + + 4.f*A12*A12)) / (2 * winArea); + D = 1.f / D; - if (j > 0 && std::abs(delta.x - prevDelta.x) < 0.01 && - std::abs(delta.y - prevDelta.y) < 0.01) + if (minEig < minEigThreshold || std::abs(D) < FLT_EPSILON) { - nextPts[ptidx] -= delta * 0.5f; + if (level == 0 && status) + status[ptidx] = 0; + if (level > 0) + { + nextPts[ptidx] = backUpNextPt; + } break; } + } + +#ifdef RLOF_SSE + float CV_DECL_ALIGNED(16) bbuf[4]; + _mm_store_ps(bbuf, _mm_add_ps(qb0, qb1)); + b1 += bbuf[0] + bbuf[2]; + b2 += bbuf[1] + bbuf[3]; +#endif + + b1 *= FLT_SCALE; + b2 *= FLT_SCALE; - prevDelta = delta; + Point2f delta((float)((A12*b2 - A22 * b1) * D), (float)((A12*b1 - A11 * b2) * D)); + + delta.x = (delta.x != delta.x) ? 0 : delta.x; + delta.y = (delta.y != delta.y) ? 
0 : delta.y; + + nextPt += delta * 0.7; + nextPts[ptidx] = nextPt + halfWin; + + if (j > 0 && std::abs(delta.x - prevDelta.x) < 0.01 && + std::abs(delta.y - prevDelta.y) < 0.01) + { + nextPts[ptidx] -= delta * 0.5f; + break; } + prevDelta = delta; + } } - const Mat* prevImg; - const Mat* nextImg; - const Mat* prevDeriv; - const Mat* rgbPrevImg; - const Mat* rgbNextImg; - const Point2f* prevPts; - Point2f* nextPts; - uchar* status; - float* err; - int maxWinSize; - int minWinSize; - TermCriteria criteria; - int level; - int maxLevel; - int windowType; - float minEigThreshold; - bool useInitialFlow; - std::vector paramReset; - int crossSegmentationThreshold; - }; -} -namespace radial + } + + const Mat* prevImg; + const Mat* nextImg; + const Mat* prevDeriv; + const Mat* rgbPrevImg; + const Mat* rgbNextImg; + const Point2f* prevPts; + Point2f* nextPts; + uchar* status; + float* err; + int maxWinSize; + int minWinSize; + TermCriteria criteria; + int level; + int maxLevel; + int windowType; + float minEigThreshold; + bool useInitialFlow; + std::vector paramReset; + int crossSegmentationThreshold; +}; + +} // namespace +namespace radial { + +class TrackerInvoker : public cv::ParallelLoopBody { - class TrackerInvoker : public cv::ParallelLoopBody +public: + TrackerInvoker( + const Mat& _prevImg, + const Mat& _prevDeriv, + const Mat& _nextImg, + const Mat& _rgbPrevImg, + const Mat& _rgbNextImg, + const Point2f* _prevPts, + Point2f* _nextPts, + uchar* _status, + float* _err, + Point2f* _gainVecs, + int _level, + int _maxLevel, + int _winSize[2], + int _maxIteration, + bool _useInitialFlow, + int _supportRegionType, + std::vector _normSigmas, + float _minEigenValue, + int _crossSegmentationThreshold) + { + prevImg = &_prevImg; + prevDeriv = &_prevDeriv; + nextImg = &_nextImg; + rgbPrevImg = &_rgbPrevImg; + rgbNextImg = &_rgbNextImg; + prevPts = _prevPts; + nextPts = _nextPts; + status = _status; + err = _err; + gainVecs = _gainVecs; + minWinSize = _winSize[0]; + maxWinSize = 
_winSize[1]; + criteria.maxCount = _maxIteration; + criteria.epsilon = 0.01; + level = _level; + maxLevel = _maxLevel; + windowType = _supportRegionType; + minEigThreshold = _minEigenValue; + paramReset = _normSigmas; + useInitialFlow = _useInitialFlow; + crossSegmentationThreshold = _crossSegmentationThreshold; + } + + TrackerInvoker & operator=(const TrackerInvoker &) { return *this; }; + void operator()(const cv::Range& range) const CV_OVERRIDE { - public: - TrackerInvoker( - const Mat& _prevImg, - const Mat& _prevDeriv, - const Mat& _nextImg, - const Mat& _rgbPrevImg, - const Mat& _rgbNextImg, - const Point2f* _prevPts, - Point2f* _nextPts, - uchar* _status, - float* _err, - Point2f* _gainVecs, - int _level, - int _maxLevel, - int _winSize[2], - int _maxIteration, - bool _useInitialFlow, - int _supportRegionType, - std::vector _normSigmas, - float _minEigenValue, - int _crossSegmentationThreshold) - { - prevImg = &_prevImg; - prevDeriv = &_prevDeriv; - nextImg = &_nextImg; - rgbPrevImg = &_rgbPrevImg; - rgbNextImg = &_rgbNextImg; - prevPts = _prevPts; - nextPts = _nextPts; - status = _status; - err = _err; - gainVecs = _gainVecs; - minWinSize = _winSize[0]; - maxWinSize = _winSize[1]; - criteria.maxCount = _maxIteration; - criteria.epsilon = 0.01; - level = _level; - maxLevel = _maxLevel; - windowType = _supportRegionType; - minEigThreshold = _minEigenValue; - paramReset = _normSigmas; - useInitialFlow = _useInitialFlow; - crossSegmentationThreshold = _crossSegmentationThreshold; - } - - TrackerInvoker & operator=(const TrackerInvoker &) { return *this; }; - void operator()(const cv::Range& range) const CV_OVERRIDE - { #ifdef DEBUG_INVOKER - printf("rlof::radial");fflush(stdout); + printf("rlof::radial");fflush(stdout); #endif - Point2f halfWin; - cv::Size winSize; - const Mat& I = *prevImg; - const Mat& J = *nextImg; - const Mat& derivI = *prevDeriv; - const Mat& BI = *rgbPrevImg; + Point2f halfWin; + cv::Size winSize; + const Mat& I = *prevImg; + const Mat& J 
= *nextImg; + const Mat& derivI = *prevDeriv; + const Mat& BI = *rgbPrevImg; - const float FLT_SCALE = (1.f / (1 << 16)); + const float FLT_SCALE = (1.f / (1 << 16)); - winSize = cv::Size(maxWinSize, maxWinSize); - int winMaskwidth = static_cast(ceil(winSize.width / 16.f)) * 16; - cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); - winMaskMatBuf.setTo(1); + winSize = cv::Size(maxWinSize, maxWinSize); + int winMaskwidth = static_cast(ceil(winSize.width / 16.f)) * 16; + cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); + winMaskMatBuf.setTo(1); - std::vector param = paramReset; - int cn = I.channels(), cn2 = cn * 2; - int winbufwidth = static_cast(ceil(winSize.width / 16.f)) * 16; - cv::Size winBufSize(winbufwidth, winbufwidth); + std::vector param = paramReset; + int cn = I.channels(), cn2 = cn * 2; + int winbufwidth = static_cast(ceil(winSize.width / 16.f)) * 16; + cv::Size winBufSize(winbufwidth, winbufwidth); - cv::Mat invTensorMat(4, 4, CV_32FC1); - cv::Mat mismatchMat(4, 1, CV_32FC1); - cv::Mat resultMat(4, 1, CV_32FC1); + cv::Mat invTensorMat(4, 4, CV_32FC1); + cv::Mat mismatchMat(4, 1, CV_32FC1); + cv::Mat resultMat(4, 1, CV_32FC1); - std::vector _buf(winBufSize.area()*(cn + cn2)); - Mat IWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn), &_buf[0]); - Mat derivIWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn2), &_buf[winBufSize.area()*cn]); + std::vector _buf(winBufSize.area()*(cn + cn2)); + Mat IWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn), &_buf[0]); + Mat derivIWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn2), &_buf[winBufSize.area()*cn]); - for (int ptidx = range.start; ptidx < range.end; ptidx++) - { + for (int ptidx = range.start; ptidx < range.end; ptidx++) + { - Point2f prevPt = prevPts[ptidx] * (float)(1. / (1 << level)); - Point2f nextPt; - if (level == maxLevel) + Point2f prevPt = prevPts[ptidx] * (float)(1. 
/ (1 << level)); + Point2f nextPt; + if (level == maxLevel) + { + if (useInitialFlow) { - if (useInitialFlow) - { - nextPt = nextPts[ptidx] * (float)(1. / (1 << level)); - } - else - nextPt = prevPt; + nextPt = nextPts[ptidx] * (float)(1. / (1 << level)); } else - { - nextPt = nextPts[ptidx] * 2.f; + nextPt = prevPt; + } + else + { + nextPt = nextPts[ptidx] * 2.f; - } - nextPts[ptidx] = nextPt; - - Point2i iprevPt, inextPt; - iprevPt.x = cvFloor(prevPt.x); - iprevPt.y = cvFloor(prevPt.y); - int winArea = maxWinSize * maxWinSize; - cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0, 0, maxWinSize, maxWinSize)); - winMaskMatBuf.setTo(0); - if (calcWinMaskMat(BI, windowType, iprevPt, - winMaskMat, winSize, halfWin, winArea, - minWinSize, maxWinSize, crossSegmentationThreshold) == false) - continue; - - prevPt -= halfWin; - iprevPt.x = cvFloor(prevPt.x); - iprevPt.y = cvFloor(prevPt.y); - - if (iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols || - iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows) + } + nextPts[ptidx] = nextPt; + + Point2i iprevPt, inextPt; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + int winArea = maxWinSize * maxWinSize; + cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0, 0, maxWinSize, maxWinSize)); + winMaskMatBuf.setTo(0); + if (calcWinMaskMat(BI, windowType, iprevPt, + winMaskMat, winSize, halfWin, winArea, + minWinSize, maxWinSize, crossSegmentationThreshold) == false) + continue; + + prevPt -= halfWin; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + + if (iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols || + iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows) + { + if (level == 0) { - if (level == 0) - { - if (status) - status[ptidx] = 3; - if (err) - err[ptidx] = 0; - } - continue; + if (status) + status[ptidx] = 3; + if (err) + err[ptidx] = 0; } + continue; + } - float a = prevPt.x - iprevPt.x; - float b = prevPt.y - iprevPt.y; - const int W_BITS = 14, W_BITS1 = 14; - - int iw00 = 
cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); - int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); - int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); - int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - - int dstep = (int)(derivI.step / derivI.elemSize1()); - int step = (int)(I.step / I.elemSize1()); - int mStep = (int)(winMaskMat.step / winMaskMat.elemSize1()); - CV_Assert(step == (int)(J.step / J.elemSize1())); - - float A11 = 0, A12 = 0, A22 = 0; - float b1 = 0, b2 = 0, b3 = 0, b4 = 0; - // tensor - float sumIx = 0; - float sumIy = 0; - float sumI = 0; - float sumW = 0; - float w1 = 0, w2 = 0; // -IyI - float dI = 0; // I^2 - float D = 0; + float a = prevPt.x - iprevPt.x; + float b = prevPt.y - iprevPt.y; + const int W_BITS = 14, W_BITS1 = 14; + + int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + + int dstep = (int)(derivI.step / derivI.elemSize1()); + int step = (int)(I.step / I.elemSize1()); + int mStep = (int)(winMaskMat.step / winMaskMat.elemSize1()); + CV_Assert(step == (int)(J.step / J.elemSize1())); + + float A11 = 0, A12 = 0, A22 = 0; + float b1 = 0, b2 = 0, b3 = 0, b4 = 0; + // tensor + float sumIx = 0; + float sumIy = 0; + float sumI = 0; + float sumW = 0; + float w1 = 0, w2 = 0; // -IyI + float dI = 0; // I^2 + float D = 0; #ifdef RLOF_SSE - __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); - __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); - __m128i z = _mm_setzero_si128(); - __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1 - 1)); - __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1 - 5 - 1)); - __m128i mmMask4_epi32; - __m128i mmMaskSet_epi16 = _mm_set1_epi16(std::numeric_limits::max()); - get4BitMask(winSize.width, mmMask4_epi32); + __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128i z = _mm_setzero_si128(); + __m128i qdelta_d = _mm_set1_epi32(1 << 
(W_BITS1 - 1)); + __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1 - 5 - 1)); + __m128i mmMask4_epi32; + __m128i mmMaskSet_epi16 = _mm_set1_epi16(std::numeric_limits::max()); + get4BitMask(winSize.width, mmMask4_epi32); #endif - // extract the patch from the first image, compute covariation matrix of derivatives - int x, y; - for (y = 0; y < winSize.height; y++) - { - const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; - const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; + // extract the patch from the first image, compute covariation matrix of derivatives + int x, y; + for (y = 0; y < winSize.height; y++) + { + const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; + const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; - short* Iptr = (short*)(IWinBuf.data + y * IWinBuf.step); - short* dIptr = (short*)(derivIWinBuf.data + y * derivIWinBuf.step); + short* Iptr = (short*)(IWinBuf.data + y * IWinBuf.step); + short* dIptr = (short*)(derivIWinBuf.data + y * derivIWinBuf.step); - x = 0; + x = 0; #ifdef RLOF_SSE - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - for (; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4 * 2, dIptr += 4 * 2) + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + for (; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4 * 2, dIptr += 4 * 2) + { + __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16); + __m128i mask_0_3_epi16 = _mm_unpacklo_epi16(mask_0_7_epi16, mask_0_7_epi16); + __m128i v00, v01, v10, v11, t0, t1; + v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); + v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); + v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); + v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + 
step + cn)), z); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1 - 5); + if (x + 4 > winSize.width) { - __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16); - __m128i mask_0_3_epi16 = _mm_unpacklo_epi16(mask_0_7_epi16, mask_0_7_epi16); - __m128i v00, v01, v10, v11, t0, t1; - v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); - v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); - v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); - v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); - - t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); - t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1 - 5); - if (x + 4 > winSize.width) - { - t0 = _mm_and_si128(t0, mmMask4_epi32); - } - t0 = _mm_and_si128(t0, mask_0_3_epi16); - _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0, t0)); - - - v00 = _mm_loadu_si128((const __m128i*)(dsrc)); - v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); - v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep)); - v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); - - t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); - t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); - t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); - t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); - v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... 
- if (x + 4 > winSize.width) - { - v00 = _mm_and_si128(v00, mmMask4_epi32); - } - v00 = _mm_and_si128(v00, mask_0_3_epi16); - _mm_storeu_si128((__m128i*)dIptr, v00); + t0 = _mm_and_si128(t0, mmMask4_epi32); + } + t0 = _mm_and_si128(t0, mask_0_3_epi16); + _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0, t0)); + + + v00 = _mm_loadu_si128((const __m128i*)(dsrc)); + v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); + v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep)); + v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); + v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... + if (x + 4 > winSize.width) + { + v00 = _mm_and_si128(v00, mmMask4_epi32); } + v00 = _mm_and_si128(v00, mask_0_3_epi16); + _mm_storeu_si128((__m128i*)dIptr, v00); + } #else - for (; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2) + for (; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2) + { + if (winMaskMat.at(y, x) == 0) { - if (winMaskMat.at(y, x) == 0) - { - dIptr[0] = 0; - dIptr[1] = 0; - continue; - } - int ival = CV_DESCALE(src[x] * iw00 + src[x + cn] * iw01 + - src[x + step] * iw10 + src[x + step + cn] * iw11, W_BITS1 - 5); - int ixval = CV_DESCALE(dsrc[0] * iw00 + dsrc[cn2] * iw01 + - dsrc[dstep] * iw10 + dsrc[dstep + cn2] * iw11, W_BITS1); - int iyval = CV_DESCALE(dsrc[1] * iw00 + dsrc[cn2 + 1] * iw01 + dsrc[dstep + 1] * iw10 + - dsrc[dstep + cn2 + 1] * iw11, W_BITS1); + dIptr[0] = 0; + dIptr[1] = 0; + continue; + } + int ival = CV_DESCALE(src[x] * iw00 + src[x + cn] * iw01 + + src[x + step] * iw10 + src[x + step + cn] * iw11, W_BITS1 - 5); + int ixval = CV_DESCALE(dsrc[0] * iw00 + dsrc[cn2] 
* iw01 + + dsrc[dstep] * iw10 + dsrc[dstep + cn2] * iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1] * iw00 + dsrc[cn2 + 1] * iw01 + dsrc[dstep + 1] * iw10 + + dsrc[dstep + cn2 + 1] * iw11, W_BITS1); - Iptr[x] = (short)ival; - dIptr[0] = (short)ixval; - dIptr[1] = (short)iyval; + Iptr[x] = (short)ival; + dIptr[0] = (short)ixval; + dIptr[1] = (short)iyval; - } -#endif } +#endif + } - cv::Mat residualMat = cv::Mat::zeros(winSize.height * (winSize.width + 8) * cn, 1, CV_16SC1); + cv::Mat residualMat = cv::Mat::zeros(winSize.height * (winSize.width + 8) * cn, 1, CV_16SC1); - cv::Point2f backUpNextPt = nextPt; - nextPt -= halfWin; - Point2f prevDelta(0, 0); //related to h(t-1) - Point2f prevGain(0, 0); - cv::Point2f gainVec = gainVecs[ptidx]; - cv::Point2f backUpGain = gainVec; - cv::Size _winSize = winSize; - int j; + cv::Point2f backUpNextPt = nextPt; + nextPt -= halfWin; + Point2f prevDelta(0, 0); //related to h(t-1) + Point2f prevGain(0, 0); + cv::Point2f gainVec = gainVecs[ptidx]; + cv::Point2f backUpGain = gainVec; + cv::Size _winSize = winSize; + int j; #ifdef RLOF_SSE - __m128i mmMask0, mmMask1, mmMask; - getWBitMask(_winSize.width, mmMask0, mmMask1, mmMask); - __m128 mmOnes = _mm_set1_ps(1.f); + __m128i mmMask0, mmMask1, mmMask; + getWBitMask(_winSize.width, mmMask0, mmMask1, mmMask); + __m128 mmOnes = _mm_set1_ps(1.f); #endif - float MEstimatorScale = 1; - int buffIdx = 0; - float minEigValue; + float MEstimatorScale = 1; + int buffIdx = 0; + float minEigValue; - for (j = 0; j < criteria.maxCount; j++) - { + for (j = 0; j < criteria.maxCount; j++) + { - inextPt.x = cvFloor(nextPt.x); - inextPt.y = cvFloor(nextPt.y); + inextPt.x = cvFloor(nextPt.x); + inextPt.y = cvFloor(nextPt.y); - if (inextPt.x < -winSize.width || inextPt.x >= J.cols || - inextPt.y < -winSize.height || inextPt.y >= J.rows) + if (inextPt.x < -winSize.width || inextPt.x >= J.cols || + inextPt.y < -winSize.height || inextPt.y >= J.rows) + { + if (level == 0 && status) + status[ptidx] = 3; + if 
(level > 0) { - if (level == 0 && status) - status[ptidx] = 3; - if (level > 0) - { - nextPts[ptidx] = backUpNextPt; - gainVecs[ptidx] = backUpGain; - } - break; + nextPts[ptidx] = backUpNextPt; + gainVecs[ptidx] = backUpGain; } + break; + } - a = nextPt.x - inextPt.x; - b = nextPt.y - inextPt.y; - iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); - iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); - iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); - iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - - // mismatch - b1 = 0; - b2 = 0; - b3 = 0; - b4 = 0; - if (j == 0) - { - // tensor - w1 = 0; // -IxI - w2 = 0; // -IyI - dI = 0; // I^2 - sumIx = 0; - sumIy = 0; - sumI = 0; - sumW = 0; - A11 = 0; - A12 = 0; - A22 = 0; - } + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + + // mismatch + b1 = 0; + b2 = 0; + b3 = 0; + b4 = 0; + if (j == 0) + { + // tensor + w1 = 0; // -IxI + w2 = 0; // -IyI + dI = 0; // I^2 + sumIx = 0; + sumIy = 0; + sumI = 0; + sumW = 0; + A11 = 0; + A12 = 0; + A22 = 0; + } - if (j == 0 ) + if (j == 0 ) + { + buffIdx = 0; + for (y = 0; y < winSize.height; y++) { - buffIdx = 0; - for (y = 0; y < winSize.height; y++) + const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x)*cn; + const short* Iptr = (const short*)(IWinBuf.data + (y)*IWinBuf.step); + const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); + x = 0; + for (; x < winSize.width*cn; x++, dIptr += 2) { - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x)*cn; - const short* Iptr = (const short*)(IWinBuf.data + (y)*IWinBuf.step); - const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); - x = 0; - for (; x < winSize.width*cn; x++, dIptr += 2) - { - if (dIptr[0] == 0 && dIptr[1] == 0) - continue; - int diff = 
static_cast(CV_DESCALE( Jptr[x] * iw00 + - Jptr[x + cn] * iw01 + - Jptr[x + step] * iw10 + - Jptr[x + step + cn] * iw11, W_BITS1 - 5) - - Iptr[x] + Iptr[x] * gainVec.x + gainVec.y); - residualMat.at(buffIdx++) = static_cast(diff); - } + if (dIptr[0] == 0 && dIptr[1] == 0) + continue; + int diff = static_cast(CV_DESCALE( Jptr[x] * iw00 + + Jptr[x + cn] * iw01 + + Jptr[x + step] * iw10 + + Jptr[x + step + cn] * iw11, W_BITS1 - 5) + - Iptr[x] + Iptr[x] * gainVec.x + gainVec.y); + residualMat.at(buffIdx++) = static_cast(diff); } - /*! Estimation for the residual */ - cv::Mat residualMatRoi(residualMat, cv::Rect(0, 0, 1, buffIdx)); - MEstimatorScale = (buffIdx == 0) ? 1.f : estimateScale(residualMatRoi); } + /*! Estimation for the residual */ + cv::Mat residualMatRoi(residualMat, cv::Rect(0, 0, 1, buffIdx)); + MEstimatorScale = (buffIdx == 0) ? 1.f : estimateScale(residualMatRoi); + } - float eta = 1.f / winArea; - float fParam0 = param[0] * 32.f; - float fParam1 = param[1] * 32.f; - fParam0 = param[0] * MEstimatorScale; - fParam1 = param[1] * MEstimatorScale; + float eta = 1.f / winArea; + float fParam0 = param[0] * 32.f; + float fParam1 = param[1] * 32.f; + fParam0 = param[0] * MEstimatorScale; + fParam1 = param[1] * MEstimatorScale; #ifdef RLOF_SSE - qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); - qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); - __m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps(), qb2 = _mm_setzero_ps(); - __m128 qb3 = _mm_setzero_ps(); - __m128 mmSumW1 = _mm_setzero_ps(), mmSumW2 = _mm_setzero_ps(); - __m128 mmSumI = _mm_setzero_ps(), mmSumW = _mm_setzero_ps(), mmSumDI = _mm_setzero_ps(); - __m128 mmSumIy = _mm_setzero_ps(), mmSumIx = _mm_setzero_ps(); - __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps(); - __m128i mmParam0 = _mm_set1_epi16(MIN(std::numeric_limits::max() - 1, static_cast(fParam0))); - __m128i mmParam1 = _mm_set1_epi16(MIN(std::numeric_limits::max() - 1, static_cast(fParam1))); - - - float s2Val = param[2] 
> 0 ? param[2] : -param[2]; - int s2bitShift = param[2] == 0 ? 1 : static_cast(ceil(log(200.f / s2Val) / log(2.f))); - __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast(param[2] * (float)(1 << s2bitShift))); - __m128i mmOness_epi16 = _mm_set1_epi16(1 << s2bitShift); - __m128 mmParam2s = _mm_set1_ps(0.01f * param[2]); - __m128 mmParam2s2 = _mm_set1_ps(param[2] * param[2]); - - float gainVal = gainVec.x > 0 ? gainVec.x : -gainVec.x; - int bitShift = gainVec.x == 0 ? 1 : static_cast(ceil(log(200.f / gainVal) / log(2.f))); - - __m128i mmGainValue_epi16 = _mm_set1_epi16(static_cast(gainVec.x * (float)(1 << bitShift))); - __m128i mmConstValue_epi16 = _mm_set1_epi16(static_cast(gainVec.y)); - __m128i mmEta = _mm_setzero_si128(); - __m128i mmScale = _mm_set1_epi16(static_cast(MEstimatorScale)); + qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps(), qb2 = _mm_setzero_ps(); + __m128 qb3 = _mm_setzero_ps(); + __m128 mmSumW1 = _mm_setzero_ps(), mmSumW2 = _mm_setzero_ps(); + __m128 mmSumI = _mm_setzero_ps(), mmSumW = _mm_setzero_ps(), mmSumDI = _mm_setzero_ps(); + __m128 mmSumIy = _mm_setzero_ps(), mmSumIx = _mm_setzero_ps(); + __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps(); + __m128i mmParam0 = _mm_set1_epi16(MIN(std::numeric_limits::max() - 1, static_cast(fParam0))); + __m128i mmParam1 = _mm_set1_epi16(MIN(std::numeric_limits::max() - 1, static_cast(fParam1))); + + + float s2Val = param[2] > 0 ? param[2] : -param[2]; + int s2bitShift = param[2] == 0 ? 1 : static_cast(ceil(log(200.f / s2Val) / log(2.f))); + __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast(param[2] * (float)(1 << s2bitShift))); + __m128i mmOness_epi16 = _mm_set1_epi16(1 << s2bitShift); + __m128 mmParam2s = _mm_set1_ps(0.01f * param[2]); + __m128 mmParam2s2 = _mm_set1_ps(param[2] * param[2]); + + float gainVal = gainVec.x > 0 ? 
gainVec.x : -gainVec.x; + int bitShift = gainVec.x == 0 ? 1 : static_cast(ceil(log(200.f / gainVal) / log(2.f))); + + __m128i mmGainValue_epi16 = _mm_set1_epi16(static_cast(gainVec.x * (float)(1 << bitShift))); + __m128i mmConstValue_epi16 = _mm_set1_epi16(static_cast(gainVec.y)); + __m128i mmEta = _mm_setzero_si128(); + __m128i mmScale = _mm_set1_epi16(static_cast(MEstimatorScale)); #endif - buffIdx = 0; + buffIdx = 0; - for (y = 0; y < _winSize.height; y++) - { - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x)*cn; - const short* Iptr = (const short*)(IWinBuf.data + y * IWinBuf.step); - const short* dIptr = (const short*)(derivIWinBuf.data + y * derivIWinBuf.step); + for (y = 0; y < _winSize.height; y++) + { + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x)*cn; + const short* Iptr = (const short*)(IWinBuf.data + y * IWinBuf.step); + const short* dIptr = (const short*)(derivIWinBuf.data + y * derivIWinBuf.step); - x = 0; + x = 0; #ifdef RLOF_SSE - for (; x <= _winSize.width*cn; x += 8, dIptr += 8 * 2) - { - __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16); - __m128i diff0, diff1; - __m128i I_0_7_epi16 = _mm_loadu_si128((const __m128i*)(Iptr + x)); // von element 0 bis 7 - __m128i v00 = _mm_unpacklo_epi8( - _mm_loadl_epi64((const __m128i*)(Jptr + x)) , z); //J0 , 0, J1, 0, J2, 0 ... J7,0 - __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); - __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); - __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); - - __m128i t0 = _mm_add_epi32 - (_mm_madd_epi16( - _mm_unpacklo_epi16(v00, v01), //J0, , J1, J1, J2, J2, ... 
J3 , J4 - qw0), - _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); - __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); - - t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1 - 5); - t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1 - 5); - - - __m128i lo = _mm_mullo_epi16(mmGainValue_epi16, I_0_7_epi16); - __m128i hi = _mm_mulhi_epi16(mmGainValue_epi16, I_0_7_epi16); - - __m128i Igain_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), bitShift); - __m128i Igain_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), bitShift); - __m128i Igain_epi16 = _mm_packs_epi32(Igain_0_3_epi32, Igain_4_7_epi32); - - // diff = J - I + I*gain.x + gain.y - __m128i mmDiff_epi16 = _mm_add_epi16( - _mm_subs_epi16(_mm_packs_epi32(t0, t1), I_0_7_epi16), // J - I - _mm_add_epi16(Igain_epi16, mmConstValue_epi16)); - - - mmDiff_epi16 = _mm_and_si128(mmDiff_epi16, mask_0_7_epi16); - - - __m128i scalediffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, mmScale); - mmEta = _mm_add_epi16(mmEta, _mm_add_epi16(_mm_and_si128(scalediffIsPos_epi16, _mm_set1_epi16(2)), _mm_set1_epi16(-1))); - - - __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 - __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); - - - __m128i abs_epi16 = _mm_abs_epi16(mmDiff_epi16); - __m128i bSet2_epi16, bSet1_epi16; - // |It| < sigma1 ? - bSet2_epi16 = _mm_cmplt_epi16(abs_epi16, mmParam1); - // It > 0 ? - __m128i diffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, _mm_setzero_si128()); - // sigma0 < |It| < sigma1 ? - bSet1_epi16 = _mm_and_si128(bSet2_epi16, _mm_cmpgt_epi16(abs_epi16, mmParam0)); - // val = |It| -/+ sigma1 - __m128i tmpParam1_epi16 = _mm_add_epi16(_mm_and_si128(diffIsPos_epi16, _mm_sub_epi16(mmDiff_epi16, mmParam1)), - _mm_andnot_si128(diffIsPos_epi16, _mm_add_epi16(mmDiff_epi16, mmParam1))); - // It == 0 ? 
|It| > sigma13 - mmDiff_epi16 = _mm_and_si128(bSet2_epi16, mmDiff_epi16); - // It == val ? sigma0 < |It| < sigma1 - mmDiff_epi16 = _mm_blendv_epi8(mmDiff_epi16, tmpParam1_epi16, bSet1_epi16); - - - __m128i tale_epi16_ = _mm_blendv_epi8(mmOness_epi16, mmParam2_epi16, bSet1_epi16); // mask for 0 - 3 - // diff = diff * sigma2 - lo = _mm_mullo_epi16(tale_epi16_, mmDiff_epi16); - hi = _mm_mulhi_epi16(tale_epi16_, mmDiff_epi16); - __m128i diff_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), s2bitShift); //diff 0_3_epi32 - __m128i diff_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), s2bitShift); // diff 4_7_epi32 - - mmDiff_epi16 = _mm_packs_epi32(diff_0_3_epi32, diff_4_7_epi32); // ,da das ergebniss kleiner als 16 bit sein sollte - - diff1 = _mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13... - diff0 = _mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9... - - // Ix * diff / Iy * diff - v10 = _mm_mullo_epi16(Ixy_0, diff0); - v11 = _mm_mulhi_epi16(Ixy_0, diff0); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); - qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); - - // It * Ix It * Iy [4 ... 7] - // for set 1 hi sigma 1 - - v10 = _mm_mullo_epi16(Ixy_1, diff1); - v11 = _mm_mulhi_epi16(Ixy_1, diff1); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); - qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); - // diff * J [0 ... 
7] - // for set 1 sigma 1 - // b3 += diff * Iptr[x] - - v10 = _mm_mullo_epi16(mmDiff_epi16, I_0_7_epi16); - v11 = _mm_mulhi_epi16(mmDiff_epi16, I_0_7_epi16); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qb2 = _mm_add_ps(qb2, _mm_cvtepi32_ps(v00)); - qb2 = _mm_add_ps(qb2, _mm_cvtepi32_ps(v10)); - - qb3 = _mm_add_ps(qb3, _mm_cvtepi32_ps(diff_0_3_epi32)); - qb3 = _mm_add_ps(qb3, _mm_cvtepi32_ps(diff_4_7_epi32)); + for (; x <= _winSize.width*cn; x += 8, dIptr += 8 * 2) + { + __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16); + __m128i diff0, diff1; + __m128i I_0_7_epi16 = _mm_loadu_si128((const __m128i*)(Iptr + x)); // von element 0 bis 7 + __m128i v00 = _mm_unpacklo_epi8( + _mm_loadl_epi64((const __m128i*)(Jptr + x)) , z); //J0 , 0, J1, 0, J2, 0 ... J7,0 + __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); + __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); + __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); + + __m128i t0 = _mm_add_epi32 + (_mm_madd_epi16( + _mm_unpacklo_epi16(v00, v01), //J0, , J1, J1, J2, J2, ... 
J3 , J4 + qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); - if (j == 0) - { - __m128 bSet1_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet1_epi16)); - __m128 bSet1_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet1_epi16, bSet1_epi16), 16)); - __m128 mask_0_4_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mask_0_7_epi16)); - __m128 mask_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mask_0_7_epi16, mask_0_7_epi16), 16))); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1 - 5); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1 - 5); - __m128 bSet2_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet2_epi16)); - __m128 bSet2_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet2_epi16, bSet2_epi16), 16)); - __m128 tale_0_3_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_0_3_ps); - __m128 tale_4_7_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_4_7_ps); - tale_0_3_ps = _mm_blendv_ps(mmParam2s, tale_0_3_ps, bSet2_0_3_ps); - tale_4_7_ps = _mm_blendv_ps(mmParam2s, tale_4_7_ps, bSet2_4_7_ps); + __m128i lo = _mm_mullo_epi16(mmGainValue_epi16, I_0_7_epi16); + __m128i hi = _mm_mulhi_epi16(mmGainValue_epi16, I_0_7_epi16); - tale_0_3_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_0_3_ps, mask_0_4_ps); - tale_4_7_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_4_7_ps, mask_4_7_ps); - t0 = _mm_srai_epi32(Ixy_0, 16); - t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16); + __m128i Igain_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), bitShift); + __m128i Igain_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), bitShift); + __m128i Igain_epi16 = _mm_packs_epi32(Igain_0_3_epi32, Igain_4_7_epi32); - __m128 fy = _mm_cvtepi32_ps(t0); - __m128 fx = _mm_cvtepi32_ps(t1); + // diff = J - I + I*gain.x + gain.y + __m128i mmDiff_epi16 = _mm_add_epi16( + _mm_subs_epi16(_mm_packs_epi32(t0, t1), I_0_7_epi16), // J - I + 
_mm_add_epi16(Igain_epi16, mmConstValue_epi16)); - // 0 ... 3 - __m128 I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(I_0_7_epi16, I_0_7_epi16), 16)); - // A11 - A22 - __m128 fxtale = _mm_mul_ps(fx, tale_0_3_ps); - __m128 fytale = _mm_mul_ps(fy, tale_0_3_ps); + mmDiff_epi16 = _mm_and_si128(mmDiff_epi16, mask_0_7_epi16); - mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); - mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); - mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); - // sumIx und sumIy - mmSumIx = _mm_add_ps(mmSumIx, fxtale); - mmSumIy = _mm_add_ps(mmSumIy, fytale); + __m128i scalediffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, mmScale); + mmEta = _mm_add_epi16(mmEta, _mm_add_epi16(_mm_and_si128(scalediffIsPos_epi16, _mm_set1_epi16(2)), _mm_set1_epi16(-1))); - mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fxtale)); - mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fytale)); - // sumI - __m128 I_tale_ps = _mm_mul_ps(I_ps, tale_0_3_ps); - mmSumI = _mm_add_ps(mmSumI, I_tale_ps); + __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 + __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); - // sumW - mmSumW = _mm_add_ps(mmSumW, tale_0_3_ps); - // sumDI - mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps(I_ps, I_tale_ps)); + __m128i abs_epi16 = _mm_abs_epi16(mmDiff_epi16); + __m128i bSet2_epi16, bSet1_epi16; + // |It| < sigma1 ? + bSet2_epi16 = _mm_cmplt_epi16(abs_epi16, mmParam1); + // It > 0 ? + __m128i diffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, _mm_setzero_si128()); + // sigma0 < |It| < sigma1 ? + bSet1_epi16 = _mm_and_si128(bSet2_epi16, _mm_cmpgt_epi16(abs_epi16, mmParam0)); + // val = |It| -/+ sigma1 + __m128i tmpParam1_epi16 = _mm_add_epi16(_mm_and_si128(diffIsPos_epi16, _mm_sub_epi16(mmDiff_epi16, mmParam1)), + _mm_andnot_si128(diffIsPos_epi16, _mm_add_epi16(mmDiff_epi16, mmParam1))); + // It == 0 ? |It| > sigma13 + mmDiff_epi16 = _mm_and_si128(bSet2_epi16, mmDiff_epi16); + // It == val ? 
sigma0 < |It| < sigma1 + mmDiff_epi16 = _mm_blendv_epi8(mmDiff_epi16, tmpParam1_epi16, bSet1_epi16); - t0 = _mm_srai_epi32(Ixy_1, 16); // Iy8 Iy9 Iy10 Iy11 - t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_1, 16), 16); // Ix0 Ix1 Ix2 Ix3 - fy = _mm_cvtepi32_ps(t0); - fx = _mm_cvtepi32_ps(t1); + __m128i tale_epi16_ = _mm_blendv_epi8(mmOness_epi16, mmParam2_epi16, bSet1_epi16); // mask for 0 - 3 + // diff = diff * sigma2 + lo = _mm_mullo_epi16(tale_epi16_, mmDiff_epi16); + hi = _mm_mulhi_epi16(tale_epi16_, mmDiff_epi16); + __m128i diff_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), s2bitShift); //diff 0_3_epi32 + __m128i diff_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), s2bitShift); // diff 4_7_epi32 - // 4 ... 7 - I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(I_0_7_epi16, I_0_7_epi16), 16)); + mmDiff_epi16 = _mm_packs_epi32(diff_0_3_epi32, diff_4_7_epi32); // ,da das ergebniss kleiner als 16 bit sein sollte - // A11 - A22 - fxtale = _mm_mul_ps(fx, tale_4_7_ps); - fytale = _mm_mul_ps(fy, tale_4_7_ps); + diff1 = _mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13... + diff0 = _mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9... - mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); - mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); - mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); + // Ix * diff / Iy * diff + v10 = _mm_mullo_epi16(Ixy_0, diff0); + v11 = _mm_mulhi_epi16(Ixy_0, diff0); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); + qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); - // sumIx und sumIy - mmSumIx = _mm_add_ps(mmSumIx, fxtale); - mmSumIy = _mm_add_ps(mmSumIy, fytale); + // It * Ix It * Iy [4 ... 
7] + // for set 1 hi sigma 1 - mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fxtale)); - mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fytale)); + v10 = _mm_mullo_epi16(Ixy_1, diff1); + v11 = _mm_mulhi_epi16(Ixy_1, diff1); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); + qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); + // diff * J [0 ... 7] + // for set 1 sigma 1 + // b3 += diff * Iptr[x] - // sumI - I_tale_ps = _mm_mul_ps(I_ps, tale_4_7_ps); - mmSumI = _mm_add_ps(mmSumI, I_tale_ps); + v10 = _mm_mullo_epi16(mmDiff_epi16, I_0_7_epi16); + v11 = _mm_mulhi_epi16(mmDiff_epi16, I_0_7_epi16); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb2 = _mm_add_ps(qb2, _mm_cvtepi32_ps(v00)); + qb2 = _mm_add_ps(qb2, _mm_cvtepi32_ps(v10)); - mmSumW = _mm_add_ps(mmSumW, tale_4_7_ps); + qb3 = _mm_add_ps(qb3, _mm_cvtepi32_ps(diff_0_3_epi32)); + qb3 = _mm_add_ps(qb3, _mm_cvtepi32_ps(diff_4_7_epi32)); - mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps(I_ps, I_tale_ps)); - } + if (j == 0) + { + __m128 bSet1_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet1_epi16)); + __m128 bSet1_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet1_epi16, bSet1_epi16), 16)); + __m128 mask_0_4_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mask_0_7_epi16)); + __m128 mask_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mask_0_7_epi16, mask_0_7_epi16), 16))); + + __m128 bSet2_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet2_epi16)); + __m128 bSet2_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet2_epi16, bSet2_epi16), 16)); + + __m128 tale_0_3_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_0_3_ps); + __m128 tale_4_7_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_4_7_ps); + tale_0_3_ps = _mm_blendv_ps(mmParam2s, tale_0_3_ps, bSet2_0_3_ps); + tale_4_7_ps = _mm_blendv_ps(mmParam2s, tale_4_7_ps, bSet2_4_7_ps); + + tale_0_3_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_0_3_ps, mask_0_4_ps); + 
tale_4_7_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_4_7_ps, mask_4_7_ps); + t0 = _mm_srai_epi32(Ixy_0, 16); + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16); + __m128 fy = _mm_cvtepi32_ps(t0); + __m128 fx = _mm_cvtepi32_ps(t1); + + // 0 ... 3 + __m128 I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(I_0_7_epi16, I_0_7_epi16), 16)); + + // A11 - A22 + __m128 fxtale = _mm_mul_ps(fx, tale_0_3_ps); + __m128 fytale = _mm_mul_ps(fy, tale_0_3_ps); + + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); + mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); + mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); + + // sumIx und sumIy + mmSumIx = _mm_add_ps(mmSumIx, fxtale); + mmSumIy = _mm_add_ps(mmSumIy, fytale); + + mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fxtale)); + mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fytale)); + + // sumI + __m128 I_tale_ps = _mm_mul_ps(I_ps, tale_0_3_ps); + mmSumI = _mm_add_ps(mmSumI, I_tale_ps); + + // sumW + mmSumW = _mm_add_ps(mmSumW, tale_0_3_ps); + + // sumDI + mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps(I_ps, I_tale_ps)); + + t0 = _mm_srai_epi32(Ixy_1, 16); // Iy8 Iy9 Iy10 Iy11 + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_1, 16), 16); // Ix0 Ix1 Ix2 Ix3 + + fy = _mm_cvtepi32_ps(t0); + fx = _mm_cvtepi32_ps(t1); + + // 4 ... 
7 + I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(I_0_7_epi16, I_0_7_epi16), 16)); + + // A11 - A22 + fxtale = _mm_mul_ps(fx, tale_4_7_ps); + fytale = _mm_mul_ps(fy, tale_4_7_ps); + + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); + mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); + mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); + + // sumIx und sumIy + mmSumIx = _mm_add_ps(mmSumIx, fxtale); + mmSumIy = _mm_add_ps(mmSumIy, fytale); + + mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fxtale)); + mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fytale)); + + // sumI + I_tale_ps = _mm_mul_ps(I_ps, tale_4_7_ps); + mmSumI = _mm_add_ps(mmSumI, I_tale_ps); + + mmSumW = _mm_add_ps(mmSumW, tale_4_7_ps); + + mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps(I_ps, I_tale_ps)); } + + } #else - for (; x < _winSize.width*cn; x++, dIptr += 2) + for (; x < _winSize.width*cn; x++, dIptr += 2) + { + if (maskPtr[x] == 0) + continue; + int J_val = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + Jptr[x + step] * iw10 + Jptr[x + step + cn] * iw11, + W_BITS1 - 5); + short ixval = static_cast(dIptr[0]); + short iyval = static_cast(dIptr[1]); + int diff = static_cast(J_val - Iptr[x] + Iptr[x] * gainVec.x + gainVec.y); + int abss = (diff < 0) ? -diff : diff; + if (diff > MEstimatorScale) + MEstimatorScale += eta; + if (diff < MEstimatorScale) + MEstimatorScale -= eta; + if (abss > static_cast(fParam1)) { - if (maskPtr[x] == 0) - continue; - int J_val = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + Jptr[x + step] * iw10 + Jptr[x + step + cn] * iw11, - W_BITS1 - 5); - short ixval = static_cast(dIptr[0]); - short iyval = static_cast(dIptr[1]); - int diff = static_cast(J_val - Iptr[x] + Iptr[x] * gainVec.x + gainVec.y); - int abss = (diff < 0) ? 
-diff : diff; - if (diff > MEstimatorScale) - MEstimatorScale += eta; - if (diff < MEstimatorScale) - MEstimatorScale -= eta; - if (abss > static_cast(fParam1)) + diff = 0; + } + else if (abss > static_cast(fParam0) && diff >= 0) + { + diff = static_cast(param[2] * (diff - fParam1)); + } + else if (abss > static_cast(fParam0) && diff < 0) + { + diff = static_cast(param[2] * (diff + fParam1)); + } + b1 += (float)(diff*ixval); + b2 += (float)(diff*iyval); ; + b3 += (float)(diff)* Iptr[x]; + b4 += (float)(diff); + + + // compute the Gradient Matrice + if (j == 0) + { + float tale = param[2] * FLT_RESCALE; + if (abss < fParam0 || j < 0) { - diff = 0; + tale = FLT_RESCALE; } - else if (abss > static_cast(fParam0) && diff >= 0) + else if (abss > fParam1) { - diff = static_cast(param[2] * (diff - fParam1)); + tale *= 0.01f; } - else if (abss > static_cast(fParam0) && diff < 0) + else { - diff = static_cast(param[2] * (diff + fParam1)); + tale *= param[2]; } - b1 += (float)(diff*ixval); - b2 += (float)(diff*iyval); ; - b3 += (float)(diff)* Iptr[x]; - b4 += (float)(diff); - - - // compute the Gradient Matrice if (j == 0) { - float tale = param[2] * FLT_RESCALE; - if (abss < fParam0 || j < 0) - { - tale = FLT_RESCALE; - } - else if (abss > fParam1) - { - tale *= 0.01f; - } - else - { - tale *= param[2]; - } - if (j == 0) - { - A11 += (float)(ixval*ixval)*tale; - A12 += (float)(ixval*iyval)*tale; - A22 += (float)(iyval*iyval)*tale; - } - - dI += Iptr[x] * Iptr[x] * tale; - float dx = static_cast(dIptr[0]) * tale; - float dy = static_cast(dIptr[1]) * tale; - sumIx += dx; - sumIy += dy; - w1 += dx * Iptr[x]; - w2 += dy * Iptr[x]; - sumI += Iptr[x] * tale; - sumW += tale; + A11 += (float)(ixval*ixval)*tale; + A12 += (float)(ixval*iyval)*tale; + A22 += (float)(iyval*iyval)*tale; } + dI += Iptr[x] * Iptr[x] * tale; + float dx = static_cast(dIptr[0]) * tale; + float dy = static_cast(dIptr[1]) * tale; + sumIx += dx; + sumIy += dy; + w1 += dx * Iptr[x]; + w2 += dy * Iptr[x]; + sumI 
+= Iptr[x] * tale; + sumW += tale; } -#endif + } +#endif + } #ifdef RLOF_SSE - short etaValues[8]; - _mm_storeu_si128((__m128i*)(etaValues), mmEta); - MEstimatorScale += eta * (etaValues[0] + etaValues[1] + etaValues[2] + etaValues[3] - + etaValues[4] + etaValues[5] + etaValues[6] + etaValues[7]); - float CV_DECL_ALIGNED(32) wbuf[4]; + short etaValues[8]; + _mm_storeu_si128((__m128i*)(etaValues), mmEta); + MEstimatorScale += eta * (etaValues[0] + etaValues[1] + etaValues[2] + etaValues[3] + + etaValues[4] + etaValues[5] + etaValues[6] + etaValues[7]); + float CV_DECL_ALIGNED(32) wbuf[4]; #endif - if (j == 0) - { + if (j == 0) + { #ifdef RLOF_SSE - _mm_store_ps(wbuf, mmSumW1); - w1 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumW2); - w2 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumDI); - dI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumI); - sumI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumIx); - sumIx = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumIy); - sumIy = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumW); - sumW = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumW1); + w1 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumW2); + w2 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumDI); + dI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumI); + sumI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumIx); + sumIx = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumIy); + sumIy = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumW); + sumW = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; #endif - sumIx *= -FLT_SCALE; - sumIy *= -FLT_SCALE; - sumI *= FLT_SCALE; - sumW *= FLT_SCALE; - w1 *= -FLT_SCALE; - w2 *= -FLT_SCALE; - dI *= FLT_SCALE; + sumIx *= -FLT_SCALE; + sumIy *= -FLT_SCALE; + sumI *= FLT_SCALE; + sumW *= FLT_SCALE; + w1 *= 
-FLT_SCALE; + w2 *= -FLT_SCALE; + dI *= FLT_SCALE; #ifdef RLOF_SSE - float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4]; - _mm_store_ps(A11buf, mmAxx); - _mm_store_ps(A12buf, mmAxy); - _mm_store_ps(A22buf, mmAyy); + float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4]; + _mm_store_ps(A11buf, mmAxx); + _mm_store_ps(A12buf, mmAxy); + _mm_store_ps(A22buf, mmAyy); - A11 = A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; - A12 = A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; - A22 = A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; + A11 = A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; + A12 = A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; + A22 = A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; #endif - A11 *= FLT_SCALE; - A12 *= FLT_SCALE; - A22 *= FLT_SCALE; - } + A11 *= FLT_SCALE; + A12 *= FLT_SCALE; + A22 *= FLT_SCALE; + } #ifdef RLOF_SSE - float CV_DECL_ALIGNED(16) bbuf[4]; - _mm_store_ps(bbuf, _mm_add_ps(qb0, qb1)); - b1 = bbuf[0] + bbuf[2]; - b2 = bbuf[1] + bbuf[3]; - _mm_store_ps(bbuf, qb2); - b3 = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; - _mm_store_ps(bbuf, qb3); - b4 = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; + float CV_DECL_ALIGNED(16) bbuf[4]; + _mm_store_ps(bbuf, _mm_add_ps(qb0, qb1)); + b1 = bbuf[0] + bbuf[2]; + b2 = bbuf[1] + bbuf[3]; + _mm_store_ps(bbuf, qb2); + b3 = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; + _mm_store_ps(bbuf, qb3); + b4 = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; #endif - mismatchMat.at(0, 0) = b1 * FLT_SCALE; - mismatchMat.at(1, 0) = b2 * FLT_SCALE; - mismatchMat.at(2, 0) = -b3 * FLT_SCALE; - mismatchMat.at(3, 0) = -b4 * FLT_SCALE; - - D = -A12 * A12*sumI*sumI + dI * sumW*A12*A12 + 2 * A12*sumI*sumIx*w2 + 2 * A12*sumI*sumIy*w1 - - 2 * dI*A12*sumIx*sumIy - 2 * sumW*A12*w1*w2 + A11 * A22*sumI*sumI - 2 * A22*sumI*sumIx*w1 - - 2 * A11*sumI*sumIy*w2 - sumIx * sumIx*w2*w2 + A22 * dI*sumIx*sumIx + 2 * sumIx*sumIy*w1*w2 - - sumIy * sumIy*w1*w1 + A11 * dI*sumIy*sumIy + A22 * sumW*w1*w1 + A11 * sumW*w2*w2 - A11 * A22*dI*sumW; - - float sqrtVal = 
std::sqrt((A11 - A22)*(A11 - A22) + 4.f*A12*A12); - minEigValue = (A22 + A11 - sqrtVal) / (2.0f*winArea); - if (minEigValue < minEigThreshold || std::abs(D) < FLT_EPSILON) + mismatchMat.at(0, 0) = b1 * FLT_SCALE; + mismatchMat.at(1, 0) = b2 * FLT_SCALE; + mismatchMat.at(2, 0) = -b3 * FLT_SCALE; + mismatchMat.at(3, 0) = -b4 * FLT_SCALE; + + D = -A12 * A12*sumI*sumI + dI * sumW*A12*A12 + 2 * A12*sumI*sumIx*w2 + 2 * A12*sumI*sumIy*w1 + - 2 * dI*A12*sumIx*sumIy - 2 * sumW*A12*w1*w2 + A11 * A22*sumI*sumI - 2 * A22*sumI*sumIx*w1 + - 2 * A11*sumI*sumIy*w2 - sumIx * sumIx*w2*w2 + A22 * dI*sumIx*sumIx + 2 * sumIx*sumIy*w1*w2 + - sumIy * sumIy*w1*w1 + A11 * dI*sumIy*sumIy + A22 * sumW*w1*w1 + A11 * sumW*w2*w2 - A11 * A22*dI*sumW; + + float sqrtVal = std::sqrt((A11 - A22)*(A11 - A22) + 4.f*A12*A12); + minEigValue = (A22 + A11 - sqrtVal) / (2.0f*winArea); + if (minEigValue < minEigThreshold || std::abs(D) < FLT_EPSILON) + { + if (level == 0 && status) + status[ptidx] = 0; + if (level > 0) { - if (level == 0 && status) - status[ptidx] = 0; - if (level > 0) - { - nextPts[ptidx] = backUpNextPt; - gainVecs[ptidx] = backUpGain; - } - break; + nextPts[ptidx] = backUpNextPt; + gainVecs[ptidx] = backUpGain; } + break; + } - D = (1.f / D); - - invTensorMat.at(0, 0) = (A22*sumI*sumI - 2 * sumI*sumIy*w2 + dI * sumIy*sumIy + sumW * w2*w2 - A22 * dI*sumW)* D; invTensorMat.at(0, 1) = (A12*dI*sumW - A12 * sumI * sumI - dI * sumIx*sumIy + sumI * sumIx*w2 + sumI * sumIy*w1 - sumW * w1*w2)* D; - invTensorMat.at(0, 2) = (A12*sumI*sumIy - sumIy * sumIy*w1 - A22 * sumI*sumIx - A12 * sumW*w2 + A22 * sumW*w1 + sumIx * sumIy*w2)* D; - invTensorMat.at(0, 3) = (A22*dI*sumIx - A12 * dI*sumIy - sumIx * w2*w2 + A12 * sumI*w2 - A22 * sumI*w1 + sumIy * w1*w2) * D; - invTensorMat.at(1, 0) = invTensorMat.at(0, 1); - invTensorMat.at(1, 1) = (A11*sumI * sumI - 2 * sumI*sumIx*w1 + dI * sumIx * sumIx + sumW * w1*w1 - A11 * dI*sumW) * D; - invTensorMat.at(1, 2) = (A12*sumI*sumIx - A11 * sumI*sumIy - sumIx * 
sumIx*w2 + A11 * sumW*w2 - A12 * sumW*w1 + sumIx * sumIy*w1) * D; - invTensorMat.at(1, 3) = (A11*dI*sumIy - sumIy * w1*w1 - A12 * dI*sumIx - A11 * sumI*w2 + A12 * sumI*w1 + sumIx * w1*w2)* D; - invTensorMat.at(2, 0) = invTensorMat.at(0, 2); - invTensorMat.at(2, 1) = invTensorMat.at(1, 2); - invTensorMat.at(2, 2) = (sumW*A12*A12 - 2 * A12*sumIx*sumIy + A22 * sumIx*sumIx + A11 * sumIy*sumIy - A11 * A22*sumW)* D; - invTensorMat.at(2, 3) = (A11*A22*sumI - A12 * A12*sumI - A11 * sumIy*w2 + A12 * sumIx*w2 + A12 * sumIy*w1 - A22 * sumIx*w1)* D; - invTensorMat.at(3, 0) = invTensorMat.at(0, 3); - invTensorMat.at(3, 1) = invTensorMat.at(1, 3); - invTensorMat.at(3, 2) = invTensorMat.at(2, 3); - invTensorMat.at(3, 3) = (dI*A12*A12 - 2 * A12*w1*w2 + A22 * w1*w1 + A11 * w2*w2 - A11 * A22*dI)* D; + D = (1.f / D); - resultMat = invTensorMat * mismatchMat; + invTensorMat.at(0, 0) = (A22*sumI*sumI - 2 * sumI*sumIy*w2 + dI * sumIy*sumIy + sumW * w2*w2 - A22 * dI*sumW)* D; invTensorMat.at(0, 1) = (A12*dI*sumW - A12 * sumI * sumI - dI * sumIx*sumIy + sumI * sumIx*w2 + sumI * sumIy*w1 - sumW * w1*w2)* D; + invTensorMat.at(0, 2) = (A12*sumI*sumIy - sumIy * sumIy*w1 - A22 * sumI*sumIx - A12 * sumW*w2 + A22 * sumW*w1 + sumIx * sumIy*w2)* D; + invTensorMat.at(0, 3) = (A22*dI*sumIx - A12 * dI*sumIy - sumIx * w2*w2 + A12 * sumI*w2 - A22 * sumI*w1 + sumIy * w1*w2) * D; + invTensorMat.at(1, 0) = invTensorMat.at(0, 1); + invTensorMat.at(1, 1) = (A11*sumI * sumI - 2 * sumI*sumIx*w1 + dI * sumIx * sumIx + sumW * w1*w1 - A11 * dI*sumW) * D; + invTensorMat.at(1, 2) = (A12*sumI*sumIx - A11 * sumI*sumIy - sumIx * sumIx*w2 + A11 * sumW*w2 - A12 * sumW*w1 + sumIx * sumIy*w1) * D; + invTensorMat.at(1, 3) = (A11*dI*sumIy - sumIy * w1*w1 - A12 * dI*sumIx - A11 * sumI*w2 + A12 * sumI*w1 + sumIx * w1*w2)* D; + invTensorMat.at(2, 0) = invTensorMat.at(0, 2); + invTensorMat.at(2, 1) = invTensorMat.at(1, 2); + invTensorMat.at(2, 2) = (sumW*A12*A12 - 2 * A12*sumIx*sumIy + A22 * sumIx*sumIx + A11 * sumIy*sumIy - 
A11 * A22*sumW)* D; + invTensorMat.at(2, 3) = (A11*A22*sumI - A12 * A12*sumI - A11 * sumIy*w2 + A12 * sumIx*w2 + A12 * sumIy*w1 - A22 * sumIx*w1)* D; + invTensorMat.at(3, 0) = invTensorMat.at(0, 3); + invTensorMat.at(3, 1) = invTensorMat.at(1, 3); + invTensorMat.at(3, 2) = invTensorMat.at(2, 3); + invTensorMat.at(3, 3) = (dI*A12*A12 - 2 * A12*w1*w2 + A22 * w1*w1 + A11 * w2*w2 - A11 * A22*dI)* D; - Point2f delta(-resultMat.at(0), -resultMat.at(1)); - Point2f deltaGain(resultMat.at(2), resultMat.at(3)); + resultMat = invTensorMat * mismatchMat; + Point2f delta(-resultMat.at(0), -resultMat.at(1)); + Point2f deltaGain(resultMat.at(2), resultMat.at(3)); - if (j == 0) - prevGain = deltaGain; - gainVec += deltaGain * 0.8; - nextPt += delta * 0.8; - nextPts[ptidx] = nextPt + halfWin; - gainVecs[ptidx] = gainVec; - if ( - (std::abs(delta.x - prevDelta.x) < 0.01 && std::abs(delta.y - prevDelta.y) < 0.01) - || ((delta.ddot(delta) <= 0.001) && std::abs(prevGain.x - deltaGain.x) < 0.01) - ) - { - nextPts[ptidx] -= delta * 0.5f; - gainVecs[ptidx] -= deltaGain * 0.5f; - break; - } - - prevDelta = delta; + if (j == 0) prevGain = deltaGain; + gainVec += deltaGain * 0.8; + nextPt += delta * 0.8; + nextPts[ptidx] = nextPt + halfWin; + gainVecs[ptidx] = gainVec; + + if ( + (std::abs(delta.x - prevDelta.x) < 0.01 && std::abs(delta.y - prevDelta.y) < 0.01) + || ((delta.ddot(delta) <= 0.001) && std::abs(prevGain.x - deltaGain.x) < 0.01) + ) + { + nextPts[ptidx] -= delta * 0.5f; + gainVecs[ptidx] -= deltaGain * 0.5f; + break; } + prevDelta = delta; + prevGain = deltaGain; } } - const Mat* prevImg; - const Mat* nextImg; - const Mat* prevDeriv; - const Mat* rgbPrevImg; - const Mat* rgbNextImg; - const Point2f* prevPts; - Point2f* nextPts; - uchar* status; - cv::Point2f* gainVecs; - float* err; - int maxWinSize; - int minWinSize; - TermCriteria criteria; - int level; - int maxLevel; - int windowType; - float minEigThreshold; - bool useInitialFlow; - std::vector paramReset; - int 
crossSegmentationThreshold; - }; -} -} -} -} + } + + const Mat* prevImg; + const Mat* nextImg; + const Mat* prevDeriv; + const Mat* rgbPrevImg; + const Mat* rgbNextImg; + const Point2f* prevPts; + Point2f* nextPts; + uchar* status; + cv::Point2f* gainVecs; + float* err; + int maxWinSize; + int minWinSize; + TermCriteria criteria; + int level; + int maxLevel; + int windowType; + float minEigThreshold; + bool useInitialFlow; + std::vector paramReset; + int crossSegmentationThreshold; +}; + +}}}} // namespace #endif diff --git a/modules/optflow/src/rlof/rlof_invokerbase.hpp b/modules/optflow/src/rlof/rlof_invokerbase.hpp index 68e2ca28bbc..991531deef0 100644 --- a/modules/optflow/src/rlof/rlof_invokerbase.hpp +++ b/modules/optflow/src/rlof/rlof_invokerbase.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. #ifndef _RLOF_INVOKERBASE_HPP_ -#define _RLOF_INVOKERBASE_HPP_ +#define _RLOF_INVOKERBASE_HPP_ #ifdef CV_CPU_COMPILE_SSE4_1 @@ -26,250 +26,249 @@ using namespace std; using namespace cv; -namespace cv -{ -namespace optflow -{ - typedef short deriv_type; +namespace cv { +namespace optflow { + +typedef short deriv_type; #ifdef RLOF_SSE - inline void get4BitMask(const int & width, __m128i & mask) +inline void get4BitMask(const int & width, __m128i & mask) +{ + int noBits = width - static_cast(floor(width / 4.f) * 4.f); + unsigned int val[4]; + for (int n = 0; n < 4; n++) { - int noBits = width - static_cast(floor(width / 4.f) * 4.f); - unsigned int val[4]; - for (int n = 0; n < 4; n++) - { - val[n] = (noBits > n) ? (std::numeric_limits::max()) : 0; - } - mask = _mm_set_epi32(val[3], val[2], val[1], val[0]); - + val[n] = (noBits > n) ? 
(std::numeric_limits::max()) : 0; } - inline void getWBitMask(const int & width, __m128i & t0, __m128i & t1, __m128i & t2) + mask = _mm_set_epi32(val[3], val[2], val[1], val[0]); + +} +inline void getWBitMask(const int & width, __m128i & t0, __m128i & t1, __m128i & t2) +{ + int noBits = width - static_cast(floor(width / 8.f) * 8.f); + unsigned short val[8]; + for (int n = 0; n < 8; n++) { - int noBits = width - static_cast(floor(width / 8.f) * 8.f); - unsigned short val[8]; - for (int n = 0; n < 8; n++) - { - val[n] = (noBits > n) ? (0xffff) : 0; - } - t1 = _mm_set_epi16(val[7], val[7], val[6], val[6], val[5], val[5], val[4], val[4]); - t0 = _mm_set_epi16(val[3], val[3], val[2], val[2], val[1], val[1], val[0], val[0]); - t2 = _mm_set_epi16(val[7], val[6], val[5], val[4], val[3], val[2], val[1], val[0]); + val[n] = (noBits > n) ? (0xffff) : 0; } + t1 = _mm_set_epi16(val[7], val[7], val[6], val[6], val[5], val[5], val[4], val[4]); + t0 = _mm_set_epi16(val[3], val[3], val[2], val[2], val[1], val[1], val[0], val[0]); + t2 = _mm_set_epi16(val[7], val[6], val[5], val[4], val[3], val[2], val[1], val[0]); +} #endif - typedef uchar tMaskType; +typedef uchar tMaskType; #define tCVMaskType CV_8UC1 #define MaskSet 0xffffffff - inline bool notSameColor(const cv::Point3_ & ref, int r, int c, const cv::Mat & img, int winSize, int threshold) +inline bool notSameColor(const cv::Point3_ & ref, int r, int c, const cv::Mat & img, int winSize, int threshold) +{ + if (r >= img.rows + winSize || c >= img.cols + winSize || r < -winSize || c < -winSize) return true; + int step = static_cast(img.step1()); + const cv::Point3_ * tval = img.ptr>(); + tval += c; + tval += r * step / 3; + int R = std::abs((int)ref.x - (int)tval->x); + int G = std::abs((int)ref.y - (int)tval->y); + int B = std::abs((int)ref.z - (int)tval->z); + int diff = MAX(R, MAX(G, B)); + return diff > threshold; +} + +/*! 
Estimate the mask with points of the same color as the middle point +*\param prevGray input CV_8UC1 +*\param prevImg input CV_8UC3 +*\param nextImg input CV_8UC3 +*\param prevPoint global position of the middle point of the mask window +*\param nextPoint global position of the middle point of the mask window +*\param winPointMask mask matrice with 1 labeling points contained and 0 point is not contained by the segment +*\param noPoints number of points contained by the segment +*\param winRoi rectangle of the region of interesset in global coordinates +*\param minWinSize, +*\param threshold, +*\param useBothImages +*/ +static void getLocalPatch( + const cv::Mat & prevImg, + const cv::Point2i & prevPoint, // feature points + cv::Mat & winPointMask, + int & noPoints, + cv::Rect & winRoi, + int minWinSize, + int threshold) +{ + int maxWinSizeH = (winPointMask.cols - 1) / 2; + winRoi.x = prevPoint.x; + winRoi.y = prevPoint.y; + winRoi.width = winPointMask.cols; + winRoi.height = winPointMask.rows; + + if (minWinSize == winPointMask.cols) { - if (r >= img.rows + winSize || c >= img.cols + winSize || r < -winSize || c < -winSize) return true; - int step = static_cast(img.step1()); - const cv::Point3_ * tval = img.ptr>(); - tval += c; - tval += r * step / 3; - int R = std::abs((int)ref.x - (int)tval->x); - int G = std::abs((int)ref.y - (int)tval->y); - int B = std::abs((int)ref.z - (int)tval->z); - int diff = MAX(R, MAX(G, B)); - return diff > threshold; + winRoi.x = prevPoint.x - maxWinSizeH; + winRoi.y = prevPoint.y - maxWinSizeH; + winPointMask.setTo(1); + noPoints = winPointMask.size().area(); + return; } - - /*! 
Estimate the mask with points of the same color as the middle point - *\param prevGray input CV_8UC1 - *\param prevImg input CV_8UC3 - *\param nextImg input CV_8UC3 - *\param prevPoint global position of the middle point of the mask window - *\param nextPoint global position of the middle point of the mask window - *\param winPointMask mask matrice with 1 labeling points contained and 0 point is not contained by the segment - *\param noPoints number of points contained by the segment - *\param winRoi rectangle of the region of interesset in global coordinates - *\param minWinSize, - *\param threshold, - *\param useBothImages - */ - static void getLocalPatch( - const cv::Mat & prevImg, - const cv::Point2i & prevPoint, // feature points - cv::Mat & winPointMask, - int & noPoints, - cv::Rect & winRoi, - int minWinSize, - int threshold) + noPoints = 0; + int c = prevPoint.x; + int r = prevPoint.y; + int c_left = c - 1; + int c_right = c + 1; + int r_top = r - 1; + int r_bottom = r; + int border_left = c - maxWinSizeH; + int border_right = c + maxWinSizeH; + int border_top = r - maxWinSizeH; + int border_bottom = r + maxWinSizeH; + int c_local_diff = prevPoint.x - maxWinSizeH; + int r_local_diff = prevPoint.y - maxWinSizeH; + int _c = c - c_local_diff; + int _r = r - r_local_diff; + int min_r = _r; + int max_r = _r; + int min_c = _c; + int max_c = _c; + // horizontal line + if (r < 0 || r >= prevImg.rows || c < 0 || c >= prevImg.cols) { - int maxWinSizeH = (winPointMask.cols - 1) / 2; - winRoi.x = prevPoint.x; - winRoi.y = prevPoint.y; - winRoi.width = winPointMask.cols; - winRoi.height = winPointMask.rows; - - if (minWinSize == winPointMask.cols) - { - winRoi.x = prevPoint.x - maxWinSizeH; - winRoi.y = prevPoint.y - maxWinSizeH; - winPointMask.setTo(1); - noPoints = winPointMask.size().area(); - return; - } noPoints = 0; - int c = prevPoint.x; - int r = prevPoint.y; - int c_left = c - 1; - int c_right = c + 1; - int r_top = r - 1; - int r_bottom = r; - int border_left 
= c - maxWinSizeH; - int border_right = c + maxWinSizeH; - int border_top = r - maxWinSizeH; - int border_bottom = r + maxWinSizeH; - int c_local_diff = prevPoint.x - maxWinSizeH; - int r_local_diff = prevPoint.y - maxWinSizeH; - int _c = c - c_local_diff; - int _r = r - r_local_diff; - int min_r = _r; - int max_r = _r; - int min_c = _c; - int max_c = _c; - // horizontal line - if (r < 0 || r >= prevImg.rows || c < 0 || c >= prevImg.cols) - { - noPoints = 0; - return; - } - cv::Point3_ val1 = prevImg.at>(r, c); // middle grayvalue - cv::Point3_ tval; + return; + } + cv::Point3_ val1 = prevImg.at>(r, c); // middle grayvalue + cv::Point3_ tval; - //vertical line - for (int dr = r_top; dr >= border_top; dr--) - { - if (notSameColor(val1, dr, c, prevImg, maxWinSizeH, threshold)) - break; + //vertical line + for (int dr = r_top; dr >= border_top; dr--) + { + if (notSameColor(val1, dr, c, prevImg, maxWinSizeH, threshold)) + break; - int _dr = dr - r_local_diff; - min_r = MIN(min_r, _dr); - max_r = MAX(max_r, _dr); - winPointMask.at(_dr, _c) = 1; - noPoints++; - } - for (int dr = r_bottom; dr < border_bottom; dr++) - { - if (notSameColor(val1, dr, c, prevImg, maxWinSizeH, threshold) - ) - break; + int _dr = dr - r_local_diff; + min_r = MIN(min_r, _dr); + max_r = MAX(max_r, _dr); + winPointMask.at(_dr, _c) = 1; + noPoints++; + } + for (int dr = r_bottom; dr < border_bottom; dr++) + { + if (notSameColor(val1, dr, c, prevImg, maxWinSizeH, threshold) + ) + break; - int _dr = dr - r_local_diff; - min_r = MIN(min_r, _dr); - max_r = MAX(max_r, _dr); - winPointMask.at(_dr, _c) = 1; - noPoints++; + int _dr = dr - r_local_diff; + min_r = MIN(min_r, _dr); + max_r = MAX(max_r, _dr); + winPointMask.at(_dr, _c) = 1; + noPoints++; + } + // accumulate along the vertical line and the search line that was still labled + for (int dr = min_r + r_local_diff; dr <= max_r + r_local_diff; dr++) + { + int _dr = dr - r_local_diff; + if (winPointMask.at(_dr, _c) == 0) + { + 
winPointMask.row(_dr).setTo(0); + continue; } - // accumulate along the vertical line and the search line that was still labled - for (int dr = min_r + r_local_diff; dr <= max_r + r_local_diff; dr++) + bool skip = false; + int _dc = c_right - c_local_diff; + for (int dc = c_right; dc < border_right; dc++, _dc++) { - int _dr = dr - r_local_diff; - if (winPointMask.at(_dr, _c) == 0) + if (skip == false) { - winPointMask.row(_dr).setTo(0); - continue; + if (notSameColor(val1, dr, dc, prevImg, maxWinSizeH, threshold)) + skip = true; } - bool skip = false; - int _dc = c_right - c_local_diff; - for (int dc = c_right; dc < border_right; dc++, _dc++) + if (skip == false) { - if (skip == false) - { - if (notSameColor(val1, dr, dc, prevImg, maxWinSizeH, threshold)) - skip = true; - } - if (skip == false) - { - min_c = MIN(min_c, _dc); - max_c = MAX(max_c, _dc); - winPointMask.at(_dr, _dc) = 1; - noPoints++; - } - else - winPointMask.at(_dr, _dc) = 0; + min_c = MIN(min_c, _dc); + max_c = MAX(max_c, _dc); + winPointMask.at(_dr, _dc) = 1; + noPoints++; } + else + winPointMask.at(_dr, _dc) = 0; + } - skip = false; - _dc = c_left - c_local_diff; - for (int dc = c_left; dc >= border_left; dc--, _dc--) + skip = false; + _dc = c_left - c_local_diff; + for (int dc = c_left; dc >= border_left; dc--, _dc--) + { + if (skip == false) + { + if (notSameColor(val1, dr, dc, prevImg, maxWinSizeH, threshold)) + skip = true; + } + if (skip == false) { - if (skip == false) - { - if (notSameColor(val1, dr, dc, prevImg, maxWinSizeH, threshold)) - skip = true; - } - if (skip == false) - { - min_c = MIN(min_c, _dc); - max_c = MAX(max_c, _dc); - winPointMask.at(_dr, _dc) = 1; - noPoints++; - } - else - winPointMask.at(_dr, _dc) = 0; + min_c = MIN(min_c, _dc); + max_c = MAX(max_c, _dc); + winPointMask.at(_dr, _dc) = 1; + noPoints++; } + else + winPointMask.at(_dr, _dc) = 0; } + } - // get the initial small window - if (noPoints < minWinSize * minWinSize) - { - cv::Rect roi(winPointMask.cols / 2 - 
(minWinSize - 1) / 2, - winPointMask.rows / 2 - (minWinSize - 1) / 2, - minWinSize, minWinSize); - cv::Mat(winPointMask, roi).setTo(1); - min_c = MIN(MIN(min_c, roi.tl().x), roi.br().x - 1); - max_c = MAX(MAX(max_c, roi.tl().x), roi.br().x - 1); - min_r = MIN(MIN(min_r, roi.tl().y), roi.br().y - 1); - max_r = MAX(MAX(max_r, roi.tl().y), roi.br().y - 1); - noPoints += minWinSize * minWinSize; - } - winRoi.x = c_local_diff + min_c; - winRoi.y = r_local_diff + min_r; - winRoi.width = max_c - min_c + 1; - winRoi.height = max_r - min_r + 1; - winPointMask = winPointMask(cv::Rect(min_c, min_r, winRoi.width, winRoi.height)); + // get the initial small window + if (noPoints < minWinSize * minWinSize) + { + cv::Rect roi(winPointMask.cols / 2 - (minWinSize - 1) / 2, + winPointMask.rows / 2 - (minWinSize - 1) / 2, + minWinSize, minWinSize); + cv::Mat(winPointMask, roi).setTo(1); + min_c = MIN(MIN(min_c, roi.tl().x), roi.br().x - 1); + max_c = MAX(MAX(max_c, roi.tl().x), roi.br().x - 1); + min_r = MIN(MIN(min_r, roi.tl().y), roi.br().y - 1); + max_r = MAX(MAX(max_r, roi.tl().y), roi.br().y - 1); + noPoints += minWinSize * minWinSize; } + winRoi.x = c_local_diff + min_c; + winRoi.y = r_local_diff + min_r; + winRoi.width = max_c - min_c + 1; + winRoi.height = max_r - min_r + 1; + winPointMask = winPointMask(cv::Rect(min_c, min_r, winRoi.width, winRoi.height)); +} - inline bool calcWinMaskMat( const cv::Mat & BI, - const int windowType, - const cv::Point2i & iprevPt, - cv::Mat & winMaskMat, - cv::Size & winSize, - cv::Point2f & halfWin, - int & winArea, - const int minWinSize, - const int maxWinSize, - const int crossSegmentationThreshold) +inline bool calcWinMaskMat( + const cv::Mat & BI, + const int windowType, + const cv::Point2i & iprevPt, + cv::Mat & winMaskMat, + cv::Size & winSize, + cv::Point2f & halfWin, + int & winArea, + const int minWinSize, + const int maxWinSize, + const int crossSegmentationThreshold) +{ + if (windowType == SR_CROSS && maxWinSize != minWinSize) { - 
- if (windowType == SR_CROSS && maxWinSize != minWinSize) - { - // patch generation - cv::Rect winRoi; - getLocalPatch(BI, iprevPt, winMaskMat, winArea, winRoi, minWinSize, crossSegmentationThreshold); - if (winArea == 0) - return false; - winSize = winRoi.size(); - halfWin = Point2f(static_cast(iprevPt.x - winRoi.tl().x), - static_cast(iprevPt.y - winRoi.tl().y)); - } - else - { - winSize = cv::Size(maxWinSize, maxWinSize); - halfWin = Point2f((winSize.width - 1) / 2.f, (winSize.height - 1) / 2.f); - winMaskMat.setTo(1); - } - return true; + // patch generation + cv::Rect winRoi; + getLocalPatch(BI, iprevPt, winMaskMat, winArea, winRoi, minWinSize, crossSegmentationThreshold); + if (winArea == 0) + return false; + winSize = winRoi.size(); + halfWin = Point2f(static_cast(iprevPt.x - winRoi.tl().x), + static_cast(iprevPt.y - winRoi.tl().y)); } - - inline short estimateScale(cv::Mat & residuals) + else { - cv::Mat absMat = cv::abs(residuals); - return quickselect(absMat, absMat.rows / 2); + winSize = cv::Size(maxWinSize, maxWinSize); + halfWin = Point2f((winSize.width - 1) / 2.f, (winSize.height - 1) / 2.f); + winMaskMat.setTo(1); } + return true; } + +inline short estimateScale(cv::Mat & residuals) +{ + cv::Mat absMat = cv::abs(residuals); + return quickselect(absMat, absMat.rows / 2); } + +}} // namespace #endif diff --git a/modules/optflow/src/rlof/rlof_localflow.cpp b/modules/optflow/src/rlof/rlof_localflow.cpp index 4d18380ea77..666421f767f 100644 --- a/modules/optflow/src/rlof/rlof_localflow.cpp +++ b/modules/optflow/src/rlof/rlof_localflow.cpp @@ -11,542 +11,539 @@ using namespace std; using namespace cv; -namespace cv -{ - namespace detail - { +namespace cv { +namespace detail { +typedef short deriv_type; +} // namespace - typedef short deriv_type; - } -namespace +namespace { +static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst) { - static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst) - { - using namespace cv; - using cv::detail::deriv_type; - 
int rows = src.rows, cols = src.cols, cn = src.channels(), colsn = cols * cn, depth = src.depth(); - CV_Assert(depth == CV_8U); - dst.create(rows, cols, CV_MAKETYPE(DataType::depth, cn * 2)); + using namespace cv; + using cv::detail::deriv_type; + int rows = src.rows, cols = src.cols, cn = src.channels(), colsn = cols * cn, depth = src.depth(); + CV_Assert(depth == CV_8U); + dst.create(rows, cols, CV_MAKETYPE(DataType::depth, cn * 2)); - int x, y, delta = (int)alignSize((cols + 2)*cn, 16); - AutoBuffer _tempBuf(delta * 2 + 64); - deriv_type *trow0 = alignPtr(_tempBuf.data() + cn, 16), *trow1 = alignPtr(trow0 + delta, 16); + int x, y, delta = (int)alignSize((cols + 2)*cn, 16); + AutoBuffer _tempBuf(delta * 2 + 64); + deriv_type *trow0 = alignPtr(_tempBuf.data() + cn, 16), *trow1 = alignPtr(trow0 + delta, 16); #if CV_SIMD128 - v_int16x8 c3 = v_setall_s16(3), c10 = v_setall_s16(10); - bool haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); + v_int16x8 c3 = v_setall_s16(3), c10 = v_setall_s16(10); + bool haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); #endif - for (y = 0; y < rows; y++) - { - const uchar* srow0 = src.ptr(y > 0 ? y - 1 : rows > 1 ? 1 : 0); - const uchar* srow1 = src.ptr(y); - const uchar* srow2 = src.ptr(y < rows - 1 ? y + 1 : rows > 1 ? rows - 2 : 0); - deriv_type* drow = dst.ptr(y); + for (y = 0; y < rows; y++) + { + const uchar* srow0 = src.ptr(y > 0 ? y - 1 : rows > 1 ? 1 : 0); + const uchar* srow1 = src.ptr(y); + const uchar* srow2 = src.ptr(y < rows - 1 ? y + 1 : rows > 1 ? 
rows - 2 : 0); + deriv_type* drow = dst.ptr(y); - // do vertical convolution - x = 0; + // do vertical convolution + x = 0; #if CV_SIMD128 - if (haveSIMD) + if (haveSIMD) + { + for (; x <= colsn - 8; x += 8) { - for (; x <= colsn - 8; x += 8) - { - v_int16x8 s0 = v_reinterpret_as_s16(v_load_expand(srow0 + x)); - v_int16x8 s1 = v_reinterpret_as_s16(v_load_expand(srow1 + x)); - v_int16x8 s2 = v_reinterpret_as_s16(v_load_expand(srow2 + x)); + v_int16x8 s0 = v_reinterpret_as_s16(v_load_expand(srow0 + x)); + v_int16x8 s1 = v_reinterpret_as_s16(v_load_expand(srow1 + x)); + v_int16x8 s2 = v_reinterpret_as_s16(v_load_expand(srow2 + x)); - v_int16x8 t1 = s2 - s0; - v_int16x8 t0 = v_mul_wrap(s0 + s2, c3) + v_mul_wrap(s1, c10); + v_int16x8 t1 = s2 - s0; + v_int16x8 t0 = v_mul_wrap(s0 + s2, c3) + v_mul_wrap(s1, c10); - v_store(trow0 + x, t0); - v_store(trow1 + x, t1); - } + v_store(trow0 + x, t0); + v_store(trow1 + x, t1); } + } #endif - for (; x < colsn; x++) - { - int t0 = (srow0[x] + srow2[x]) * 3 + srow1[x] * 10; - int t1 = srow2[x] - srow0[x]; - trow0[x] = (deriv_type)t0; - trow1[x] = (deriv_type)t1; - } + for (; x < colsn; x++) + { + int t0 = (srow0[x] + srow2[x]) * 3 + srow1[x] * 10; + int t1 = srow2[x] - srow0[x]; + trow0[x] = (deriv_type)t0; + trow1[x] = (deriv_type)t1; + } - // make border - int x0 = (cols > 1 ? 1 : 0)*cn, x1 = (cols > 1 ? cols - 2 : 0)*cn; - for (int k = 0; k < cn; k++) - { - trow0[-cn + k] = trow0[x0 + k]; trow0[colsn + k] = trow0[x1 + k]; - trow1[-cn + k] = trow1[x0 + k]; trow1[colsn + k] = trow1[x1 + k]; - } + // make border + int x0 = (cols > 1 ? 1 : 0)*cn, x1 = (cols > 1 ? 
cols - 2 : 0)*cn; + for (int k = 0; k < cn; k++) + { + trow0[-cn + k] = trow0[x0 + k]; trow0[colsn + k] = trow0[x1 + k]; + trow1[-cn + k] = trow1[x0 + k]; trow1[colsn + k] = trow1[x1 + k]; + } - // do horizontal convolution, interleave the results and store them to dst - x = 0; + // do horizontal convolution, interleave the results and store them to dst + x = 0; #if CV_SIMD128 - if (haveSIMD) + if (haveSIMD) + { + for (; x <= colsn - 8; x += 8) { - for (; x <= colsn - 8; x += 8) - { - v_int16x8 s0 = v_load(trow0 + x - cn); - v_int16x8 s1 = v_load(trow0 + x + cn); - v_int16x8 s2 = v_load(trow1 + x - cn); - v_int16x8 s3 = v_load(trow1 + x); - v_int16x8 s4 = v_load(trow1 + x + cn); + v_int16x8 s0 = v_load(trow0 + x - cn); + v_int16x8 s1 = v_load(trow0 + x + cn); + v_int16x8 s2 = v_load(trow1 + x - cn); + v_int16x8 s3 = v_load(trow1 + x); + v_int16x8 s4 = v_load(trow1 + x + cn); - v_int16x8 t0 = s1 - s0; - v_int16x8 t1 = v_mul_wrap(s2 + s4, c3) + v_mul_wrap(s3, c10); + v_int16x8 t0 = s1 - s0; + v_int16x8 t1 = v_mul_wrap(s2 + s4, c3) + v_mul_wrap(s3, c10); - v_store_interleave((drow + x * 2), t0, t1); - } + v_store_interleave((drow + x * 2), t0, t1); } + } #endif - for (; x < colsn; x++) - { - deriv_type t0 = (deriv_type)(trow0[x + cn] - trow0[x - cn]); - deriv_type t1 = (deriv_type)((trow1[x + cn] + trow1[x - cn]) * 3 + trow1[x] * 10); - drow[x * 2] = t0; drow[x * 2 + 1] = t1; - } + for (; x < colsn; x++) + { + deriv_type t0 = (deriv_type)(trow0[x + cn] - trow0[x - cn]); + deriv_type t1 = (deriv_type)((trow1[x + cn] + trow1[x - cn]) * 3 + trow1[x] * 10); + drow[x * 2] = t0; drow[x * 2 + 1] = t1; } } +} + +} // namespace +namespace optflow { + +int buildOpticalFlowPyramidScale(InputArray _img, OutputArrayOfArrays pyramid, Size winSize, int maxLevel, bool withDerivatives, + int pyrBorder, int derivBorder, bool tryReuseInputImage, float levelScale[2]); +void calcLocalOpticalFlowCore(Ptr prevPyramids[2], Ptr currPyramids[2], InputArray _prevPts, + InputOutputArray 
_nextPts, const RLOFOpticalFlowParameter & param); -}//namespace -namespace optflow +void preprocess(Ptr prevPyramids[2], Ptr currPyramids[2], const std::vector & prevPoints, + std::vector & currPoints, const RLOFOpticalFlowParameter & param); + +inline bool isrobust(const RLOFOpticalFlowParameter & param) { - int buildOpticalFlowPyramidScale(InputArray _img, OutputArrayOfArrays pyramid, Size winSize, int maxLevel, bool withDerivatives, - int pyrBorder, int derivBorder, bool tryReuseInputImage, float levelScale[2]); - void calcLocalOpticalFlowCore(Ptr prevPyramids[2], Ptr currPyramids[2], InputArray _prevPts, - InputOutputArray _nextPts, const RLOFOpticalFlowParameter & param); + return (param.normSigma0 != std::numeric_limits::max()) + && (param.normSigma1 != std::numeric_limits::max()); +} +inline std::vector get_norm(float sigma0, float sigma1) +{ + std::vector result = { sigma0, sigma1, sigma0 / (sigma0 - sigma1), sigma0 * sigma1 }; + return result; +} - void preprocess(Ptr prevPyramids[2], Ptr currPyramids[2], const std::vector & prevPoints, - std::vector & currPoints, const RLOFOpticalFlowParameter & param); - inline bool isrobust(const RLOFOpticalFlowParameter & param) - { - return (param.normSigma0 != std::numeric_limits::max()) - && (param.normSigma1 != std::numeric_limits::max()); - } - inline std::vector get_norm(float sigma0, float sigma1) +int buildOpticalFlowPyramidScale(InputArray _img, OutputArrayOfArrays pyramid, Size winSize, int maxLevel, bool withDerivatives, + int pyrBorder, int derivBorder, bool tryReuseInputImage, float levelScale[2]) +{ + Mat img = _img.getMat(); + CV_Assert(img.depth() == CV_8U && winSize.width > 2 && winSize.height > 2); + int pyrstep = withDerivatives ? 
2 : 1; + + pyramid.create(1, (maxLevel + 1) * pyrstep, 0 /*type*/, -1, true); + + int derivType = CV_MAKETYPE(DataType::depth, img.channels() * 2); + + //level 0 + bool lvl0IsSet = false; + if (tryReuseInputImage && img.isSubmatrix() && (pyrBorder & BORDER_ISOLATED) == 0) { - std::vector result = { sigma0, sigma1, sigma0 / (sigma0 - sigma1), sigma0 * sigma1 }; - return result; + Size wholeSize; + Point ofs; + img.locateROI(wholeSize, ofs); + if (ofs.x >= winSize.width && ofs.y >= winSize.height + && ofs.x + img.cols + winSize.width <= wholeSize.width + && ofs.y + img.rows + winSize.height <= wholeSize.height) + { + pyramid.getMatRef(0) = img; + lvl0IsSet = true; + } } - - int buildOpticalFlowPyramidScale(InputArray _img, OutputArrayOfArrays pyramid, Size winSize, int maxLevel, bool withDerivatives, - int pyrBorder, int derivBorder, bool tryReuseInputImage, float levelScale[2]) + if (!lvl0IsSet) { - Mat img = _img.getMat(); - CV_Assert(img.depth() == CV_8U && winSize.width > 2 && winSize.height > 2); - int pyrstep = withDerivatives ? 
2 : 1; + Mat& temp = pyramid.getMatRef(0); - pyramid.create(1, (maxLevel + 1) * pyrstep, 0 /*type*/, -1, true); + if (!temp.empty()) + temp.adjustROI(winSize.height, winSize.height, winSize.width, winSize.width); + if (temp.type() != img.type() || temp.cols != winSize.width * 2 + img.cols || temp.rows != winSize.height * 2 + img.rows) + temp.create(img.rows + winSize.height * 2, img.cols + winSize.width * 2, img.type()); - int derivType = CV_MAKETYPE(DataType::depth, img.channels() * 2); + if (pyrBorder == BORDER_TRANSPARENT) + img.copyTo(temp(Rect(winSize.width, winSize.height, img.cols, img.rows))); + else + copyMakeBorder(img, temp, winSize.height, winSize.height, winSize.width, winSize.width, pyrBorder); + temp.adjustROI(-winSize.height, -winSize.height, -winSize.width, -winSize.width); + } - //level 0 - bool lvl0IsSet = false; - if (tryReuseInputImage && img.isSubmatrix() && (pyrBorder & BORDER_ISOLATED) == 0) - { - Size wholeSize; - Point ofs; - img.locateROI(wholeSize, ofs); - if (ofs.x >= winSize.width && ofs.y >= winSize.height - && ofs.x + img.cols + winSize.width <= wholeSize.width - && ofs.y + img.rows + winSize.height <= wholeSize.height) - { - pyramid.getMatRef(0) = img; - lvl0IsSet = true; - } - } + Size sz = img.size(); + Mat prevLevel = pyramid.getMatRef(0); + Mat thisLevel = prevLevel; - if (!lvl0IsSet) + for (int level = 0; level <= maxLevel; ++level) + { + if (level != 0) { - Mat& temp = pyramid.getMatRef(0); + Mat& temp = pyramid.getMatRef(level * pyrstep); if (!temp.empty()) temp.adjustROI(winSize.height, winSize.height, winSize.width, winSize.width); - if (temp.type() != img.type() || temp.cols != winSize.width * 2 + img.cols || temp.rows != winSize.height * 2 + img.rows) - temp.create(img.rows + winSize.height * 2, img.cols + winSize.width * 2, img.type()); + if (temp.type() != img.type() || temp.cols != winSize.width * 2 + sz.width || temp.rows != winSize.height * 2 + sz.height) + temp.create(sz.height + winSize.height * 2, sz.width + 
winSize.width * 2, img.type()); - if (pyrBorder == BORDER_TRANSPARENT) - img.copyTo(temp(Rect(winSize.width, winSize.height, img.cols, img.rows))); - else - copyMakeBorder(img, temp, winSize.height, winSize.height, winSize.width, winSize.width, pyrBorder); + thisLevel = temp(Rect(winSize.width, winSize.height, sz.width, sz.height)); + pyrDown(prevLevel, thisLevel, sz); + + if (pyrBorder != BORDER_TRANSPARENT) + copyMakeBorder(thisLevel, temp, winSize.height, winSize.height, winSize.width, winSize.width, pyrBorder | BORDER_ISOLATED); temp.adjustROI(-winSize.height, -winSize.height, -winSize.width, -winSize.width); } - Size sz = img.size(); - Mat prevLevel = pyramid.getMatRef(0); - Mat thisLevel = prevLevel; - - for (int level = 0; level <= maxLevel; ++level) + if (withDerivatives) { - if (level != 0) - { - Mat& temp = pyramid.getMatRef(level * pyrstep); - - if (!temp.empty()) - temp.adjustROI(winSize.height, winSize.height, winSize.width, winSize.width); - if (temp.type() != img.type() || temp.cols != winSize.width * 2 + sz.width || temp.rows != winSize.height * 2 + sz.height) - temp.create(sz.height + winSize.height * 2, sz.width + winSize.width * 2, img.type()); - - thisLevel = temp(Rect(winSize.width, winSize.height, sz.width, sz.height)); - pyrDown(prevLevel, thisLevel, sz); - - if (pyrBorder != BORDER_TRANSPARENT) - copyMakeBorder(thisLevel, temp, winSize.height, winSize.height, winSize.width, winSize.width, pyrBorder | BORDER_ISOLATED); - temp.adjustROI(-winSize.height, -winSize.height, -winSize.width, -winSize.width); - } - - if (withDerivatives) - { - Mat& deriv = pyramid.getMatRef(level * pyrstep + 1); + Mat& deriv = pyramid.getMatRef(level * pyrstep + 1); - if (!deriv.empty()) - deriv.adjustROI(winSize.height, winSize.height, winSize.width, winSize.width); - if (deriv.type() != derivType || deriv.cols != winSize.width * 2 + sz.width || deriv.rows != winSize.height * 2 + sz.height) - deriv.create(sz.height + winSize.height * 2, sz.width + winSize.width * 2, 
derivType); + if (!deriv.empty()) + deriv.adjustROI(winSize.height, winSize.height, winSize.width, winSize.width); + if (deriv.type() != derivType || deriv.cols != winSize.width * 2 + sz.width || deriv.rows != winSize.height * 2 + sz.height) + deriv.create(sz.height + winSize.height * 2, sz.width + winSize.width * 2, derivType); - Mat derivI = deriv(Rect(winSize.width, winSize.height, sz.width, sz.height)); - calcSharrDeriv(thisLevel, derivI); + Mat derivI = deriv(Rect(winSize.width, winSize.height, sz.width, sz.height)); + calcSharrDeriv(thisLevel, derivI); - if (derivBorder != BORDER_TRANSPARENT) - copyMakeBorder(derivI, deriv, winSize.height, winSize.height, winSize.width, winSize.width, derivBorder | BORDER_ISOLATED); - deriv.adjustROI(-winSize.height, -winSize.height, -winSize.width, -winSize.width); - } - - sz = Size(static_cast((sz.width + 1) / levelScale[0]), - static_cast((sz.height + 1) / levelScale[1])); - if (sz.width <= winSize.width || sz.height <= winSize.height) - { - pyramid.create(1, (level + 1) * pyrstep, 0 /*type*/, -1, true);//check this - return level; - } + if (derivBorder != BORDER_TRANSPARENT) + copyMakeBorder(derivI, deriv, winSize.height, winSize.height, winSize.width, winSize.width, derivBorder | BORDER_ISOLATED); + deriv.adjustROI(-winSize.height, -winSize.height, -winSize.width, -winSize.width); + } - prevLevel = thisLevel; + sz = Size(static_cast((sz.width + 1) / levelScale[0]), + static_cast((sz.height + 1) / levelScale[1])); + if (sz.width <= winSize.width || sz.height <= winSize.height) + { + pyramid.create(1, (level + 1) * pyrstep, 0 /*type*/, -1, true);//check this + return level; } - return maxLevel; + prevLevel = thisLevel; } - int CImageBuffer::buildPyramid(cv::Size winSize, int maxLevel, float levelScale[2]) - { - if (m_Overwrite == false) - return m_maxLevel; - m_maxLevel = buildOpticalFlowPyramidScale(m_Image, m_ImagePyramid, winSize, maxLevel, false, 4, 0, true, levelScale); + + return maxLevel; +} +int 
CImageBuffer::buildPyramid(cv::Size winSize, int maxLevel, float levelScale[2]) +{ + if (m_Overwrite == false) return m_maxLevel; - } + m_maxLevel = buildOpticalFlowPyramidScale(m_Image, m_ImagePyramid, winSize, maxLevel, false, 4, 0, true, levelScale); + return m_maxLevel; +} - void calcLocalOpticalFlowCore( - Ptr prevPyramids[2], - Ptr currPyramids[2], - InputArray _prevPts, - InputOutputArray _nextPts, - const RLOFOpticalFlowParameter & param) +void calcLocalOpticalFlowCore( + Ptr prevPyramids[2], + Ptr currPyramids[2], + InputArray _prevPts, + InputOutputArray _nextPts, + const RLOFOpticalFlowParameter & param) +{ + + bool useAdditionalRGB = param.supportRegionType == SR_CROSS; + int iWinSize = param.largeWinSize; + int winSizes[2] = { iWinSize, iWinSize }; + if (param.supportRegionType != SR_FIXED) { + winSizes[0] = param.smallWinSize; + } + //cv::Size winSize(iWinSize, iWinSize); + cv::TermCriteria criteria(cv::TermCriteria::COUNT | cv::TermCriteria::EPS, param.maxIteration, 0.01); + std::vector rlofNorm = get_norm(param.normSigma0, param.normSigma1); + CV_Assert(winSizes[0] <= winSizes[1]); - bool useAdditionalRGB = param.supportRegionType == SR_CROSS; - int iWinSize = param.largeWinSize; - int winSizes[2] = { iWinSize, iWinSize }; - if (param.supportRegionType != SR_FIXED) - { - winSizes[0] = param.smallWinSize; - } - //cv::Size winSize(iWinSize, iWinSize); - cv::TermCriteria criteria(cv::TermCriteria::COUNT | cv::TermCriteria::EPS, param.maxIteration, 0.01); - std::vector rlofNorm = get_norm(param.normSigma0, param.normSigma1); - CV_Assert(winSizes[0] <= winSizes[1]); + Mat prevPtsMat = _prevPts.getMat(); + const int derivDepth = DataType::depth; - Mat prevPtsMat = _prevPts.getMat(); - const int derivDepth = DataType::depth; + CV_Assert(param.maxLevel >= 0 && iWinSize > 2); - CV_Assert(param.maxLevel >= 0 && iWinSize > 2); + int level = 0, npoints; + CV_Assert((npoints = prevPtsMat.checkVector(2, CV_32F, true)) >= 0); - int level = 0, npoints; - 
CV_Assert((npoints = prevPtsMat.checkVector(2, CV_32F, true)) >= 0); + if (!(param.useInitialFlow)) + _nextPts.create(prevPtsMat.size(), prevPtsMat.type(), -1, true); - if (!(param.useInitialFlow)) - _nextPts.create(prevPtsMat.size(), prevPtsMat.type(), -1, true); + Mat nextPtsMat = _nextPts.getMat(); + CV_Assert(nextPtsMat.checkVector(2, CV_32F, true) == npoints); - Mat nextPtsMat = _nextPts.getMat(); - CV_Assert(nextPtsMat.checkVector(2, CV_32F, true) == npoints); + const Point2f* prevPts = (const Point2f*)prevPtsMat.data; + Point2f* nextPts = (Point2f*)nextPtsMat.data; + std::vector status(npoints); + std::vector err(npoints); + std::vector gainPts(npoints); - const Point2f* prevPts = (const Point2f*)prevPtsMat.data; - Point2f* nextPts = (Point2f*)nextPtsMat.data; - std::vector status(npoints); - std::vector err(npoints); - std::vector gainPts(npoints); + float levelScale[2] = { 2.f,2.f }; - float levelScale[2] = { 2.f,2.f }; + int maxLevel = prevPyramids[0]->buildPyramid(cv::Size(iWinSize, iWinSize), param.maxLevel, levelScale); - int maxLevel = prevPyramids[0]->buildPyramid(cv::Size(iWinSize, iWinSize), param.maxLevel, levelScale); + maxLevel = currPyramids[0]->buildPyramid(cv::Size(iWinSize, iWinSize), maxLevel, levelScale); + if (useAdditionalRGB) + { + prevPyramids[1]->buildPyramid(cv::Size(iWinSize, iWinSize), maxLevel, levelScale); + currPyramids[1]->buildPyramid(cv::Size(iWinSize, iWinSize), maxLevel, levelScale); + } - maxLevel = currPyramids[0]->buildPyramid(cv::Size(iWinSize, iWinSize), maxLevel, levelScale); + if ((criteria.type & TermCriteria::COUNT) == 0) + criteria.maxCount = 30; + else + criteria.maxCount = std::min(std::max(criteria.maxCount, 0), 100); + if ((criteria.type & TermCriteria::EPS) == 0) + criteria.epsilon = 0.001; + else + criteria.epsilon = std::min(std::max(criteria.epsilon, 0.), 10.); + criteria.epsilon *= criteria.epsilon; + + // dI/dx ~ Ix, dI/dy ~ Iy + Mat derivIBuf; + derivIBuf.create(prevPyramids[0]->m_ImagePyramid[0].rows + 
iWinSize * 2, prevPyramids[0]->m_ImagePyramid[0].cols + iWinSize * 2, CV_MAKETYPE(derivDepth, prevPyramids[0]->m_ImagePyramid[0].channels() * 2)); + + for (level = maxLevel; level >= 0; level--) + { + Mat derivI; + Size imgSize = prevPyramids[0]->getImage(level).size(); + Mat _derivI(imgSize.height + iWinSize * 2, imgSize.width + iWinSize * 2, derivIBuf.type(), derivIBuf.data); + derivI = _derivI(Rect(iWinSize, iWinSize, imgSize.width, imgSize.height)); + calcSharrDeriv(prevPyramids[0]->getImage(level), derivI); + copyMakeBorder(derivI, _derivI, iWinSize, iWinSize, iWinSize, iWinSize, BORDER_CONSTANT | BORDER_ISOLATED); + + cv::Mat tRGBPrevPyr; + cv::Mat tRGBNextPyr; if (useAdditionalRGB) { - prevPyramids[1]->buildPyramid(cv::Size(iWinSize, iWinSize), maxLevel, levelScale); - currPyramids[1]->buildPyramid(cv::Size(iWinSize, iWinSize), maxLevel, levelScale); - } + tRGBPrevPyr = prevPyramids[1]->getImage(level); + tRGBNextPyr = prevPyramids[1]->getImage(level); - if ((criteria.type & TermCriteria::COUNT) == 0) - criteria.maxCount = 30; - else - criteria.maxCount = std::min(std::max(criteria.maxCount, 0), 100); - if ((criteria.type & TermCriteria::EPS) == 0) - criteria.epsilon = 0.001; - else - criteria.epsilon = std::min(std::max(criteria.epsilon, 0.), 10.); - criteria.epsilon *= criteria.epsilon; + prevPyramids[1]->m_Overwrite = false; + currPyramids[1]->m_Overwrite = false; + } - // dI/dx ~ Ix, dI/dy ~ Iy - Mat derivIBuf; - derivIBuf.create(prevPyramids[0]->m_ImagePyramid[0].rows + iWinSize * 2, prevPyramids[0]->m_ImagePyramid[0].cols + iWinSize * 2, CV_MAKETYPE(derivDepth, prevPyramids[0]->m_ImagePyramid[0].channels() * 2)); + cv::Mat prevImage = prevPyramids[0]->getImage(level); + cv::Mat currImage = currPyramids[0]->getImage(level); - for (level = maxLevel; level >= 0; level--) + // apply plk like tracker + if (isrobust(param) == false) { - Mat derivI; - Size imgSize = prevPyramids[0]->getImage(level).size(); - Mat _derivI(imgSize.height + iWinSize * 2, 
imgSize.width + iWinSize * 2, derivIBuf.type(), derivIBuf.data); - derivI = _derivI(Rect(iWinSize, iWinSize, imgSize.width, imgSize.height)); - calcSharrDeriv(prevPyramids[0]->getImage(level), derivI); - copyMakeBorder(derivI, _derivI, iWinSize, iWinSize, iWinSize, iWinSize, BORDER_CONSTANT | BORDER_ISOLATED); - - cv::Mat tRGBPrevPyr; - cv::Mat tRGBNextPyr; - if (useAdditionalRGB) + if (param.useIlluminationModel) { - tRGBPrevPyr = prevPyramids[1]->getImage(level); - tRGBNextPyr = prevPyramids[1]->getImage(level); - - prevPyramids[1]->m_Overwrite = false; - currPyramids[1]->m_Overwrite = false; + cv::parallel_for_(cv::Range(0, npoints), + plk::radial::TrackerInvoker( + prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, + prevPts, nextPts, &status[0], &err[0], &gainPts[0], + level, maxLevel, winSizes, + param.maxIteration, + param.useInitialFlow, + param.supportRegionType, + param.minEigenValue, + param.crossSegmentationThreshold)); } - - cv::Mat prevImage = prevPyramids[0]->getImage(level); - cv::Mat currImage = currPyramids[0]->getImage(level); - - // apply plk like tracker - if (isrobust(param) == false) + else + { + if (param.solverType == SolverType::ST_STANDART) + { + cv::parallel_for_(cv::Range(0, npoints), + plk::ica::TrackerInvoker( + prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, + prevPts, nextPts, &status[0], &err[0], + level, maxLevel, winSizes, + param.maxIteration, + param.useInitialFlow, + param.supportRegionType, + param.crossSegmentationThreshold, + param.minEigenValue)); + } + else + { + cv::parallel_for_(cv::Range(0, npoints), + beplk::ica::TrackerInvoker(prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, + prevPts, nextPts, &status[0], &err[0], + level, maxLevel, winSizes, + param.maxIteration, + param.useInitialFlow, + param.supportRegionType, + param.crossSegmentationThreshold, + param.minEigenValue)); + } + } + } + // for robust models + else + { + if (param.useIlluminationModel) { - if (param.useIlluminationModel) + if 
(param.solverType == SolverType::ST_STANDART) { cv::parallel_for_(cv::Range(0, npoints), - plk::radial::TrackerInvoker( + rlof::radial::TrackerInvoker( prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, prevPts, nextPts, &status[0], &err[0], &gainPts[0], level, maxLevel, winSizes, param.maxIteration, param.useInitialFlow, param.supportRegionType, + rlofNorm, param.minEigenValue, param.crossSegmentationThreshold)); } else { - if (param.solverType == SolverType::ST_STANDART) - { - cv::parallel_for_(cv::Range(0, npoints), - plk::ica::TrackerInvoker( - prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, - prevPts, nextPts, &status[0], &err[0], - level, maxLevel, winSizes, - param.maxIteration, - param.useInitialFlow, - param.supportRegionType, - param.crossSegmentationThreshold, - param.minEigenValue)); - } - else - { - cv::parallel_for_(cv::Range(0, npoints), - beplk::ica::TrackerInvoker(prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, - prevPts, nextPts, &status[0], &err[0], - level, maxLevel, winSizes, - param.maxIteration, - param.useInitialFlow, - param.supportRegionType, - param.crossSegmentationThreshold, - param.minEigenValue)); - } + cv::parallel_for_(cv::Range(0, npoints), + berlof::radial::TrackerInvoker(prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, + prevPts, nextPts, &status[0], &err[0], &gainPts[0], + level, maxLevel, winSizes, + param.maxIteration, + param.useInitialFlow, + param.supportRegionType, + param.crossSegmentationThreshold, + rlofNorm, + param.minEigenValue)); } } - // for robust models else { - if (param.useIlluminationModel) + + if (param.solverType == SolverType::ST_STANDART) { - if (param.solverType == SolverType::ST_STANDART) - { - cv::parallel_for_(cv::Range(0, npoints), - rlof::radial::TrackerInvoker( - prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, - prevPts, nextPts, &status[0], &err[0], &gainPts[0], - level, maxLevel, winSizes, - param.maxIteration, - param.useInitialFlow, - param.supportRegionType, 
- rlofNorm, - param.minEigenValue, - param.crossSegmentationThreshold)); - } - else - { - cv::parallel_for_(cv::Range(0, npoints), - berlof::radial::TrackerInvoker(prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, - prevPts, nextPts, &status[0], &err[0], &gainPts[0], - level, maxLevel, winSizes, - param.maxIteration, - param.useInitialFlow, - param.supportRegionType, - param.crossSegmentationThreshold, - rlofNorm, - param.minEigenValue)); - } + cv::parallel_for_(cv::Range(0, npoints), + rlof::ica::TrackerInvoker( + prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, + prevPts, nextPts, &status[0], &err[0], + level, maxLevel, winSizes, + param.maxIteration, + param.useInitialFlow, + param.supportRegionType, + rlofNorm, + param.minEigenValue, + param.crossSegmentationThreshold)); } else { - - if (param.solverType == SolverType::ST_STANDART) - { - cv::parallel_for_(cv::Range(0, npoints), - rlof::ica::TrackerInvoker( - prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, - prevPts, nextPts, &status[0], &err[0], - level, maxLevel, winSizes, - param.maxIteration, - param.useInitialFlow, - param.supportRegionType, - rlofNorm, - param.minEigenValue, - param.crossSegmentationThreshold)); - } - else - { - cv::parallel_for_(cv::Range(0, npoints), - berlof::ica::TrackerInvoker(prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, - prevPts, nextPts, &status[0], &err[0], - level, maxLevel, winSizes, - param.maxIteration, - param.useInitialFlow, - param.supportRegionType, - param.crossSegmentationThreshold, - rlofNorm, - param.minEigenValue)); - } - + cv::parallel_for_(cv::Range(0, npoints), + berlof::ica::TrackerInvoker(prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, + prevPts, nextPts, &status[0], &err[0], + level, maxLevel, winSizes, + param.maxIteration, + param.useInitialFlow, + param.supportRegionType, + param.crossSegmentationThreshold, + rlofNorm, + param.minEigenValue)); } - } - prevPyramids[0]->m_Overwrite = true; - currPyramids[0]->m_Overwrite = 
true; + } } + + prevPyramids[0]->m_Overwrite = true; + currPyramids[0]->m_Overwrite = true; } +} - void preprocess(Ptr prevPyramids[2], - Ptr currPyramids[2], - const std::vector & prevPoints, - std::vector & currPoints, - const RLOFOpticalFlowParameter & param) +void preprocess(Ptr prevPyramids[2], + Ptr currPyramids[2], + const std::vector & prevPoints, + std::vector & currPoints, + const RLOFOpticalFlowParameter & param) +{ + cv::Mat mask, homography; + if (param.useGlobalMotionPrior == false) + return; + + currPoints.resize(prevPoints.size()); + + RLOFOpticalFlowParameter gmeTrackerParam = param; + gmeTrackerParam.useGlobalMotionPrior = false; + gmeTrackerParam.largeWinSize = 17; + // use none robust tracker for global motion estimation since it is faster + gmeTrackerParam.normSigma0 = std::numeric_limits::max(); + gmeTrackerParam.maxIteration = MAX(15, param.maxIteration); + gmeTrackerParam.minEigenValue = 0.000001f; + + std::vector gmPrevPoints, gmCurrPoints; + + // Initialize point grid + int stepr = prevPyramids[0]->m_Image.rows / 30; + int stepc = prevPyramids[0]->m_Image.rows / 40; + for (int r = stepr / 2; r < prevPyramids[0]->m_Image.rows; r += stepr) { - cv::Mat mask, homography; - if (param.useGlobalMotionPrior == false) - return; - - currPoints.resize(prevPoints.size()); - - RLOFOpticalFlowParameter gmeTrackerParam = param; - gmeTrackerParam.useGlobalMotionPrior = false; - gmeTrackerParam.largeWinSize = 17; - // use none robust tracker for global motion estimation since it is faster - gmeTrackerParam.normSigma0 = std::numeric_limits::max(); - gmeTrackerParam.maxIteration = MAX(15, param.maxIteration); - gmeTrackerParam.minEigenValue = 0.000001f; - - std::vector gmPrevPoints, gmCurrPoints; - - // Initialize point grid - int stepr = prevPyramids[0]->m_Image.rows / 30; - int stepc = prevPyramids[0]->m_Image.rows / 40; - for (int r = stepr / 2; r < prevPyramids[0]->m_Image.rows; r += stepr) + for (int c = stepc / 2; c < prevPyramids[0]->m_Image.cols; c 
+= stepc) { - for (int c = stepc / 2; c < prevPyramids[0]->m_Image.cols; c += stepc) - { - gmPrevPoints.push_back(cv::Point2f(static_cast(c), static_cast(r))); - } + gmPrevPoints.push_back(cv::Point2f(static_cast(c), static_cast(r))); } + } - // perform motion estimation - calcLocalOpticalFlowCore(prevPyramids, currPyramids, gmPrevPoints, gmCurrPoints, gmeTrackerParam); + // perform motion estimation + calcLocalOpticalFlowCore(prevPyramids, currPyramids, gmPrevPoints, gmCurrPoints, gmeTrackerParam); - cv::Mat prevPointsMat(static_cast(gmPrevPoints.size()), 1, CV_32FC2); - cv::Mat currPointsMat(static_cast(gmPrevPoints.size()), 1, CV_32FC2); - cv::Mat distMat(static_cast(gmPrevPoints.size()), 1, CV_32FC1); + cv::Mat prevPointsMat(static_cast(gmPrevPoints.size()), 1, CV_32FC2); + cv::Mat currPointsMat(static_cast(gmPrevPoints.size()), 1, CV_32FC2); + cv::Mat distMat(static_cast(gmPrevPoints.size()), 1, CV_32FC1); - // Forward backward confidence to estimate optimal ransac reprojection error - int noPoints = 0; - for (unsigned int n = 0; n < gmPrevPoints.size(); n++) - { - cv::Point2f flow = gmCurrPoints[n] - gmPrevPoints[n]; - prevPointsMat.at(noPoints) = gmPrevPoints[n]; - currPointsMat.at(noPoints) = gmCurrPoints[n]; - distMat.at(noPoints) = flow.x * flow.x + flow.y* flow.y; - if (isnan(distMat.at(noPoints)) == false) - noPoints++; - } + // Forward backward confidence to estimate optimal ransac reprojection error + int noPoints = 0; + for (unsigned int n = 0; n < gmPrevPoints.size(); n++) + { + cv::Point2f flow = gmCurrPoints[n] - gmPrevPoints[n]; + prevPointsMat.at(noPoints) = gmPrevPoints[n]; + currPointsMat.at(noPoints) = gmCurrPoints[n]; + distMat.at(noPoints) = flow.x * flow.x + flow.y* flow.y; + if (isnan(distMat.at(noPoints)) == false) + noPoints++; + } - float medianDist = (param.globalMotionRansacThreshold == 0) ? 
1.f : - quickselect(distMat, static_cast(noPoints * static_cast(param.globalMotionRansacThreshold) / 100.f)); - medianDist = sqrt(medianDist); + float medianDist = (param.globalMotionRansacThreshold == 0) ? 1.f : + quickselect(distMat, static_cast(noPoints * static_cast(param.globalMotionRansacThreshold) / 100.f)); + medianDist = sqrt(medianDist); - if (noPoints < 8) - return; + if (noPoints < 8) + return; - cv::findHomography(prevPointsMat(cv::Rect(0, 0, 1, noPoints)), currPointsMat(cv::Rect(0, 0, 1, noPoints)), cv::RANSAC, medianDist, mask).convertTo(homography, CV_32FC1); + cv::findHomography(prevPointsMat(cv::Rect(0, 0, 1, noPoints)), currPointsMat(cv::Rect(0, 0, 1, noPoints)), cv::RANSAC, medianDist, mask).convertTo(homography, CV_32FC1); - if (homography.empty()) - return; + if (homography.empty()) + return; - cv::perspectiveTransform(prevPoints, currPoints, homography); - } + cv::perspectiveTransform(prevPoints, currPoints, homography); +} - void calcLocalOpticalFlow( - const Mat prevImage, - const Mat currImage, - Ptr prevPyramids[2], - Ptr currPyramids[2], - const std::vector & prevPoints, - std::vector & currPoints, - const RLOFOpticalFlowParameter & param) +void calcLocalOpticalFlow( + const Mat prevImage, + const Mat currImage, + Ptr prevPyramids[2], + Ptr currPyramids[2], + const std::vector & prevPoints, + std::vector & currPoints, + const RLOFOpticalFlowParameter & param) +{ + if (prevImage.empty() == false && currImage.empty()== false) { - if (prevImage.empty() == false && currImage.empty()== false) + prevPyramids[0]->m_Overwrite = true; + currPyramids[0]->m_Overwrite = true; + prevPyramids[1]->m_Overwrite = true; + currPyramids[1]->m_Overwrite = true; + if (prevImage.type() == CV_8UC3) { - prevPyramids[0]->m_Overwrite = true; - currPyramids[0]->m_Overwrite = true; - prevPyramids[1]->m_Overwrite = true; - currPyramids[1]->m_Overwrite = true; - if (prevImage.type() == CV_8UC3) - { - prevPyramids[0]->setGrayFromRGB(prevImage); - 
currPyramids[0]->setGrayFromRGB(currImage); - prevPyramids[1]->setImage(prevImage); - currPyramids[1]->setImage(currImage); + prevPyramids[0]->setGrayFromRGB(prevImage); + currPyramids[0]->setGrayFromRGB(currImage); + prevPyramids[1]->setImage(prevImage); + currPyramids[1]->setImage(currImage); - if (param.supportRegionType == SR_CROSS) - { - prevPyramids[1]->setBlurFromRGB(prevImage); - currPyramids[1]->setBlurFromRGB(currImage); - } - } - else + if (param.supportRegionType == SR_CROSS) { - prevPyramids[0]->setImage(prevImage); - currPyramids[0]->setImage(currImage); + prevPyramids[1]->setBlurFromRGB(prevImage); + currPyramids[1]->setBlurFromRGB(currImage); } } - preprocess(prevPyramids, currPyramids, prevPoints, currPoints, param); - RLOFOpticalFlowParameter internParam = param; - if (param.useGlobalMotionPrior == true) - internParam.useInitialFlow = true; - calcLocalOpticalFlowCore(prevPyramids, currPyramids, prevPoints, currPoints, internParam); + else + { + prevPyramids[0]->setImage(prevImage); + currPyramids[0]->setImage(currImage); + } } + preprocess(prevPyramids, currPyramids, prevPoints, currPoints, param); + RLOFOpticalFlowParameter internParam = param; + if (param.useGlobalMotionPrior == true) + internParam.useInitialFlow = true; + calcLocalOpticalFlowCore(prevPyramids, currPyramids, prevPoints, currPoints, internParam); } -} \ No newline at end of file + +}} // namespace diff --git a/modules/optflow/src/rlof/rlof_localflow.h b/modules/optflow/src/rlof/rlof_localflow.h index 89ac2f8db21..00702e5323b 100644 --- a/modules/optflow/src/rlof/rlof_localflow.h +++ b/modules/optflow/src/rlof/rlof_localflow.h @@ -69,16 +69,15 @@ T quickselect(const Mat & inp, int k) } } -namespace cv -{ -namespace optflow -{ +namespace cv { +namespace optflow { + class CImageBuffer { public: CImageBuffer() : m_Overwrite(true) - {}; + {} void setGrayFromRGB(const cv::Mat & inp) { if(m_Overwrite) @@ -98,12 +97,12 @@ class CImageBuffer int buildPyramid(cv::Size winSize, int 
maxLevel, float levelScale[2]); cv::Mat & getImage(int level) {return m_ImagePyramid[level];} - std::vector m_ImagePyramid; - cv::Mat m_BlurredImage; - cv::Mat m_Image; - std::vector m_CrossPyramid; - int m_maxLevel; - bool m_Overwrite; + std::vector m_ImagePyramid; + cv::Mat m_BlurredImage; + cv::Mat m_Image; + std::vector m_CrossPyramid; + int m_maxLevel; + bool m_Overwrite; }; void calcLocalOpticalFlow( @@ -114,6 +113,6 @@ void calcLocalOpticalFlow( const std::vector & prevPoints, std::vector & currPoints, const RLOFOpticalFlowParameter & param); -} -} + +}} // namespace #endif diff --git a/modules/optflow/src/rlofflow.cpp b/modules/optflow/src/rlofflow.cpp index ff43fc39820..12a72cb035f 100644 --- a/modules/optflow/src/rlofflow.cpp +++ b/modules/optflow/src/rlofflow.cpp @@ -5,398 +5,395 @@ #include "rlof/rlof_localflow.h" #include "rlof/geo_interpolation.hpp" #include "opencv2/ximgproc.hpp" -namespace cv -{ -namespace optflow + +namespace cv { +namespace optflow { + +Ptr RLOFOpticalFlowParameter::create() { + return Ptr(new RLOFOpticalFlowParameter); +} - Ptr RLOFOpticalFlowParameter::create() - { - return Ptr(new RLOFOpticalFlowParameter); - } +void RLOFOpticalFlowParameter::setSolverType(SolverType val){ solverType = val;} +SolverType RLOFOpticalFlowParameter::getSolverType() const { return solverType;} - void RLOFOpticalFlowParameter::setSolverType(SolverType val){ solverType = val;} - SolverType RLOFOpticalFlowParameter::getSolverType() const { return solverType;} +void RLOFOpticalFlowParameter::setSupportRegionType(SupportRegionType val){ supportRegionType = val;} +SupportRegionType RLOFOpticalFlowParameter::getSupportRegionType() const { return supportRegionType;} - void RLOFOpticalFlowParameter::setSupportRegionType(SupportRegionType val){ supportRegionType = val;} - SupportRegionType RLOFOpticalFlowParameter::getSupportRegionType() const { return supportRegionType;} +void RLOFOpticalFlowParameter::setNormSigma0(float val){ normSigma0 = val;} +float 
RLOFOpticalFlowParameter::getNormSigma0() const { return normSigma0;} - void RLOFOpticalFlowParameter::setNormSigma0(float val){ normSigma0 = val;} - float RLOFOpticalFlowParameter::getNormSigma0() const { return normSigma0;} +void RLOFOpticalFlowParameter::setNormSigma1(float val){ normSigma1 = val;} +float RLOFOpticalFlowParameter::getNormSigma1() const { return normSigma1;} - void RLOFOpticalFlowParameter::setNormSigma1(float val){ normSigma1 = val;} - float RLOFOpticalFlowParameter::getNormSigma1() const { return normSigma1;} +void RLOFOpticalFlowParameter::setSmallWinSize(int val){ smallWinSize = val;} +int RLOFOpticalFlowParameter::getSmallWinSize() const { return smallWinSize;} - void RLOFOpticalFlowParameter::setSmallWinSize(int val){ smallWinSize = val;} - int RLOFOpticalFlowParameter::getSmallWinSize() const { return smallWinSize;} +void RLOFOpticalFlowParameter::setLargeWinSize(int val){ largeWinSize = val;} +int RLOFOpticalFlowParameter::getLargeWinSize() const { return largeWinSize;} - void RLOFOpticalFlowParameter::setLargeWinSize(int val){ largeWinSize = val;} - int RLOFOpticalFlowParameter::getLargeWinSize() const { return largeWinSize;} +void RLOFOpticalFlowParameter::setCrossSegmentationThreshold(int val){ crossSegmentationThreshold = val;} +int RLOFOpticalFlowParameter::getCrossSegmentationThreshold() const { return crossSegmentationThreshold;} - void RLOFOpticalFlowParameter::setCrossSegmentationThreshold(int val){ crossSegmentationThreshold = val;} - int RLOFOpticalFlowParameter::getCrossSegmentationThreshold() const { return crossSegmentationThreshold;} +void RLOFOpticalFlowParameter::setMaxLevel(int val){ maxLevel = val;} +int RLOFOpticalFlowParameter::getMaxLevel() const { return maxLevel;} - void RLOFOpticalFlowParameter::setMaxLevel(int val){ maxLevel = val;} - int RLOFOpticalFlowParameter::getMaxLevel() const { return maxLevel;} +void RLOFOpticalFlowParameter::setUseInitialFlow(bool val){ useInitialFlow = val;} +bool 
RLOFOpticalFlowParameter::getUseInitialFlow() const { return useInitialFlow;} - void RLOFOpticalFlowParameter::setUseInitialFlow(bool val){ useInitialFlow = val;} - bool RLOFOpticalFlowParameter::getUseInitialFlow() const { return useInitialFlow;} +void RLOFOpticalFlowParameter::setUseIlluminationModel(bool val){ useIlluminationModel = val;} +bool RLOFOpticalFlowParameter::getUseIlluminationModel() const { return useIlluminationModel;} - void RLOFOpticalFlowParameter::setUseIlluminationModel(bool val){ useIlluminationModel = val;} - bool RLOFOpticalFlowParameter::getUseIlluminationModel() const { return useIlluminationModel;} +void RLOFOpticalFlowParameter::setUseGlobalMotionPrior(bool val){ useGlobalMotionPrior = val;} +bool RLOFOpticalFlowParameter::getUseGlobalMotionPrior() const { return useGlobalMotionPrior;} - void RLOFOpticalFlowParameter::setUseGlobalMotionPrior(bool val){ useGlobalMotionPrior = val;} - bool RLOFOpticalFlowParameter::getUseGlobalMotionPrior() const { return useGlobalMotionPrior;} +void RLOFOpticalFlowParameter::setMaxIteration(int val){ maxIteration = val;} +int RLOFOpticalFlowParameter::getMaxIteration() const { return maxIteration;} - void RLOFOpticalFlowParameter::setMaxIteration(int val){ maxIteration = val;} - int RLOFOpticalFlowParameter::getMaxIteration() const { return maxIteration;} +void RLOFOpticalFlowParameter::setMinEigenValue(float val){ minEigenValue = val;} +float RLOFOpticalFlowParameter::getMinEigenValue() const { return minEigenValue;} - void RLOFOpticalFlowParameter::setMinEigenValue(float val){ minEigenValue = val;} - float RLOFOpticalFlowParameter::getMinEigenValue() const { return minEigenValue;} +void RLOFOpticalFlowParameter::setGlobalMotionRansacThreshold(float val){ globalMotionRansacThreshold = val;} +float RLOFOpticalFlowParameter::getGlobalMotionRansacThreshold() const { return globalMotionRansacThreshold;} - void RLOFOpticalFlowParameter::setGlobalMotionRansacThreshold(float val){ globalMotionRansacThreshold = 
val;} - float RLOFOpticalFlowParameter::getGlobalMotionRansacThreshold() const { return globalMotionRansacThreshold;} +class DenseOpticalFlowRLOFImpl : public DenseRLOFOpticalFlow +{ +public: + DenseOpticalFlowRLOFImpl() + : param(Ptr(new RLOFOpticalFlowParameter)) + , forwardBackwardThreshold(1.f) + , gridStep(6, 6) + , interp_type(InterpolationType::INTERP_GEO) + , k(128) + , sigma(0.05f) + , lambda(999.f) + , fgs_lambda(500.0f) + , fgs_sigma(1.5f) + , use_post_proc(true) - class DenseOpticalFlowRLOFImpl : public DenseRLOFOpticalFlow { - public: - DenseOpticalFlowRLOFImpl() - : param(Ptr(new RLOFOpticalFlowParameter)) - , forwardBackwardThreshold(1.f) - , gridStep(6, 6) - , interp_type(InterpolationType::INTERP_GEO) - , k(128) - , sigma(0.05f) - , lambda(999.f) - , fgs_lambda(500.0f) - , fgs_sigma(1.5f) - , use_post_proc(true) - - { - prevPyramid[0] = cv::Ptr< CImageBuffer>(new CImageBuffer); - prevPyramid[1] = cv::Ptr< CImageBuffer>(new CImageBuffer); - currPyramid[0] = cv::Ptr< CImageBuffer>(new CImageBuffer); - currPyramid[1] = cv::Ptr< CImageBuffer>(new CImageBuffer); - } - virtual void setRLOFOpticalFlowParameter(Ptr val) CV_OVERRIDE { param = val; } - virtual Ptr getRLOFOpticalFlowParameter() const CV_OVERRIDE { return param; } + prevPyramid[0] = cv::Ptr(new CImageBuffer); + prevPyramid[1] = cv::Ptr(new CImageBuffer); + currPyramid[0] = cv::Ptr(new CImageBuffer); + currPyramid[1] = cv::Ptr(new CImageBuffer); + } + virtual void setRLOFOpticalFlowParameter(Ptr val) CV_OVERRIDE { param = val; } + virtual Ptr getRLOFOpticalFlowParameter() const CV_OVERRIDE { return param; } - virtual float getForwardBackward() const CV_OVERRIDE { return forwardBackwardThreshold; } - virtual void setForwardBackward(float val) CV_OVERRIDE { forwardBackwardThreshold = val; } + virtual float getForwardBackward() const CV_OVERRIDE { return forwardBackwardThreshold; } + virtual void setForwardBackward(float val) CV_OVERRIDE { forwardBackwardThreshold = val; } - virtual void 
setInterpolation(InterpolationType val) CV_OVERRIDE { interp_type = val; } - virtual InterpolationType getInterpolation() const CV_OVERRIDE { return interp_type; } + virtual void setInterpolation(InterpolationType val) CV_OVERRIDE { interp_type = val; } + virtual InterpolationType getInterpolation() const CV_OVERRIDE { return interp_type; } - virtual Size getGridStep() const CV_OVERRIDE { return gridStep; } - virtual void setGridStep(Size val) CV_OVERRIDE { gridStep = val; } + virtual Size getGridStep() const CV_OVERRIDE { return gridStep; } + virtual void setGridStep(Size val) CV_OVERRIDE { gridStep = val; } - virtual int getEPICK() const CV_OVERRIDE { return k; } - virtual void setEPICK(int val) CV_OVERRIDE { k = val; } + virtual int getEPICK() const CV_OVERRIDE { return k; } + virtual void setEPICK(int val) CV_OVERRIDE { k = val; } - virtual float getEPICSigma() const CV_OVERRIDE { return sigma; } - virtual void setEPICSigma(float val) CV_OVERRIDE { sigma = val; } + virtual float getEPICSigma() const CV_OVERRIDE { return sigma; } + virtual void setEPICSigma(float val) CV_OVERRIDE { sigma = val; } - virtual float getEPICLambda() const CV_OVERRIDE { return lambda; } - virtual void setEPICLambda(float val) CV_OVERRIDE { lambda = val; } + virtual float getEPICLambda() const CV_OVERRIDE { return lambda; } + virtual void setEPICLambda(float val) CV_OVERRIDE { lambda = val; } - virtual float getFgsLambda() const CV_OVERRIDE { return fgs_lambda; } - virtual void setFgsLambda(float val) CV_OVERRIDE { fgs_lambda = val; } + virtual float getFgsLambda() const CV_OVERRIDE { return fgs_lambda; } + virtual void setFgsLambda(float val) CV_OVERRIDE { fgs_lambda = val; } - virtual float getFgsSigma() const CV_OVERRIDE { return fgs_sigma; } - virtual void setFgsSigma(float val) CV_OVERRIDE { fgs_sigma = val; } + virtual float getFgsSigma() const CV_OVERRIDE { return fgs_sigma; } + virtual void setFgsSigma(float val) CV_OVERRIDE { fgs_sigma = val; } - virtual bool getUsePostProc() 
const CV_OVERRIDE { return use_post_proc; } - virtual void setUsePostProc(bool val) CV_OVERRIDE { use_post_proc = val; } + virtual bool getUsePostProc() const CV_OVERRIDE { return use_post_proc; } + virtual void setUsePostProc(bool val) CV_OVERRIDE { use_post_proc = val; } - virtual void calc(InputArray I0, InputArray I1, InputOutputArray flow) CV_OVERRIDE + virtual void calc(InputArray I0, InputArray I1, InputOutputArray flow) CV_OVERRIDE + { + CV_Assert(!I0.empty() && I0.depth() == CV_8U && (I0.channels() == 3 || I0.channels() == 1)); + CV_Assert(!I1.empty() && I1.depth() == CV_8U && (I1.channels() == 3 || I1.channels() == 1)); + CV_Assert(I0.sameSize(I1)); + if (param.empty()) + param = Ptr(new RLOFOpticalFlowParameter()); + if (param->supportRegionType == SR_CROSS) + CV_Assert( I0.channels() == 3 && I1.channels() == 3); + CV_Assert(interp_type == InterpolationType::INTERP_EPIC || interp_type == InterpolationType::INTERP_GEO); + // if no parameter is used use the default parameter + + Mat prevImage = I0.getMat(); + Mat currImage = I1.getMat(); + int noPoints = prevImage.cols * prevImage.rows; + std::vector prevPoints(noPoints); + std::vector currPoints, refPoints; + noPoints = 0; + cv::Size grid_h = gridStep / 2; + for (int r = grid_h.height; r < prevImage.rows; r += gridStep.height) { - CV_Assert(!I0.empty() && I0.depth() == CV_8U && (I0.channels() == 3 || I0.channels() == 1)); - CV_Assert(!I1.empty() && I1.depth() == CV_8U && (I1.channels() == 3 || I1.channels() == 1)); - CV_Assert(I0.sameSize(I1)); - if (param.empty()) - param = Ptr(new RLOFOpticalFlowParameter()); - if (param->supportRegionType == SR_CROSS) - CV_Assert( I0.channels() == 3 && I1.channels() == 3); - CV_Assert(interp_type == InterpolationType::INTERP_EPIC || interp_type == InterpolationType::INTERP_GEO); - // if no parameter is used use the default parameter - - Mat prevImage = I0.getMat(); - Mat currImage = I1.getMat(); - int noPoints = prevImage.cols * prevImage.rows; - std::vector 
prevPoints(noPoints); - std::vector currPoints, refPoints; - noPoints = 0; - cv::Size grid_h = gridStep / 2; - for (int r = grid_h.height; r < prevImage.rows; r += gridStep.height) + for (int c = grid_h.width; c < prevImage.cols; c += gridStep.width) { - for (int c = grid_h.width; c < prevImage.cols; c += gridStep.width) - { - prevPoints[noPoints++] = cv::Point2f(static_cast(c), static_cast(r)); - } + prevPoints[noPoints++] = cv::Point2f(static_cast(c), static_cast(r)); } - prevPoints.erase(prevPoints.begin() + noPoints, prevPoints.end()); - currPoints.resize(prevPoints.size()); - calcLocalOpticalFlow(prevImage, currImage, prevPyramid, currPyramid, prevPoints, currPoints, *(param.get())); - flow.create(prevImage.size(), CV_32FC2); - Mat dense_flow = flow.getMat(); - - std::vector filtered_prevPoints; - std::vector filtered_currPoints; - if (gridStep == cv::Size(1, 1) && forwardBackwardThreshold <= 0) + } + prevPoints.erase(prevPoints.begin() + noPoints, prevPoints.end()); + currPoints.resize(prevPoints.size()); + calcLocalOpticalFlow(prevImage, currImage, prevPyramid, currPyramid, prevPoints, currPoints, *(param.get())); + flow.create(prevImage.size(), CV_32FC2); + Mat dense_flow = flow.getMat(); + + std::vector filtered_prevPoints; + std::vector filtered_currPoints; + if (gridStep == cv::Size(1, 1) && forwardBackwardThreshold <= 0) + { + for (unsigned int n = 0; n < prevPoints.size(); n++) { - for (unsigned int n = 0; n < prevPoints.size(); n++) - { - dense_flow.at(prevPoints[n]) = currPoints[n] - prevPoints[n]; - } - return; + dense_flow.at(prevPoints[n]) = currPoints[n] - prevPoints[n]; } - if (forwardBackwardThreshold > 0) + return; + } + if (forwardBackwardThreshold > 0) + { + // reuse image pyramids + calcLocalOpticalFlow(currImage, prevImage, currPyramid, prevPyramid, currPoints, refPoints, *(param.get())); + + filtered_prevPoints.resize(prevPoints.size()); + filtered_currPoints.resize(prevPoints.size()); + float sqrForwardBackwardThreshold = 
forwardBackwardThreshold * forwardBackwardThreshold; + noPoints = 0; + for (unsigned int r = 0; r < refPoints.size(); r++) { - // reuse image pyramids - calcLocalOpticalFlow(currImage, prevImage, currPyramid, prevPyramid, currPoints, refPoints, *(param.get())); - - filtered_prevPoints.resize(prevPoints.size()); - filtered_currPoints.resize(prevPoints.size()); - float sqrForwardBackwardThreshold = forwardBackwardThreshold * forwardBackwardThreshold; - noPoints = 0; - for (unsigned int r = 0; r < refPoints.size(); r++) + Point2f diff = refPoints[r] - prevPoints[r]; + if (diff.x * diff.x + diff.y * diff.y < sqrForwardBackwardThreshold) { - Point2f diff = refPoints[r] - prevPoints[r]; - if (diff.x * diff.x + diff.y * diff.y < sqrForwardBackwardThreshold) - { - filtered_prevPoints[noPoints] = prevPoints[r]; - filtered_currPoints[noPoints++] = currPoints[r]; - } + filtered_prevPoints[noPoints] = prevPoints[r]; + filtered_currPoints[noPoints++] = currPoints[r]; } - - filtered_prevPoints.erase(filtered_prevPoints.begin() + noPoints, filtered_prevPoints.end()); - filtered_currPoints.erase(filtered_currPoints.begin() + noPoints, filtered_currPoints.end()); - - } - else - { - filtered_prevPoints = prevPoints; - filtered_currPoints = currPoints; } - if (interp_type == InterpolationType::INTERP_EPIC) - { - Ptr gd = ximgproc::createEdgeAwareInterpolator(); - gd->setK(k); - gd->setSigma(sigma); - gd->setLambda(lambda); - gd->setUsePostProcessing(false); - gd->interpolate(prevImage, filtered_prevPoints, currImage, filtered_currPoints, dense_flow); - } - else - { - Mat blurredPrevImage, blurredCurrImage; - GaussianBlur(prevImage, blurredPrevImage, cv::Size(5, 5), -1); - std::vector status(filtered_currPoints.size(), 1); - interpolate_irregular_nn_raster(filtered_prevPoints, filtered_currPoints, status, blurredPrevImage).copyTo(dense_flow); - std::vector vecMats; - std::vector vecMats2(2); - cv::split(dense_flow, vecMats); - cv::bilateralFilter(vecMats[0], vecMats2[0], 5, 2, 20); - 
cv::bilateralFilter(vecMats[1], vecMats2[1], 5, 2, 20); - cv::merge(vecMats2, dense_flow); - } - if (use_post_proc) - { - ximgproc::fastGlobalSmootherFilter(prevImage, flow, flow, fgs_lambda, fgs_sigma); - } - } + filtered_prevPoints.erase(filtered_prevPoints.begin() + noPoints, filtered_prevPoints.end()); + filtered_currPoints.erase(filtered_currPoints.begin() + noPoints, filtered_currPoints.end()); - virtual void collectGarbage() CV_OVERRIDE + } + else { - prevPyramid[0].release(); - prevPyramid[1].release(); - currPyramid[0].release(); - currPyramid[1].release(); + filtered_prevPoints = prevPoints; + filtered_currPoints = currPoints; } - protected: - Ptr param; - float forwardBackwardThreshold; - Ptr prevPyramid[2]; - Ptr currPyramid[2]; - cv::Size gridStep; - InterpolationType interp_type; - int k; - float sigma; - float lambda; - float fgs_lambda; - float fgs_sigma; - bool use_post_proc; - }; - - Ptr DenseRLOFOpticalFlow::create( - Ptr rlofParam, - float forwardBackwardThreshold, - cv::Size gridStep, - InterpolationType interp_type, - int epicK, - float epicSigma, - float epicLambda, - bool use_post_proc, - float fgs_lambda, - float fgs_sigma) - { - Ptr algo = makePtr(); - algo->setRLOFOpticalFlowParameter(rlofParam); - algo->setForwardBackward(forwardBackwardThreshold); - algo->setGridStep(gridStep); - algo->setInterpolation(interp_type); - algo->setEPICK(epicK); - algo->setEPICSigma(epicSigma); - algo->setEPICLambda(epicLambda); - algo->setUsePostProc(use_post_proc); - algo->setFgsLambda(fgs_lambda); - algo->setFgsSigma(fgs_sigma); - return algo; + if (interp_type == InterpolationType::INTERP_EPIC) + { + Ptr gd = ximgproc::createEdgeAwareInterpolator(); + gd->setK(k); + gd->setSigma(sigma); + gd->setLambda(lambda); + gd->setUsePostProcessing(false); + gd->interpolate(prevImage, filtered_prevPoints, currImage, filtered_currPoints, dense_flow); + } + else + { + Mat blurredPrevImage, blurredCurrImage; + GaussianBlur(prevImage, blurredPrevImage, cv::Size(5, 5), 
-1); + std::vector status(filtered_currPoints.size(), 1); + interpolate_irregular_nn_raster(filtered_prevPoints, filtered_currPoints, status, blurredPrevImage).copyTo(dense_flow); + std::vector vecMats; + std::vector vecMats2(2); + cv::split(dense_flow, vecMats); + cv::bilateralFilter(vecMats[0], vecMats2[0], 5, 2, 20); + cv::bilateralFilter(vecMats[1], vecMats2[1], 5, 2, 20); + cv::merge(vecMats2, dense_flow); + } + if (use_post_proc) + { + ximgproc::fastGlobalSmootherFilter(prevImage, flow, flow, fgs_lambda, fgs_sigma); + } } - class SparseRLOFOpticalFlowImpl : public SparseRLOFOpticalFlow + virtual void collectGarbage() CV_OVERRIDE { - public: - SparseRLOFOpticalFlowImpl() - : param(Ptr(new RLOFOpticalFlowParameter)) - , forwardBackwardThreshold(1.f) - { - prevPyramid[0] = cv::Ptr< CImageBuffer>(new CImageBuffer); - prevPyramid[1] = cv::Ptr< CImageBuffer>(new CImageBuffer); - currPyramid[0] = cv::Ptr< CImageBuffer>(new CImageBuffer); - currPyramid[1] = cv::Ptr< CImageBuffer>(new CImageBuffer); - } - virtual void setRLOFOpticalFlowParameter(Ptr val) CV_OVERRIDE { param = val; } - virtual Ptr getRLOFOpticalFlowParameter() const CV_OVERRIDE { return param; } + prevPyramid[0].release(); + prevPyramid[1].release(); + currPyramid[0].release(); + currPyramid[1].release(); + } - virtual float getForwardBackward() const CV_OVERRIDE { return forwardBackwardThreshold; } - virtual void setForwardBackward(float val) CV_OVERRIDE { forwardBackwardThreshold = val; } +protected: + Ptr param; + float forwardBackwardThreshold; + Ptr prevPyramid[2]; + Ptr currPyramid[2]; + cv::Size gridStep; + InterpolationType interp_type; + int k; + float sigma; + float lambda; + float fgs_lambda; + float fgs_sigma; + bool use_post_proc; +}; + +Ptr DenseRLOFOpticalFlow::create( + Ptr rlofParam, + float forwardBackwardThreshold, + cv::Size gridStep, + InterpolationType interp_type, + int epicK, + float epicSigma, + float epicLambda, + bool use_post_proc, + float fgs_lambda, + float fgs_sigma) +{ + 
Ptr algo = makePtr(); + algo->setRLOFOpticalFlowParameter(rlofParam); + algo->setForwardBackward(forwardBackwardThreshold); + algo->setGridStep(gridStep); + algo->setInterpolation(interp_type); + algo->setEPICK(epicK); + algo->setEPICSigma(epicSigma); + algo->setEPICLambda(epicLambda); + algo->setUsePostProc(use_post_proc); + algo->setFgsLambda(fgs_lambda); + algo->setFgsSigma(fgs_sigma); + return algo; +} - virtual void calc(InputArray prevImg, InputArray nextImg, - InputArray prevPts, InputOutputArray nextPts, - OutputArray status, - OutputArray err) CV_OVERRIDE - { - CV_Assert(!prevImg.empty() && prevImg.depth() == CV_8U && (prevImg.channels() == 3 || prevImg.channels() == 1)); - CV_Assert(!nextImg.empty() && nextImg.depth() == CV_8U && (nextImg.channels() == 3 || nextImg.channels() == 1)); - CV_Assert(prevImg.sameSize(nextImg)); +class SparseRLOFOpticalFlowImpl : public SparseRLOFOpticalFlow +{ + public: + SparseRLOFOpticalFlowImpl() + : param(Ptr(new RLOFOpticalFlowParameter)) + , forwardBackwardThreshold(1.f) + { + prevPyramid[0] = cv::Ptr< CImageBuffer>(new CImageBuffer); + prevPyramid[1] = cv::Ptr< CImageBuffer>(new CImageBuffer); + currPyramid[0] = cv::Ptr< CImageBuffer>(new CImageBuffer); + currPyramid[1] = cv::Ptr< CImageBuffer>(new CImageBuffer); + } + virtual void setRLOFOpticalFlowParameter(Ptr val) CV_OVERRIDE { param = val; } + virtual Ptr getRLOFOpticalFlowParameter() const CV_OVERRIDE { return param; } - Mat prevImage = prevImg.getMat(); - Mat nextImage = nextImg.getMat(); - Mat prevPtsMat = prevPts.getMat(); + virtual float getForwardBackward() const CV_OVERRIDE { return forwardBackwardThreshold; } + virtual void setForwardBackward(float val) CV_OVERRIDE { forwardBackwardThreshold = val; } - if (param.empty()) - { - param = Ptr(new RLOFOpticalFlowParameter); - } + virtual void calc(InputArray prevImg, InputArray nextImg, + InputArray prevPts, InputOutputArray nextPts, + OutputArray status, + OutputArray err) CV_OVERRIDE + { + 
CV_Assert(!prevImg.empty() && prevImg.depth() == CV_8U && (prevImg.channels() == 3 || prevImg.channels() == 1)); + CV_Assert(!nextImg.empty() && nextImg.depth() == CV_8U && (nextImg.channels() == 3 || nextImg.channels() == 1)); + CV_Assert(prevImg.sameSize(nextImg)); - if (param->useInitialFlow == false) - nextPts.create(prevPtsMat.size(), prevPtsMat.type(), -1, true); + Mat prevImage = prevImg.getMat(); + Mat nextImage = nextImg.getMat(); + Mat prevPtsMat = prevPts.getMat(); - int npoints = 0; - CV_Assert((npoints = prevPtsMat.checkVector(2, CV_32F, true)) >= 0); + if (param.empty()) + { + param = Ptr(new RLOFOpticalFlowParameter); + } - if (npoints == 0) - { - nextPts.release(); - status.release(); - err.release(); - return; - } + if (param->useInitialFlow == false) + nextPts.create(prevPtsMat.size(), prevPtsMat.type(), -1, true); - Mat nextPtsMat = nextPts.getMat(); - CV_Assert(nextPtsMat.checkVector(2, CV_32F, true) == npoints); - std::vector prevPoints(npoints), nextPoints(npoints), refPoints; - prevPtsMat.copyTo(cv::Mat(1, npoints, CV_32FC2, &prevPoints[0])); - if (param->useInitialFlow ) - nextPtsMat.copyTo(cv::Mat(1, nextPtsMat.cols, CV_32FC2, &nextPoints[0])); + int npoints = 0; + CV_Assert((npoints = prevPtsMat.checkVector(2, CV_32F, true)) >= 0); - cv::Mat statusMat; - cv::Mat errorMat; - if (status.needed() || forwardBackwardThreshold > 0) - { - status.create((int)npoints, 1, CV_8U, -1, true); - statusMat = status.getMat(); - statusMat.setTo(1); - } + if (npoints == 0) + { + nextPts.release(); + status.release(); + err.release(); + return; + } - if (err.needed() || forwardBackwardThreshold > 0) - { - err.create((int)npoints, 1, CV_32F, -1, true); - errorMat = err.getMat(); - errorMat.setTo(0); - } + Mat nextPtsMat = nextPts.getMat(); + CV_Assert(nextPtsMat.checkVector(2, CV_32F, true) == npoints); + std::vector prevPoints(npoints), nextPoints(npoints), refPoints; + prevPtsMat.copyTo(cv::Mat(1, npoints, CV_32FC2, &prevPoints[0])); + if 
(param->useInitialFlow ) + nextPtsMat.copyTo(cv::Mat(1, nextPtsMat.cols, CV_32FC2, &nextPoints[0])); - calcLocalOpticalFlow(prevImage, nextImage, prevPyramid, currPyramid, prevPoints, nextPoints, *(param.get())); - cv::Mat(1,npoints , CV_32FC2, &nextPoints[0]).copyTo(nextPtsMat); - if (forwardBackwardThreshold > 0) - { - // reuse image pyramids - calcLocalOpticalFlow(nextImage, prevImage, currPyramid, prevPyramid, nextPoints, refPoints, *(param.get())); - } - for (unsigned int r = 0; r < refPoints.size(); r++) - { - Point2f diff = refPoints[r] - prevPoints[r]; - errorMat.at(r) = sqrt(diff.x * diff.x + diff.y * diff.y); - if (errorMat.at(r) <= forwardBackwardThreshold) - statusMat.at(r) = 0; - } + cv::Mat statusMat; + cv::Mat errorMat; + if (status.needed() || forwardBackwardThreshold > 0) + { + status.create((int)npoints, 1, CV_8U, -1, true); + statusMat = status.getMat(); + statusMat.setTo(1); + } + if (err.needed() || forwardBackwardThreshold > 0) + { + err.create((int)npoints, 1, CV_32F, -1, true); + errorMat = err.getMat(); + errorMat.setTo(0); } - protected: - Ptr param; - float forwardBackwardThreshold; - Ptr prevPyramid[2]; - Ptr currPyramid[2]; - }; + calcLocalOpticalFlow(prevImage, nextImage, prevPyramid, currPyramid, prevPoints, nextPoints, *(param.get())); + cv::Mat(1,npoints , CV_32FC2, &nextPoints[0]).copyTo(nextPtsMat); + if (forwardBackwardThreshold > 0) + { + // reuse image pyramids + calcLocalOpticalFlow(nextImage, prevImage, currPyramid, prevPyramid, nextPoints, refPoints, *(param.get())); + } + for (unsigned int r = 0; r < refPoints.size(); r++) + { + Point2f diff = refPoints[r] - prevPoints[r]; + errorMat.at(r) = sqrt(diff.x * diff.x + diff.y * diff.y); + if (errorMat.at(r) <= forwardBackwardThreshold) + statusMat.at(r) = 0; + } - Ptr SparseRLOFOpticalFlow::create( - Ptr rlofParam, - float forwardBackwardThreshold) - { - Ptr algo = makePtr(); - algo->setRLOFOpticalFlowParameter(rlofParam); - algo->setForwardBackward(forwardBackwardThreshold); - 
return algo; } - void calcOpticalFlowDenseRLOF(InputArray I0, InputArray I1, InputOutputArray flow, - Ptr rlofParam , - float forewardBackwardThreshold, Size gridStep, - InterpolationType interp_type, - int epicK, float epicSigma, float epicLambda, - bool use_post_proc, float fgsLambda, float fgsSigma) - { - Ptr algo = DenseRLOFOpticalFlow::create( - rlofParam, forewardBackwardThreshold, gridStep, interp_type, - epicK, epicSigma, epicLambda, use_post_proc, fgsLambda, fgsSigma); - algo->calc(I0, I1, flow); - algo->collectGarbage(); - } +protected: + Ptr param; + float forwardBackwardThreshold; + Ptr prevPyramid[2]; + Ptr currPyramid[2]; +}; - void calcOpticalFlowSparseRLOF(InputArray prevImg, InputArray nextImg, - InputArray prevPts, InputOutputArray nextPts, - OutputArray status, OutputArray err, - Ptr rlofParam, - float forewardBackwardThreshold) - { - Ptr algo = SparseRLOFOpticalFlow::create( - rlofParam, forewardBackwardThreshold); - algo->calc(prevImg, nextImg, prevPts, nextPts, status, err); - } - Ptr createOptFlow_DenseRLOF() - { - return DenseRLOFOpticalFlow::create(); - } +Ptr SparseRLOFOpticalFlow::create( + Ptr rlofParam, + float forwardBackwardThreshold) +{ + Ptr algo = makePtr(); + algo->setRLOFOpticalFlowParameter(rlofParam); + algo->setForwardBackward(forwardBackwardThreshold); + return algo; +} - Ptr createOptFlow_SparseRLOF() - { - return SparseRLOFOpticalFlow::create(); - } +void calcOpticalFlowDenseRLOF(InputArray I0, InputArray I1, InputOutputArray flow, + Ptr rlofParam , + float forewardBackwardThreshold, Size gridStep, + InterpolationType interp_type, + int epicK, float epicSigma, float epicLambda, + bool use_post_proc, float fgsLambda, float fgsSigma) +{ + Ptr algo = DenseRLOFOpticalFlow::create( + rlofParam, forewardBackwardThreshold, gridStep, interp_type, + epicK, epicSigma, epicLambda, use_post_proc, fgsLambda, fgsSigma); + algo->calc(I0, I1, flow); + algo->collectGarbage(); } +void calcOpticalFlowSparseRLOF(InputArray prevImg, InputArray 
nextImg, + InputArray prevPts, InputOutputArray nextPts, + OutputArray status, OutputArray err, + Ptr rlofParam, + float forewardBackwardThreshold) +{ + Ptr algo = SparseRLOFOpticalFlow::create( + rlofParam, forewardBackwardThreshold); + algo->calc(prevImg, nextImg, prevPts, nextPts, status, err); +} +Ptr createOptFlow_DenseRLOF() +{ + return DenseRLOFOpticalFlow::create(); +} +Ptr createOptFlow_SparseRLOF() +{ + return SparseRLOFOpticalFlow::create(); } + +}} // namespace From 2b4eb0bc9e4680144a413f0df2522676ac92360e Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Fri, 14 Dec 2018 14:05:31 +0300 Subject: [PATCH 13/25] fix perf test --- modules/optflow/perf/perf_rlof.cpp | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/modules/optflow/perf/perf_rlof.cpp b/modules/optflow/perf/perf_rlof.cpp index d8fb64eb40a..a273a9456a2 100644 --- a/modules/optflow/perf/perf_rlof.cpp +++ b/modules/optflow/perf/perf_rlof.cpp @@ -36,10 +36,9 @@ PERF_TEST_P(ST_SR_IM_Sparse, OpticalFlow_SparseRLOF, param->supportRegionType = SR_FIXED; param->useIlluminationModel = get<2>(GetParam()); - TEST_CYCLE_N(1) - { + PERF_SAMPLE_BEGIN() calcOpticalFlowSparseRLOF(frame1, frame2, prevPts, currPts, status, err, param, 1.f); - } + PERF_SAMPLE_END() SANITY_CHECK_NOTHING(); } @@ -64,10 +63,9 @@ PERF_TEST_P(INTERP_GRID_Dense, OpticalFlow_DenseRLOF, interp_type = INTERP_EPIC; if (get<0>(GetParam()) == "INTERP_GEO") interp_type = INTERP_GEO; - TEST_CYCLE_N(5) - { + PERF_SAMPLE_BEGIN() calcOpticalFlowDenseRLOF(frame1, frame2,flow, param, 1.0f, Size(get<1>(GetParam()), get<1>(GetParam())), interp_type); - } + PERF_SAMPLE_END() SANITY_CHECK_NOTHING(); } From 8ccab0bd5b483a505277e881b145be6406756cf1 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Fri, 14 Dec 2018 14:54:17 +0300 Subject: [PATCH 14/25] other changes: precomp.hpp / static --- modules/optflow/CMakeLists.txt | 2 +- modules/optflow/src/rlof/berlof_invoker.hpp | 10 ++----- 
.../optflow/src/rlof/geo_interpolation.cpp | 29 +++++++++---------- .../optflow/src/rlof/geo_interpolation.hpp | 3 +- modules/optflow/src/rlof/plk_invoker.hpp | 2 -- modules/optflow/src/rlof/rlof_invoker.hpp | 3 +- modules/optflow/src/rlof/rlof_invokerbase.hpp | 28 +++++++++--------- modules/optflow/src/rlof/rlof_localflow.cpp | 7 +++-- modules/optflow/src/rlof/rlof_localflow.h | 1 - 9 files changed, 38 insertions(+), 47 deletions(-) diff --git a/modules/optflow/CMakeLists.txt b/modules/optflow/CMakeLists.txt index 39a37f143dd..e4232749ea2 100644 --- a/modules/optflow/CMakeLists.txt +++ b/modules/optflow/CMakeLists.txt @@ -1,2 +1,2 @@ set(the_description "Optical Flow Algorithms") -ocv_define_module(optflow opencv_core opencv_imgproc opencv_video opencv_ximgproc opencv_imgcodecs opencv_flann WRAP python) +ocv_define_module(optflow opencv_core opencv_imgproc opencv_calib3d opencv_video opencv_ximgproc opencv_imgcodecs opencv_flann WRAP python) diff --git a/modules/optflow/src/rlof/berlof_invoker.hpp b/modules/optflow/src/rlof/berlof_invoker.hpp index e5cac1fa87c..d4e1228d3e1 100644 --- a/modules/optflow/src/rlof/berlof_invoker.hpp +++ b/modules/optflow/src/rlof/berlof_invoker.hpp @@ -10,7 +10,7 @@ namespace cv{ namespace optflow{ -inline bool checkSolution(float x, float y, float * c ) +static inline bool checkSolution(float x, float y, float * c ) { float _a = x - 0.002f; float _b = y - 0.002f; @@ -25,13 +25,13 @@ inline bool checkSolution(float x, float y, float * c ) && (bl.x >= 0 && bl.y <= 0) && (br.x <= 0 && br.y <= 0); } -inline cv::Point2f est_DeltaGain( const cv::Mat & src, const cv::Vec4f & val) +static inline cv::Point2f est_DeltaGain( const cv::Mat & src, const cv::Vec4f & val) { return cv::Point2f( src.at(2,0) * val[0] + src.at(2,1) * val[1] + src.at(2,2) * val[2] + src.at(2,3) * val[3], src.at(3,0) * val[0] + src.at(3,1) * val[1] + src.at(3,2) * val[2] + src.at(3,3) * val[3]); } -inline void est_Result( const cv::Mat & src, const cv::Vec4f & val, 
cv::Point2f & delta, cv::Point2f & gain) +static inline void est_Result( const cv::Mat & src, const cv::Vec4f & val, cv::Point2f & delta, cv::Point2f & gain) { delta = cv::Point2f( @@ -92,7 +92,6 @@ class TrackerInvoker : public cv::ParallelLoopBody crossSegmentationThreshold = _crossSegmentationThreshold; } - TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; void operator()(const cv::Range& range) const CV_OVERRIDE { #ifdef DEBUG_INVOKER @@ -610,7 +609,6 @@ class TrackerInvoker : public cv::ParallelLoopBody crossSegmentationThreshold = _crossSegmentationThreshold; } - TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; void operator()(const cv::Range& range) const CV_OVERRIDE { #ifdef DEBUG_INVOKER @@ -1576,8 +1574,6 @@ class TrackerInvoker : public cv::ParallelLoopBody crossSegmentationThreshold = _crossSegmentationThreshold; } - TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; - void operator()(const cv::Range& range) const CV_OVERRIDE { cv::Size winSize; diff --git a/modules/optflow/src/rlof/geo_interpolation.cpp b/modules/optflow/src/rlof/geo_interpolation.cpp index 048f65812d0..143b4f63a4e 100644 --- a/modules/optflow/src/rlof/geo_interpolation.cpp +++ b/modules/optflow/src/rlof/geo_interpolation.cpp @@ -2,46 +2,43 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. 
// This functions have been contributed by Jonas Geisters + +#include "../precomp.hpp" + #include "geo_interpolation.hpp" #include #include namespace cv { namespace optflow { -inline double K_h1(float x, double h2) -{ - return exp(-(0.5 / (h2)) * x); -} - struct Graph_helper { - int * mem; + std::vector mem; int e_size; Graph_helper(int k, int num_nodes) { e_size = (2 * k + 1); - mem = (int*)malloc(sizeof(int) * e_size * num_nodes); - memset(mem, 0, sizeof(int) * e_size * num_nodes); + mem.resize(e_size * num_nodes, 0); } inline int size(int id) { int r_addr = id * (e_size); - return ((int*)mem)[r_addr]; + return mem[r_addr]; } inline int * data(int id) { int r_addr = id * (e_size)+1; - return mem + r_addr; + return &mem[r_addr]; } inline void add(int id, std::pair data) { int r_addr = id * (e_size); - int size = ++((int*)mem)[r_addr]; + int size = ++mem[r_addr]; r_addr += 2 * size - 1;//== 1 + 2*(size-1); - ((float*)mem)[r_addr] = data.first; - ((int*)mem)[r_addr + 1] = data.second; + *(float*)&mem[r_addr] = data.first; + mem[r_addr + 1] = data.second; } inline bool color_in_target(int id, int color) { int r_addr = id * (e_size); - int size = (((int*)mem)[r_addr]); + int size = mem[r_addr]; r_addr += 2; for (int i = 0; i < size; i++) { - if (((int*)mem)[r_addr] == color) { + if (mem[r_addr] == color) { return true; } r_addr += 2; @@ -481,4 +478,4 @@ Mat interpolate_irregular_nn( return nnFlow; } -}} // namespace +}} // namespace diff --git a/modules/optflow/src/rlof/geo_interpolation.hpp b/modules/optflow/src/rlof/geo_interpolation.hpp index 0e13b44c493..526768903b5 100644 --- a/modules/optflow/src/rlof/geo_interpolation.hpp +++ b/modules/optflow/src/rlof/geo_interpolation.hpp @@ -3,7 +3,6 @@ // of this distribution and at http://opencv.org/license.html. 
#ifndef _GEO_INTERPOLATION_HPP_ #define _GEO_INTERPOLATION_HPP_ -#include namespace cv { namespace optflow { @@ -32,5 +31,5 @@ Mat interpolate_irregular_nn_raster(const std::vector & prevPoints, const std::vector & status, const Mat & i1); -}} // namespace +}} // namespace #endif diff --git a/modules/optflow/src/rlof/plk_invoker.hpp b/modules/optflow/src/rlof/plk_invoker.hpp index b7311edb86c..3a51b1ba087 100644 --- a/modules/optflow/src/rlof/plk_invoker.hpp +++ b/modules/optflow/src/rlof/plk_invoker.hpp @@ -59,7 +59,6 @@ class TrackerInvoker : public cv::ParallelLoopBody crossSegmentationThreshold = _crossSegmentationThreshold; } - TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; void operator()(const cv::Range& range) const CV_OVERRIDE { #ifdef DEBUG_INVOKER @@ -711,7 +710,6 @@ class TrackerInvoker : public cv::ParallelLoopBody crossSegmentationThreshold = _crossSegmentationThreshold; } - TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; void operator()(const cv::Range& range) const CV_OVERRIDE { #ifdef DEBUG_INVOKER diff --git a/modules/optflow/src/rlof/rlof_invoker.hpp b/modules/optflow/src/rlof/rlof_invoker.hpp index 14eabe4b948..14de2d04393 100644 --- a/modules/optflow/src/rlof/rlof_invoker.hpp +++ b/modules/optflow/src/rlof/rlof_invoker.hpp @@ -53,7 +53,7 @@ class TrackerInvoker : public cv::ParallelLoopBody useInitialFlow = _useInitialFlow; crossSegmentationThreshold = _crossSegmentationThreshold; } - TrackerInvoker & operator=(const TrackerInvoker &) { return *this; }; + void operator()(const cv::Range& range) const CV_OVERRIDE { #ifdef DEBUG_INVOKER @@ -675,7 +675,6 @@ class TrackerInvoker : public cv::ParallelLoopBody crossSegmentationThreshold = _crossSegmentationThreshold; } - TrackerInvoker & operator=(const TrackerInvoker &) { return *this; }; void operator()(const cv::Range& range) const CV_OVERRIDE { #ifdef DEBUG_INVOKER diff --git a/modules/optflow/src/rlof/rlof_invokerbase.hpp 
b/modules/optflow/src/rlof/rlof_invokerbase.hpp index 991531deef0..f65efeb673c 100644 --- a/modules/optflow/src/rlof/rlof_invokerbase.hpp +++ b/modules/optflow/src/rlof/rlof_invokerbase.hpp @@ -5,20 +5,19 @@ #define _RLOF_INVOKERBASE_HPP_ -#ifdef CV_CPU_COMPILE_SSE4_1 -#define RLOF_SSE -#endif -#ifdef CV_CPU_COMPILE_SSE4_2 +#if CV_SSE4_1 #define RLOF_SSE #endif //#define DEBUG_INVOKER -#define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n)) +#ifndef CV_DESCALE +#define CV_DESCALE(x, n) (((x) + (1 << ((n)-1))) >> (n)) +#endif + #define FLT_RESCALE 1 -#include "opencv2/core.hpp" -#include "opencv2/video.hpp" + #include "rlof_localflow.h" #include #include "opencv2/core/hal/intrin.hpp" @@ -31,7 +30,7 @@ namespace optflow { typedef short deriv_type; #ifdef RLOF_SSE -inline void get4BitMask(const int & width, __m128i & mask) +static inline void get4BitMask(const int & width, __m128i & mask) { int noBits = width - static_cast(floor(width / 4.f) * 4.f); unsigned int val[4]; @@ -42,7 +41,7 @@ inline void get4BitMask(const int & width, __m128i & mask) mask = _mm_set_epi32(val[3], val[2], val[1], val[0]); } -inline void getWBitMask(const int & width, __m128i & t0, __m128i & t1, __m128i & t2) +static inline void getWBitMask(const int & width, __m128i & t0, __m128i & t1, __m128i & t2) { int noBits = width - static_cast(floor(width / 8.f) * 8.f); unsigned short val[8]; @@ -59,7 +58,7 @@ typedef uchar tMaskType; #define tCVMaskType CV_8UC1 #define MaskSet 0xffffffff -inline bool notSameColor(const cv::Point3_ & ref, int r, int c, const cv::Mat & img, int winSize, int threshold) +static inline bool notSameColor(const cv::Point3_ & ref, int r, int c, const cv::Mat & img, int winSize, int threshold) { if (r >= img.rows + winSize || c >= img.cols + winSize || r < -winSize || c < -winSize) return true; int step = static_cast(img.step1()); @@ -86,7 +85,8 @@ inline bool notSameColor(const cv::Point3_ & ref, int r, int c, const cv: *\param threshold, *\param useBothImages */ -static 
void getLocalPatch( +static +void getLocalPatch( const cv::Mat & prevImg, const cv::Point2i & prevPoint, // feature points cv::Mat & winPointMask, @@ -232,7 +232,8 @@ static void getLocalPatch( winPointMask = winPointMask(cv::Rect(min_c, min_r, winRoi.width, winRoi.height)); } -inline bool calcWinMaskMat( +static inline +bool calcWinMaskMat( const cv::Mat & BI, const int windowType, const cv::Point2i & iprevPt, @@ -264,7 +265,8 @@ inline bool calcWinMaskMat( return true; } -inline short estimateScale(cv::Mat & residuals) +static inline +short estimateScale(cv::Mat & residuals) { cv::Mat absMat = cv::abs(residuals); return quickselect(absMat, absMat.rows / 2); diff --git a/modules/optflow/src/rlof/rlof_localflow.cpp b/modules/optflow/src/rlof/rlof_localflow.cpp index 666421f767f..211d999b0e7 100644 --- a/modules/optflow/src/rlof/rlof_localflow.cpp +++ b/modules/optflow/src/rlof/rlof_localflow.cpp @@ -1,10 +1,11 @@ // This file is part of OpenCV project. // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. +#include "../precomp.hpp" + +#include "opencv2/calib3d.hpp" // findHomography + #include "rlof_localflow.h" -#ifdef HAVE_OPENCV_CALIB3D -#include "opencv2/calib3d.hpp" -#endif #include "berlof_invoker.hpp" #include "rlof_invoker.hpp" #include "plk_invoker.hpp" diff --git a/modules/optflow/src/rlof/rlof_localflow.h b/modules/optflow/src/rlof/rlof_localflow.h index 00702e5323b..9ffe42cea3e 100644 --- a/modules/optflow/src/rlof/rlof_localflow.h +++ b/modules/optflow/src/rlof/rlof_localflow.h @@ -7,7 +7,6 @@ #include #include #include -#include "opencv2/core.hpp" #include "opencv2/imgproc.hpp" #include "opencv2/optflow/rlofflow.hpp" //! Fast median estimation method based on @cite Tibshirani2008. 
This implementation relates to http://www.stat.cmu.edu/~ryantibs/median/ From 70f11160795fd2fab1ff7b7ab161722ec062e63d Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Fri, 14 Dec 2018 15:31:54 +0300 Subject: [PATCH 15/25] use Matx44f and Vec4f instead of Mat --- modules/optflow/src/rlof/berlof_invoker.hpp | 52 ++++++++++----------- modules/optflow/src/rlof/plk_invoker.hpp | 49 +++++++++---------- modules/optflow/src/rlof/rlof_invoker.hpp | 49 +++++++++---------- 3 files changed, 75 insertions(+), 75 deletions(-) diff --git a/modules/optflow/src/rlof/berlof_invoker.hpp b/modules/optflow/src/rlof/berlof_invoker.hpp index d4e1228d3e1..31d2792b00e 100644 --- a/modules/optflow/src/rlof/berlof_invoker.hpp +++ b/modules/optflow/src/rlof/berlof_invoker.hpp @@ -25,22 +25,22 @@ static inline bool checkSolution(float x, float y, float * c ) && (bl.x >= 0 && bl.y <= 0) && (br.x <= 0 && br.y <= 0); } -static inline cv::Point2f est_DeltaGain( const cv::Mat & src, const cv::Vec4f & val) +static inline cv::Point2f est_DeltaGain(const cv::Matx44f& src, const cv::Vec4f& val) { return cv::Point2f( - src.at(2,0) * val[0] + src.at(2,1) * val[1] + src.at(2,2) * val[2] + src.at(2,3) * val[3], - src.at(3,0) * val[0] + src.at(3,1) * val[1] + src.at(3,2) * val[2] + src.at(3,3) * val[3]); + src(2,0) * val[0] + src(2,1) * val[1] + src(2,2) * val[2] + src(2,3) * val[3], + src(3,0) * val[0] + src(3,1) * val[1] + src(3,2) * val[2] + src(3,3) * val[3]); } -static inline void est_Result( const cv::Mat & src, const cv::Vec4f & val, cv::Point2f & delta, cv::Point2f & gain) +static inline void est_Result(const cv::Matx44f& src, const cv::Vec4f & val, cv::Point2f & delta, cv::Point2f & gain) { delta = cv::Point2f( - -(src.at(0,0) * val[0] + src.at(0,1) * val[1] + src.at(0,2) * val[2] + src.at(0,3) * val[3]), - -(src.at(1,0) * val[0] + src.at(1,1) * val[1] + src.at(1,2) * val[2] + src.at(1,3) * val[3])); + -(src(0,0) * val[0] + src(0,1) * val[1] + src(0,2) * val[2] + src(0,3) * val[3]), + 
-(src(1,0) * val[0] + src(1,1) * val[1] + src(1,2) * val[2] + src(1,3) * val[3])); gain = cv::Point2f( - src.at(2,0) * val[0] + src.at(2,1) * val[1] + src.at(2,2) * val[2] + src.at(2,3) * val[3], - src.at(3,0) * val[0] + src.at(3,1) * val[1] + src.at(3,2) * val[2] + src.at(3,3) * val[3]); + src(2,0) * val[0] + src(2,1) * val[1] + src(2,2) * val[2] + src(2,3) * val[3], + src(3,0) * val[0] + src(3,1) * val[1] + src(3,2) * val[2] + src(3,3) * val[3]); } namespace berlof { @@ -632,9 +632,7 @@ class TrackerInvoker : public cv::ParallelLoopBody cv::Size winBufSize(winbufwidth,winbufwidth); - cv::Mat invTensorMat(4,4, CV_32FC1); - cv::Mat mismatchMat(4, 1, CV_32FC1); - cv::Mat resultMat(4, 1, CV_32FC1); + cv::Matx44f invTensorMat; cv::AutoBuffer _buf(winBufSize.area()*(cn + cn2)); int derivDepth = DataType::depth; @@ -1344,22 +1342,22 @@ class TrackerInvoker : public cv::ParallelLoopBody D = (1.f / D); - invTensorMat.at(0,0) = (A22*sumI*sumI - 2*sumI*sumIy*w2 + dI*sumIy*sumIy + sumW*w2*w2 - A22*dI*sumW)* D; - invTensorMat.at(0,1) = (A12*dI*sumW - A12*sumI * sumI - dI*sumIx*sumIy + sumI*sumIx*w2 + sumI*sumIy*w1 - sumW*w1*w2)* D; - invTensorMat.at(0,2) = (A12*sumI*sumIy - sumIy*sumIy*w1 - A22*sumI*sumIx - A12*sumW*w2 + A22*sumW*w1 + sumIx*sumIy*w2)* D; - invTensorMat.at(0,3) = (A22*dI*sumIx - A12*dI*sumIy - sumIx*w2*w2 + A12*sumI*w2 - A22*sumI*w1 + sumIy*w1*w2) * D; - invTensorMat.at(1,0) = invTensorMat.at(0,1); - invTensorMat.at(1,1) = (A11*sumI * sumI - 2*sumI*sumIx*w1 + dI*sumIx * sumIx + sumW*w1*w1 - A11*dI*sumW) * D; - invTensorMat.at(1,2) = (A12*sumI*sumIx - A11*sumI*sumIy - sumIx * sumIx*w2 + A11*sumW*w2 - A12*sumW*w1 + sumIx*sumIy*w1) * D; - invTensorMat.at(1,3) = (A11*dI*sumIy - sumIy*w1*w1 - A12*dI*sumIx - A11*sumI*w2 + A12*sumI*w1 + sumIx*w1*w2)* D; - invTensorMat.at(2,0) = invTensorMat.at(0,2); - invTensorMat.at(2,1) = invTensorMat.at(1,2); - invTensorMat.at(2,2) = (sumW*A12*A12 - 2*A12*sumIx*sumIy + A22*sumIx*sumIx + A11*sumIy*sumIy - A11*A22*sumW)* D; - 
invTensorMat.at(2,3) = (A11*A22*sumI - A12*A12*sumI - A11*sumIy*w2 + A12*sumIx*w2 + A12*sumIy*w1 - A22*sumIx*w1)* D; - invTensorMat.at(3,0) = invTensorMat.at(0,3); - invTensorMat.at(3,1) = invTensorMat.at(1,3); - invTensorMat.at(3,2) = invTensorMat.at(2,3); - invTensorMat.at(3,3) = (dI*A12*A12 - 2*A12*w1*w2 + A22*w1*w1 + A11*w2*w2 - A11*A22*dI)* D; + invTensorMat(0,0) = (A22*sumI*sumI - 2*sumI*sumIy*w2 + dI*sumIy*sumIy + sumW*w2*w2 - A22*dI*sumW)* D; + invTensorMat(0,1) = (A12*dI*sumW - A12*sumI * sumI - dI*sumIx*sumIy + sumI*sumIx*w2 + sumI*sumIy*w1 - sumW*w1*w2)* D; + invTensorMat(0,2) = (A12*sumI*sumIy - sumIy*sumIy*w1 - A22*sumI*sumIx - A12*sumW*w2 + A22*sumW*w1 + sumIx*sumIy*w2)* D; + invTensorMat(0,3) = (A22*dI*sumIx - A12*dI*sumIy - sumIx*w2*w2 + A12*sumI*w2 - A22*sumI*w1 + sumIy*w1*w2) * D; + invTensorMat(1,0) = invTensorMat(0,1); + invTensorMat(1,1) = (A11*sumI * sumI - 2*sumI*sumIx*w1 + dI*sumIx * sumIx + sumW*w1*w1 - A11*dI*sumW) * D; + invTensorMat(1,2) = (A12*sumI*sumIx - A11*sumI*sumIy - sumIx * sumIx*w2 + A11*sumW*w2 - A12*sumW*w1 + sumIx*sumIy*w1) * D; + invTensorMat(1,3) = (A11*dI*sumIy - sumIy*w1*w1 - A12*dI*sumIx - A11*sumI*w2 + A12*sumI*w1 + sumIx*w1*w2)* D; + invTensorMat(2,0) = invTensorMat(0,2); + invTensorMat(2,1) = invTensorMat(1,2); + invTensorMat(2,2) = (sumW*A12*A12 - 2*A12*sumIx*sumIy + A22*sumIx*sumIx + A11*sumIy*sumIy - A11*A22*sumW)* D; + invTensorMat(2,3) = (A11*A22*sumI - A12*A12*sumI - A11*sumIy*w2 + A12*sumIx*w2 + A12*sumIy*w1 - A22*sumIx*w1)* D; + invTensorMat(3,0) = invTensorMat(0,3); + invTensorMat(3,1) = invTensorMat(1,3); + invTensorMat(3,2) = invTensorMat(2,3); + invTensorMat(3,3) = (dI*A12*A12 - 2*A12*w1*w2 + A22*w1*w1 + A11*w2*w2 - A11*A22*dI)* D; } diff --git a/modules/optflow/src/rlof/plk_invoker.hpp b/modules/optflow/src/rlof/plk_invoker.hpp index 3a51b1ba087..06a2972969d 100644 --- a/modules/optflow/src/rlof/plk_invoker.hpp +++ b/modules/optflow/src/rlof/plk_invoker.hpp @@ -82,9 +82,9 @@ class TrackerInvoker : public 
cv::ParallelLoopBody int winbufwidth = static_cast(ceil(winSize.width / 16.f)) * 16; cv::Size winBufSize(winbufwidth,winbufwidth); - cv::Mat invTensorMat(4,4, CV_32FC1); - cv::Mat mismatchMat(4, 1, CV_32FC1); - cv::Mat resultMat(4, 1, CV_32FC1); + cv::Matx44f invTensorMat; + Vec4f mismatchMat; + Vec4f resultMat; cv::AutoBuffer _buf(winBufSize.area()*(cn + cn2)); int derivDepth = DataType::depth; @@ -555,10 +555,10 @@ class TrackerInvoker : public cv::ParallelLoopBody b4 = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; #endif - mismatchMat.at(0,0) = b1 * FLT_SCALE; - mismatchMat.at(1,0) = b2 * FLT_SCALE; - mismatchMat.at(2,0) = -b3 * FLT_SCALE; - mismatchMat.at(3,0) = -b4 * FLT_SCALE; + mismatchMat(0) = b1 * FLT_SCALE; + mismatchMat(1) = b2 * FLT_SCALE; + mismatchMat(2) = -b3 * FLT_SCALE; + mismatchMat(3) = -b4 * FLT_SCALE; D = - A12*A12*sumI*sumI + dI*sumW*A12*A12 + 2*A12*sumI*sumIx*w2 + 2*A12*sumI*sumIy*w1 - 2*dI*A12*sumIx*sumIy - 2*sumW*A12*w1*w2 + A11*A22*sumI*sumI - 2*A22*sumI*sumIx*w1 @@ -583,27 +583,28 @@ class TrackerInvoker : public cv::ParallelLoopBody D = (1.f / D); - invTensorMat.at(0,0) = (A22*sumI*sumI - 2*sumI*sumIy*w2 + dI*sumIy*sumIy + sumW*w2*w2 - A22*dI*sumW)* D; invTensorMat.at(0,1) = (A12*dI*sumW - A12*sumI * sumI - dI*sumIx*sumIy + sumI*sumIx*w2 + sumI*sumIy*w1 - sumW*w1*w2)* D; - invTensorMat.at(0,2) = (A12*sumI*sumIy - sumIy*sumIy*w1 - A22*sumI*sumIx - A12*sumW*w2 + A22*sumW*w1 + sumIx*sumIy*w2)* D; - invTensorMat.at(0,3) = (A22*dI*sumIx - A12*dI*sumIy - sumIx*w2*w2 + A12*sumI*w2 - A22*sumI*w1 + sumIy*w1*w2) * D; - invTensorMat.at(1,0) = invTensorMat.at(0,1); - invTensorMat.at(1,1) = (A11*sumI * sumI - 2*sumI*sumIx*w1 + dI*sumIx * sumIx + sumW*w1*w1 - A11*dI*sumW) * D; - invTensorMat.at(1,2) = (A12*sumI*sumIx - A11*sumI*sumIy - sumIx * sumIx*w2 + A11*sumW*w2 - A12*sumW*w1 + sumIx*sumIy*w1) * D; - invTensorMat.at(1,3) = (A11*dI*sumIy - sumIy*w1*w1 - A12*dI*sumIx - A11*sumI*w2 + A12*sumI*w1 + sumIx*w1*w2)* D; - invTensorMat.at(2,0) = 
invTensorMat.at(0,2); - invTensorMat.at(2,1) = invTensorMat.at(1,2); - invTensorMat.at(2,2) = (sumW*A12*A12 - 2*A12*sumIx*sumIy + A22*sumIx*sumIx + A11*sumIy*sumIy - A11*A22*sumW)* D; - invTensorMat.at(2,3) = (A11*A22*sumI - A12*A12*sumI - A11*sumIy*w2 + A12*sumIx*w2 + A12*sumIy*w1 - A22*sumIx*w1)* D; - invTensorMat.at(3,0) = invTensorMat.at(0,3); - invTensorMat.at(3,1) = invTensorMat.at(1,3); - invTensorMat.at(3,2) = invTensorMat.at(2,3); - invTensorMat.at(3,3) = (dI*A12*A12 - 2*A12*w1*w2 + A22*w1*w1 + A11*w2*w2 - A11*A22*dI)* D; + invTensorMat(0,0) = (A22*sumI*sumI - 2*sumI*sumIy*w2 + dI*sumIy*sumIy + sumW*w2*w2 - A22*dI*sumW)* D; + invTensorMat(0,1) = (A12*dI*sumW - A12*sumI * sumI - dI*sumIx*sumIy + sumI*sumIx*w2 + sumI*sumIy*w1 - sumW*w1*w2)* D; + invTensorMat(0,2) = (A12*sumI*sumIy - sumIy*sumIy*w1 - A22*sumI*sumIx - A12*sumW*w2 + A22*sumW*w1 + sumIx*sumIy*w2)* D; + invTensorMat(0,3) = (A22*dI*sumIx - A12*dI*sumIy - sumIx*w2*w2 + A12*sumI*w2 - A22*sumI*w1 + sumIy*w1*w2) * D; + invTensorMat(1,0) = invTensorMat(0,1); + invTensorMat(1,1) = (A11*sumI * sumI - 2*sumI*sumIx*w1 + dI*sumIx * sumIx + sumW*w1*w1 - A11*dI*sumW) * D; + invTensorMat(1,2) = (A12*sumI*sumIx - A11*sumI*sumIy - sumIx * sumIx*w2 + A11*sumW*w2 - A12*sumW*w1 + sumIx*sumIy*w1) * D; + invTensorMat(1,3) = (A11*dI*sumIy - sumIy*w1*w1 - A12*dI*sumIx - A11*sumI*w2 + A12*sumI*w1 + sumIx*w1*w2)* D; + invTensorMat(2,0) = invTensorMat(0,2); + invTensorMat(2,1) = invTensorMat(1,2); + invTensorMat(2,2) = (sumW*A12*A12 - 2*A12*sumIx*sumIy + A22*sumIx*sumIx + A11*sumIy*sumIy - A11*A22*sumW)* D; + invTensorMat(2,3) = (A11*A22*sumI - A12*A12*sumI - A11*sumIy*w2 + A12*sumIx*w2 + A12*sumIy*w1 - A22*sumIx*w1)* D; + invTensorMat(3,0) = invTensorMat(0,3); + invTensorMat(3,1) = invTensorMat(1,3); + invTensorMat(3,2) = invTensorMat(2,3); + invTensorMat(3,3) = (dI*A12*A12 - 2*A12*w1*w2 + A22*w1*w1 + A11*w2*w2 - A11*A22*dI)* D; resultMat = invTensorMat * mismatchMat; // 0.08 -12.10 - Point2f delta(-resultMat.at(0), 
-resultMat.at(1)); - Point2f deltaGain(-resultMat.at(2), -resultMat.at(3)); + Point2f delta(-resultMat(0), -resultMat(1)); + Point2f deltaGain(-resultMat(2), -resultMat(3)); if( j == 0) diff --git a/modules/optflow/src/rlof/rlof_invoker.hpp b/modules/optflow/src/rlof/rlof_invoker.hpp index 14de2d04393..7a4bd343cbe 100644 --- a/modules/optflow/src/rlof/rlof_invoker.hpp +++ b/modules/optflow/src/rlof/rlof_invoker.hpp @@ -700,9 +700,9 @@ class TrackerInvoker : public cv::ParallelLoopBody int winbufwidth = static_cast(ceil(winSize.width / 16.f)) * 16; cv::Size winBufSize(winbufwidth, winbufwidth); - cv::Mat invTensorMat(4, 4, CV_32FC1); - cv::Mat mismatchMat(4, 1, CV_32FC1); - cv::Mat resultMat(4, 1, CV_32FC1); + cv::Matx44f invTensorMat; + cv::Vec4f mismatchMat; + cv::Vec4f resultMat; std::vector _buf(winBufSize.area()*(cn + cn2)); @@ -1332,10 +1332,10 @@ class TrackerInvoker : public cv::ParallelLoopBody _mm_store_ps(bbuf, qb3); b4 = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; #endif - mismatchMat.at(0, 0) = b1 * FLT_SCALE; - mismatchMat.at(1, 0) = b2 * FLT_SCALE; - mismatchMat.at(2, 0) = -b3 * FLT_SCALE; - mismatchMat.at(3, 0) = -b4 * FLT_SCALE; + mismatchMat(0) = b1 * FLT_SCALE; + mismatchMat(1) = b2 * FLT_SCALE; + mismatchMat(2) = -b3 * FLT_SCALE; + mismatchMat(3) = -b4 * FLT_SCALE; D = -A12 * A12*sumI*sumI + dI * sumW*A12*A12 + 2 * A12*sumI*sumIx*w2 + 2 * A12*sumI*sumIy*w1 - 2 * dI*A12*sumIx*sumIy - 2 * sumW*A12*w1*w2 + A11 * A22*sumI*sumI - 2 * A22*sumI*sumIx*w1 @@ -1358,26 +1358,27 @@ class TrackerInvoker : public cv::ParallelLoopBody D = (1.f / D); - invTensorMat.at(0, 0) = (A22*sumI*sumI - 2 * sumI*sumIy*w2 + dI * sumIy*sumIy + sumW * w2*w2 - A22 * dI*sumW)* D; invTensorMat.at(0, 1) = (A12*dI*sumW - A12 * sumI * sumI - dI * sumIx*sumIy + sumI * sumIx*w2 + sumI * sumIy*w1 - sumW * w1*w2)* D; - invTensorMat.at(0, 2) = (A12*sumI*sumIy - sumIy * sumIy*w1 - A22 * sumI*sumIx - A12 * sumW*w2 + A22 * sumW*w1 + sumIx * sumIy*w2)* D; - invTensorMat.at(0, 3) = (A22*dI*sumIx 
- A12 * dI*sumIy - sumIx * w2*w2 + A12 * sumI*w2 - A22 * sumI*w1 + sumIy * w1*w2) * D; - invTensorMat.at(1, 0) = invTensorMat.at(0, 1); - invTensorMat.at(1, 1) = (A11*sumI * sumI - 2 * sumI*sumIx*w1 + dI * sumIx * sumIx + sumW * w1*w1 - A11 * dI*sumW) * D; - invTensorMat.at(1, 2) = (A12*sumI*sumIx - A11 * sumI*sumIy - sumIx * sumIx*w2 + A11 * sumW*w2 - A12 * sumW*w1 + sumIx * sumIy*w1) * D; - invTensorMat.at(1, 3) = (A11*dI*sumIy - sumIy * w1*w1 - A12 * dI*sumIx - A11 * sumI*w2 + A12 * sumI*w1 + sumIx * w1*w2)* D; - invTensorMat.at(2, 0) = invTensorMat.at(0, 2); - invTensorMat.at(2, 1) = invTensorMat.at(1, 2); - invTensorMat.at(2, 2) = (sumW*A12*A12 - 2 * A12*sumIx*sumIy + A22 * sumIx*sumIx + A11 * sumIy*sumIy - A11 * A22*sumW)* D; - invTensorMat.at(2, 3) = (A11*A22*sumI - A12 * A12*sumI - A11 * sumIy*w2 + A12 * sumIx*w2 + A12 * sumIy*w1 - A22 * sumIx*w1)* D; - invTensorMat.at(3, 0) = invTensorMat.at(0, 3); - invTensorMat.at(3, 1) = invTensorMat.at(1, 3); - invTensorMat.at(3, 2) = invTensorMat.at(2, 3); - invTensorMat.at(3, 3) = (dI*A12*A12 - 2 * A12*w1*w2 + A22 * w1*w1 + A11 * w2*w2 - A11 * A22*dI)* D; + invTensorMat(0, 0) = (A22*sumI*sumI - 2 * sumI*sumIy*w2 + dI * sumIy*sumIy + sumW * w2*w2 - A22 * dI*sumW)* D; + invTensorMat(0, 1) = (A12*dI*sumW - A12 * sumI * sumI - dI * sumIx*sumIy + sumI * sumIx*w2 + sumI * sumIy*w1 - sumW * w1*w2)* D; + invTensorMat(0, 2) = (A12*sumI*sumIy - sumIy * sumIy*w1 - A22 * sumI*sumIx - A12 * sumW*w2 + A22 * sumW*w1 + sumIx * sumIy*w2)* D; + invTensorMat(0, 3) = (A22*dI*sumIx - A12 * dI*sumIy - sumIx * w2*w2 + A12 * sumI*w2 - A22 * sumI*w1 + sumIy * w1*w2) * D; + invTensorMat(1, 0) = invTensorMat(0, 1); + invTensorMat(1, 1) = (A11*sumI * sumI - 2 * sumI*sumIx*w1 + dI * sumIx * sumIx + sumW * w1*w1 - A11 * dI*sumW) * D; + invTensorMat(1, 2) = (A12*sumI*sumIx - A11 * sumI*sumIy - sumIx * sumIx*w2 + A11 * sumW*w2 - A12 * sumW*w1 + sumIx * sumIy*w1) * D; + invTensorMat(1, 3) = (A11*dI*sumIy - sumIy * w1*w1 - A12 * dI*sumIx - A11 * 
sumI*w2 + A12 * sumI*w1 + sumIx * w1*w2)* D; + invTensorMat(2, 0) = invTensorMat(0, 2); + invTensorMat(2, 1) = invTensorMat(1, 2); + invTensorMat(2, 2) = (sumW*A12*A12 - 2 * A12*sumIx*sumIy + A22 * sumIx*sumIx + A11 * sumIy*sumIy - A11 * A22*sumW)* D; + invTensorMat(2, 3) = (A11*A22*sumI - A12 * A12*sumI - A11 * sumIy*w2 + A12 * sumIx*w2 + A12 * sumIy*w1 - A22 * sumIx*w1)* D; + invTensorMat(3, 0) = invTensorMat(0, 3); + invTensorMat(3, 1) = invTensorMat(1, 3); + invTensorMat(3, 2) = invTensorMat(2, 3); + invTensorMat(3, 3) = (dI*A12*A12 - 2 * A12*w1*w2 + A22 * w1*w1 + A11 * w2*w2 - A11 * A22*dI)* D; resultMat = invTensorMat * mismatchMat; - Point2f delta(-resultMat.at(0), -resultMat.at(1)); - Point2f deltaGain(resultMat.at(2), resultMat.at(3)); + Point2f delta(-resultMat(0), -resultMat(1)); + Point2f deltaGain(resultMat(2), resultMat(3)); From 3b794f2fbdf5c2d805615c8a1836bf210b81cf7f Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Fri, 14 Dec 2018 16:12:26 +0300 Subject: [PATCH 16/25] normSigmas into constants --- modules/optflow/src/rlof/berlof_invoker.hpp | 90 +++++++++++---------- modules/optflow/src/rlof/rlof_invoker.hpp | 76 ++++++++--------- 2 files changed, 87 insertions(+), 79 deletions(-) diff --git a/modules/optflow/src/rlof/berlof_invoker.hpp b/modules/optflow/src/rlof/berlof_invoker.hpp index 31d2792b00e..87f973d1ac1 100644 --- a/modules/optflow/src/rlof/berlof_invoker.hpp +++ b/modules/optflow/src/rlof/berlof_invoker.hpp @@ -67,8 +67,12 @@ class TrackerInvoker : public cv::ParallelLoopBody bool _useInitialFlow, int _supportRegionType, int _crossSegmentationThreshold, - std::vector _normSigmas, - float _minEigenValue) + const std::vector& _normSigmas, + float _minEigenValue + ) : + normSigma0(_normSigmas[0]), + normSigma1(_normSigmas[1]), + normSigma2(_normSigmas[2]) { prevImg = &_prevImg; prevDeriv = &_prevDeriv; @@ -87,7 +91,6 @@ class TrackerInvoker : public cv::ParallelLoopBody maxLevel = _maxLevel; windowType = _supportRegionType; 
minEigThreshold = _minEigenValue; - paramReset = _normSigmas; useInitialFlow = _useInitialFlow; crossSegmentationThreshold = _crossSegmentationThreshold; } @@ -112,7 +115,6 @@ class TrackerInvoker : public cv::ParallelLoopBody cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); winMaskMatBuf.setTo(1); - std::vector param = paramReset; int j, cn = I.channels(), cn2 = cn*2; int winbufwidth = static_cast(ceil(winSize.width / 16.f)) * 16; cv::Size winBufSize(winbufwidth,winbufwidth); @@ -294,10 +296,10 @@ class TrackerInvoker : public cv::ParallelLoopBody } float eta = 1.f / winArea; - float fParam0 = param[0] * 32.f; - float fParam1 = param[1] * 32.f; - fParam0 = param[0] * MEstimatorScale; - fParam1 = param[1] * MEstimatorScale; + float fParam0 = normSigma0 * 32.f; + float fParam1 = normSigma1 * 32.f; + fParam0 = normSigma0 * MEstimatorScale; + fParam1 = normSigma1 * MEstimatorScale; buffIdx = 0; @@ -351,17 +353,17 @@ class TrackerInvoker : public cv::ParallelLoopBody } else if( abss > fParam0 && diff >= 0 ) { - It[0] = param[2] * (It[0] - fParam1); - It[1] = param[2] * (It[1] - fParam1); - It[2] = param[2] * (It[2] - fParam1); - It[3] = param[2] * (It[3] - fParam1); + It[0] = normSigma2 * (It[0] - fParam1); + It[1] = normSigma2 * (It[1] - fParam1); + It[2] = normSigma2 * (It[2] - fParam1); + It[3] = normSigma2 * (It[3] - fParam1); } else if( abss > fParam0 && diff < 0 ) { - It[0] = param[2] * (It[0] + fParam1); - It[1] = param[2] * (It[1] + fParam1); - It[2] = param[2] * (It[2] + fParam1); - It[3] = param[2] * (It[3] + fParam1); + It[0] = normSigma2 * (It[0] + fParam1); + It[1] = normSigma2 * (It[1] + fParam1); + It[2] = normSigma2 * (It[2] + fParam1); + It[3] = normSigma2 * (It[3] + fParam1); } } @@ -387,7 +389,7 @@ class TrackerInvoker : public cv::ParallelLoopBody // compute the Gradient Matrice if( j == 0) { - float tale = param[2] * FLT_RESCALE; + float tale = normSigma2 * FLT_RESCALE; if( abss < fParam0 || j < 0) { tale = FLT_RESCALE; @@ -398,7 
+400,7 @@ class TrackerInvoker : public cv::ParallelLoopBody } else { - tale *= param[2]; + tale *= normSigma2; } A11 += (float)(ixval*ixval)*tale; @@ -555,7 +557,7 @@ class TrackerInvoker : public cv::ParallelLoopBody int windowType; float minEigThreshold; bool useInitialFlow; - std::vector paramReset; + const float normSigma0, normSigma1, normSigma2; int crossSegmentationThreshold; }; @@ -583,8 +585,12 @@ class TrackerInvoker : public cv::ParallelLoopBody bool _useInitialFlow, int _supportRegionType, int _crossSegmentationThreshold, - std::vector _normSigmas, - float _minEigenValue) + const std::vector& _normSigmas, + float _minEigenValue + ) : + normSigma0(_normSigmas[0]), + normSigma1(_normSigmas[1]), + normSigma2(_normSigmas[2]) { prevImg = &_prevImg; prevDeriv = &_prevDeriv; @@ -604,7 +610,6 @@ class TrackerInvoker : public cv::ParallelLoopBody maxLevel = _maxLevel; windowType = _supportRegionType; minEigThreshold = _minEigenValue; - paramReset = _normSigmas; useInitialFlow = _useInitialFlow; crossSegmentationThreshold = _crossSegmentationThreshold; } @@ -626,7 +631,6 @@ class TrackerInvoker : public cv::ParallelLoopBody cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); winMaskMatBuf.setTo(1); - std::vector param = paramReset; int cn = I.channels(), cn2 = cn*2; int winbufwidth = static_cast(ceil(winSize.width / 16.f)) * 16; cv::Size winBufSize(winbufwidth,winbufwidth); @@ -901,10 +905,10 @@ class TrackerInvoker : public cv::ParallelLoopBody } float eta = 1.f / winArea; - float fParam0 = param[0] * 32.f; - float fParam1 = param[1] * 32.f; - fParam0 = param[0] * MEstimatorScale; - fParam1 = param[1] * MEstimatorScale; + float fParam0 = normSigma0 * 32.f; + float fParam1 = normSigma1 * 32.f; + fParam0 = normSigma0 * MEstimatorScale; + fParam1 = normSigma1 * MEstimatorScale; #ifdef RLOF_SSE @@ -922,12 +926,12 @@ class TrackerInvoker : public cv::ParallelLoopBody __m128i mmParam1 = _mm_set1_epi16(MIN(std::numeric_limits::max()- 1, 
static_cast(fParam1))); - float s2Val = param[2] > 0 ? param[2] : -param[2]; - int s2bitShift = param[2] == 0 ? 1 : static_cast(ceil(log(200.f / s2Val) / log(2.f))); - __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast(param[2] * (float)(1 << s2bitShift))); + float s2Val = std::fabs(normSigma2); + int s2bitShift = normSigma2 == 0 ? 1 : static_cast(ceil(log(200.f / s2Val) / log(2.f))); + __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast(normSigma2 * (float)(1 << s2bitShift))); __m128i mmOness_epi16 = _mm_set1_epi16(1 << s2bitShift); - __m128 mmParam2s = _mm_set1_ps(0.01f * param[2]); - __m128 mmParam2s2 = _mm_set1_ps(param[2] * param[2]); + __m128 mmParam2s = _mm_set1_ps(0.01f * normSigma2); + __m128 mmParam2s2 = _mm_set1_ps(normSigma2 * normSigma2); float gainVal = gainVec.x > 0 ? gainVec.x : -gainVec.x; int bitShift = gainVec.x == 0 ? 1 : static_cast(ceil(log(200.f / gainVal) / log(2.f))); __m128i mmGainValue_epi16 = _mm_set1_epi16(static_cast(gainVec.x * (float)(1 << bitShift))); @@ -1197,17 +1201,17 @@ class TrackerInvoker : public cv::ParallelLoopBody } else if( abss > static_cast(fParam0) && diff >= 0 ) { - It[0] = static_cast(param[2] * (It[0] - fParam1)); - It[1] = static_cast(param[2] * (It[1] - fParam1)); - It[2] = static_cast(param[2] * (It[2] - fParam1)); - It[3] = static_cast(param[2] * (It[3] - fParam1)); + It[0] = static_cast(normSigma2 * (It[0] - fParam1)); + It[1] = static_cast(normSigma2 * (It[1] - fParam1)); + It[2] = static_cast(normSigma2 * (It[2] - fParam1)); + It[3] = static_cast(normSigma2 * (It[3] - fParam1)); } else if( abss > static_cast(fParam0) && diff < 0 ) { - It[0] = static_cast(param[2] * (It[0] + fParam1)); - It[1] = static_cast(param[2] * (It[1] + fParam1)); - It[2] = static_cast(param[2] * (It[2] + fParam1)); - It[3] = static_cast(param[2] * (It[3] + fParam1)); + It[0] = static_cast(normSigma2 * (It[0] + fParam1)); + It[1] = static_cast(normSigma2 * (It[1] + fParam1)); + It[2] = static_cast(normSigma2 * (It[2] + fParam1)); + 
It[3] = static_cast(normSigma2 * (It[3] + fParam1)); } } _b0[0] += (float)(It[0]*dIptr[0]) ; @@ -1234,7 +1238,7 @@ class TrackerInvoker : public cv::ParallelLoopBody // compute the Gradient Matrice if( j == 0) { - float tale = param[2] * FLT_RESCALE; + float tale = normSigma2 * FLT_RESCALE; if( abss < fParam0 || j < 0) { tale = FLT_RESCALE; @@ -1245,7 +1249,7 @@ class TrackerInvoker : public cv::ParallelLoopBody } else { - tale *= param[2]; + tale *= normSigma2; } if( j == 0 ) { @@ -1521,7 +1525,7 @@ class TrackerInvoker : public cv::ParallelLoopBody int windowType; float minEigThreshold; bool useInitialFlow; - std::vector paramReset; + const float normSigma0, normSigma1, normSigma2; int crossSegmentationThreshold; }; diff --git a/modules/optflow/src/rlof/rlof_invoker.hpp b/modules/optflow/src/rlof/rlof_invoker.hpp index 7a4bd343cbe..cea0f7f6718 100644 --- a/modules/optflow/src/rlof/rlof_invoker.hpp +++ b/modules/optflow/src/rlof/rlof_invoker.hpp @@ -28,9 +28,13 @@ class TrackerInvoker : public cv::ParallelLoopBody int _maxIteration, bool _useInitialFlow, int _supportRegionType, - std::vector _normSigmas, + const std::vector& _normSigmas, float _minEigenValue, - int _crossSegmentationThreshold) + int _crossSegmentationThreshold + ) : + normSigma0(_normSigmas[0]), + normSigma1(_normSigmas[1]), + normSigma2(_normSigmas[2]) { prevImg = &_prevImg; prevDeriv = &_prevDeriv; @@ -49,7 +53,6 @@ class TrackerInvoker : public cv::ParallelLoopBody maxLevel = _maxLevel; windowType = _supportRegionType; minEigThreshold = _minEigenValue; - paramReset = _normSigmas; useInitialFlow = _useInitialFlow; crossSegmentationThreshold = _crossSegmentationThreshold; } @@ -74,7 +77,6 @@ class TrackerInvoker : public cv::ParallelLoopBody const float FLT_SCALE = (1.f / (1 << 20)); // 20 - std::vector param = paramReset; int cn = I.channels(), cn2 = cn * 2; int winbufwidth = static_cast(ceil(winSize.width / 8.f)) * 8; cv::Size winBufSize(winbufwidth, winbufwidth); @@ -304,10 +306,10 @@ class 
TrackerInvoker : public cv::ParallelLoopBody } float eta = 1.f / winArea; - float fParam0 = param[0] * 32.f; - float fParam1 = param[1] * 32.f; - fParam0 = param[0] * MEstimatorScale; - fParam1 = param[1] * MEstimatorScale; + float fParam0 = normSigma0 * 32.f; + float fParam1 = normSigma1 * 32.f; + fParam0 = normSigma0 * MEstimatorScale; + fParam1 = normSigma1 * MEstimatorScale; #ifdef RLOF_SSE qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); @@ -315,12 +317,12 @@ class TrackerInvoker : public cv::ParallelLoopBody __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps(); __m128i mmParam0 = _mm_set1_epi16(MIN(std::numeric_limits::max() - 1, static_cast(fParam0))); __m128i mmParam1 = _mm_set1_epi16(MIN(std::numeric_limits::max() - 1, static_cast(fParam1))); - float s2Val = param[2] > 0 ? param[2] : -param[2]; - int s2bitShift = param[2] == 0 ? 1 : static_cast(ceil(log(200.f / s2Val) / log(2.f))); - __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast(param[2] * (float)(1 << s2bitShift))); + float s2Val = std::fabs(normSigma2); + int s2bitShift = normSigma2 == 0 ? 
1 : static_cast(ceil(log(200.f / s2Val) / log(2.f))); + __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast(normSigma2 * (float)(1 << s2bitShift))); __m128i mmOness_epi16 = _mm_set1_epi16(1 << s2bitShift); - __m128 mmParam2s = _mm_set1_ps(0.01f * param[2]); - __m128 mmParam2s2 = _mm_set1_ps(param[2] * param[2]); + __m128 mmParam2s = _mm_set1_ps(0.01f * normSigma2); + __m128 mmParam2s2 = _mm_set1_ps(normSigma2 * normSigma2); __m128 mmOnes = _mm_set1_ps(1.f); __m128i mmEta = _mm_setzero_si128(); __m128i mmScale = _mm_set1_epi16(static_cast(MEstimatorScale)); @@ -487,12 +489,12 @@ class TrackerInvoker : public cv::ParallelLoopBody else if (abss > fParam0 && diff >= 0) { //diff = fParam1 * (diff - fParam1); - diff = static_cast(param[2] * (diff - fParam1)); + diff = static_cast(normSigma2 * (diff - fParam1)); } else if (abss > fParam0 && diff < 0) { //diff = fParam1 * (diff + fParam1); - diff = static_cast(param[2] * (diff + fParam1)); + diff = static_cast(normSigma2 * (diff + fParam1)); } @@ -503,7 +505,7 @@ class TrackerInvoker : public cv::ParallelLoopBody if ( j == 0) { - float tale = param[2] * FLT_RESCALE; + float tale = normSigma2 * FLT_RESCALE; if (abss < fParam0) { tale = FLT_RESCALE; @@ -514,7 +516,7 @@ class TrackerInvoker : public cv::ParallelLoopBody } else { - tale *= param[2]; + tale *= normSigma2; } A11 += (float)(ixval*ixval*tale); A12 += (float)(ixval*iyval*tale); @@ -621,7 +623,7 @@ class TrackerInvoker : public cv::ParallelLoopBody int windowType; float minEigThreshold; bool useInitialFlow; - std::vector paramReset; + const float normSigma0, normSigma1, normSigma2; int crossSegmentationThreshold; }; @@ -648,9 +650,13 @@ class TrackerInvoker : public cv::ParallelLoopBody int _maxIteration, bool _useInitialFlow, int _supportRegionType, - std::vector _normSigmas, + const std::vector& _normSigmas, float _minEigenValue, - int _crossSegmentationThreshold) + int _crossSegmentationThreshold + ) : + normSigma0(_normSigmas[0]), + normSigma1(_normSigmas[1]), + 
normSigma2(_normSigmas[2]) { prevImg = &_prevImg; prevDeriv = &_prevDeriv; @@ -670,7 +676,6 @@ class TrackerInvoker : public cv::ParallelLoopBody maxLevel = _maxLevel; windowType = _supportRegionType; minEigThreshold = _minEigenValue; - paramReset = _normSigmas; useInitialFlow = _useInitialFlow; crossSegmentationThreshold = _crossSegmentationThreshold; } @@ -695,7 +700,6 @@ class TrackerInvoker : public cv::ParallelLoopBody cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); winMaskMatBuf.setTo(1); - std::vector param = paramReset; int cn = I.channels(), cn2 = cn * 2; int winbufwidth = static_cast(ceil(winSize.width / 16.f)) * 16; cv::Size winBufSize(winbufwidth, winbufwidth); @@ -965,10 +969,10 @@ class TrackerInvoker : public cv::ParallelLoopBody } float eta = 1.f / winArea; - float fParam0 = param[0] * 32.f; - float fParam1 = param[1] * 32.f; - fParam0 = param[0] * MEstimatorScale; - fParam1 = param[1] * MEstimatorScale; + float fParam0 = normSigma0 * 32.f; + float fParam1 = normSigma1 * 32.f; + fParam0 = normSigma0 * MEstimatorScale; + fParam1 = normSigma1 * MEstimatorScale; #ifdef RLOF_SSE @@ -984,12 +988,12 @@ class TrackerInvoker : public cv::ParallelLoopBody __m128i mmParam1 = _mm_set1_epi16(MIN(std::numeric_limits::max() - 1, static_cast(fParam1))); - float s2Val = param[2] > 0 ? param[2] : -param[2]; - int s2bitShift = param[2] == 0 ? 1 : static_cast(ceil(log(200.f / s2Val) / log(2.f))); - __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast(param[2] * (float)(1 << s2bitShift))); + float s2Val = std::fabs(normSigma2); + int s2bitShift = normSigma2 == 0 ? 
1 : static_cast(ceil(log(200.f / s2Val) / log(2.f))); + __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast(normSigma2 * (float)(1 << s2bitShift))); __m128i mmOness_epi16 = _mm_set1_epi16(1 << s2bitShift); - __m128 mmParam2s = _mm_set1_ps(0.01f * param[2]); - __m128 mmParam2s2 = _mm_set1_ps(param[2] * param[2]); + __m128 mmParam2s = _mm_set1_ps(0.01f * normSigma2); + __m128 mmParam2s2 = _mm_set1_ps(normSigma2 * normSigma2); float gainVal = gainVec.x > 0 ? gainVec.x : -gainVec.x; int bitShift = gainVec.x == 0 ? 1 : static_cast(ceil(log(200.f / gainVal) / log(2.f))); @@ -1226,11 +1230,11 @@ class TrackerInvoker : public cv::ParallelLoopBody } else if (abss > static_cast(fParam0) && diff >= 0) { - diff = static_cast(param[2] * (diff - fParam1)); + diff = static_cast(normSigma2 * (diff - fParam1)); } else if (abss > static_cast(fParam0) && diff < 0) { - diff = static_cast(param[2] * (diff + fParam1)); + diff = static_cast(normSigma2 * (diff + fParam1)); } b1 += (float)(diff*ixval); b2 += (float)(diff*iyval); ; @@ -1241,7 +1245,7 @@ class TrackerInvoker : public cv::ParallelLoopBody // compute the Gradient Matrice if (j == 0) { - float tale = param[2] * FLT_RESCALE; + float tale = normSigma2 * FLT_RESCALE; if (abss < fParam0 || j < 0) { tale = FLT_RESCALE; @@ -1252,7 +1256,7 @@ class TrackerInvoker : public cv::ParallelLoopBody } else { - tale *= param[2]; + tale *= normSigma2; } if (j == 0) { @@ -1426,7 +1430,7 @@ class TrackerInvoker : public cv::ParallelLoopBody int windowType; float minEigThreshold; bool useInitialFlow; - std::vector paramReset; + const float normSigma0, normSigma1, normSigma2; int crossSegmentationThreshold; }; From 5acff7bcaa63a7ceed987eca3008b7cea6249419 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Fri, 14 Dec 2018 16:22:02 +0300 Subject: [PATCH 17/25] replace ceil() calls --- modules/optflow/src/rlof/berlof_invoker.hpp | 16 ++++++++-------- modules/optflow/src/rlof/plk_invoker.hpp | 10 +++++----- 
modules/optflow/src/rlof/rlof_invoker.hpp | 14 +++++++------- modules/optflow/src/rlof/rlof_invokerbase.hpp | 6 +++--- 4 files changed, 23 insertions(+), 23 deletions(-) diff --git a/modules/optflow/src/rlof/berlof_invoker.hpp b/modules/optflow/src/rlof/berlof_invoker.hpp index 87f973d1ac1..1d04715fe48 100644 --- a/modules/optflow/src/rlof/berlof_invoker.hpp +++ b/modules/optflow/src/rlof/berlof_invoker.hpp @@ -111,12 +111,12 @@ class TrackerInvoker : public cv::ParallelLoopBody const float FLT_SCALE = (1.f/(1 << 16)); winSize = cv::Size(maxWinSize,maxWinSize); - int winMaskwidth = static_cast(ceil(winSize.width / 16.f)) * 16; + int winMaskwidth = roundUp(winSize.width, 16); cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); winMaskMatBuf.setTo(1); int j, cn = I.channels(), cn2 = cn*2; - int winbufwidth = static_cast(ceil(winSize.width / 16.f)) * 16; + int winbufwidth = roundUp(winSize.width, 16); cv::Size winBufSize(winbufwidth,winbufwidth); cv::AutoBuffer _buf(winBufSize.area()*(cn + cn2)); @@ -627,12 +627,12 @@ class TrackerInvoker : public cv::ParallelLoopBody const Mat& BI = *rgbPrevImg; const float FLT_SCALE = (1.f/(1 << 16));//(1.f/(1 << 20)); // 20 winSize = cv::Size(maxWinSize,maxWinSize); - int winMaskwidth = static_cast(ceil(winSize.width / 16.f)) * 16; + int winMaskwidth = roundUp(winSize.width, 16); cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); winMaskMatBuf.setTo(1); int cn = I.channels(), cn2 = cn*2; - int winbufwidth = static_cast(ceil(winSize.width / 16.f)) * 16; + int winbufwidth = roundUp(winSize.width, 16); cv::Size winBufSize(winbufwidth,winbufwidth); @@ -927,13 +927,13 @@ class TrackerInvoker : public cv::ParallelLoopBody float s2Val = std::fabs(normSigma2); - int s2bitShift = normSigma2 == 0 ? 1 : static_cast(ceil(log(200.f / s2Val) / log(2.f))); + int s2bitShift = normSigma2 == 0 ? 
1 : cvCeil(log(200.f / s2Val) / log(2.f)); __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast(normSigma2 * (float)(1 << s2bitShift))); __m128i mmOness_epi16 = _mm_set1_epi16(1 << s2bitShift); __m128 mmParam2s = _mm_set1_ps(0.01f * normSigma2); __m128 mmParam2s2 = _mm_set1_ps(normSigma2 * normSigma2); float gainVal = gainVec.x > 0 ? gainVec.x : -gainVec.x; - int bitShift = gainVec.x == 0 ? 1 : static_cast(ceil(log(200.f / gainVal) / log(2.f))); + int bitShift = gainVec.x == 0 ? 1 : cvCeil(log(200.f / gainVal) / log(2.f)); __m128i mmGainValue_epi16 = _mm_set1_epi16(static_cast(gainVec.x * (float)(1 << bitShift))); __m128i mmConstValue_epi16 = _mm_set1_epi16(static_cast(gainVec.y)); __m128i mmEta = _mm_setzero_si128(); @@ -1587,14 +1587,14 @@ class TrackerInvoker : public cv::ParallelLoopBody const Mat& BI = *rgbPrevImg; winSize = cv::Size(maxWinSize,maxWinSize); - int winMaskwidth = static_cast(ceil(winSize.width / 8.f)) * 16; + int winMaskwidth = roundUp(winSize.width, 8) * 2; cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); winMaskMatBuf.setTo(1); const float FLT_SCALE = (1.f/(1 << 20)); // 20 int j, cn = I.channels(), cn2 = cn*2; - int winbufwidth = static_cast(ceil(winSize.width / 8.f)) * 8; + int winbufwidth = roundUp(winSize.width, 8); cv::Size winBufSize(winbufwidth,winbufwidth); std::vector _buf(winBufSize.area()*(cn + cn2)); diff --git a/modules/optflow/src/rlof/plk_invoker.hpp b/modules/optflow/src/rlof/plk_invoker.hpp index 06a2972969d..10c68e581cf 100644 --- a/modules/optflow/src/rlof/plk_invoker.hpp +++ b/modules/optflow/src/rlof/plk_invoker.hpp @@ -73,13 +73,13 @@ class TrackerInvoker : public cv::ParallelLoopBody const Mat& BI = *rgbPrevImg; winSize = cv::Size(maxWinSize,maxWinSize); - int winMaskwidth = static_cast(ceil(winSize.width / 16.f)) * 16; + int winMaskwidth = roundUp(winSize.width, 16); cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); winMaskMatBuf.setTo(1); const float FLT_SCALE = (1.f/(1 << 16));//(1.f/(1 << 
20)); // 20 int cn = I.channels(), cn2 = cn*2; - int winbufwidth = static_cast(ceil(winSize.width / 16.f)) * 16; + int winbufwidth = roundUp(winSize.width, 16); cv::Size winBufSize(winbufwidth,winbufwidth); cv::Matx44f invTensorMat; @@ -304,7 +304,7 @@ class TrackerInvoker : public cv::ParallelLoopBody __m128 mmSumIy = _mm_setzero_ps(), mmSumIx = _mm_setzero_ps(); __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps(); float gainVal = gainVec.x > 0 ? gainVec.x : -gainVec.x; - int bitShift = gainVec.x == 0 ? 1 : static_cast(ceil(log(200.f / gainVal) / log(2.f))); + int bitShift = gainVec.x == 0 ? 1 : cvCeil(log(200.f / gainVal) / log(2.f)); __m128i mmGainValue_epi16 = _mm_set1_epi16(static_cast(gainVec.x * (float)(1 << bitShift))); __m128i mmConstValue_epi16 = _mm_set1_epi16(static_cast(gainVec.y)); #endif @@ -725,7 +725,7 @@ class TrackerInvoker : public cv::ParallelLoopBody const Mat& BI = *rgbPrevImg; winSize = cv::Size(maxWinSize,maxWinSize); - int winMaskwidth = static_cast(ceil(winSize.width / 8.f)) * 16; + int winMaskwidth = roundUp(winSize.width, 8) * 2; cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); winMaskMatBuf.setTo(1); @@ -734,7 +734,7 @@ class TrackerInvoker : public cv::ParallelLoopBody const float FLT_SCALE = (1.f/(1 << 20)); // 20 int j, cn = I.channels(), cn2 = cn*2; - int winbufwidth = static_cast(ceil(winSize.width / 8.f)) * 8; + int winbufwidth = roundUp(winSize.width, 8); cv::Size winBufSize(winbufwidth,winbufwidth); std::vector _buf(winBufSize.area()*(cn + cn2)); diff --git a/modules/optflow/src/rlof/rlof_invoker.hpp b/modules/optflow/src/rlof/rlof_invoker.hpp index cea0f7f6718..bb3d589e663 100644 --- a/modules/optflow/src/rlof/rlof_invoker.hpp +++ b/modules/optflow/src/rlof/rlof_invoker.hpp @@ -71,14 +71,14 @@ class TrackerInvoker : public cv::ParallelLoopBody const Mat& BI = *rgbPrevImg; winSize = cv::Size(maxWinSize, maxWinSize); - int winMaskwidth = static_cast(ceil(winSize.width / 8.f)) * 16; 
+ int winMaskwidth = roundUp(winSize.width, 8) * 2; cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); winMaskMatBuf.setTo(1); const float FLT_SCALE = (1.f / (1 << 20)); // 20 int cn = I.channels(), cn2 = cn * 2; - int winbufwidth = static_cast(ceil(winSize.width / 8.f)) * 8; + int winbufwidth = roundUp(winSize.width, 8); cv::Size winBufSize(winbufwidth, winbufwidth); std::vector _buf(winBufSize.area()*(cn + cn2)); @@ -318,7 +318,7 @@ class TrackerInvoker : public cv::ParallelLoopBody __m128i mmParam0 = _mm_set1_epi16(MIN(std::numeric_limits::max() - 1, static_cast(fParam0))); __m128i mmParam1 = _mm_set1_epi16(MIN(std::numeric_limits::max() - 1, static_cast(fParam1))); float s2Val = std::fabs(normSigma2); - int s2bitShift = normSigma2 == 0 ? 1 : static_cast(ceil(log(200.f / s2Val) / log(2.f))); + int s2bitShift = normSigma2 == 0 ? 1 : cvCeil(log(200.f / s2Val) / log(2.f)); __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast(normSigma2 * (float)(1 << s2bitShift))); __m128i mmOness_epi16 = _mm_set1_epi16(1 << s2bitShift); __m128 mmParam2s = _mm_set1_ps(0.01f * normSigma2); @@ -696,12 +696,12 @@ class TrackerInvoker : public cv::ParallelLoopBody const float FLT_SCALE = (1.f / (1 << 16)); winSize = cv::Size(maxWinSize, maxWinSize); - int winMaskwidth = static_cast(ceil(winSize.width / 16.f)) * 16; + int winMaskwidth = roundUp(winSize.width, 16); cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); winMaskMatBuf.setTo(1); int cn = I.channels(), cn2 = cn * 2; - int winbufwidth = static_cast(ceil(winSize.width / 16.f)) * 16; + int winbufwidth = roundUp(winSize.width, 16); cv::Size winBufSize(winbufwidth, winbufwidth); cv::Matx44f invTensorMat; @@ -989,14 +989,14 @@ class TrackerInvoker : public cv::ParallelLoopBody float s2Val = std::fabs(normSigma2); - int s2bitShift = normSigma2 == 0 ? 1 : static_cast(ceil(log(200.f / s2Val) / log(2.f))); + int s2bitShift = normSigma2 == 0 ? 
1 : cvCeil(log(200.f / s2Val) / log(2.f)); __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast(normSigma2 * (float)(1 << s2bitShift))); __m128i mmOness_epi16 = _mm_set1_epi16(1 << s2bitShift); __m128 mmParam2s = _mm_set1_ps(0.01f * normSigma2); __m128 mmParam2s2 = _mm_set1_ps(normSigma2 * normSigma2); float gainVal = gainVec.x > 0 ? gainVec.x : -gainVec.x; - int bitShift = gainVec.x == 0 ? 1 : static_cast(ceil(log(200.f / gainVal) / log(2.f))); + int bitShift = gainVec.x == 0 ? 1 : cvCeil(log(200.f / gainVal) / log(2.f)); __m128i mmGainValue_epi16 = _mm_set1_epi16(static_cast(gainVec.x * (float)(1 << bitShift))); __m128i mmConstValue_epi16 = _mm_set1_epi16(static_cast(gainVec.y)); diff --git a/modules/optflow/src/rlof/rlof_invokerbase.hpp b/modules/optflow/src/rlof/rlof_invokerbase.hpp index f65efeb673c..17c83e026d7 100644 --- a/modules/optflow/src/rlof/rlof_invokerbase.hpp +++ b/modules/optflow/src/rlof/rlof_invokerbase.hpp @@ -216,8 +216,8 @@ void getLocalPatch( if (noPoints < minWinSize * minWinSize) { cv::Rect roi(winPointMask.cols / 2 - (minWinSize - 1) / 2, - winPointMask.rows / 2 - (minWinSize - 1) / 2, - minWinSize, minWinSize); + winPointMask.rows / 2 - (minWinSize - 1) / 2, + minWinSize, minWinSize); cv::Mat(winPointMask, roi).setTo(1); min_c = MIN(MIN(min_c, roi.tl().x), roi.br().x - 1); max_c = MAX(MAX(max_c, roi.tl().x), roi.br().x - 1); @@ -254,7 +254,7 @@ bool calcWinMaskMat( return false; winSize = winRoi.size(); halfWin = Point2f(static_cast(iprevPt.x - winRoi.tl().x), - static_cast(iprevPt.y - winRoi.tl().y)); + static_cast(iprevPt.y - winRoi.tl().y)); } else { From aa3cc4c7498d1c7bb89de2750ee696902e6acb22 Mon Sep 17 00:00:00 2001 From: senst Date: Fri, 14 Dec 2018 20:21:02 +0100 Subject: [PATCH 18/25] maximum level is set to 5 so that it is similar value used in the papers --- modules/optflow/include/opencv2/optflow/rlofflow.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/optflow/include/opencv2/optflow/rlofflow.hpp 
b/modules/optflow/include/opencv2/optflow/rlofflow.hpp index 4d602d2b28f..bbed7f0272c 100644 --- a/modules/optflow/include/opencv2/optflow/rlofflow.hpp +++ b/modules/optflow/include/opencv2/optflow/rlofflow.hpp @@ -68,7 +68,7 @@ class CV_EXPORTS_W RLOFOpticalFlowParameter{ ,smallWinSize(9) ,largeWinSize(21) ,crossSegmentationThreshold(25) - ,maxLevel(3) + ,maxLevel(5) ,useInitialFlow(false) ,useIlluminationModel(true) ,useGlobalMotionPrior(true) From 725e296eabcdab84b88b26451156b0c07820dfa2 Mon Sep 17 00:00:00 2001 From: senst Date: Fri, 14 Dec 2018 20:22:22 +0100 Subject: [PATCH 19/25] implement paralellized horizontal cross segmentation as used in Geistert2016 --- modules/optflow/src/rlof/berlof_invoker.hpp | 6 +- modules/optflow/src/rlof/plk_invoker.hpp | 4 +- modules/optflow/src/rlof/rlof_invoker.hpp | 4 +- modules/optflow/src/rlof/rlof_invokerbase.hpp | 190 +++++------------- modules/optflow/src/rlof/rlof_localflow.cpp | 143 +++++++++++++ 5 files changed, 203 insertions(+), 144 deletions(-) diff --git a/modules/optflow/src/rlof/berlof_invoker.hpp b/modules/optflow/src/rlof/berlof_invoker.hpp index 1d04715fe48..fb40fd2bd2d 100644 --- a/modules/optflow/src/rlof/berlof_invoker.hpp +++ b/modules/optflow/src/rlof/berlof_invoker.hpp @@ -149,7 +149,7 @@ class TrackerInvoker : public cv::ParallelLoopBody winMaskMatBuf.setTo(0); if( calcWinMaskMat(BI, windowType, iprevPt, winMaskMat,winSize,halfWin,winArea, - minWinSize,maxWinSize, crossSegmentationThreshold) == false) + minWinSize,maxWinSize) == false) continue; prevPt -= halfWin; @@ -667,7 +667,7 @@ class TrackerInvoker : public cv::ParallelLoopBody winMaskMatBuf.setTo(0); if( calcWinMaskMat(BI, windowType, iprevPt, winMaskMat,winSize,halfWin,winArea, - minWinSize,maxWinSize, crossSegmentationThreshold) == false) + minWinSize,maxWinSize) == false) continue; prevPt -= halfWin; @@ -1624,7 +1624,7 @@ class TrackerInvoker : public cv::ParallelLoopBody if( calcWinMaskMat(BI, windowType, iprevPt, 
winMaskMat,winSize,halfWin,winArea, - minWinSize,maxWinSize, crossSegmentationThreshold) == false) + minWinSize,maxWinSize) == false) continue; prevPt -= halfWin; diff --git a/modules/optflow/src/rlof/plk_invoker.hpp b/modules/optflow/src/rlof/plk_invoker.hpp index 10c68e581cf..b905cffe8d1 100644 --- a/modules/optflow/src/rlof/plk_invoker.hpp +++ b/modules/optflow/src/rlof/plk_invoker.hpp @@ -117,7 +117,7 @@ class TrackerInvoker : public cv::ParallelLoopBody winMaskMatBuf.setTo(0); if( calcWinMaskMat(BI, windowType, iprevPt, winMaskMat,winSize,halfWin,winArea, - minWinSize,maxWinSize, crossSegmentationThreshold) == false) + minWinSize,maxWinSize) == false) continue; prevPt -= halfWin; @@ -764,7 +764,7 @@ class TrackerInvoker : public cv::ParallelLoopBody if( calcWinMaskMat(BI, windowType, iprevPt, winMaskMat,winSize,halfWin,winArea, - minWinSize,maxWinSize, crossSegmentationThreshold) == false) + minWinSize,maxWinSize) == false) continue; diff --git a/modules/optflow/src/rlof/rlof_invoker.hpp b/modules/optflow/src/rlof/rlof_invoker.hpp index bb3d589e663..67692c4da50 100644 --- a/modules/optflow/src/rlof/rlof_invoker.hpp +++ b/modules/optflow/src/rlof/rlof_invoker.hpp @@ -108,7 +108,7 @@ class TrackerInvoker : public cv::ParallelLoopBody if (calcWinMaskMat(BI, windowType, iprevPt, winMaskMat, winSize, halfWin, winArea, - minWinSize, maxWinSize, crossSegmentationThreshold) == false) + minWinSize, maxWinSize) == false) continue; @@ -742,7 +742,7 @@ class TrackerInvoker : public cv::ParallelLoopBody winMaskMatBuf.setTo(0); if (calcWinMaskMat(BI, windowType, iprevPt, winMaskMat, winSize, halfWin, winArea, - minWinSize, maxWinSize, crossSegmentationThreshold) == false) + minWinSize, maxWinSize) == false) continue; prevPt -= halfWin; diff --git a/modules/optflow/src/rlof/rlof_invokerbase.hpp b/modules/optflow/src/rlof/rlof_invokerbase.hpp index 17c83e026d7..470f4d87ad7 100644 --- a/modules/optflow/src/rlof/rlof_invokerbase.hpp +++ 
b/modules/optflow/src/rlof/rlof_invokerbase.hpp @@ -72,36 +72,23 @@ static inline bool notSameColor(const cv::Point3_ & ref, int r, int c, co return diff > threshold; } -/*! Estimate the mask with points of the same color as the middle point -*\param prevGray input CV_8UC1 -*\param prevImg input CV_8UC3 -*\param nextImg input CV_8UC3 -*\param prevPoint global position of the middle point of the mask window -*\param nextPoint global position of the middle point of the mask window -*\param winPointMask mask matrice with 1 labeling points contained and 0 point is not contained by the segment -*\param noPoints number of points contained by the segment -*\param winRoi rectangle of the region of interesset in global coordinates -*\param minWinSize, -*\param threshold, -*\param useBothImages -*/ static void getLocalPatch( - const cv::Mat & prevImg, - const cv::Point2i & prevPoint, // feature points - cv::Mat & winPointMask, - int & noPoints, - cv::Rect & winRoi, - int minWinSize, - int threshold) + const cv::Mat & src, + const cv::Point2i & prevPoint, // feature points + cv::Mat & winPointMask, + int & noPoints, + cv::Rect & winRoi, + int minWinSize) { int maxWinSizeH = (winPointMask.cols - 1) / 2; - winRoi.x = prevPoint.x; - winRoi.y = prevPoint.y; - winRoi.width = winPointMask.cols; - winRoi.height = winPointMask.rows; + winRoi.x = prevPoint.x;// - maxWinSizeH; + winRoi.y = prevPoint.y;// - maxWinSizeH; + winRoi.width = winPointMask.cols; + winRoi.height = winPointMask.rows; - if (minWinSize == winPointMask.cols) + if( minWinSize == winPointMask.cols || prevPoint.x < 0 || prevPoint.y < 0 + || prevPoint.x + 2*maxWinSizeH >= src.cols || prevPoint.y + 2*maxWinSizeH >= src.rows) { winRoi.x = prevPoint.x - maxWinSizeH; winRoi.y = prevPoint.y - maxWinSizeH; @@ -109,127 +96,57 @@ void getLocalPatch( noPoints = winPointMask.size().area(); return; } + winPointMask.setTo(0); noPoints = 0; - int c = prevPoint.x; - int r = prevPoint.y; - int c_left = c - 1; - int c_right = c + 1; - 
int r_top = r - 1; - int r_bottom = r; - int border_left = c - maxWinSizeH; - int border_right = c + maxWinSizeH; - int border_top = r - maxWinSizeH; - int border_bottom = r + maxWinSizeH; - int c_local_diff = prevPoint.x - maxWinSizeH; - int r_local_diff = prevPoint.y - maxWinSizeH; - int _c = c - c_local_diff; - int _r = r - r_local_diff; - int min_r = _r; - int max_r = _r; - int min_c = _c; - int max_c = _c; - // horizontal line - if (r < 0 || r >= prevImg.rows || c < 0 || c >= prevImg.cols) + int c = prevPoint.x + maxWinSizeH; + int r = prevPoint.y + maxWinSizeH; + int min_c = c; + int max_c = c; + int border_left = c - maxWinSizeH; + int border_top = r - maxWinSizeH; + cv::Vec4i bounds = src.at(r,c); + int min_r = bounds.val[2]; + int max_r = bounds.val[3]; + + for( int _r = min_r; _r <= max_r; _r++) { - noPoints = 0; - return; - } - cv::Point3_ val1 = prevImg.at>(r, c); // middle grayvalue - cv::Point3_ tval; - - //vertical line - for (int dr = r_top; dr >= border_top; dr--) - { - if (notSameColor(val1, dr, c, prevImg, maxWinSizeH, threshold)) - break; - - int _dr = dr - r_local_diff; - min_r = MIN(min_r, _dr); - max_r = MAX(max_r, _dr); - winPointMask.at(_dr, _c) = 1; - noPoints++; - } - for (int dr = r_bottom; dr < border_bottom; dr++) - { - if (notSameColor(val1, dr, c, prevImg, maxWinSizeH, threshold) - ) - break; - - int _dr = dr - r_local_diff; - min_r = MIN(min_r, _dr); - max_r = MAX(max_r, _dr); - winPointMask.at(_dr, _c) = 1; - noPoints++; - } - // accumulate along the vertical line and the search line that was still labled - for (int dr = min_r + r_local_diff; dr <= max_r + r_local_diff; dr++) - { - int _dr = dr - r_local_diff; - if (winPointMask.at(_dr, _c) == 0) + cv::Rect roi(maxWinSizeH, _r - border_top, winPointMask.cols, 1); + if( _r >= 0 && _r < src.cols) { - winPointMask.row(_dr).setTo(0); - continue; + bounds = src.at(_r,c); + roi.x = bounds.val[0] - border_left; + roi.width = bounds.val[1] - bounds.val[0]; } - bool skip = false; - int _dc 
= c_right - c_local_diff; - for (int dc = c_right; dc < border_right; dc++, _dc++) + else { - if (skip == false) - { - if (notSameColor(val1, dr, dc, prevImg, maxWinSizeH, threshold)) - skip = true; - } - if (skip == false) - { - min_c = MIN(min_c, _dc); - max_c = MAX(max_c, _dc); - winPointMask.at(_dr, _dc) = 1; - noPoints++; - } - else - winPointMask.at(_dr, _dc) = 0; - } - - skip = false; - _dc = c_left - c_local_diff; - for (int dc = c_left; dc >= border_left; dc--, _dc--) - { - if (skip == false) - { - if (notSameColor(val1, dr, dc, prevImg, maxWinSizeH, threshold)) - skip = true; - } - if (skip == false) - { - min_c = MIN(min_c, _dc); - max_c = MAX(max_c, _dc); - winPointMask.at(_dr, _dc) = 1; - noPoints++; - } - else - winPointMask.at(_dr, _dc) = 0; + bounds.val[0] = border_left; + bounds.val[1] = border_left + roi.width; } + cv::Mat(winPointMask, roi).setTo(1); + min_c = MIN(min_c, bounds.val[0]); + max_c = MAX(max_c, bounds.val[1]); + noPoints += roi.width; } - - // get the initial small window - if (noPoints < minWinSize * minWinSize) + if( noPoints < minWinSize * minWinSize) { - cv::Rect roi(winPointMask.cols / 2 - (minWinSize - 1) / 2, - winPointMask.rows / 2 - (minWinSize - 1) / 2, - minWinSize, minWinSize); + cv::Rect roi( winPointMask.cols / 2 - (minWinSize-1)/2, + winPointMask.rows / 2 - (minWinSize-1)/2, + minWinSize, minWinSize); cv::Mat(winPointMask, roi).setTo(1); - min_c = MIN(MIN(min_c, roi.tl().x), roi.br().x - 1); - max_c = MAX(MAX(max_c, roi.tl().x), roi.br().x - 1); - min_r = MIN(MIN(min_r, roi.tl().y), roi.br().y - 1); - max_r = MAX(MAX(max_r, roi.tl().y), roi.br().y - 1); + roi.x += border_left; + roi.y += border_top; + min_c = MIN(MIN(min_c, roi.tl().x),roi.br().x); + max_c = MAX(MAX(max_c, roi.tl().x),roi.br().x); + min_r = MIN(MIN(min_r, roi.tl().y),roi.br().y); + max_r = MAX(MAX(max_r, roi.tl().y),roi.br().y); noPoints += minWinSize * minWinSize; } - winRoi.x = c_local_diff + min_c; - winRoi.y = r_local_diff + min_r; - winRoi.width = 
max_c - min_c + 1; - winRoi.height = max_r - min_r + 1; - winPointMask = winPointMask(cv::Rect(min_c, min_r, winRoi.width, winRoi.height)); + winRoi.x = min_c - maxWinSizeH; + winRoi.y = min_r - maxWinSizeH; + winRoi.width = max_c - min_c; + winRoi.height = max_r - min_r; + winPointMask = winPointMask(cv::Rect(min_c - border_left, min_r - border_top, winRoi.width, winRoi.height)); } static inline @@ -242,14 +159,13 @@ bool calcWinMaskMat( cv::Point2f & halfWin, int & winArea, const int minWinSize, - const int maxWinSize, - const int crossSegmentationThreshold) + const int maxWinSize) { if (windowType == SR_CROSS && maxWinSize != minWinSize) { // patch generation cv::Rect winRoi; - getLocalPatch(BI, iprevPt, winMaskMat, winArea, winRoi, minWinSize, crossSegmentationThreshold); + getLocalPatch(BI, iprevPt, winMaskMat, winArea, winRoi, minWinSize); if (winArea == 0) return false; winSize = winRoi.size(); diff --git a/modules/optflow/src/rlof/rlof_localflow.cpp b/modules/optflow/src/rlof/rlof_localflow.cpp index 211d999b0e7..7a6227902b7 100644 --- a/modules/optflow/src/rlof/rlof_localflow.cpp +++ b/modules/optflow/src/rlof/rlof_localflow.cpp @@ -110,6 +110,141 @@ static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst) } // namespace namespace optflow { +/*! Helper function for preCalcCrossSegmentation. 
Everything is performed on the large +*\param data CV_8UC3 image ( use extended image mit winSize) +*\param winSize +*\param dst CV_32SC1 bounding map +*\param threshold +*\param stride if true store into first two bounding maps +*/ +class HorizontalCrossSegmentation : public cv::ParallelLoopBody +{ +public: + HorizontalCrossSegmentation( + const cv::Point2f * ptList, + int npoints, + float pointScale, + const cv::Mat * data, + const int winSize, + cv::Mat * dst, + int threshold, + bool stride, + const cv::Mat * mask + ) + { + m_ptList = ptList; + m_npoints = npoints; + m_pointScale = pointScale; + m_data = data; + m_winSize = winSize; + m_dst = dst; + m_threshold = threshold; + m_stride = stride; + m_mask = mask; + } + + void operator()(const cv::Range& range) const CV_OVERRIDE + { + uchar channel[2]; + channel[0] = m_stride ? 2 : 0; + channel[1] = m_stride ? 3 : 1; + int hWinSize = (m_winSize - 1) / 2; + std::vector differenz(m_winSize); + for( int r = range.start; r < range.end; r++ ) + { + for(int c = hWinSize; c < m_data->cols - hWinSize; c++) + { + if( m_mask->at(r,c) == 0) + continue; + const cv::Point3_ & ucval = m_data->at>(r,c); + cv::Point3i val(static_cast(ucval.x), static_cast(ucval.y), static_cast(ucval.z)); + int x = c - hWinSize; + cv::Point dstPos = m_stride ? 
cv::Point(r,c) : cv::Point(c,r); + for(int ix = 0; ix < m_winSize; ix++, x++) + { + const cv::Point3_ & valref = m_data->at>(r,x); + differenz[ix] = MAX(std::abs(static_cast(valref.x) - val.x), + MAX(std::abs(static_cast(valref.y) - val.y), + (std::abs(static_cast(valref.z) - val.z)))); + + } + cv::Vec4i & bounds = m_dst->at(dstPos); + bounds.val[channel[0]] = c - hWinSize; + bounds.val[channel[1]] = c + hWinSize; + int * diffPtr = &differenz[hWinSize]; + bool useUpperBound = false; + bool useLowerBound = false; + for(int ix = 1; ix <= hWinSize; ix++) + { + if( !useUpperBound && diffPtr[-ix] > m_threshold) + { + useUpperBound = true; + bounds.val[channel[0]] = c - ix; + } + if( !useLowerBound && diffPtr[ix-1] > m_threshold) + { + useLowerBound = true; + bounds.val[channel[1]] = c + ix - 1; + } + if( useUpperBound && useLowerBound) + break; + } + } + } + } + + const cv::Point2f * m_ptList; + int m_npoints; + float m_pointScale; + const cv::Mat * m_data; + int m_winSize; + cv::Mat * m_dst; + int m_threshold; + bool m_stride; + const cv::Mat * m_mask; +}; + +static +void preCalcCrossSegmentation( + const cv::Point2f * ptList, + int npoints, + float pointScale, + const cv::Mat & img, + const int winSize, + cv::Mat & dst, + int threshold +) +{ + int hWinSize = (winSize - 1) / 2; + cv::Mat data = img; + data.adjustROI(hWinSize, hWinSize, hWinSize, hWinSize); + if( dst.size() != dst.size() || dst.type() != CV_32SC4) + { + dst.release(); + dst.create(data.size(), CV_32SC4); + } + cv::Mat mask(data.cols, data.rows, CV_8UC1); + mask.setTo(0); + for( unsigned int n = 0; n < static_cast(npoints); n++) + { + cv::Point ipos( static_cast(floor(ptList[n].y * pointScale)), + static_cast(floor(ptList[n].x * pointScale) + hWinSize)); + ipos.x = MAX( MIN(ipos.x, mask.cols - 1), 0); + int to = MIN( mask.cols - 1, ipos.x + winSize ); + int ypos = MAX( MIN(ipos.y, mask.rows - 1), 0); + for(int x = ipos.x; x <= to ; x++) + { + mask.at(ypos, x) = 255; + } + } + cv::Mat datat = data.t(); + 
cv::Mat maskt = mask.t(); + parallel_for_(cv::Range(0, datat.rows), HorizontalCrossSegmentation(ptList, npoints, pointScale, &datat, winSize, &dst, threshold, true, &mask)); + parallel_for_(cv::Range(0, data.rows), HorizontalCrossSegmentation(ptList, npoints, pointScale, &data, winSize, &dst, threshold, false, &maskt)); + +} + + int buildOpticalFlowPyramidScale(InputArray _img, OutputArrayOfArrays pyramid, Size winSize, int maxLevel, bool withDerivatives, int pyrBorder, int derivBorder, bool tryReuseInputImage, float levelScale[2]); void calcLocalOpticalFlowCore(Ptr prevPyramids[2], Ptr currPyramids[2], InputArray _prevPts, @@ -254,6 +389,7 @@ void calcLocalOpticalFlowCore( std::vector rlofNorm = get_norm(param.normSigma0, param.normSigma1); CV_Assert(winSizes[0] <= winSizes[1]); + bool usePreComputedCross = winSizes[0] != winSizes[1]; Mat prevPtsMat = _prevPts.getMat(); const int derivDepth = DataType::depth; @@ -322,6 +458,13 @@ void calcLocalOpticalFlowCore( cv::Mat prevImage = prevPyramids[0]->getImage(level); cv::Mat currImage = currPyramids[0]->getImage(level); + cv::Mat preCrossMap; + if( usePreComputedCross ) + { + preCalcCrossSegmentation(prevPts, npoints, (float)(1./(1 << level)), tRGBPrevPyr, winSizes[1], preCrossMap, param.crossSegmentationThreshold); + tRGBNextPyr = cv::Mat(); + tRGBPrevPyr = preCrossMap; + } // apply plk like tracker if (isrobust(param) == false) { From c1c94a2191c44aa043da7357083f4300a0c9cc67 Mon Sep 17 00:00:00 2001 From: senst Date: Fri, 14 Dec 2018 20:57:50 +0100 Subject: [PATCH 20/25] drop dead code --- .../optflow/src/rlof/geo_interpolation.cpp | 28 ------------------- 1 file changed, 28 deletions(-) diff --git a/modules/optflow/src/rlof/geo_interpolation.cpp b/modules/optflow/src/rlof/geo_interpolation.cpp index 143b4f63a4e..0ac2e1715da 100644 --- a/modules/optflow/src/rlof/geo_interpolation.cpp +++ b/modules/optflow/src/rlof/geo_interpolation.cpp @@ -358,34 +358,6 @@ Mat interpolate_irregular_knn( } } return ret; -/* - for 
(int y = 0; y < in.rows; y++) { - for (int x = 0; x < in.cols; x++) { - int v_id = y * in.cols + x; - std::vector > flow_s(k); - - for (int i = 0; i < k; i++) { - color = *((int*)(graph_helper.data(v_id) + 1 + 2 * i)); - int cy = flow_point_list[color][0]; - int cx = flow_point_list[color][1]; - Vec2f flow = in.at(cy, cx); - flow_s[i] = std::make_pair(static_cast(norm(flow)), flow); - - - - } - - nnFlow.at(y, x) = std::min_element(flow_s.begin(), flow_s.end(), [](const std::pair &left, const std::pair &right) { - return left.first < right.first; - })->second; - - - } - } - free(graph_helper.mem); - return nnFlow; -*/ - } Mat getGraph(const Mat &image, float edge_length) From b95339c940ee20fb7d9da5c62997221073abaf16 Mon Sep 17 00:00:00 2001 From: senst Date: Sat, 15 Dec 2018 01:39:43 +0100 Subject: [PATCH 21/25] Avoid using "data" and "step" calculations. Use .ptr(row, col) instead. --- modules/optflow/src/rlof/berlof_invoker.hpp | 89 +++++++++------------ modules/optflow/src/rlof/plk_invoker.hpp | 44 +++++----- modules/optflow/src/rlof/rlof_invoker.hpp | 59 ++++++-------- modules/optflow/src/rlof/rlof_localflow.cpp | 2 +- 4 files changed, 84 insertions(+), 110 deletions(-) diff --git a/modules/optflow/src/rlof/berlof_invoker.hpp b/modules/optflow/src/rlof/berlof_invoker.hpp index fb40fd2bd2d..1a7f6e10502 100644 --- a/modules/optflow/src/rlof/berlof_invoker.hpp +++ b/modules/optflow/src/rlof/berlof_invoker.hpp @@ -150,8 +150,9 @@ class TrackerInvoker : public cv::ParallelLoopBody if( calcWinMaskMat(BI, windowType, iprevPt, winMaskMat,winSize,halfWin,winArea, minWinSize,maxWinSize) == false) + { continue; - + } prevPt -= halfWin; iprevPt.x = cvFloor(prevPt.x); iprevPt.y = cvFloor(prevPt.y); @@ -180,7 +181,6 @@ class TrackerInvoker : public cv::ParallelLoopBody int dstep = (int)(derivI.step/derivI.elemSize1()); int step = (int)(I.step/I.elemSize1()); - int mStep = (int)(winMaskMat.step/winMaskMat.elemSize1()); CV_Assert( step == (int)(J.step/J.elemSize1()) ); float 
A11 = 0, A12 = 0, A22 = 0; @@ -190,11 +190,10 @@ class TrackerInvoker : public cv::ParallelLoopBody int x, y; for( y = 0; y < winSize.height; y++ ) { - const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; - const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; - - short* Iptr = (short*)(IWinBuf.data + y*IWinBuf.step); - short* dIptr = (short*)(derivIWinBuf.data + y*derivIWinBuf.step); + const uchar* src = I.ptr(y + iprevPt.y, 0) + iprevPt.x*cn; + const short* dsrc = derivI.ptr(y + iprevPt.y, 0) + iprevPt.x*cn2; + short* Iptr = IWinBuf.ptr(y, 0); + short* dIptr = derivIWinBuf.ptr(y, 0); x = 0; for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2 ) { @@ -276,10 +275,10 @@ class TrackerInvoker : public cv::ParallelLoopBody buffIdx = 0; for( y = 0; y < winSize.height; y++ ) { - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y )*step + (inextPt.x )*cn; - const short* Iptr = (const short*)(IWinBuf.data + (y)*IWinBuf.step); - const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const short* Iptr = IWinBuf.ptr(y, 0); + const short* dIptr = derivIWinBuf.ptr(y, 0); + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); x = 0; for( ; x < winSize.width*cn; x++, dIptr += 2) { @@ -310,11 +309,10 @@ class TrackerInvoker : public cv::ParallelLoopBody */ for( y = 0; y < _winSize.height; y++ ) { - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x )*cn; - const short* Iptr = (const short*)(IWinBuf.data + y*IWinBuf.step); - const short* dIptr = (const short*)(derivIWinBuf.data + y*derivIWinBuf.step); - + const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const short* Iptr = IWinBuf.ptr(y, 0); + const short* dIptr = derivIWinBuf.ptr(y, 0); + const 
tMaskType* maskPtr = winMaskMat.ptr(y, 0); x = 0; for( ; x < _winSize.width*cn; x++, dIptr += 2 ) { @@ -668,7 +666,7 @@ class TrackerInvoker : public cv::ParallelLoopBody if( calcWinMaskMat(BI, windowType, iprevPt, winMaskMat,winSize,halfWin,winArea, minWinSize,maxWinSize) == false) - continue; + continue; prevPt -= halfWin; iprevPt.x = cvFloor(prevPt.x); @@ -698,7 +696,6 @@ class TrackerInvoker : public cv::ParallelLoopBody int dstep = (int)(derivI.step/derivI.elemSize1()); int step = (int)(I.step/I.elemSize1()); - int mStep = (int)(winMaskMat.step/winMaskMat.elemSize1()); CV_Assert( step == (int)(J.step/J.elemSize1()) ); float A11 = 0, A12 = 0, A22 = 0; @@ -729,14 +726,12 @@ class TrackerInvoker : public cv::ParallelLoopBody for( y = 0; y < winSize.height; y++ ) { x = 0; - - const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; - const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; - - short* Iptr = (short*)(IWinBuf.data + y*IWinBuf.step); - short* dIptr = (short*)(derivIWinBuf.data + y*derivIWinBuf.step); + const uchar* src = I.ptr(y + iprevPt.y, 0) + iprevPt.x*cn; + const short* dsrc = derivI.ptr(y + iprevPt.y, 0) + iprevPt.x*cn2; + short* Iptr = IWinBuf.ptr(y, 0); + short* dIptr = derivIWinBuf.ptr(y, 0); #ifdef RLOF_SSE - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); for( ; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 ) { __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16); @@ -885,10 +880,10 @@ class TrackerInvoker : public cv::ParallelLoopBody buffIdx = 0; for( y = 0; y < winSize.height; y++ ) { - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y )*step + (inextPt.x )*cn; - const short* Iptr = (const short*)(IWinBuf.data + (y)*IWinBuf.step); - const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); - 
const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const short* Iptr = IWinBuf.ptr(y, 0); + const short* dIptr = derivIWinBuf.ptr(y, 0); + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); x = 0; for( ; x < winSize.width*cn; x++, dIptr += 2) { @@ -950,11 +945,10 @@ class TrackerInvoker : public cv::ParallelLoopBody */ for( y = 0; y < _winSize.height; y++ ) { - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x )*cn; - const short* Iptr = (const short*)(IWinBuf.data + y*IWinBuf.step); - const short* dIptr = (const short*)(derivIWinBuf.data + y*derivIWinBuf.step); - + const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const short* Iptr = IWinBuf.ptr(y, 0); + const short* dIptr = derivIWinBuf.ptr(y, 0); + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); x = 0; #ifdef RLOF_SSE for( ; x <= _winSize.width*cn; x += 8, dIptr += 8*2 ) @@ -1623,8 +1617,8 @@ class TrackerInvoker : public cv::ParallelLoopBody cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0,0, maxWinSize,maxWinSize)); if( calcWinMaskMat(BI, windowType, iprevPt, - winMaskMat,winSize,halfWin,winArea, - minWinSize,maxWinSize) == false) + winMaskMat,winSize,halfWin,winArea, + minWinSize,maxWinSize) == false) continue; prevPt -= halfWin; @@ -1655,7 +1649,6 @@ class TrackerInvoker : public cv::ParallelLoopBody int dstep = (int)(derivI.step/derivI.elemSize1()); int step = (int)(I.step/I.elemSize1()); - int mStep = (int)(winMaskMat.step/winMaskMat.elemSize1()); CV_Assert( step == (int)(J.step/J.elemSize1()) ); float A11 = 0, A12 = 0, A22 = 0; @@ -1674,15 +1667,12 @@ class TrackerInvoker : public cv::ParallelLoopBody int x, y; for( y = 0; y < winSize.height; y++ ) { - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; - const short* 
dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; - - short* Iptr = (short*)(IWinBuf.data + y*IWinBuf.step); - short* dIptr = (short*)(derivIWinBuf.data + y*derivIWinBuf.step); - + const uchar* src = I.ptr(y + iprevPt.y, 0) + iprevPt.x*cn; + const short* dsrc = derivI.ptr(y + iprevPt.y, 0) + iprevPt.x*cn2; + short* Iptr = IWinBuf.ptr(y, 0); + short* dIptr = derivIWinBuf.ptr(y, 0); + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); x = 0; - #ifdef RLOF_SSE for( ; x < winSize.width*cn; x += 4, dsrc += 4*2, dIptr += 4*2 ) { @@ -1845,12 +1835,13 @@ class TrackerInvoker : public cv::ParallelLoopBody #endif for( y = 0; y < winSize.height; y++ ) { - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x )*cn; - const short* Iptr = (const short*)(IWinBuf.data + ( y)*IWinBuf.step); - const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); + const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const short* Iptr = IWinBuf.ptr(y, 0); + const short* dIptr = derivIWinBuf.ptr(y, 0); x = 0; #ifdef RLOF_SSE - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); for( ; x <= winSize.width*cn; x += 8, dIptr += 8*2 ) { if( maskPtr[x ] == 0 && maskPtr[x+1] == 0 && maskPtr[x+2] == 0 && maskPtr[x+3] == 0 diff --git a/modules/optflow/src/rlof/plk_invoker.hpp b/modules/optflow/src/rlof/plk_invoker.hpp index b905cffe8d1..cabf61df104 100644 --- a/modules/optflow/src/rlof/plk_invoker.hpp +++ b/modules/optflow/src/rlof/plk_invoker.hpp @@ -147,7 +147,6 @@ class TrackerInvoker : public cv::ParallelLoopBody int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; int dstep = (int)(derivI.step/derivI.elemSize1()); int step = (int)(I.step/I.elemSize1()); - int mStep = (int)(winMaskMat.step/winMaskMat.elemSize1()); CV_Assert( step == (int)(J.step/J.elemSize1()) ); float A11 = 0, A12 = 0, A22 = 0; // tensor @@ -173,13 +172,11 @@ class TrackerInvoker : public 
cv::ParallelLoopBody int x, y; for( y = 0; y < winSize.height; y++ ) { - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; - const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; - - short* Iptr = (short*)(IWinBuf.data + y*IWinBuf.step); - short* dIptr = (short*)(derivIWinBuf.data + y*derivIWinBuf.step); - + const uchar* src = I.ptr(y + iprevPt.y, 0) + iprevPt.x*cn; + const short* dsrc = derivI.ptr(y + iprevPt.y, 0) + iprevPt.x*cn2; + short* Iptr = IWinBuf.ptr(y, 0); + short* dIptr = derivIWinBuf.ptr(y, 0); + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); x = 0; #ifdef RLOF_SSE for( ; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 ) @@ -310,11 +307,10 @@ class TrackerInvoker : public cv::ParallelLoopBody #endif for( y = 0; y < winSize.height; y++ ) { - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x )*cn; - const short* Iptr = (const short*)(IWinBuf.data + ( y)*IWinBuf.step); - const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); - + const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const short* Iptr = IWinBuf.ptr(y, 0); + const short* dIptr = derivIWinBuf.ptr(y, 0); + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); x = 0; #ifdef RLOF_SSE @@ -797,7 +793,6 @@ class TrackerInvoker : public cv::ParallelLoopBody int dstep = (int)(derivI.step/derivI.elemSize1()); int step = (int)(I.step/I.elemSize1()); - int mStep = (int)(winMaskMat.step/winMaskMat.elemSize1()); CV_Assert( step == (int)(J.step/J.elemSize1()) ); float A11 = 0, A12 = 0, A22 = 0; @@ -816,15 +811,12 @@ class TrackerInvoker : public cv::ParallelLoopBody int x, y; for( y = 0; y < winSize.height; y++ ) { - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - const uchar* src = (const 
uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; - const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; - - short* Iptr = (short*)(IWinBuf.data + y*IWinBuf.step); - short* dIptr = (short*)(derivIWinBuf.data + y*derivIWinBuf.step); - + const uchar* src = I.ptr(y + iprevPt.y, 0) + iprevPt.x*cn; + const short* dsrc = derivI.ptr(y + iprevPt.y, 0) + iprevPt.x*cn2; + short* Iptr = IWinBuf.ptr(y, 0); + short* dIptr = derivIWinBuf.ptr(y, 0); + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); x = 0; - #ifdef RLOF_SSE for( ; x <= winSize.width*cn; x += 4, dsrc += 4*2, dIptr += 4*2 ) { @@ -974,13 +966,13 @@ class TrackerInvoker : public cv::ParallelLoopBody #endif for( y = 0; y < winSize.height; y++ ) { - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x )*cn; - const short* Iptr = (const short*)(IWinBuf.data + ( y)*IWinBuf.step); - const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); + const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const short* Iptr = IWinBuf.ptr(y, 0); + const short* dIptr = derivIWinBuf.ptr(y, 0); x = 0; #ifdef RLOF_SSE - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); for( ; x <= winSize.width*cn; x += 8, dIptr += 8*2 ) { if( maskPtr[x ] == 0 && maskPtr[x+1] == 0 && maskPtr[x+2] == 0 && maskPtr[x+3] == 0 diff --git a/modules/optflow/src/rlof/rlof_invoker.hpp b/modules/optflow/src/rlof/rlof_invoker.hpp index 67692c4da50..da9fb20a8ab 100644 --- a/modules/optflow/src/rlof/rlof_invoker.hpp +++ b/modules/optflow/src/rlof/rlof_invoker.hpp @@ -140,7 +140,6 @@ class TrackerInvoker : public cv::ParallelLoopBody int dstep = (int)(derivI.step / derivI.elemSize1()); int step = (int)(I.step / I.elemSize1()); - int mStep = (int)(winMaskMat.step / winMaskMat.elemSize1()); CV_Assert(step == (int)(J.step / J.elemSize1())); float A11 = 0, A12 = 0, A22 = 0; float b1 = 0, b2 = 0; @@ -162,15 +161,12 
@@ class TrackerInvoker : public cv::ParallelLoopBody int x, y; for (y = 0; y < winSize.height; y++) { - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; - const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; - - short* Iptr = (short*)(IWinBuf.data + y * IWinBuf.step); - short* dIptr = (short*)(derivIWinBuf.data + y * derivIWinBuf.step); - + const uchar* src = I.ptr(y + iprevPt.y, 0) + iprevPt.x*cn; + const short* dsrc = derivI.ptr(y + iprevPt.y, 0) + iprevPt.x*cn2; + short* Iptr = IWinBuf.ptr(y, 0); + short* dIptr = derivIWinBuf.ptr(y, 0); + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); x = 0; - #ifdef RLOF_SSE for (; x <= winSize.width*cn; x += 4, dsrc += 4 * 2, dIptr += 4 * 2) @@ -287,10 +283,10 @@ class TrackerInvoker : public cv::ParallelLoopBody buffIdx = 0; for (y = 0; y < winSize.height; y++) { - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x)*cn; - const short* Iptr = (const short*)(IWinBuf.data + (y)*IWinBuf.step); - const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const short* Iptr = IWinBuf.ptr(y, 0); + const short* dIptr = derivIWinBuf.ptr(y, 0); + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); x = 0; for (; x < winSize.width*cn; x++, dIptr += 2) { @@ -332,11 +328,10 @@ class TrackerInvoker : public cv::ParallelLoopBody buffIdx = 0; for (y = 0; y < winSize.height; y++) { - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x)*cn; - const short* Iptr = (const short*)(IWinBuf.data + (y)*IWinBuf.step); - const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); - + const uchar* Jptr = J.ptr(y + 
inextPt.y, inextPt.x*cn); + const short* Iptr = IWinBuf.ptr(y, 0); + const short* dIptr = derivIWinBuf.ptr(y, 0); + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); x = 0; #ifdef RLOF_SSE for (; x <= winSize.width*cn; x += 8, dIptr += 8 * 2) @@ -773,7 +768,6 @@ class TrackerInvoker : public cv::ParallelLoopBody int dstep = (int)(derivI.step / derivI.elemSize1()); int step = (int)(I.step / I.elemSize1()); - int mStep = (int)(winMaskMat.step / winMaskMat.elemSize1()); CV_Assert(step == (int)(J.step / J.elemSize1())); float A11 = 0, A12 = 0, A22 = 0; @@ -802,16 +796,14 @@ class TrackerInvoker : public cv::ParallelLoopBody int x, y; for (y = 0; y < winSize.height; y++) { - const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; - const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; - - short* Iptr = (short*)(IWinBuf.data + y * IWinBuf.step); - short* dIptr = (short*)(derivIWinBuf.data + y * derivIWinBuf.step); - + const uchar* src = I.ptr(y + iprevPt.y, 0) + iprevPt.x*cn; + const short* dsrc = derivI.ptr(y + iprevPt.y, 0) + iprevPt.x*cn2; + short* Iptr = IWinBuf.ptr(y, 0); + short* dIptr = derivIWinBuf.ptr(y, 0); x = 0; #ifdef RLOF_SSE - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); for (; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4 * 2, dIptr += 4 * 2) { __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16); @@ -947,9 +939,9 @@ class TrackerInvoker : public cv::ParallelLoopBody buffIdx = 0; for (y = 0; y < winSize.height; y++) { - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x)*cn; - const short* Iptr = (const short*)(IWinBuf.data + (y)*IWinBuf.step); - const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); + const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const short* Iptr = IWinBuf.ptr(y, 0); + 
const short* dIptr = derivIWinBuf.ptr(y, 0); x = 0; for (; x < winSize.width*cn; x++, dIptr += 2) { @@ -1008,11 +1000,10 @@ class TrackerInvoker : public cv::ParallelLoopBody for (y = 0; y < _winSize.height; y++) { - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x)*cn; - const short* Iptr = (const short*)(IWinBuf.data + y * IWinBuf.step); - const short* dIptr = (const short*)(derivIWinBuf.data + y * derivIWinBuf.step); - + const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const short* Iptr = IWinBuf.ptr(y, 0); + const short* dIptr = derivIWinBuf.ptr(y, 0); + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); x = 0; #ifdef RLOF_SSE diff --git a/modules/optflow/src/rlof/rlof_localflow.cpp b/modules/optflow/src/rlof/rlof_localflow.cpp index 7a6227902b7..1bc000d8fa1 100644 --- a/modules/optflow/src/rlof/rlof_localflow.cpp +++ b/modules/optflow/src/rlof/rlof_localflow.cpp @@ -606,7 +606,7 @@ void preprocess(Ptr prevPyramids[2], // Initialize point grid int stepr = prevPyramids[0]->m_Image.rows / 30; - int stepc = prevPyramids[0]->m_Image.rows / 40; + int stepc = prevPyramids[0]->m_Image.cols / 40; for (int r = stepr / 2; r < prevPyramids[0]->m_Image.rows; r += stepr) { for (int c = stepc / 2; c < prevPyramids[0]->m_Image.cols; c += stepc) From 873e03a4fc632909337a7888923245bd868221e6 Mon Sep 17 00:00:00 2001 From: senst Date: Sat, 15 Dec 2018 21:13:23 +0100 Subject: [PATCH 22/25] Avoid using "data" and "step" calculations. Use .ptr(row, col) instead. 
--- modules/optflow/src/rlof/berlof_invoker.hpp | 112 +++++++++----------- modules/optflow/src/rlof/plk_invoker.hpp | 65 ++++++------ modules/optflow/src/rlof/rlof_invoker.hpp | 74 ++++++------- modules/optflow/src/rlof/rlof_localflow.cpp | 3 +- modules/optflow/src/rlofflow.cpp | 4 +- 5 files changed, 125 insertions(+), 133 deletions(-) diff --git a/modules/optflow/src/rlof/berlof_invoker.hpp b/modules/optflow/src/rlof/berlof_invoker.hpp index 1a7f6e10502..1f7b3ac8384 100644 --- a/modules/optflow/src/rlof/berlof_invoker.hpp +++ b/modules/optflow/src/rlof/berlof_invoker.hpp @@ -179,10 +179,6 @@ class TrackerInvoker : public cv::ParallelLoopBody int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - int dstep = (int)(derivI.step/derivI.elemSize1()); - int step = (int)(I.step/I.elemSize1()); - CV_Assert( step == (int)(J.step/J.elemSize1()) ); - float A11 = 0, A12 = 0, A22 = 0; float D = 0; @@ -191,11 +187,13 @@ class TrackerInvoker : public cv::ParallelLoopBody for( y = 0; y < winSize.height; y++ ) { const uchar* src = I.ptr(y + iprevPt.y, 0) + iprevPt.x*cn; + const uchar* src1 = I.ptr(y + iprevPt.y + 1, 0) + iprevPt.x*cn; const short* dsrc = derivI.ptr(y + iprevPt.y, 0) + iprevPt.x*cn2; + const short* dsrc1 = derivI.ptr(y + iprevPt.y + 1, 0) + iprevPt.x*cn2; short* Iptr = IWinBuf.ptr(y, 0); short* dIptr = derivIWinBuf.ptr(y, 0); x = 0; - for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2 ) + for( ; x < winSize.width*cn; x++, dsrc += 2, dsrc1 += 2, dIptr += 2 ) { if( winMaskMat.at(y,x) == 0) { @@ -204,11 +202,11 @@ class TrackerInvoker : public cv::ParallelLoopBody continue; } int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + - src[x+step]*iw10 + src[x+step+cn]*iw11, W_BITS1-5); + src1[x]*iw10 + src1[x+cn]*iw11, W_BITS1-5); int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + - dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1); - int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 + - 
dsrc[dstep+cn2+1]*iw11, W_BITS1); + dsrc1[0]*iw10 + dsrc1[cn2]*iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + + dsrc1[1]*iw10 + dsrc1[cn2+1]*iw11, W_BITS1); Iptr[x] = (short)ival; dIptr[0] = (short)ixval; dIptr[1] = (short)iyval; @@ -276,6 +274,7 @@ class TrackerInvoker : public cv::ParallelLoopBody for( y = 0; y < winSize.height; y++ ) { const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const uchar* Jptr1 = J.ptr(y + inextPt.y + 1, inextPt.x*cn); const short* Iptr = IWinBuf.ptr(y, 0); const short* dIptr = derivIWinBuf.ptr(y, 0); const tMaskType* maskPtr = winMaskMat.ptr(y, 0); @@ -284,7 +283,7 @@ class TrackerInvoker : public cv::ParallelLoopBody { if( maskPtr[x] == 0) continue; - int diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, W_BITS1-5) + int diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + Jptr1[x]*iw10 + Jptr1[x+cn]*iw11, W_BITS1-5) - Iptr[x]; residualMat.at(buffIdx++) = static_cast(diff); } @@ -310,6 +309,7 @@ class TrackerInvoker : public cv::ParallelLoopBody for( y = 0; y < _winSize.height; y++ ) { const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const uchar* Jptr1 = J.ptr(y + inextPt.y + 1, inextPt.x*cn); const short* Iptr = IWinBuf.ptr(y, 0); const short* dIptr = derivIWinBuf.ptr(y, 0); const tMaskType* maskPtr = winMaskMat.ptr(y, 0); @@ -320,15 +320,15 @@ class TrackerInvoker : public cv::ParallelLoopBody continue; int illValue = - Iptr[x]; - float It[4] = {static_cast((Jptr[x+step+cn]<< 5) + illValue), + float It[4] = {static_cast((Jptr1[x+cn]<< 5) + illValue), static_cast((Jptr[x+cn]<< 5) + illValue), - static_cast((Jptr[x+step]<< 5) + illValue), + static_cast((Jptr1[x]<< 5) + illValue), static_cast((Jptr[x] << 5) + illValue)}; int J_val = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + - Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, + Jptr1[x]*iw10 + Jptr1[x+cn]*iw11, W_BITS1-5); @@ -693,11 +693,6 @@ class TrackerInvoker : public cv::ParallelLoopBody int iw01 = 
cvRound(a*(1.f - b)*(1 << W_BITS)); int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - - int dstep = (int)(derivI.step/derivI.elemSize1()); - int step = (int)(I.step/I.elemSize1()); - CV_Assert( step == (int)(J.step/J.elemSize1()) ); - float A11 = 0, A12 = 0, A22 = 0; // tensor @@ -726,13 +721,15 @@ class TrackerInvoker : public cv::ParallelLoopBody for( y = 0; y < winSize.height; y++ ) { x = 0; - const uchar* src = I.ptr(y + iprevPt.y, 0) + iprevPt.x*cn; - const short* dsrc = derivI.ptr(y + iprevPt.y, 0) + iprevPt.x*cn2; + const uchar* src = I.ptr(y + iprevPt.y, 0) + iprevPt.x*cn; + const uchar* src1 = I.ptr(y + iprevPt.y + 1, 0) + iprevPt.x*cn; + const short* dsrc = derivI.ptr(y + iprevPt.y, 0) + iprevPt.x*cn2; + const short* dsrc1 = derivI.ptr(y + iprevPt.y + 1, 0) + iprevPt.x*cn2; short* Iptr = IWinBuf.ptr(y, 0); short* dIptr = derivIWinBuf.ptr(y, 0); #ifdef RLOF_SSE const tMaskType* maskPtr = winMaskMat.ptr(y, 0); - for( ; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 ) + for( ; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4*2, dsrc1 += 8, dIptr += 4*2 ) { __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16); __m128i mask_0_3_epi16 = _mm_unpacklo_epi16(mask_0_7_epi16, mask_0_7_epi16); @@ -741,8 +738,8 @@ class TrackerInvoker : public cv::ParallelLoopBody __m128i v00, v01, v10, v11, t0, t1; v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); - v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); - v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); + v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x)), z); + v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x + cn)), z); t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), 
_mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); @@ -757,8 +754,8 @@ class TrackerInvoker : public cv::ParallelLoopBody v00 = _mm_loadu_si128((const __m128i*)(dsrc)); v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); - v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep)); - v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); + v10 = _mm_loadu_si128((const __m128i*)(dsrc1)); + v11 = _mm_loadu_si128((const __m128i*)(dsrc1 + cn2)); t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); @@ -776,7 +773,7 @@ class TrackerInvoker : public cv::ParallelLoopBody } #else - for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2 ) + for( ; x < winSize.width*cn; x++, dsrc += 2, dsrc1 += 2, dIptr += 2 ) { if( winMaskMat.at(y,x) == 0) { @@ -785,11 +782,11 @@ class TrackerInvoker : public cv::ParallelLoopBody continue; } int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + - src[x+step]*iw10 + src[x+step+cn]*iw11, W_BITS1-5); + src1[x]*iw10 + src1[x+cn]*iw11, W_BITS1-5); int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + - dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1); - int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 + - dsrc[dstep+cn2+1]*iw11, W_BITS1); + dsrc1[0]*iw10 + dsrc1[cn2]*iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + + dsrc1[1]*iw10 + dsrc1[cn2+1]*iw11, W_BITS1); Iptr[x] = (short)ival; dIptr[0] = (short)ixval; @@ -881,6 +878,7 @@ class TrackerInvoker : public cv::ParallelLoopBody for( y = 0; y < winSize.height; y++ ) { const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const uchar* Jptr1 = J.ptr(y + inextPt.y + 1, inextPt.x*cn); const short* Iptr = IWinBuf.ptr(y, 0); const short* dIptr = derivIWinBuf.ptr(y, 0); const tMaskType* maskPtr = winMaskMat.ptr(y, 0); @@ -889,7 +887,7 @@ class TrackerInvoker : public cv::ParallelLoopBody { if( maskPtr[x] == 0) continue; - int diff = static_cast(CV_DESCALE(Jptr[x]*iw00 + 
Jptr[x+cn]*iw01 + Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, W_BITS1-5) + int diff = static_cast(CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + Jptr1[x]*iw10 + Jptr1[x+cn]*iw11, W_BITS1-5) - Iptr[x] + Iptr[x] * gainVec.x +gainVec.y); residualMat.at(buffIdx++) = static_cast(diff); } @@ -945,7 +943,8 @@ class TrackerInvoker : public cv::ParallelLoopBody */ for( y = 0; y < _winSize.height; y++ ) { - const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const uchar* Jptr1 = J.ptr(y + inextPt.y + 1, inextPt.x*cn); const short* Iptr = IWinBuf.ptr(y, 0); const short* dIptr = derivIWinBuf.ptr(y, 0); const tMaskType* maskPtr = winMaskMat.ptr(y, 0); @@ -959,8 +958,8 @@ class TrackerInvoker : public cv::ParallelLoopBody __m128i v00 = _mm_unpacklo_epi8( _mm_loadl_epi64((const __m128i*)(Jptr + x)), z); __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); - __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); - __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); + __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x)), z); + __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x + cn)), z); __m128i t0 = _mm_add_epi32 (_mm_madd_epi16( @@ -1164,14 +1163,14 @@ class TrackerInvoker : public cv::ParallelLoopBody short iyval = dIptr[1]; int illValue = static_cast(Iptr[x] * gainVec.x + gainVec.y - Iptr[x]); - int It[4] = {(Jptr[x+step+cn]<< 5) + illValue, + int It[4] = {(Jptr1[x+cn]<< 5) + illValue, (Jptr[x+cn]<< 5) + illValue, - (Jptr[x+step]<< 5) + illValue, + (Jptr1[x]<< 5) + illValue, (Jptr[x] << 5) + illValue}; int J_val = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + - Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, + Jptr1[x]*iw10 + Jptr1[x+cn]*iw11, W_BITS1-5); int diff = J_val + illValue; @@ -1646,10 +1645,6 @@ class TrackerInvoker : public cv::ParallelLoopBody int iw01 = 
cvRound(a*(1.f - b)*(1 << W_BITS)); int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - - int dstep = (int)(derivI.step/derivI.elemSize1()); - int step = (int)(I.step/I.elemSize1()); - CV_Assert( step == (int)(J.step/J.elemSize1()) ); float A11 = 0, A12 = 0, A22 = 0; #ifdef RLOF_SSE @@ -1668,13 +1663,15 @@ class TrackerInvoker : public cv::ParallelLoopBody for( y = 0; y < winSize.height; y++ ) { const uchar* src = I.ptr(y + iprevPt.y, 0) + iprevPt.x*cn; + const uchar* src1 = I.ptr(y + iprevPt.y + 1, 0) + iprevPt.x*cn; const short* dsrc = derivI.ptr(y + iprevPt.y, 0) + iprevPt.x*cn2; + const short* dsrc1 = derivI.ptr(y + iprevPt.y + 1, 0) + iprevPt.x*cn2; short* Iptr = IWinBuf.ptr(y, 0); short* dIptr = derivIWinBuf.ptr(y, 0); const tMaskType* maskPtr = winMaskMat.ptr(y, 0); x = 0; #ifdef RLOF_SSE - for( ; x < winSize.width*cn; x += 4, dsrc += 4*2, dIptr += 4*2 ) + for( ; x < winSize.width*cn; x += 4, dsrc += 4*2, dsrc1 += 8,dIptr += 4*2 ) { __m128i wMask = _mm_set_epi32(MaskSet * maskPtr[x+3], MaskSet * maskPtr[x+2], @@ -1683,8 +1680,8 @@ class TrackerInvoker : public cv::ParallelLoopBody __m128i v00, v01, v10, v11, t0, t1; v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); - v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); - v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); + v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x)), z); + v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x + cn)), z); t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); @@ -1694,8 +1691,8 @@ class TrackerInvoker : public cv::ParallelLoopBody v00 = _mm_loadu_si128((const __m128i*)(dsrc)); v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); - v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep)); - v11
= _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); + v10 = _mm_loadu_si128((const __m128i*)(dsrc1)); + v11 = _mm_loadu_si128((const __m128i*)(dsrc1 + cn2)); t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); @@ -1724,7 +1721,7 @@ class TrackerInvoker : public cv::ParallelLoopBody } #else - for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2) + for( ; x < winSize.width*cn; x++, dsrc += 2, dsrc1 += 2, dIptr += 2) { if( maskPtr[x] == 0) { @@ -1734,11 +1731,11 @@ class TrackerInvoker : public cv::ParallelLoopBody continue; } int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + - src[x+step]*iw10 + src[x+step+cn]*iw11, W_BITS1-5); + src1[x]*iw10 + src1[x+cn]*iw11, W_BITS1-5); int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + - dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1); - int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 + - dsrc[dstep+cn2+1]*iw11, W_BITS1); + dsrc1[0]*iw10 + dsrc1[cn2]*iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc1[1]*iw10 + + dsrc1[cn2+1]*iw11, W_BITS1); Iptr[x] = (short)ival; dIptr[0] = (short)ixval; @@ -1836,6 +1833,7 @@ class TrackerInvoker : public cv::ParallelLoopBody for( y = 0; y < winSize.height; y++ ) { const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const uchar* Jptr1 = J.ptr(y + inextPt.y + 1, inextPt.x*cn); const short* Iptr = IWinBuf.ptr(y, 0); const short* dIptr = derivIWinBuf.ptr(y, 0); x = 0; @@ -1850,8 +1848,8 @@ class TrackerInvoker : public cv::ParallelLoopBody __m128i diff0 = _mm_loadu_si128((const __m128i*)(Iptr + x)); __m128i v00 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x)), z); __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); - __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); - __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); + __m128i
v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x)), z); + __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x + cn)), z); __m128i mmDiffc_epi16[4] = { _mm_subs_epi16(_mm_slli_epi16(v00, 5), diff0), @@ -1953,14 +1951,10 @@ class TrackerInvoker : public cv::ParallelLoopBody { if( dIptr[0] == 0 && dIptr[1] == 0) continue; - /*short It[4] = {CV_DESCALE((Jptr[x]) *(1 << W_BITS),W_BITS1-5) - Iptr[x], - CV_DESCALE((Jptr[x+cn]) *(1 << W_BITS),W_BITS1-5) - Iptr[x], - CV_DESCALE((Jptr[x+step]) *(1 << W_BITS),W_BITS1-5) - Iptr[x], - CV_DESCALE((Jptr[x+step+cn])*(1 << W_BITS),W_BITS1-5) - Iptr[x]};*/ short It[4] = {(Jptr[x] << 5) - Iptr[x], (Jptr[x+cn]<< 5) - Iptr[x], - (Jptr[x+step]<< 5) - Iptr[x], - (Jptr[x+step+cn]<< 5) - Iptr[x]}; + (Jptr1[x]<< 5) - Iptr[x], + (Jptr1[x+cn]<< 5) - Iptr[x]}; _b1[0] += (float)(It[0]*dIptr[0]); _b1[1] += (float)(It[1]*dIptr[0]); diff --git a/modules/optflow/src/rlof/plk_invoker.hpp b/modules/optflow/src/rlof/plk_invoker.hpp index cabf61df104..3f76ce460a4 100644 --- a/modules/optflow/src/rlof/plk_invoker.hpp +++ b/modules/optflow/src/rlof/plk_invoker.hpp @@ -145,9 +145,6 @@ class TrackerInvoker : public cv::ParallelLoopBody int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - int dstep = (int)(derivI.step/derivI.elemSize1()); - int step = (int)(I.step/I.elemSize1()); - CV_Assert( step == (int)(J.step/J.elemSize1()) ); float A11 = 0, A12 = 0, A22 = 0; // tensor float sumIx = 0; @@ -173,21 +170,23 @@ class TrackerInvoker : public cv::ParallelLoopBody for( y = 0; y < winSize.height; y++ ) { const uchar* src = I.ptr(y + iprevPt.y, 0) + iprevPt.x*cn; + const uchar* src1 = I.ptr(y + iprevPt.y + 1, 0) + iprevPt.x*cn; const short* dsrc = derivI.ptr(y + iprevPt.y, 0) + iprevPt.x*cn2; + const short* dsrc1 = derivI.ptr(y + iprevPt.y + 1, 0) + iprevPt.x*cn2; short* Iptr = IWinBuf.ptr(y, 0); short* dIptr = derivIWinBuf.ptr(y, 
0); const tMaskType* maskPtr = winMaskMat.ptr(y, 0); x = 0; #ifdef RLOF_SSE - for( ; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 ) + for( ; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4*2, dsrc1 += 8, dIptr += 4*2 ) { __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16); __m128i mask_0_3_epi16 = _mm_unpacklo_epi16(mask_0_7_epi16, mask_0_7_epi16); __m128i v00, v01, v10, v11, t0, t1; v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); - v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); - v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); + v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x)), z); + v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x + cn)), z); t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); @@ -202,8 +201,8 @@ class TrackerInvoker : public cv::ParallelLoopBody v00 = _mm_loadu_si128((const __m128i*)(dsrc)); v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); - v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep)); - v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); + v10 = _mm_loadu_si128((const __m128i*)(dsrc1)); + v11 = _mm_loadu_si128((const __m128i*)(dsrc1 + cn2)); t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); @@ -220,7 +219,7 @@ class TrackerInvoker : public cv::ParallelLoopBody _mm_storeu_si128((__m128i*)dIptr, v00); } #else - for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2 ) + for( ; x < winSize.width*cn; x++, dsrc += 2, dsrc1 += 2, dIptr += 2 ) { if( maskPtr[x] == 0) { @@ -229,11 +228,11 @@ class TrackerInvoker : public cv::ParallelLoopBody continue; } int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + - 
src[x+step]*iw10 + src[x+step+cn]*iw11, W_BITS1-5); + src1[x]*iw10 + src1[x+cn]*iw11, W_BITS1-5); int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + - dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1); - int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 + - dsrc[dstep+cn2+1]*iw11, W_BITS1); + dsrc1[0]*iw10 + dsrc1[cn2]*iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc1[1]*iw10 + + dsrc1[cn2+1]*iw11, W_BITS1); Iptr[x] = (short)ival; dIptr[0] = (short)ixval; @@ -308,6 +307,7 @@ class TrackerInvoker : public cv::ParallelLoopBody for( y = 0; y < winSize.height; y++ ) { const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const uchar* Jptr1 = J.ptr(y + inextPt.y + 1, inextPt.x*cn); const short* Iptr = IWinBuf.ptr(y, 0); const short* dIptr = derivIWinBuf.ptr(y, 0); const tMaskType* maskPtr = winMaskMat.ptr(y, 0); @@ -328,8 +328,8 @@ class TrackerInvoker : public cv::ParallelLoopBody _mm_loadl_epi64((const __m128i*)(Jptr + x)) // J0, J1, J2, ..., J7, 64 bit nullen , z); //J0 , 0, J1, 0, J2, 0 ... 
J7,0 __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); - __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); - __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); + __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x)), z); + __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x + cn)), z); __m128i t0 = _mm_add_epi32 @@ -465,7 +465,7 @@ class TrackerInvoker : public cv::ParallelLoopBody if( maskPtr[x] == 0) continue; int J_val = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + - Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, + Jptr1[x]*iw10 + Jptr1[x+cn]*iw11, W_BITS1-5); int diff = static_cast(J_val - Iptr[x] + Iptr[x] * gainVec.x + gainVec.y); @@ -790,10 +790,6 @@ class TrackerInvoker : public cv::ParallelLoopBody int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - - int dstep = (int)(derivI.step/derivI.elemSize1()); - int step = (int)(I.step/I.elemSize1()); - CV_Assert( step == (int)(J.step/J.elemSize1()) ); float A11 = 0, A12 = 0, A22 = 0; #ifdef RLOF_SSE @@ -812,13 +808,15 @@ class TrackerInvoker : public cv::ParallelLoopBody for( y = 0; y < winSize.height; y++ ) { const uchar* src = I.ptr(y + iprevPt.y, 0) + iprevPt.x*cn; + const uchar* src1 = I.ptr(y + iprevPt.y + 1, 0) + iprevPt.x*cn; const short* dsrc = derivI.ptr(y + iprevPt.y, 0) + iprevPt.x*cn2; + const short* dsrc1 = derivI.ptr(y + iprevPt.y + 1, 0) + iprevPt.x*cn2; short* Iptr = IWinBuf.ptr(y, 0); short* dIptr = derivIWinBuf.ptr(y, 0); const tMaskType* maskPtr = winMaskMat.ptr(y, 0); x = 0; #ifdef RLOF_SSE - for( ; x <= winSize.width*cn; x += 4, dsrc += 4*2, dIptr += 4*2 ) + for( ; x <= winSize.width*cn; x += 4, dsrc += 4*2, dsrc1 += 8, dIptr += 4*2 ) { __m128i wMask = _mm_set_epi32(MaskSet * maskPtr[x+3], MaskSet * maskPtr[x+2], @@ -827,8 +825,8 @@ class 
TrackerInvoker : public cv::ParallelLoopBody __m128i v00, v01, v10, v11, t0, t1; v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); - v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); - v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); + v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x)), z); + v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x + cn)), z); t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); @@ -843,8 +841,8 @@ class TrackerInvoker : public cv::ParallelLoopBody v00 = _mm_loadu_si128((const __m128i*)(dsrc)); v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); - v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep)); - v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); + v10 = _mm_loadu_si128((const __m128i*)(dsrc1)); + v11 = _mm_loadu_si128((const __m128i*)(dsrc1 + cn2)); t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); @@ -874,7 +872,7 @@ class TrackerInvoker : public cv::ParallelLoopBody } #else - for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2 ) + for( ; x < winSize.width*cn; x++, dsrc += 2, dsrc1 += 2, dIptr += 2 ) { if( maskPtr[x] == 0) { @@ -883,11 +881,11 @@ class TrackerInvoker : public cv::ParallelLoopBody continue; } int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + - src[x+step]*iw10 + src[x+step+cn]*iw11, W_BITS1-5); + src1[x]*iw10 + src1[x+cn]*iw11, W_BITS1-5); int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + - dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1); - int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 + - dsrc[dstep+cn2+1]*iw11, W_BITS1); + dsrc1[0]*iw10 + dsrc1[cn2]*iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + 
dsrc1[1]*iw10 + + dsrc1[cn2+1]*iw11, W_BITS1); Iptr[x] = (short)ival; dIptr[0] = (short)ixval; @@ -967,6 +965,7 @@ class TrackerInvoker : public cv::ParallelLoopBody for( y = 0; y < winSize.height; y++ ) { const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const uchar* Jptr1 = J.ptr(y + inextPt.y + 1, inextPt.x*cn); const short* Iptr = IWinBuf.ptr(y, 0); const short* dIptr = derivIWinBuf.ptr(y, 0); @@ -981,8 +980,8 @@ class TrackerInvoker : public cv::ParallelLoopBody __m128i diff0 = _mm_loadu_si128((const __m128i*)(Iptr + x)), diff1; __m128i v00 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x)), z); __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); - __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); - __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); + __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x)), z); + __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x + cn)), z); __m128i t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); @@ -1030,7 +1029,7 @@ class TrackerInvoker : public cv::ParallelLoopBody if( dIptr[0] == 0 && dIptr[1] == 0) continue; int diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + - Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, + Jptr1[x]*iw10 + Jptr1[x+cn]*iw11, W_BITS1-5) - Iptr[x]; b1 += (float)(diff*dIptr[0]) * FLT_RESCALE; diff --git a/modules/optflow/src/rlof/rlof_invoker.hpp b/modules/optflow/src/rlof/rlof_invoker.hpp index da9fb20a8ab..38648871273 100644 --- a/modules/optflow/src/rlof/rlof_invoker.hpp +++ b/modules/optflow/src/rlof/rlof_invoker.hpp @@ -137,10 +137,6 @@ class TrackerInvoker : public cv::ParallelLoopBody int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - - int dstep = 
(int)(derivI.step / derivI.elemSize1()); - int step = (int)(I.step / I.elemSize1()); - CV_Assert(step == (int)(J.step / J.elemSize1())); float A11 = 0, A12 = 0, A22 = 0; float b1 = 0, b2 = 0; float D = 0; @@ -162,14 +158,16 @@ class TrackerInvoker : public cv::ParallelLoopBody for (y = 0; y < winSize.height; y++) { const uchar* src = I.ptr(y + iprevPt.y, 0) + iprevPt.x*cn; + const uchar* src1 = I.ptr(y + iprevPt.y + 1, 0) + iprevPt.x*cn; const short* dsrc = derivI.ptr(y + iprevPt.y, 0) + iprevPt.x*cn2; + const short* dsrc1 = derivI.ptr(y + iprevPt.y + 1, 0) + iprevPt.x*cn2; short* Iptr = IWinBuf.ptr(y, 0); short* dIptr = derivIWinBuf.ptr(y, 0); const tMaskType* maskPtr = winMaskMat.ptr(y, 0); x = 0; #ifdef RLOF_SSE - for (; x <= winSize.width*cn; x += 4, dsrc += 4 * 2, dIptr += 4 * 2) + for (; x <= winSize.width*cn; x += 4, dsrc += 4 * 2, dsrc1 += 8, dIptr += 4 * 2) { __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16); __m128i mask_0_3_epi16 = _mm_unpacklo_epi16(mask_0_7_epi16, mask_0_7_epi16); @@ -177,8 +175,8 @@ class TrackerInvoker : public cv::ParallelLoopBody __m128i v00, v01, v10, v11, t0, t1; v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); - v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); - v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); + v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x)), z); + v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x + cn)), z); t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); @@ -192,8 +190,8 @@ class TrackerInvoker : public cv::ParallelLoopBody v00 = _mm_loadu_si128((const __m128i*)(dsrc)); v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); - v10 = _mm_loadu_si128((const __m128i*)(dsrc + 
dstep)); - v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); + v10 = _mm_loadu_si128((const __m128i*)(dsrc1)); + v11 = _mm_loadu_si128((const __m128i*)(dsrc1 + cn2)); t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); @@ -211,7 +209,7 @@ class TrackerInvoker : public cv::ParallelLoopBody } #else - for (; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2) + for (; x < winSize.width*cn; x++, dsrc += 2, dsrc1 += 2, dIptr += 2) { if (maskPtr[x] == 0) { @@ -220,11 +218,11 @@ class TrackerInvoker : public cv::ParallelLoopBody continue; } int ival = CV_DESCALE(src[x] * iw00 + src[x + cn] * iw01 + - src[x + step] * iw10 + src[x + step + cn] * iw11, W_BITS1 - 5); + src1[x] * iw10 + src1[x + cn] * iw11, W_BITS1 - 5); int ixval = CV_DESCALE(dsrc[0] * iw00 + dsrc[cn2] * iw01 + - dsrc[dstep] * iw10 + dsrc[dstep + cn2] * iw11, W_BITS1); - int iyval = CV_DESCALE(dsrc[1] * iw00 + dsrc[cn2 + 1] * iw01 + dsrc[dstep + 1] * iw10 + - dsrc[dstep + cn2 + 1] * iw11, W_BITS1); + dsrc1[0] * iw10 + dsrc1[cn2] * iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1] * iw00 + dsrc[cn2 + 1] * iw01 + dsrc1[1] * iw10 + + dsrc1[cn2 + 1] * iw11, W_BITS1); Iptr[x] = (short)ival; dIptr[0] = (short)ixval; @@ -284,6 +282,7 @@ class TrackerInvoker : public cv::ParallelLoopBody for (y = 0; y < winSize.height; y++) { const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const uchar* Jptr1 = J.ptr(y + inextPt.y + 1, inextPt.x*cn); const short* Iptr = IWinBuf.ptr(y, 0); const short* dIptr = derivIWinBuf.ptr(y, 0); const tMaskType* maskPtr = winMaskMat.ptr(y, 0); @@ -292,7 +291,7 @@ class TrackerInvoker : public cv::ParallelLoopBody { if (maskPtr[x] == 0) continue; - int diff = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + Jptr[x + step] * iw10 + Jptr[x + step + cn] * iw11, W_BITS1 - 5) - Iptr[x]; + int diff = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + Jptr1[x] * iw10 + Jptr1[x + cn] * iw11, W_BITS1 - 5) - Iptr[x]; 
residualMat.at(buffIdx++) = static_cast(diff); } } @@ -329,6 +328,7 @@ class TrackerInvoker : public cv::ParallelLoopBody for (y = 0; y < winSize.height; y++) { const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const uchar* Jptr1 = J.ptr(y + inextPt.y + 1, inextPt.x*cn); const short* Iptr = IWinBuf.ptr(y, 0); const short* dIptr = derivIWinBuf.ptr(y, 0); const tMaskType* maskPtr = winMaskMat.ptr(y, 0); @@ -342,8 +342,8 @@ class TrackerInvoker : public cv::ParallelLoopBody __m128i v00 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x)), z); __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); - __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); - __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); + __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x)), z); + __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x + cn)), z); __m128i t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); @@ -464,7 +464,7 @@ class TrackerInvoker : public cv::ParallelLoopBody if (maskPtr[x] == 0) continue; int diff = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + - Jptr[x + step] * iw10 + Jptr[x + step + cn] * iw11, + Jptr1[x] * iw10 + Jptr1[x + cn] * iw11, W_BITS1 - 5) - Iptr[x]; @@ -766,10 +766,6 @@ class TrackerInvoker : public cv::ParallelLoopBody int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - int dstep = (int)(derivI.step / derivI.elemSize1()); - int step = (int)(I.step / I.elemSize1()); - CV_Assert(step == (int)(J.step / J.elemSize1())); - float A11 = 0, A12 = 0, A22 = 0; float b1 = 0, b2 = 0, b3 = 0, b4 = 0; // tensor @@ -797,22 +793,24 @@ class TrackerInvoker : public cv::ParallelLoopBody for (y = 0; y < winSize.height; y++) { const uchar* src = I.ptr(y + iprevPt.y, 0) + iprevPt.x*cn; + 
const uchar* src1 = I.ptr(y + iprevPt.y + 1, 0) + iprevPt.x*cn; const short* dsrc = derivI.ptr(y + iprevPt.y, 0) + iprevPt.x*cn2; + const short* dsrc1 = derivI.ptr(y + iprevPt.y + 1, 0) + iprevPt.x*cn2; short* Iptr = IWinBuf.ptr(y, 0); short* dIptr = derivIWinBuf.ptr(y, 0); x = 0; #ifdef RLOF_SSE const tMaskType* maskPtr = winMaskMat.ptr(y, 0); - for (; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4 * 2, dIptr += 4 * 2) + for (; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4 * 2, dsrc1 += 8, dIptr += 4 * 2) { __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16); __m128i mask_0_3_epi16 = _mm_unpacklo_epi16(mask_0_7_epi16, mask_0_7_epi16); __m128i v00, v01, v10, v11, t0, t1; v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); - v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); - v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); + v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x)), z); + v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x + cn)), z); t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); @@ -827,8 +825,8 @@ class TrackerInvoker : public cv::ParallelLoopBody v00 = _mm_loadu_si128((const __m128i*)(dsrc)); v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); - v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep)); - v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); + v10 = _mm_loadu_si128((const __m128i*)(dsrc1)); + v11 = _mm_loadu_si128((const __m128i*)(dsrc1 + cn2)); t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); @@ -846,7 +844,7 @@ class TrackerInvoker : public cv::ParallelLoopBody } #else - for (; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2) 
+ for (; x < winSize.width*cn; x++, dsrc += 2, dsrc1 += 2, dIptr += 2) { if (winMaskMat.at(y, x) == 0) { @@ -855,11 +853,11 @@ class TrackerInvoker : public cv::ParallelLoopBody continue; } int ival = CV_DESCALE(src[x] * iw00 + src[x + cn] * iw01 + - src[x + step] * iw10 + src[x + step + cn] * iw11, W_BITS1 - 5); + src1[x] * iw10 + src1[x + cn] * iw11, W_BITS1 - 5); int ixval = CV_DESCALE(dsrc[0] * iw00 + dsrc[cn2] * iw01 + - dsrc[dstep] * iw10 + dsrc[dstep + cn2] * iw11, W_BITS1); - int iyval = CV_DESCALE(dsrc[1] * iw00 + dsrc[cn2 + 1] * iw01 + dsrc[dstep + 1] * iw10 + - dsrc[dstep + cn2 + 1] * iw11, W_BITS1); + dsrc1[0] * iw10 + dsrc1[ cn2] * iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1] * iw00 + dsrc[cn2 + 1] * iw01 + dsrc1[1] * iw10 + + dsrc1[cn2 + 1] * iw11, W_BITS1); Iptr[x] = (short)ival; dIptr[0] = (short)ixval; @@ -940,6 +938,7 @@ class TrackerInvoker : public cv::ParallelLoopBody for (y = 0; y < winSize.height; y++) { const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const uchar* Jptr1 = J.ptr(y + inextPt.y + 1, inextPt.x*cn); const short* Iptr = IWinBuf.ptr(y, 0); const short* dIptr = derivIWinBuf.ptr(y, 0); x = 0; @@ -949,8 +948,8 @@ class TrackerInvoker : public cv::ParallelLoopBody continue; int diff = static_cast(CV_DESCALE( Jptr[x] * iw00 + Jptr[x + cn] * iw01 + - Jptr[x + step] * iw10 + - Jptr[x + step + cn] * iw11, W_BITS1 - 5) + Jptr1[x] * iw10 + + Jptr1[x + cn] * iw11, W_BITS1 - 5) - Iptr[x] + Iptr[x] * gainVec.x + gainVec.y); residualMat.at(buffIdx++) = static_cast(diff); } @@ -1001,6 +1000,7 @@ class TrackerInvoker : public cv::ParallelLoopBody for (y = 0; y < _winSize.height; y++) { const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const uchar* Jptr1 = J.ptr(y + inextPt.y + 1, inextPt.x*cn); const short* Iptr = IWinBuf.ptr(y, 0); const short* dIptr = derivIWinBuf.ptr(y, 0); const tMaskType* maskPtr = winMaskMat.ptr(y, 0); @@ -1016,8 +1016,8 @@ class TrackerInvoker : public cv::ParallelLoopBody __m128i v00 = 
_mm_unpacklo_epi8( _mm_loadl_epi64((const __m128i*)(Jptr + x)) , z); //J0 , 0, J1, 0, J2, 0 ... J7,0 __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); - __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); - __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); + __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x)), z); + __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x + cn)), z); __m128i t0 = _mm_add_epi32 (_mm_madd_epi16( @@ -1205,7 +1205,7 @@ class TrackerInvoker : public cv::ParallelLoopBody { if (maskPtr[x] == 0) continue; - int J_val = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + Jptr[x + step] * iw10 + Jptr[x + step + cn] * iw11, + int J_val = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + Jptr1[x] * iw10 + Jptr1[x + cn] * iw11, W_BITS1 - 5); short ixval = static_cast(dIptr[0]); short iyval = static_cast(dIptr[1]); diff --git a/modules/optflow/src/rlof/rlof_localflow.cpp b/modules/optflow/src/rlof/rlof_localflow.cpp index 1bc000d8fa1..feb515d51d1 100644 --- a/modules/optflow/src/rlof/rlof_localflow.cpp +++ b/modules/optflow/src/rlof/rlof_localflow.cpp @@ -255,8 +255,7 @@ void preprocess(Ptr prevPyramids[2], Ptr currPyramid inline bool isrobust(const RLOFOpticalFlowParameter & param) { - return (param.normSigma0 != std::numeric_limits::max()) - && (param.normSigma1 != std::numeric_limits::max()); + return (param.normSigma0 < 255 && param.normSigma1 < 255); } inline std::vector get_norm(float sigma0, float sigma1) { diff --git a/modules/optflow/src/rlofflow.cpp b/modules/optflow/src/rlofflow.cpp index 12a72cb035f..5ee07005eb7 100644 --- a/modules/optflow/src/rlofflow.cpp +++ b/modules/optflow/src/rlofflow.cpp @@ -126,9 +126,9 @@ class DenseOpticalFlowRLOFImpl : public DenseRLOFOpticalFlow std::vector currPoints, refPoints; noPoints = 0; cv::Size grid_h = gridStep / 2; - for (int r = 
grid_h.height; r < prevImage.rows; r += gridStep.height) + for (int r = grid_h.height; r < prevImage.rows - grid_h.height; r += gridStep.height) { - for (int c = grid_h.width; c < prevImage.cols; c += gridStep.width) + for (int c = grid_h.width; c < prevImage.cols - grid_h.width; c += gridStep.width) { prevPoints[noPoints++] = cv::Point2f(static_cast(c), static_cast(r)); } From 0046f5438472508ecd4eadf079ba335cc26a4d31 Mon Sep 17 00:00:00 2001 From: senst Date: Sun, 16 Dec 2018 13:48:37 +0100 Subject: [PATCH 23/25] bugfix on BEPLK with ica and adapt the accuracy tests --- modules/optflow/src/rlof/berlof_invoker.hpp | 6 +++--- modules/optflow/test/test_OF_accuracy.cpp | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/modules/optflow/src/rlof/berlof_invoker.hpp b/modules/optflow/src/rlof/berlof_invoker.hpp index 1f7b3ac8384..5e618562159 100644 --- a/modules/optflow/src/rlof/berlof_invoker.hpp +++ b/modules/optflow/src/rlof/berlof_invoker.hpp @@ -1663,9 +1663,9 @@ class TrackerInvoker : public cv::ParallelLoopBody for( y = 0; y < winSize.height; y++ ) { const uchar* src = I.ptr(y + iprevPt.y, 0) + iprevPt.x*cn; - const uchar* src1 = I.ptr(y + iprevPt.y, 0) + iprevPt.x*cn; + const uchar* src1 = I.ptr(y + iprevPt.y + 1, 0) + iprevPt.x*cn; const short* dsrc = derivI.ptr(y + iprevPt.y, 0) + iprevPt.x*cn2; - const short* dsrc1 = derivI.ptr(y + iprevPt.y, 0) + iprevPt.x*cn2; + const short* dsrc1 = derivI.ptr(y + iprevPt.y + 1, 0) + iprevPt.x*cn2; short* Iptr = IWinBuf.ptr(y, 0); short* dIptr = derivIWinBuf.ptr(y, 0); const tMaskType* maskPtr = winMaskMat.ptr(y, 0); @@ -1833,7 +1833,7 @@ class TrackerInvoker : public cv::ParallelLoopBody for( y = 0; y < winSize.height; y++ ) { const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); - const uchar* Jptr1 = J.ptr(y + inextPt.y, inextPt.x*cn); + const uchar* Jptr1 = J.ptr(y + inextPt.y + 1, inextPt.x*cn); const short* Iptr = IWinBuf.ptr(y, 0); const short* dIptr = derivIWinBuf.ptr(y, 0); x = 0; diff 
--git a/modules/optflow/test/test_OF_accuracy.cpp b/modules/optflow/test/test_OF_accuracy.cpp index 26f6b3668c4..2e8940b3a2c 100644 --- a/modules/optflow/test/test_OF_accuracy.cpp +++ b/modules/optflow/test/test_OF_accuracy.cpp @@ -198,7 +198,7 @@ TEST(SparseOpticalFlow, ReferenceAccuracy) param->solverType = ST_BILINEAR; algo->setRLOFOpticalFlowParameter(param); algo->calc(frame1, frame2, prevPts, currPts, status, err); - EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.25f); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.27f); param->solverType = ST_STANDART; algo->setRLOFOpticalFlowParameter(param); @@ -209,12 +209,12 @@ TEST(SparseOpticalFlow, ReferenceAccuracy) param->solverType = ST_BILINEAR; algo->setRLOFOpticalFlowParameter(param); algo->calc(frame1, frame2, prevPts, currPts, status, err); - EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.25f); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.27f); param->solverType = ST_STANDART; algo->setRLOFOpticalFlowParameter(param); algo->calc(frame1, frame2, prevPts, currPts, status, err); - EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.25f); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.27f); param->normSigma0 = numeric_limits::max(); param->normSigma1 = numeric_limits::max(); @@ -235,7 +235,7 @@ TEST(SparseOpticalFlow, ReferenceAccuracy) param->solverType = ST_BILINEAR; algo->setRLOFOpticalFlowParameter(param); algo->calc(frame1, frame2, prevPts, currPts, status, err); - EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.25f); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.80f); param->solverType = ST_STANDART; algo->setRLOFOpticalFlowParameter(param); @@ -260,14 +260,14 @@ TEST(DenseOpticalFlow_RLOF, ReferenceAccuracy) ASSERT_EQ(GT.rows, flow.rows); ASSERT_EQ(GT.cols, flow.cols); - EXPECT_LE(calcRMSE(GT, flow), 0.37f); + EXPECT_LE(calcRMSE(GT, flow), 0.44f); algo->setInterpolation(INTERP_GEO); algo->calc(frame1, frame2, flow); ASSERT_EQ(GT.rows, flow.rows); ASSERT_EQ(GT.cols, flow.cols); - EXPECT_LE(calcRMSE(GT, flow), 0.49f); + 
EXPECT_LE(calcRMSE(GT, flow), 0.53f); } From 221b861afc68763da26ebc8861553cd3b03ba0aa Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Sun, 16 Dec 2018 23:26:37 +0000 Subject: [PATCH 24/25] more 'static' functions --- modules/optflow/src/rlof/rlof_localflow.cpp | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/modules/optflow/src/rlof/rlof_localflow.cpp b/modules/optflow/src/rlof/rlof_localflow.cpp index feb515d51d1..8689aa3ed9f 100644 --- a/modules/optflow/src/rlof/rlof_localflow.cpp +++ b/modules/optflow/src/rlof/rlof_localflow.cpp @@ -245,25 +245,19 @@ void preCalcCrossSegmentation( } -int buildOpticalFlowPyramidScale(InputArray _img, OutputArrayOfArrays pyramid, Size winSize, int maxLevel, bool withDerivatives, - int pyrBorder, int derivBorder, bool tryReuseInputImage, float levelScale[2]); -void calcLocalOpticalFlowCore(Ptr prevPyramids[2], Ptr currPyramids[2], InputArray _prevPts, - InputOutputArray _nextPts, const RLOFOpticalFlowParameter & param); - -void preprocess(Ptr prevPyramids[2], Ptr currPyramids[2], const std::vector & prevPoints, - std::vector & currPoints, const RLOFOpticalFlowParameter & param); - -inline bool isrobust(const RLOFOpticalFlowParameter & param) +static inline +bool isrobust(const RLOFOpticalFlowParameter & param) { return (param.normSigma0 < 255 && param.normSigma1 < 255); } -inline std::vector get_norm(float sigma0, float sigma1) +static inline +std::vector get_norm(float sigma0, float sigma1) { std::vector result = { sigma0, sigma1, sigma0 / (sigma0 - sigma1), sigma0 * sigma1 }; return result; } - +static int buildOpticalFlowPyramidScale(InputArray _img, OutputArrayOfArrays pyramid, Size winSize, int maxLevel, bool withDerivatives, int pyrBorder, int derivBorder, bool tryReuseInputImage, float levelScale[2]) { @@ -368,6 +362,7 @@ int CImageBuffer::buildPyramid(cv::Size winSize, int maxLevel, float levelScale[ return m_maxLevel; } +static void calcLocalOpticalFlowCore( Ptr prevPyramids[2], 
Ptr currPyramids[2], @@ -581,6 +576,7 @@ void calcLocalOpticalFlowCore( } } +static void preprocess(Ptr prevPyramids[2], Ptr currPyramids[2], const std::vector & prevPoints, From dcdc9c9ca94a7220df0e65fbdd95a8f2ec90890a Mon Sep 17 00:00:00 2001 From: senst Date: Mon, 17 Dec 2018 22:02:40 +0100 Subject: [PATCH 25/25] bugfix after changing ptr + step to .ptr(y,x) calls by adjusting ROI of prevImage, currImage and derivI as well as changing the offset of the points in the invoker classes. add some static_cast to avoid warning remove 50 grid size sample from perf test. This grid size is to sparse for the epic interpolation remove notSameColor function since it is not used anymore --- modules/optflow/perf/perf_rlof.cpp | 2 +- modules/optflow/src/rlof/berlof_invoker.hpp | 1560 ++++++++--------- modules/optflow/src/rlof/plk_invoker.hpp | 42 +- modules/optflow/src/rlof/rlof_invoker.hpp | 38 +- modules/optflow/src/rlof/rlof_invokerbase.hpp | 14 - modules/optflow/src/rlof/rlof_localflow.cpp | 13 +- modules/optflow/test/test_OF_accuracy.cpp | 4 +- 7 files changed, 818 insertions(+), 855 deletions(-) diff --git a/modules/optflow/perf/perf_rlof.cpp b/modules/optflow/perf/perf_rlof.cpp index a273a9456a2..1295c526a6e 100644 --- a/modules/optflow/perf/perf_rlof.cpp +++ b/modules/optflow/perf/perf_rlof.cpp @@ -48,7 +48,7 @@ typedef TestBaseWithParam INTERP_GRID_Dense; PERF_TEST_P(INTERP_GRID_Dense, OpticalFlow_DenseRLOF, testing::Combine( testing::Values("INTERP_EPIC", "INTERP_GEO"), - testing::Values(4,10,50)) + testing::Values(4,10)) ) { Mat flow; diff --git a/modules/optflow/src/rlof/berlof_invoker.hpp b/modules/optflow/src/rlof/berlof_invoker.hpp index 5e618562159..604305250a5 100644 --- a/modules/optflow/src/rlof/berlof_invoker.hpp +++ b/modules/optflow/src/rlof/berlof_invoker.hpp @@ -97,9 +97,6 @@ class TrackerInvoker : public cv::ParallelLoopBody void operator()(const cv::Range& range) const CV_OVERRIDE { -#ifdef DEBUG_INVOKER - printf("berlof::ica");fflush(stdout); -#endif 
Point2f halfWin; cv::Size winSize; const Mat& I = *prevImg; @@ -153,12 +150,12 @@ class TrackerInvoker : public cv::ParallelLoopBody { continue; } - prevPt -= halfWin; + halfWin = Point2f(static_cast(maxWinSize), static_cast(maxWinSize) ) - halfWin; + prevPt += halfWin; iprevPt.x = cvFloor(prevPt.x); iprevPt.y = cvFloor(prevPt.y); - - if( iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols || - iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows ) + if( iprevPt.x < 0 || iprevPt.x >= derivI.cols - winSize.width || + iprevPt.y < 0 || iprevPt.y >= derivI.rows - winSize.height - 1) { if( level == 0 ) { @@ -215,7 +212,7 @@ class TrackerInvoker : public cv::ParallelLoopBody cv::Mat residualMat = cv::Mat::zeros(winSize.height * (winSize.width + 8) * cn, 1, CV_16SC1); cv::Point2f backUpNextPt = nextPt; - nextPt -= halfWin; + nextPt += halfWin; Point2f prevDelta(0,0); //denotes h(t-1) cv::Size _winSize = winSize; #ifdef RLOF_SSE @@ -243,9 +240,8 @@ class TrackerInvoker : public cv::ParallelLoopBody { inextPt.x = cvFloor(nextPt.x); inextPt.y = cvFloor(nextPt.y); - - if( inextPt.x < -winSize.width || inextPt.x >= J.cols || - inextPt.y < -winSize.height || inextPt.y >= J.rows ) + if( inextPt.x < 0 || inextPt.x >= J.cols - winSize.width || + inextPt.y < 0 || inextPt.y >= J.rows - winSize.height - 1) { if( level == 0 && status ) status[ptidx] = 3; @@ -511,12 +507,12 @@ class TrackerInvoker : public cv::ParallelLoopBody delta.x = MAX(-1.f, MIN(1.f, delta.x)); delta.y = MAX(-1.f, MIN(1.f, delta.y)); nextPt += delta; - nextPts[ptidx] = nextPt + halfWin; + nextPts[ptidx] = nextPt - halfWin; } else { nextPt += delta; - nextPts[ptidx] = nextPt + halfWin; + nextPts[ptidx] = nextPt - halfWin; noSolvedIteration++; break; } @@ -614,9 +610,6 @@ class TrackerInvoker : public cv::ParallelLoopBody void operator()(const cv::Range& range) const CV_OVERRIDE { -#ifdef DEBUG_INVOKER - printf("berlof::radial");fflush(stdout); -#endif Point2f halfWin; cv::Size winSize; const Mat& I = 
*prevImg; @@ -667,13 +660,12 @@ class TrackerInvoker : public cv::ParallelLoopBody winMaskMat,winSize,halfWin,winArea, minWinSize,maxWinSize) == false) continue; - - prevPt -= halfWin; + halfWin = Point2f(static_cast(maxWinSize), static_cast(maxWinSize) ) - halfWin; + prevPt += halfWin; iprevPt.x = cvFloor(prevPt.x); iprevPt.y = cvFloor(prevPt.y); - - if( iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols || - iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows ) + if( iprevPt.x < 0 || iprevPt.x >= derivI.cols - winSize.width || + iprevPt.y < 0 || iprevPt.y >= derivI.rows - winSize.height - 1) { if( level == 0 ) { @@ -797,9 +789,8 @@ class TrackerInvoker : public cv::ParallelLoopBody } cv::Mat residualMat = cv::Mat::zeros(winSize.height * (winSize.width + 8) * cn, 1, CV_16SC1); - cv::Point2f backUpNextPt = nextPt; - nextPt -= halfWin; + nextPt += halfWin; Point2f prevDelta(0,0); //relates to h(t-1) Point2f prevGain(1,0); cv::Point2f gainVec = gainVecs[ptidx]; @@ -827,626 +818,626 @@ class TrackerInvoker : public cv::ParallelLoopBody a = nextPt.x - inextPt.x; b = nextPt.y - inextPt.y; float ab = a * b; - if( j == 0 - || (inextPt.x != cvFloor(nextPt.x) || inextPt.y != cvFloor(nextPt.y) || j % 2 != 0) ) + if (j == 0 + || (inextPt.x != cvFloor(nextPt.x) || inextPt.y != cvFloor(nextPt.y) || j % 2 != 0) ) { - inextPt.x = cvFloor(nextPt.x); - inextPt.y = cvFloor(nextPt.y); + inextPt.x = cvFloor(nextPt.x); + inextPt.y = cvFloor(nextPt.y); - if( inextPt.x < -winSize.width || inextPt.x >= J.cols || - inextPt.y < -winSize.height || inextPt.y >= J.rows ) - { - if( level == 0 && status ) - status[ptidx] = 3; - if (level > 0) + if( inextPt.x < 0 || inextPt.x >= J.cols - winSize.width || + inextPt.y < 0 || inextPt.y >= J.rows - winSize.height - 1) { - nextPts[ptidx] = backUpNextPt; - gainVecs[ptidx] = backUpGain; + if( level == 0 && status ) + status[ptidx] = 3; + if (level > 0) + { + nextPts[ptidx] = backUpNextPt; + gainVecs[ptidx] = backUpGain; + } + noIteration++; + 
break; } - noIteration++; - break; - } - a = nextPt.x - inextPt.x; - b = nextPt.y - inextPt.y; - ab = a * b; - iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); - iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); - iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); - iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - // mismatch - - if( j == 0) - { - // tensor - w1 = 0; // -IxI - w2 = 0; // -IyI - dI = 0; // I^2 - sumIx = 0; - sumIy = 0; - sumI = 0; - sumW = 0; - A11 = 0; - A12 = 0; - A22 = 0; - } + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + ab = a * b; + iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + // mismatch + + if( j == 0) + { + // tensor + w1 = 0; // -IxI + w2 = 0; // -IyI + dI = 0; // I^2 + sumIx = 0; + sumIy = 0; + sumI = 0; + sumW = 0; + A11 = 0; + A12 = 0; + A22 = 0; + } + + if ( j == 0 ) + { + buffIdx = 0; + for( y = 0; y < winSize.height; y++ ) + { + const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const uchar* Jptr1 = J.ptr(y + inextPt.y + 1, inextPt.x*cn); + const short* Iptr = IWinBuf.ptr(y, 0); + const short* dIptr = derivIWinBuf.ptr(y, 0); + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); + x = 0; + for( ; x < winSize.width*cn; x++, dIptr += 2) + { + if( maskPtr[x] == 0) + continue; + int diff = static_cast(CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + Jptr1[x]*iw10 + Jptr1[x+cn]*iw11, W_BITS1-5) + - Iptr[x] + Iptr[x] * gainVec.x +gainVec.y); + residualMat.at(buffIdx++) = static_cast(diff); + } + } + /*! Estimation for the residual */ + cv::Mat residualMatRoi(residualMat, cv::Rect(0,0,1, buffIdx)); + MEstimatorScale = (buffIdx == 0) ? 
1.f : estimateScale(residualMatRoi); + } + + float eta = 1.f / winArea; + float fParam0 = normSigma0 * 32.f; + float fParam1 = normSigma1 * 32.f; + fParam0 = normSigma0 * MEstimatorScale; + fParam1 = normSigma1 * MEstimatorScale; + + #ifdef RLOF_SSE + + qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128 qb0[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; + __m128 qb1[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; + __m128 qb2[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; + __m128 qb3[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; + __m128 mmSumW1 = _mm_setzero_ps(), mmSumW2 = _mm_setzero_ps(); + __m128 mmSumI = _mm_setzero_ps(), mmSumW = _mm_setzero_ps(), mmSumDI = _mm_setzero_ps(); + __m128 mmSumIy = _mm_setzero_ps(), mmSumIx = _mm_setzero_ps(); + __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps(); + __m128i mmParam0 = _mm_set1_epi16(MIN(std::numeric_limits::max() -1, static_cast(fParam0))); + __m128i mmParam1 = _mm_set1_epi16(MIN(std::numeric_limits::max()- 1, static_cast(fParam1))); + + + float s2Val = std::fabs(normSigma2); + int s2bitShift = normSigma2 == 0 ? 1 : cvCeil(log(200.f / s2Val) / log(2.f)); + __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast(normSigma2 * (float)(1 << s2bitShift))); + __m128i mmOness_epi16 = _mm_set1_epi16(1 << s2bitShift); + __m128 mmParam2s = _mm_set1_ps(0.01f * normSigma2); + __m128 mmParam2s2 = _mm_set1_ps(normSigma2 * normSigma2); + float gainVal = gainVec.x > 0 ? gainVec.x : -gainVec.x; + int bitShift = gainVec.x == 0 ? 
1 : cvCeil(log(200.f / gainVal) / log(2.f)); + __m128i mmGainValue_epi16 = _mm_set1_epi16(static_cast(gainVec.x * (float)(1 << bitShift))); + __m128i mmConstValue_epi16 = _mm_set1_epi16(static_cast(gainVec.y)); + __m128i mmEta = _mm_setzero_si128(); + __m128i mmScale = _mm_set1_epi16(static_cast(MEstimatorScale)); + + #endif - if ( j == 0 ) - { buffIdx = 0; - for( y = 0; y < winSize.height; y++ ) + float _b0[4] = {0,0,0,0}; + float _b1[4] = {0,0,0,0}; + float _b2[4] = {0,0,0,0}; + float _b3[4] = {0,0,0,0}; + /* + */ + for( y = 0; y < _winSize.height; y++ ) { - const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); const uchar* Jptr1 = J.ptr(y + inextPt.y + 1, inextPt.x*cn); const short* Iptr = IWinBuf.ptr(y, 0); const short* dIptr = derivIWinBuf.ptr(y, 0); const tMaskType* maskPtr = winMaskMat.ptr(y, 0); x = 0; - for( ; x < winSize.width*cn; x++, dIptr += 2) + #ifdef RLOF_SSE + for( ; x <= _winSize.width*cn; x += 8, dIptr += 8*2 ) { - if( maskPtr[x] == 0) - continue; - int diff = static_cast(CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + Jptr1[x]*iw10 + Jptr1[x+cn]*iw11, W_BITS1-5) - - Iptr[x] + Iptr[x] * gainVec.x +gainVec.y); - residualMat.at(buffIdx++) = static_cast(diff); - } - } - /*! Estimation for the residual */ - cv::Mat residualMatRoi(residualMat, cv::Rect(0,0,1, buffIdx)); - MEstimatorScale = (buffIdx == 0) ? 
1.f : estimateScale(residualMatRoi); - } - - float eta = 1.f / winArea; - float fParam0 = normSigma0 * 32.f; - float fParam1 = normSigma1 * 32.f; - fParam0 = normSigma0 * MEstimatorScale; - fParam1 = normSigma1 * MEstimatorScale; + __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16); + __m128i I_0_7_epi16 = _mm_loadu_si128((const __m128i*)(Iptr + x)); -#ifdef RLOF_SSE + __m128i v00 = _mm_unpacklo_epi8( + _mm_loadl_epi64((const __m128i*)(Jptr + x)), z); + __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); + __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x)), z); + __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x + cn)), z); - qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); - qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); - __m128 qb0[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; - __m128 qb1[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; - __m128 qb2[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; - __m128 qb3[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; - __m128 mmSumW1 = _mm_setzero_ps(), mmSumW2 = _mm_setzero_ps(); - __m128 mmSumI = _mm_setzero_ps(), mmSumW = _mm_setzero_ps(), mmSumDI = _mm_setzero_ps(); - __m128 mmSumIy = _mm_setzero_ps(), mmSumIx = _mm_setzero_ps(); - __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps(); - __m128i mmParam0 = _mm_set1_epi16(MIN(std::numeric_limits::max() -1, static_cast(fParam0))); - __m128i mmParam1 = _mm_set1_epi16(MIN(std::numeric_limits::max()- 1, static_cast(fParam1))); - - - float s2Val = std::fabs(normSigma2); - int s2bitShift = normSigma2 == 0 ? 
1 : cvCeil(log(200.f / s2Val) / log(2.f)); - __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast(normSigma2 * (float)(1 << s2bitShift))); - __m128i mmOness_epi16 = _mm_set1_epi16(1 << s2bitShift); - __m128 mmParam2s = _mm_set1_ps(0.01f * normSigma2); - __m128 mmParam2s2 = _mm_set1_ps(normSigma2 * normSigma2); - float gainVal = gainVec.x > 0 ? gainVec.x : -gainVec.x; - int bitShift = gainVec.x == 0 ? 1 : cvCeil(log(200.f / gainVal) / log(2.f)); - __m128i mmGainValue_epi16 = _mm_set1_epi16(static_cast(gainVec.x * (float)(1 << bitShift))); - __m128i mmConstValue_epi16 = _mm_set1_epi16(static_cast(gainVec.y)); - __m128i mmEta = _mm_setzero_si128(); - __m128i mmScale = _mm_set1_epi16(static_cast(MEstimatorScale)); + __m128i t0 = _mm_add_epi32 + (_mm_madd_epi16( + _mm_unpacklo_epi16(v00, v01), + qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); -#endif - - buffIdx = 0; - float _b0[4] = {0,0,0,0}; - float _b1[4] = {0,0,0,0}; - float _b2[4] = {0,0,0,0}; - float _b3[4] = {0,0,0,0}; - /* - */ - for( y = 0; y < _winSize.height; y++ ) - { - const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); - const uchar* Jptr1 = J.ptr(y + inextPt.y + 1, inextPt.x*cn); - const short* Iptr = IWinBuf.ptr(y, 0); - const short* dIptr = derivIWinBuf.ptr(y, 0); - const tMaskType* maskPtr = winMaskMat.ptr(y, 0); - x = 0; -#ifdef RLOF_SSE - for( ; x <= _winSize.width*cn; x += 8, dIptr += 8*2 ) - { - __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16); - __m128i I_0_7_epi16 = _mm_loadu_si128((const __m128i*)(Iptr + x)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1-5); - __m128i v00 = _mm_unpacklo_epi8( - _mm_loadl_epi64((const __m128i*)(Jptr + x)), z); - __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const 
__m128i*)(Jptr + x + cn)), z); - __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x)), z); - __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x + cn)), z); + __m128i lo = _mm_mullo_epi16(mmGainValue_epi16, I_0_7_epi16); + __m128i hi = _mm_mulhi_epi16(mmGainValue_epi16, I_0_7_epi16); - __m128i t0 = _mm_add_epi32 - (_mm_madd_epi16( - _mm_unpacklo_epi16(v00, v01), - qw0), - _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); - __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + __m128i Igain_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), bitShift); + __m128i Igain_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), bitShift); + __m128i Igain_epi16 = _mm_packs_epi32(Igain_0_3_epi32, Igain_4_7_epi32); - t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); - t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1-5); - __m128i lo = _mm_mullo_epi16(mmGainValue_epi16, I_0_7_epi16); - __m128i hi = _mm_mulhi_epi16(mmGainValue_epi16, I_0_7_epi16); + __m128i diffValue = _mm_subs_epi16(_mm_add_epi16(Igain_epi16, mmConstValue_epi16), I_0_7_epi16); + __m128i mmDiffc_epi16[4] = + { + _mm_add_epi16(_mm_slli_epi16(v11, 5), diffValue), + _mm_add_epi16(_mm_slli_epi16(v01, 5), diffValue), + _mm_add_epi16(_mm_slli_epi16(v10, 5), diffValue), + _mm_add_epi16(_mm_slli_epi16(v00, 5), diffValue) + }; - __m128i Igain_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), bitShift); - __m128i Igain_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), bitShift); - __m128i Igain_epi16 = _mm_packs_epi32(Igain_0_3_epi32, Igain_4_7_epi32); + __m128i mmDiff_epi16 = _mm_add_epi16( _mm_packs_epi32(t0, t1), diffValue); - __m128i diffValue = _mm_subs_epi16(_mm_add_epi16(Igain_epi16, mmConstValue_epi16), I_0_7_epi16); - __m128i mmDiffc_epi16[4] = - { - _mm_add_epi16(_mm_slli_epi16(v11, 5), diffValue), - _mm_add_epi16(_mm_slli_epi16(v01, 5), diffValue), - 
_mm_add_epi16(_mm_slli_epi16(v10, 5), diffValue), - _mm_add_epi16(_mm_slli_epi16(v00, 5), diffValue) - }; + mmDiff_epi16 = _mm_and_si128(mmDiff_epi16, mask_0_7_epi16); - __m128i mmDiff_epi16 = _mm_add_epi16( _mm_packs_epi32(t0, t1), diffValue); + __m128i scalediffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, mmScale); + mmEta = _mm_add_epi16(mmEta, _mm_add_epi16(_mm_and_si128(scalediffIsPos_epi16, _mm_set1_epi16(2)), _mm_set1_epi16(-1))); - mmDiff_epi16 = _mm_and_si128(mmDiff_epi16, mask_0_7_epi16); + __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); + __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); - __m128i scalediffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, mmScale); - mmEta = _mm_add_epi16(mmEta, _mm_add_epi16(_mm_and_si128(scalediffIsPos_epi16, _mm_set1_epi16(2)), _mm_set1_epi16(-1))); + __m128i abs_epi16 = _mm_abs_epi16(mmDiff_epi16); + __m128i bSet2_epi16, bSet1_epi16; + // |It| < sigma1 ? + bSet2_epi16 = _mm_cmplt_epi16(abs_epi16, mmParam1); + // It > 0 ? + __m128i diffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, _mm_setzero_si128()); + // sigma0 < |It| < sigma1 ? + bSet1_epi16 = _mm_and_si128(bSet2_epi16, _mm_cmpgt_epi16(abs_epi16, mmParam0)); + // val = |It| -/+ sigma1 + __m128i tmpParam1_epi16 = _mm_add_epi16(_mm_and_si128(diffIsPos_epi16, _mm_sub_epi16(mmDiff_epi16, mmParam1)), + _mm_andnot_si128(diffIsPos_epi16, _mm_add_epi16(mmDiff_epi16, mmParam1))); + // It == 0 ? |It| > sigma13 + mmDiff_epi16 = _mm_and_si128(bSet2_epi16, mmDiff_epi16); - __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); - __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); + for( unsigned int mmi = 0; mmi < 4; mmi++) + { + __m128i mmDiffc_epi16_t = _mm_and_si128(mmDiffc_epi16[mmi], mask_0_7_epi16); + mmDiffc_epi16_t = _mm_and_si128(bSet2_epi16, mmDiffc_epi16_t); + + // It == val ? 
sigma0 < |It| < sigma1 + mmDiffc_epi16_t = _mm_blendv_epi8(mmDiffc_epi16_t, tmpParam1_epi16, bSet1_epi16); + __m128i tale_epi16_ = _mm_blendv_epi8(mmOness_epi16, mmParam2_epi16, bSet1_epi16); // mask for 0 - 3 + // diff = diff * sigma2 + lo = _mm_mullo_epi16(tale_epi16_, mmDiffc_epi16_t); + hi = _mm_mulhi_epi16(tale_epi16_, mmDiffc_epi16_t); + __m128i diff_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), s2bitShift); + __m128i diff_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), s2bitShift); + + mmDiffc_epi16_t = _mm_packs_epi32(diff_0_3_epi32, diff_4_7_epi32); + __m128i diff1 = _mm_unpackhi_epi16(mmDiffc_epi16_t, mmDiffc_epi16_t); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13... + __m128i diff0 = _mm_unpacklo_epi16(mmDiffc_epi16_t, mmDiffc_epi16_t); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9... + + // Ix * diff / Iy * diff + v10 = _mm_mullo_epi16(Ixy_0, diff0); + v11 = _mm_mulhi_epi16(Ixy_0, diff0); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + + qb0[mmi] = _mm_add_ps(qb0[mmi], _mm_cvtepi32_ps(v00)); + qb1[mmi] = _mm_add_ps(qb1[mmi], _mm_cvtepi32_ps(v10)); + // It * Ix It * Iy [4 ... 7] + // for set 1 hi sigma 1 + v10 = _mm_mullo_epi16(Ixy_1, diff1); + v11 = _mm_mulhi_epi16(Ixy_1, diff1); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb0[mmi] = _mm_add_ps(qb0[mmi], _mm_cvtepi32_ps(v00)); + qb1[mmi] = _mm_add_ps(qb1[mmi], _mm_cvtepi32_ps(v10)); + // diff * J [0 ... 
7] + // for set 1 sigma 1 + // b3 += diff * Iptr[x] + v10 = _mm_mullo_epi16(mmDiffc_epi16_t, I_0_7_epi16); + v11 = _mm_mulhi_epi16(mmDiffc_epi16_t, I_0_7_epi16); + v00 = _mm_unpacklo_epi16(v10, v11); + + v10 = _mm_unpackhi_epi16(v10, v11); + qb2[mmi] = _mm_add_ps(qb2[mmi], _mm_cvtepi32_ps(v00)); + qb2[mmi] = _mm_add_ps(qb2[mmi], _mm_cvtepi32_ps(v10)); + qb3[mmi] = _mm_add_ps(qb3[mmi], _mm_cvtepi32_ps(diff_0_3_epi32)); + qb3[mmi] = _mm_add_ps(qb3[mmi], _mm_cvtepi32_ps(diff_4_7_epi32)); + } + if( j == 0 ) + { + __m128 bSet1_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet1_epi16)); + __m128 bSet1_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet1_epi16,bSet1_epi16), 16)); + __m128 mask_0_4_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mask_0_7_epi16)); + __m128 mask_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mask_0_7_epi16, mask_0_7_epi16),16))); - __m128i abs_epi16 = _mm_abs_epi16(mmDiff_epi16); - __m128i bSet2_epi16, bSet1_epi16; - // |It| < sigma1 ? - bSet2_epi16 = _mm_cmplt_epi16(abs_epi16, mmParam1); - // It > 0 ? - __m128i diffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, _mm_setzero_si128()); - // sigma0 < |It| < sigma1 ? - bSet1_epi16 = _mm_and_si128(bSet2_epi16, _mm_cmpgt_epi16(abs_epi16, mmParam0)); - // val = |It| -/+ sigma1 - __m128i tmpParam1_epi16 = _mm_add_epi16(_mm_and_si128(diffIsPos_epi16, _mm_sub_epi16(mmDiff_epi16, mmParam1)), - _mm_andnot_si128(diffIsPos_epi16, _mm_add_epi16(mmDiff_epi16, mmParam1))); - // It == 0 ? |It| > sigma13 - mmDiff_epi16 = _mm_and_si128(bSet2_epi16, mmDiff_epi16); + __m128 bSet2_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet2_epi16)); + __m128 bSet2_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet2_epi16, bSet2_epi16),16)); - for( unsigned int mmi = 0; mmi < 4; mmi++) - { - __m128i mmDiffc_epi16_t = _mm_and_si128(mmDiffc_epi16[mmi], mask_0_7_epi16); - mmDiffc_epi16_t = _mm_and_si128(bSet2_epi16, mmDiffc_epi16_t); - - // It == val ? 
sigma0 < |It| < sigma1 - mmDiffc_epi16_t = _mm_blendv_epi8(mmDiffc_epi16_t, tmpParam1_epi16, bSet1_epi16); - __m128i tale_epi16_ = _mm_blendv_epi8(mmOness_epi16, mmParam2_epi16, bSet1_epi16); // mask for 0 - 3 - // diff = diff * sigma2 - lo = _mm_mullo_epi16(tale_epi16_, mmDiffc_epi16_t); - hi = _mm_mulhi_epi16(tale_epi16_, mmDiffc_epi16_t); - __m128i diff_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), s2bitShift); - __m128i diff_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), s2bitShift); - - mmDiffc_epi16_t = _mm_packs_epi32(diff_0_3_epi32, diff_4_7_epi32); - __m128i diff1 = _mm_unpackhi_epi16(mmDiffc_epi16_t, mmDiffc_epi16_t); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13... - __m128i diff0 = _mm_unpacklo_epi16(mmDiffc_epi16_t, mmDiffc_epi16_t); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9... - - // Ix * diff / Iy * diff - v10 = _mm_mullo_epi16(Ixy_0, diff0); - v11 = _mm_mulhi_epi16(Ixy_0, diff0); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); + __m128 tale_0_3_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_0_3_ps); + __m128 tale_4_7_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_4_7_ps); + tale_0_3_ps = _mm_blendv_ps(mmParam2s, tale_0_3_ps, bSet2_0_3_ps); + tale_4_7_ps = _mm_blendv_ps(mmParam2s, tale_4_7_ps, bSet2_4_7_ps); - qb0[mmi] = _mm_add_ps(qb0[mmi], _mm_cvtepi32_ps(v00)); - qb1[mmi] = _mm_add_ps(qb1[mmi], _mm_cvtepi32_ps(v10)); - // It * Ix It * Iy [4 ... 7] - // for set 1 hi sigma 1 - v10 = _mm_mullo_epi16(Ixy_1, diff1); - v11 = _mm_mulhi_epi16(Ixy_1, diff1); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qb0[mmi] = _mm_add_ps(qb0[mmi], _mm_cvtepi32_ps(v00)); - qb1[mmi] = _mm_add_ps(qb1[mmi], _mm_cvtepi32_ps(v10)); - // diff * J [0 ... 
7] - // for set 1 sigma 1 - // b3 += diff * Iptr[x] - v10 = _mm_mullo_epi16(mmDiffc_epi16_t, I_0_7_epi16); - v11 = _mm_mulhi_epi16(mmDiffc_epi16_t, I_0_7_epi16); - v00 = _mm_unpacklo_epi16(v10, v11); + tale_0_3_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_0_3_ps, mask_0_4_ps); + tale_4_7_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_4_7_ps, mask_4_7_ps); - v10 = _mm_unpackhi_epi16(v10, v11); - qb2[mmi] = _mm_add_ps(qb2[mmi], _mm_cvtepi32_ps(v00)); - qb2[mmi] = _mm_add_ps(qb2[mmi], _mm_cvtepi32_ps(v10)); - qb3[mmi] = _mm_add_ps(qb3[mmi], _mm_cvtepi32_ps(diff_0_3_epi32)); - qb3[mmi] = _mm_add_ps(qb3[mmi], _mm_cvtepi32_ps(diff_4_7_epi32)); - } + t0 = _mm_srai_epi32(Ixy_0, 16); // Iy0 Iy1 Iy2 Iy3 + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16); // Ix0 Ix1 Ix2 Ix3 - if( j == 0 ) - { - __m128 bSet1_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet1_epi16)); - __m128 bSet1_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet1_epi16,bSet1_epi16), 16)); - __m128 mask_0_4_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mask_0_7_epi16)); - __m128 mask_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mask_0_7_epi16, mask_0_7_epi16),16))); + __m128 fy = _mm_cvtepi32_ps(t0); + __m128 fx = _mm_cvtepi32_ps(t1); - __m128 bSet2_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet2_epi16)); - __m128 bSet2_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet2_epi16, bSet2_epi16),16)); + // 0 ... 
3 + __m128 I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(I_0_7_epi16, I_0_7_epi16), 16)); - __m128 tale_0_3_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_0_3_ps); - __m128 tale_4_7_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_4_7_ps); - tale_0_3_ps = _mm_blendv_ps(mmParam2s, tale_0_3_ps, bSet2_0_3_ps); - tale_4_7_ps = _mm_blendv_ps(mmParam2s, tale_4_7_ps, bSet2_4_7_ps); + // A11 - A22 + __m128 fxtale = _mm_mul_ps(fx, tale_0_3_ps); + __m128 fytale = _mm_mul_ps(fy, tale_0_3_ps); - tale_0_3_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_0_3_ps, mask_0_4_ps); - tale_4_7_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_4_7_ps, mask_4_7_ps); + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); + mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); + mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); - t0 = _mm_srai_epi32(Ixy_0, 16); // Iy0 Iy1 Iy2 Iy3 - t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16); // Ix0 Ix1 Ix2 Ix3 + // sumIx und sumIy + mmSumIx = _mm_add_ps(mmSumIx, fxtale); + mmSumIy = _mm_add_ps(mmSumIy, fytale); - __m128 fy = _mm_cvtepi32_ps(t0); - __m128 fx = _mm_cvtepi32_ps(t1); + mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fxtale)); + mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fytale)); - // 0 ... 
3 - __m128 I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(I_0_7_epi16, I_0_7_epi16), 16)); + // sumI + __m128 I_tale_ps = _mm_mul_ps(I_ps, tale_0_3_ps); + mmSumI = _mm_add_ps(mmSumI,I_tale_ps); - // A11 - A22 - __m128 fxtale = _mm_mul_ps(fx, tale_0_3_ps); - __m128 fytale = _mm_mul_ps(fy, tale_0_3_ps); + // sumW + mmSumW = _mm_add_ps(mmSumW, tale_0_3_ps); - mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); - mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); - mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); + // sumDI + mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps( I_ps, I_tale_ps)); - // sumIx und sumIy - mmSumIx = _mm_add_ps(mmSumIx, fxtale); - mmSumIy = _mm_add_ps(mmSumIy, fytale); - mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fxtale)); - mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fytale)); + t0 = _mm_srai_epi32(Ixy_1, 16); // Iy8 Iy9 Iy10 Iy11 + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_1, 16), 16); // Ix0 Ix1 Ix2 Ix3 - // sumI - __m128 I_tale_ps = _mm_mul_ps(I_ps, tale_0_3_ps); - mmSumI = _mm_add_ps(mmSumI,I_tale_ps); + fy = _mm_cvtepi32_ps(t0); + fx = _mm_cvtepi32_ps(t1); - // sumW - mmSumW = _mm_add_ps(mmSumW, tale_0_3_ps); + // 4 ... 
7 + I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(I_0_7_epi16, I_0_7_epi16), 16)); - // sumDI - mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps( I_ps, I_tale_ps)); + // A11 - A22 + fxtale = _mm_mul_ps(fx, tale_4_7_ps); + fytale = _mm_mul_ps(fy, tale_4_7_ps); + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); + mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); + mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); - t0 = _mm_srai_epi32(Ixy_1, 16); // Iy8 Iy9 Iy10 Iy11 - t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_1, 16), 16); // Ix0 Ix1 Ix2 Ix3 + // sumIx und sumIy + mmSumIx = _mm_add_ps(mmSumIx, fxtale); + mmSumIy = _mm_add_ps(mmSumIy, fytale); - fy = _mm_cvtepi32_ps(t0); - fx = _mm_cvtepi32_ps(t1); + mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fxtale)); + mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fytale)); - // 4 ... 7 - I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(I_0_7_epi16, I_0_7_epi16), 16)); + // sumI + I_tale_ps = _mm_mul_ps(I_ps, tale_4_7_ps); + mmSumI = _mm_add_ps(mmSumI, I_tale_ps); - // A11 - A22 - fxtale = _mm_mul_ps(fx, tale_4_7_ps); - fytale = _mm_mul_ps(fy, tale_4_7_ps); + // sumW + mmSumW = _mm_add_ps(mmSumW, tale_4_7_ps); - mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); - mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); - mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); + // sumDI + mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps( I_ps, I_tale_ps)); + } - // sumIx und sumIy - mmSumIx = _mm_add_ps(mmSumIx, fxtale); - mmSumIy = _mm_add_ps(mmSumIy, fytale); + } + #else + for( ; x < _winSize.width*cn; x++, dIptr += 2 ) + { + if( maskPtr[x] == 0) + continue; - mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fxtale)); - mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fytale)); + short ixval = dIptr[0]; + short iyval = dIptr[1]; + int illValue = static_cast(Iptr[x] * gainVec.x + gainVec.y - Iptr[x]); - // sumI - I_tale_ps = _mm_mul_ps(I_ps, tale_4_7_ps); - mmSumI = _mm_add_ps(mmSumI, I_tale_ps); + int It[4] = {(Jptr1[x+cn]<< 5) + 
illValue, + (Jptr[x+cn]<< 5) + illValue, + (Jptr1[x]<< 5) + illValue, + (Jptr[x] << 5) + illValue}; - // sumW - mmSumW = _mm_add_ps(mmSumW, tale_4_7_ps); - // sumDI - mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps( I_ps, I_tale_ps)); - } + int J_val = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + + Jptr1[x]*iw10 + Jptr1[x+cn]*iw11, + W_BITS1-5); - } -#else - for( ; x < _winSize.width*cn; x++, dIptr += 2 ) - { - if( maskPtr[x] == 0) - continue; + int diff = J_val + illValue; - short ixval = dIptr[0]; - short iyval = dIptr[1]; - int illValue = static_cast(Iptr[x] * gainVec.x + gainVec.y - Iptr[x]); - int It[4] = {(Jptr1[x+cn]<< 5) + illValue, - (Jptr[x+cn]<< 5) + illValue, - (Jptr1[x]<< 5) + illValue, - (Jptr[x] << 5) + illValue}; + MEstimatorScale += (diff < MEstimatorScale) ? -eta : eta; - int J_val = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + - Jptr1[x]*iw10 + Jptr1[x+cn]*iw11, - W_BITS1-5); + int abss = (diff < 0) ? -diff : diff; - int diff = J_val + illValue; + // compute the missmatch vector + if( j >= 0) + { + if( abss > fParam1) + { + It[0] = 0; + It[1] = 0; + It[2] = 0; + It[3] = 0; + } + else if( abss > static_cast(fParam0) && diff >= 0 ) + { + It[0] = static_cast(normSigma2 * (It[0] - fParam1)); + It[1] = static_cast(normSigma2 * (It[1] - fParam1)); + It[2] = static_cast(normSigma2 * (It[2] - fParam1)); + It[3] = static_cast(normSigma2 * (It[3] - fParam1)); + } + else if( abss > static_cast(fParam0) && diff < 0 ) + { + It[0] = static_cast(normSigma2 * (It[0] + fParam1)); + It[1] = static_cast(normSigma2 * (It[1] + fParam1)); + It[2] = static_cast(normSigma2 * (It[2] + fParam1)); + It[3] = static_cast(normSigma2 * (It[3] + fParam1)); + } + } + _b0[0] += (float)(It[0]*dIptr[0]) ; + _b0[1] += (float)(It[1]*dIptr[0]) ; + _b0[2] += (float)(It[2]*dIptr[0]) ; + _b0[3] += (float)(It[3]*dIptr[0]) ; - MEstimatorScale += (diff < MEstimatorScale) ? 
-eta : eta; + _b1[0] += (float)(It[0]*dIptr[1]) ; + _b1[1] += (float)(It[1]*dIptr[1]) ; + _b1[2] += (float)(It[2]*dIptr[1]) ; + _b1[3] += (float)(It[3]*dIptr[1]) ; - int abss = (diff < 0) ? -diff : diff; + _b2[0] += (float)(It[0])*Iptr[x] ; + _b2[1] += (float)(It[1])*Iptr[x] ; + _b2[2] += (float)(It[2])*Iptr[x] ; + _b2[3] += (float)(It[3])*Iptr[x] ; + _b3[0] += (float)(It[0]); + _b3[1] += (float)(It[1]); + _b3[2] += (float)(It[2]); + _b3[3] += (float)(It[3]); - // compute the missmatch vector - if( j >= 0) - { - if( abss > fParam1) - { - It[0] = 0; - It[1] = 0; - It[2] = 0; - It[3] = 0; - } - else if( abss > static_cast(fParam0) && diff >= 0 ) - { - It[0] = static_cast(normSigma2 * (It[0] - fParam1)); - It[1] = static_cast(normSigma2 * (It[1] - fParam1)); - It[2] = static_cast(normSigma2 * (It[2] - fParam1)); - It[3] = static_cast(normSigma2 * (It[3] - fParam1)); - } - else if( abss > static_cast(fParam0) && diff < 0 ) + // compute the Gradient Matrice + if( j == 0) { - It[0] = static_cast(normSigma2 * (It[0] + fParam1)); - It[1] = static_cast(normSigma2 * (It[1] + fParam1)); - It[2] = static_cast(normSigma2 * (It[2] + fParam1)); - It[3] = static_cast(normSigma2 * (It[3] + fParam1)); - } - } - _b0[0] += (float)(It[0]*dIptr[0]) ; - _b0[1] += (float)(It[1]*dIptr[0]) ; - _b0[2] += (float)(It[2]*dIptr[0]) ; - _b0[3] += (float)(It[3]*dIptr[0]) ; + float tale = normSigma2 * FLT_RESCALE; + if( abss < fParam0 || j < 0) + { + tale = FLT_RESCALE; + } + else if( abss > fParam1) + { + tale *= 0.01f; + } + else + { + tale *= normSigma2; + } + if( j == 0 ) + { + A11 += (float)(ixval*ixval)*tale; + A12 += (float)(ixval*iyval)*tale; + A22 += (float)(iyval*iyval)*tale; + } + dI += Iptr[x] * Iptr[x] * tale; + float dx = static_cast(dIptr[0]) * tale; + float dy = static_cast(dIptr[1]) * tale; + sumIx += dx; + sumIy += dy; + w1 += dx * Iptr[x]; + w2 += dy * Iptr[x]; + sumI += Iptr[x] * tale; + sumW += tale; + } - _b1[0] += (float)(It[0]*dIptr[1]) ; - _b1[1] += (float)(It[1]*dIptr[1]) 
; - _b1[2] += (float)(It[2]*dIptr[1]) ; - _b1[3] += (float)(It[3]*dIptr[1]) ; + } + #endif + } - _b2[0] += (float)(It[0])*Iptr[x] ; - _b2[1] += (float)(It[1])*Iptr[x] ; - _b2[2] += (float)(It[2])*Iptr[x] ; - _b2[3] += (float)(It[3])*Iptr[x] ; + #ifdef RLOF_SSE + short etaValues[8]; + _mm_storeu_si128((__m128i*)(etaValues), mmEta); + MEstimatorScale += eta * (etaValues[0] + etaValues[1] + etaValues[2] + etaValues[3] + + etaValues[4] + etaValues[5] + etaValues[6] + etaValues[7]); + float CV_DECL_ALIGNED(32) wbuf[4]; + #endif + if( j == 0 ) + { + #ifdef RLOF_SSE + _mm_store_ps(wbuf, mmSumW1); + w1 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumW2); + w2 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumDI); + dI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumI); + sumI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumIx); + sumIx = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumIy); + sumIy = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumW); + sumW = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + #endif + sumIx *= -FLT_SCALE; + sumIy *= -FLT_SCALE; + sumI *=FLT_SCALE; + sumW *= FLT_SCALE; + w1 *= -FLT_SCALE; + w2 *= -FLT_SCALE; + dI *= FLT_SCALE; + + + #ifdef RLOF_SSE + float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];// + + _mm_store_ps(A11buf, mmAxx); + _mm_store_ps(A12buf, mmAxy); + _mm_store_ps(A22buf, mmAyy); + + + A11 = A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; + A12 = A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; + A22 = A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; + #endif + A11 *= FLT_SCALE; // 54866744. 
+ A12 *= FLT_SCALE; // -628764.00 + A22 *= FLT_SCALE; // 19730.000 - _b3[0] += (float)(It[0]); - _b3[1] += (float)(It[1]); - _b3[2] += (float)(It[2]); - _b3[3] += (float)(It[3]); + D = - A12*A12*sumI*sumI + dI*sumW*A12*A12 + 2*A12*sumI*sumIx*w2 + 2*A12*sumI*sumIy*w1 + - 2*dI*A12*sumIx*sumIy - 2*sumW*A12*w1*w2 + A11*A22*sumI*sumI - 2*A22*sumI*sumIx*w1 + - 2*A11*sumI*sumIy*w2 - sumIx*sumIx*w2*w2 + A22*dI*sumIx*sumIx + 2*sumIx*sumIy*w1*w2 + - sumIy*sumIy*w1*w1 + A11*dI*sumIy*sumIy + A22*sumW*w1*w1 + A11*sumW*w2*w2 - A11*A22*dI*sumW; - // compute the Gradient Matrice - if( j == 0) + float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + + 4.f*A12*A12))/(2*winArea); + if( minEig < minEigThreshold || std::abs(D) < FLT_EPSILON ) { - float tale = normSigma2 * FLT_RESCALE; - if( abss < fParam0 || j < 0) - { - tale = FLT_RESCALE; - } - else if( abss > fParam1) - { - tale *= 0.01f; - } - else - { - tale *= normSigma2; - } - if( j == 0 ) + if( level == 0 && status ) + status[ptidx] = 0; + if( level > 0) { - A11 += (float)(ixval*ixval)*tale; - A12 += (float)(ixval*iyval)*tale; - A22 += (float)(iyval*iyval)*tale; + nextPts[ptidx] = backUpNextPt; + gainVecs[ptidx] = backUpGain; } - - dI += Iptr[x] * Iptr[x] * tale; - float dx = static_cast(dIptr[0]) * tale; - float dy = static_cast(dIptr[1]) * tale; - sumIx += dx; - sumIy += dy; - w1 += dx * Iptr[x]; - w2 += dy * Iptr[x]; - sumI += Iptr[x] * tale; - sumW += tale; + noIteration++; + break; } - } -#endif - } -#ifdef RLOF_SSE - short etaValues[8]; - _mm_storeu_si128((__m128i*)(etaValues), mmEta); - MEstimatorScale += eta * (etaValues[0] + etaValues[1] + etaValues[2] + etaValues[3] - + etaValues[4] + etaValues[5] + etaValues[6] + etaValues[7]); - float CV_DECL_ALIGNED(32) wbuf[4]; -#endif - if( j == 0 ) - { -#ifdef RLOF_SSE - _mm_store_ps(wbuf, mmSumW1); - w1 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumW2); - w2 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumDI); - dI = wbuf[0] + 
wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumI); - sumI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumIx); - sumIx = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumIy); - sumIy = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumW); - sumW = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; -#endif - sumIx *= -FLT_SCALE; - sumIy *= -FLT_SCALE; - sumI *=FLT_SCALE; - sumW *= FLT_SCALE; - w1 *= -FLT_SCALE; - w2 *= -FLT_SCALE; - dI *= FLT_SCALE; + D = (1.f / D); + invTensorMat(0,0) = (A22*sumI*sumI - 2*sumI*sumIy*w2 + dI*sumIy*sumIy + sumW*w2*w2 - A22*dI*sumW)* D; + invTensorMat(0,1) = (A12*dI*sumW - A12*sumI * sumI - dI*sumIx*sumIy + sumI*sumIx*w2 + sumI*sumIy*w1 - sumW*w1*w2)* D; + invTensorMat(0,2) = (A12*sumI*sumIy - sumIy*sumIy*w1 - A22*sumI*sumIx - A12*sumW*w2 + A22*sumW*w1 + sumIx*sumIy*w2)* D; + invTensorMat(0,3) = (A22*dI*sumIx - A12*dI*sumIy - sumIx*w2*w2 + A12*sumI*w2 - A22*sumI*w1 + sumIy*w1*w2) * D; + invTensorMat(1,0) = invTensorMat(0,1); + invTensorMat(1,1) = (A11*sumI * sumI - 2*sumI*sumIx*w1 + dI*sumIx * sumIx + sumW*w1*w1 - A11*dI*sumW) * D; + invTensorMat(1,2) = (A12*sumI*sumIx - A11*sumI*sumIy - sumIx * sumIx*w2 + A11*sumW*w2 - A12*sumW*w1 + sumIx*sumIy*w1) * D; + invTensorMat(1,3) = (A11*dI*sumIy - sumIy*w1*w1 - A12*dI*sumIx - A11*sumI*w2 + A12*sumI*w1 + sumIx*w1*w2)* D; + invTensorMat(2,0) = invTensorMat(0,2); + invTensorMat(2,1) = invTensorMat(1,2); + invTensorMat(2,2) = (sumW*A12*A12 - 2*A12*sumIx*sumIy + A22*sumIx*sumIx + A11*sumIy*sumIy - A11*A22*sumW)* D; + invTensorMat(2,3) = (A11*A22*sumI - A12*A12*sumI - A11*sumIy*w2 + A12*sumIx*w2 + A12*sumIy*w1 - A22*sumIx*w1)* D; + invTensorMat(3,0) = invTensorMat(0,3); + invTensorMat(3,1) = invTensorMat(1,3); + invTensorMat(3,2) = invTensorMat(2,3); + invTensorMat(3,3) = (dI*A12*A12 - 2*A12*w1*w2 + A22*w1*w1 + A11*w2*w2 - A11*A22*dI)* D; + } -#ifdef RLOF_SSE - float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];// - _mm_store_ps(A11buf, mmAxx); 
- _mm_store_ps(A12buf, mmAxy); - _mm_store_ps(A22buf, mmAyy); + #ifdef RLOF_SSE + float CV_DECL_ALIGNED(16) bbuf[4]; + for(int mmi = 0; mmi < 4; mmi++) + { + _mm_store_ps(bbuf, _mm_add_ps(qb0[mmi], qb1[mmi])); + _b0[mmi] = bbuf[0] + bbuf[2]; + _b1[mmi] = bbuf[1] + bbuf[3]; + _mm_store_ps(bbuf, qb2[mmi]); + _b2[mmi] = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; + _mm_store_ps(bbuf, qb3[mmi]); + _b3[mmi] = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; - A11 = A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; - A12 = A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; - A22 = A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; -#endif - A11 *= FLT_SCALE; // 54866744. - A12 *= FLT_SCALE; // -628764.00 - A22 *= FLT_SCALE; // 19730.000 - - D = - A12*A12*sumI*sumI + dI*sumW*A12*A12 + 2*A12*sumI*sumIx*w2 + 2*A12*sumI*sumIy*w1 - - 2*dI*A12*sumIx*sumIy - 2*sumW*A12*w1*w2 + A11*A22*sumI*sumI - 2*A22*sumI*sumIx*w1 - - 2*A11*sumI*sumIy*w2 - sumIx*sumIx*w2*w2 + A22*dI*sumIx*sumIx + 2*sumIx*sumIy*w1*w2 - - sumIy*sumIy*w1*w1 + A11*dI*sumIy*sumIy + A22*sumW*w1*w1 + A11*sumW*w2*w2 - A11*A22*dI*sumW; - - float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + - 4.f*A12*A12))/(2*winArea); - if( minEig < minEigThreshold || std::abs(D) < FLT_EPSILON ) - { - if( level == 0 && status ) - status[ptidx] = 0; - if( level > 0) - { - nextPts[ptidx] = backUpNextPt; - gainVecs[ptidx] = backUpGain; - } - noIteration++; - break; } + #endif + _b0[0] *= FLT_SCALE;_b0[1] *= FLT_SCALE;_b0[2] *= FLT_SCALE;_b0[3] *= FLT_SCALE; + _b1[0] *= FLT_SCALE;_b1[1] *= FLT_SCALE;_b1[2] *= FLT_SCALE;_b1[3] *= FLT_SCALE; + _b2[0] *= FLT_SCALE;_b2[1] *= FLT_SCALE;_b2[2] *= FLT_SCALE;_b2[3] *= FLT_SCALE; + _b3[0] *= FLT_SCALE;_b3[1] *= FLT_SCALE;_b3[2] *= FLT_SCALE;_b3[3] *= FLT_SCALE; - D = (1.f / D); - - invTensorMat(0,0) = (A22*sumI*sumI - 2*sumI*sumIy*w2 + dI*sumIy*sumIy + sumW*w2*w2 - A22*dI*sumW)* D; - invTensorMat(0,1) = (A12*dI*sumW - A12*sumI * sumI - dI*sumIx*sumIy + sumI*sumIx*w2 + sumI*sumIy*w1 - sumW*w1*w2)* D; - 
invTensorMat(0,2) = (A12*sumI*sumIy - sumIy*sumIy*w1 - A22*sumI*sumIx - A12*sumW*w2 + A22*sumW*w1 + sumIx*sumIy*w2)* D; - invTensorMat(0,3) = (A22*dI*sumIx - A12*dI*sumIy - sumIx*w2*w2 + A12*sumI*w2 - A22*sumI*w1 + sumIy*w1*w2) * D; - invTensorMat(1,0) = invTensorMat(0,1); - invTensorMat(1,1) = (A11*sumI * sumI - 2*sumI*sumIx*w1 + dI*sumIx * sumIx + sumW*w1*w1 - A11*dI*sumW) * D; - invTensorMat(1,2) = (A12*sumI*sumIx - A11*sumI*sumIy - sumIx * sumIx*w2 + A11*sumW*w2 - A12*sumW*w1 + sumIx*sumIy*w1) * D; - invTensorMat(1,3) = (A11*dI*sumIy - sumIy*w1*w1 - A12*dI*sumIx - A11*sumI*w2 + A12*sumI*w1 + sumIx*w1*w2)* D; - invTensorMat(2,0) = invTensorMat(0,2); - invTensorMat(2,1) = invTensorMat(1,2); - invTensorMat(2,2) = (sumW*A12*A12 - 2*A12*sumIx*sumIy + A22*sumIx*sumIx + A11*sumIy*sumIy - A11*A22*sumW)* D; - invTensorMat(2,3) = (A11*A22*sumI - A12*A12*sumI - A11*sumIy*w2 + A12*sumIx*w2 + A12*sumIy*w1 - A22*sumIx*w1)* D; - invTensorMat(3,0) = invTensorMat(0,3); - invTensorMat(3,1) = invTensorMat(1,3); - invTensorMat(3,2) = invTensorMat(2,3); - invTensorMat(3,3) = (dI*A12*A12 - 2*A12*w1*w2 + A22*w1*w1 + A11*w2*w2 - A11*A22*dI)* D; - } + Mc0[0] = _b0[0] - _b0[1] - _b0[2] + _b0[3]; + Mc0[1] = _b1[0] - _b1[1] - _b1[2] + _b1[3]; + Mc0[2] = -(_b2[0] - _b2[1] - _b2[2] + _b2[3]); + Mc0[3] = -(_b3[0] - _b3[1] - _b3[2] + _b3[3]); -#ifdef RLOF_SSE - float CV_DECL_ALIGNED(16) bbuf[4]; - for(int mmi = 0; mmi < 4; mmi++) - { + Mc1[0] = _b0[1] - _b0[3]; + Mc1[1] = _b1[1] - _b1[3]; + Mc1[2] = -(_b2[1] - _b2[3]); + Mc1[3] = -(_b3[1] - _b3[3]); - _mm_store_ps(bbuf, _mm_add_ps(qb0[mmi], qb1[mmi])); - _b0[mmi] = bbuf[0] + bbuf[2]; - _b1[mmi] = bbuf[1] + bbuf[3]; - _mm_store_ps(bbuf, qb2[mmi]); - _b2[mmi] = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; - _mm_store_ps(bbuf, qb3[mmi]); - _b3[mmi] = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; - } -#endif + Mc2[0] = _b0[2] - _b0[3]; + Mc2[1] = _b1[2] - _b1[3]; + Mc2[2] = -(_b2[2] - _b2[3]); + Mc2[3] = -(_b3[2] - _b3[3]); - _b0[0] *= FLT_SCALE;_b0[1] *= 
FLT_SCALE;_b0[2] *= FLT_SCALE;_b0[3] *= FLT_SCALE; - _b1[0] *= FLT_SCALE;_b1[1] *= FLT_SCALE;_b1[2] *= FLT_SCALE;_b1[3] *= FLT_SCALE; - _b2[0] *= FLT_SCALE;_b2[1] *= FLT_SCALE;_b2[2] *= FLT_SCALE;_b2[3] *= FLT_SCALE; - _b3[0] *= FLT_SCALE;_b3[1] *= FLT_SCALE;_b3[2] *= FLT_SCALE;_b3[3] *= FLT_SCALE; - - - Mc0[0] = _b0[0] - _b0[1] - _b0[2] + _b0[3]; - Mc0[1] = _b1[0] - _b1[1] - _b1[2] + _b1[3]; - Mc0[2] = -(_b2[0] - _b2[1] - _b2[2] + _b2[3]); - Mc0[3] = -(_b3[0] - _b3[1] - _b3[2] + _b3[3]); - - Mc1[0] = _b0[1] - _b0[3]; - Mc1[1] = _b1[1] - _b1[3]; - Mc1[2] = -(_b2[1] - _b2[3]); - Mc1[3] = -(_b3[1] - _b3[3]); - - - Mc2[0] = _b0[2] - _b0[3]; - Mc2[1] = _b1[2] - _b1[3]; - Mc2[2] = -(_b2[2] - _b2[3]); - Mc2[3] = -(_b3[2] - _b3[3]); - - - Mc3[0] = _b0[3]; - Mc3[1] = _b1[3]; - Mc3[2] = -_b2[3]; - Mc3[3] = -_b3[3]; - - // - c[0] = -Mc0[0]; - c[1] = -Mc1[0]; - c[2] = -Mc2[0]; - c[3] = -Mc3[0]; - c[4] = -Mc0[1]; - c[5] = -Mc1[1]; - c[6] = -Mc2[1]; - c[7] = -Mc3[1]; - - float e0 = 1.f / (c[6] * c[0] - c[4] * c[2]); - float e1 = e0 * 0.5f * (c[6] * c[1] + c[7] * c[0] - c[5] * c[2] - c[4] * c[3]); - float e2 = e0 * (c[1] * c[7] -c[3] * c[5]); - e0 = e1 * e1 - e2; - hasSolved = false; - if ( e0 > 0) - { - e0 = sqrt(e0); - float _y[2] = {-e1 - e0, e0 - e1}; - float c0yc1[2] = {c[0] * _y[0] + c[1], - c[0] * _y[1] + c[1]}; - float _x[2] = {-(c[2] * _y[0] + c[3]) / c0yc1[0], - -(c[2] * _y[1] + c[3]) / c0yc1[1]}; - bool isIn1 = (_x[0] >=0 && _x[0] <=1 && _y[0] >= 0 && _y[0] <= 1); - bool isIn2 = (_x[1] >=0 && _x[1] <=1 && _y[1] >= 0 && _y[1] <= 1); - - bool isSolution1 = checkSolution(_x[0], _y[0], c ); - bool isSolution2 = checkSolution(_x[1], _y[1], c ); - bool isSink1 = isIn1 && isSolution1; - bool isSink2 = isIn2 && isSolution2; - - if ( isSink1 != isSink2) + + Mc3[0] = _b0[3]; + Mc3[1] = _b1[3]; + Mc3[2] = -_b2[3]; + Mc3[3] = -_b3[3]; + + // + c[0] = -Mc0[0]; + c[1] = -Mc1[0]; + c[2] = -Mc2[0]; + c[3] = -Mc3[0]; + c[4] = -Mc0[1]; + c[5] = -Mc1[1]; + c[6] = -Mc2[1]; + c[7] = 
-Mc3[1]; + + float e0 = 1.f / (c[6] * c[0] - c[4] * c[2]); + float e1 = e0 * 0.5f * (c[6] * c[1] + c[7] * c[0] - c[5] * c[2] - c[4] * c[3]); + float e2 = e0 * (c[1] * c[7] -c[3] * c[5]); + e0 = e1 * e1 - e2; + hasSolved = false; + if ( e0 > 0) { - a = isSink1 ? _x[0] : _x[1]; - b = isSink1 ? _y[0] : _y[1]; - ab = a * b; - hasSolved = true; - delta.x = inextPt.x + a - nextPt.x; - delta.y = inextPt.y + b - nextPt.y; + e0 = sqrt(e0); + float _y[2] = {-e1 - e0, e0 - e1}; + float c0yc1[2] = {c[0] * _y[0] + c[1], + c[0] * _y[1] + c[1]}; + float _x[2] = {-(c[2] * _y[0] + c[3]) / c0yc1[0], + -(c[2] * _y[1] + c[3]) / c0yc1[1]}; + bool isIn1 = (_x[0] >=0 && _x[0] <=1 && _y[0] >= 0 && _y[0] <= 1); + bool isIn2 = (_x[1] >=0 && _x[1] <=1 && _y[1] >= 0 && _y[1] <= 1); - cv::Vec4f mismatchVec = ab * Mc0 + Mc1 *a + Mc2 * b + Mc3; - deltaGain = est_DeltaGain(invTensorMat, mismatchVec); + bool isSolution1 = checkSolution(_x[0], _y[0], c ); + bool isSolution2 = checkSolution(_x[1], _y[1], c ); + bool isSink1 = isIn1 && isSolution1; + bool isSink2 = isIn2 && isSolution2; - } // isIn1 != isIn2 - } + if ( isSink1 != isSink2) + { + a = isSink1 ? _x[0] : _x[1]; + b = isSink1 ? 
_y[0] : _y[1]; + ab = a * b; + hasSolved = true; + delta.x = inextPt.x + a - nextPt.x; + delta.y = inextPt.y + b - nextPt.y; + + cv::Vec4f mismatchVec = ab * Mc0 + Mc1 *a + Mc2 * b + Mc3; + deltaGain = est_DeltaGain(invTensorMat, mismatchVec); + + } // isIn1 != isIn2 + } if( hasSolved == false) noIteration++; } @@ -1469,14 +1460,14 @@ class TrackerInvoker : public cv::ParallelLoopBody prevGain = deltaGain; gainVec += deltaGain; nextPt += delta ; - nextPts[ptidx] = nextPt + halfWin; + nextPts[ptidx] = nextPt - halfWin; gainVecs[ptidx]= gainVec; } else { nextPt += delta; - nextPts[ptidx] = nextPt + halfWin; + nextPts[ptidx] = nextPt - halfWin; gainVecs[ptidx]= gainVec + deltaGain; noSolvedIteration++; break; @@ -1486,7 +1477,7 @@ class TrackerInvoker : public cv::ParallelLoopBody (std::abs(delta.x - prevDelta.x) < 0.01 && std::abs(delta.y - prevDelta.y) < 0.01) || ((delta.ddot(delta) <= 0.001) && std::abs(prevGain.x - deltaGain.x) < 0.01))) { - nextPts[ptidx] -= delta*0.5f; + nextPts[ptidx] += delta*0.5f; gainVecs[ptidx] -= deltaGain* 0.5f; break; } @@ -1620,12 +1611,13 @@ class TrackerInvoker : public cv::ParallelLoopBody minWinSize,maxWinSize) == false) continue; - prevPt -= halfWin; + halfWin = Point2f(static_cast(maxWinSize), static_cast(maxWinSize)) - halfWin; + prevPt += halfWin; iprevPt.x = cvFloor(prevPt.x); iprevPt.y = cvFloor(prevPt.y); - if( iprevPt.x + halfWin.x < 0 || iprevPt.x + halfWin.x>= derivI.cols || - iprevPt.y + halfWin.y < 0 || iprevPt.y + halfWin.y >= derivI.rows ) + if( iprevPt.x < 0 || iprevPt.x >= derivI.cols - winSize.width || + iprevPt.y < 0 || iprevPt.y >= derivI.rows - winSize.height - 1) { if( level == 0 ) { @@ -1783,7 +1775,7 @@ class TrackerInvoker : public cv::ParallelLoopBody D = 1.f/D; cv::Point2f backUpNextPt = nextPt; - nextPt -= halfWin; + nextPt += halfWin; Point2f prevDelta(0,0); float c[8]; @@ -1802,261 +1794,259 @@ class TrackerInvoker : public cv::ParallelLoopBody if( (inextPt.x != cvFloor(nextPt.x) || inextPt.y != 
cvFloor(nextPt.y) || j == 0)) { - inextPt.x = cvFloor(nextPt.x); - inextPt.y = cvFloor(nextPt.y); - - - if( inextPt.x < -winSize.width || inextPt.x >= J.cols || - inextPt.y < -winSize.height || inextPt.y >= J.rows ) - { - if( level == 0 && status ) - status[ptidx] = 3; - if (level > 0) - nextPts[ptidx] = backUpNextPt; - break; - } - + inextPt.x = cvFloor(nextPt.x); + inextPt.y = cvFloor(nextPt.y); + if( inextPt.x < 0 || inextPt.x >= J.cols - winSize.width || + inextPt.y < 0 || inextPt.y >= J.rows - winSize.height - 1) + { + if( level == 0 && status ) + status[ptidx] = 3; + if (level > 0) + nextPts[ptidx] = backUpNextPt; + break; + } - iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); - iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); - iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); - iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - float _b1[4] = {0,0,0,0}; - float _b2[4] = {0,0,0,0}; -#ifdef RLOF_SSE - __m128 qbc0[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; - __m128 qbc1[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; - qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); - qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); -#endif - for( y = 0; y < winSize.height; y++ ) - { - const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); - const uchar* Jptr1 = J.ptr(y + inextPt.y + 1, inextPt.x*cn); - const short* Iptr = IWinBuf.ptr(y, 0); - const short* dIptr = derivIWinBuf.ptr(y, 0); - x = 0; -#ifdef RLOF_SSE + iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - const tMaskType* maskPtr = winMaskMat.ptr(y, 0); - for( ; x <= winSize.width*cn; x += 8, dIptr += 8*2 ) + float _b1[4] = {0,0,0,0}; + float _b2[4] = {0,0,0,0}; + #ifdef RLOF_SSE + __m128 qbc0[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; + __m128 qbc1[4] = 
{_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; + qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + #endif + for( y = 0; y < winSize.height; y++ ) { - if( maskPtr[x ] == 0 && maskPtr[x+1] == 0 && maskPtr[x+2] == 0 && maskPtr[x+3] == 0 - && maskPtr[x+4] == 0 && maskPtr[x+5] == 0 && maskPtr[x+6] == 0 && maskPtr[x+7] == 0) - continue; - __m128i diff0 = _mm_loadu_si128((const __m128i*)(Iptr + x)); - __m128i v00 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x)), z); - __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); - __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x)), z); - __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x + cn)), z); - - __m128i mmDiffc_epi16[4] = - { _mm_subs_epi16(_mm_slli_epi16(v00, 5), diff0), - _mm_subs_epi16(_mm_slli_epi16(v01, 5), diff0), - _mm_subs_epi16(_mm_slli_epi16(v10, 5), diff0), - _mm_subs_epi16(_mm_slli_epi16(v11, 5), diff0) - }; - - __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... 
- __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); - - if( x > winSize.width*cn - 8) + const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const uchar* Jptr1 = J.ptr(y + inextPt.y + 1, inextPt.x*cn); + const short* Iptr = IWinBuf.ptr(y, 0); + const short* dIptr = derivIWinBuf.ptr(y, 0); + x = 0; + #ifdef RLOF_SSE + + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); + for( ; x <= winSize.width*cn; x += 8, dIptr += 8*2 ) { - Ixy_0 = _mm_and_si128(Ixy_0, mmMask0); - Ixy_1 = _mm_and_si128(Ixy_1, mmMask1); - } + if( maskPtr[x ] == 0 && maskPtr[x+1] == 0 && maskPtr[x+2] == 0 && maskPtr[x+3] == 0 + && maskPtr[x+4] == 0 && maskPtr[x+5] == 0 && maskPtr[x+6] == 0 && maskPtr[x+7] == 0) + continue; + __m128i diff0 = _mm_loadu_si128((const __m128i*)(Iptr + x)); + __m128i v00 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x)), z); + __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); + __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x)), z); + __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x + cn)), z); + + __m128i mmDiffc_epi16[4] = + { _mm_subs_epi16(_mm_slli_epi16(v00, 5), diff0), + _mm_subs_epi16(_mm_slli_epi16(v01, 5), diff0), + _mm_subs_epi16(_mm_slli_epi16(v10, 5), diff0), + _mm_subs_epi16(_mm_slli_epi16(v11, 5), diff0) + }; + + __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... 
+ __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); + + if( x > winSize.width*cn - 8) + { + Ixy_0 = _mm_and_si128(Ixy_0, mmMask0); + Ixy_1 = _mm_and_si128(Ixy_1, mmMask1); + } - __m128i diffc1[4] = - {_mm_unpackhi_epi16(mmDiffc_epi16[0],mmDiffc_epi16[0]), - _mm_unpackhi_epi16(mmDiffc_epi16[1],mmDiffc_epi16[1]), - _mm_unpackhi_epi16(mmDiffc_epi16[2],mmDiffc_epi16[2]), - _mm_unpackhi_epi16(mmDiffc_epi16[3],mmDiffc_epi16[3]) - }; - - __m128i diffc0[4] = - {_mm_unpacklo_epi16(mmDiffc_epi16[0],mmDiffc_epi16[0]), - _mm_unpacklo_epi16(mmDiffc_epi16[1],mmDiffc_epi16[1]), - _mm_unpacklo_epi16(mmDiffc_epi16[2],mmDiffc_epi16[2]), - _mm_unpacklo_epi16(mmDiffc_epi16[3],mmDiffc_epi16[3]) - }; - - - // It * Ix It * Iy [0 ... 3] - //mask split for multiplication - // for set 1 lo sigma 1 - - - v10 = _mm_mullo_epi16(Ixy_0, diffc0[0]); - v11 = _mm_mulhi_epi16(Ixy_0, diffc0[0]); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qbc0[0] = _mm_add_ps(qbc0[0], _mm_cvtepi32_ps(v00)); - qbc1[0] = _mm_add_ps(qbc1[0], _mm_cvtepi32_ps(v10)); - - v10 = _mm_mullo_epi16(Ixy_0, diffc0[1]); - v11 = _mm_mulhi_epi16(Ixy_0, diffc0[1]); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qbc0[1] = _mm_add_ps(qbc0[1], _mm_cvtepi32_ps(v00)); - qbc1[1] = _mm_add_ps(qbc1[1], _mm_cvtepi32_ps(v10)); - - v10 = _mm_mullo_epi16(Ixy_0, diffc0[2]); - v11 = _mm_mulhi_epi16(Ixy_0, diffc0[2]); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qbc0[2] = _mm_add_ps(qbc0[2], _mm_cvtepi32_ps(v00)); - qbc1[2] = _mm_add_ps(qbc1[2], _mm_cvtepi32_ps(v10)); - - v10 = _mm_mullo_epi16(Ixy_0, diffc0[3]); - v11 = _mm_mulhi_epi16(Ixy_0, diffc0[3]); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qbc0[3] = _mm_add_ps(qbc0[3], _mm_cvtepi32_ps(v00)); - qbc1[3] = _mm_add_ps(qbc1[3], _mm_cvtepi32_ps(v10)); - // It * Ix It * Iy [4 ... 
7] - // for set 1 hi sigma 1 - - v10 = _mm_mullo_epi16(Ixy_1, diffc1[0]); - v11 = _mm_mulhi_epi16(Ixy_1, diffc1[0]); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qbc0[0] = _mm_add_ps(qbc0[0], _mm_cvtepi32_ps(v00)); - qbc1[0] = _mm_add_ps(qbc1[0], _mm_cvtepi32_ps(v10)); - - v10 = _mm_mullo_epi16(Ixy_1, diffc1[1]); - v11 = _mm_mulhi_epi16(Ixy_1, diffc1[1]); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qbc0[1] = _mm_add_ps(qbc0[1], _mm_cvtepi32_ps(v00)); - qbc1[1] = _mm_add_ps(qbc1[1], _mm_cvtepi32_ps(v10)); - - v10 = _mm_mullo_epi16(Ixy_1, diffc1[2]); - v11 = _mm_mulhi_epi16(Ixy_1, diffc1[2]); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qbc0[2] = _mm_add_ps(qbc0[2], _mm_cvtepi32_ps(v00)); - qbc1[2] = _mm_add_ps(qbc1[2], _mm_cvtepi32_ps(v10)); - - v10 = _mm_mullo_epi16(Ixy_1, diffc1[3]); - v11 = _mm_mulhi_epi16(Ixy_1, diffc1[3]); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qbc0[3] = _mm_add_ps(qbc0[3], _mm_cvtepi32_ps(v00)); - qbc1[3] = _mm_add_ps(qbc1[3], _mm_cvtepi32_ps(v10)); - - } -#else - for( ; x < winSize.width*cn; x++, dIptr += 2 ) - { - if( dIptr[0] == 0 && dIptr[1] == 0) - continue; - short It[4] = {(Jptr[x] << 5) - Iptr[x], - (Jptr[x+cn]<< 5) - Iptr[x], - (Jptr1[x]<< 5) - Iptr[x], - (Jptr1[x+cn]<< 5) - Iptr[x]}; - - _b1[0] += (float)(It[0]*dIptr[0]); - _b1[1] += (float)(It[1]*dIptr[0]); - _b1[2] += (float)(It[2]*dIptr[0]); - _b1[3] += (float)(It[3]*dIptr[0]); - - _b2[0] += (float)(It[0]*dIptr[1]); - _b2[1] += (float)(It[1]*dIptr[1]); - _b2[2] += (float)(It[2]*dIptr[1]); - _b2[3] += (float)(It[3]*dIptr[1]); - } -#endif + __m128i diffc1[4] = + {_mm_unpackhi_epi16(mmDiffc_epi16[0],mmDiffc_epi16[0]), + _mm_unpackhi_epi16(mmDiffc_epi16[1],mmDiffc_epi16[1]), + _mm_unpackhi_epi16(mmDiffc_epi16[2],mmDiffc_epi16[2]), + _mm_unpackhi_epi16(mmDiffc_epi16[3],mmDiffc_epi16[3]) + }; - } + __m128i diffc0[4] = + 
{_mm_unpacklo_epi16(mmDiffc_epi16[0],mmDiffc_epi16[0]), + _mm_unpacklo_epi16(mmDiffc_epi16[1],mmDiffc_epi16[1]), + _mm_unpacklo_epi16(mmDiffc_epi16[2],mmDiffc_epi16[2]), + _mm_unpacklo_epi16(mmDiffc_epi16[3],mmDiffc_epi16[3]) + }; -#ifdef RLOF_SSE - float CV_DECL_ALIGNED(16) bbuf[4]; - _mm_store_ps(bbuf, _mm_add_ps(qbc0[0], qbc1[0])); - _b1[0] += bbuf[0] + bbuf[2]; - _b2[0] += bbuf[1] + bbuf[3]; - _mm_store_ps(bbuf, _mm_add_ps(qbc0[1], qbc1[1])); - _b1[1] += bbuf[0] + bbuf[2]; - _b2[1] += bbuf[1] + bbuf[3]; + // It * Ix It * Iy [0 ... 3] + //mask split for multiplication + // for set 1 lo sigma 1 - _mm_store_ps(bbuf, _mm_add_ps(qbc0[2], qbc1[2])); - _b1[2] += bbuf[0] + bbuf[2]; - _b2[2] += bbuf[1] + bbuf[3]; - _mm_store_ps(bbuf, _mm_add_ps(qbc0[3], qbc1[3])); - _b1[3] += bbuf[0] + bbuf[2]; - _b2[3] += bbuf[1] + bbuf[3]; + v10 = _mm_mullo_epi16(Ixy_0, diffc0[0]); + v11 = _mm_mulhi_epi16(Ixy_0, diffc0[0]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[0] = _mm_add_ps(qbc0[0], _mm_cvtepi32_ps(v00)); + qbc1[0] = _mm_add_ps(qbc1[0], _mm_cvtepi32_ps(v10)); -#endif - _b1[0] *= FLT_SCALE; - _b1[1] *= FLT_SCALE; - _b1[2] *= FLT_SCALE; - _b1[3] *= FLT_SCALE; - _b2[0] *= FLT_SCALE; - _b2[1] *= FLT_SCALE; - _b2[2] *= FLT_SCALE; - _b2[3] *= FLT_SCALE; - - float c0[4] = { _b1[3] + _b1[0] - _b1[2] - _b1[1], - _b1[1] - _b1[0], - _b1[2] - _b1[0], - _b1[0]}; - float c1[4] = { _b2[3] + _b2[0] - _b2[2] - _b2[1], - _b2[1] - _b2[0], - _b2[2] - _b2[0], - _b2[0]}; - - float DA12 = A12 * D ; - float DA22 = A22 * D ; - float DA11 = A11 * D ; - c[0] = DA12 * c1[0] - DA22 * c0[0]; - c[1] = DA12 * c1[1] - DA22 * c0[1]; - c[2] = DA12 * c1[2] - DA22 * c0[2]; - c[3] = DA12 * c1[3] - DA22 * c0[3]; - c[4] = DA12 * c0[0] - DA11 * c1[0]; - c[5] = DA12 * c0[1] - DA11 * c1[1]; - c[6] = DA12 * c0[2] - DA11 * c1[2]; - c[7] = DA12 * c0[3] - DA11 * c1[3]; - - float e0 = 1.f / (c[6] * c[0] - c[4] * c[2]); - float e1 = e0 * 0.5f * (c[6] * c[1] + c[7] * c[0] - c[5] * c[2] 
- c[4] * c[3]); - float e2 = e0 * (c[1] * c[7] -c[3] * c[5]); - e0 = e1 * e1 - e2; - if ( e0 >= 0) - { - e0 = sqrt(e0); - float _y[2] = {-e1 - e0, e0 - e1}; - float c0yc1[2] = {c[0] * _y[0] + c[1], - c[0] * _y[1] + c[1]}; + v10 = _mm_mullo_epi16(Ixy_0, diffc0[1]); + v11 = _mm_mulhi_epi16(Ixy_0, diffc0[1]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[1] = _mm_add_ps(qbc0[1], _mm_cvtepi32_ps(v00)); + qbc1[1] = _mm_add_ps(qbc1[1], _mm_cvtepi32_ps(v10)); + + v10 = _mm_mullo_epi16(Ixy_0, diffc0[2]); + v11 = _mm_mulhi_epi16(Ixy_0, diffc0[2]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[2] = _mm_add_ps(qbc0[2], _mm_cvtepi32_ps(v00)); + qbc1[2] = _mm_add_ps(qbc1[2], _mm_cvtepi32_ps(v10)); - float _x[2] = {-(c[2] * _y[0] + c[3]) / c0yc1[0], - -(c[2] * _y[1] + c[3]) / c0yc1[1]}; + v10 = _mm_mullo_epi16(Ixy_0, diffc0[3]); + v11 = _mm_mulhi_epi16(Ixy_0, diffc0[3]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[3] = _mm_add_ps(qbc0[3], _mm_cvtepi32_ps(v00)); + qbc1[3] = _mm_add_ps(qbc1[3], _mm_cvtepi32_ps(v10)); + // It * Ix It * Iy [4 ... 
7] + // for set 1 hi sigma 1 - bool isIn1 = (_x[0] >=0 && _x[0] <=1 && _y[0] >= 0 && _y[0] <= 1); - bool isIn2 = (_x[1] >=0 && _x[1] <=1 && _y[1] >= 0 && _y[1] <= 1); + v10 = _mm_mullo_epi16(Ixy_1, diffc1[0]); + v11 = _mm_mulhi_epi16(Ixy_1, diffc1[0]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[0] = _mm_add_ps(qbc0[0], _mm_cvtepi32_ps(v00)); + qbc1[0] = _mm_add_ps(qbc1[0], _mm_cvtepi32_ps(v10)); + v10 = _mm_mullo_epi16(Ixy_1, diffc1[1]); + v11 = _mm_mulhi_epi16(Ixy_1, diffc1[1]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[1] = _mm_add_ps(qbc0[1], _mm_cvtepi32_ps(v00)); + qbc1[1] = _mm_add_ps(qbc1[1], _mm_cvtepi32_ps(v10)); - bool isSink1 = isIn1 && checkSolution(_x[0], _y[0], c ); - bool isSink2 = isIn2 && checkSolution(_x[1], _y[1], c ); + v10 = _mm_mullo_epi16(Ixy_1, diffc1[2]); + v11 = _mm_mulhi_epi16(Ixy_1, diffc1[2]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[2] = _mm_add_ps(qbc0[2], _mm_cvtepi32_ps(v00)); + qbc1[2] = _mm_add_ps(qbc1[2], _mm_cvtepi32_ps(v10)); + v10 = _mm_mullo_epi16(Ixy_1, diffc1[3]); + v11 = _mm_mulhi_epi16(Ixy_1, diffc1[3]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[3] = _mm_add_ps(qbc0[3], _mm_cvtepi32_ps(v00)); + qbc1[3] = _mm_add_ps(qbc1[3], _mm_cvtepi32_ps(v10)); - //if( isSink1 != isSink2 ) - { - if( isSink1 ) + } + #else + for( ; x < winSize.width*cn; x++, dIptr += 2 ) { - hasSolved = true; - delta.x = inextPt.x + _x[0] - nextPt.x; - delta.y = inextPt.y + _y[0] - nextPt.y; + if( dIptr[0] == 0 && dIptr[1] == 0) + continue; + short It[4] = {(Jptr[x] << 5) - Iptr[x], + (Jptr[x+cn]<< 5) - Iptr[x], + (Jptr1[x]<< 5) - Iptr[x], + (Jptr1[x+cn]<< 5) - Iptr[x]}; + + _b1[0] += (float)(It[0]*dIptr[0]); + _b1[1] += (float)(It[1]*dIptr[0]); + _b1[2] += (float)(It[2]*dIptr[0]); + _b1[3] += (float)(It[3]*dIptr[0]); + + _b2[0] += (float)(It[0]*dIptr[1]); + _b2[1] += (float)(It[1]*dIptr[1]); + 
_b2[2] += (float)(It[2]*dIptr[1]); + _b2[3] += (float)(It[3]*dIptr[1]); } - if (isSink2 ) + #endif + + } + + #ifdef RLOF_SSE + float CV_DECL_ALIGNED(16) bbuf[4]; + _mm_store_ps(bbuf, _mm_add_ps(qbc0[0], qbc1[0])); + _b1[0] += bbuf[0] + bbuf[2]; + _b2[0] += bbuf[1] + bbuf[3]; + + _mm_store_ps(bbuf, _mm_add_ps(qbc0[1], qbc1[1])); + _b1[1] += bbuf[0] + bbuf[2]; + _b2[1] += bbuf[1] + bbuf[3]; + + _mm_store_ps(bbuf, _mm_add_ps(qbc0[2], qbc1[2])); + _b1[2] += bbuf[0] + bbuf[2]; + _b2[2] += bbuf[1] + bbuf[3]; + + _mm_store_ps(bbuf, _mm_add_ps(qbc0[3], qbc1[3])); + _b1[3] += bbuf[0] + bbuf[2]; + _b2[3] += bbuf[1] + bbuf[3]; + + #endif + _b1[0] *= FLT_SCALE; + _b1[1] *= FLT_SCALE; + _b1[2] *= FLT_SCALE; + _b1[3] *= FLT_SCALE; + _b2[0] *= FLT_SCALE; + _b2[1] *= FLT_SCALE; + _b2[2] *= FLT_SCALE; + _b2[3] *= FLT_SCALE; + + float c0[4] = { _b1[3] + _b1[0] - _b1[2] - _b1[1], + _b1[1] - _b1[0], + _b1[2] - _b1[0], + _b1[0]}; + float c1[4] = { _b2[3] + _b2[0] - _b2[2] - _b2[1], + _b2[1] - _b2[0], + _b2[2] - _b2[0], + _b2[0]}; + + float DA12 = A12 * D ; + float DA22 = A22 * D ; + float DA11 = A11 * D ; + c[0] = DA12 * c1[0] - DA22 * c0[0]; + c[1] = DA12 * c1[1] - DA22 * c0[1]; + c[2] = DA12 * c1[2] - DA22 * c0[2]; + c[3] = DA12 * c1[3] - DA22 * c0[3]; + c[4] = DA12 * c0[0] - DA11 * c1[0]; + c[5] = DA12 * c0[1] - DA11 * c1[1]; + c[6] = DA12 * c0[2] - DA11 * c1[2]; + c[7] = DA12 * c0[3] - DA11 * c1[3]; + + float e0 = 1.f / (c[6] * c[0] - c[4] * c[2]); + float e1 = e0 * 0.5f * (c[6] * c[1] + c[7] * c[0] - c[5] * c[2] - c[4] * c[3]); + float e2 = e0 * (c[1] * c[7] -c[3] * c[5]); + e0 = e1 * e1 - e2; + if ( e0 >= 0) + { + e0 = sqrt(e0); + float _y[2] = {-e1 - e0, e0 - e1}; + float c0yc1[2] = {c[0] * _y[0] + c[1], + c[0] * _y[1] + c[1]}; + + float _x[2] = {-(c[2] * _y[0] + c[3]) / c0yc1[0], + -(c[2] * _y[1] + c[3]) / c0yc1[1]}; + + bool isIn1 = (_x[0] >=0 && _x[0] <=1 && _y[0] >= 0 && _y[0] <= 1); + bool isIn2 = (_x[1] >=0 && _x[1] <=1 && _y[1] >= 0 && _y[1] <= 1); + + + bool isSink1 = 
isIn1 && checkSolution(_x[0], _y[0], c ); + bool isSink2 = isIn2 && checkSolution(_x[1], _y[1], c ); + + + //if( isSink1 != isSink2 ) { - hasSolved = true; - delta.x = inextPt.x + _x[1] - nextPt.x; - delta.y = inextPt.y + _y[1] - nextPt.y; - } - } // isIn1 != isIn2 - } // e0 >= 0 + if( isSink1 ) + { + hasSolved = true; + delta.x = inextPt.x + _x[0] - nextPt.x; + delta.y = inextPt.y + _y[0] - nextPt.y; + } + if (isSink2 ) + { + hasSolved = true; + delta.x = inextPt.x + _x[1] - nextPt.x; + delta.y = inextPt.y + _y[1] - nextPt.y; + } + } // isIn1 != isIn2 + } // e0 >= 0 } else hasSolved = false; @@ -2065,12 +2055,12 @@ class TrackerInvoker : public cv::ParallelLoopBody delta = Point2f( c[0] * ab + c[1] * a + c[2] * b + c[3], c[4] * ab + c[5] * a + c[6] * b + c[7]); nextPt += 0.7 * delta; - nextPts[ptidx] = nextPt + halfWin; + nextPts[ptidx] = nextPt - halfWin; } else { nextPt += delta; - nextPts[ptidx] = nextPt + halfWin; + nextPts[ptidx] = nextPt - halfWin; break; } diff --git a/modules/optflow/src/rlof/plk_invoker.hpp b/modules/optflow/src/rlof/plk_invoker.hpp index 3f76ce460a4..3c0bad6f963 100644 --- a/modules/optflow/src/rlof/plk_invoker.hpp +++ b/modules/optflow/src/rlof/plk_invoker.hpp @@ -61,9 +61,6 @@ class TrackerInvoker : public cv::ParallelLoopBody void operator()(const cv::Range& range) const CV_OVERRIDE { -#ifdef DEBUG_INVOKER - printf("plk::radial");fflush(stdout); -#endif cv::Size winSize; cv::Point2f halfWin; @@ -119,14 +116,14 @@ class TrackerInvoker : public cv::ParallelLoopBody winMaskMat,winSize,halfWin,winArea, minWinSize,maxWinSize) == false) continue; - - prevPt -= halfWin; + halfWin = Point2f(static_cast(maxWinSize) ,static_cast(maxWinSize) ) - halfWin; + prevPt += halfWin; iprevPt.x = cvFloor(prevPt.x); iprevPt.y = cvFloor(prevPt.y); - if( iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols || - iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows ) + if( iprevPt.x < 0 || iprevPt.x >= derivI.cols - winSize.width || + iprevPt.y < 0 || 
iprevPt.y >= derivI.rows - winSize.height - 1) { if( level == 0 ) { @@ -243,7 +240,7 @@ class TrackerInvoker : public cv::ParallelLoopBody } cv::Point2f backUpNextPt = nextPt; - nextPt -= halfWin; + nextPt += halfWin; Point2f prevDelta(0,0); //relates to h(t-1) Point2f prevGain(1,0); cv::Point2f gainVec = gainVecs[ptidx]; @@ -255,9 +252,8 @@ class TrackerInvoker : public cv::ParallelLoopBody inextPt.x = cvFloor(nextPt.x); inextPt.y = cvFloor(nextPt.y); - - if( inextPt.x + halfWin.x < 0 || inextPt.x + halfWin.x>= derivI.cols || - inextPt.y + halfWin.y < 0 || inextPt.y + halfWin.y >= derivI.rows ) + if( inextPt.x < 0 || inextPt.x >= J.cols - winSize.width || + inextPt.y < 0 || inextPt.y >= J.rows - winSize.height - 1) { if( level == 0 && status ) status[ptidx] = 3; @@ -606,7 +602,7 @@ class TrackerInvoker : public cv::ParallelLoopBody if( j == 0) prevGain = deltaGain; nextPt += delta; - nextPts[ptidx] = nextPt + halfWin; + nextPts[ptidx] = nextPt - halfWin; gainVecs[ptidx]= gainVec; if( delta.ddot(delta) <= criteria.epsilon) break; @@ -709,9 +705,6 @@ class TrackerInvoker : public cv::ParallelLoopBody void operator()(const cv::Range& range) const CV_OVERRIDE { -#ifdef DEBUG_INVOKER - printf("plk::ica");fflush(stdout); -#endif cv::Size winSize; cv::Point2f halfWin; @@ -762,15 +755,13 @@ class TrackerInvoker : public cv::ParallelLoopBody winMaskMat,winSize,halfWin,winArea, minWinSize,maxWinSize) == false) continue; - - - prevPt -= halfWin; + halfWin = Point2f(static_cast(maxWinSize), static_cast(maxWinSize)) - halfWin; + prevPt += halfWin; iprevPt.x = cvFloor(prevPt.x); iprevPt.y = cvFloor(prevPt.y); - - if( iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols || - iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows ) + if( iprevPt.x < 0 || iprevPt.x >= derivI.cols - winSize.width || + iprevPt.y < 0 || iprevPt.y >= derivI.rows - winSize.height - 1) { if( level == 0 ) { @@ -927,7 +918,7 @@ class TrackerInvoker : public cv::ParallelLoopBody D = 1.f/D; - nextPt -= 
halfWin; + nextPt += halfWin; Point2f prevDelta(0,0); //relates to h(t-1) #ifdef RLOF_SSE __m128i mmMask0, mmMask1, mmMask; @@ -939,9 +930,8 @@ class TrackerInvoker : public cv::ParallelLoopBody inextPt.x = cvFloor(nextPt.x); inextPt.y = cvFloor(nextPt.y); - - if( inextPt.x + halfWin.x < 0 || inextPt.x + halfWin.x>= derivI.cols || - inextPt.y + halfWin.y < 0 || inextPt.y + halfWin.y >= derivI.rows ) + if( inextPt.x < 0 || inextPt.x >= J.cols - winSize.width || + inextPt.y < 0 || inextPt.y >= J.rows - winSize.height - 1) { if( level == 0 && status ) status[ptidx] = 3; @@ -1054,7 +1044,7 @@ class TrackerInvoker : public cv::ParallelLoopBody nextPt += delta; - nextPts[ptidx] = nextPt + halfWin; + nextPts[ptidx] = nextPt - halfWin; if( delta.ddot(delta) <= criteria.epsilon) break; diff --git a/modules/optflow/src/rlof/rlof_invoker.hpp b/modules/optflow/src/rlof/rlof_invoker.hpp index 38648871273..8dd0ecccd75 100644 --- a/modules/optflow/src/rlof/rlof_invoker.hpp +++ b/modules/optflow/src/rlof/rlof_invoker.hpp @@ -59,9 +59,6 @@ class TrackerInvoker : public cv::ParallelLoopBody void operator()(const cv::Range& range) const CV_OVERRIDE { -#ifdef DEBUG_INVOKER - printf("rlof::ica");fflush(stdout); -#endif cv::Size winSize; cv::Point2f halfWin; @@ -110,14 +107,12 @@ class TrackerInvoker : public cv::ParallelLoopBody winMaskMat, winSize, halfWin, winArea, minWinSize, maxWinSize) == false) continue; - - - prevPt -= halfWin; + halfWin = Point2f(static_cast(maxWinSize) ,static_cast(maxWinSize) ) - halfWin; + prevPt += halfWin; iprevPt.x = cvFloor(prevPt.x); iprevPt.y = cvFloor(prevPt.y); - - if (iprevPt.x + halfWin.x < 0 || iprevPt.x + halfWin.x >= derivI.cols || - iprevPt.y + halfWin.y < 0 || iprevPt.y + halfWin.y >= derivI.rows) + if( iprevPt.x < 0 || iprevPt.x >= derivI.cols - winSize.width || + iprevPt.y < 0 || iprevPt.y >= derivI.rows - winSize.height - 1) { if (level == 0) { @@ -234,7 +229,7 @@ class TrackerInvoker : public cv::ParallelLoopBody cv::Mat residualMat = 
cv::Mat::zeros(winSize.height * (winSize.width + 8) * cn, 1, CV_16SC1); cv::Point2f backUpNextPt = nextPt; - nextPt -= halfWin; + nextPt += halfWin; int j; #ifdef RLOF_SSE __m128i mmMask0, mmMask1, mmMask; @@ -249,8 +244,8 @@ class TrackerInvoker : public cv::ParallelLoopBody inextPt.x = cvFloor(nextPt.x); inextPt.y = cvFloor(nextPt.y); - if (inextPt.x < -winSize.width || inextPt.x >= J.cols || - inextPt.y < -winSize.height || inextPt.y >= J.rows) + if( inextPt.x < 0 || inextPt.x >= J.cols - winSize.width || + inextPt.y < 0 || inextPt.y >= J.rows - winSize.height - 1) { if (level == 0 && status) status[ptidx] = 3; @@ -584,7 +579,7 @@ class TrackerInvoker : public cv::ParallelLoopBody delta.y = (delta.y != delta.y) ? 0 : delta.y; nextPt += delta * 0.7; - nextPts[ptidx] = nextPt + halfWin; + nextPts[ptidx] = nextPt - halfWin; if (j > 0 && std::abs(delta.x - prevDelta.x) < 0.01 && std::abs(delta.y - prevDelta.y) < 0.01) @@ -739,13 +734,12 @@ class TrackerInvoker : public cv::ParallelLoopBody winMaskMat, winSize, halfWin, winArea, minWinSize, maxWinSize) == false) continue; - - prevPt -= halfWin; + halfWin = Point2f(static_cast(maxWinSize) ,static_cast(maxWinSize) ) - halfWin; + prevPt += halfWin; iprevPt.x = cvFloor(prevPt.x); iprevPt.y = cvFloor(prevPt.y); - - if (iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols || - iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows) + if( iprevPt.x < 0 || iprevPt.x >= derivI.cols - winSize.width || + iprevPt.y < 0 || iprevPt.y >= derivI.rows - winSize.height - 1) { if (level == 0) { @@ -870,7 +864,7 @@ class TrackerInvoker : public cv::ParallelLoopBody cv::Mat residualMat = cv::Mat::zeros(winSize.height * (winSize.width + 8) * cn, 1, CV_16SC1); cv::Point2f backUpNextPt = nextPt; - nextPt -= halfWin; + nextPt += halfWin; Point2f prevDelta(0, 0); //related to h(t-1) Point2f prevGain(0, 0); cv::Point2f gainVec = gainVecs[ptidx]; @@ -892,8 +886,8 @@ class TrackerInvoker : public cv::ParallelLoopBody inextPt.x = 
cvFloor(nextPt.x); inextPt.y = cvFloor(nextPt.y); - if (inextPt.x < -winSize.width || inextPt.x >= J.cols || - inextPt.y < -winSize.height || inextPt.y >= J.rows) + if( inextPt.x < 0 || inextPt.x >= J.cols - winSize.width || + inextPt.y < 0 || inextPt.y >= J.rows - winSize.height - 1) { if (level == 0 && status) status[ptidx] = 3; @@ -1382,7 +1376,7 @@ class TrackerInvoker : public cv::ParallelLoopBody prevGain = deltaGain; gainVec += deltaGain * 0.8; nextPt += delta * 0.8; - nextPts[ptidx] = nextPt + halfWin; + nextPts[ptidx] = nextPt - halfWin; gainVecs[ptidx] = gainVec; if ( diff --git a/modules/optflow/src/rlof/rlof_invokerbase.hpp b/modules/optflow/src/rlof/rlof_invokerbase.hpp index 470f4d87ad7..a7d09ceaff3 100644 --- a/modules/optflow/src/rlof/rlof_invokerbase.hpp +++ b/modules/optflow/src/rlof/rlof_invokerbase.hpp @@ -58,20 +58,6 @@ typedef uchar tMaskType; #define tCVMaskType CV_8UC1 #define MaskSet 0xffffffff -static inline bool notSameColor(const cv::Point3_ & ref, int r, int c, const cv::Mat & img, int winSize, int threshold) -{ - if (r >= img.rows + winSize || c >= img.cols + winSize || r < -winSize || c < -winSize) return true; - int step = static_cast(img.step1()); - const cv::Point3_ * tval = img.ptr>(); - tval += c; - tval += r * step / 3; - int R = std::abs((int)ref.x - (int)tval->x); - int G = std::abs((int)ref.y - (int)tval->y); - int B = std::abs((int)ref.z - (int)tval->z); - int diff = MAX(R, MAX(G, B)); - return diff > threshold; -} - static void getLocalPatch( const cv::Mat & src, diff --git a/modules/optflow/src/rlof/rlof_localflow.cpp b/modules/optflow/src/rlof/rlof_localflow.cpp index 8689aa3ed9f..e1e59860926 100644 --- a/modules/optflow/src/rlof/rlof_localflow.cpp +++ b/modules/optflow/src/rlof/rlof_localflow.cpp @@ -4,7 +4,7 @@ #include "../precomp.hpp" #include "opencv2/calib3d.hpp" // findHomography - +#include "opencv2/highgui.hpp" #include "rlof_localflow.h" #include "berlof_invoker.hpp" #include "rlof_invoker.hpp" @@ -156,13 
+156,13 @@ class HorizontalCrossSegmentation : public cv::ParallelLoopBody { if( m_mask->at(r,c) == 0) continue; - const cv::Point3_ & ucval = m_data->at>(r,c); - cv::Point3i val(static_cast(ucval.x), static_cast(ucval.y), static_cast(ucval.z)); + const Point3_ & ucval = m_data->at>(r,c); + Point3i val(static_cast(ucval.x), static_cast(ucval.y), static_cast(ucval.z)); int x = c - hWinSize; - cv::Point dstPos = m_stride ? cv::Point(r,c) : cv::Point(c,r); + Point dstPos = m_stride ? Point(r,c) : Point(c,r); for(int ix = 0; ix < m_winSize; ix++, x++) { - const cv::Point3_ & valref = m_data->at>(r,x); + const Point3_ & valref = m_data->at>(r,x); differenz[ix] = MAX(std::abs(static_cast(valref.x) - val.x), MAX(std::abs(static_cast(valref.y) - val.y), (std::abs(static_cast(valref.z) - val.z)))); @@ -460,6 +460,9 @@ void calcLocalOpticalFlowCore( tRGBPrevPyr = preCrossMap; } // apply plk like tracker + prevImage.adjustROI(iWinSize, iWinSize, iWinSize, iWinSize); + currImage.adjustROI(iWinSize, iWinSize, iWinSize, iWinSize); + derivI.adjustROI(iWinSize, iWinSize, iWinSize, iWinSize); if (isrobust(param) == false) { if (param.useIlluminationModel) diff --git a/modules/optflow/test/test_OF_accuracy.cpp b/modules/optflow/test/test_OF_accuracy.cpp index 2e8940b3a2c..5d093cd49c3 100644 --- a/modules/optflow/test/test_OF_accuracy.cpp +++ b/modules/optflow/test/test_OF_accuracy.cpp @@ -198,7 +198,7 @@ TEST(SparseOpticalFlow, ReferenceAccuracy) param->solverType = ST_BILINEAR; algo->setRLOFOpticalFlowParameter(param); algo->calc(frame1, frame2, prevPts, currPts, status, err); - EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.27f); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.3f); param->solverType = ST_STANDART; algo->setRLOFOpticalFlowParameter(param); @@ -267,7 +267,7 @@ TEST(DenseOpticalFlow_RLOF, ReferenceAccuracy) ASSERT_EQ(GT.rows, flow.rows); ASSERT_EQ(GT.cols, flow.cols); - EXPECT_LE(calcRMSE(GT, flow), 0.53f); + EXPECT_LE(calcRMSE(GT, flow), 0.55f); }