From 04830a7018db749253f98fddaed7c964a43b654a Mon Sep 17 00:00:00 2001 From: tsenst Date: Sun, 9 Dec 2018 17:38:41 +0100 Subject: [PATCH 01/25] Add robust local optical flow (RLOF) implementations which is an improved pyramidal iterative Lucas-Kanade approach. This implementations contains interfaces for sparse optical flow for feature tracking and dense optical flow based on sparse-to-dense interpolation schemes. Add performance and accuracy tests have been implementation as well as documentation with the related publications --- modules/optflow/doc/optflow.bib | 49 + modules/optflow/include/opencv2/optflow.hpp | 3 +- .../include/opencv2/optflow/rlofflow.hpp | 482 ++++ modules/optflow/include/opencv2/optflow_o.hpp | 309 +++ modules/optflow/perf/perf_rlof.cpp | 89 + modules/optflow/src/rlof/berlof_invoker.hpp | 2231 +++++++++++++++++ .../optflow/src/rlof/geo_interpolation.cpp | 525 ++++ .../optflow/src/rlof/geo_interpolation.hpp | 75 + modules/optflow/src/rlof/plk_invoker.hpp | 1141 +++++++++ modules/optflow/src/rlof/rlof_invoker.hpp | 1513 +++++++++++ modules/optflow/src/rlof/rlof_invokerbase.hpp | 309 +++ modules/optflow/src/rlof/rlof_localflow.cpp | 586 +++++ modules/optflow/src/rlof/rlof_localflow.h | 158 ++ modules/optflow/src/rlofflow.cpp | 385 +++ modules/optflow/test/test_OF_accuracy.cpp | 128 +- 15 files changed, 7978 insertions(+), 5 deletions(-) create mode 100644 modules/optflow/include/opencv2/optflow/rlofflow.hpp create mode 100644 modules/optflow/include/opencv2/optflow_o.hpp create mode 100644 modules/optflow/perf/perf_rlof.cpp create mode 100644 modules/optflow/src/rlof/berlof_invoker.hpp create mode 100644 modules/optflow/src/rlof/geo_interpolation.cpp create mode 100644 modules/optflow/src/rlof/geo_interpolation.hpp create mode 100644 modules/optflow/src/rlof/plk_invoker.hpp create mode 100644 modules/optflow/src/rlof/rlof_invoker.hpp create mode 100644 modules/optflow/src/rlof/rlof_invokerbase.hpp create mode 100644 
modules/optflow/src/rlof/rlof_localflow.cpp create mode 100644 modules/optflow/src/rlof/rlof_localflow.h create mode 100644 modules/optflow/src/rlofflow.cpp diff --git a/modules/optflow/doc/optflow.bib b/modules/optflow/doc/optflow.bib index c36504b3e3c..616686e2698 100644 --- a/modules/optflow/doc/optflow.bib +++ b/modules/optflow/doc/optflow.bib @@ -61,3 +61,52 @@ @inproceedings{Wang_2016_CVPR month = {June}, year = {2016} } + +@inproceedings{Geistert2016, + author = {Jonas Geistert and Tobias Senst and Thomas Sikora}, + title = {Robust Local Optical Flow: Dense Motion Vector Field Interpolation}, + booktitle = {Picture Coding Symposium}, + year = {2016}, + pages = {1--5}, +} + +@inproceedings{Senst2016, + author = {Tobias Senst and Jonas Geistert and Thomas Sikora}, + title = {Robust local optical flow: Long-range motions and varying illuminations}, + booktitle = {IEEE International Conference on Image Processing}, + year = {2016}, + pages = {4478--4482}, +} + +@inproceedings{Senst2014, + author = {Tobias Senst and Thilo Borgmann and Ivo Keller and Thomas Sikora}, + title = {Cross based Robust Local Optical Flow}, + booktitle = {21th IEEE International Conference on Image Processing}, + year = {2014}, + pages = {1967--1971}, +} + +@inproceedings{Senst2013, + author = {Tobias Senst and Jonas Geistert and Ivo Keller and Thomas Sikora}, + title = {Robust Local Optical Flow Estimation using Bilinear Equations for Sparse Motion Estimation}, + booktitle = {20th IEEE International Conference on Image Processing}, + year = {2013}, + pages = {2499--2503}, +} + +@article{Senst2012, + author = {Tobias Senst and Volker Eiselein and Thomas Sikora}, + title = {Robust Local Optical Flow for Feature Tracking}, + journal = {IEEE Transactions on Circuits and Systems for Video Technology}, + year = {2012}, + pages = {1377--1387}, + volume = {22}, + number = {9}, +} + +@article{Tibshirani2008, + title={Fast computation of the median by successive binning}, + author={Tibshirani, 
Ryan J}, + journal={arXiv preprint arXiv:0806.3301}, + year={2008} +} diff --git a/modules/optflow/include/opencv2/optflow.hpp b/modules/optflow/include/opencv2/optflow.hpp index 65802750d1a..a4bd3a737e6 100644 --- a/modules/optflow/include/opencv2/optflow.hpp +++ b/modules/optflow/include/opencv2/optflow.hpp @@ -68,7 +68,7 @@ Functions reading and writing .flo files in "Middlebury" format, see: createOptFlow_Farneback(); //! Additional interface to the SparseToDenseFlow algorithm - calcOpticalFlowSparseToDense() CV_EXPORTS_W Ptr createOptFlow_SparseToDense(); + /** @brief "Dual TV L1" Optical Flow Algorithm. The class implements the "Dual TV L1" optical flow algorithm described in @cite Zach2007 and diff --git a/modules/optflow/include/opencv2/optflow/rlofflow.hpp b/modules/optflow/include/opencv2/optflow/rlofflow.hpp new file mode 100644 index 00000000000..e9accdab5c4 --- /dev/null +++ b/modules/optflow/include/opencv2/optflow/rlofflow.hpp @@ -0,0 +1,482 @@ +/* +By downloading, copying, installing or using the software you agree to this +license. If you do not agree to this license, do not download, install, +copy or use the software. + + + License Agreement + For Open Source Computer Vision Library + (3-clause BSD License) + +Copyright (C) 2018, OpenCV Foundation, all rights reserved. +Third party copyrights are property of their respective owners. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. 
+ + * Neither the names of the copyright holders nor the names of the contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +This software is provided by the copyright holders and contributors "as is" and +any express or implied warranties, including, but not limited to, the implied +warranties of merchantability and fitness for a particular purpose are +disclaimed. In no event shall copyright holders or contributors be liable for +any direct, indirect, incidental, special, exemplary, or consequential damages +(including, but not limited to, procurement of substitute goods or services; +loss of use, data, or profits; or business interruption) however caused +and on any theory of liability, whether in contract, strict liability, +or tort (including negligence or otherwise) arising in any way out of +the use of this software, even if advised of the possibility of such damage. +*/ + +/** + * @file rloflow.hpp + * @author Tobias Senst + * @brief Implementation of the robust local optical flow algorithm for sparse and dense motion + * estimation from the following papers: + * http://elvera.nue.tu-berlin.de/files/1349Senst2012.pdf + * http://elvera.nue.tu-berlin.de/files/1422Senst2013.pdf + * http://elvera.nue.tu-berlin.de/files/1448Senst2014.pdf + * http://elvera.nue.tu-berlin.de/files/1498Geistert2016.pdf + * http://elvera.nue.tu-berlin.de/files/1496Senst2016.pdf + * + * + * @cite Senst2012, @cite Senst2013, @cite Senst2014, @cite Senst2016 or @cite Geistert2016 + * +*/ +#ifndef __OPENCV_OPTFLOW_RLOFFLOW_HPP__ +#define __OPENCV_OPTFLOW_RLOFFLOW_HPP__ + +#include "opencv2/core.hpp" +#include "opencv2/video.hpp" + +namespace cv +{ + namespace optflow + { + //! @addtogroup optflow + //! @{ + + /** @brief This is used store and set up the parameters of the robust local optical flow (RLOF) algoritm. 
+ * + * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as + * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). + * This RLOF implementation can be seen as an improved pyramidal iterative Lucas-Kanade and includes + * a set of improving modules. The main improvements in respect to the pyramidal iterative Lucas-Kanade + * are: + * - A more robust redecending M-estimator framework (see @cite Senst2012) to improve the accuracy at + * motion boundaries and appearing and disappearing pixels. + * - an adaptive support region strategies to improve the accuracy at motion boundaries to reduce the + * corona effect, i.e oversmoothing of the PLK at motion/object boundaries. The cross-based segementation + * strategy (SR_CROSS) proposed in @cite Senst2014 uses a simple segmenation approach to obtain the optimal + * shape of the support region. + * - To deal with illumination changes (outdoor sequences and shadow) the intensity constancy assumption + * based optical flow equation has been adopt with the Gennert and Negahdaripour illumination model + * (see @cite Senst2016). This model can be switched on/off with the useIlluminationModel variable. + * - By using a global motion prior initialization (see @cite Senst2016) of the iterative refinement + * the accuracy could be significantly improved for large displacements. This initialization can be + * switched on and of with useGlobalMotionPrior variable. + * + * The RLOF can be computed with the SparseOpticalFlow class or function interface to track a set of features + * or with the DenseOpticalFlow class or function interface to compute dense optical flow. 
+ * + * @see optflow::DenseRLOFOpticalFlow, optflow::calcOpticalFlowDenseRLOF(), optflow::SparseRLOFOpticalFlow, optflow::calcOpticalFlowSparseRLOF() + */ + class RLOFOpticalFlowParameter { + public: + RLOFOpticalFlowParameter() + :solverType(ST_BILINEAR) + ,supportRegionType(SR_CROSS) + ,normSigma0(3.2f) + ,normSigma1(7.f) + ,smallWinSize(9) + ,largeWinSize(21) + ,crossSegmentationThreshold(25) + ,useInitialFlow(false) + ,useIlluminationModel(true) + ,useGlobalMotionPrior(true) + ,maxLevel(3) + ,maxIteration(30) + ,minEigenValue(0.0001f) + ,globalMotionRansacThreshold(10) + {} + enum SupportRegionType { + SR_FIXED = 0, /**< Apply a constant support region */ + SR_CROSS /**< Apply a adaptive support region obtained by cross-based segmentation + * as described in @cite Senst2014 + */ + }; + enum SolverType { + ST_STANDART = 0, /**< Apply standard iterative refinement */ + ST_BILINEAR /**< Apply optimized iterative refinement based bilinear equation solutions + * as described in @cite Senst2013 + */ + }; + SolverType solverType; + /**< Variable specifies the iterative refinement strategy. Please consider citing @cite Senst2013 when + * using ST_BILINEAR. + */ + + SupportRegionType supportRegionType; + /**< Variable specifies the support region shape extraction or shrinking strategy. + */ + + float normSigma0; + /**< \$ \sigma_0 \$ paramter of the shrinked Hampel norm introduced in @cite Senst2012. If + * \$ \sigma_0 \$ = std::numeric_limist::max() the least-square estimator will be used + * instead of the M-estimator. Althoug M-estimator is more robust against outlier in the support + * region the least-square can be fast in computation. + */ + float normSigma1; + /**< \$ \sigma_1 \$ paramter of the shrinked Hampel norm introduced in @cite Senst2012. If + * \$ \sigma_1 \$ = std::numeric_limist::max() the least-square estimator will be used + * instead of the M-estimator. 
Although the M-estimator is more robust against outliers in the support + * region the least-square can be fast in computation. + */ + int smallWinSize; + /**< Minimal window size of the support region. This parameter is only used if supportRegionType is SR_CROSS. + */ + int largeWinSize; + /**< Maximal window size of the support region. If supportRegionType is SR_FIXED this gives the exact support + * region size. The speed of the RLOF is related to the applied win sizes. The smaller the window size the lower is the runtime, + * but the more sensitive to noise is the method. + */ + int crossSegmentationThreshold; + /**< Color similarity threshold used by cross-based segmentation following @cite Senst2014 . + * (Only used if supportRegionType is SR_CROSS). With the cross-based segmentation + * motion boundaries can be computed more accurately. + */ + int maxLevel; + /**< Maximal number of pyramid level used. The larger this value is the more likely it is + * to obtain accurate solutions for long-range motions. The runtime is linearly related to + * this parameter. + */ + bool useInitialFlow; + /**< Use next point list as initial values. A good initialization can improve the algorithm + * accuracy and reduce the runtime by a faster convergence of the iteration refinement. + */ + bool useIlluminationModel; + /**< Use the Gennert and Negahdaripour illumination model instead of the intensity brightness + * constraint. (Proposed in @cite Senst2016 ) This model is defined as follows: + * \f[ I(\mathbf{x},t) + m \cdot I(\mathbf{x},t) + c = I(\mathbf{x},t+1) \f] + * and contains with m and c a multiplicative and additive term which makes the estimate + * more robust against illumination changes. The computational complexity is increased by + * enabling the illumination model. + */ + bool useGlobalMotionPrior; + /**< Use global motion prior initialisation has been introduced in @cite Senst2016 . It + * allows to be more accurate for long-range motion.
The computational complexity is + * slightly increased by enabling the global motion prior initialisation. + */ + int maxIteration; + /**< Number of maximal iterations used for the iterative refinement. Lower values can + * reduce the runtime but also the accuracy. + */ + float minEigenValue; + /**< Threshold for the minimal eigenvalue of the gradient matrix defines when to abort the + * iterative refinement. + */ + float globalMotionRansacThreshold; + /**< To apply the global motion prior motion vectors will be computed on a regulary sampled which + * are the basis for Homography estimation using RANSAC. The reprojection threshold is based on + * n-th percentil (given by this value [0 ... 100]) of the forward backward errors relating to + * the motion vectors. See @cite Senst2016 for more details. + */ + }; + + /** @brief Fast dense optical flow computation based on robust local optical flow (RLOF) algorithms and sparse-to-dense interpolation + * scheme. + * + * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as + * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). + * + * The sparse-to-dense interpolation scheme allows for fast computation of dense optical flow using RLOF (see @cite Geistert2016). + * For this scheme the following steps are applied: + * -# motion vector seeded at a regular sampled grid are computed. The sparsity of this grid can be configured with setGridStep + * -# (optinally) errornous motion vectors are filter based on the forward backward confidence. The threshold can be configured + * with setForwardBackward. The filter is only applied if the threshold >0 but than the runtime is doubled due to the estimation + * of the backward flow. + * -# Vector field interpolation is applied to the motion vector set to obtain a dense vector field. + * + * For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details. 
+ * Parameters have been described in @cite Senst2012 @cite Senst2013 @cite Senst2014 and @cite Senst2016. + * + * @note If the grid size is set to (1,1) and the forward backward threshold <= 0 then the dense optical flow field is purely + * computed with the RLOF. + * + * @see optflow::calcOpticalFlowDenseRLOF(), optflow::RLOFOpticalFlowParameter + */ + class CV_EXPORTS_W DenseRLOFOpticalFlow : public DenseOpticalFlow + { + public: + enum InterpolationType + { + INTERP_GEO = 0, /**< Fast geodesic interpolation, see @cite Geistert2016 */ + INTERP_EPIC, /**< Edge-preserving interpolation, see @cite Revaud2015,Geistert2016. */ + }; + //! @brief Configuration of the RLOF algorithm. + /** + @see optflow::RLOFOpticalFlowParameter, getRLOFOpticalFlowParameter + */ + CV_WRAP virtual void setRLOFOpticalFlowParameter(RLOFOpticalFlowParameter val) = 0; + /** @copybrief setRLOFOpticalFlowParameter + @see optflow::RLOFOpticalFlowParameter, setRLOFOpticalFlowParameter + */ + CV_WRAP virtual RLOFOpticalFlowParameter getRLOFOpticalFlowParameter() const = 0; + //! @brief Threshold for the forward backward confidence check + /**For each grid point \f$ \mathbf{x} \f$ a motion vector \f$ d_{I0,I1}(\mathbf{x}) \f$ is computed. + * If the forward backward error \f[ EP_{FB} = || d_{I0,I1} + d_{I1,I0} || \f] + * is larger than threshold given by this function then the motion vector will not be used by the following + * vector field interpolation. \f$ d_{I1,I0} \f$ denotes the backward flow. Note, the forward backward test + * will only be applied if the threshold > 0. This may result in a doubled runtime for the motion estimation. + * @see getForwardBackward, setGridStep + */ + CV_WRAP virtual void setForwardBackward(float val) = 0; + /** @copybrief setForwardBackward + @see setForwardBackward + */ + CV_WRAP virtual float getForwardBackward() const = 0; + //! @brief Size of the grid to spawn the motion vectors. + /** For each grid point a motion vector is computed.
Some motion vectors will be removed due to the forward backward + * threshold (if set >0). The rest will be the base of the vector field interpolation. + * @see getForwardBackward, setGridStep + */ + CV_WRAP virtual Size getGridStep() const = 0; + /** @copybrief getGridStep + * @see getGridStep + */ + CV_WRAP virtual void setGridStep(Size val) = 0; + + //! @brief Interpolation used to compute the dense optical flow. + /** Two interpolation algorithms are supported + * - **INTERP_GEO** applies the fast geodesic interpolation, see @cite Geistert2016. + * - **INTERP_EPIC_RESIDUAL** applies the edge-preserving interpolation, see @cite Revaud2015,Geistert2016. + * @see ximgproc::EdgeAwareInterpolator, getInterpolation + */ + CV_WRAP virtual void setInterpolation(InterpolationType val) = 0; + /** @copybrief setInterpolation + * @see ximgproc::EdgeAwareInterpolator, setInterpolation + */ + CV_WRAP virtual InterpolationType getInterpolation() const = 0; + //! @brief see ximgproc::EdgeAwareInterpolator() K value. + /** K is a number of nearest-neighbor matches considered, when fitting a locally affine + * model. Usually it should be around 128. However, lower values would make the interpolation noticeably faster. + * @see ximgproc::EdgeAwareInterpolator, setEPICK + */ + CV_WRAP virtual int getEPICK() const = 0; + /** @copybrief getEPICK + * @see ximgproc::EdgeAwareInterpolator, getEPICK + */ + CV_WRAP virtual void setEPICK(int val) = 0; + //! @brief see ximgproc::EdgeAwareInterpolator() sigma value. + /** Sigma is a parameter defining how fast the weights decrease in the locally-weighted affine + * fitting. Higher values can help preserve fine details, lower values can help to get rid of noise in the + * output flow.
+ * @see ximgproc::EdgeAwareInterpolator, setEPICSigma + */ + CV_WRAP virtual float getEPICSigma() const = 0; + /** @copybrief getEPICSigma + * @see ximgproc::EdgeAwareInterpolator, getEPICSigma + */ + CV_WRAP virtual void setEPICSigma(float val) = 0; + //! @brief see ximgproc::EdgeAwareInterpolator() lambda value. + /** Lambda is a parameter defining the weight of the edge-aware term in geodesic distance, + * should be in the range of 0 to 1000. + * @see ximgproc::EdgeAwareInterpolator, setEPICSigma + */ + CV_WRAP virtual float getEPICLambda() const = 0; + /** @copybrief getEPICLambda + * @see ximgproc::EdgeAwareInterpolator, getEPICLambda + */ + CV_WRAP virtual void setEPICLambda(float val) = 0; + //! @brief see ximgproc::EdgeAwareInterpolator(). + /** Sets the respective fastGlobalSmootherFilter() parameter. + * @see ximgproc::EdgeAwareInterpolator, setFgsLambda + */ + CV_WRAP virtual float getFgsLambda() const = 0; + /** @copybrief getFgsLambda + * @see ximgproc::EdgeAwareInterpolator, ximgproc::fastGlobalSmootherFilter, getFgsLambda + */ + CV_WRAP virtual void setFgsLambda(float val) = 0; + //! @brief see ximgproc::EdgeAwareInterpolator(). + /** Sets the respective fastGlobalSmootherFilter() parameter. + * @see ximgproc::EdgeAwareInterpolator, ximgproc::fastGlobalSmootherFilter, setFgsSigma + */ + CV_WRAP virtual float getFgsSigma() const = 0; + /** @copybrief getFgsSigma + * @see ximgproc::EdgeAwareInterpolator, ximgproc::fastGlobalSmootherFilter, getFgsSigma + */ + CV_WRAP virtual void setFgsSigma(float val) = 0; + //! @brief enables ximgproc::fastGlobalSmootherFilter + /** + * @see getUsePostProc + */ + CV_WRAP virtual void setUsePostProc(bool val) = 0; + /** @copybrief setUsePostProc + * @see ximgproc::fastGlobalSmootherFilter, setUsePostProc + */ + CV_WRAP virtual bool getUsePostProc() const = 0; + //! 
@brief Creates instance of optflow::DenseRLOFOpticalFlow + /** + * @param rlofParam see optflow::RLOFOpticalFlowParameter + * @param forwardBackwardThreshold see setForwardBackward + * @param gridStep see setGridStep + * @param epicK see setEPICK + * @param epicSigma see setEPICSigma + * @param epicLambda see setEPICLambda + * @param fgsLambda see setFgsLambda + * @param fgsSigma see setFgsSigma + */ + CV_WRAP static Ptr create( + const RLOFOpticalFlowParameter & rlofParam = RLOFOpticalFlowParameter(), + float forwardBackwardThreshold = 1.f, + Size gridStep = Size(6, 6), + InterpolationType interp_type = InterpolationType::INTERP_EPIC, + int epicK = 128, + float epicSigma = 0.05f, + float epicLambda = 999.0f, + float fgsLambda = 500.0f, + float fgsSigma = 1.5f); + }; + + /** @brief Class used for calculation sparse optical flow and feature tracking with robust local optical flow (RLOF) algorithms. + * + * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as + * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). + * + * For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details. + * Parameters have been described in @cite Senst2012, @cite Senst2013, @cite Senst2014 and @cite Senst2016. + @sa optflow::calcOpticalFlowSparseRLOF(), optflow::RLOFOpticalFlowParameter + + */ + class CV_EXPORTS_W SparseRLOFOpticalFlow : public SparseOpticalFlow + { + public: + /** @copydoc DenseRLOFOpticalFlow::setRLOFOpticalFlowParameter + */ + CV_WRAP virtual void setRLOFOpticalFlowParameter(RLOFOpticalFlowParameter val) = 0; + /** @copybrief setRLOFOpticalFlowParameter + * @see setRLOFOpticalFlowParameter + */ + CV_WRAP virtual RLOFOpticalFlowParameter getRLOFOpticalFlowParameter() const = 0; + //! @brief Threshold for the forward backward confidence check + /** For each feature point a motion vector \f$ d_{I0,I1}(\mathbf{x}) \f$ is computed. 
+ * If the forward backward error \f[ EP_{FB} = || d_{I0,I1} + d_{I1,I0} || \f] + * is larger than threshold given by this function then the status will not be used by the following + * vector field interpolation. \f$ d_{I1,I0} \f$ denotes the backward flow. Note, the forward backward test + * will only be applied if the threshold > 0. This may results into a doubled runtime for the motion estimation. + * @see setForwardBackward + */ + CV_WRAP virtual void setForwardBackward(float val) = 0; + /** @copybrief setForwardBackward + * @see setForwardBackward + */ + CV_WRAP virtual float getForwardBackward() const = 0; + + //! @brief Creates instance of SparseRLOFOpticalFlow + /** + * @param rlofParam see setRLOFOpticalFlowParameter + * @param forwardBackwardThreshold see setForwardBackward + */ + CV_WRAP static Ptr create( + const RLOFOpticalFlowParameter & rlofParam = RLOFOpticalFlowParameter(), + float forwardBackwardThreshold = 1.f); + + }; + + /** @brief Fast dense optical flow computation based on robust local optical flow (RLOF) algorithms and sparse-to-dense interpolation scheme. + * + * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as + * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). + * + * The sparse-to-dense interpolation scheme allows for fast computation of dense optical flow using RLOF (see @cite Geistert2016). + * For this scheme the following steps are applied: + * -# motion vector seeded at a regular sampled grid are computed. The sparsity of this grid can be configured with setGridStep + * -# (optinally) errornous motion vectors are filter based on the forward backward confidence. The threshold can be configured + * with setForwardBackward. The filter is only applied if the threshold >0 but than the runtime is doubled due to the estimation + * of the backward flow. 
+ * -# Vector field interpolation is applied to the motion vector set to obtain a dense vector field. + * + * @param I0 first 8-bit input image. If The cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType + * = SupportRegionType::SR_CROSS) image has to be a 8-bit 3 channel image. + * @param I1 second 8-bit input image. If The cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType + * = SupportRegionType::SR_CROSS) image has to be a 8-bit 3 channel image. + * @param flow computed flow image that has the same size as I0 and type CV_32FC2. + * @param rlofParam see optflow::RLOFOpticalFlowParameter + * @param forwardBackwardThreshold Threshold for the forward backward confidence check. + * For each grid point \f$ \mathbf{x} \f$ a motion vector \f$ d_{I0,I1}(\mathbf{x}) \f$ is computed. + * If the forward backward error \f[ EP_{FB} = || d_{I0,I1} + d_{I1,I0} || \f] + * is larger than threshold given by this function then the motion vector will not be used by the following + * vector field interpolation. \f$ d_{I1,I0} \f$ denotes the backward flow. Note, the forward backward test + * will only be applied if the threshold > 0. This may results into a doubled runtime for the motion estimation. + * @param gridStep Size of the grid to spawn the motion vectors. For each grid point a motion vector is computed. + * Some motion vectors will be removed due to the forwatd backward threshold (if set >0). The rest will be the + * base of the vector field interpolation. + * @param epicK see ximgproc::EdgeAwareInterpolator() sets the respective parameter. + * @param epicSigma see ximgproc::EdgeAwareInterpolator() sets the respective parameter. + * @param epicLambda see ximgproc::EdgeAwareInterpolator() sets the respective parameter. + * @param use_post_proc enables ximgproc::fastGlobalSmootherFilter() parameter. + * @param fgsLambda sets the respective ximgproc::fastGlobalSmootherFilter() parameter. 
+ * @param fgsSigma sets the respective ximgproc::fastGlobalSmootherFilter() parameter. + * + * Parameters have been described in @cite Senst2012, @cite Senst2013, @cite Senst2014, @cite Senst2016. + * For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details. + * @note If the grid size is set to (1,1) and the forward backward threshold <= 0 that the dense optical flow field is purely + * computed with the RLOF. + * + * @sa optflow::DenseRLOFOpticalFlow, optflow::RLOFOpticalFlowParameter + */ + CV_EXPORTS_W void calcOpticalFlowDenseRLOF(InputArray I0, InputArray I1, InputOutputArray flow, + RLOFOpticalFlowParameter rlofParam = RLOFOpticalFlowParameter(), + float forewardBackwardThreshold = 0, Size gridStep = Size(6, 6), + DenseRLOFOpticalFlow::InterpolationType interp_type = DenseRLOFOpticalFlow::InterpolationType::INTERP_EPIC, + int epicK = 128, float epicSigma = 0.05f, + bool use_post_proc = true, float fgsLambda = 500.0f, float fgsSigma = 1.5f); + + /** @brief Calculates fast optical flow for a sparse feature set using the robust local optical flow (RLOF) similar + * to optflow::calcOpticalFlowPyrLK(). + * + * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as + * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). + * + * @param I0 first 8-bit input image. If The cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType + * = SupportRegionType::SR_CROSS) image has to be a 8-bit 3 channel image. + * @param I1 second 8-bit input image. If The cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType + * = SupportRegionType::SR_CROSS) image has to be a 8-bit 3 channel image. + * @param prevPts vector of 2D points for which the flow needs to be found; point coordinates must be single-precision + * floating-point numbers. 
+ * @param nextPts output vector of 2D points (with single-precision floating-point coordinates) containing the calculated + * new positions of input features in the second image; when optflow::RLOFOpticalFlowParameter::useInitialFlow variable is true the vector must + * have the same size as in the input and contain the initialization point correspondences. + * @param status output status vector (of unsigned chars); each element of the vector is set to 1 if the flow for the + * corresponding features has passed the forward backward check. + * @param err output vector of errors; each element of the vector is set to the forward backward error for the corresponding feature. + * @param rlofParam see optflow::RLOFOpticalFlowParameter + * @param forwardBackwardThreshold Threshold for the forward backward confidence check. If forewardBackwardThreshold <=0 the forward + * + * Parameters have been described in @cite Senst2012, @cite Senst2013, @cite Senst2014 and @cite Senst2016. + * For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details. + */ + CV_EXPORTS_W void calcOpticalFlowSparseRLOF(InputArray prevImg, InputArray nextImg, + InputArray prevPts, InputOutputArray nextPts, + OutputArray status, OutputArray err, + RLOFOpticalFlowParameter rlofParam = RLOFOpticalFlowParameter(), + float forewardBackwardThreshold = 0); + + //! Additional interface to the Dense RLOF algorithm - optflow::calcOpticalFlowDenseRLOF() + CV_EXPORTS_W Ptr createOptFlow_DenseRLOF(); + + //! Additional interface to the Sparse RLOF algorithm - optflow::calcOpticalFlowSparseRLOF() + CV_EXPORTS_W Ptr createOptFlow_SparseRLOF(); + //! 
@} + } +} +#endif diff --git a/modules/optflow/include/opencv2/optflow_o.hpp b/modules/optflow/include/opencv2/optflow_o.hpp new file mode 100644 index 00000000000..65802750d1a --- /dev/null +++ b/modules/optflow/include/opencv2/optflow_o.hpp @@ -0,0 +1,309 @@ +/* +By downloading, copying, installing or using the software you agree to this +license. If you do not agree to this license, do not download, install, +copy or use the software. + + + License Agreement + For Open Source Computer Vision Library + (3-clause BSD License) + +Copyright (C) 2013, OpenCV Foundation, all rights reserved. +Third party copyrights are property of their respective owners. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + * Neither the names of the copyright holders nor the names of the contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +This software is provided by the copyright holders and contributors "as is" and +any express or implied warranties, including, but not limited to, the implied +warranties of merchantability and fitness for a particular purpose are +disclaimed. 
In no event shall copyright holders or contributors be liable for +any direct, indirect, incidental, special, exemplary, or consequential damages +(including, but not limited to, procurement of substitute goods or services; +loss of use, data, or profits; or business interruption) however caused +and on any theory of liability, whether in contract, strict liability, +or tort (including negligence or otherwise) arising in any way out of +the use of this software, even if advised of the possibility of such damage. +*/ + +#ifndef __OPENCV_OPTFLOW_HPP__ +#define __OPENCV_OPTFLOW_HPP__ + +#include "opencv2/core.hpp" +#include "opencv2/video.hpp" + +/** +@defgroup optflow Optical Flow Algorithms + +Dense optical flow algorithms compute motion for each point: + +- cv::optflow::calcOpticalFlowSF +- cv::optflow::createOptFlow_DeepFlow + +Motion templates is alternative technique for detecting motion and computing its direction. +See samples/motempl.py. + +- cv::motempl::updateMotionHistory +- cv::motempl::calcMotionGradient +- cv::motempl::calcGlobalOrientation +- cv::motempl::segmentMotion + +Functions reading and writing .flo files in "Middlebury" format, see: + +- cv::optflow::readOpticalFlow +- cv::optflow::writeOpticalFlow + + */ + +#include "opencv2/optflow/pcaflow.hpp" +#include "opencv2/optflow/sparse_matching_gpc.hpp" + +namespace cv +{ +namespace optflow +{ + +//! @addtogroup optflow +//! @{ + +/** @overload */ +CV_EXPORTS_W void calcOpticalFlowSF( InputArray from, InputArray to, OutputArray flow, + int layers, int averaging_block_size, int max_flow); + +/** @brief Calculate an optical flow using "SimpleFlow" algorithm. + +@param from First 8-bit 3-channel image. 
+@param to Second 8-bit 3-channel image of the same size as prev +@param flow computed flow image that has the same size as prev and type CV_32FC2 +@param layers Number of layers +@param averaging_block_size Size of block through which we sum up when calculate cost function +for pixel +@param max_flow maximal flow that we search at each level +@param sigma_dist vector smooth spatial sigma parameter +@param sigma_color vector smooth color sigma parameter +@param postprocess_window window size for postprocess cross bilateral filter +@param sigma_dist_fix spatial sigma for postprocess cross bilateralf filter +@param sigma_color_fix color sigma for postprocess cross bilateral filter +@param occ_thr threshold for detecting occlusions +@param upscale_averaging_radius window size for bilateral upscale operation +@param upscale_sigma_dist spatial sigma for bilateral upscale operation +@param upscale_sigma_color color sigma for bilateral upscale operation +@param speed_up_thr threshold to detect point with irregular flow - where flow should be +recalculated after upscale + +See @cite Tao2012 . And site of project - . + +@note + - An example using the simpleFlow algorithm can be found at samples/simpleflow_demo.cpp + */ +CV_EXPORTS_W void calcOpticalFlowSF( InputArray from, InputArray to, OutputArray flow, int layers, + int averaging_block_size, int max_flow, + double sigma_dist, double sigma_color, int postprocess_window, + double sigma_dist_fix, double sigma_color_fix, double occ_thr, + int upscale_averaging_radius, double upscale_sigma_dist, + double upscale_sigma_color, double speed_up_thr ); + +/** @brief Fast dense optical flow based on PyrLK sparse matches interpolation. + +@param from first 8-bit 3-channel or 1-channel image. +@param to second 8-bit 3-channel or 1-channel image of the same size as from +@param flow computed flow image that has the same size as from and CV_32FC2 type +@param grid_step stride used in sparse match computation. 
Lower values usually + result in higher quality but slow down the algorithm. +@param k number of nearest-neighbor matches considered, when fitting a locally affine + model. Lower values can make the algorithm noticeably faster at the cost of + some quality degradation. +@param sigma parameter defining how fast the weights decrease in the locally-weighted affine + fitting. Higher values can help preserve fine details, lower values can help to get rid + of the noise in the output flow. +@param use_post_proc defines whether the ximgproc::fastGlobalSmootherFilter() is used + for post-processing after interpolation +@param fgs_lambda see the respective parameter of the ximgproc::fastGlobalSmootherFilter() +@param fgs_sigma see the respective parameter of the ximgproc::fastGlobalSmootherFilter() + */ +CV_EXPORTS_W void calcOpticalFlowSparseToDense ( InputArray from, InputArray to, OutputArray flow, + int grid_step = 8, int k = 128, float sigma = 0.05f, + bool use_post_proc = true, float fgs_lambda = 500.0f, + float fgs_sigma = 1.5f ); + +/** @brief DeepFlow optical flow algorithm implementation. + +The class implements the DeepFlow optical flow algorithm described in @cite Weinzaepfel2013 . See +also . 
+Parameters - class fields - that may be modified after creating a class instance: +- member float alpha +Smoothness assumption weight +- member float delta +Color constancy assumption weight +- member float gamma +Gradient constancy weight +- member float sigma +Gaussian smoothing parameter +- member int minSize +Minimal dimension of an image in the pyramid (next, smaller images in the pyramid are generated +until one of the dimensions reaches this size) +- member float downscaleFactor +Scaling factor in the image pyramid (must be \< 1) +- member int fixedPointIterations +How many iterations on each level of the pyramid +- member int sorIterations +Iterations of Succesive Over-Relaxation (solver) +- member float omega +Relaxation factor in SOR + */ +CV_EXPORTS_W Ptr createOptFlow_DeepFlow(); + +//! Additional interface to the SimpleFlow algorithm - calcOpticalFlowSF() +CV_EXPORTS_W Ptr createOptFlow_SimpleFlow(); + +//! Additional interface to the Farneback's algorithm - calcOpticalFlowFarneback() +CV_EXPORTS_W Ptr createOptFlow_Farneback(); + +//! Additional interface to the SparseToDenseFlow algorithm - calcOpticalFlowSparseToDense() +CV_EXPORTS_W Ptr createOptFlow_SparseToDense(); + +/** @brief "Dual TV L1" Optical Flow Algorithm. + +The class implements the "Dual TV L1" optical flow algorithm described in @cite Zach2007 and +@cite Javier2012 . +Here are important members of the class that control the algorithm, which you can set after +constructing the class instance: + +- member double tau + Time step of the numerical scheme. + +- member double lambda + Weight parameter for the data term, attachment parameter. This is the most relevant + parameter, which determines the smoothness of the output. The smaller this parameter is, + the smoother the solutions we obtain. It depends on the range of motions of the images, so + its value should be adapted to each image sequence. + +- member double theta + Weight parameter for (u - v)\^2, tightness parameter. 
It serves as a link between the + attachment and the regularization terms. In theory, it should have a small value in order + to maintain both parts in correspondence. The method is stable for a large range of values + of this parameter. + +- member int nscales + Number of scales used to create the pyramid of images. + +- member int warps + Number of warpings per scale. Represents the number of times that I1(x+u0) and grad( + I1(x+u0) ) are computed per scale. This is a parameter that assures the stability of the + method. It also affects the running time, so it is a compromise between speed and + accuracy. + +- member double epsilon + Stopping criterion threshold used in the numerical scheme, which is a trade-off between + precision and running time. A small value will yield more accurate solutions at the + expense of a slower convergence. + +- member int iterations + Stopping criterion iterations number used in the numerical scheme. + +C. Zach, T. Pock and H. Bischof, "A Duality Based Approach for Realtime TV-L1 Optical Flow". +Javier Sanchez, Enric Meinhardt-Llopis and Gabriele Facciolo. "TV-L1 Optical Flow Estimation". +*/ +class CV_EXPORTS_W DualTVL1OpticalFlow : public DenseOpticalFlow +{ +public: + //! @brief Time step of the numerical scheme + /** @see setTau */ + CV_WRAP virtual double getTau() const = 0; + /** @copybrief getTau @see getTau */ + CV_WRAP virtual void setTau(double val) = 0; + //! @brief Weight parameter for the data term, attachment parameter + /** @see setLambda */ + CV_WRAP virtual double getLambda() const = 0; + /** @copybrief getLambda @see getLambda */ + CV_WRAP virtual void setLambda(double val) = 0; + //! @brief Weight parameter for (u - v)^2, tightness parameter + /** @see setTheta */ + CV_WRAP virtual double getTheta() const = 0; + /** @copybrief getTheta @see getTheta */ + CV_WRAP virtual void setTheta(double val) = 0; + //! 
@brief coefficient for additional illumination variation term + /** @see setGamma */ + CV_WRAP virtual double getGamma() const = 0; + /** @copybrief getGamma @see getGamma */ + CV_WRAP virtual void setGamma(double val) = 0; + //! @brief Number of scales used to create the pyramid of images + /** @see setScalesNumber */ + CV_WRAP virtual int getScalesNumber() const = 0; + /** @copybrief getScalesNumber @see getScalesNumber */ + CV_WRAP virtual void setScalesNumber(int val) = 0; + //! @brief Number of warpings per scale + /** @see setWarpingsNumber */ + CV_WRAP virtual int getWarpingsNumber() const = 0; + /** @copybrief getWarpingsNumber @see getWarpingsNumber */ + CV_WRAP virtual void setWarpingsNumber(int val) = 0; + //! @brief Stopping criterion threshold used in the numerical scheme, which is a trade-off between precision and running time + /** @see setEpsilon */ + CV_WRAP virtual double getEpsilon() const = 0; + /** @copybrief getEpsilon @see getEpsilon */ + CV_WRAP virtual void setEpsilon(double val) = 0; + //! @brief Inner iterations (between outlier filtering) used in the numerical scheme + /** @see setInnerIterations */ + CV_WRAP virtual int getInnerIterations() const = 0; + /** @copybrief getInnerIterations @see getInnerIterations */ + CV_WRAP virtual void setInnerIterations(int val) = 0; + //! @brief Outer iterations (number of inner loops) used in the numerical scheme + /** @see setOuterIterations */ + CV_WRAP virtual int getOuterIterations() const = 0; + /** @copybrief getOuterIterations @see getOuterIterations */ + CV_WRAP virtual void setOuterIterations(int val) = 0; + //! @brief Use initial flow + /** @see setUseInitialFlow */ + CV_WRAP virtual bool getUseInitialFlow() const = 0; + /** @copybrief getUseInitialFlow @see getUseInitialFlow */ + CV_WRAP virtual void setUseInitialFlow(bool val) = 0; + //! 
@brief Step between scales (<1) + /** @see setScaleStep */ + CV_WRAP virtual double getScaleStep() const = 0; + /** @copybrief getScaleStep @see getScaleStep */ + CV_WRAP virtual void setScaleStep(double val) = 0; + //! @brief Median filter kernel size (1 = no filter) (3 or 5) + /** @see setMedianFiltering */ + CV_WRAP virtual int getMedianFiltering() const = 0; + /** @copybrief getMedianFiltering @see getMedianFiltering */ + CV_WRAP virtual void setMedianFiltering(int val) = 0; + + /** @brief Creates instance of cv::DualTVL1OpticalFlow*/ + CV_WRAP static Ptr create( + double tau = 0.25, + double lambda = 0.15, + double theta = 0.3, + int nscales = 5, + int warps = 5, + double epsilon = 0.01, + int innnerIterations = 30, + int outerIterations = 10, + double scaleStep = 0.8, + double gamma = 0.0, + int medianFiltering = 5, + bool useInitialFlow = false); +}; + +/** @brief Creates instance of cv::DenseOpticalFlow +*/ +CV_EXPORTS_W Ptr createOptFlow_DualTVL1(); + +//! @} + +} //optflow +} + +#include "opencv2/optflow/motempl.hpp" + +#endif diff --git a/modules/optflow/perf/perf_rlof.cpp b/modules/optflow/perf/perf_rlof.cpp new file mode 100644 index 00000000000..ea48a62be59 --- /dev/null +++ b/modules/optflow/perf/perf_rlof.cpp @@ -0,0 +1,89 @@ +#include "perf_precomp.hpp" +namespace opencv_test { + namespace { + + typedef tuple ST_SR_IM_Sparse_t; + typedef TestBaseWithParam ST_SR_IM_Sparse; + PERF_TEST_P(ST_SR_IM_Sparse, OpticalFlow_SparseRLOF, + testing::Combine( + testing::Values("ST_BILINEAR", "ST_STANDART"), + testing::Values("SR_CROSS", "SR_FIXED"), + testing::Values(true, false)) + ) + { + Size sz(640,480); + string frame1_path = "B:/workspace/opencv_extra/testdata/cv/optflow/RubberWhale1.png"; + string frame2_path = "B:/workspace/opencv_extra/testdata/cv/optflow/RubberWhale2.png"; + frame1_path.erase(std::remove_if(frame1_path.begin(), frame1_path.end(), isspace), frame1_path.end()); + frame2_path.erase(std::remove_if(frame2_path.begin(), frame2_path.end(), 
isspace), frame2_path.end()); + Mat frame1 = imread(frame1_path); + Mat frame2 = imread(frame2_path); + ASSERT_FALSE(frame1.empty()); + ASSERT_FALSE(frame2.empty()); + vector prevPts, currPts; + for (int r = 0; r < frame1.rows; r += 10) + { + for (int c = 0; c < frame1.cols; c += 10) + { + prevPts.push_back(Point2f(static_cast(c), static_cast(r))); + } + } + vector status(prevPts.size()); + vector err(prevPts.size()); + + RLOFOpticalFlowParameter param; + if (get<0>(GetParam()) == "ST_BILINEAR") + param.solverType = RLOFOpticalFlowParameter::ST_BILINEAR; + if (get<0>(GetParam()) == "ST_STANDART") + param.solverType = RLOFOpticalFlowParameter::ST_STANDART; + if (get<1>(GetParam()) == "SR_CROSS") + param.supportRegionType = RLOFOpticalFlowParameter::SR_CROSS; + if (get<1>(GetParam()) == "SR_FIXED") + param.supportRegionType = RLOFOpticalFlowParameter::SR_FIXED; + param.useIlluminationModel = get<2>(GetParam()); + + TEST_CYCLE_N(1) + { + calcOpticalFlowSparseRLOF(frame1, frame2, prevPts, currPts, status, err, param, 1.f); + } + + SANITY_CHECK_NOTHING(); + } + + typedef tuple INTERP_GRID_Dense_t; + typedef TestBaseWithParam INTERP_GRID_Dense; + PERF_TEST_P(INTERP_GRID_Dense, OpticalFlow_DenseRLOF, + testing::Combine( + testing::Values("INTERP_EPIC", "INTERP_GEO"), + testing::Values(4,10,50)) + ) + { + Mat flow; + string frame1_path = TS::ptr()->get_data_path() + "/cv/optflow/RubberWhale1.png"; + string frame2_path = TS::ptr()->get_data_path() + "/cv/optflow/RubberWhale2.png"; + // removing space may be an issue on windows machines + frame1_path.erase(std::remove_if(frame1_path.begin(), frame1_path.end(), isspace), frame1_path.end()); + frame2_path.erase(std::remove_if(frame2_path.begin(), frame2_path.end(), isspace), frame2_path.end()); + Mat frame1 = imread(frame1_path); + Mat frame2 = imread(frame2_path); + ASSERT_FALSE(frame1.empty()); + ASSERT_FALSE(frame2.empty()); + + RLOFOpticalFlowParameter param; + Ptr< DenseRLOFOpticalFlow> algo = 
DenseRLOFOpticalFlow::create(); + DenseRLOFOpticalFlow::InterpolationType interp_type; + if (get<0>(GetParam()) == "INTERP_EPIC") + interp_type = DenseRLOFOpticalFlow::InterpolationType::INTERP_EPIC; + if (get<0>(GetParam()) == "INTERP_GEO") + interp_type = DenseRLOFOpticalFlow::InterpolationType::INTERP_GEO; + + + TEST_CYCLE_N(5) + { + calcOpticalFlowDenseRLOF(frame1, frame2,flow, param, 1.0f, Size(get<1>(GetParam()), get<1>(GetParam())), interp_type); + } + + SANITY_CHECK_NOTHING(); + } + } +} // namespace diff --git a/modules/optflow/src/rlof/berlof_invoker.hpp b/modules/optflow/src/rlof/berlof_invoker.hpp new file mode 100644 index 00000000000..d903d1733db --- /dev/null +++ b/modules/optflow/src/rlof/berlof_invoker.hpp @@ -0,0 +1,2231 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// + // + // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. + // + // By downloading, copying, installing or using the software you agree to this license. + // If you do not agree to this license, do not download, install, + // copy or use the software. + // + // + // License Agreement + // For Open Source Computer Vision Library + // + // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. + // Copyright (C) 2009, Willow Garage Inc., all rights reserved. + // Third party copyrights are property of their respective owners. + // + // Redistribution and use in source and binary forms, with or without modification, + // are permitted provided that the following conditions are met: + // + // * Redistribution's of source code must retain the above copyright notice, + // this list of conditions and the following disclaimer. + // + // * Redistribution's in binary form must reproduce the above copyright notice, + // this list of conditions and the following disclaimer in the documentation + // and/or other materials provided with the distribution. 
+ // + // * The name of the copyright holders may not be used to endorse or promote products + // derived from this software without specific prior written permission. + // + // This software is provided by the copyright holders and contributors "as is" and + // any express or implied warranties, including, but not limited to, the implied + // warranties of merchantability and fitness for a particular purpose are disclaimed. + // In no event shall the Intel Corporation or contributors be liable for any direct, + // indirect, incidental, special, exemplary, or consequential damages + // (including, but not limited to, procurement of substitute goods or services; + // loss of use, data, or profits; or business interruption) however caused + // and on any theory of liability, whether in contract, strict liability, + // or tort (including negligence or otherwise) arising in any way out of + // the use of this software, even if advised of the possibility of such damage. + // + //M*/ +#ifndef _BERLOF_INVOKER_HPP_ +#define _BERLOF_INVOKER_HPP_ +#include "rlof_invokerbase.hpp" + + +namespace cv{ +namespace optflow{ + + bool checkSolution(float x, float y, float * c ) + { + float _a = x - 0.002f; + float _b = y - 0.002f; + cv::Point2f tl( c[0] * _a * _b + c[1] * _a + c[2] * _b + c[3], c[4] * _a * _b + c[5] * _a + c[6] * _b + c[7]); + _a = x + 0.002f; + cv::Point2f tr( c[0] * _a * _b + c[1] * _a + c[2] * _b + c[3], c[4] * _a * _b + c[5] * _a + c[6] * _b + c[7]); + _b = y + 0.002f; + cv::Point2f br( c[0] * _a * _b + c[1] * _a + c[2] * _b + c[3], c[4] * _a * _b + c[5] * _a + c[6] * _b + c[7]); + _a = x - 0.002f; + cv::Point2f bl( c[0] * _a * _b + c[1] * _a + c[2] * _b + c[3], c[4] * _a * _b + c[5] * _a + c[6] * _b + c[7]); + return (tl.x >= 0 && tl.y >= 0) && (tr.x <= 0 && tr.y >= 0) + && (bl.x >= 0 && bl.y <= 0) && (br.x <= 0 && br.y <= 0); + } + + inline cv::Point2f est_DeltaGain( const cv::Mat & src, const cv::Vec4f & val) + { + return cv::Point2f( + src.at(2,0) * val[0] + 
src.at(2,1) * val[1] + src.at(2,2) * val[2] + src.at(2,3) * val[3], + src.at(3,0) * val[0] + src.at(3,1) * val[1] + src.at(3,2) * val[2] + src.at(3,3) * val[3]); + } + inline void est_Result( const cv::Mat & src, const cv::Vec4f & val, cv::Point2f & delta, cv::Point2f & gain) + { + + delta = cv::Point2f( + -(src.at(0,0) * val[0] + src.at(0,1) * val[1] + src.at(0,2) * val[2] + src.at(0,3) * val[3]), + -(src.at(1,0) * val[0] + src.at(1,1) * val[1] + src.at(1,2) * val[2] + src.at(1,3) * val[3])); + + gain = cv::Point2f( + src.at(2,0) * val[0] + src.at(2,1) * val[1] + src.at(2,2) * val[2] + src.at(2,3) * val[3], + src.at(3,0) * val[0] + src.at(3,1) * val[1] + src.at(3,2) * val[2] + src.at(3,3) * val[3]); + } + + namespace berlof + { + + namespace ica + { + + class TrackerInvoker : public cv::ParallelLoopBody + { + public: + TrackerInvoker( + const Mat& _prevImg, + const Mat& _prevDeriv, + const Mat& _nextImg, + const Mat& _rgbPrevImg, + const Mat& _rgbNextImg, + const Point2f* _prevPts, + Point2f* _nextPts, + uchar* _status, + float* _err, + int _level, + int _maxLevel, + int _winSize[2], + int _maxIteration, + bool _useInitialFlow, + int _supportRegionType, + int _crossSegmentationThreshold, + std::vector _normSigmas, + float _minEigenValue) + { + prevImg = &_prevImg; + prevDeriv = &_prevDeriv; + nextImg = &_nextImg; + rgbPrevImg = &_rgbPrevImg; + rgbNextImg = &_rgbNextImg; + prevPts = _prevPts; + nextPts = _nextPts; + status = _status; + err = _err; + minWinSize = _winSize[0]; + maxWinSize = _winSize[1]; + criteria.maxCount = _maxIteration; + criteria.epsilon = 0.01; + level = _level; + maxLevel = _maxLevel; + windowType = _supportRegionType; + minEigThreshold = _minEigenValue; + paramReset = _normSigmas; + useInitialFlow = _useInitialFlow; + crossSegmentationThreshold = _crossSegmentationThreshold; + } + + TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; + void operator()(const cv::Range& range) const + { + + Point2f halfWin; + cv::Size winSize; + 
const Mat& I = *prevImg; + const Mat& J = *nextImg; + const Mat& derivI = *prevDeriv; + const Mat& BI = *rgbPrevImg; + + + const float FLT_SCALE = (1.f/(1 << 16)); + + winSize = cv::Size(maxWinSize,maxWinSize); + int winMaskwidth = static_cast(ceil(winSize.width / 16.f)) * 16; + cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); + winMaskMatBuf.setTo(1); + + std::vector param = paramReset; + int j, cn = I.channels(), cn2 = cn*2; + int winbufwidth = static_cast(ceil(winSize.width / 16.f)) * 16; + cv::Size winBufSize(winbufwidth,winbufwidth); + + cv::AutoBuffer _buf(winBufSize.area()*(cn + cn2)); + int derivDepth = DataType::depth; + + Mat IWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn), (deriv_type*)_buf.data()); + Mat derivIWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn2), (deriv_type*)_buf.data() + winBufSize.area()*cn); + + + for( int ptidx = range.start; ptidx < range.end; ptidx++ ) + { + Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level)); + Point2f nextPt; + if( level == maxLevel ) + { + if( useInitialFlow ) + nextPt = nextPts[ptidx]*(float)(1./(1 << level)); + else + nextPt = prevPt; + } + else + nextPt = nextPts[ptidx]*2.f; + nextPts[ptidx] = nextPt; + + Point2i iprevPt, inextPt; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + int winArea = maxWinSize * maxWinSize; + cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0,0, maxWinSize,maxWinSize)); + winMaskMatBuf.setTo(0); + if( calcWinMaskMat(BI, windowType, iprevPt, + winMaskMat,winSize,halfWin,winArea, + minWinSize,maxWinSize, crossSegmentationThreshold) == false) + continue; + + prevPt -= halfWin; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + + if( iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols || + iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows ) + { + if( level == 0 ) + { + if( status ) + status[ptidx] = 3; + if( err ) + err[ptidx] = 0; + } + continue; + } + + float a = prevPt.x - iprevPt.x; + float b = prevPt.y - iprevPt.y; + const int 
W_BITS = 14, W_BITS1 = 14; + + int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + + int dstep = (int)(derivI.step/derivI.elemSize1()); + int step = (int)(I.step/I.elemSize1()); + int mStep = (int)(winMaskMat.step/winMaskMat.elemSize1()); + CV_Assert( step == (int)(J.step/J.elemSize1()) ); + + float A11 = 0, A12 = 0, A22 = 0; + float b1 = 0, b2 = 0; + float D = 0; + + // extract the patch from the first image, compute covariation matrix of derivatives + int x, y; + for( y = 0; y < winSize.height; y++ ) + { + const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; + const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; + + short* Iptr = (short*)(IWinBuf.data + y*IWinBuf.step); + short* dIptr = (short*)(derivIWinBuf.data + y*derivIWinBuf.step); + x = 0; + for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2 ) + { + if( winMaskMat.at(y,x) == 0) + { + dIptr[0] = 0; + dIptr[1] = 0; + continue; + } + int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + + src[x+step]*iw10 + src[x+step+cn]*iw11, W_BITS1-5); + int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + + dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 + + dsrc[dstep+cn2+1]*iw11, W_BITS1); + Iptr[x] = (short)ival; + dIptr[0] = (short)ixval; + dIptr[1] = (short)iyval; + } + } + + cv::Mat residualMat = cv::Mat::zeros(winSize.height * (winSize.width + 8) * cn, 1, CV_16SC1); + cv::Point2f backUpNextPt = nextPt; + nextPt -= halfWin; + Point2f prevDelta(0,0); //denotes h(t-1) + cv::Size _winSize = winSize; + __m128i mmMask0, mmMask1, mmMask; + getWBitMask(_winSize.width, mmMask0, mmMask1, mmMask); + __m128 mmOnes = _mm_set1_ps(1.f ); + __m128i mmOnes_epi16 = _mm_set1_epi16(1 ); + __m128i mmZeros_epi16 = _mm_set1_epi16(0); + float 
MEstimatorScale = 1; + int buffIdx = 0; + float c[8]; + cv::Mat GMc0, GMc1, GMc2, GMc3; + cv::Vec2f Mc0, Mc1, Mc2, Mc3; + int noIteration = 0; + int noReusedIteration = 0; + int noSolvedIteration = 0; + for( j = 0; j < criteria.maxCount; j++ ) + { + cv::Point2f delta(0,0); + cv::Point2f deltaGain(0,0); + bool hasSolved = false; + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + float ab = a * b; + if( j == 0 + || ( inextPt.x != cvFloor(nextPt.x) || inextPt.y != cvFloor(nextPt.y) || j % 2 != 0 )) + { + inextPt.x = cvFloor(nextPt.x); + inextPt.y = cvFloor(nextPt.y); + + if( inextPt.x < -winSize.width || inextPt.x >= J.cols || + inextPt.y < -winSize.height || inextPt.y >= J.rows ) + { + if( level == 0 && status ) + status[ptidx] = 3; + break; + } + + + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + ab = a * b; + iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + // mismatch + b1 = 0; + b2 = 0; + if( j == 0 ) + { + A11 = 0; + A12 = 0; + A22 = 0; + } + + if ( j == 0 ) + { + buffIdx = 0; + for( y = 0; y < winSize.height; y++ ) + { + const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y )*step + (inextPt.x )*cn; + const short* Iptr = (const short*)(IWinBuf.data + (y)*IWinBuf.step); + const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + x = 0; + for( ; x < winSize.width*cn; x++, dIptr += 2) + { + if( maskPtr[x] == 0) + continue; + int diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, W_BITS1-5) + - Iptr[x]; + residualMat.at(buffIdx++) = static_cast(diff); + } + } + /*! Estimation for the residual */ + cv::Mat residualMatRoi(residualMat, cv::Rect(0,0,1, buffIdx)); + MEstimatorScale = (buffIdx == 0) ? 
1.f : estimateScale(residualMatRoi); + } + + float eta = 1.f / winArea; + float fParam0 = param[0] * 32.f; + float fParam1 = param[1] * 32.f; + fParam0 = param[0] * MEstimatorScale; + fParam1 = param[1] * MEstimatorScale; + + + buffIdx = 0; + float _b0[4] = {0,0,0,0}; + float _b1[4] = {0,0,0,0}; + + /* + */ + for( y = 0; y < _winSize.height; y++ ) + { + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x )*cn; + const short* Iptr = (const short*)(IWinBuf.data + y*IWinBuf.step); + const short* dIptr = (const short*)(derivIWinBuf.data + y*derivIWinBuf.step); + + x = 0; + for( ; x < _winSize.width*cn; x++, dIptr += 2 ) + { + if( maskPtr[x] == 0) + continue; + int illValue = - Iptr[x]; + + float It[4] = {static_cast((Jptr[x+step+cn]<< 5) + illValue), + static_cast((Jptr[x+cn]<< 5) + illValue), + static_cast((Jptr[x+step]<< 5) + illValue), + static_cast((Jptr[x] << 5) + illValue)}; + + + + int J_val = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + + Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, + W_BITS1-5); + + + int diff = J_val + illValue; + + + MEstimatorScale += (diff < MEstimatorScale) ? -eta : eta; + + int abss = (diff < 0) ? 
-diff : diff; + + // compute the missmatch vector + if( j >= 0) + { + if( abss > fParam1) + { + It[0] = 0; + It[1] = 0; + It[2] = 0; + It[3] = 0; + } + else if( abss > fParam0 && diff >= 0 ) + { + It[0] = param[2] * (It[0] - fParam1); + It[1] = param[2] * (It[1] - fParam1); + It[2] = param[2] * (It[2] - fParam1); + It[3] = param[2] * (It[3] - fParam1); + } + else if( abss > fParam0 && diff < 0 ) + { + It[0] = param[2] * (It[0] + fParam1); + It[1] = param[2] * (It[1] + fParam1); + It[2] = param[2] * (It[2] + fParam1); + It[3] = param[2] * (It[3] + fParam1); + } + } + + float It0 = It[0]; + float It1 = It[1]; + float It2 = It[2]; + float It3 = It[3]; + + float ixval = static_cast(dIptr[0]); + float iyval = static_cast(dIptr[1]); + _b0[0] += It0 * ixval; + _b0[1] += It1 * ixval; + _b0[2] += It2 * ixval; + _b0[3] += It3 * ixval; + + + _b1[0] += It0*iyval; + _b1[1] += It1*iyval; + _b1[2] += It2*iyval; + _b1[3] += It3*iyval; + + + // compute the Gradient Matrice + if( j == 0) + { + float tale = param[2] * FLT_RESCALE; + if( abss < fParam0 || j < 0) + { + tale = FLT_RESCALE; + } + else if( abss > fParam1) + { + tale *= 0.01f; + } + else + { + tale *= param[2]; + } + + A11 += (float)(ixval*ixval)*tale; + A12 += (float)(ixval*iyval)*tale; + A22 += (float)(iyval*iyval)*tale; + + } + } + } + + if( j == 0 ) + { + + A11 *= FLT_SCALE; // 54866744. 
+ A12 *= FLT_SCALE; // -628764.00 + A22 *= FLT_SCALE; // 19730.000 + + D = A11 * A22 - A12 * A12; + float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + + 4.f*A12*A12))/(2*winArea); + + if( minEig < minEigThreshold || std::abs(D) < FLT_EPSILON) + { + if( level == 0 && status ) + status[ptidx] = 0; + if( level > 0) + { + nextPts[ptidx] = backUpNextPt; + } + noIteration++; + break; + } + + D = (1.f / D); + + } + + _b0[0] *= FLT_SCALE;_b0[1] *= FLT_SCALE;_b0[2] *= FLT_SCALE;_b0[3] *= FLT_SCALE; + _b1[0] *= FLT_SCALE;_b1[1] *= FLT_SCALE;_b1[2] *= FLT_SCALE;_b1[3] *= FLT_SCALE; + + + Mc0[0] = _b0[0] - _b0[1] - _b0[2] + _b0[3]; + Mc0[1] = _b1[0] - _b1[1] - _b1[2] + _b1[3]; + + Mc1[0] = _b0[1] - _b0[3]; + Mc1[1] = _b1[1] - _b1[3]; + + + Mc2[0] = _b0[2] - _b0[3]; + Mc2[1] = _b1[2] - _b1[3]; + + + Mc3[0] = _b0[3]; + Mc3[1] = _b1[3]; + + c[0] = -Mc0[0]; + c[1] = -Mc1[0]; + c[2] = -Mc2[0]; + c[3] = -Mc3[0]; + c[4] = -Mc0[1]; + c[5] = -Mc1[1]; + c[6] = -Mc2[1]; + c[7] = -Mc3[1]; + + float e0 = 1.f / (c[6] * c[0] - c[4] * c[2]); + float e1 = e0 * 0.5f * (c[6] * c[1] + c[7] * c[0] - c[5] * c[2] - c[4] * c[3]); + float e2 = e0 * (c[1] * c[7] -c[3] * c[5]); + e0 = e1 * e1 - e2; + hasSolved = false; + if ( e0 > 0) + { + e0 = sqrt(e0); + float _y[2] = {-e1 - e0, e0 - e1}; + float c0yc1[2] = {c[0] * _y[0] + c[1], + c[0] * _y[1] + c[1]}; + float _x[2] = {-(c[2] * _y[0] + c[3]) / c0yc1[0], + -(c[2] * _y[1] + c[3]) / c0yc1[1]}; + float c0yc2[2] = {c[0] * _y[0] + c[2], + c[0] * _y[1] + c[2]}; + + float c4yc6[2] = {c[4] * _y[0] + c[6], + c[4] * _y[1] + c[6]}; + + float c0xc1[2] = {c[0] * _x[0] + c[1], + c[0] * _x[1] + c[1]}; + + float c4xc5[2] = {c[4] * _x[0] + c[5], + c[4] * _x[1] + c[5]}; + bool isIn1 = (_x[0] >=0 && _x[0] <=1 && _y[0] >= 0 && _y[0] <= 1); + bool isIn2 = (_x[1] >=0 && _x[1] <=1 && _y[1] >= 0 && _y[1] <= 1); + + bool isSolution1 = checkSolution(_x[0], _y[0], c ); + bool isSolution2 = checkSolution(_x[1], _y[1], c ); + bool isSink1 = isIn1 && isSolution1; + bool 
isSink2 = isIn2 && isSolution2; + + if ( isSink1 != isSink2) + { + a = isSink1 ? _x[0] : _x[1]; + b = isSink1 ? _y[0] : _y[1]; + ab = a * b; + hasSolved = true; + delta.x = inextPt.x + a - nextPt.x; + delta.y = inextPt.y + b - nextPt.y; + } // isIn1 != isIn2 + } + if( hasSolved == false) + noIteration++; + } + else + { + hasSolved = false; + noReusedIteration++; + } + if( hasSolved == false ) + { + + cv::Vec2f mismatchVec = ab * Mc0 + Mc1 *a + Mc2 * b + Mc3; + delta.x = (A12 * mismatchVec.val[1] - A22 * mismatchVec.val[0]) * D; + delta.y = (A12 * mismatchVec.val[0] - A11 * mismatchVec.val[1]) * D; + delta.x = MAX(-1.f, MIN(1.f, delta.x)); + delta.y = MAX(-1.f, MIN(1.f, delta.y)); + nextPt += delta; + nextPts[ptidx] = nextPt + halfWin; + } + else + { + nextPt += delta; + nextPts[ptidx] = nextPt + halfWin; + noSolvedIteration++; + break; + } + + delta.x = ( delta.x != delta.x) ? 0 : delta.x; + delta.y = ( delta.y != delta.y) ? 0 : delta.y; + + + cv::Point2f totalDistance = backUpNextPt - nextPts[ptidx]; + + if(j > 0 && ( + (std::abs(delta.x - prevDelta.x) < 0.01 && std::abs(delta.y - prevDelta.y) < 0.01))) + { + nextPts[ptidx] -= delta*0.5f; + break; + } + + + prevDelta = delta; + } + + } + + } + + const Mat* prevImg; + const Mat* nextImg; + const Mat* prevDeriv; + const Mat* rgbPrevImg; + const Mat* rgbNextImg; + const Point2f* prevPts; + Point2f* nextPts; + uchar* status; + float* err; + int maxWinSize; + int minWinSize; + TermCriteria criteria; + int level; + int maxLevel; + int windowType; + float minEigThreshold; + bool useInitialFlow; + std::vector paramReset; + int crossSegmentationThreshold; + + }; + } + namespace radial + { + class TrackerInvoker : public cv::ParallelLoopBody + { + public: + TrackerInvoker( + const Mat& _prevImg, + const Mat& _prevDeriv, + const Mat& _nextImg, + const Mat& _rgbPrevImg, + const Mat& _rgbNextImg, + const Point2f* _prevPts, + Point2f* _nextPts, + uchar* _status, + float* _err, + Point2f* _gainVecs, + int _level, + int 
_maxLevel, + int _winSize[2], + int _maxIteration, + bool _useInitialFlow, + int _supportRegionType, + int _crossSegmentationThreshold, + std::vector _normSigmas, + float _minEigenValue) + { + prevImg = &_prevImg; + prevDeriv = &_prevDeriv; + nextImg = &_nextImg; + rgbPrevImg = &_rgbPrevImg; + rgbNextImg = &_rgbNextImg; + prevPts = _prevPts; + nextPts = _nextPts; + status = _status; + err = _err; + gainVecs = _gainVecs; + minWinSize = _winSize[0]; + maxWinSize = _winSize[1]; + criteria.maxCount = _maxIteration; + criteria.epsilon = 0.01; + level = _level; + maxLevel = _maxLevel; + windowType = _supportRegionType; + minEigThreshold = _minEigenValue; + paramReset = _normSigmas; + useInitialFlow = _useInitialFlow; + crossSegmentationThreshold = _crossSegmentationThreshold; + } + + TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; + void operator()(const cv::Range& range) const + { + Point2f halfWin; + cv::Size winSize; + const Mat& I = *prevImg; + const Mat& J = *nextImg; + const Mat& derivI = *prevDeriv; + const Mat& BI = *rgbPrevImg; + const float FLT_SCALE = (1.f/(1 << 16));//(1.f/(1 << 20)); // 20 + winSize = cv::Size(maxWinSize,maxWinSize); + int winMaskwidth = static_cast(ceil(winSize.width / 16.f)) * 16; + cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); + winMaskMatBuf.setTo(1); + + std::vector param = paramReset; + int cn = I.channels(), cn2 = cn*2; + int winbufwidth = static_cast(ceil(winSize.width / 16.f)) * 16; + cv::Size winBufSize(winbufwidth,winbufwidth); + + + cv::Mat invTensorMat(4,4, CV_32FC1); + cv::Mat mismatchMat(4, 1, CV_32FC1); + cv::Mat resultMat(4, 1, CV_32FC1); + + cv::AutoBuffer _buf(winBufSize.area()*(cn + cn2)); + int derivDepth = DataType::depth; + + Mat IWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn), (deriv_type*)_buf.data()); + Mat derivIWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn2), (deriv_type*)_buf.data() + winBufSize.area()*cn); + + for( int ptidx = range.start; ptidx < range.end; ptidx++ ) + { + 
Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level)); + Point2f nextPt; + if( level == maxLevel ) + { + if( useInitialFlow ) + nextPt = nextPts[ptidx]*(float)(1./(1 << level)); + else + nextPt = prevPt; + } + else + nextPt = nextPts[ptidx]*2.f; + nextPts[ptidx] = nextPt; + + Point2i iprevPt, inextPt; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + int winArea = maxWinSize * maxWinSize; + cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0,0, maxWinSize,maxWinSize)); + winMaskMatBuf.setTo(0); + if( calcWinMaskMat(BI, windowType, iprevPt, + winMaskMat,winSize,halfWin,winArea, + minWinSize,maxWinSize, crossSegmentationThreshold) == false) + continue; + + prevPt -= halfWin; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + + if( iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols || + iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows ) + { + if( level == 0 ) + { + if( status ) + status[ptidx] = 3; + if( err ) + err[ptidx] = 0; + } + continue; + } + + float a = prevPt.x - iprevPt.x; + float b = prevPt.y - iprevPt.y; + const int W_BITS = 14, W_BITS1 = 14; + + int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + + int dstep = (int)(derivI.step/derivI.elemSize1()); + int step = (int)(I.step/I.elemSize1()); + int mStep = (int)(winMaskMat.step/winMaskMat.elemSize1()); + CV_Assert( step == (int)(J.step/J.elemSize1()) ); + + float A11 = 0, A12 = 0, A22 = 0; + float b1 = 0, b2 = 0, b3 = 0, b4 = 0; + + // tensor + float sumIx = 0; + float sumIy = 0; + float sumI = 0; + float sumW = 0; + float w1 = 0, w2 = 0; // -IyI + float dI = 0; // I^2 + float D = 0; + + #ifdef CV_SSE2 + __m128i rightMask = _mm_set_epi16(0, std::numeric_limits::max(), 0, std::numeric_limits::max(), 0, std::numeric_limits::max(), 0, std::numeric_limits::max()); + __m128i leftMask = 
_mm_set_epi16(std::numeric_limits::max(), 0, std::numeric_limits::max(), 0, std::numeric_limits::max(), 0, std::numeric_limits::max(), 0); + + __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128i z = _mm_setzero_si128(); + __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1-1)); + __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1-5-1)); + __m128i mmMask4_epi32; + __m128i mmMaskSet_epi16 = _mm_set1_epi16(std::numeric_limits::max()); + get4BitMask(winSize.width, mmMask4_epi32); + __m128 qA11 = _mm_setzero_ps(), qA12 = _mm_setzero_ps(), qA22 = _mm_setzero_ps(); + #endif + + // extract the patch from the first image, compute covariation matrix of derivatives + int x, y; + for( y = 0; y < winSize.height; y++ ) + { + x = 0; + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; + const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; + + short* Iptr = (short*)(IWinBuf.data + y*IWinBuf.step); + short* dIptr = (short*)(derivIWinBuf.data + y*derivIWinBuf.step); + + + + #ifdef CV_SSE2 + for( ; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 ) + { + __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16); + __m128i mask_0_3_epi16 = _mm_unpacklo_epi16(mask_0_7_epi16, mask_0_7_epi16); + + + __m128i v00, v01, v10, v11, t0, t1; + v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); + v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); + v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); + v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), 
W_BITS1-5); + if( x + 4 > winSize.width) + { + t0 = _mm_and_si128(t0, mmMask4_epi32); + } + t0 = _mm_and_si128(t0, mask_0_3_epi16); + _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0,t0)); + + + v00 = _mm_loadu_si128((const __m128i*)(dsrc)); + v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); + v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep)); + v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); + v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... + if( x + 4 > winSize.width) + { + v00 = _mm_and_si128(v00, mmMask4_epi32); + } + v00 = _mm_and_si128(v00, mask_0_3_epi16); + _mm_storeu_si128((__m128i*)dIptr, v00); + } + #else + + for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2 ) + { + if( winMaskMat.at(y,x) == 0) + { + dIptr[0] = 0; + dIptr[1] = 0; + continue; + } + int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + + src[x+step]*iw10 + src[x+step+cn]*iw11, W_BITS1-5); + int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + + dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 + + dsrc[dstep+cn2+1]*iw11, W_BITS1); + + Iptr[x] = (short)ival; + dIptr[0] = (short)ixval; + dIptr[1] = (short)iyval; + + } + #endif + } + + cv::Mat residualMat = cv::Mat::zeros(winSize.height * (winSize.width + 8) * cn, 1, CV_16SC1); + + cv::Point2f backUpNextPt = nextPt; + nextPt -= halfWin; + Point2f prevDelta(0,0); //relates to h(t-1) + Point2f prevGain(1,0); + cv::Point2f gainVec = gainVecs[ptidx]; + cv::Point2f backUpGain = gainVec; + cv::Size _winSize = winSize; + int j; + __m128i mmMask0, mmMask1, 
mmMask; + getWBitMask(_winSize.width, mmMask0, mmMask1, mmMask); + __m128 mmOnes = _mm_set1_ps(1.f ); + __m128i mmOnes_epi16 = _mm_set1_epi16(1 ); + __m128i mmZeros_epi16 = _mm_set1_epi16(0); + float MEstimatorScale = 1; + int buffIdx = 0; + float c[8]; + cv::Mat GMc0, GMc1, GMc2, GMc3; + cv::Vec4f Mc0, Mc1, Mc2, Mc3; + int noIteration = 0; + int noReusedIteration = 0; + int noSolvedIteration = 0; + for( j = 0; j < criteria.maxCount; j++ ) + { + cv::Point2f delta(0,0); + cv::Point2f deltaGain(0,0); + bool hasSolved = false; + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + float ab = a * b; + if( j == 0 + || (inextPt.x != cvFloor(nextPt.x) || inextPt.y != cvFloor(nextPt.y) || j % 2 != 0) ) + { + inextPt.x = cvFloor(nextPt.x); + inextPt.y = cvFloor(nextPt.y); + + if( inextPt.x < -winSize.width || inextPt.x >= J.cols || + inextPt.y < -winSize.height || inextPt.y >= J.rows ) + { + if( level == 0 && status ) + status[ptidx] = 3; + if (level > 0) + { + nextPts[ptidx] = backUpNextPt; + gainVecs[ptidx] = backUpGain; + } + noIteration++; + break; + } + + + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + ab = a * b; + iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + // mismatch + b1 = 0; + b2 = 0; + b3 = 0; + b4 = 0; + if( j == 0) + { + // tensor + w1 = 0; // -IxI + w2 = 0; // -IyI + dI = 0; // I^2 + sumIx = 0; + sumIy = 0; + sumI = 0; + sumW = 0; + A11 = 0; + A12 = 0; + A22 = 0; + } + + if ( j == 0 ) + { + buffIdx = 0; + for( y = 0; y < winSize.height; y++ ) + { + const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y )*step + (inextPt.x )*cn; + const short* Iptr = (const short*)(IWinBuf.data + (y)*IWinBuf.step); + const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + x = 0; + for( ; x < winSize.width*cn; x++, dIptr 
+= 2) + { + if( maskPtr[x] == 0) + continue; + int diff = static_cast(CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, W_BITS1-5) + - Iptr[x] + Iptr[x] * gainVec.x +gainVec.y); + residualMat.at(buffIdx++) = static_cast(diff); + } + } + /*! Estimation for the residual */ + cv::Mat residualMatRoi(residualMat, cv::Rect(0,0,1, buffIdx)); + MEstimatorScale = (buffIdx == 0) ? 1.f : estimateScale(residualMatRoi); + } + + float eta = 1.f / winArea; + float fParam0 = param[0] * 32.f; + float fParam1 = param[1] * 32.f; + fParam0 = param[0] * MEstimatorScale; + fParam1 = param[1] * MEstimatorScale; + +#ifdef CV_SSE2 + + qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128 qb0[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; + __m128 qb1[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; + __m128 qb2[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; + __m128 qb3[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; + __m128 mmSumW1 = _mm_setzero_ps(), mmSumW2 = _mm_setzero_ps(); + __m128 mmSumI = _mm_setzero_ps(), mmSumW = _mm_setzero_ps(), mmSumDI = _mm_setzero_ps(); + __m128 mmSumIy = _mm_setzero_ps(), mmSumIx = _mm_setzero_ps(); + __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps(); + __m128i mmParam0 = _mm_set1_epi16(MIN(std::numeric_limits::max() -1, static_cast(fParam0))); + __m128i mmParam1 = _mm_set1_epi16(MIN(std::numeric_limits::max()- 1, static_cast(fParam1))); + + + float s2Val = param[2] > 0 ? param[2] : -param[2]; + int s2bitShift = param[2] == 0 ? 
1 : static_cast(ceil(log(200.f / s2Val) / log(2.f))); + __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast(param[2] * (float)(1 << s2bitShift))); + __m128i mmOness_epi16 = _mm_set1_epi16(1 << s2bitShift); + __m128 mmParam2 = _mm_set1_ps(param[2]); + __m128 mmParam2s = _mm_set1_ps(0.01f * param[2]); + __m128 mmParam2s2 = _mm_set1_ps(param[2] * param[2]); + float gainVal = gainVec.x > 0 ? gainVec.x : -gainVec.x; + int bitShift = gainVec.x == 0 ? 1 : static_cast(ceil(log(200.f / gainVal) / log(2.f))); + __m128i mmGainValue_epi16 = _mm_set1_epi16(static_cast(gainVec.x * (float)(1 << bitShift))); + __m128i mmConstValue_epi16 = _mm_set1_epi16(static_cast(gainVec.y)); + __m128i mmEta = _mm_setzero_si128(); + __m128i mmEtaNeg = _mm_set1_epi16(-1); + __m128i mmScale = _mm_set1_epi16(static_cast(MEstimatorScale)); + +#endif + + buffIdx = 0; + float _b0[4] = {0,0,0,0}; + float _b1[4] = {0,0,0,0}; + float _b2[4] = {0,0,0,0}; + float _b3[4] = {0,0,0,0}; + float _bm[4] = {0,0,0,0}; + float _bc[4] = {0,0,0,0}; + /* + */ + for( y = 0; y < _winSize.height; y++ ) + { + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x )*cn; + const short* Iptr = (const short*)(IWinBuf.data + y*IWinBuf.step); + const short* dIptr = (const short*)(derivIWinBuf.data + y*derivIWinBuf.step); + + x = 0; +#ifdef CV_SSE2 + for( ; x <= _winSize.width*cn; x += 8, dIptr += 8*2 ) + { + __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16); + __m128i I_0_7_epi16 = _mm_loadu_si128((const __m128i*)(Iptr + x)); + + __m128i v00 = _mm_unpacklo_epi8( + _mm_loadl_epi64((const __m128i*)(Jptr + x)), z); + __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); + __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); + __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + 
x + step + cn)), z); + + __m128i t0 = _mm_add_epi32 + (_mm_madd_epi16( + _mm_unpacklo_epi16(v00, v01), + qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1-5); + + __m128i lo = _mm_mullo_epi16(mmGainValue_epi16, I_0_7_epi16); + __m128i hi = _mm_mulhi_epi16(mmGainValue_epi16, I_0_7_epi16); + + __m128i Igain_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), bitShift); + __m128i Igain_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), bitShift); + __m128i Igain_epi16 = _mm_packs_epi32(Igain_0_3_epi32, Igain_4_7_epi32); + + + __m128i diffValue = _mm_subs_epi16(_mm_add_epi16(Igain_epi16, mmConstValue_epi16), I_0_7_epi16); + __m128i mmDiffc_epi16[4] = + { + _mm_add_epi16(_mm_slli_epi16(v11, 5), diffValue), + _mm_add_epi16(_mm_slli_epi16(v01, 5), diffValue), + _mm_add_epi16(_mm_slli_epi16(v10, 5), diffValue), + _mm_add_epi16(_mm_slli_epi16(v00, 5), diffValue) + }; + + __m128i mmDiff_epi16 = _mm_add_epi16( _mm_packs_epi32(t0, t1), diffValue); + + + mmDiff_epi16 = _mm_and_si128(mmDiff_epi16, mask_0_7_epi16); + + __m128i scalediffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, mmScale); + mmEta = _mm_add_epi16(mmEta, _mm_add_epi16(_mm_and_si128(scalediffIsPos_epi16, _mm_set1_epi16(2)), _mm_set1_epi16(-1))); + + + __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); + __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); + + + __m128i abs_epi16 = _mm_abs_epi16(mmDiff_epi16); + __m128i bSet2_epi16, bSet1_epi16; + // |It| < sigma1 ? + bSet2_epi16 = _mm_cmplt_epi16(abs_epi16, mmParam1); + // It > 0 ? + __m128i diffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, _mm_setzero_si128()); + // sigma0 < |It| < sigma1 ? 
+ bSet1_epi16 = _mm_and_si128(bSet2_epi16, _mm_cmpgt_epi16(abs_epi16, mmParam0)); + // val = |It| -/+ sigma1 + __m128i tmpParam1_epi16 = _mm_add_epi16(_mm_and_si128(diffIsPos_epi16, _mm_sub_epi16(mmDiff_epi16, mmParam1)), + _mm_andnot_si128(diffIsPos_epi16, _mm_add_epi16(mmDiff_epi16, mmParam1))); + // It == 0 ? |It| > sigma13 + mmDiff_epi16 = _mm_and_si128(bSet2_epi16, mmDiff_epi16); + + for( unsigned int mmi = 0; mmi < 4; mmi++) + { + __m128i mmDiffc_epi16_t = _mm_and_si128(mmDiffc_epi16[mmi], mask_0_7_epi16); + mmDiffc_epi16_t = _mm_and_si128(bSet2_epi16, mmDiffc_epi16_t); + + // It == val ? sigma0 < |It| < sigma1 + mmDiffc_epi16_t = _mm_blendv_epi8(mmDiffc_epi16_t, tmpParam1_epi16, bSet1_epi16); + __m128i tale_epi16_ = _mm_blendv_epi8(mmOness_epi16, mmParam2_epi16, bSet1_epi16); // mask for 0 - 3 + // diff = diff * sigma2 + lo = _mm_mullo_epi16(tale_epi16_, mmDiffc_epi16_t); + hi = _mm_mulhi_epi16(tale_epi16_, mmDiffc_epi16_t); + __m128i diff_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), s2bitShift); + __m128i diff_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), s2bitShift); + + mmDiffc_epi16_t = _mm_packs_epi32(diff_0_3_epi32, diff_4_7_epi32); + __m128i diff1 = _mm_unpackhi_epi16(mmDiffc_epi16_t, mmDiffc_epi16_t); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13... + __m128i diff0 = _mm_unpacklo_epi16(mmDiffc_epi16_t, mmDiffc_epi16_t); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9... + + // Ix * diff / Iy * diff + v10 = _mm_mullo_epi16(Ixy_0, diff0); + v11 = _mm_mulhi_epi16(Ixy_0, diff0); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + + qb0[mmi] = _mm_add_ps(qb0[mmi], _mm_cvtepi32_ps(v00)); + qb1[mmi] = _mm_add_ps(qb1[mmi], _mm_cvtepi32_ps(v10)); + // It * Ix It * Iy [4 ... 
7] + // for set 1 hi sigma 1 + v10 = _mm_mullo_epi16(Ixy_1, diff1); + v11 = _mm_mulhi_epi16(Ixy_1, diff1); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb0[mmi] = _mm_add_ps(qb0[mmi], _mm_cvtepi32_ps(v00)); + qb1[mmi] = _mm_add_ps(qb1[mmi], _mm_cvtepi32_ps(v10)); + // diff * J [0 ... 7] + // for set 1 sigma 1 + // b3 += diff * Iptr[x] + v10 = _mm_mullo_epi16(mmDiffc_epi16_t, I_0_7_epi16); + v11 = _mm_mulhi_epi16(mmDiffc_epi16_t, I_0_7_epi16); + v00 = _mm_unpacklo_epi16(v10, v11); + + v10 = _mm_unpackhi_epi16(v10, v11); + qb2[mmi] = _mm_add_ps(qb2[mmi], _mm_cvtepi32_ps(v00)); + qb2[mmi] = _mm_add_ps(qb2[mmi], _mm_cvtepi32_ps(v10)); + qb3[mmi] = _mm_add_ps(qb3[mmi], _mm_cvtepi32_ps(diff_0_3_epi32)); + qb3[mmi] = _mm_add_ps(qb3[mmi], _mm_cvtepi32_ps(diff_4_7_epi32)); + } + + if( j == 0 ) + { + __m128 bSet1_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet1_epi16)); + __m128 bSet1_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet1_epi16,bSet1_epi16), 16)); + __m128 mask_0_4_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mask_0_7_epi16)); + __m128 mask_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mask_0_7_epi16, mask_0_7_epi16),16))); + + __m128 bSet2_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet2_epi16)); + __m128 bSet2_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet2_epi16, bSet2_epi16),16)); + + __m128 tale_0_3_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_0_3_ps); + __m128 tale_4_7_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_4_7_ps); + tale_0_3_ps = _mm_blendv_ps(mmParam2s, tale_0_3_ps, bSet2_0_3_ps); + tale_4_7_ps = _mm_blendv_ps(mmParam2s, tale_4_7_ps, bSet2_4_7_ps); + + tale_0_3_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_0_3_ps, mask_0_4_ps); + tale_4_7_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_4_7_ps, mask_4_7_ps); + + t0 = _mm_srai_epi32(Ixy_0, 16); // Iy0 Iy1 Iy2 Iy3 + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16); // Ix0 Ix1 Ix2 Ix3 + + __m128 fy = _mm_cvtepi32_ps(t0); + __m128 fx = 
_mm_cvtepi32_ps(t1); + + // 0 ... 3 + __m128 I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(I_0_7_epi16, I_0_7_epi16), 16)); + + // A11 - A22 + __m128 fxtale = _mm_mul_ps(fx, tale_0_3_ps); + __m128 fytale = _mm_mul_ps(fy, tale_0_3_ps); + + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); + mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); + mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); + + // sumIx und sumIy + mmSumIx = _mm_add_ps(mmSumIx, fxtale); + mmSumIy = _mm_add_ps(mmSumIy, fytale); + + mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fxtale)); + mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fytale)); + + // sumI + __m128 I_tale_ps = _mm_mul_ps(I_ps, tale_0_3_ps); + mmSumI = _mm_add_ps(mmSumI,I_tale_ps); + + // sumW + mmSumW = _mm_add_ps(mmSumW, tale_0_3_ps); + + // sumDI + mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps( I_ps, I_tale_ps)); + + + t0 = _mm_srai_epi32(Ixy_1, 16); // Iy8 Iy9 Iy10 Iy11 + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_1, 16), 16); // Ix0 Ix1 Ix2 Ix3 + + fy = _mm_cvtepi32_ps(t0); + fx = _mm_cvtepi32_ps(t1); + + // 4 ... 
7 + I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(I_0_7_epi16, I_0_7_epi16), 16)); + + // A11 - A22 + fxtale = _mm_mul_ps(fx, tale_4_7_ps); + fytale = _mm_mul_ps(fy, tale_4_7_ps); + + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); + mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); + mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); + + // sumIx und sumIy + mmSumIx = _mm_add_ps(mmSumIx, fxtale); + mmSumIy = _mm_add_ps(mmSumIy, fytale); + + mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fxtale)); + mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fytale)); + + // sumI + I_tale_ps = _mm_mul_ps(I_ps, tale_4_7_ps); + mmSumI = _mm_add_ps(mmSumI, I_tale_ps); + + // sumW + mmSumW = _mm_add_ps(mmSumW, tale_4_7_ps); + + // sumDI + mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps( I_ps, I_tale_ps)); + } + + } +#else + for( ; x < _winSize.width*cn; x++, dIptr += 2 ) + { + if( maskPtr[x] == 0) + continue; + + short ixval = static_cast(dIptr[0]); + short iyval = static_cast(dIptr[1]); + int illValue = Iptr[x] * gainVec.x + gainVec.y - Iptr[x]; + + int It[4] = {(Jptr[x+step+cn]<< 5) + illValue, + (Jptr[x+cn]<< 5) + illValue, + (Jptr[x+step]<< 5) + illValue, + (Jptr[x] << 5) + illValue}; + + + float J = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + + Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, + W_BITS1-5); + + int diff = J + illValue; + + + + MEstimatorScale += (diff < MEstimatorScale) ? -eta : eta; + + int abss = (diff < 0) ? 
-diff : diff; + + + // compute the missmatch vector + if( j >= 0) + { + if( abss > fParam1) + { + It[0] = 0; + It[1] = 0; + It[2] = 0; + It[3] = 0; + } + else if( abss > fParam0 && diff >= 0 ) + { + It[0] = param[2] * (It[0] - fParam1); + It[1] = param[2] * (It[1] - fParam1); + It[2] = param[2] * (It[2] - fParam1); + It[3] = param[2] * (It[3] - fParam1); + } + else if( abss > fParam0 && diff < 0 ) + { + It[0] = param[2] * (It[0] + fParam1); + It[1] = param[2] * (It[1] + fParam1); + It[2] = param[2] * (It[2] + fParam1); + It[3] = param[2] * (It[3] + fParam1); + } + } + _b0[0] += (float)(It[0]*dIptr[0]) ; + _b0[1] += (float)(It[1]*dIptr[0]) ; + _b0[2] += (float)(It[2]*dIptr[0]) ; + _b0[3] += (float)(It[3]*dIptr[0]) ; + + + _b1[0] += (float)(It[0]*dIptr[1]) ; + _b1[1] += (float)(It[1]*dIptr[1]) ; + _b1[2] += (float)(It[2]*dIptr[1]) ; + _b1[3] += (float)(It[3]*dIptr[1]) ; + + _b2[0] += (float)(It[0])*Iptr[x] ; + _b2[1] += (float)(It[1])*Iptr[x] ; + _b2[2] += (float)(It[2])*Iptr[x] ; + _b2[3] += (float)(It[3])*Iptr[x] ; + + _b3[0] += (float)(It[0]); + _b3[1] += (float)(It[1]); + _b3[2] += (float)(It[2]); + _b3[3] += (float)(It[3]); + + // compute the Gradient Matrice + if( j == 0) + { + float tale = param[2] * FLT_RESCALE; + if( abss < fParam0 || j < 0) + { + tale = FLT_RESCALE; + } + else if( abss > fParam1) + { + tale *= 0.01f; + } + else + { + tale *= param[2]; + } + if( j == 0 ) + { + A11 += (float)(ixval*ixval)*tale; + A12 += (float)(ixval*iyval)*tale; + A22 += (float)(iyval*iyval)*tale; + } + + dI += Iptr[x] * Iptr[x] * tale; + float dx = static_cast(dIptr[0]) * tale; + float dy = static_cast(dIptr[1]) * tale; + sumIx += dx; + sumIy += dy; + w1 += dx * Iptr[x]; + w2 += dy * Iptr[x]; + sumI += Iptr[x] * tale; + sumW += tale; + } + + } +#endif + } + +#ifdef CV_SSE2 + short etaValues[8]; + _mm_storeu_si128((__m128i*)(etaValues), mmEta); + MEstimatorScale += eta * (etaValues[0] + etaValues[1] + etaValues[2] + etaValues[3] + + etaValues[4] + etaValues[5] + etaValues[6] 
+ etaValues[7]); + +#endif + float CV_DECL_ALIGNED(32) wbuf[4];// + if( j == 0 ) + { +#ifdef CV_SSE2 + _mm_store_ps(wbuf, mmSumW1); + w1 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumW2); + w2 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumDI); + dI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumI); + sumI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumIx); + sumIx = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumIy); + sumIy = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumW); + sumW = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; +#endif + sumIx *= -FLT_SCALE; + sumIy *= -FLT_SCALE; + sumI *=FLT_SCALE; + sumW *= FLT_SCALE; + w1 *= -FLT_SCALE; + w2 *= -FLT_SCALE; + dI *= FLT_SCALE; + + +#ifdef CV_SSE2 + float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];// + + _mm_store_ps(A11buf, mmAxx); + _mm_store_ps(A12buf, mmAxy); + _mm_store_ps(A22buf, mmAyy); + + + A11 = A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; + A12 = A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; + A22 = A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; +#endif + A11 *= FLT_SCALE; // 54866744. 
+ A12 *= FLT_SCALE; // -628764.00 + A22 *= FLT_SCALE; // 19730.000 + + D = - A12*A12*sumI*sumI + dI*sumW*A12*A12 + 2*A12*sumI*sumIx*w2 + 2*A12*sumI*sumIy*w1 + - 2*dI*A12*sumIx*sumIy - 2*sumW*A12*w1*w2 + A11*A22*sumI*sumI - 2*A22*sumI*sumIx*w1 + - 2*A11*sumI*sumIy*w2 - sumIx*sumIx*w2*w2 + A22*dI*sumIx*sumIx + 2*sumIx*sumIy*w1*w2 + - sumIy*sumIy*w1*w1 + A11*dI*sumIy*sumIy + A22*sumW*w1*w1 + A11*sumW*w2*w2 - A11*A22*dI*sumW; + + float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + + 4.f*A12*A12))/(2*winArea); + if( minEig < minEigThreshold || std::abs(D) < FLT_EPSILON ) + { + if( level == 0 && status ) + status[ptidx] = 0; + if( level > 0) + { + nextPts[ptidx] = backUpNextPt; + gainVecs[ptidx] = backUpGain; + } + noIteration++; + break; + } + + + D = (1.f / D); + + invTensorMat.at(0,0) = (A22*sumI*sumI - 2*sumI*sumIy*w2 + dI*sumIy*sumIy + sumW*w2*w2 - A22*dI*sumW)* D; + invTensorMat.at(0,1) = (A12*dI*sumW - A12*sumI * sumI - dI*sumIx*sumIy + sumI*sumIx*w2 + sumI*sumIy*w1 - sumW*w1*w2)* D; + invTensorMat.at(0,2) = (A12*sumI*sumIy - sumIy*sumIy*w1 - A22*sumI*sumIx - A12*sumW*w2 + A22*sumW*w1 + sumIx*sumIy*w2)* D; + invTensorMat.at(0,3) = (A22*dI*sumIx - A12*dI*sumIy - sumIx*w2*w2 + A12*sumI*w2 - A22*sumI*w1 + sumIy*w1*w2) * D; + invTensorMat.at(1,0) = invTensorMat.at(0,1); + invTensorMat.at(1,1) = (A11*sumI * sumI - 2*sumI*sumIx*w1 + dI*sumIx * sumIx + sumW*w1*w1 - A11*dI*sumW) * D; + invTensorMat.at(1,2) = (A12*sumI*sumIx - A11*sumI*sumIy - sumIx * sumIx*w2 + A11*sumW*w2 - A12*sumW*w1 + sumIx*sumIy*w1) * D; + invTensorMat.at(1,3) = (A11*dI*sumIy - sumIy*w1*w1 - A12*dI*sumIx - A11*sumI*w2 + A12*sumI*w1 + sumIx*w1*w2)* D; + invTensorMat.at(2,0) = invTensorMat.at(0,2); + invTensorMat.at(2,1) = invTensorMat.at(1,2); + invTensorMat.at(2,2) = (sumW*A12*A12 - 2*A12*sumIx*sumIy + A22*sumIx*sumIx + A11*sumIy*sumIy - A11*A22*sumW)* D; + invTensorMat.at(2,3) = (A11*A22*sumI - A12*A12*sumI - A11*sumIy*w2 + A12*sumIx*w2 + A12*sumIy*w1 - A22*sumIx*w1)* D; + 
invTensorMat.at(3,0) = invTensorMat.at(0,3); + invTensorMat.at(3,1) = invTensorMat.at(1,3); + invTensorMat.at(3,2) = invTensorMat.at(2,3); + invTensorMat.at(3,3) = (dI*A12*A12 - 2*A12*w1*w2 + A22*w1*w1 + A11*w2*w2 - A11*A22*dI)* D; + } + + + #ifdef CV_SSE2 + float CV_DECL_ALIGNED(16) bbuf[4]; + for(int mmi = 0; mmi < 4; mmi++) + { + + _mm_store_ps(bbuf, _mm_add_ps(qb0[mmi], qb1[mmi])); + _b0[mmi] = bbuf[0] + bbuf[2]; + _b1[mmi] = bbuf[1] + bbuf[3]; + _mm_store_ps(bbuf, qb2[mmi]); + _b2[mmi] = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; + _mm_store_ps(bbuf, qb3[mmi]); + _b3[mmi] = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; + + } + #endif + + _b0[0] *= FLT_SCALE;_b0[1] *= FLT_SCALE;_b0[2] *= FLT_SCALE;_b0[3] *= FLT_SCALE; + _b1[0] *= FLT_SCALE;_b1[1] *= FLT_SCALE;_b1[2] *= FLT_SCALE;_b1[3] *= FLT_SCALE; + _b2[0] *= FLT_SCALE;_b2[1] *= FLT_SCALE;_b2[2] *= FLT_SCALE;_b2[3] *= FLT_SCALE; + _b3[0] *= FLT_SCALE;_b3[1] *= FLT_SCALE;_b3[2] *= FLT_SCALE;_b3[3] *= FLT_SCALE; + + + Mc0[0] = _b0[0] - _b0[1] - _b0[2] + _b0[3]; + Mc0[1] = _b1[0] - _b1[1] - _b1[2] + _b1[3]; + Mc0[2] = -(_b2[0] - _b2[1] - _b2[2] + _b2[3]); + Mc0[3] = -(_b3[0] - _b3[1] - _b3[2] + _b3[3]); + + Mc1[0] = _b0[1] - _b0[3]; + Mc1[1] = _b1[1] - _b1[3]; + Mc1[2] = -(_b2[1] - _b2[3]); + Mc1[3] = -(_b3[1] - _b3[3]); + + + Mc2[0] = _b0[2] - _b0[3]; + Mc2[1] = _b1[2] - _b1[3]; + Mc2[2] = -(_b2[2] - _b2[3]); + Mc2[3] = -(_b3[2] - _b3[3]); + + + Mc3[0] = _b0[3]; + Mc3[1] = _b1[3]; + Mc3[2] = -_b2[3]; + Mc3[3] = -_b3[3]; + + // + c[0] = -Mc0[0]; + c[1] = -Mc1[0]; + c[2] = -Mc2[0]; + c[3] = -Mc3[0]; + c[4] = -Mc0[1]; + c[5] = -Mc1[1]; + c[6] = -Mc2[1]; + c[7] = -Mc3[1]; + + float e0 = 1.f / (c[6] * c[0] - c[4] * c[2]); + float e1 = e0 * 0.5f * (c[6] * c[1] + c[7] * c[0] - c[5] * c[2] - c[4] * c[3]); + float e2 = e0 * (c[1] * c[7] -c[3] * c[5]); + e0 = e1 * e1 - e2; + hasSolved = false; + if ( e0 > 0) + { + e0 = sqrt(e0); + float _y[2] = {-e1 - e0, e0 - e1}; + float c0yc1[2] = {c[0] * _y[0] + c[1], + c[0] * _y[1] + c[1]}; + 
float _x[2] = {-(c[2] * _y[0] + c[3]) / c0yc1[0], + -(c[2] * _y[1] + c[3]) / c0yc1[1]}; + float c0yc2[2] = {c[0] * _y[0] + c[2], + c[0] * _y[1] + c[2]}; + + float c4yc6[2] = {c[4] * _y[0] + c[6], + c[4] * _y[1] + c[6]}; + + float c0xc1[2] = {c[0] * _x[0] + c[1], + c[0] * _x[1] + c[1]}; + + float c4xc5[2] = {c[4] * _x[0] + c[5], + c[4] * _x[1] + c[5]}; + bool isIn1 = (_x[0] >=0 && _x[0] <=1 && _y[0] >= 0 && _y[0] <= 1); + bool isIn2 = (_x[1] >=0 && _x[1] <=1 && _y[1] >= 0 && _y[1] <= 1); + + bool isSolution1 = checkSolution(_x[0], _y[0], c ); + bool isSolution2 = checkSolution(_x[1], _y[1], c ); + bool isSink1 = isIn1 && isSolution1; + bool isSink2 = isIn2 && isSolution2; + + if ( isSink1 != isSink2) + { + a = isSink1 ? _x[0] : _x[1]; + b = isSink1 ? _y[0] : _y[1]; + ab = a * b; + hasSolved = true; + delta.x = inextPt.x + a - nextPt.x; + delta.y = inextPt.y + b - nextPt.y; + + cv::Vec4f mismatchVec = ab * Mc0 + Mc1 *a + Mc2 * b + Mc3; + deltaGain = est_DeltaGain(invTensorMat, mismatchVec); + + } // isIn1 != isIn2 + } + if( hasSolved == false) + noIteration++; + } + else + { + hasSolved = false; + noReusedIteration++; + } + if( hasSolved == false ) + { + + cv::Vec4f mismatchVec = ab * Mc0 + Mc1 *a + Mc2 * b + Mc3; + est_Result(invTensorMat, mismatchVec, delta, deltaGain); + + delta.x = MAX(-1.f, MIN( 1.f , delta.x)); + delta.y = MAX(-1.f, MIN( 1.f , delta.y)); + + + if( j == 0) + prevGain = deltaGain; + gainVec += deltaGain; + nextPt += delta ; + nextPts[ptidx] = nextPt + halfWin; + gainVecs[ptidx]= gainVec; + + } + else + { + nextPt += delta; + nextPts[ptidx] = nextPt + halfWin; + gainVecs[ptidx]= gainVec + deltaGain; + noSolvedIteration++; + break; + } + + + cv::Point2f totalDistance = backUpNextPt - nextPts[ptidx]; + + if(j > 0 && ( + (std::abs(delta.x - prevDelta.x) < 0.01 && std::abs(delta.y - prevDelta.y) < 0.01) + || ((delta.ddot(delta) <= 0.001) && std::abs(prevGain.x - deltaGain.x) < 0.01))) + { + nextPts[ptidx] -= delta*0.5f; + gainVecs[ptidx] -= deltaGain* 
0.5f; + break; + } + + + prevDelta = delta; + prevGain = deltaGain; + } + + } + + } + + const Mat* prevImg; + const Mat* nextImg; + const Mat* prevDeriv; + const Mat* rgbPrevImg; + const Mat* rgbNextImg; + const Point2f* prevPts; + Point2f* nextPts; + uchar* status; + cv::Point2f* gainVecs; // gain vector x -> multiplier y -> offset + float* err; + int maxWinSize; + int minWinSize; + TermCriteria criteria; + int level; + int maxLevel; + int windowType; + float minEigThreshold; + bool useInitialFlow; + std::vector paramReset; + int crossSegmentationThreshold; + + }; + } + } + namespace beplk + { + namespace ica{ + class TrackerInvoker : public cv::ParallelLoopBody + { + public: + TrackerInvoker(const Mat& _prevImg, + const Mat& _prevDeriv, + const Mat& _nextImg, + const Mat& _rgbPrevImg, + const Mat& _rgbNextImg, + const Point2f* _prevPts, + Point2f* _nextPts, + uchar* _status, + float* _err, + int _level, + int _maxLevel, + int _winSize[2], + int _maxIteration, + bool _useInitialFlow, + int _supportRegionType, + int _crossSegmentationThreshold, + float _minEigenValue) + { + prevImg = &_prevImg; + prevDeriv = &_prevDeriv; + nextImg = &_nextImg; + rgbPrevImg = &_rgbPrevImg; + rgbNextImg = &_rgbNextImg; + prevPts = _prevPts; + nextPts = _nextPts; + status = _status; + err = _err; + minWinSize = _winSize[0]; + maxWinSize = _winSize[1]; + criteria.maxCount = _maxIteration; + criteria.epsilon = 0.01; + level = _level; + maxLevel = _maxLevel; + windowType = _supportRegionType; + minEigThreshold = _minEigenValue; + useInitialFlow = _useInitialFlow; + crossSegmentationThreshold = _crossSegmentationThreshold; + } + + TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; + + void operator()(const cv::Range& range) const + { + cv::Size winSize; + cv::Point2f halfWin; + + const Mat& I = *prevImg; + const Mat& J = *nextImg; + const Mat& derivI = *prevDeriv; + const Mat& BI = *rgbPrevImg; + + winSize = cv::Size(maxWinSize,maxWinSize); + int winMaskwidth = 
static_cast(ceil(winSize.width / 8.f)) * 16; + cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); + winMaskMatBuf.setTo(1); + + const float FLT_SCALE = (1.f/(1 << 20)); // 20 + + int j, cn = I.channels(), cn2 = cn*2; + int winbufwidth = static_cast(ceil(winSize.width / 8.f)) * 8; + cv::Size winBufSize(winbufwidth,winbufwidth); + + std::vector _buf(winBufSize.area()*(cn + cn2)); + Mat IWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn), &_buf[0]); + Mat derivIWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn2), &_buf[winBufSize.area()*cn]); + + for( int ptidx = range.start; ptidx < range.end; ptidx++ ) + { + Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level)); + Point2f nextPt; + if( level == maxLevel ) + { + if( useInitialFlow ) + nextPt = nextPts[ptidx]*(float)(1./(1 << level)); + else + nextPt = prevPt; + } + else + nextPt = nextPts[ptidx]*2.f; + nextPts[ptidx] = nextPt; + + Point2i iprevPt, inextPt; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + int winArea = maxWinSize * maxWinSize; + cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0,0, maxWinSize,maxWinSize)); + + if( calcWinMaskMat(BI, windowType, iprevPt, + winMaskMat,winSize,halfWin,winArea, + minWinSize,maxWinSize, crossSegmentationThreshold) == false) + continue; + + prevPt -= halfWin; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + + if( iprevPt.x + halfWin.x < 0 || iprevPt.x + halfWin.x>= derivI.cols || + iprevPt.y + halfWin.y < 0 || iprevPt.y + halfWin.y >= derivI.rows ) + { + if( level == 0 ) + { + if( status ) + status[ptidx] = 3; + if( err ) + err[ptidx] = 0; + } + continue; + } + + float a = prevPt.x - iprevPt.x; + float b = prevPt.y - iprevPt.y; + const int W_BITS = 14, W_BITS1 = 14; + + int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + + int dstep = (int)(derivI.step/derivI.elemSize1()); + int step = 
(int)(I.step/I.elemSize1()); + int mStep = (int)(winMaskMat.step/winMaskMat.elemSize1()); + CV_Assert( step == (int)(J.step/J.elemSize1()) ); + float A11 = 0, A12 = 0, A22 = 0; + + #ifdef CV_SSE2 + __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128i z = _mm_setzero_si128(); + __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1-1)); + __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1-5-1)); + __m128 qA11 = _mm_setzero_ps(), qA12 = _mm_setzero_ps(), qA22 = _mm_setzero_ps(); + __m128i mmMask4_epi32; + get4BitMask(winSize.width, mmMask4_epi32); + #endif + + + int x, y; + for( y = 0; y < winSize.height; y++ ) + { + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; + const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; + + short* Iptr = (short*)(IWinBuf.data + y*IWinBuf.step); + short* dIptr = (short*)(derivIWinBuf.data + y*derivIWinBuf.step); + + x = 0; + + #ifdef CV_SSE2 + for( ; x < winSize.width*cn; x += 4, dsrc += 4*2, dIptr += 4*2 ) + { + __m128i wMask = _mm_set_epi32(MaskSet * maskPtr[x+3], + MaskSet * maskPtr[x+2], + MaskSet * maskPtr[x+1], + MaskSet * maskPtr[x]); + __m128i v00, v01, v10, v11, t0, t1; + v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); + v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); + v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); + v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); + _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0,t0)); + + v00 = _mm_loadu_si128((const __m128i*)(dsrc)); + v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); + v10 
= _mm_loadu_si128((const __m128i*)(dsrc + dstep)); + v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); + v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... + + if( x + 4 > winSize.width) + { + v00 = _mm_and_si128(v00, mmMask4_epi32); + } + v00 = _mm_and_si128(v00, wMask); + + _mm_storeu_si128((__m128i*)dIptr, v00); + t0 = _mm_srai_epi32(v00, 16); // Iy0 Iy1 Iy2 Iy3 + t1 = _mm_srai_epi32(_mm_slli_epi32(v00, 16), 16); // Ix0 Ix1 Ix2 Ix3 + + __m128 fy = _mm_cvtepi32_ps(t0); + __m128 fx = _mm_cvtepi32_ps(t1); + + qA22 = _mm_add_ps(qA22, _mm_mul_ps(fy, fy)); + qA12 = _mm_add_ps(qA12, _mm_mul_ps(fx, fy)); + qA11 = _mm_add_ps(qA11, _mm_mul_ps(fx, fx)); + } + #else + + for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2) + { + if( maskPtr[x] == 0) + { + dIptr[0] = (short)0; + dIptr[1] = (short)0; + + continue; + } + int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + + src[x+step]*iw10 + src[x+step+cn]*iw11, W_BITS1-5); + int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + + dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 + + dsrc[dstep+cn2+1]*iw11, W_BITS1); + + Iptr[x] = (short)ival; + dIptr[0] = (short)ixval; + dIptr[1] = (short)iyval; + A11 += (float)(ixval*ixval); + A12 += (float)(ixval*iyval); + A22 += (float)(iyval*iyval); + + } + + + #endif + + } + + #ifdef CV_SSE2 + float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4]; + _mm_store_ps(A11buf, qA11); + _mm_store_ps(A12buf, qA12); + _mm_store_ps(A22buf, qA22); + A11 += A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; + A12 += A12buf[0] + 
A12buf[1] + A12buf[2] + A12buf[3]; + A22 += A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; + #endif + + A11 *= FLT_SCALE; + A12 *= FLT_SCALE; + A22 *= FLT_SCALE; + + + float D = A11*A22 - A12*A12; + float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + + 4.f*A12*A12))/(2 * winArea); + + if( err ) + err[ptidx] = (float)minEig; + + if( D < FLT_EPSILON ) + { + if( level == 0 && status ) + status[ptidx] = 0; + continue; + } + + D = 1.f/D; + + cv::Point2f backUpNextPt = nextPt; + nextPt -= halfWin; + Point2f prevDelta(0,0); + + float c[8]; + // init the maske + __m128i mmMask_epi32 = _mm_set_epi32(0, std::numeric_limits::max(), 0, std::numeric_limits::max()); + __m128i mmMask_epi16 = _mm_set_epi16(0, std::numeric_limits::max(), 0, std::numeric_limits::max(), + 0, std::numeric_limits::max(), 0, std::numeric_limits::max()); + __m128i mmMask0, mmMask1, mmMask; + getWBitMask(winSize.width, mmMask0, mmMask1, mmMask); + for( j = 0; j < criteria.maxCount; j++ ) + { + cv::Point2f delta; + bool hasSolved = false; + a = nextPt.x - cvFloor(nextPt.x); + b = nextPt.y - cvFloor(nextPt.y); + float ab = a * b; + + + if( (inextPt.x != cvFloor(nextPt.x) || inextPt.y != cvFloor(nextPt.y) || j == 0)) + { + inextPt.x = cvFloor(nextPt.x); + inextPt.y = cvFloor(nextPt.y); + + + if( inextPt.x < -winSize.width || inextPt.x >= J.cols || + inextPt.y < -winSize.height || inextPt.y >= J.rows ) + { + if( level == 0 && status ) + status[ptidx] = 3; + if (level > 0) + nextPts[ptidx] = backUpNextPt; + break; + } + + + iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + + float _b1[4] = {0,0,0,0}; + float _b2[4] = {0,0,0,0}; + #ifdef CV_SSE2 + __m128 qbc0[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; + __m128 qbc1[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; + qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + 
qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps(); + #endif + for( y = 0; y < winSize.height; y++ ) + { + const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x )*cn; + const short* Iptr = (const short*)(IWinBuf.data + ( y)*IWinBuf.step); + const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); + x = 0; + #ifdef CV_SSE2 + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + for( ; x <= winSize.width*cn; x += 8, dIptr += 8*2 ) + { + if( maskPtr[x ] == 0 && maskPtr[x+1] == 0 && maskPtr[x+2] == 0 && maskPtr[x+3] == 0 + && maskPtr[x+4] == 0 && maskPtr[x+5] == 0 && maskPtr[x+6] == 0 && maskPtr[x+7] == 0) + continue; + __m128i diff0 = _mm_loadu_si128((const __m128i*)(Iptr + x)); + __m128i v00 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x)), z); + __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); + __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); + __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); + + __m128i mmDiffc_epi16[4] = + { _mm_subs_epi16(_mm_slli_epi16(v00, 5), diff0), + _mm_subs_epi16(_mm_slli_epi16(v01, 5), diff0), + _mm_subs_epi16(_mm_slli_epi16(v10, 5), diff0), + _mm_subs_epi16(_mm_slli_epi16(v11, 5), diff0) + }; + + __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... 
+ __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); + + if( x > winSize.width*cn - 8) + { + Ixy_0 = _mm_and_si128(Ixy_0, mmMask0); + Ixy_1 = _mm_and_si128(Ixy_1, mmMask1); + } + + __m128i diffc1[4] = + {_mm_unpackhi_epi16(mmDiffc_epi16[0],mmDiffc_epi16[0]), + _mm_unpackhi_epi16(mmDiffc_epi16[1],mmDiffc_epi16[1]), + _mm_unpackhi_epi16(mmDiffc_epi16[2],mmDiffc_epi16[2]), + _mm_unpackhi_epi16(mmDiffc_epi16[3],mmDiffc_epi16[3]) + }; + + __m128i diffc0[4] = + {_mm_unpacklo_epi16(mmDiffc_epi16[0],mmDiffc_epi16[0]), + _mm_unpacklo_epi16(mmDiffc_epi16[1],mmDiffc_epi16[1]), + _mm_unpacklo_epi16(mmDiffc_epi16[2],mmDiffc_epi16[2]), + _mm_unpacklo_epi16(mmDiffc_epi16[3],mmDiffc_epi16[3]) + }; + + + // It * Ix It * Iy [0 ... 3] + //mask split for multiplication + // for set 1 lo sigma 1 + + + v10 = _mm_mullo_epi16(Ixy_0, diffc0[0]); + v11 = _mm_mulhi_epi16(Ixy_0, diffc0[0]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[0] = _mm_add_ps(qbc0[0], _mm_cvtepi32_ps(v00)); + qbc1[0] = _mm_add_ps(qbc1[0], _mm_cvtepi32_ps(v10)); + + v10 = _mm_mullo_epi16(Ixy_0, diffc0[1]); + v11 = _mm_mulhi_epi16(Ixy_0, diffc0[1]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[1] = _mm_add_ps(qbc0[1], _mm_cvtepi32_ps(v00)); + qbc1[1] = _mm_add_ps(qbc1[1], _mm_cvtepi32_ps(v10)); + + v10 = _mm_mullo_epi16(Ixy_0, diffc0[2]); + v11 = _mm_mulhi_epi16(Ixy_0, diffc0[2]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[2] = _mm_add_ps(qbc0[2], _mm_cvtepi32_ps(v00)); + qbc1[2] = _mm_add_ps(qbc1[2], _mm_cvtepi32_ps(v10)); + + v10 = _mm_mullo_epi16(Ixy_0, diffc0[3]); + v11 = _mm_mulhi_epi16(Ixy_0, diffc0[3]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[3] = _mm_add_ps(qbc0[3], _mm_cvtepi32_ps(v00)); + qbc1[3] = _mm_add_ps(qbc1[3], _mm_cvtepi32_ps(v10)); + // It * Ix It * Iy [4 ... 
7] + // for set 1 hi sigma 1 + + v10 = _mm_mullo_epi16(Ixy_1, diffc1[0]); + v11 = _mm_mulhi_epi16(Ixy_1, diffc1[0]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[0] = _mm_add_ps(qbc0[0], _mm_cvtepi32_ps(v00)); + qbc1[0] = _mm_add_ps(qbc1[0], _mm_cvtepi32_ps(v10)); + + v10 = _mm_mullo_epi16(Ixy_1, diffc1[1]); + v11 = _mm_mulhi_epi16(Ixy_1, diffc1[1]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[1] = _mm_add_ps(qbc0[1], _mm_cvtepi32_ps(v00)); + qbc1[1] = _mm_add_ps(qbc1[1], _mm_cvtepi32_ps(v10)); + + v10 = _mm_mullo_epi16(Ixy_1, diffc1[2]); + v11 = _mm_mulhi_epi16(Ixy_1, diffc1[2]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[2] = _mm_add_ps(qbc0[2], _mm_cvtepi32_ps(v00)); + qbc1[2] = _mm_add_ps(qbc1[2], _mm_cvtepi32_ps(v10)); + + v10 = _mm_mullo_epi16(Ixy_1, diffc1[3]); + v11 = _mm_mulhi_epi16(Ixy_1, diffc1[3]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[3] = _mm_add_ps(qbc0[3], _mm_cvtepi32_ps(v00)); + qbc1[3] = _mm_add_ps(qbc1[3], _mm_cvtepi32_ps(v10)); + + } + #else + for( ; x < winSize.width*cn; x++, dIptr += 2 ) + { + if( dIptr[0] == 0 && dIptr[1] == 0) + continue; + /*short It[4] = {CV_DESCALE((Jptr[x]) *(1 << W_BITS),W_BITS1-5) - Iptr[x], + CV_DESCALE((Jptr[x+cn]) *(1 << W_BITS),W_BITS1-5) - Iptr[x], + CV_DESCALE((Jptr[x+step]) *(1 << W_BITS),W_BITS1-5) - Iptr[x], + CV_DESCALE((Jptr[x+step+cn])*(1 << W_BITS),W_BITS1-5) - Iptr[x]};*/ + short It[4] = {(Jptr[x] << 5) - Iptr[x], + (Jptr[x+cn]<< 5) - Iptr[x], + (Jptr[x+step]<< 5) - Iptr[x], + (Jptr[x+step+cn]<< 5) - Iptr[x]}; + + _b1[0] += (float)(It[0]*dIptr[0]); + _b1[1] += (float)(It[1]*dIptr[0]); + _b1[2] += (float)(It[2]*dIptr[0]); + _b1[3] += (float)(It[3]*dIptr[0]); + + _b2[0] += (float)(It[0]*dIptr[1]); + _b2[1] += (float)(It[1]*dIptr[1]); + _b2[2] += (float)(It[2]*dIptr[1]); + _b2[3] += (float)(It[3]*dIptr[1]); + } + #endif + + } + + #ifdef CV_SSE2 + 
float CV_DECL_ALIGNED(16) bbuf[4]; + _mm_store_ps(bbuf, _mm_add_ps(qbc0[0], qbc1[0])); + _b1[0] += bbuf[0] + bbuf[2]; + _b2[0] += bbuf[1] + bbuf[3]; + + _mm_store_ps(bbuf, _mm_add_ps(qbc0[1], qbc1[1])); + _b1[1] += bbuf[0] + bbuf[2]; + _b2[1] += bbuf[1] + bbuf[3]; + + _mm_store_ps(bbuf, _mm_add_ps(qbc0[2], qbc1[2])); + _b1[2] += bbuf[0] + bbuf[2]; + _b2[2] += bbuf[1] + bbuf[3]; + + _mm_store_ps(bbuf, _mm_add_ps(qbc0[3], qbc1[3])); + _b1[3] += bbuf[0] + bbuf[2]; + _b2[3] += bbuf[1] + bbuf[3]; + + #endif + _b1[0] *= FLT_SCALE; + _b1[1] *= FLT_SCALE; + _b1[2] *= FLT_SCALE; + _b1[3] *= FLT_SCALE; + _b2[0] *= FLT_SCALE; + _b2[1] *= FLT_SCALE; + _b2[2] *= FLT_SCALE; + _b2[3] *= FLT_SCALE; + + float c0[4] = { _b1[3] + _b1[0] - _b1[2] - _b1[1], + _b1[1] - _b1[0], + _b1[2] - _b1[0], + _b1[0]}; + float c1[4] = { _b2[3] + _b2[0] - _b2[2] - _b2[1], + _b2[1] - _b2[0], + _b2[2] - _b2[0], + _b2[0]}; + + float DA12 = A12 * D ; + float DA22 = A22 * D ; + float DA11 = A11 * D ; + c[0] = DA12 * c1[0] - DA22 * c0[0]; + c[1] = DA12 * c1[1] - DA22 * c0[1]; + c[2] = DA12 * c1[2] - DA22 * c0[2]; + c[3] = DA12 * c1[3] - DA22 * c0[3]; + c[4] = DA12 * c0[0] - DA11 * c1[0]; + c[5] = DA12 * c0[1] - DA11 * c1[1]; + c[6] = DA12 * c0[2] - DA11 * c1[2]; + c[7] = DA12 * c0[3] - DA11 * c1[3]; + + float e0 = 1.f / (c[6] * c[0] - c[4] * c[2]); + float e1 = e0 * 0.5f * (c[6] * c[1] + c[7] * c[0] - c[5] * c[2] - c[4] * c[3]); + float e2 = e0 * (c[1] * c[7] -c[3] * c[5]); + e0 = e1 * e1 - e2; + if ( e0 >= 0) + { + e0 = sqrt(e0); + float _y[2] = {-e1 - e0, e0 - e1}; + float c0yc1[2] = {c[0] * _y[0] + c[1], + c[0] * _y[1] + c[1]}; + + float _x[2] = {-(c[2] * _y[0] + c[3]) / c0yc1[0], + -(c[2] * _y[1] + c[3]) / c0yc1[1]}; + + float c0yc2[2] = {c[0] * _y[0] + c[2], + c[0] * _y[1] + c[2]}; + + float c4yc6[2] = {c[4] * _y[0] + c[6], + c[4] * _y[1] + c[6]}; + + float c0xc1[2] = {c[0] * _x[0] + c[1], + c[0] * _x[1] + c[1]}; + + float c4xc5[2] = {c[4] * _x[0] + c[5], + c[4] * _x[1] + c[5]}; + bool isIn1 = (_x[0] 
>=0 && _x[0] <=1 && _y[0] >= 0 && _y[0] <= 1); + bool isIn2 = (_x[1] >=0 && _x[1] <=1 && _y[1] >= 0 && _y[1] <= 1); + + + bool isSink1 = isIn1 && checkSolution(_x[0], _y[0], c ); + bool isSink2 = isIn2 && checkSolution(_x[1], _y[1], c ); + + + //if( isSink1 != isSink2 ) + { + if( isSink1 ) + { + hasSolved = true; + delta.x = inextPt.x + _x[0] - nextPt.x; + delta.y = inextPt.y + _y[0] - nextPt.y; + } + if (isSink2 ) + { + hasSolved = true; + delta.x = inextPt.x + _x[1] - nextPt.x; + delta.y = inextPt.y + _y[1] - nextPt.y; + } + } // isIn1 != isIn2 + } // e0 >= 0 + } + else + hasSolved = false; + if(hasSolved == false) + { + delta = Point2f( c[0] * ab + c[1] * a + c[2] * b + c[3], + c[4] * ab + c[5] * a + c[6] * b + c[7]); + nextPt += 0.7 * delta; + nextPts[ptidx] = nextPt + halfWin; + } + else + { + nextPt += delta; + nextPts[ptidx] = nextPt + halfWin; + break; + } + + + if( delta.ddot(delta) <= criteria.epsilon) + break; + + if(j > 0 && std::abs(delta.x - prevDelta.x) < 0.01 && + std::abs(delta.y - prevDelta.y) < 0.01) + { + nextPts[ptidx] -= delta*0.35f; + break; + } + + prevDelta = delta; + + } + + } + } + + + const Mat* prevImg; + const Mat* nextImg; + const Mat* prevDeriv; + const Mat* rgbPrevImg; + const Mat* rgbNextImg; + const Point2f* prevPts; + Point2f* nextPts; + uchar* status; + float* err; + int maxWinSize; + int minWinSize; + TermCriteria criteria; + int level; + int maxLevel; + int windowType; + float minEigThreshold; + bool useInitialFlow; + int crossSegmentationThreshold; + }; +} +} +} +} +#endif \ No newline at end of file diff --git a/modules/optflow/src/rlof/geo_interpolation.cpp b/modules/optflow/src/rlof/geo_interpolation.cpp new file mode 100644 index 00000000000..47e92c96b0d --- /dev/null +++ b/modules/optflow/src/rlof/geo_interpolation.cpp @@ -0,0 +1,525 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// + // + // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 
+ // + // By downloading, copying, installing or using the software you agree to this license. + // If you do not agree to this license, do not download, install, + // copy or use the software. + // + // + // License Agreement + // For Open Source Computer Vision Library + // + // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. + // Copyright (C) 2009, Willow Garage Inc., all rights reserved. + // Third party copyrights are property of their respective owners. + // + // Redistribution and use in source and binary forms, with or without modification, + // are permitted provided that the following conditions are met: + // + // * Redistribution's of source code must retain the above copyright notice, + // this list of conditions and the following disclaimer. + // + // * Redistribution's in binary form must reproduce the above copyright notice, + // this list of conditions and the following disclaimer in the documentation + // and/or other materials provided with the distribution. + // + // * The name of the copyright holders may not be used to endorse or promote products + // derived from this software without specific prior written permission. + // + // This software is provided by the copyright holders and contributors "as is" and + // any express or implied warranties, including, but not limited to, the implied + // warranties of merchantability and fitness for a particular purpose are disclaimed. + // In no event shall the Intel Corporation or contributors be liable for any direct, + // indirect, incidental, special, exemplary, or consequential damages + // (including, but not limited to, procurement of substitute goods or services; + // loss of use, data, or profits; or business interruption) however caused + // and on any theory of liability, whether in contract, strict liability, + // or tort (including negligence or otherwise) arising in any way out of + // the use of this software, even if advised of the possibility of such damage. 
+ // + //M*/ +// This functions have been contributed by Jonas Geisters +#include "precomp.hpp" +#include "geo_interpolation.hpp" +#include +#include +namespace cv +{ + namespace optflow + { + + double K_h1(float x, double h2) + { + return exp(-(0.5 / (h2)) * x); + } + + struct Graph_helper { + int * mem; + int e_size; + Graph_helper(int k, int num_nodes) { + e_size = (2 * k + 1); + mem = (int*)malloc(sizeof(int) * e_size * num_nodes); + memset(mem, 0, sizeof(int) * e_size * num_nodes); + } + inline int size(int id) { + int r_addr = id * (e_size); + return ((int*)mem)[r_addr]; + } + inline int * data(int id) { + int r_addr = id * (e_size)+1; + return mem + r_addr; + } + inline void add(int id, std::pair data) { + int r_addr = id * (e_size); + int size = ++((int*)mem)[r_addr]; + r_addr += 2 * size - 1;//== 1 + 2*(size-1); + ((float*)mem)[r_addr] = data.first; + ((int*)mem)[r_addr + 1] = data.second; + } + inline bool color_in_target(int id, int color) { + int r_addr = id * (e_size); + int size = (((int*)mem)[r_addr]); + r_addr += 2; + for (int i = 0; i < size; i++) { + if (((int*)mem)[r_addr] == color) { + return true; + } + r_addr += 2; + } + return false; + } + + }; + + Mat sgeo_dist(const Mat& gra, int y, int x, float max, Mat &prev) + { + std::vector points; + points.push_back(Point2f(static_cast(x), static_cast(y))); + return sgeo_dist(gra, points, max, prev); + } + Mat sgeo_dist(const Mat& gra, const std::vector & points, float max, Mat &prev) + { + int Dx[] = { -1,0,1,-1,1,-1,0,1 }; + int Dy[] = { -1,-1,-1,0,0,1,1,1 }; + Mat dm(gra.rows, gra.cols, CV_32F, Scalar(max)); + prev = Mat(gra.rows, gra.cols, CV_8U, Scalar(255)); + + std::multimap not_visited_with_value; + + for (auto i = points.begin(); i != points.end(); i++) + { + int y = static_cast(i->y); + int x = static_cast(i->x); + not_visited_with_value.insert(std::pair(0.f, Vec2i(y, x))); + dm.at(y, x) = 0; + } + + bool done = false; + while (!done) + { + if (not_visited_with_value.begin() == 
not_visited_with_value.end()) { + done = true; + break; + } + std::multimap::iterator current_it = not_visited_with_value.begin(); + std::pair current_p = *current_it; + not_visited_with_value.erase(current_it); + + int y = current_p.second[0]; + int x = current_p.second[1]; + float cur_d = current_p.first; + + if (dm.at(y, x) != cur_d) { + continue; + } + + Vec8f gra_e = gra.at(y, x); + + for (int i = 0; i < 8; i++) { + if (gra_e[i] < 0) { + continue; + } + int dx = Dx[i]; + int dy = Dy[i]; + + if (dm.at(y + dy, x + dx) > cur_d + gra_e[i]) { + dm.at(y + dy, x + dx) = cur_d + gra_e[i]; + prev.at(y + dy, x + dx) = static_cast(7 - i); + not_visited_with_value.insert(std::pair(cur_d + gra_e[i], Vec2i(y + dy, x + dx))); + } + + } + } + + + + + return dm; + } + + Mat interpolate_irregular_nn_raster(const std::vector & prevPoints, + const std::vector & nextPoints, + const std::vector & status, + const Mat & i1) + { + Mat gra = getGraph(i1, 0.1f); + int Dx[] = { -1,0,1,-1,1,-1,0,1 }; + int Dy[] = { -1,-1,-1,0,0,1,1,1 }; + int max_rounds = 10; + Mat dirt = Mat(gra.rows, gra.cols, CV_8U, Scalar(0)); + Mat quellknoten = Mat(gra.rows, gra.cols, CV_32S, Scalar(-1)); + Mat dist = Mat(gra.rows, gra.cols, CV_32F, Scalar(std::numeric_limits::max())); + /* + * assign quellknoten ids. 
+ */ + for (int i = 0; i < prevPoints.size(); i++) + { + int x = (int)prevPoints[i].x; + int y = (int)prevPoints[i].y; + if (status[i] == 0) + continue; + dirt.at(y, x) = 1; + dist.at(y, x) = 0; + quellknoten.at(y, x) = i; + } + + bool clean = true; + bool done = false; + int x = 0; + int y = 0; + int rounds = 0; + while (!done) { + /* + * Update x and y + * on even rounds go rasterscanorder , on odd round inverse rasterscanorder + */ + if (rounds % 2 == 0) { + x++; + if (x >= gra.cols) { + x = 0; + y++; + if (y >= gra.rows) { + y = 0; + rounds++; + y = gra.rows - 1; + x = gra.cols - 1; + if (rounds >= max_rounds || clean) { + done = true; + break; + } + } + } + } + else { + x--; + if (x < 0) { + x = gra.cols - 1; + y--; + if (y < 0) { + y = gra.rows - 1; + rounds++; + y = 0; + x = 0; + if (rounds >= max_rounds || clean) { + done = true; + break; + } + } + } + } + if (dirt.at(y, x) == 0) { + continue; + } + dirt.at(y, x) = 0; + + float c_dist = dist.at(y, x); + Vec8f gra_e = gra.at(y, x); + + for (int i = 0; i < 8; i++) { + int tx = Dx[i]; + int ty = Dy[i]; + if (ty == 0 && tx == 0) { + continue; + } + if (x + tx < 0 || x + tx >= gra.cols) { + continue; + } + if (y + ty < 0 || y + ty >= gra.rows) { + continue; + } + if (c_dist > dist.at(y + ty, x + tx)) { + if (c_dist > dist.at(y + ty, x + tx) + gra_e[i]) { + quellknoten.at(y, x) = quellknoten.at(y + ty, x + tx); + dist.at(y, x) = dist.at(y + ty, x + tx) + gra_e[i]; + dirt.at(y, x) = 1; + clean = false; + } + } + else { + if (c_dist + gra_e[i] < dist.at(y + ty, x + tx)) { + quellknoten.at(y + ty, x + tx) = quellknoten.at(y, x); + dist.at(y + ty, x + tx) = dist.at(y, x) + gra_e[i]; + dirt.at(y + ty, x + tx) = 1; + clean = false; + } + } + } + + + + } + + Mat nnFlow(i1.rows, i1.cols, CV_32FC2, Scalar(0)); + for (y = 0; y < i1.rows; y++) { + for (x = 0; x < i1.cols; x++) { + + int id = quellknoten.at(y, x); + if (id != -1) + { + nnFlow.at(y, x) = nextPoints[id] - prevPoints[id]; + } + } + } + return nnFlow; + } + + 
Mat interpolate_irregular_knn( + const std::vector & _prevPoints, + const std::vector & _nextPoints, + const std::vector & status, + const Mat &color_img, + int k, + float pixeldistance) + { + Mat in(color_img.rows, color_img.cols, CV_32FC2); + Mat mask = Mat::zeros(color_img.rows, color_img.cols, CV_8UC1); + + for (unsigned n = 0; n < _prevPoints.size(); n++) + { + if (_prevPoints[n].x >= 0 && _prevPoints[n].y >= 0 && _prevPoints[n].x < color_img.cols && _prevPoints[n].y < color_img.rows) + { + in.at(_prevPoints[n]) = _nextPoints[n] - _prevPoints[n]; + mask.at(_prevPoints[n]) = status[n]; + } + + } + int Dx[] = { -1,0,1,-1,1,-1,0,1 }; + int Dy[] = { -1,-1,-1,0,0,1,1,1 }; + Mat gra = getGraph(color_img, pixeldistance); + Mat nnFlow(in.rows, in.cols, CV_32FC2, Scalar(0)); + + std::multimap my_agents; // > + Graph_helper graph_helper(k, in.rows*in.cols); //< arrivaltime, color> + + + int color = 0; + std::vector flow_point_list; + for (int y = 0; y < in.rows; y++) { + for (int x = 0; x < in.cols; x++) { + if (mask.at(y, x) > 0) { + flow_point_list.push_back(Vec2i(y, x)); + nnFlow.at(y, x) = in.at(y, x); + + int v_id = (y * in.cols + x); + graph_helper.add(v_id, std::pair(0.f, color)); + + + Vec8f gra_e = gra.at(y, x); + for (int i = 0; i < 8; i++) { + if (gra_e[i] < 0) + continue; + int dx = Dx[i]; + int dy = Dy[i]; + int target = (y + dy) * in.cols + (x + dx); + Vec2i agent(target, color); + my_agents.insert(std::pair(gra_e[i], agent)); + + } + color++; + } + } + } + + int global_time = 0; + + bool done = false; + while (!done) { + if (my_agents.size() == 0) { + done = true; + break; + } + global_time++; + + std::multimap::iterator current_it = my_agents.begin(); + std::pair current_p = *current_it; + my_agents.erase(current_it); + + int target = current_p.second[0]; + color = current_p.second[1]; + float arriv_time = current_p.first; + + Vec8f gra_e = gra.at(target);// (y*cols+x) + if (graph_helper.size(target) >= k) { + continue; + } + + bool color_found_in_target 
= graph_helper.color_in_target(target, color); + if (color_found_in_target) { + continue; + } + graph_helper.add(target, std::pair(arriv_time, color)); + + + for (int i = 0; i < 8; i++) { + if (gra_e[i] < 0) + continue; + + int dx = Dx[i]; + int dy = Dy[i]; + int new_target = target + dx + (dy*in.cols); + if (graph_helper.size(new_target) >= k) { + continue; + } + color_found_in_target = graph_helper.color_in_target(new_target, color); + if (color_found_in_target) { + continue; + } + Vec2i new_agent(new_target, color); + my_agents.insert(std::pair(arriv_time + gra_e[i], new_agent)); + + } + } + + Mat ret(in.rows, in.cols*k, CV_32FC2); + for (int y = 0; y < in.rows; y++) { + for (int x = 0; x < in.cols; x++) { + for (int i = 0; i < k; i++) { + float dist = *((float*)(graph_helper.data(y*in.cols + x) + 2 * i)); + float id = *((float*)(graph_helper.data(y*in.cols + x) + 2 * i + 1)); + ret.at(y, k*x + i) = Vec2f(dist, id); + } + } + } + return ret; + + for (int y = 0; y < in.rows; y++) { + for (int x = 0; x < in.cols; x++) { + int v_id = y * in.cols + x; + std::vector > flow_s(k); + + for (int i = 0; i < k; i++) { + color = *((int*)(graph_helper.data(v_id) + 1 + 2 * i)); + int cy = flow_point_list[color][0]; + int cx = flow_point_list[color][1]; + Vec2f flow = in.at(cy, cx); + flow_s[i] = std::make_pair(static_cast(norm(flow)), flow); + + + + } + + nnFlow.at(y, x) = std::min_element(flow_s.begin(), flow_s.end(), [](const std::pair &left, const std::pair &right) { + return left.first < right.first; + })->second; + + + } + } + free(graph_helper.mem); + return nnFlow; + + + } + Mat getGraph(const Mat &image, float edge_length) + { + + int Dx[] = { -1,0,1,-1,1,-1,0,1 }; + int Dy[] = { -1,-1,-1,0,0,1,1,1 }; + Mat gra(image.rows, image.cols, CV_32FC(8)); + + for (int y = 0; y < gra.rows; y++) { + for (int x = 0; x < gra.cols; x++) { + + for (int i = 0; i < 8; i++) { + int dx = Dx[i]; + int dy = Dy[i]; + gra.at(y, x)[i] = -1; + + if (x + dx < 0 || y + dy < 0 || x + dx >= 
gra.cols || y + dy >= gra.rows) { + continue; + } + + if (i < 4) { + int si = 7 - i; + gra.at(y, x)[i] = gra.at(y + dy, x + dx)[si]; + } + else { + float p1 = dx * dx*edge_length*edge_length + dy * dy*edge_length*edge_length; + float p2 = static_cast(image.at(y, x)[0] - image.at(y + dy, x + dx)[0]); + float p3 = static_cast(image.at(y, x)[1] - image.at(y + dy, x + dx)[1]); + float p4 = static_cast(image.at(y, x)[2] - image.at(y + dy, x + dx)[2]); + gra.at(y, x)[i] = sqrt(p1 + p2 * p2 + p3 * p3 + p4 * p4); + } + + } + + } + } + + return gra; + } + + + Mat interpolate_irregular_nn( + const std::vector & _prevPoints, + const std::vector & _nextPoints, + const std::vector & status, + const Mat &color_img, + float pixeldistance) + { + int Dx[] = { -1,0,1,-1,1,-1,0,1 }; + int Dy[] = { -1,-1,-1,0,0,1,1,1 }; + std::vector prevPoints, nextPoints; + std::map, std::pair> flowMap; + for (unsigned n = 0; n < _prevPoints.size(); n++) + { + if (status[n] != 0) + { + flowMap.insert(std::make_pair( + std::make_pair(_prevPoints[n].x, _prevPoints[n].y), + std::make_pair(_nextPoints[n].x, _nextPoints[n].y))); + prevPoints.push_back(_prevPoints[n]); + nextPoints.push_back(_nextPoints[n]); + } + + } + + Mat gra = getGraph(color_img, pixeldistance); + + Mat prev; + Mat geo_dist = sgeo_dist(gra, prevPoints, std::numeric_limits::max(), prev); + + + Mat nnFlow = Mat::zeros(color_img.size(), CV_32FC2); + for (int y = 0; y < nnFlow.rows; y++) + { + for (int x = 0; x < nnFlow.cols; x++) + { + int cx = x; + int cy = y; + while (prev.at(cy, cx) != 255) + { + int i = prev.at(cy, cx); + cx += Dx[i]; + cy += Dy[i]; + } + auto val = flowMap[std::make_pair(static_cast(cx), static_cast(cy))]; + nnFlow.at(y, x) = Vec2f(val.first - cx, val.second - cy); + } + } + return nnFlow; + } + + } +} \ No newline at end of file diff --git a/modules/optflow/src/rlof/geo_interpolation.hpp b/modules/optflow/src/rlof/geo_interpolation.hpp new file mode 100644 index 00000000000..2c9dfbd38c0 --- /dev/null +++ 
b/modules/optflow/src/rlof/geo_interpolation.hpp @@ -0,0 +1,75 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// + // + // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. + // + // By downloading, copying, installing or using the software you agree to this license. + // If you do not agree to this license, do not download, install, + // copy or use the software. + // + // + // License Agreement + // For Open Source Computer Vision Library + // + // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. + // Copyright (C) 2009, Willow Garage Inc., all rights reserved. + // Third party copyrights are property of their respective owners. + // + // Redistribution and use in source and binary forms, with or without modification, + // are permitted provided that the following conditions are met: + // + // * Redistribution's of source code must retain the above copyright notice, + // this list of conditions and the following disclaimer. + // + // * Redistribution's in binary form must reproduce the above copyright notice, + // this list of conditions and the following disclaimer in the documentation + // and/or other materials provided with the distribution. + // + // * The name of the copyright holders may not be used to endorse or promote products + // derived from this software without specific prior written permission. + // + // This software is provided by the copyright holders and contributors "as is" and + // any express or implied warranties, including, but not limited to, the implied + // warranties of merchantability and fitness for a particular purpose are disclaimed. 
+ // In no event shall the Intel Corporation or contributors be liable for any direct, + // indirect, incidental, special, exemplary, or consequential damages + // (including, but not limited to, procurement of substitute goods or services; + // loss of use, data, or profits; or business interruption) however caused + // and on any theory of liability, whether in contract, strict liability, + // or tort (including negligence or otherwise) arising in any way out of + // the use of this software, even if advised of the possibility of such damage. + // + //M*/ +#ifndef _GEO_INTERPOLATION_HPP_ +#define _GEO_INTERPOLATION_HPP_ +#include +namespace cv +{ + namespace optflow + { + typedef Vec Vec8f; + Mat getGraph(const Mat & image, float edge_length); + Mat sgeo_dist(const Mat& gra, int y, int x, float max, Mat &prev); + Mat sgeo_dist(const Mat& gra, const std::vector & points, float max, Mat &prev); + Mat interpolate_irregular_nw(const Mat &in, const Mat &mask, const Mat &color_img, float max_d, float bandwidth, float pixeldistance); + Mat interpolate_irregular_nn( + const std::vector & prevPoints, + const std::vector & nextPoints, + const std::vector & status, + const Mat &color_img, + float pixeldistance); + Mat interpolate_irregular_knn( + const std::vector & _prevPoints, + const std::vector & _nextPoints, + const std::vector & status, + const Mat &color_img, + int k, + float pixeldistance); + + Mat interpolate_irregular_nn_raster(const std::vector & prevPoints, + const std::vector & nextPoints, + const std::vector & status, + const Mat & i1); + + } +} +#endif diff --git a/modules/optflow/src/rlof/plk_invoker.hpp b/modules/optflow/src/rlof/plk_invoker.hpp new file mode 100644 index 00000000000..1a8f5eec380 --- /dev/null +++ b/modules/optflow/src/rlof/plk_invoker.hpp @@ -0,0 +1,1141 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// + // + // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 
+ // + // By downloading, copying, installing or using the software you agree to this license. + // If you do not agree to this license, do not download, install, + // copy or use the software. + // + // + // License Agreement + // For Open Source Computer Vision Library + // + // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. + // Copyright (C) 2009, Willow Garage Inc., all rights reserved. + // Third party copyrights are property of their respective owners. + // + // Redistribution and use in source and binary forms, with or without modification, + // are permitted provided that the following conditions are met: + // + // * Redistribution's of source code must retain the above copyright notice, + // this list of conditions and the following disclaimer. + // + // * Redistribution's in binary form must reproduce the above copyright notice, + // this list of conditions and the following disclaimer in the documentation + // and/or other materials provided with the distribution. + // + // * The name of the copyright holders may not be used to endorse or promote products + // derived from this software without specific prior written permission. + // + // This software is provided by the copyright holders and contributors "as is" and + // any express or implied warranties, including, but not limited to, the implied + // warranties of merchantability and fitness for a particular purpose are disclaimed. + // In no event shall the Intel Corporation or contributors be liable for any direct, + // indirect, incidental, special, exemplary, or consequential damages + // (including, but not limited to, procurement of substitute goods or services; + // loss of use, data, or profits; or business interruption) however caused + // and on any theory of liability, whether in contract, strict liability, + // or tort (including negligence or otherwise) arising in any way out of + // the use of this software, even if advised of the possibility of such damage. 
+ // + //M*/ +#ifndef _PLK_INVOKER_HPP_ +#define _PLK_INVOKER_HPP_ +#include "rlof_invokerbase.hpp" + + +namespace cv{ + namespace optflow{ +namespace plk +{ + +// implementierung ohne SSE +namespace radial +{ +class TrackerInvoker : public cv::ParallelLoopBody +{ +public: + TrackerInvoker( + + const Mat& _prevImg, + const Mat& _prevDeriv, + const Mat& _nextImg, + const Mat& _rgbPrevImg, + const Mat& _rgbNextImg, + const Point2f* _prevPts, + Point2f* _nextPts, + uchar* _status, + float* _err, + Point2f* _gainVecs, + int _level, + int _maxLevel, + int _winSize[2], + int _maxIteration, + bool _useInitialFlow, + int _supportRegionType, + float _minEigenValue, + int _crossSegmentationThreshold) + { + prevImg = &_prevImg; + prevDeriv = &_prevDeriv; + nextImg = &_nextImg; + rgbPrevImg = &_rgbPrevImg; + rgbNextImg = &_rgbNextImg; + prevPts = _prevPts; + nextPts = _nextPts; + status = _status; + err = _err; + gainVecs = _gainVecs; + minWinSize = _winSize[0]; + maxWinSize = _winSize[1]; + criteria.maxCount = _maxIteration; + criteria.epsilon = 0.01; + level = _level; + maxLevel = _maxLevel; + windowType = _supportRegionType; + minEigThreshold = _minEigenValue; + useInitialFlow = _useInitialFlow; + crossSegmentationThreshold = _crossSegmentationThreshold; + } + + TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; + void operator()(const cv::Range& range) const + { + cv::Size winSize; + cv::Point2f halfWin; + + const Mat& I = *prevImg; + const Mat& J = *nextImg; + const Mat& derivI = *prevDeriv; + const Mat& BI = *rgbPrevImg; + + winSize = cv::Size(maxWinSize,maxWinSize); + int winMaskwidth = static_cast(ceil(winSize.width / 16.f)) * 16; + cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); + winMaskMatBuf.setTo(1); + + const float FLT_SCALE = (1.f/(1 << 16));//(1.f/(1 << 20)); // 20 + + int cn = I.channels(), cn2 = cn*2; + int winbufwidth = static_cast(ceil(winSize.width / 16.f)) * 16; + cv::Size winBufSize(winbufwidth,winbufwidth); + + cv::Mat 
invTensorMat(4,4, CV_32FC1); + cv::Mat mismatchMat(4, 1, CV_32FC1); + cv::Mat resultMat(4, 1, CV_32FC1); + + cv::AutoBuffer _buf(winBufSize.area()*(cn + cn2)); + int derivDepth = DataType::depth; + + Mat IWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn), (deriv_type*)_buf.data()); + Mat derivIWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn2), (deriv_type*)_buf.data() + winBufSize.area()*cn); + + for( int ptidx = range.start; ptidx < range.end; ptidx++ ) + { + Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level)); + Point2f nextPt; + if( level == maxLevel ) + { + if( useInitialFlow ) + { + nextPt = nextPts[ptidx]*(float)(1./(1 << level)); + } + else + nextPt = prevPt; + } + else + nextPt = nextPts[ptidx]*2.f; + nextPts[ptidx] = nextPt; + + Point2i iprevPt, inextPt; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + int winArea = maxWinSize * maxWinSize; + cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0,0, maxWinSize,maxWinSize)); + winMaskMatBuf.setTo(0); + if( calcWinMaskMat(BI, windowType, iprevPt, + winMaskMat,winSize,halfWin,winArea, + minWinSize,maxWinSize, crossSegmentationThreshold) == false) + continue; + + + prevPt -= halfWin; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + + + if( iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols || + iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows ) + { + if( level == 0 ) + { + if( status ) + status[ptidx] = 3; + if( err ) + err[ptidx] = 0; + } + continue; + } + + float a = prevPt.x - iprevPt.x; + float b = prevPt.y - iprevPt.y; + const int W_BITS = 14, W_BITS1 = 14; + + int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + + int dstep = (int)(derivI.step/derivI.elemSize1()); + int step = (int)(I.step/I.elemSize1()); + int mStep = (int)(winMaskMat.step/winMaskMat.elemSize1()); + CV_Assert( step == (int)(J.step/J.elemSize1()) ); + 
float A11 = 0, A12 = 0, A22 = 0; + // tensor + float sumIx = 0; + float sumIy = 0; + float sumI = 0; + float sumW = 0; + float w1 = 0, w2 = 0; // -IyI + float dI = 0; // I^2 + float D = 0; + #ifdef CV_SSE2 + __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128i z = _mm_setzero_si128(); + __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1-1)); + __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1-5-1)); + __m128i mmMask4_epi32; + __m128i mmMaskSet_epi16 = _mm_set1_epi16(std::numeric_limits::max()); + get4BitMask(winSize.width, mmMask4_epi32); +#endif + // extract the patch from the first image, compute covariation matrix of derivatives + int x, y; + for( y = 0; y < winSize.height; y++ ) + { + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; + const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; + + short* Iptr = (short*)(IWinBuf.data + y*IWinBuf.step); + short* dIptr = (short*)(derivIWinBuf.data + y*derivIWinBuf.step); + + x = 0; +#ifdef CV_SSE2 + for( ; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 ) + { + __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16); + __m128i mask_0_3_epi16 = _mm_unpacklo_epi16(mask_0_7_epi16, mask_0_7_epi16); + __m128i v00, v01, v10, v11, t0, t1; + v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); + v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); + v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); + v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); + if( x + 4 > 
winSize.width) + { + t0 = _mm_and_si128(t0, mmMask4_epi32); + } + t0 = _mm_and_si128(t0, mask_0_3_epi16); + _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0,t0)); + + + v00 = _mm_loadu_si128((const __m128i*)(dsrc)); + v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); + v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep)); + v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); + v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... + if( x + 4 > winSize.width) + { + v00 = _mm_and_si128(v00, mmMask4_epi32); + } + v00 = _mm_and_si128(v00, mask_0_3_epi16); + _mm_storeu_si128((__m128i*)dIptr, v00); + } +#else + for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2 ) + { + if( maskPtr[x] == 0) + { + dIptr[0] = 0; + dIptr[1] = 0; + continue; + } + int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + + src[x+step]*iw10 + src[x+step+cn]*iw11, W_BITS1-5); + int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + + dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 + + dsrc[dstep+cn2+1]*iw11, W_BITS1); + + Iptr[x] = (short)ival; + dIptr[0] = (short)ixval; + dIptr[1] = (short)iyval; + } +#endif + + } + + cv::Point2f backUpNextPt = nextPt; + nextPt -= halfWin; + Point2f prevDelta(0,0); //relates to h(t-1) + Point2f prevGain(1,0); + cv::Point2f gainVec = gainVecs[ptidx]; + cv::Point2f backUpGain = gainVec; + int j; + for( j = 0; j < criteria.maxCount; j++ ) + { + status[ptidx] = static_cast(j); + inextPt.x = cvFloor(nextPt.x); + inextPt.y = cvFloor(nextPt.y); + + + if( inextPt.x + halfWin.x < 0 || inextPt.x + 
halfWin.x>= derivI.cols || + inextPt.y + halfWin.y < 0 || inextPt.y + halfWin.y >= derivI.rows ) + { + if( level == 0 && status ) + status[ptidx] = 3; + break; + } + + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + float b1,b2,b3,b4; + b1 = 0; + b2 = 0; + b3 = 0; + b4 = 0; + + if( j == 0 ) + { + // tensor + w1 = 0; // -IxI + w2 = 0; // -IyI + dI = 0; // I^2 + sumIx = 0; + sumIy = 0; + sumI = 0; + sumW = 0; + A11 = 0; + A12 = 0; + A22 = 0; + } +#ifdef CV_SSE2 + qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps(), qb2 = _mm_setzero_ps(); + __m128 qb3 = _mm_setzero_ps(); + __m128 mmSumW1 = _mm_setzero_ps(), mmSumW2 = _mm_setzero_ps(); + __m128 mmSumI = _mm_setzero_ps(), mmSumW = _mm_setzero_ps(), mmSumDI = _mm_setzero_ps(); + __m128 mmSumIy = _mm_setzero_ps(), mmSumIx = _mm_setzero_ps(); + __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps(); + float gainVal = gainVec.x > 0 ? gainVec.x : -gainVec.x; + int bitShift = gainVec.x == 0 ? 
1 : static_cast(ceil(log(200.f / gainVal) / log(2.f))); // zwei stellen nach den komma + __m128i mmGainValue_epi16 = _mm_set1_epi16(static_cast(gainVec.x * (float)(1 << bitShift))); + __m128i mmConstValue_epi16 = _mm_set1_epi16(static_cast(gainVec.y)); +#endif + for( y = 0; y < winSize.height; y++ ) + { + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x )*cn; + const short* Iptr = (const short*)(IWinBuf.data + ( y)*IWinBuf.step); + const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); + + x = 0; +#ifdef CV_SSE2 + + for( ; x <= winSize.width*cn; x += 8, dIptr += 8*2 ) + { + // iw = iw << 14 (16384 short / 2) + // iq0 = iw01 |iw00, iw01 |iw00, iw01 |iw00, iw01 |iw00 16 bit + __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16); + + // I [0...8] + __m128i diff0, diff1; + __m128i I_0_7_epi16 = _mm_loadu_si128((const __m128i*)(Iptr + x)); // element 0 to 7 + + __m128i v00 = _mm_unpacklo_epi8( + _mm_loadl_epi64((const __m128i*)(Jptr + x)) // J0, J1, J2, ..., J7, 64 bit nullen + , z); //J0 , 0, J1, 0, J2, 0 ... J7,0 + __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); + __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); + __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); + + + __m128i t0 = _mm_add_epi32 + (_mm_madd_epi16( + _mm_unpacklo_epi16(v00, v01), //J0, , J1, J1, J2, J2, ... 
J3 , J4 + qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + + __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1-5); + + + __m128i Ilo = _mm_mullo_epi16(mmGainValue_epi16, I_0_7_epi16); + __m128i Ihi = _mm_mulhi_epi16(mmGainValue_epi16, I_0_7_epi16); + + __m128i Igain_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(Ilo, Ihi), bitShift); + __m128i Igain_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(Ilo, Ihi), bitShift); + __m128i Igain_epi16 = _mm_packs_epi32(Igain_0_3_epi32, Igain_4_7_epi32); + + // diff = J - I + I*gain.x + gain.y + __m128i mmDiff_epi16 = _mm_add_epi16( + _mm_subs_epi16(_mm_packs_epi32(t0, t1), I_0_7_epi16), // J - I + _mm_add_epi16(Igain_epi16, mmConstValue_epi16)); + + + mmDiff_epi16 = _mm_and_si128(mmDiff_epi16, mask_0_7_epi16); + + + __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... + __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); + + diff1 = _mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13... + diff0 = _mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9... 
+ + v10 = _mm_mullo_epi16(Ixy_0, diff0); + v11 = _mm_mulhi_epi16(Ixy_0, diff0); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); + qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); + + v10 = _mm_mullo_epi16(Ixy_1, diff1); + v11 = _mm_mulhi_epi16(Ixy_1, diff1); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); + qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); + // for set 1 sigma 1 + // b3 += diff * Iptr[x] + Ilo = _mm_mullo_epi16(mmDiff_epi16, I_0_7_epi16); + Ihi = _mm_mulhi_epi16(mmDiff_epi16, I_0_7_epi16); + v00 = _mm_unpacklo_epi16(Ilo, Ihi); + v10 = _mm_unpackhi_epi16(Ilo, Ihi); + qb2 = _mm_add_ps(qb2, _mm_cvtepi32_ps(v00)); + qb2 = _mm_add_ps(qb2, _mm_cvtepi32_ps(v10)); + + // b4+= diff + __m128 mmDiff_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mmDiff_epi16)); // (_mm_srai_epi32(_mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16),16)); + __m128 mmDiff_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16),16))); + qb3 = _mm_add_ps(qb3, mmDiff_0_3_ps); + qb3 = _mm_add_ps(qb3, mmDiff_4_7_ps); + if( j == 0 ) + { + __m128 mask_0_4_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mask_0_7_epi16)); + __m128 mask_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mask_0_7_epi16, mask_0_7_epi16), 16))); + + t0 = _mm_srai_epi32(Ixy_0, 16); // Iy0 Iy1 Iy2 Iy3 + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16); // Ix0 Ix1 Ix2 Ix3 + + __m128 fy = _mm_blendv_ps(_mm_set1_ps(0), _mm_cvtepi32_ps(t0), mask_0_4_ps); + __m128 fx = _mm_blendv_ps(_mm_set1_ps(0), _mm_cvtepi32_ps(t1), mask_0_4_ps); + + // 0 ... 
3 + __m128 I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(I_0_7_epi16, I_0_7_epi16), 16)); + + // A11 - A22 + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fy)); + mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fy)); + mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fx)); + + // sumIx und sumIy + mmSumIx = _mm_add_ps(mmSumIx, fx); + mmSumIy = _mm_add_ps(mmSumIy, fy); + + mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fx)); + mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fy)); + + // sumI + I_ps = _mm_blendv_ps(_mm_set1_ps(0), I_ps, mask_0_4_ps); + mmSumI = _mm_add_ps(mmSumI,I_ps); + + // sumDI + mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps( I_ps, I_ps)); + + t0 = _mm_srai_epi32(Ixy_1, 16); // Iy8 Iy9 Iy10 Iy11 + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_1, 16), 16); // Ix0 Ix1 Ix2 Ix3 + + fy = _mm_blendv_ps(_mm_set1_ps(0), _mm_cvtepi32_ps(t0), mask_4_7_ps); + fx = _mm_blendv_ps(_mm_set1_ps(0), _mm_cvtepi32_ps(t1), mask_4_7_ps); + + // 4 ... 7 + I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(I_0_7_epi16, I_0_7_epi16), 16)); + + + + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fy)); + mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fy)); + mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fx)); + + // sumIx und sumIy + mmSumIx = _mm_add_ps(mmSumIx, fx); + mmSumIy = _mm_add_ps(mmSumIy, fy); + + mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fx)); + mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fy)); + + // sumI + I_ps = _mm_blendv_ps(_mm_set1_ps(0), I_ps, mask_4_7_ps); + mmSumI = _mm_add_ps(mmSumI, I_ps); + + // sumW + // sumDI + mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps( I_ps, I_ps)); + } + + } +#else + for( ; x < winSize.width*cn; x++, dIptr += 2 ) + { + if( maskPtr[x] == 0) + continue; + int J = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + + Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, + W_BITS1-5); + int diff = J - Iptr[x] + static_cast(Iptr[x]) * gainVec.x + gainVec.y; + + b1 += (float)(diff*dIptr[0]) * FLT_RESCALE; + b2 += (float)(diff*dIptr[1]) * FLT_RESCALE; + b3 += (float)(diff) * 
Iptr[x] * FLT_RESCALE; + b4 += (float)(diff); + + // compute the Gradient Matrice + if( j == 0 ) + { + A11 += (float)(dIptr[0]*dIptr[0]); + A12 += (float)(dIptr[0]*dIptr[1]); + A22 += (float)(dIptr[1]*dIptr[1]); + + + dI += Iptr[x] * Iptr[x] * FLT_RESCALE; + float dx = static_cast(dIptr[0]) * FLT_RESCALE; + float dy = static_cast(dIptr[1]) * FLT_RESCALE; + sumIx += dx; + sumIy += dy; + w1 += dx * Iptr[x]; + w2 += dy * Iptr[x]; + sumI += Iptr[x] * FLT_RESCALE; + //sumW += FLT_RESCALE; + + } + } +#endif + } + + float CV_DECL_ALIGNED(16) wbuf[4];// + if( j == 0 ) + { +#ifdef CV_SSE2 + _mm_store_ps(wbuf, mmSumW1); + w1 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumW2); + w2 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumDI); + dI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumI); + sumI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumIx); + sumIx = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumIy); + sumIy = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumW); + sumW = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; +#endif + sumIx *= -FLT_SCALE; + sumIy *= -FLT_SCALE; + sumI *=FLT_SCALE; + sumW = winArea * FLT_SCALE; + w1 *= -FLT_SCALE; + w2 *= -FLT_SCALE; + dI *= FLT_SCALE; +#ifdef CV_SSE2 + float CV_DECL_ALIGNED(32) A11buf[4], A12buf[4], A22buf[4];// + + _mm_store_ps(A11buf, mmAxx); + _mm_store_ps(A12buf, mmAxy); + _mm_store_ps(A22buf, mmAyy); + + A11 = A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; + A12 = A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; + A22 = A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; +#endif + A11 *= FLT_SCALE; + A12 *= FLT_SCALE; + A22 *= FLT_SCALE; + } + +#ifdef CV_SSE2 + float CV_DECL_ALIGNED(16) bbuf[4]; + _mm_store_ps(bbuf, _mm_add_ps(qb0, qb1)); + b1 = bbuf[0] + bbuf[2]; + b2 = bbuf[1] + bbuf[3]; + _mm_store_ps(bbuf, qb2); + b3 = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; + _mm_store_ps(bbuf, qb3); + b4 = bbuf[0] + bbuf[1] + bbuf[2] + 
bbuf[3]; +#endif + + mismatchMat.at(0,0) = b1 * FLT_SCALE; + mismatchMat.at(1,0) = b2 * FLT_SCALE; + mismatchMat.at(2,0) = -b3 * FLT_SCALE; + mismatchMat.at(3,0) = -b4 * FLT_SCALE; + + D = - A12*A12*sumI*sumI + dI*sumW*A12*A12 + 2*A12*sumI*sumIx*w2 + 2*A12*sumI*sumIy*w1 + - 2*dI*A12*sumIx*sumIy - 2*sumW*A12*w1*w2 + A11*A22*sumI*sumI - 2*A22*sumI*sumIx*w1 + - 2*A11*sumI*sumIy*w2 - sumIx*sumIx*w2*w2 + A22*dI*sumIx*sumIx + 2*sumIx*sumIy*w1*w2 + - sumIy*sumIy*w1*w1 + A11*dI*sumIy*sumIy + A22*sumW*w1*w1 + A11*sumW*w2*w2 - A11*A22*dI*sumW; + + + float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + + 4.f*A12*A12))/(2*winArea); + if( minEig < minEigThreshold ) + { + if( level == 0 && status || std::abs(D) < FLT_EPSILON) + status[ptidx] = 0; + if( level > 0) + { + nextPts[ptidx] = backUpNextPt; + gainVecs[ptidx] = backUpGain; + } + break; + } + + + + D = (1.f / D); + invTensorMat.at(0,0) = (A22*sumI*sumI - 2*sumI*sumIy*w2 + dI*sumIy*sumIy + sumW*w2*w2 - A22*dI*sumW)* D; invTensorMat.at(0,1) = (A12*dI*sumW - A12*sumI * sumI - dI*sumIx*sumIy + sumI*sumIx*w2 + sumI*sumIy*w1 - sumW*w1*w2)* D; + invTensorMat.at(0,2) = (A12*sumI*sumIy - sumIy*sumIy*w1 - A22*sumI*sumIx - A12*sumW*w2 + A22*sumW*w1 + sumIx*sumIy*w2)* D; + invTensorMat.at(0,3) = (A22*dI*sumIx - A12*dI*sumIy - sumIx*w2*w2 + A12*sumI*w2 - A22*sumI*w1 + sumIy*w1*w2) * D; + invTensorMat.at(1,0) = invTensorMat.at(0,1); + invTensorMat.at(1,1) = (A11*sumI * sumI - 2*sumI*sumIx*w1 + dI*sumIx * sumIx + sumW*w1*w1 - A11*dI*sumW) * D; + invTensorMat.at(1,2) = (A12*sumI*sumIx - A11*sumI*sumIy - sumIx * sumIx*w2 + A11*sumW*w2 - A12*sumW*w1 + sumIx*sumIy*w1) * D; + invTensorMat.at(1,3) = (A11*dI*sumIy - sumIy*w1*w1 - A12*dI*sumIx - A11*sumI*w2 + A12*sumI*w1 + sumIx*w1*w2)* D; + invTensorMat.at(2,0) = invTensorMat.at(0,2); + invTensorMat.at(2,1) = invTensorMat.at(1,2); + invTensorMat.at(2,2) = (sumW*A12*A12 - 2*A12*sumIx*sumIy + A22*sumIx*sumIx + A11*sumIy*sumIy - A11*A22*sumW)* D; + invTensorMat.at(2,3) = (A11*A22*sumI - 
A12*A12*sumI - A11*sumIy*w2 + A12*sumIx*w2 + A12*sumIy*w1 - A22*sumIx*w1)* D; + invTensorMat.at(3,0) = invTensorMat.at(0,3); + invTensorMat.at(3,1) = invTensorMat.at(1,3); + invTensorMat.at(3,2) = invTensorMat.at(2,3); + invTensorMat.at(3,3) = (dI*A12*A12 - 2*A12*w1*w2 + A22*w1*w1 + A11*w2*w2 - A11*A22*dI)* D; + + resultMat = invTensorMat * mismatchMat; + + // 0.08 -12.10 + Point2f delta(-resultMat.at(0), -resultMat.at(1)); + Point2f deltaGain(-resultMat.at(2), -resultMat.at(3)); + + + if( j == 0) + prevGain = deltaGain; + nextPt += delta; + nextPts[ptidx] = nextPt + halfWin; + gainVecs[ptidx]= gainVec; + if( delta.ddot(delta) <= criteria.epsilon) + break; + if( ( std::abs(delta.x - prevDelta.x) < 0.01 + && std::abs(delta.y - prevDelta.y) < 0.01) + || ((delta.ddot(delta) <= 0.001) + && std::abs(prevGain.x - deltaGain.x) < 0.01) + ) + { + nextPts[ptidx] -= delta*0.5f; + gainVecs[ptidx] -= deltaGain* 0.5f; + break; + } + + if(j > 0 && std::abs(delta.x - prevDelta.x) < 0.01 && + std::abs(delta.y - prevDelta.y) < 0.01) + { + nextPts[ptidx] -= delta*0.5f; + break; + } + + prevDelta = delta; + prevGain = deltaGain; + + } + + + } + + } + + const Mat* prevImg; + const Mat* nextImg; + const Mat* prevDeriv; + const Mat* rgbPrevImg; + const Mat* rgbNextImg; + const Point2f* prevPts; + Point2f* nextPts; + uchar* status; + cv::Point2f* gainVecs; // gain vector x -> multiplier y -> offset + float* err; + int maxWinSize; + int minWinSize; + TermCriteria criteria; + int level; + int maxLevel; + int windowType; + float minEigThreshold; + bool useInitialFlow; + int crossSegmentationThreshold; +}; +} +namespace ica{ +class TrackerInvoker : public cv::ParallelLoopBody +{ +public: + TrackerInvoker(const Mat& _prevImg, + const Mat& _prevDeriv, + const Mat& _nextImg, + const Mat& _rgbPrevImg, + const Mat& _rgbNextImg, + const Point2f* _prevPts, + Point2f* _nextPts, + uchar* _status, + float* _err, + int _level, + int _maxLevel, + int _winSize[2], + int _maxIteration, + bool 
_useInitialFlow, + int _supportRegionType, + int _crossSegmentationThreshold, + float _minEigenValue) + { + prevImg = &_prevImg; + prevDeriv = &_prevDeriv; + nextImg = &_nextImg; + rgbPrevImg = &_rgbPrevImg; + rgbNextImg = &_rgbNextImg; + prevPts = _prevPts; + nextPts = _nextPts; + status = _status; + err = _err; + minWinSize = _winSize[0]; + maxWinSize = _winSize[1]; + criteria.maxCount = _maxIteration; + criteria.epsilon = 0.01; + level = _level; + maxLevel = _maxLevel; + windowType = _supportRegionType; + minEigThreshold = _minEigenValue; + useInitialFlow = _useInitialFlow; + crossSegmentationThreshold = _crossSegmentationThreshold; + } + + TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; + void operator()(const cv::Range& range) const + { + cv::Size winSize; + cv::Point2f halfWin; + + const Mat& I = *prevImg; + const Mat& J = *nextImg; + const Mat& derivI = *prevDeriv; + const Mat& BI = *rgbPrevImg; + + winSize = cv::Size(maxWinSize,maxWinSize); + int winMaskwidth = static_cast(ceil(winSize.width / 8.f)) * 16; + cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); + winMaskMatBuf.setTo(1); + + + + const float FLT_SCALE = (1.f/(1 << 20)); // 20 + + int j, cn = I.channels(), cn2 = cn*2; + int winbufwidth = static_cast(ceil(winSize.width / 8.f)) * 8; + cv::Size winBufSize(winbufwidth,winbufwidth); + + std::vector _buf(winBufSize.area()*(cn + cn2)); + Mat IWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn), &_buf[0]); + Mat derivIWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn2), &_buf[winBufSize.area()*cn]); + + for( int ptidx = range.start; ptidx < range.end; ptidx++ ) + { + Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level)); + Point2f nextPt; + if( level == maxLevel ) + { + if( useInitialFlow ) + nextPt = nextPts[ptidx]*(float)(1./(1 << level)); + else + nextPt = prevPt; + } + else + nextPt = nextPts[ptidx]*2.f; + nextPts[ptidx] = nextPt; + + Point2i iprevPt, inextPt; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + int 
winArea = maxWinSize * maxWinSize; + cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0,0, maxWinSize,maxWinSize)); + + if( calcWinMaskMat(BI, windowType, iprevPt, + winMaskMat,winSize,halfWin,winArea, + minWinSize,maxWinSize, crossSegmentationThreshold) == false) + continue; + + + prevPt -= halfWin; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + + + if( iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols || + iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows ) + { + if( level == 0 ) + { + if( status ) + status[ptidx] = 3; + if( err ) + err[ptidx] = 0; + } + continue; + } + + float a = prevPt.x - iprevPt.x; + float b = prevPt.y - iprevPt.y; + const int W_BITS = 14, W_BITS1 = 14; + + int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + + int dstep = (int)(derivI.step/derivI.elemSize1()); + int step = (int)(I.step/I.elemSize1()); + int mStep = (int)(winMaskMat.step/winMaskMat.elemSize1()); + CV_Assert( step == (int)(J.step/J.elemSize1()) ); + float A11 = 0, A12 = 0, A22 = 0; + +#ifdef CV_SSE2 + __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128i z = _mm_setzero_si128(); + __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1-1)); + __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1-5-1)); + __m128 qA11 = _mm_setzero_ps(), qA12 = _mm_setzero_ps(), qA22 = _mm_setzero_ps(); + __m128i mmMask4_epi32; + get4BitMask(winSize.width, mmMask4_epi32); +#endif + + // extract the patch from the first image, compute covariation matrix of derivatives + int x, y; + for( y = 0; y < winSize.height; y++ ) + { + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; + const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; + + short* 
Iptr = (short*)(IWinBuf.data + y*IWinBuf.step); + short* dIptr = (short*)(derivIWinBuf.data + y*derivIWinBuf.step); + + x = 0; + +#ifdef CV_SSE2 + for( ; x <= winSize.width*cn; x += 4, dsrc += 4*2, dIptr += 4*2 ) + { + __m128i wMask = _mm_set_epi32(MaskSet * maskPtr[x+3], + MaskSet * maskPtr[x+2], + MaskSet * maskPtr[x+1], + MaskSet * maskPtr[x]); + __m128i v00, v01, v10, v11, t0, t1; + v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); + v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); + v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); + v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + + + + + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); + _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0,t0)); + + + + v00 = _mm_loadu_si128((const __m128i*)(dsrc)); + v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); + v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep)); + v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); + v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... 
+ + if( x + 4 > winSize.width) + { + v00 = _mm_and_si128(v00, mmMask4_epi32); + } + v00 = _mm_and_si128(v00, wMask); + + _mm_storeu_si128((__m128i*)dIptr, v00); + t0 = _mm_srai_epi32(v00, 16); // Iy0 Iy1 Iy2 Iy3 + t1 = _mm_srai_epi32(_mm_slli_epi32(v00, 16), 16); // Ix0 Ix1 Ix2 Ix3 + + __m128 fy = _mm_cvtepi32_ps(t0); + __m128 fx = _mm_cvtepi32_ps(t1); + + + qA22 = _mm_add_ps(qA22, _mm_mul_ps(fy, fy)); + qA12 = _mm_add_ps(qA12, _mm_mul_ps(fx, fy)); + qA11 = _mm_add_ps(qA11, _mm_mul_ps(fx, fx)); + } +#else + + for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2 ) + { + if( maskPtr[x] == 0) + { + dIptr[0] = 0; + dIptr[1] = 0; + continue; + } + int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + + src[x+step]*iw10 + src[x+step+cn]*iw11, W_BITS1-5); + int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + + dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 + + dsrc[dstep+cn2+1]*iw11, W_BITS1); + + Iptr[x] = (short)ival; + dIptr[0] = (short)ixval; + dIptr[1] = (short)iyval; + A11 += (float)(ixval*ixval); + A12 += (float)(ixval*iyval); + A22 += (float)(iyval*iyval); + } +#endif + } + +#ifdef CV_SSE2 + float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4]; + _mm_store_ps(A11buf, qA11); + _mm_store_ps(A12buf, qA12); + _mm_store_ps(A22buf, qA22); + A11 += A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; + A12 += A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; + A22 += A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; +#endif + + A11 *= FLT_SCALE; + A12 *= FLT_SCALE; + A22 *= FLT_SCALE; + + float D = A11*A22 - A12*A12; + float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + + 4.f*A12*A12))/(2 * winArea); + + if( err ) + err[ptidx] = (float)minEig; + + if( minEig < minEigThreshold || D < FLT_EPSILON ) + { + if( level == 0 && status ) + status[ptidx] = 0; + continue; + } + + D = 1.f/D; + + nextPt -= halfWin; + Point2f prevDelta(0,0); //relates to h(t-1) + __m128i mmMask0, mmMask1, mmMask; + 
getWBitMask(winSize.width, mmMask0, mmMask1, mmMask); + for( j = 0; j < criteria.maxCount; j++ ) + { + status[ptidx] = static_cast(j); + inextPt.x = cvFloor(nextPt.x); + inextPt.y = cvFloor(nextPt.y); + + + if( inextPt.x + halfWin.x < 0 || inextPt.x + halfWin.x>= derivI.cols || + inextPt.y + halfWin.y < 0 || inextPt.y + halfWin.y >= derivI.rows ) + { + if( level == 0 && status ) + status[ptidx] = 3; + break; + } + + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + float b1 = 0, b2 = 0; +#ifdef CV_SSE2 + qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps(); + __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps(); + + +#endif + for( y = 0; y < winSize.height; y++ ) + { + const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x )*cn; + const short* Iptr = (const short*)(IWinBuf.data + ( y)*IWinBuf.step); + const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); + + x = 0; +#ifdef CV_SSE2 + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + for( ; x <= winSize.width*cn; x += 8, dIptr += 8*2 ) + { + if( maskPtr[x ] == 0 && maskPtr[x+1] == 0 && maskPtr[x+2] == 0 && maskPtr[x+3] == 0 + && maskPtr[x+4] == 0 && maskPtr[x+5] == 0 && maskPtr[x+6] == 0 && maskPtr[x+7] == 0) + continue; + __m128i diff0 = _mm_loadu_si128((const __m128i*)(Iptr + x)), diff1; + __m128i v00 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x)), z); + __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); + __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); + __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), 
z); + + __m128i t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1-5); + + __m128i mmDiff_epi16 = _mm_subs_epi16(_mm_packs_epi32(t0, t1), diff0); + + __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... + __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); + + if( x > winSize.width*cn - 8) + { + Ixy_0 = _mm_and_si128(Ixy_0, mmMask0); + Ixy_1 = _mm_and_si128(Ixy_1, mmMask1); + } + + + diff1 = _mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13... + diff0 = _mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9... + + + v10 = _mm_mullo_epi16(Ixy_0, diff0); + v11 = _mm_mulhi_epi16(Ixy_0, diff0); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); + qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); + + // It * Ix It * Iy [4 ... 
7] + // for set 1 hi sigma 1 + + v10 = _mm_mullo_epi16(Ixy_1, diff1); + v11 = _mm_mulhi_epi16(Ixy_1, diff1); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); + qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); + } +#else + for( ; x < winSize.width*cn; x++, dIptr += 2 ) + { + if( dIptr[0] == 0 && dIptr[1] == 0) + continue; + int diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + + Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, + W_BITS1-5) - Iptr[x]; + + b1 += (float)(diff*dIptr[0]) * FLT_RESCALE; + b2 += (float)(diff*dIptr[1]) * FLT_RESCALE; + } +#endif + } + +#ifdef CV_SSE2 + float CV_DECL_ALIGNED(16) bbuf[4]; + _mm_store_ps(bbuf, _mm_add_ps(qb0, qb1)); + b1 += bbuf[0] + bbuf[2]; + b2 += bbuf[1] + bbuf[3]; +#endif + + b1 *= FLT_SCALE; + b2 *= FLT_SCALE; + + Point2f delta( (float)((A12*b2 - A22*b1) * D), + (float)((A12*b1 - A11*b2) * D)); + + + + nextPt += delta; + nextPts[ptidx] = nextPt + halfWin; + + if( delta.ddot(delta) <= criteria.epsilon) + break; + if(j > 0 && std::abs(delta.x - prevDelta.x) < 0.01 && + std::abs(delta.y - prevDelta.y) < 0.01) + { + nextPts[ptidx] -= delta*0.5f; + break; + } + + prevDelta = delta; + + } + + + + + + } + + } + + const Mat* prevImg; + const Mat* nextImg; + const Mat* prevDeriv; + const Mat* rgbPrevImg; + const Mat* rgbNextImg; + const Point2f* prevPts; + Point2f* nextPts; + uchar* status; + float* err; + int maxWinSize; + int minWinSize; + TermCriteria criteria; + int level; + int maxLevel; + int windowType; + float minEigThreshold; + bool useInitialFlow; + int crossSegmentationThreshold; +}; +} +} +}} +#endif \ No newline at end of file diff --git a/modules/optflow/src/rlof/rlof_invoker.hpp b/modules/optflow/src/rlof/rlof_invoker.hpp new file mode 100644 index 00000000000..6dbdfef441e --- /dev/null +++ b/modules/optflow/src/rlof/rlof_invoker.hpp @@ -0,0 +1,1513 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// + // + // 
IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. + // + // By downloading, copying, installing or using the software you agree to this license. + // If you do not agree to this license, do not download, install, + // copy or use the software. + // + // + // License Agreement + // For Open Source Computer Vision Library + // + // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. + // Copyright (C) 2009, Willow Garage Inc., all rights reserved. + // Third party copyrights are property of their respective owners. + // + // Redistribution and use in source and binary forms, with or without modification, + // are permitted provided that the following conditions are met: + // + // * Redistribution's of source code must retain the above copyright notice, + // this list of conditions and the following disclaimer. + // + // * Redistribution's in binary form must reproduce the above copyright notice, + // this list of conditions and the following disclaimer in the documentation + // and/or other materials provided with the distribution. + // + // * The name of the copyright holders may not be used to endorse or promote products + // derived from this software without specific prior written permission. + // + // This software is provided by the copyright holders and contributors "as is" and + // any express or implied warranties, including, but not limited to, the implied + // warranties of merchantability and fitness for a particular purpose are disclaimed. 
+ // In no event shall the Intel Corporation or contributors be liable for any direct, + // indirect, incidental, special, exemplary, or consequential damages + // (including, but not limited to, procurement of substitute goods or services; + // loss of use, data, or profits; or business interruption) however caused + // and on any theory of liability, whether in contract, strict liability, + // or tort (including negligence or otherwise) arising in any way out of + // the use of this software, even if advised of the possibility of such damage. + // + //M*/ +#ifndef _RLOF_INVOKER_HPP_ +#define _RLOF_INVOKER_HPP_ +#include "rlof_invokerbase.hpp" +namespace cv +{ +namespace optflow +{ +namespace rlof +{ +namespace ica +{ + int calc_diff(int diff,float fParam0,float fParam1,float param_2) + { + int abss = (diff < 0) ? -diff : diff; + if (abss > fParam1) + { + diff = 0; + } + else if (abss > fParam0 && diff >= 0) + { + //diff = fParam1 * (diff - fParam1); + diff = static_cast(param_2 * (diff - fParam1)); + } + else if (abss > fParam0 && diff < 0) + { + //diff = fParam1 * (diff + fParam1); + diff = static_cast(param_2 * (diff + fParam1)); + + } + return diff; + } + class TrackerInvoker : public cv::ParallelLoopBody + { + public: + TrackerInvoker( + const Mat& _prevImg, + const Mat& _prevDeriv, + const Mat& _nextImg, + const Mat& _rgbPrevImg, + const Mat& _rgbNextImg, + const Point2f* _prevPts, + Point2f* _nextPts, + uchar* _status, + float* _err, + int _level, + int _maxLevel, + int _winSize[2], + int _maxIteration, + bool _useInitialFlow, + int _supportRegionType, + std::vector _normSigmas, + float _minEigenValue, + int _crossSegmentationThreshold) + { + prevImg = &_prevImg; + prevDeriv = &_prevDeriv; + nextImg = &_nextImg; + rgbPrevImg = &_rgbPrevImg; + rgbNextImg = &_rgbNextImg; + prevPts = _prevPts; + nextPts = _nextPts; + status = _status; + err = _err; + minWinSize = _winSize[0]; + maxWinSize = _winSize[1]; + criteria.maxCount = _maxIteration; + criteria.epsilon = 
0.01; + level = _level; + maxLevel = _maxLevel; + windowType = _supportRegionType; + minEigThreshold = _minEigenValue; + paramReset = _normSigmas; + useInitialFlow = _useInitialFlow; + crossSegmentationThreshold = _crossSegmentationThreshold; + } + TrackerInvoker & operator=(const TrackerInvoker &) { return *this; }; + void operator()(const cv::Range& range) const + { + cv::Size winSize; + cv::Point2f halfWin; + + const Mat& I = *prevImg; + const Mat& J = *nextImg; + const Mat& derivI = *prevDeriv; + const Mat& BI = *rgbPrevImg; + + winSize = cv::Size(maxWinSize, maxWinSize); + int winMaskwidth = static_cast(ceil(winSize.width / 8.f)) * 16; + cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); + winMaskMatBuf.setTo(1); + + const float FLT_SCALE = (1.f / (1 << 20)); // 20 + + std::vector param = paramReset; + int cn = I.channels(), cn2 = cn * 2; + int winbufwidth = static_cast(ceil(winSize.width / 8.f)) * 8; + cv::Size winBufSize(winbufwidth, winbufwidth); + + std::vector _buf(winBufSize.area()*(cn + cn2)); + Mat IWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn), &_buf[0]); + Mat derivIWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn2), &_buf[winBufSize.area()*cn]); + + for (int ptidx = range.start; ptidx < range.end; ptidx++) + { + Point2f prevPt = prevPts[ptidx] * (float)(1. / (1 << level)); + Point2f nextPt; + if (level == maxLevel) + { + if ( useInitialFlow) + nextPt = nextPts[ptidx] * (float)(1. 
/ (1 << level)); + else + nextPt = prevPt; + } + else + nextPt = nextPts[ptidx] * 2.f; + nextPts[ptidx] = nextPt; + + Point2i iprevPt, inextPt; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + int winArea = maxWinSize * maxWinSize; + cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0, 0, maxWinSize, maxWinSize)); + + if (calcWinMaskMat(BI, windowType, iprevPt, + winMaskMat, winSize, halfWin, winArea, + minWinSize, maxWinSize, crossSegmentationThreshold) == false) + continue; + + + prevPt -= halfWin; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + + if (iprevPt.x + halfWin.x < 0 || iprevPt.x + halfWin.x >= derivI.cols || + iprevPt.y + halfWin.y < 0 || iprevPt.y + halfWin.y >= derivI.rows) + { + if (level == 0) + { + if (status) + status[ptidx] = 3; + if (err) + err[ptidx] = 0; + } + continue; + } + + float a = prevPt.x - iprevPt.x; + float b = prevPt.y - iprevPt.y; + const int W_BITS = 14, W_BITS1 = 14; + + int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + + int dstep = (int)(derivI.step / derivI.elemSize1()); + int step = (int)(I.step / I.elemSize1()); + int mStep = (int)(winMaskMat.step / winMaskMat.elemSize1()); + CV_Assert(step == (int)(J.step / J.elemSize1())); + float A11 = 0, A12 = 0, A22 = 0; + float b1 = 0, b2 = 0; + float D = 0; + float minEig; + +#ifdef CV_SSE2 + __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128i z = _mm_setzero_si128(); + __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1 - 1)); + __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1 - 5 - 1)); + __m128 qA11 = _mm_setzero_ps(), qA12 = _mm_setzero_ps(), qA22 = _mm_setzero_ps(); + __m128i mmMask4_epi32; + __m128i mmMaskSet_epi16 = _mm_set1_epi16(std::numeric_limits::max()); + get4BitMask(winSize.width, mmMask4_epi32); +#endif + + // extract the patch from 
the first image, compute covariation matrix of derivatives + int x, y; + for (y = 0; y < winSize.height; y++) + { + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; + const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; + + short* Iptr = (short*)(IWinBuf.data + y * IWinBuf.step); + short* dIptr = (short*)(derivIWinBuf.data + y * derivIWinBuf.step); + + x = 0; + +#ifdef CV_SSE2 + + for (; x <= winSize.width*cn; x += 4, dsrc += 4 * 2, dIptr += 4 * 2) + { + __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16); + __m128i mask_0_3_epi16 = _mm_unpacklo_epi16(mask_0_7_epi16, mask_0_7_epi16); + + __m128i v00, v01, v10, v11, t0, t1; + v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); + v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); + v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); + v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1 - 5); + if (x + 4 > winSize.width) + { + t0 = _mm_and_si128(t0, mmMask4_epi32); + } + t0 = _mm_and_si128(t0, mask_0_3_epi16); + _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0, t0)); + + v00 = _mm_loadu_si128((const __m128i*)(dsrc)); + v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); + v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep)); + v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + 
_mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); + v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... + if (x + 4 > winSize.width) + { + v00 = _mm_and_si128(v00, mmMask4_epi32); + } + v00 = _mm_and_si128(v00, mask_0_3_epi16); + _mm_storeu_si128((__m128i*)dIptr, v00); + } +#else + + for (; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2) + { + if (maskPtr[x] == 0) + { + dIptr[0] = 0; + dIptr[1] = 0; + continue; + } + int ival = CV_DESCALE(src[x] * iw00 + src[x + cn] * iw01 + + src[x + step] * iw10 + src[x + step + cn] * iw11, W_BITS1 - 5); + int ixval = CV_DESCALE(dsrc[0] * iw00 + dsrc[cn2] * iw01 + + dsrc[dstep] * iw10 + dsrc[dstep + cn2] * iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1] * iw00 + dsrc[cn2 + 1] * iw01 + dsrc[dstep + 1] * iw10 + + dsrc[dstep + cn2 + 1] * iw11, W_BITS1); + + Iptr[x] = (short)ival; + dIptr[0] = (short)ixval; + dIptr[1] = (short)iyval; + } +#endif + } + + cv::Mat residualMat = cv::Mat::zeros(winSize.height * (winSize.width + 8) * cn, 1, CV_16SC1); + + cv::Point2f backUpNextPt = nextPt; + nextPt -= halfWin; + int j; + __m128i mmMask0, mmMask1, mmMask; + getWBitMask(winSize.width, mmMask0, mmMask1, mmMask); + float MEstimatorScale = 1; + int buffIdx = 0; + cv::Point2f prevDelta(0, 0); + + for (j = 0; j < criteria.maxCount; j++) + { + inextPt.x = cvFloor(nextPt.x); + inextPt.y = cvFloor(nextPt.y); + + if (inextPt.x < -winSize.width || inextPt.x >= J.cols || + inextPt.y < -winSize.height || inextPt.y >= J.rows) + { + if (level == 0 && status) + status[ptidx] = 3; + if (level > 0) + nextPts[ptidx] = backUpNextPt; + break; + } + + + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + b1 = 0; + b2 = 0; + + if (j == 0 ) + { + 
A11 = 0; + A12 = 0; + A22 = 0; + } + + if (j == 0 ) + { + buffIdx = 0; + for (y = 0; y < winSize.height; y++) + { + const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x)*cn; + const short* Iptr = (const short*)(IWinBuf.data + (y)*IWinBuf.step); + const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + x = 0; + for (; x < winSize.width*cn; x++, dIptr += 2) + { + if (maskPtr[x] == 0) + continue; + int diff = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + Jptr[x + step] * iw10 + Jptr[x + step + cn] * iw11, W_BITS1 - 5) - Iptr[x]; + residualMat.at(buffIdx++) = static_cast(diff); + } + } + /*! Estimation for the residual */ + cv::Mat residualMatRoi(residualMat, cv::Rect(0, 0, 1, buffIdx)); + MEstimatorScale = (buffIdx == 0) ? 1.f : estimateScale(residualMatRoi); + } + + float eta = 1.f / winArea; + float fParam0 = param[0] * 32.f; + float fParam1 = param[1] * 32.f; + fParam0 = param[0] * MEstimatorScale; + fParam1 = param[1] * MEstimatorScale; +#ifdef CV_SSE2 + qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps(); + __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps(); + __m128i mmParam0 = _mm_set1_epi16(MIN(std::numeric_limits::max() - 1, static_cast(fParam0))); + __m128i mmParam1 = _mm_set1_epi16(MIN(std::numeric_limits::max() - 1, static_cast(fParam1))); + __m128 mmParam2 = _mm_set1_ps(param[2]); + float s2Val = param[2] > 0 ? param[2] : -param[2]; + int s2bitShift = param[2] == 0 ? 
1 : static_cast(ceil(log(200.f / s2Val) / log(2.f))); // zwei stellen nach den komma + __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast(param[2] * (float)(1 << s2bitShift))); + __m128i mmOness_epi16 = _mm_set1_epi16(1 << s2bitShift); + __m128 mmParam2s = _mm_set1_ps(0.01f * param[2]); + __m128 mmParam2s2 = _mm_set1_ps(param[2] * param[2]); + __m128 mmOnes = _mm_set1_ps(1.f); + __m128 mmError0 = _mm_setzero_ps(); + __m128 mmError1 = _mm_setzero_ps(); + __m128i mmEta = _mm_setzero_si128(); + __m128i mmEtaNeg = _mm_set1_epi16(-1); + __m128i mmScale = _mm_set1_epi16(static_cast(MEstimatorScale)); + +#endif + + buffIdx = 0; + for (y = 0; y < winSize.height; y++) + { + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x)*cn; + const short* Iptr = (const short*)(IWinBuf.data + (y)*IWinBuf.step); + const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); + + x = 0; +#ifdef CV_SSE2 + for (; x <= winSize.width*cn; x += 8, dIptr += 8 * 2) + { + __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16); + __m128i diff0, diff1; + __m128i I_0_7_epi16 = _mm_loadu_si128((const __m128i*)(Iptr + x)); // von element 0 bis 7 + + __m128i v00 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x)), z); + __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); + __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); + __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); + + __m128i t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), 
W_BITS1 - 5); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1 - 5); + + __m128i mmDiff_epi16 = _mm_subs_epi16(_mm_packs_epi32(t0, t1), I_0_7_epi16); + + mmDiff_epi16 = _mm_and_si128(mmDiff_epi16, mask_0_7_epi16); + + + __m128i scalediffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, mmScale); + mmEta = _mm_add_epi16(mmEta, _mm_add_epi16(_mm_and_si128(scalediffIsPos_epi16, _mm_set1_epi16(2)), _mm_set1_epi16(-1))); + + + __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... + __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); + + __m128i abs_epi16 = _mm_abs_epi16(mmDiff_epi16); + __m128i bSet2_epi16, bSet1_epi16; + // |It| < sigma1 ? + bSet2_epi16 = _mm_cmplt_epi16(abs_epi16, mmParam1); + // It > 0 ? + __m128i diffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, _mm_setzero_si128()); + // sigma0 < |It| < sigma1 ? + bSet1_epi16 = _mm_and_si128(bSet2_epi16, _mm_cmpgt_epi16(abs_epi16, mmParam0)); + // val = |It| -/+ sigma1 + __m128i tmpParam1_epi16 = _mm_add_epi16(_mm_and_si128(diffIsPos_epi16, _mm_sub_epi16(mmDiff_epi16, mmParam1)), + _mm_andnot_si128(diffIsPos_epi16, _mm_add_epi16(mmDiff_epi16, mmParam1))); + // It == 0 ? |It| > sigma13 + mmDiff_epi16 = _mm_and_si128(bSet2_epi16, mmDiff_epi16); + // It == val ? 
sigma0 < |It| < sigma1 + mmDiff_epi16 = _mm_blendv_epi8(mmDiff_epi16, tmpParam1_epi16, bSet1_epi16); + + + __m128i tale_epi16_ = _mm_blendv_epi8(mmOness_epi16, mmParam2_epi16, bSet1_epi16); // mask for 0 - 3 + // diff = diff * sigma2 + __m128i lo = _mm_mullo_epi16(tale_epi16_, mmDiff_epi16); + __m128i hi = _mm_mulhi_epi16(tale_epi16_, mmDiff_epi16); + __m128i diff_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), s2bitShift); //diff 0_3_epi32 + __m128i diff_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), s2bitShift); // diff 4_7_epi32 + + mmDiff_epi16 = _mm_packs_epi32(diff_0_3_epi32, diff_4_7_epi32); + diff1 = _mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13... + diff0 = _mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9... + + // Ix * diff / Iy * diff + v10 = _mm_mullo_epi16(Ixy_0, diff0); + v11 = _mm_mulhi_epi16(Ixy_0, diff0); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); + qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); + + // It * Ix It * Iy [4 ... 
7] + // for set 1 hi sigma 1 + + v10 = _mm_mullo_epi16(Ixy_1, diff1); + v11 = _mm_mulhi_epi16(Ixy_1, diff1); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); + qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); + + + if (j == 0) + { + __m128 bSet1_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet1_epi16)); + __m128 bSet1_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet1_epi16, bSet1_epi16), 16)); + __m128 mask_0_4_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mask_0_7_epi16)); + __m128 mask_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mask_0_7_epi16, mask_0_7_epi16), 16))); + + __m128 bSet2_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet2_epi16)); + __m128 bSet2_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet2_epi16, bSet2_epi16), 16)); + + __m128 tale_0_3_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_0_3_ps); + __m128 tale_4_7_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_4_7_ps); + tale_0_3_ps = _mm_blendv_ps(mmParam2s, tale_0_3_ps, bSet2_0_3_ps); + tale_4_7_ps = _mm_blendv_ps(mmParam2s, tale_4_7_ps, bSet2_4_7_ps); + + tale_0_3_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_0_3_ps, mask_0_4_ps); + tale_4_7_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_4_7_ps, mask_4_7_ps); + + t0 = _mm_srai_epi32(Ixy_0, 16); // Iy0 Iy1 Iy2 Iy3 + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16); // Ix0 Ix1 Ix2 Ix3 + + __m128 fy = _mm_cvtepi32_ps(t0); + __m128 fx = _mm_cvtepi32_ps(t1); + + // A11 - A22 + __m128 fxtale = _mm_mul_ps(fx, tale_0_3_ps); + __m128 fytale = _mm_mul_ps(fy, tale_0_3_ps); + + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); + mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); + mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); + + t0 = _mm_srai_epi32(Ixy_1, 16); // Iy8 Iy9 Iy10 Iy11 + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_1, 16), 16); // Ix0 Ix1 Ix2 Ix3 + + fy = _mm_cvtepi32_ps(t0); + fx = _mm_cvtepi32_ps(t1); + + // A11 - A22 + fxtale = _mm_mul_ps(fx, 
tale_4_7_ps); + fytale = _mm_mul_ps(fy, tale_4_7_ps); + + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); + mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); + mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); + } + } +#else + for (; x < winSize.width*cn; x++, dIptr += 2) + { + if (maskPtr[x] == 0) + continue; + int diff = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + + Jptr[x + step] * iw10 + Jptr[x + step + cn] * iw11, + W_BITS1 - 5) - Iptr[x]; + + + + + + if (diff > MEstimatorScale) + MEstimatorScale += eta; + if (diff < MEstimatorScale) + MEstimatorScale -= eta; + + calc_diff(diff, fParam0, fParam1, param[2]); + + + float ixval = (float)(dIptr[0]); + float iyval = (float)(dIptr[1]); + b1 += (float)(diff*ixval); + b2 += (float)(diff*iyval); + + if ( j == 0) + { + int abss = (diff < 0) ? -diff : diff; + float tale = param[2] * FLT_RESCALE; + //float tale = fParam1 * FLT_RESCALE; + if (abss < fParam0) + { + tale = FLT_RESCALE; + } + else if (abss > fParam1) + { + tale *= 0.01f; + } + else + { + tale *= param[2]; + } + A11 += (float)(ixval*ixval*tale); + A12 += (float)(ixval*iyval*tale); + A22 += (float)(iyval*iyval*tale); + } + + + } +#endif + } + +#ifdef CV_SSE2 + short etaValues[8]; + _mm_storeu_si128((__m128i*)(etaValues), mmEta); + MEstimatorScale += eta * (etaValues[0] + etaValues[1] + etaValues[2] + etaValues[3] + + etaValues[4] + etaValues[5] + etaValues[6] + etaValues[7]); + + +#endif + if (j == 0) + { +#ifdef CV_SSE2 + float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];// + + _mm_store_ps(A11buf, mmAxx); + _mm_store_ps(A12buf, mmAxy); + _mm_store_ps(A22buf, mmAyy); + + + A11 = A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; + A12 = A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; + A22 = A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; +#endif + A11 *= FLT_SCALE; + A12 *= FLT_SCALE; + A22 *= FLT_SCALE; + + + D = A11 * A22 - A12 * A12; + minEig = (A22 + A11 - std::sqrt((A11 - A22)*(A11 - A22) + + 4.f*A12*A12)) / (2 * winArea); + D = 1.f / D; + + 
if (minEig < minEigThreshold || std::abs(D) < FLT_EPSILON) + { + if (level == 0 && status) + status[ptidx] = 0; + if (level > 0) + { + nextPts[ptidx] = backUpNextPt; + } + break; + } + } + +#ifdef CV_SSE2 + float CV_DECL_ALIGNED(16) bbuf[4]; + _mm_store_ps(bbuf, _mm_add_ps(qb0, qb1)); + b1 += bbuf[0] + bbuf[2]; + b2 += bbuf[1] + bbuf[3]; +#endif + + b1 *= FLT_SCALE; + b2 *= FLT_SCALE; + + + Point2f delta((float)((A12*b2 - A22 * b1) * D), (float)((A12*b1 - A11 * b2) * D)); + + delta.x = (delta.x != delta.x) ? 0 : delta.x; + delta.y = (delta.y != delta.y) ? 0 : delta.y; + + nextPt += delta * 0.7; + nextPts[ptidx] = nextPt + halfWin; + + if (j > 0 && std::abs(delta.x - prevDelta.x) < 0.01 && + std::abs(delta.y - prevDelta.y) < 0.01) + { + nextPts[ptidx] -= delta * 0.5f; + break; + } + + prevDelta = delta; + + } + + } + + } + + const Mat* prevImg; + const Mat* nextImg; + const Mat* prevDeriv; + const Mat* rgbPrevImg; + const Mat* rgbNextImg; + const Point2f* prevPts; + Point2f* nextPts; + uchar* status; + float* err; + int maxWinSize; + int minWinSize; + TermCriteria criteria; + int level; + int maxLevel; + int windowType; + float minEigThreshold; + bool useInitialFlow; + std::vector paramReset; + int crossSegmentationThreshold; + }; +} +namespace radial +{ + class TrackerInvoker : public cv::ParallelLoopBody + { + public: + TrackerInvoker( + const Mat& _prevImg, + const Mat& _prevDeriv, + const Mat& _nextImg, + const Mat& _rgbPrevImg, + const Mat& _rgbNextImg, + const Point2f* _prevPts, + Point2f* _nextPts, + uchar* _status, + float* _err, + Point2f* _gainVecs, + int _level, + int _maxLevel, + int _winSize[2], + int _maxIteration, + bool _useInitialFlow, + int _supportRegionType, + std::vector _normSigmas, + float _minEigenValue, + int _crossSegmentationThreshold) + { + prevImg = &_prevImg; + prevDeriv = &_prevDeriv; + nextImg = &_nextImg; + rgbPrevImg = &_rgbPrevImg; + rgbNextImg = &_rgbNextImg; + prevPts = _prevPts; + nextPts = _nextPts; + status = _status; + err = 
_err; + gainVecs = _gainVecs; + minWinSize = _winSize[0]; + maxWinSize = _winSize[1]; + criteria.maxCount = _maxIteration; + criteria.epsilon = 0.01; + level = _level; + maxLevel = _maxLevel; + windowType = _supportRegionType; + minEigThreshold = _minEigenValue; + paramReset = _normSigmas; + useInitialFlow = _useInitialFlow; + crossSegmentationThreshold = _crossSegmentationThreshold; + } + + TrackerInvoker & operator=(const TrackerInvoker &) { return *this; }; + void operator()(const cv::Range& range) const + { + Point2f halfWin; + cv::Size winSize; + const Mat& I = *prevImg; + const Mat& J = *nextImg; + const Mat& derivI = *prevDeriv; + const Mat& BI = *rgbPrevImg; + + + const float FLT_SCALE = (1.f / (1 << 16)); + + winSize = cv::Size(maxWinSize, maxWinSize); + int winMaskwidth = static_cast(ceil(winSize.width / 16.f)) * 16; + cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); + winMaskMatBuf.setTo(1); + + std::vector param = paramReset; + int cn = I.channels(), cn2 = cn * 2; + int winbufwidth = static_cast(ceil(winSize.width / 16.f)) * 16; + cv::Size winBufSize(winbufwidth, winbufwidth); + + cv::Mat invTensorMat(4, 4, CV_32FC1); + cv::Mat mismatchMat(4, 1, CV_32FC1); + cv::Mat resultMat(4, 1, CV_32FC1); + + + std::vector _buf(winBufSize.area()*(cn + cn2)); + Mat IWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn), &_buf[0]); + Mat derivIWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn2), &_buf[winBufSize.area()*cn]); + + for (int ptidx = range.start; ptidx < range.end; ptidx++) + { + + Point2f prevPt = prevPts[ptidx] * (float)(1. / (1 << level)); + Point2f nextPt; + if (level == maxLevel) + { + if (useInitialFlow) + { + nextPt = nextPts[ptidx] * (float)(1. 
/ (1 << level)); + } + else + nextPt = prevPt; + } + else + { + nextPt = nextPts[ptidx] * 2.f; + + } + nextPts[ptidx] = nextPt; + + Point2i iprevPt, inextPt; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + int winArea = maxWinSize * maxWinSize; + cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0, 0, maxWinSize, maxWinSize)); + winMaskMatBuf.setTo(0); + if (calcWinMaskMat(BI, windowType, iprevPt, + winMaskMat, winSize, halfWin, winArea, + minWinSize, maxWinSize, crossSegmentationThreshold) == false) + continue; + + prevPt -= halfWin; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + + if (iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols || + iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows) + { + if (level == 0) + { + if (status) + status[ptidx] = 3; + if (err) + err[ptidx] = 0; + } + continue; + } + + float a = prevPt.x - iprevPt.x; + float b = prevPt.y - iprevPt.y; + const int W_BITS = 14, W_BITS1 = 14; + + int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + + int dstep = (int)(derivI.step / derivI.elemSize1()); + int step = (int)(I.step / I.elemSize1()); + int mStep = (int)(winMaskMat.step / winMaskMat.elemSize1()); + CV_Assert(step == (int)(J.step / J.elemSize1())); + + float A11 = 0, A12 = 0, A22 = 0; + float b1 = 0, b2 = 0, b3 = 0, b4 = 0; + // tensor + float sumIx = 0; + float sumIy = 0; + float sumI = 0; + float sumW = 0; + float w1 = 0, w2 = 0; // -IyI + float dI = 0; // I^2 + float D = 0; + +#ifdef CV_SSE2 + __m128i rightMask = _mm_set_epi16(0, std::numeric_limits::max(), 0, std::numeric_limits::max(), 0, std::numeric_limits::max(), 0, std::numeric_limits::max()); + __m128i leftMask = _mm_set_epi16(std::numeric_limits::max(), 0, std::numeric_limits::max(), 0, std::numeric_limits::max(), 0, std::numeric_limits::max(), 0); + __m128i qw0 = _mm_set1_epi32(iw00 + 
(iw01 << 16)); + __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128i z = _mm_setzero_si128(); + __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1 - 1)); + __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1 - 5 - 1)); + __m128i mmMask4_epi32; + __m128i mmMaskSet_epi16 = _mm_set1_epi16(std::numeric_limits::max()); + get4BitMask(winSize.width, mmMask4_epi32); + __m128 qA11 = _mm_setzero_ps(), qA12 = _mm_setzero_ps(), qA22 = _mm_setzero_ps(); +#endif + + // extract the patch from the first image, compute covariation matrix of derivatives + int x, y; + for (y = 0; y < winSize.height; y++) + { + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; + const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; + + short* Iptr = (short*)(IWinBuf.data + y * IWinBuf.step); + short* dIptr = (short*)(derivIWinBuf.data + y * derivIWinBuf.step); + + x = 0; + +#ifdef CV_SSE2 + for (; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4 * 2, dIptr += 4 * 2) + { + __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16); + __m128i mask_0_3_epi16 = _mm_unpacklo_epi16(mask_0_7_epi16, mask_0_7_epi16); + __m128i v00, v01, v10, v11, t0, t1; + v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); + v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); + v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); + v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1 - 5); + if (x + 4 > winSize.width) + { + t0 = _mm_and_si128(t0, mmMask4_epi32); + } + t0 = _mm_and_si128(t0, mask_0_3_epi16); + _mm_storel_epi64((__m128i*)(Iptr + x), 
_mm_packs_epi32(t0, t0)); + + + v00 = _mm_loadu_si128((const __m128i*)(dsrc)); + v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); + v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep)); + v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); + v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... + if (x + 4 > winSize.width) + { + v00 = _mm_and_si128(v00, mmMask4_epi32); + } + v00 = _mm_and_si128(v00, mask_0_3_epi16); + _mm_storeu_si128((__m128i*)dIptr, v00); + } +#else + + for (; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2) + { + if (winMaskMat.at(y, x) == 0) + { + dIptr[0] = 0; + dIptr[1] = 0; + continue; + } + int ival = CV_DESCALE(src[x] * iw00 + src[x + cn] * iw01 + + src[x + step] * iw10 + src[x + step + cn] * iw11, W_BITS1 - 5); + int ixval = CV_DESCALE(dsrc[0] * iw00 + dsrc[cn2] * iw01 + + dsrc[dstep] * iw10 + dsrc[dstep + cn2] * iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1] * iw00 + dsrc[cn2 + 1] * iw01 + dsrc[dstep + 1] * iw10 + + dsrc[dstep + cn2 + 1] * iw11, W_BITS1); + + Iptr[x] = (short)ival; + dIptr[0] = (short)ixval; + dIptr[1] = (short)iyval; + + } +#endif + } + + cv::Mat residualMat = cv::Mat::zeros(winSize.height * (winSize.width + 8) * cn, 1, CV_16SC1); + + cv::Point2f backUpNextPt = nextPt; + nextPt -= halfWin; + Point2f prevDelta(0, 0); //related to h(t-1) + Point2f prevGain(0, 0); + cv::Point2f gainVec = gainVecs[ptidx]; + cv::Point2f backUpGain = gainVec; + cv::Size _winSize = winSize; + int j; + __m128i mmMask0, mmMask1, mmMask; + getWBitMask(_winSize.width, mmMask0, mmMask1, mmMask); + __m128 mmOnes = _mm_set1_ps(1.f); + __m128i mmOnes_epi16 = 
_mm_set1_epi16(1); + __m128i mmZeros_epi16 = _mm_set1_epi16(0); + float MEstimatorScale = 1; + int buffIdx = 0; + float minEigValue, maxEigValue; + + for (j = 0; j < criteria.maxCount; j++) + { + + inextPt.x = cvFloor(nextPt.x); + inextPt.y = cvFloor(nextPt.y); + + if (inextPt.x < -winSize.width || inextPt.x >= J.cols || + inextPt.y < -winSize.height || inextPt.y >= J.rows) + { + if (level == 0 && status) + status[ptidx] = 3; + if (level > 0) + { + nextPts[ptidx] = backUpNextPt; + gainVecs[ptidx] = backUpGain; + } + break; + } + + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + + // mismatch + b1 = 0; + b2 = 0; + b3 = 0; + b4 = 0; + if (j == 0) + { + // tensor + w1 = 0; // -IxI + w2 = 0; // -IyI + dI = 0; // I^2 + sumIx = 0; + sumIy = 0; + sumI = 0; + sumW = 0; + A11 = 0; + A12 = 0; + A22 = 0; + } + + if (j == 0 ) + { + buffIdx = 0; + for (y = 0; y < winSize.height; y++) + { + const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x)*cn; + const short* Iptr = (const short*)(IWinBuf.data + (y)*IWinBuf.step); + const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); + x = 0; + for (; x < winSize.width*cn; x++, dIptr += 2) + { + if (dIptr[0] == 0 && dIptr[1] == 0) + continue; + int diff = static_cast(CV_DESCALE( Jptr[x] * iw00 + + Jptr[x + cn] * iw01 + + Jptr[x + step] * iw10 + + Jptr[x + step + cn] * iw11, W_BITS1 - 5) + - Iptr[x] + Iptr[x] * gainVec.x + gainVec.y); + residualMat.at(buffIdx++) = static_cast(diff); + } + } + /*! Estimation for the residual */ + cv::Mat residualMatRoi(residualMat, cv::Rect(0, 0, 1, buffIdx)); + MEstimatorScale = (buffIdx == 0) ? 
1.f : estimateScale(residualMatRoi); + } + + float eta = 1.f / winArea; + float fParam0 = param[0] * 32.f; + float fParam1 = param[1] * 32.f; + fParam0 = param[0] * MEstimatorScale; + fParam1 = param[1] * MEstimatorScale; + +#ifdef CV_SSE2 + + qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps(), qb2 = _mm_setzero_ps(); + __m128 qb3 = _mm_setzero_ps(); + __m128 mmSumW1 = _mm_setzero_ps(), mmSumW2 = _mm_setzero_ps(); + __m128 mmSumI = _mm_setzero_ps(), mmSumW = _mm_setzero_ps(), mmSumDI = _mm_setzero_ps(); + __m128 mmSumIy = _mm_setzero_ps(), mmSumIx = _mm_setzero_ps(); + __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps(); + __m128i mmParam0 = _mm_set1_epi16(MIN(std::numeric_limits::max() - 1, static_cast(fParam0))); + __m128i mmParam1 = _mm_set1_epi16(MIN(std::numeric_limits::max() - 1, static_cast(fParam1))); + + + float s2Val = param[2] > 0 ? param[2] : -param[2]; + int s2bitShift = param[2] == 0 ? 1 : static_cast(ceil(log(200.f / s2Val) / log(2.f))); // zwei stellen nach den komma + __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast(param[2] * (float)(1 << s2bitShift))); + __m128i mmOness_epi16 = _mm_set1_epi16(1 << s2bitShift); + __m128 mmParam2 = _mm_set1_ps(param[2]); + __m128 mmParam2s = _mm_set1_ps(0.01f * param[2]); + __m128 mmParam2s2 = _mm_set1_ps(param[2] * param[2]); + + float gainVal = gainVec.x > 0 ? gainVec.x : -gainVec.x; + int bitShift = gainVec.x == 0 ? 
1 : static_cast(ceil(log(200.f / gainVal) / log(2.f))); // zwei stellen nach den komma + + __m128i mmGainValue_epi16 = _mm_set1_epi16(static_cast(gainVec.x * (float)(1 << bitShift))); + __m128i mmConstValue_epi16 = _mm_set1_epi16(static_cast(gainVec.y)); + __m128i mmEta = _mm_setzero_si128(); + __m128i mmEtaNeg = _mm_set1_epi16(-1); + __m128i mmScale = _mm_set1_epi16(static_cast(MEstimatorScale)); + +#endif + buffIdx = 0; + + for (y = 0; y < _winSize.height; y++) + { + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x)*cn; + const short* Iptr = (const short*)(IWinBuf.data + y * IWinBuf.step); + const short* dIptr = (const short*)(derivIWinBuf.data + y * derivIWinBuf.step); + + x = 0; +#ifdef CV_SSE2 + + + for (; x <= _winSize.width*cn; x += 8, dIptr += 8 * 2) + { + __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16); + __m128i diff0, diff1; + __m128i I_0_7_epi16 = _mm_loadu_si128((const __m128i*)(Iptr + x)); // von element 0 bis 7 + __m128i v00 = _mm_unpacklo_epi8( + _mm_loadl_epi64((const __m128i*)(Jptr + x)) , z); //J0 , 0, J1, 0, J2, 0 ... J7,0 + __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); + __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); + __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); + + __m128i t0 = _mm_add_epi32 + (_mm_madd_epi16( + _mm_unpacklo_epi16(v00, v01), //J0, , J1, J1, J2, J2, ... 
J3 , J4 + qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1 - 5); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1 - 5); + + + __m128i lo = _mm_mullo_epi16(mmGainValue_epi16, I_0_7_epi16); + __m128i hi = _mm_mulhi_epi16(mmGainValue_epi16, I_0_7_epi16); + + __m128i Igain_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), bitShift); + __m128i Igain_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), bitShift); + __m128i Igain_epi16 = _mm_packs_epi32(Igain_0_3_epi32, Igain_4_7_epi32); + + // diff = J - I + I*gain.x + gain.y + __m128i mmDiff_epi16 = _mm_add_epi16( + _mm_subs_epi16(_mm_packs_epi32(t0, t1), I_0_7_epi16), // J - I + _mm_add_epi16(Igain_epi16, mmConstValue_epi16)); + + + mmDiff_epi16 = _mm_and_si128(mmDiff_epi16, mask_0_7_epi16); + + + __m128i scalediffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, mmScale); + mmEta = _mm_add_epi16(mmEta, _mm_add_epi16(_mm_and_si128(scalediffIsPos_epi16, _mm_set1_epi16(2)), _mm_set1_epi16(-1))); + + + __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... + __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); + + + __m128i abs_epi16 = _mm_abs_epi16(mmDiff_epi16); + __m128i bSet2_epi16, bSet1_epi16; + // |It| < sigma1 ? + bSet2_epi16 = _mm_cmplt_epi16(abs_epi16, mmParam1); + // It > 0 ? + __m128i diffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, _mm_setzero_si128()); + // sigma0 < |It| < sigma1 ? + bSet1_epi16 = _mm_and_si128(bSet2_epi16, _mm_cmpgt_epi16(abs_epi16, mmParam0)); + // val = |It| -/+ sigma1 + __m128i tmpParam1_epi16 = _mm_add_epi16(_mm_and_si128(diffIsPos_epi16, _mm_sub_epi16(mmDiff_epi16, mmParam1)), + _mm_andnot_si128(diffIsPos_epi16, _mm_add_epi16(mmDiff_epi16, mmParam1))); + // It == 0 ? 
|It| > sigma13 + mmDiff_epi16 = _mm_and_si128(bSet2_epi16, mmDiff_epi16); + // It == val ? sigma0 < |It| < sigma1 + mmDiff_epi16 = _mm_blendv_epi8(mmDiff_epi16, tmpParam1_epi16, bSet1_epi16); + + + __m128i tale_epi16_ = _mm_blendv_epi8(mmOness_epi16, mmParam2_epi16, bSet1_epi16); // mask for 0 - 3 + // diff = diff * sigma2 + lo = _mm_mullo_epi16(tale_epi16_, mmDiff_epi16); + hi = _mm_mulhi_epi16(tale_epi16_, mmDiff_epi16); + __m128i diff_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), s2bitShift); //diff 0_3_epi32 + __m128i diff_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), s2bitShift); // diff 4_7_epi32 + + mmDiff_epi16 = _mm_packs_epi32(diff_0_3_epi32, diff_4_7_epi32); // ,da das ergebniss kleiner als 16 bit sein sollte + + diff1 = _mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13... + diff0 = _mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9... + + // Ix * diff / Iy * diff + v10 = _mm_mullo_epi16(Ixy_0, diff0); + v11 = _mm_mulhi_epi16(Ixy_0, diff0); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); + qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); + + // It * Ix It * Iy [4 ... 7] + // for set 1 hi sigma 1 + + v10 = _mm_mullo_epi16(Ixy_1, diff1); + v11 = _mm_mulhi_epi16(Ixy_1, diff1); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); + qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); + // diff * J [0 ... 
7] + // for set 1 sigma 1 + // b3 += diff * Iptr[x] + + v10 = _mm_mullo_epi16(mmDiff_epi16, I_0_7_epi16); + v11 = _mm_mulhi_epi16(mmDiff_epi16, I_0_7_epi16); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb2 = _mm_add_ps(qb2, _mm_cvtepi32_ps(v00)); + qb2 = _mm_add_ps(qb2, _mm_cvtepi32_ps(v10)); + + qb3 = _mm_add_ps(qb3, _mm_cvtepi32_ps(diff_0_3_epi32)); + qb3 = _mm_add_ps(qb3, _mm_cvtepi32_ps(diff_4_7_epi32)); + + if (j == 0) + { + __m128 bSet1_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet1_epi16)); + __m128 bSet1_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet1_epi16, bSet1_epi16), 16)); + __m128 mask_0_4_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mask_0_7_epi16)); + __m128 mask_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mask_0_7_epi16, mask_0_7_epi16), 16))); + + __m128 bSet2_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet2_epi16)); + __m128 bSet2_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet2_epi16, bSet2_epi16), 16)); + + __m128 tale_0_3_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_0_3_ps); + __m128 tale_4_7_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_4_7_ps); + tale_0_3_ps = _mm_blendv_ps(mmParam2s, tale_0_3_ps, bSet2_0_3_ps); + tale_4_7_ps = _mm_blendv_ps(mmParam2s, tale_4_7_ps, bSet2_4_7_ps); + + tale_0_3_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_0_3_ps, mask_0_4_ps); + tale_4_7_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_4_7_ps, mask_4_7_ps); + + t0 = _mm_srai_epi32(Ixy_0, 16); // Iy0 Iy1 Iy2 Iy3 + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16); // Ix0 Ix1 Ix2 Ix3 + + __m128 fy = _mm_cvtepi32_ps(t0); + __m128 fx = _mm_cvtepi32_ps(t1); + + // 0 ... 
3 + __m128 I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(I_0_7_epi16, I_0_7_epi16), 16)); + + // A11 - A22 + __m128 fxtale = _mm_mul_ps(fx, tale_0_3_ps); + __m128 fytale = _mm_mul_ps(fy, tale_0_3_ps); + + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); + mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); + mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); + + // sumIx und sumIy + mmSumIx = _mm_add_ps(mmSumIx, fxtale); + mmSumIy = _mm_add_ps(mmSumIy, fytale); + + mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fxtale)); + mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fytale)); + + // sumI + __m128 I_tale_ps = _mm_mul_ps(I_ps, tale_0_3_ps); + mmSumI = _mm_add_ps(mmSumI, I_tale_ps); + + // sumW + mmSumW = _mm_add_ps(mmSumW, tale_0_3_ps); + + // sumDI + mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps(I_ps, I_tale_ps)); + + t0 = _mm_srai_epi32(Ixy_1, 16); // Iy8 Iy9 Iy10 Iy11 + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_1, 16), 16); // Ix0 Ix1 Ix2 Ix3 + + fy = _mm_cvtepi32_ps(t0); + fx = _mm_cvtepi32_ps(t1); + + // 4 ... 
7 + I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(I_0_7_epi16, I_0_7_epi16), 16)); + + // A11 - A22 + fxtale = _mm_mul_ps(fx, tale_4_7_ps); + fytale = _mm_mul_ps(fy, tale_4_7_ps); + + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); + mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); + mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); + + // sumIx und sumIy + mmSumIx = _mm_add_ps(mmSumIx, fxtale); + mmSumIy = _mm_add_ps(mmSumIy, fytale); + + mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fxtale)); + mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fytale)); + + // sumI + I_tale_ps = _mm_mul_ps(I_ps, tale_4_7_ps); + mmSumI = _mm_add_ps(mmSumI, I_tale_ps); + + // sumW + mmSumW = _mm_add_ps(mmSumW, tale_4_7_ps); + + // sumDI + mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps(I_ps, I_tale_ps)); + } + + } +#else + for (; x < _winSize.width*cn; x++, dIptr += 2) + { + if (maskPtr[x] == 0) + continue; + + float J = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + + Jptr[x + step] * iw10 + Jptr[x + step + cn] * iw11, + W_BITS1 - 5); + short ixval = static_cast(dIptr[0]); + short iyval = static_cast(dIptr[1]); + int diff = J - static_cast(Iptr[x]) + Iptr[x] * gainVec.x + gainVec.y; + int abss = (diff < 0) ? 
-diff : diff; + if (diff > MEstimatorScale) + MEstimatorScale += eta; + if (diff < MEstimatorScale) + MEstimatorScale -= eta; + // compute the missmatch vector + + if (abss > fParam1) + { + diff = 0.f; + } + else if (abss > fParam0 && diff >= 0) + { + diff = param[2] * (diff - fParam1); + } + else if (abss > fParam0 && diff < 0) + { + diff = param[2] * (diff + fParam1); + } + + + b1 += (float)(diff*ixval); + b2 += (float)(diff*iyval); ; + b3 += (float)(diff)* Iptr[x]; + b4 += (float)(diff); + + + // compute the Gradient Matrice + if (j == 0) + { + float tale = param[2] * FLT_RESCALE; + if (abss < fParam0 || j < 0) + { + tale = FLT_RESCALE; + } + else if (abss > fParam1) + { + tale *= 0.01f; + } + else + { + tale *= param[2]; + } + if (j == 0) + { + A11 += (float)(ixval*ixval)*tale; + A12 += (float)(ixval*iyval)*tale; + A22 += (float)(iyval*iyval)*tale; + } + + dI += Iptr[x] * Iptr[x] * tale; + float dx = static_cast(dIptr[0]) * tale; + float dy = static_cast(dIptr[1]) * tale; + sumIx += dx; + sumIy += dy; + w1 += dx * Iptr[x]; + w2 += dy * Iptr[x]; + sumI += Iptr[x] * tale; + sumW += tale; + } + + } +#endif + } + + +#ifdef CV_SSE2 + short etaValues[8]; + _mm_storeu_si128((__m128i*)(etaValues), mmEta); + MEstimatorScale += eta * (etaValues[0] + etaValues[1] + etaValues[2] + etaValues[3] + + etaValues[4] + etaValues[5] + etaValues[6] + etaValues[7]); + +#endif + float CV_DECL_ALIGNED(32) wbuf[4];// + if (j == 0) + { +#ifdef CV_SSE2 + _mm_store_ps(wbuf, mmSumW1); + w1 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumW2); + w2 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumDI); + dI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumI); + sumI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumIx); + sumIx = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumIy); + sumIy = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumW); + sumW = wbuf[0] + wbuf[1] + wbuf[2] + 
wbuf[3]; +#endif + sumIx *= -FLT_SCALE; + sumIy *= -FLT_SCALE; + sumI *= FLT_SCALE; + sumW *= FLT_SCALE; + w1 *= -FLT_SCALE; + w2 *= -FLT_SCALE; + dI *= FLT_SCALE; + + +#ifdef CV_SSE2 + float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];// + + _mm_store_ps(A11buf, mmAxx); + _mm_store_ps(A12buf, mmAxy); + _mm_store_ps(A22buf, mmAyy); + + + A11 = A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; + A12 = A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; + A22 = A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; +#endif + A11 *= FLT_SCALE; + A12 *= FLT_SCALE; + A22 *= FLT_SCALE; + + + } + + +#ifdef CV_SSE2 + float CV_DECL_ALIGNED(16) bbuf[4]; + _mm_store_ps(bbuf, _mm_add_ps(qb0, qb1)); + b1 = bbuf[0] + bbuf[2]; + b2 = bbuf[1] + bbuf[3]; + _mm_store_ps(bbuf, qb2); + b3 = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; + _mm_store_ps(bbuf, qb3); + b4 = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; + +#endif + + mismatchMat.at(0, 0) = b1 * FLT_SCALE; + mismatchMat.at(1, 0) = b2 * FLT_SCALE; + mismatchMat.at(2, 0) = -b3 * FLT_SCALE; + mismatchMat.at(3, 0) = -b4 * FLT_SCALE; + + D = -A12 * A12*sumI*sumI + dI * sumW*A12*A12 + 2 * A12*sumI*sumIx*w2 + 2 * A12*sumI*sumIy*w1 + - 2 * dI*A12*sumIx*sumIy - 2 * sumW*A12*w1*w2 + A11 * A22*sumI*sumI - 2 * A22*sumI*sumIx*w1 + - 2 * A11*sumI*sumIy*w2 - sumIx * sumIx*w2*w2 + A22 * dI*sumIx*sumIx + 2 * sumIx*sumIy*w1*w2 + - sumIy * sumIy*w1*w1 + A11 * dI*sumIy*sumIy + A22 * sumW*w1*w1 + A11 * sumW*w2*w2 - A11 * A22*dI*sumW; + + float sqrtVal = std::sqrt((A11 - A22)*(A11 - A22) + 4.f*A12*A12); + minEigValue = (A22 + A11 - sqrtVal) / (2.0f*winArea);; + maxEigValue = (A22 + A11 + sqrtVal) / (2.0f*winArea); + if (minEigValue < minEigThreshold || std::abs(D) < FLT_EPSILON) + { + if (level == 0 && status) + status[ptidx] = 0; + if (level > 0) + { + nextPts[ptidx] = backUpNextPt; + gainVecs[ptidx] = backUpGain; + } + break; + } + + D = (1.f / D); + + invTensorMat.at(0, 0) = (A22*sumI*sumI - 2 * sumI*sumIy*w2 + dI * sumIy*sumIy + sumW * w2*w2 - A22 * dI*sumW)* D; 
invTensorMat.at(0, 1) = (A12*dI*sumW - A12 * sumI * sumI - dI * sumIx*sumIy + sumI * sumIx*w2 + sumI * sumIy*w1 - sumW * w1*w2)* D; + invTensorMat.at(0, 2) = (A12*sumI*sumIy - sumIy * sumIy*w1 - A22 * sumI*sumIx - A12 * sumW*w2 + A22 * sumW*w1 + sumIx * sumIy*w2)* D; + invTensorMat.at(0, 3) = (A22*dI*sumIx - A12 * dI*sumIy - sumIx * w2*w2 + A12 * sumI*w2 - A22 * sumI*w1 + sumIy * w1*w2) * D; + invTensorMat.at(1, 0) = invTensorMat.at(0, 1); + invTensorMat.at(1, 1) = (A11*sumI * sumI - 2 * sumI*sumIx*w1 + dI * sumIx * sumIx + sumW * w1*w1 - A11 * dI*sumW) * D; + invTensorMat.at(1, 2) = (A12*sumI*sumIx - A11 * sumI*sumIy - sumIx * sumIx*w2 + A11 * sumW*w2 - A12 * sumW*w1 + sumIx * sumIy*w1) * D; + invTensorMat.at(1, 3) = (A11*dI*sumIy - sumIy * w1*w1 - A12 * dI*sumIx - A11 * sumI*w2 + A12 * sumI*w1 + sumIx * w1*w2)* D; + invTensorMat.at(2, 0) = invTensorMat.at(0, 2); + invTensorMat.at(2, 1) = invTensorMat.at(1, 2); + invTensorMat.at(2, 2) = (sumW*A12*A12 - 2 * A12*sumIx*sumIy + A22 * sumIx*sumIx + A11 * sumIy*sumIy - A11 * A22*sumW)* D; + invTensorMat.at(2, 3) = (A11*A22*sumI - A12 * A12*sumI - A11 * sumIy*w2 + A12 * sumIx*w2 + A12 * sumIy*w1 - A22 * sumIx*w1)* D; + invTensorMat.at(3, 0) = invTensorMat.at(0, 3); + invTensorMat.at(3, 1) = invTensorMat.at(1, 3); + invTensorMat.at(3, 2) = invTensorMat.at(2, 3); + invTensorMat.at(3, 3) = (dI*A12*A12 - 2 * A12*w1*w2 + A22 * w1*w1 + A11 * w2*w2 - A11 * A22*dI)* D; + + resultMat = invTensorMat * mismatchMat; + + + Point2f delta(-resultMat.at(0), -resultMat.at(1)); + Point2f deltaGain(resultMat.at(2), resultMat.at(3)); + + + + + if (j == 0) + prevGain = deltaGain; + gainVec += deltaGain * 0.8; + nextPt += delta * 0.8; + nextPts[ptidx] = nextPt + halfWin; + gainVecs[ptidx] = gainVec; + + + + cv::Point2f totalDistance = backUpNextPt - nextPts[ptidx]; + + if (// j > 1 && + (std::abs(delta.x - prevDelta.x) < 0.01 && std::abs(delta.y - prevDelta.y) < 0.01) + || ((delta.ddot(delta) <= 0.001) && std::abs(prevGain.x - deltaGain.x) < 
0.01) + ) + { + nextPts[ptidx] -= delta * 0.5f; + gainVecs[ptidx] -= deltaGain * 0.5f; + break; + } + + prevDelta = delta; + prevGain = deltaGain; + } + + } + + } + + const Mat* prevImg; + const Mat* nextImg; + const Mat* prevDeriv; + const Mat* rgbPrevImg; + const Mat* rgbNextImg; + const Point2f* prevPts; + Point2f* nextPts; + uchar* status; + cv::Point2f* gainVecs; + float* err; + int maxWinSize; + int minWinSize; + TermCriteria criteria; + int level; + int maxLevel; + int windowType; + float minEigThreshold; + bool useInitialFlow; + std::vector paramReset; + int crossSegmentationThreshold; + }; +} +} +} +} +#endif \ No newline at end of file diff --git a/modules/optflow/src/rlof/rlof_invokerbase.hpp b/modules/optflow/src/rlof/rlof_invokerbase.hpp new file mode 100644 index 00000000000..cb6243bf2aa --- /dev/null +++ b/modules/optflow/src/rlof/rlof_invokerbase.hpp @@ -0,0 +1,309 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// + // + // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. + // + // By downloading, copying, installing or using the software you agree to this license. + // If you do not agree to this license, do not download, install, + // copy or use the software. + // + // + // License Agreement + // For Open Source Computer Vision Library + // + // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. + // Copyright (C) 2009, Willow Garage Inc., all rights reserved. + // Third party copyrights are property of their respective owners. + // + // Redistribution and use in source and binary forms, with or without modification, + // are permitted provided that the following conditions are met: + // + // * Redistribution's of source code must retain the above copyright notice, + // this list of conditions and the following disclaimer. 
+ // + // * Redistribution's in binary form must reproduce the above copyright notice, + // this list of conditions and the following disclaimer in the documentation + // and/or other materials provided with the distribution. + // + // * The name of the copyright holders may not be used to endorse or promote products + // derived from this software without specific prior written permission. + // + // This software is provided by the copyright holders and contributors "as is" and + // any express or implied warranties, including, but not limited to, the implied + // warranties of merchantability and fitness for a particular purpose are disclaimed. + // In no event shall the Intel Corporation or contributors be liable for any direct, + // indirect, incidental, special, exemplary, or consequential damages + // (including, but not limited to, procurement of substitute goods or services; + // loss of use, data, or profits; or business interruption) however caused + // and on any theory of liability, whether in contract, strict liability, + // or tort (including negligence or otherwise) arising in any way out of + // the use of this software, even if advised of the possibility of such damage. + // + //M*/ +#ifndef _RLOF_INVOKERBASE_HPP_ +#define _RLOF_INVOKERBASE_HPP_ + + +#define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n)) +#define FLT_RESCALE 1 +#ifndef _MSC_VER +#include +#endif + +#include "opencv2/core.hpp" +#include "opencv2/video.hpp" +#include "rlof_localflow.h" +#include +#include "opencv2/core/hal/intrin.hpp" +using namespace std; +using namespace cv; + + +namespace cv +{ + namespace optflow + { + typedef short deriv_type; + inline void get4BitMask(const int & width, __m128i & mask) + { + int noBits = width - static_cast(floor(width / 4.f) * 4.f); + unsigned int val[4]; + for (int n = 0; n < 4; n++) + { + val[n] = (noBits > n) ? 
(std::numeric_limits::max()) : 0; + } + mask = _mm_set_epi32(val[3], val[2], val[1], val[0]); + + } + inline void getWBitMask(const int & width, __m128i & t0, __m128i & t1, __m128i & t2) + { + int noBits = width - static_cast(floor(width / 8.f) * 8.f); + unsigned short val[8]; + for (int n = 0; n < 8; n++) + { + val[n] = (noBits > n) ? (0xffff) : 0; + } + t1 = _mm_set_epi16(val[7], val[7], val[6], val[6], val[5], val[5], val[4], val[4]); + t0 = _mm_set_epi16(val[3], val[3], val[2], val[2], val[1], val[1], val[0], val[0]); + t2 = _mm_set_epi16(val[7], val[6], val[5], val[4], val[3], val[2], val[1], val[0]); + } + + typedef uchar tMaskType; +#define tCVMaskType CV_8UC1 +#define MaskSet 0xffffffff + + inline bool notSameColor(const cv::Point3_ & ref, int r, int c, const cv::Mat & img, int winSize, int threshold) + { + if (r >= img.rows + winSize || c >= img.cols + winSize || r < -winSize || c < -winSize) return true; + int step = static_cast(img.step1()); + const cv::Point3_ * tval = img.ptr>(); + tval += c; + tval += r * step / 3; + int R = std::abs((int)ref.x - (int)tval->x); + int G = std::abs((int)ref.y - (int)tval->y); + int B = std::abs((int)ref.z - (int)tval->z); + int diff = MAX(R, MAX(G, B)); + return diff > threshold; + } + + /*! 
Estimate the mask with points of the same color as the middle point + *\param prevGray input CV_8UC1 + *\param prevImg input CV_8UC3 + *\param nextImg input CV_8UC3 + *\param prevPoint global position of the middle point of the mask window + *\param nextPoint global position of the middle point of the mask window + *\param winPointMask mask matrice with 1 labeling points contained and 0 point is not contained by the segment + *\param noPoints number of points contained by the segment + *\param winRoi rectangle of the region of interesset in global coordinates + *\param minWinSize, + *\param threshold, + *\param useBothImages + */ + static void getLocalPatch( + const cv::Mat & prevImg, + const cv::Point2i & prevPoint, // feature points + cv::Mat & winPointMask, + int & noPoints, + cv::Rect & winRoi, + int minWinSize, + int threshold) + { + int maxWinSizeH = (winPointMask.cols - 1) / 2; + winRoi.x = prevPoint.x; + winRoi.y = prevPoint.y; + winRoi.width = winPointMask.cols; + winRoi.height = winPointMask.rows; + + if (minWinSize == winPointMask.cols) + { + winRoi.x = prevPoint.x - maxWinSizeH; + winRoi.y = prevPoint.y - maxWinSizeH; + winPointMask.setTo(1); + noPoints = winPointMask.size().area(); + return; + } + noPoints = 0; + int c = prevPoint.x; + int r = prevPoint.y; + int c_left = c - 1; + int c_right = c + 1; + int r_top = r - 1; + int r_bottom = r; + int border_left = c - maxWinSizeH; + int border_right = c + maxWinSizeH; + int border_top = r - maxWinSizeH; + int border_bottom = r + maxWinSizeH; + int c_local_diff = prevPoint.x - maxWinSizeH; + int r_local_diff = prevPoint.y - maxWinSizeH; + int _c = c - c_local_diff; + int _r = r - r_local_diff; + int min_r = _r; + int max_r = _r; + int min_c = _c; + int max_c = _c; + // horizontal line + if (r < 0 || r >= prevImg.rows || c < 0 || c >= prevImg.cols) + { + noPoints = 0; + return; + } + cv::Point3_ val1 = prevImg.at>(r, c); // middle grayvalue + cv::Point3_ tval; + + //vertical line + for (int dr = r_top; dr >= 
border_top; dr--) + { + if (notSameColor(val1, dr, c, prevImg, maxWinSizeH, threshold)) + break; + + int _dr = dr - r_local_diff; + min_r = MIN(min_r, _dr); + max_r = MAX(max_r, _dr); + winPointMask.at(_dr, _c) = 1; + noPoints++; + } + for (int dr = r_bottom; dr < border_bottom; dr++) + { + if (notSameColor(val1, dr, c, prevImg, maxWinSizeH, threshold) + ) + break; + + int _dr = dr - r_local_diff; + min_r = MIN(min_r, _dr); + max_r = MAX(max_r, _dr); + winPointMask.at(_dr, _c) = 1; + noPoints++; + } + // accumulate along the vertical line and the search line that was still labled + for (int dr = min_r + r_local_diff; dr <= max_r + r_local_diff; dr++) + { + int _dr = dr - r_local_diff; + if (winPointMask.at(_dr, _c) == 0) + { + winPointMask.row(_dr).setTo(0); + continue; + } + bool skip = false; + int _dc = c_right - c_local_diff; + for (int dc = c_right; dc < border_right; dc++, _dc++) + { + if (skip == false) + { + if (notSameColor(val1, dr, dc, prevImg, maxWinSizeH, threshold)) + skip = true; + } + if (skip == false) + { + min_c = MIN(min_c, _dc); + max_c = MAX(max_c, _dc); + winPointMask.at(_dr, _dc) = 1; + noPoints++; + } + else + winPointMask.at(_dr, _dc) = 0; + } + + skip = false; + _dc = c_left - c_local_diff; + for (int dc = c_left; dc >= border_left; dc--, _dc--) + { + if (skip == false) + { + if (notSameColor(val1, dr, dc, prevImg, maxWinSizeH, threshold)) + skip = true; + } + if (skip == false) + { + min_c = MIN(min_c, _dc); + max_c = MAX(max_c, _dc); + winPointMask.at(_dr, _dc) = 1; + noPoints++; + } + else + winPointMask.at(_dr, _dc) = 0; + } + } + + + // get the initial small window + if (noPoints < minWinSize * minWinSize) + { + cv::Rect roi(winPointMask.cols / 2 - (minWinSize - 1) / 2, + winPointMask.rows / 2 - (minWinSize - 1) / 2, + minWinSize, minWinSize); + cv::Mat(winPointMask, roi).setTo(1); + min_c = MIN(MIN(min_c, roi.tl().x), roi.br().x - 1); + max_c = MAX(MAX(max_c, roi.tl().x), roi.br().x - 1); + min_r = MIN(MIN(min_r, roi.tl().y), 
roi.br().y - 1); + max_r = MAX(MAX(max_r, roi.tl().y), roi.br().y - 1); + noPoints += minWinSize * minWinSize; + } + winRoi.x = c_local_diff + min_c; + winRoi.y = r_local_diff + min_r; + winRoi.width = max_c - min_c + 1; + winRoi.height = max_r - min_r + 1; + winPointMask = winPointMask(cv::Rect(min_c, min_r, winRoi.width, winRoi.height)); + } + + inline bool calcWinMaskMat( const cv::Mat & BI, + const int windowType, + const cv::Point2i & iprevPt, + cv::Mat & winMaskMat, + cv::Size & winSize, + cv::Point2f & halfWin, + int & winArea, + const int minWinSize, + const int maxWinSize, + const int crossSegmentationThreshold) + { + + if (windowType == RLOFOpticalFlowParameter::SR_CROSS && maxWinSize != minWinSize) + { + // patch generation + cv::Rect winRoi; + getLocalPatch(BI, iprevPt, winMaskMat, winArea, winRoi, minWinSize, crossSegmentationThreshold); + if (winArea == 0) + return false; + winSize = winRoi.size(); + halfWin = Point2f(static_cast(iprevPt.x - winRoi.tl().x), + static_cast(iprevPt.y - winRoi.tl().y)); + } + else + { + winSize = cv::Size(maxWinSize, maxWinSize); + halfWin = Point2f((winSize.width - 1) / 2.f, (winSize.height - 1) / 2.f); + winMaskMat.setTo(1); + } + return true; + } + + inline short estimateScale(cv::Mat & residuals) + { + cv::Mat absMat = cv::abs(residuals); + return quickselect(absMat, absMat.rows / 2); + } + } + + +} + +#endif \ No newline at end of file diff --git a/modules/optflow/src/rlof/rlof_localflow.cpp b/modules/optflow/src/rlof/rlof_localflow.cpp new file mode 100644 index 00000000000..3f995fd5477 --- /dev/null +++ b/modules/optflow/src/rlof/rlof_localflow.cpp @@ -0,0 +1,586 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// + // + // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. + // + // By downloading, copying, installing or using the software you agree to this license. 
+ // If you do not agree to this license, do not download, install, + // copy or use the software. + // + // + // License Agreement + // For Open Source Computer Vision Library + // + // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. + // Copyright (C) 2009, Willow Garage Inc., all rights reserved. + // Third party copyrights are property of their respective owners. + // + // Redistribution and use in source and binary forms, with or without modification, + // are permitted provided that the following conditions are met: + // + // * Redistribution's of source code must retain the above copyright notice, + // this list of conditions and the following disclaimer. + // + // * Redistribution's in binary form must reproduce the above copyright notice, + // this list of conditions and the following disclaimer in the documentation + // and/or other materials provided with the distribution. + // + // * The name of the copyright holders may not be used to endorse or promote products + // derived from this software without specific prior written permission. + // + // This software is provided by the copyright holders and contributors "as is" and + // any express or implied warranties, including, but not limited to, the implied + // warranties of merchantability and fitness for a particular purpose are disclaimed. + // In no event shall the Intel Corporation or contributors be liable for any direct, + // indirect, incidental, special, exemplary, or consequential damages + // (including, but not limited to, procurement of substitute goods or services; + // loss of use, data, or profits; or business interruption) however caused + // and on any theory of liability, whether in contract, strict liability, + // or tort (including negligence or otherwise) arising in any way out of + // the use of this software, even if advised of the possibility of such damage. 
+ // + //M*/ +#include "precomp.hpp" +#include "rlof_localflow.h" +#ifdef HAVE_OPENCV_CALIB3D +#include "opencv2/calib3d.hpp" +#endif +#include "berlof_invoker.hpp" +#include "rlof_invoker.hpp" +#include "plk_invoker.hpp" +using namespace std; +using namespace cv; + +namespace cv +{ + namespace detail + { + + typedef short deriv_type; + } +namespace +{ + static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst) + { + using namespace cv; + using cv::detail::deriv_type; + int rows = src.rows, cols = src.cols, cn = src.channels(), colsn = cols * cn, depth = src.depth(); + CV_Assert(depth == CV_8U); + dst.create(rows, cols, CV_MAKETYPE(DataType::depth, cn * 2)); + + int x, y, delta = (int)alignSize((cols + 2)*cn, 16); + AutoBuffer _tempBuf(delta * 2 + 64); + deriv_type *trow0 = alignPtr(_tempBuf.data() + cn, 16), *trow1 = alignPtr(trow0 + delta, 16); + +#if CV_SIMD128 + v_int16x8 c3 = v_setall_s16(3), c10 = v_setall_s16(10); + bool haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); +#endif + + for (y = 0; y < rows; y++) + { + const uchar* srow0 = src.ptr(y > 0 ? y - 1 : rows > 1 ? 1 : 0); + const uchar* srow1 = src.ptr(y); + const uchar* srow2 = src.ptr(y < rows - 1 ? y + 1 : rows > 1 ? rows - 2 : 0); + deriv_type* drow = dst.ptr(y); + + // do vertical convolution + x = 0; +#if CV_SIMD128 + if (haveSIMD) + { + for (; x <= colsn - 8; x += 8) + { + v_int16x8 s0 = v_reinterpret_as_s16(v_load_expand(srow0 + x)); + v_int16x8 s1 = v_reinterpret_as_s16(v_load_expand(srow1 + x)); + v_int16x8 s2 = v_reinterpret_as_s16(v_load_expand(srow2 + x)); + + v_int16x8 t1 = s2 - s0; + v_int16x8 t0 = v_mul_wrap(s0 + s2, c3) + v_mul_wrap(s1, c10); + + v_store(trow0 + x, t0); + v_store(trow1 + x, t1); + } + } +#endif + + for (; x < colsn; x++) + { + int t0 = (srow0[x] + srow2[x]) * 3 + srow1[x] * 10; + int t1 = srow2[x] - srow0[x]; + trow0[x] = (deriv_type)t0; + trow1[x] = (deriv_type)t1; + } + + // make border + int x0 = (cols > 1 ? 
1 : 0)*cn, x1 = (cols > 1 ? cols - 2 : 0)*cn; + for (int k = 0; k < cn; k++) + { + trow0[-cn + k] = trow0[x0 + k]; trow0[colsn + k] = trow0[x1 + k]; + trow1[-cn + k] = trow1[x0 + k]; trow1[colsn + k] = trow1[x1 + k]; + } + + // do horizontal convolution, interleave the results and store them to dst + x = 0; +#if CV_SIMD128 + if (haveSIMD) + { + for (; x <= colsn - 8; x += 8) + { + v_int16x8 s0 = v_load(trow0 + x - cn); + v_int16x8 s1 = v_load(trow0 + x + cn); + v_int16x8 s2 = v_load(trow1 + x - cn); + v_int16x8 s3 = v_load(trow1 + x); + v_int16x8 s4 = v_load(trow1 + x + cn); + + v_int16x8 t0 = s1 - s0; + v_int16x8 t1 = v_mul_wrap(s2 + s4, c3) + v_mul_wrap(s3, c10); + + v_store_interleave((drow + x * 2), t0, t1); + } + } +#endif + for (; x < colsn; x++) + { + deriv_type t0 = (deriv_type)(trow0[x + cn] - trow0[x - cn]); + deriv_type t1 = (deriv_type)((trow1[x + cn] + trow1[x - cn]) * 3 + trow1[x] * 10); + drow[x * 2] = t0; drow[x * 2 + 1] = t1; + } + } + } + +}//namespace + namespace optflow + { + bool isrobust(const RLOFOpticalFlowParameter & param) + { + return (param.normSigma0 != std::numeric_limits::max()) + && (param.normSigma1 != std::numeric_limits::max()); + } + std::vector get_norm(float sigma0, float sigma1) + { + std::vector result = { sigma0, sigma1, sigma0 / (sigma0 - sigma1), sigma0 * sigma1 }; + return result; + } + + + int buildOpticalFlowPyramidScale(InputArray _img, OutputArrayOfArrays pyramid, Size winSize, int maxLevel, bool withDerivatives, + int pyrBorder, int derivBorder, bool tryReuseInputImage, float levelScale[2]) + { + Mat img = _img.getMat(); + CV_Assert(img.depth() == CV_8U && winSize.width > 2 && winSize.height > 2); + int pyrstep = withDerivatives ? 
2 : 1; + + pyramid.create(1, (maxLevel + 1) * pyrstep, 0 /*type*/, -1, true); + + int derivType = CV_MAKETYPE(DataType::depth, img.channels() * 2); + + //level 0 + bool lvl0IsSet = false; + if (tryReuseInputImage && img.isSubmatrix() && (pyrBorder & BORDER_ISOLATED) == 0) + { + Size wholeSize; + Point ofs; + img.locateROI(wholeSize, ofs); + if (ofs.x >= winSize.width && ofs.y >= winSize.height + && ofs.x + img.cols + winSize.width <= wholeSize.width + && ofs.y + img.rows + winSize.height <= wholeSize.height) + { + pyramid.getMatRef(0) = img; + lvl0IsSet = true; + } + } + + if (!lvl0IsSet) + { + Mat& temp = pyramid.getMatRef(0); + + if (!temp.empty()) + temp.adjustROI(winSize.height, winSize.height, winSize.width, winSize.width); + if (temp.type() != img.type() || temp.cols != winSize.width * 2 + img.cols || temp.rows != winSize.height * 2 + img.rows) + temp.create(img.rows + winSize.height * 2, img.cols + winSize.width * 2, img.type()); + + if (pyrBorder == BORDER_TRANSPARENT) + img.copyTo(temp(Rect(winSize.width, winSize.height, img.cols, img.rows))); + else + copyMakeBorder(img, temp, winSize.height, winSize.height, winSize.width, winSize.width, pyrBorder); + temp.adjustROI(-winSize.height, -winSize.height, -winSize.width, -winSize.width); + } + + Size sz = img.size(); + Mat prevLevel = pyramid.getMatRef(0); + Mat thisLevel = prevLevel; + + for (int level = 0; level <= maxLevel; ++level) + { + if (level != 0) + { + Mat& temp = pyramid.getMatRef(level * pyrstep); + + if (!temp.empty()) + temp.adjustROI(winSize.height, winSize.height, winSize.width, winSize.width); + if (temp.type() != img.type() || temp.cols != winSize.width * 2 + sz.width || temp.rows != winSize.height * 2 + sz.height) + temp.create(sz.height + winSize.height * 2, sz.width + winSize.width * 2, img.type()); + + thisLevel = temp(Rect(winSize.width, winSize.height, sz.width, sz.height)); + pyrDown(prevLevel, thisLevel, sz); + + if (pyrBorder != BORDER_TRANSPARENT) + copyMakeBorder(thisLevel, temp, 
winSize.height, winSize.height, winSize.width, winSize.width, pyrBorder | BORDER_ISOLATED); + temp.adjustROI(-winSize.height, -winSize.height, -winSize.width, -winSize.width); + } + + if (withDerivatives) + { + Mat& deriv = pyramid.getMatRef(level * pyrstep + 1); + + if (!deriv.empty()) + deriv.adjustROI(winSize.height, winSize.height, winSize.width, winSize.width); + if (deriv.type() != derivType || deriv.cols != winSize.width * 2 + sz.width || deriv.rows != winSize.height * 2 + sz.height) + deriv.create(sz.height + winSize.height * 2, sz.width + winSize.width * 2, derivType); + + Mat derivI = deriv(Rect(winSize.width, winSize.height, sz.width, sz.height)); + calcSharrDeriv(thisLevel, derivI); + + if (derivBorder != BORDER_TRANSPARENT) + copyMakeBorder(derivI, deriv, winSize.height, winSize.height, winSize.width, winSize.width, derivBorder | BORDER_ISOLATED); + deriv.adjustROI(-winSize.height, -winSize.height, -winSize.width, -winSize.width); + } + + sz = Size(static_cast((sz.width + 1) / levelScale[0]), + static_cast((sz.height + 1) / levelScale[1])); + if (sz.width <= winSize.width || sz.height <= winSize.height) + { + pyramid.create(1, (level + 1) * pyrstep, 0 /*type*/, -1, true);//check this + return level; + } + + prevLevel = thisLevel; + } + + return maxLevel; + } + int CImageBuffer::buildPyramid(cv::Size winSize, int maxLevel, float levelScale[2]) + { + if (m_Overwrite == false) + return m_maxLevel; + m_maxLevel = buildOpticalFlowPyramidScale(m_Image, m_ImagePyramid, winSize, maxLevel, false, 4, 0, true, levelScale); + return m_maxLevel; + } + + void calcLocalOpticalFlowCore( + Ptr prevPyramids[2], + Ptr currPyramids[2], + InputArray _prevPts, + InputOutputArray _nextPts, + const RLOFOpticalFlowParameter & param) + { + + bool useAdditionalRGB = param.supportRegionType == RLOFOpticalFlowParameter::SR_CROSS; + int iWinSize = param.largeWinSize; + int winSizes[2] = { iWinSize, iWinSize }; + if (param.supportRegionType != RLOFOpticalFlowParameter::SR_FIXED) + { 
+ winSizes[0] = param.smallWinSize; + } + //cv::Size winSize(iWinSize, iWinSize); + cv::TermCriteria criteria(cv::TermCriteria::COUNT | cv::TermCriteria::EPS, param.maxIteration, 0.01); + std::vector rlofNorm = get_norm(param.normSigma0, param.normSigma1); + CV_Assert(winSizes[0] <= winSizes[1]); + + Mat prevPtsMat = _prevPts.getMat(); + const int derivDepth = DataType::depth; + + CV_Assert(param.maxLevel >= 0 && iWinSize > 2); + + int level = 0, npoints; + CV_Assert((npoints = prevPtsMat.checkVector(2, CV_32F, true)) >= 0); + + if (!(param.useInitialFlow)) + _nextPts.create(prevPtsMat.size(), prevPtsMat.type(), -1, true); + + Mat nextPtsMat = _nextPts.getMat(); + CV_Assert(nextPtsMat.checkVector(2, CV_32F, true) == npoints); + + const Point2f* prevPts = (const Point2f*)prevPtsMat.data; + Point2f* nextPts = (Point2f*)nextPtsMat.data; + std::vector status(npoints); + std::vector err(npoints); + std::vector gainPts(npoints); + + float levelScale[2] = { 2.f,2.f }; + + int maxLevel = prevPyramids[0]->buildPyramid(cv::Size(iWinSize, iWinSize), param.maxLevel, levelScale); + + maxLevel = currPyramids[0]->buildPyramid(cv::Size(iWinSize, iWinSize), maxLevel, levelScale); + if (useAdditionalRGB) + { + prevPyramids[1]->buildPyramid(cv::Size(iWinSize, iWinSize), maxLevel, levelScale); + currPyramids[1]->buildPyramid(cv::Size(iWinSize, iWinSize), maxLevel, levelScale); + } + + if ((criteria.type & TermCriteria::COUNT) == 0) + criteria.maxCount = 30; + else + criteria.maxCount = std::min(std::max(criteria.maxCount, 0), 100); + if ((criteria.type & TermCriteria::EPS) == 0) + criteria.epsilon = 0.001; + else + criteria.epsilon = std::min(std::max(criteria.epsilon, 0.), 10.); + criteria.epsilon *= criteria.epsilon; + + // dI/dx ~ Ix, dI/dy ~ Iy + Mat derivIBuf; + derivIBuf.create(prevPyramids[0]->m_ImagePyramid[0].rows + iWinSize * 2, prevPyramids[0]->m_ImagePyramid[0].cols + iWinSize * 2, CV_MAKETYPE(derivDepth, prevPyramids[0]->m_ImagePyramid[0].channels() * 2)); + + for (level 
= maxLevel; level >= 0; level--) + { + Mat derivI; + Size imgSize = prevPyramids[0]->getImage(level).size(); + Mat _derivI(imgSize.height + iWinSize * 2, imgSize.width + iWinSize * 2, derivIBuf.type(), derivIBuf.data); + derivI = _derivI(Rect(iWinSize, iWinSize, imgSize.width, imgSize.height)); + calcSharrDeriv(prevPyramids[0]->getImage(level), derivI); + copyMakeBorder(derivI, _derivI, iWinSize, iWinSize, iWinSize, iWinSize, BORDER_CONSTANT | BORDER_ISOLATED); + + cv::Mat tRGBPrevPyr; + cv::Mat tRGBNextPyr; + if (useAdditionalRGB) + { + tRGBPrevPyr = prevPyramids[1]->getImage(level); + tRGBNextPyr = prevPyramids[1]->getImage(level); + + prevPyramids[1]->m_Overwrite = false; + currPyramids[1]->m_Overwrite = false; + } + + cv::Mat prevImage = prevPyramids[0]->getImage(level); + cv::Mat currImage = currPyramids[0]->getImage(level); + + // apply plk like tracker + if (isrobust(param) == false) + { + if (param.useIlluminationModel) + { + cv::parallel_for_(cv::Range(0, npoints), + plk::radial::TrackerInvoker( + prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, + prevPts, nextPts, &status[0], &err[0], &gainPts[0], + level, maxLevel, winSizes, + param.maxIteration, + param.useInitialFlow, + param.supportRegionType, + param.minEigenValue, + param.crossSegmentationThreshold)); + } + else + { + if (param.solverType == RLOFOpticalFlowParameter::SolverType::ST_STANDART) + { + cv::parallel_for_(cv::Range(0, npoints), + plk::ica::TrackerInvoker( + prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, + prevPts, nextPts, &status[0], &err[0], + level, maxLevel, winSizes, + param.maxIteration, + param.useInitialFlow, + param.supportRegionType, + param.crossSegmentationThreshold, param.minEigenValue)); + } + else + { + cv::parallel_for_(cv::Range(0, npoints), + beplk::ica::TrackerInvoker(prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, + prevPts, nextPts, &status[0], &err[0], + level, maxLevel, winSizes, + param.maxIteration, + param.useInitialFlow, + 
param.supportRegionType, + param.crossSegmentationThreshold, + param.minEigenValue)); + } + } + } + // for robust models + else + { + if (param.useIlluminationModel) + { + if (param.solverType == RLOFOpticalFlowParameter::SolverType::ST_STANDART) + { + cv::parallel_for_(cv::Range(0, npoints), + rlof::radial::TrackerInvoker( + prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, + prevPts, nextPts, &status[0], &err[0], &gainPts[0], + level, maxLevel, winSizes, + param.maxIteration, + param.useInitialFlow, + param.supportRegionType, + rlofNorm, + param.minEigenValue, + param.crossSegmentationThreshold)); + } + else + { + cv::parallel_for_(cv::Range(0, npoints), + berlof::radial::TrackerInvoker(prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, + prevPts, nextPts, &status[0], &err[0], &gainPts[0], + level, maxLevel, winSizes, + param.maxIteration, + param.useInitialFlow, + param.supportRegionType, + param.crossSegmentationThreshold, + rlofNorm, + param.minEigenValue)); + } + } + else + { + + if (param.solverType == RLOFOpticalFlowParameter::SolverType::ST_STANDART) + { + cv::parallel_for_(cv::Range(0, npoints), + rlof::ica::TrackerInvoker( + prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, + prevPts, nextPts, &status[0], &err[0], + level, maxLevel, winSizes, + param.maxIteration, + param.useInitialFlow, + param.supportRegionType, + rlofNorm, + param.minEigenValue, + param.crossSegmentationThreshold)); + } + else + { + cv::parallel_for_(cv::Range(0, npoints), + berlof::ica::TrackerInvoker(prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, + prevPts, nextPts, &status[0], &err[0], + level, maxLevel, winSizes, + param.maxIteration, + param.useInitialFlow, + param.supportRegionType, + param.crossSegmentationThreshold, + rlofNorm, + param.minEigenValue)); + } + + } + } + + prevPyramids[0]->m_Overwrite = true; + currPyramids[0]->m_Overwrite = true; + } + } + + void preprocess(Ptr prevPyramids[2], + Ptr currPyramids[2], + const std::vector & prevPoints, + 
std::vector & currPoints, + const RLOFOpticalFlowParameter & param) + { + cv::Mat mask, homography; + if (param.useGlobalMotionPrior == false) + return; + + currPoints.resize(prevPoints.size()); + + RLOFOpticalFlowParameter gmeTrackerParam = param; + gmeTrackerParam.useGlobalMotionPrior = false; + gmeTrackerParam.largeWinSize = 17; + // use none robust tracker for global motion estimation since it is faster + gmeTrackerParam.normSigma0 = std::numeric_limits::max(); + gmeTrackerParam.maxIteration = MAX(15, param.maxIteration); + gmeTrackerParam.minEigenValue = 0.000001f; + + std::vector gmPrevPoints, gmCurrPoints, gmCheckPoints; + + // Initialize point grid + int stepr = prevPyramids[0]->m_Image.rows / 30; + int stepc = prevPyramids[0]->m_Image.rows / 40; + for (int r = stepr / 2; r < prevPyramids[0]->m_Image.rows; r += stepr) + { + for (int c = stepc / 2; c < prevPyramids[0]->m_Image.cols; c += stepc) + { + gmPrevPoints.push_back(cv::Point2f(static_cast(c), static_cast(r))); + } + } + + // perform motion estimation + calcLocalOpticalFlowCore(prevPyramids, currPyramids, gmPrevPoints, gmCurrPoints, gmeTrackerParam); + calcLocalOpticalFlowCore(currPyramids, prevPyramids, gmCurrPoints, gmCheckPoints, gmeTrackerParam); + + cv::Mat prevPointsMat(static_cast(gmPrevPoints.size()), 1, CV_32FC2); + cv::Mat currPointsMat(static_cast(gmPrevPoints.size()), 1, CV_32FC2); + cv::Mat distMat(static_cast(gmPrevPoints.size()), 1, CV_32FC1); + + // Forward backward confidence to estimate optimal ransac reprojection error + int noPoints = 0; + for (unsigned int n = 0; n < gmPrevPoints.size(); n++) + { + cv::Point2f dist = gmCheckPoints[n] - gmPrevPoints[n]; + + cv::Point2f flow = gmCurrPoints[n] - gmPrevPoints[n]; + prevPointsMat.at(noPoints) = gmPrevPoints[n]; + currPointsMat.at(noPoints) = gmCurrPoints[n]; + distMat.at(noPoints) = flow.x * flow.x + flow.y* flow.y; + if (isnan(distMat.at(noPoints)) == false) + noPoints++; + } + + float medianDist = (param.globalMotionRansacThreshold 
== 0) ? 1.f : + quickselect(distMat, static_cast(noPoints * static_cast(param.globalMotionRansacThreshold) / 100.f)); + medianDist = sqrt(medianDist); + + if (noPoints < 8) + return; + + cv::findHomography(prevPointsMat(cv::Rect(0, 0, 1, noPoints)), currPointsMat(cv::Rect(0, 0, 1, noPoints)), cv::RANSAC, medianDist, mask).convertTo(homography, CV_32FC1); + + if (homography.empty()) + return; + + cv::perspectiveTransform(prevPoints, currPoints, homography); + } + + void calcLocalOpticalFlow( + const Mat prevImage, + const Mat currImage, + Ptr prevPyramids[2], + Ptr currPyramids[2], + const std::vector & prevPoints, + std::vector & currPoints, + const RLOFOpticalFlowParameter & param) + { + if (prevImage.empty() == false && currImage.empty()== false) + { + prevPyramids[0]->m_Overwrite = true; + currPyramids[0]->m_Overwrite = true; + prevPyramids[1]->m_Overwrite = true; + currPyramids[1]->m_Overwrite = true; + if (prevImage.type() == CV_8UC3) + { + prevPyramids[0]->setGrayFromRGB(prevImage); + currPyramids[0]->setGrayFromRGB(currImage); + prevPyramids[1]->setImage(prevImage); + currPyramids[1]->setImage(currImage); + + if (param.supportRegionType == RLOFOpticalFlowParameter::SR_CROSS) + { + prevPyramids[1]->setBlurFromRGB(prevImage); + currPyramids[1]->setBlurFromRGB(currImage); + } + } + else + { + prevPyramids[0]->setImage(prevImage); + currPyramids[0]->setImage(currImage); + } + } + preprocess(prevPyramids, currPyramids, prevPoints, currPoints, param); + RLOFOpticalFlowParameter internParam = param; + if (param.useGlobalMotionPrior == true) + internParam.useInitialFlow = true; + calcLocalOpticalFlowCore(prevPyramids, currPyramids, prevPoints, currPoints, internParam); + } + } +} + \ No newline at end of file diff --git a/modules/optflow/src/rlof/rlof_localflow.h b/modules/optflow/src/rlof/rlof_localflow.h new file mode 100644 index 00000000000..6f6a0fc5e2a --- /dev/null +++ b/modules/optflow/src/rlof/rlof_localflow.h @@ -0,0 +1,158 @@ 
+/*M/////////////////////////////////////////////////////////////////////////////////////// + // + // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. + // + // By downloading, copying, installing or using the software you agree to this license. + // If you do not agree to this license, do not download, install, + // copy or use the software. + // + // + // License Agreement + // For Open Source Computer Vision Library + // + // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. + // Copyright (C) 2009, Willow Garage Inc., all rights reserved. + // Third party copyrights are property of their respective owners. + // + // Redistribution and use in source and binary forms, with or without modification, + // are permitted provided that the following conditions are met: + // + // * Redistribution's of source code must retain the above copyright notice, + // this list of conditions and the following disclaimer. + // + // * Redistribution's in binary form must reproduce the above copyright notice, + // this list of conditions and the following disclaimer in the documentation + // and/or other materials provided with the distribution. + // + // * The name of the copyright holders may not be used to endorse or promote products + // derived from this software without specific prior written permission. + // + // This software is provided by the copyright holders and contributors "as is" and + // any express or implied warranties, including, but not limited to, the implied + // warranties of merchantability and fitness for a particular purpose are disclaimed. 
+ // In no event shall the Intel Corporation or contributors be liable for any direct, + // indirect, incidental, special, exemplary, or consequential damages + // (including, but not limited to, procurement of substitute goods or services; + // loss of use, data, or profits; or business interruption) however caused + // and on any theory of liability, whether in contract, strict liability, + // or tort (including negligence or otherwise) arising in any way out of + // the use of this software, even if advised of the possibility of such damage. + // + //M*/ +#ifndef _RLOF_LOCALFLOW_H_ +#define _RLOF_LOCALFLOW_H_ +#include +#include +#include +#include +#include "opencv2/core.hpp" +#include "opencv2/imgproc.hpp" +#include "opencv2/optflow/rlofflow.hpp" +//! Fast median estimation method based on @cite Tibshirani2008. This implementation relates to http://www.stat.cmu.edu/~ryantibs/median/ +using namespace cv; +template +T quickselect(const Mat & inp, int k) +{ + unsigned long i; + unsigned long ir; + unsigned long j; + unsigned long l; + unsigned long mid; + Mat values = inp.clone(); + T a; + + l = 0; + ir = MAX(values.rows, values.cols) - 1; + while (true) + { + if (ir <= l + 1) + { + if (ir == l + 1 && values.at(ir) < values.at(l)) + std::swap(values.at(l), values.at(ir)); + return values.at(k); + } + else + { + mid = (l + ir) >> 1; + std::swap(values.at(mid), values.at(l+1)); + if (values.at(l) > values.at(ir)) + std::swap(values.at(l), values.at(ir)); + if (values.at(l+1) > values.at(ir)) + std::swap(values.at(l+1), values.at(ir)); + if (values.at(l) > values.at(l+1)) + std::swap(values.at(l), values.at(l+1)); + i = l + 1; + j = ir; + a = values.at(l+1); + while (true) + { + do + { + i++; + } + while (values.at(i) < a); + do + { + j--; + } + while (values.at(j) > a); + if (j < i) break; + std::swap(values.at(i), values.at(j)); + } + values.at(l+1) = values.at(j); + values.at(j) = a; + if (j >= static_cast(k)) ir = j - 1; + if (j <= static_cast(k)) l = i; + } + } 
+} + +namespace cv +{ + namespace optflow + { + class CImageBuffer + { + public: + CImageBuffer() + : m_Overwrite(true) + {}; + void setGrayFromRGB(const cv::Mat & inp) + { + if(m_Overwrite) + cv::cvtColor(inp, m_Image, cv::COLOR_BGR2GRAY); + } + void setImage(const cv::Mat & inp) + { + if(m_Overwrite) + inp.copyTo(m_Image); + } + void setBlurFromRGB(const cv::Mat & inp) + { + //cv::medianBlur(constNextImage, blurNextImg, 7); + if(m_Overwrite) + cv::GaussianBlur(inp, m_BlurredImage, cv::Size(7,7), -1); + } + + int buildPyramid(cv::Size winSize, int maxLevel, float levelScale[2]); + cv::Mat & getImage(int level) {return m_ImagePyramid[level];} + + std::vector m_ImagePyramid; + cv::Mat m_BlurredImage; + cv::Mat m_Image; + std::vector m_CrossPyramid; + int m_maxLevel; + bool m_Overwrite; + }; + + void calcLocalOpticalFlow( + const Mat prevImage, + const Mat currImage, + Ptr prevPyramids[2], + Ptr currPyramids[2], + const std::vector & prevPoints, + std::vector & currPoints, + const RLOFOpticalFlowParameter & param); +} +} +#endif \ No newline at end of file diff --git a/modules/optflow/src/rlofflow.cpp b/modules/optflow/src/rlofflow.cpp new file mode 100644 index 00000000000..1bdc45a0221 --- /dev/null +++ b/modules/optflow/src/rlofflow.cpp @@ -0,0 +1,385 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// + // + // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. + // + // By downloading, copying, installing or using the software you agree to this license. + // If you do not agree to this license, do not download, install, + // copy or use the software. + // + // + // License Agreement + // For Open Source Computer Vision Library + // + // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. + // Copyright (C) 2009, Willow Garage Inc., all rights reserved. + // Third party copyrights are property of their respective owners. 
+ // + // Redistribution and use in source and binary forms, with or without modification, + // are permitted provided that the following conditions are met: + // + // * Redistribution's of source code must retain the above copyright notice, + // this list of conditions and the following disclaimer. + // + // * Redistribution's in binary form must reproduce the above copyright notice, + // this list of conditions and the following disclaimer in the documentation + // and/or other materials provided with the distribution. + // + // * The name of the copyright holders may not be used to endorse or promote products + // derived from this software without specific prior written permission. + // + // This software is provided by the copyright holders and contributors "as is" and + // any express or implied warranties, including, but not limited to, the implied + // warranties of merchantability and fitness for a particular purpose are disclaimed. + // In no event shall the Intel Corporation or contributors be liable for any direct, + // indirect, incidental, special, exemplary, or consequential damages + // (including, but not limited to, procurement of substitute goods or services; + // loss of use, data, or profits; or business interruption) however caused + // and on any theory of liability, whether in contract, strict liability, + // or tort (including negligence or otherwise) arising in any way out of + // the use of this software, even if advised of the possibility of such damage. 
+ // + //M*/ +#include "precomp.hpp" +#include "rlof/rlof_localflow.h" +#include "rlof/geo_interpolation.hpp" +#include "opencv2/ximgproc.hpp" +namespace cv +{ + namespace optflow + { + + class DenseOpticalFlowRLOFImpl : public DenseRLOFOpticalFlow + { + public: + DenseOpticalFlowRLOFImpl() + : forwardBackwardThreshold(1.f) + , gridStep(6, 6) + , interp_type(InterpolationType::INTERP_GEO) + , k(128) + , sigma(0.05f) + , lambda(999.f) + , fgs_lambda(500.0f) + , fgs_sigma(1.5f) + , use_post_proc(true) + + { + prevPyramid[0] = cv::Ptr< CImageBuffer>(new CImageBuffer); + prevPyramid[1] = cv::Ptr< CImageBuffer>(new CImageBuffer); + currPyramid[0] = cv::Ptr< CImageBuffer>(new CImageBuffer); + currPyramid[1] = cv::Ptr< CImageBuffer>(new CImageBuffer); + } + virtual void setRLOFOpticalFlowParameter(RLOFOpticalFlowParameter val) CV_OVERRIDE { param = val; } + virtual RLOFOpticalFlowParameter getRLOFOpticalFlowParameter() const CV_OVERRIDE { return param; } + + virtual float getForwardBackward() const CV_OVERRIDE { return forwardBackwardThreshold; } + virtual void setForwardBackward(float val) CV_OVERRIDE { forwardBackwardThreshold = val; } + + virtual void setInterpolation(InterpolationType val) CV_OVERRIDE { interp_type = val; } + virtual InterpolationType getInterpolation() const CV_OVERRIDE { return interp_type; } + + virtual Size getGridStep() const CV_OVERRIDE { return gridStep; } + virtual void setGridStep(Size val) CV_OVERRIDE { gridStep = val; } + + virtual int getEPICK() const CV_OVERRIDE { return k; } + virtual void setEPICK(int val) CV_OVERRIDE { k = val; } + + virtual float getEPICSigma() const CV_OVERRIDE { return sigma; } + virtual void setEPICSigma(float val) CV_OVERRIDE { sigma = val; } + + virtual float getEPICLambda() const CV_OVERRIDE { return lambda; } + virtual void setEPICLambda(float val) CV_OVERRIDE { lambda = val; } + + virtual float getFgsLambda() const CV_OVERRIDE { return fgs_lambda; } + virtual void setFgsLambda(float val) CV_OVERRIDE { 
fgs_lambda = val; } + + virtual float getFgsSigma() const CV_OVERRIDE { return fgs_sigma; } + virtual void setFgsSigma(float val) CV_OVERRIDE { fgs_sigma = val; } + + virtual bool getUsePostProc() const CV_OVERRIDE { return use_post_proc; } + virtual void setUsePostProc(bool val) CV_OVERRIDE { use_post_proc = val; } + + virtual void calc(InputArray I0, InputArray I1, InputOutputArray flow) CV_OVERRIDE + { + CV_Assert(!I0.empty() && I0.depth() == CV_8U && (I0.channels() == 3 || I0.channels() == 1)); + CV_Assert(!I1.empty() && I1.depth() == CV_8U && (I1.channels() == 3 || I1.channels() == 1)); + CV_Assert(I0.sameSize(I1)); + if (param.supportRegionType == RLOFOpticalFlowParameter::SR_CROSS) + CV_Assert( I0.channels() == 3 && I1.channels() == 3); + CV_Assert(interp_type == InterpolationType::INTERP_EPIC || interp_type == InterpolationType::INTERP_GEO); + + Mat prevImage = I0.getMat(); + Mat currImage = I1.getMat(); + int noPoints = prevImage.cols * prevImage.rows; + std::vector prevPoints(noPoints); + std::vector currPoints, refPoints; + noPoints = 0; + cv::Size grid_h = gridStep / 2; + for (int r = grid_h.height; r < prevImage.rows; r += gridStep.height) + { + for (int c = grid_h.width; c < prevImage.cols; c += gridStep.width) + { + prevPoints[noPoints++] = cv::Point2f(static_cast(c), static_cast(r)); + } + } + prevPoints.erase(prevPoints.begin() + noPoints, prevPoints.end()); + currPoints.resize(prevPoints.size()); + + calcLocalOpticalFlow(prevImage, currImage, prevPyramid, currPyramid, prevPoints, currPoints, param); + + flow.create(prevImage.size(), CV_32FC2); + Mat dense_flow = flow.getMat(); + + std::vector filtered_prevPoints; + std::vector filtered_currPoints; + if (gridStep == cv::Size(1, 1) && forwardBackwardThreshold <= 0) + { + for (unsigned int n = 0; n < prevPoints.size(); n++) + { + dense_flow.at(prevPoints[n]) = currPoints[n] - prevPoints[n]; + } + return; + } + if (forwardBackwardThreshold > 0) + { + // reuse image pyramids + 
calcLocalOpticalFlow(currImage, prevImage, currPyramid, prevPyramid, currPoints, refPoints, param); + + filtered_prevPoints.resize(prevPoints.size()); + filtered_currPoints.resize(prevPoints.size()); + float sqrForwardBackwardThreshold = forwardBackwardThreshold * forwardBackwardThreshold; + noPoints = 0; + for (unsigned int r = 0; r < refPoints.size(); r++) + { + Point2f diff = refPoints[r] - prevPoints[r]; + if (diff.x * diff.x + diff.y * diff.y < sqrForwardBackwardThreshold) + { + filtered_prevPoints[noPoints] = prevPoints[r]; + filtered_currPoints[noPoints++] = currPoints[r]; + } + } + + filtered_prevPoints.erase(filtered_prevPoints.begin() + noPoints, filtered_prevPoints.end()); + filtered_currPoints.erase(filtered_currPoints.begin() + noPoints, filtered_currPoints.end()); + + } + else + { + filtered_prevPoints = prevPoints; + filtered_currPoints = currPoints; + } + + if (interp_type == InterpolationType::INTERP_EPIC) + { + Ptr gd = ximgproc::createEdgeAwareInterpolator(); + gd->setK(k); + gd->setSigma(sigma); + gd->setLambda(lambda); + gd->setUsePostProcessing(false); + gd->interpolate(prevImage, filtered_prevPoints, currImage, filtered_currPoints, dense_flow); + } + else + { + Mat blurredPrevImage, blurredCurrImage; + GaussianBlur(prevImage, blurredPrevImage, cv::Size(5, 5), -1); + std::vector status(filtered_currPoints.size(), 1); + interpolate_irregular_nn_raster(filtered_prevPoints, filtered_currPoints, status, blurredPrevImage).copyTo(dense_flow); + std::vector vecMats; + std::vector vecMats2(2); + cv::split(dense_flow, vecMats); + cv::bilateralFilter(vecMats[0], vecMats2[0], 5, 2, 20); + cv::bilateralFilter(vecMats[1], vecMats2[1], 5, 2, 20); + cv::merge(vecMats2, dense_flow); + } + if (use_post_proc) + { + ximgproc::fastGlobalSmootherFilter(prevImage, flow, flow, fgs_lambda, fgs_sigma); + } + } + + virtual void collectGarbage() CV_OVERRIDE + { + prevPyramid[0].release(); + prevPyramid[1].release(); + currPyramid[0].release(); + 
currPyramid[1].release(); + } + + protected: + RLOFOpticalFlowParameter param; + float forwardBackwardThreshold; + Ptr prevPyramid[2]; + Ptr currPyramid[2]; + cv::Size gridStep; + InterpolationType interp_type; + int k; + float sigma; + float lambda; + float fgs_lambda; + float fgs_sigma; + bool use_post_proc; + }; + + Ptr DenseRLOFOpticalFlow::create( + const RLOFOpticalFlowParameter & rlofParam, + float forwardBackwardThreshold, + cv::Size gridStep, + InterpolationType interp_type, + int epicK, + float epicSigma, + float epicLambda, + float fgs_lambda, + float fgs_sigma) + { + Ptr algo = makePtr(); + algo->setRLOFOpticalFlowParameter(rlofParam); + algo->setForwardBackward(forwardBackwardThreshold); + algo->setGridStep(gridStep); + algo->setInterpolation(interp_type); + algo->setEPICK(epicK); + algo->setEPICSigma(epicSigma); + algo->setEPICLambda(epicLambda); + algo->setFgsLambda(fgs_lambda); + algo->setFgsSigma(fgs_sigma); + return algo; + } + + class SparseRLOFOpticalFlowImpl : public SparseRLOFOpticalFlow + { + public: + SparseRLOFOpticalFlowImpl() + : forwardBackwardThreshold(1.f) + { + prevPyramid[0] = cv::Ptr< CImageBuffer>(new CImageBuffer); + prevPyramid[1] = cv::Ptr< CImageBuffer>(new CImageBuffer); + currPyramid[0] = cv::Ptr< CImageBuffer>(new CImageBuffer); + currPyramid[1] = cv::Ptr< CImageBuffer>(new CImageBuffer); + } + virtual void setRLOFOpticalFlowParameter(RLOFOpticalFlowParameter val) CV_OVERRIDE { param = val; } + virtual RLOFOpticalFlowParameter getRLOFOpticalFlowParameter() const CV_OVERRIDE { return param; } + + virtual float getForwardBackward() const CV_OVERRIDE { return forwardBackwardThreshold; } + virtual void setForwardBackward(float val) CV_OVERRIDE { forwardBackwardThreshold = val; } + + virtual void calc(InputArray prevImg, InputArray nextImg, + InputArray prevPts, InputOutputArray nextPts, + OutputArray status, + OutputArray err) CV_OVERRIDE + { + CV_Assert(!prevImg.empty() && prevImg.depth() == CV_8U && (prevImg.channels() == 3 || 
prevImg.channels() == 1)); + CV_Assert(!nextImg.empty() && nextImg.depth() == CV_8U && (nextImg.channels() == 3 || nextImg.channels() == 1)); + CV_Assert(prevImg.sameSize(nextImg)); + + //CV_Assert(param.supportRegionType == RLOFOpticalFlowParameter::SR_CROSS && I0.channels() == 3 && I1.channels() == 3); + + Mat prevImage = prevImg.getMat(); + Mat nextImage = nextImg.getMat(); + Mat prevPtsMat = prevPts.getMat(); + + if (param.useInitialFlow == false) + nextPts.create(prevPtsMat.size(), prevPtsMat.type(), -1, true); + + int npoints = 0; + CV_Assert((npoints = prevPtsMat.checkVector(2, CV_32F, true)) >= 0); + + if (npoints == 0) + { + nextPts.release(); + status.release(); + err.release(); + return; + } + + Mat nextPtsMat = nextPts.getMat(); + CV_Assert(nextPtsMat.checkVector(2, CV_32F, true) == npoints); + std::vector prevPoints(npoints), nextPoints(npoints), refPoints; + prevPtsMat.copyTo(cv::Mat(1, npoints, CV_32FC2, &prevPoints[0])); + if (param.useInitialFlow ) + nextPtsMat.copyTo(cv::Mat(1, nextPtsMat.cols, CV_32FC2, &nextPoints[0])); + + cv::Mat statusMat; + cv::Mat errorMat; + if (status.needed() || forwardBackwardThreshold > 0) + { + status.create((int)npoints, 1, CV_8U, -1, true); + statusMat = status.getMat(); + statusMat.setTo(1); + } + + if (err.needed() || forwardBackwardThreshold > 0) + { + err.create((int)npoints, 1, CV_32F, -1, true); + errorMat = err.getMat(); + errorMat.setTo(0); + } + + calcLocalOpticalFlow(prevImage, nextImage, prevPyramid, currPyramid, prevPoints, nextPoints, param); + cv::Mat(1,npoints , CV_32FC2, &nextPoints[0]).copyTo(nextPtsMat); + if (forwardBackwardThreshold > 0) + { + // reuse image pyramids + calcLocalOpticalFlow(nextImage, prevImage, currPyramid, prevPyramid, nextPoints, refPoints, param); + } + for (unsigned int r = 0; r < refPoints.size(); r++) + { + Point2f diff = refPoints[r] - prevPoints[r]; + errorMat.at(r) = sqrt(diff.x * diff.x + diff.y * diff.y); + if (errorMat.at(r) <= forwardBackwardThreshold) + 
statusMat.at(r) = 0; + } + + } + + protected: + RLOFOpticalFlowParameter param; + float forwardBackwardThreshold; + Ptr prevPyramid[2]; + Ptr currPyramid[2]; + }; + + Ptr SparseRLOFOpticalFlow::create( + const RLOFOpticalFlowParameter & rlofParam, + float forwardBackwardThreshold) + { + Ptr algo = makePtr(); + algo->setRLOFOpticalFlowParameter(rlofParam); + algo->setForwardBackward(forwardBackwardThreshold); + return algo; + } + + void calcOpticalFlowDenseRLOF(InputArray I0, InputArray I1, InputOutputArray flow, + RLOFOpticalFlowParameter rlofParam , + float forewardBackwardThreshold, Size gridStep, + DenseRLOFOpticalFlow::InterpolationType interp_type, + int epicK, float epicSigma, + bool use_post_proc, float fgsLambda, float fgsSigma) + { + Ptr algo = DenseRLOFOpticalFlow::create( + rlofParam, forewardBackwardThreshold, gridStep, interp_type, + epicK, epicSigma, use_post_proc, fgsLambda, fgsSigma); + algo->calc(I0, I1, flow); + algo->collectGarbage(); + } + + void calcOpticalFlowSparseRLOF(InputArray prevImg, InputArray nextImg, + InputArray prevPts, InputOutputArray nextPts, + OutputArray status, OutputArray err, + RLOFOpticalFlowParameter rlofParam, + float forewardBackwardThreshold) + { + Ptr algo = SparseRLOFOpticalFlow::create( + rlofParam, forewardBackwardThreshold); + algo->calc(prevImg, nextImg, prevPts, nextPts, status, err); + } + Ptr createOptFlow_DenseRLOF() + { + return DenseRLOFOpticalFlow::create(); + } + + Ptr createOptFlow_SparseRLOF() + { + return SparseRLOFOpticalFlow::create(); + } + } + + +} \ No newline at end of file diff --git a/modules/optflow/test/test_OF_accuracy.cpp b/modules/optflow/test/test_OF_accuracy.cpp index 6a5ed702e48..d2aedf2573a 100644 --- a/modules/optflow/test/test_OF_accuracy.cpp +++ b/modules/optflow/test/test_OF_accuracy.cpp @@ -83,7 +83,20 @@ static float calcRMSE(Mat flow1, Mat flow2) } return (float)sqrt(sum / (1e-9 + counter)); } - +static float calcRMSE(vector prevPts, vector currPts, Mat flow) +{ + vector ee; + 
for (unsigned int n = 0; n < prevPts.size(); n++) + { + Point2f gtFlow = flow.at(prevPts[n]); + if (isFlowCorrect(gtFlow.x) && isFlowCorrect(gtFlow.y)) + { + Point2f diffFlow = (currPts[n] - prevPts[n]) - gtFlow; + ee.push_back(sqrt(diffFlow.x * diffFlow.x + diffFlow.y * diffFlow.y)); + } + } + return mean(ee).val[0]; +} static float calcAvgEPE(vector< pair > corr, Mat flow) { double sum = 0; @@ -111,9 +124,13 @@ static float calcAvgEPE(vector< pair > corr, Mat flow) bool readRubberWhale(Mat &dst_frame_1, Mat &dst_frame_2, Mat &dst_GT) { - const string frame1_path = getRubberWhaleFrame1(); - const string frame2_path = getRubberWhaleFrame2(); - const string gt_flow_path = getRubberWhaleGroundTruth(); + string frame1_path = getRubberWhaleFrame1(); + string frame2_path = getRubberWhaleFrame2(); + string gt_flow_path = getRubberWhaleGroundTruth(); + // removing space may be an issue on windows machines + frame1_path.erase(std::remove_if(frame1_path.begin(), frame1_path.end(), isspace), frame1_path.end()); + frame2_path.erase(std::remove_if(frame2_path.begin(), frame2_path.end(), isspace), frame2_path.end()); + gt_flow_path.erase(std::remove_if(gt_flow_path.begin(), gt_flow_path.end(), isspace), gt_flow_path.end()); dst_frame_1 = imread(frame1_path); dst_frame_2 = imread(frame2_path); @@ -125,6 +142,7 @@ bool readRubberWhale(Mat &dst_frame_1, Mat &dst_frame_2, Mat &dst_GT) return true; } + TEST(DenseOpticalFlow_SimpleFlow, ReferenceAccuracy) { Mat frame1, frame2, GT; @@ -157,6 +175,108 @@ TEST(DenseOpticalFlow_DeepFlow, ReferenceAccuracy) EXPECT_LE(calcRMSE(GT, flow), target_RMSE); } +TEST(SparseOpticalFlow, ReferenceAccuracy) +{ + // with the following test each invoker class should be tested once + Mat frame1, frame2, GT; + ASSERT_TRUE(readRubberWhale(frame1, frame2, GT)); + vector prevPts, currPts; + + for (int r = 0; r < frame1.rows; r+=10) + { + for (int c = 0; c < frame1.cols; c+=10) + { + prevPts.push_back(Point2f(static_cast(c), static_cast(r))); + } + } + 
vector status(prevPts.size()); + vector err(prevPts.size()); + Ptr algo = SparseRLOFOpticalFlow::create(); + algo->setForwardBackward(0.0f); + + RLOFOpticalFlowParameter param; + param.supportRegionType = RLOFOpticalFlowParameter::SR_CROSS; + param.useIlluminationModel = true; + + param.solverType = RLOFOpticalFlowParameter::ST_BILINEAR; + algo->setRLOFOpticalFlowParameter(param); + algo->calc(frame1, frame2, prevPts, currPts, status, err); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.22f); + + param.solverType = RLOFOpticalFlowParameter::ST_STANDART; + algo->setRLOFOpticalFlowParameter(param); + algo->calc(frame1, frame2, prevPts, currPts, status, err); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.34f); + + param.useIlluminationModel = false; + + param.solverType = RLOFOpticalFlowParameter::ST_BILINEAR; + algo->setRLOFOpticalFlowParameter(param); + algo->calc(frame1, frame2, prevPts, currPts, status, err); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.24f); + + param.solverType = RLOFOpticalFlowParameter::ST_STANDART; + algo->setRLOFOpticalFlowParameter(param); + algo->calc(frame1, frame2, prevPts, currPts, status, err); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.23f); + + param.normSigma0 = numeric_limits::max(); + param.normSigma1 = numeric_limits::max(); + param.useIlluminationModel = true; + + param.solverType = RLOFOpticalFlowParameter::ST_BILINEAR; + algo->setRLOFOpticalFlowParameter(param); + algo->calc(frame1, frame2, prevPts, currPts, status, err); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.26f); + + param.solverType = RLOFOpticalFlowParameter::ST_STANDART; + algo->setRLOFOpticalFlowParameter(param); + algo->calc(frame1, frame2, prevPts, currPts, status, err); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.26f); + + param.useIlluminationModel = false; + + param.solverType = RLOFOpticalFlowParameter::ST_BILINEAR; + algo->setRLOFOpticalFlowParameter(param); + algo->calc(frame1, frame2, prevPts, currPts, status, err); + EXPECT_LE(calcRMSE(prevPts, 
currPts, GT), 0.25f); + + param.solverType = RLOFOpticalFlowParameter::ST_STANDART; + algo->setRLOFOpticalFlowParameter(param); + algo->calc(frame1, frame2, prevPts, currPts, status, err); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.26f); +} + +TEST(DenseOpticalFlow_RLOF, ReferenceAccuracy) +{ + Mat frame1, frame2, GT; + ASSERT_TRUE(readRubberWhale(frame1, frame2, GT)); + + Mat flow; + Ptr algo = DenseRLOFOpticalFlow::create(); + RLOFOpticalFlowParameter param; + param.supportRegionType = RLOFOpticalFlowParameter::SR_CROSS; + param.solverType = RLOFOpticalFlowParameter::ST_BILINEAR; + algo->setRLOFOpticalFlowParameter(param); + algo->setForwardBackward(1.0f); + algo->setGridStep(cv::Size(4, 4)); + + algo->setInterpolation(DenseRLOFOpticalFlow::INTERP_EPIC); + algo->calc(frame1, frame2, flow); + + ASSERT_EQ(GT.rows, flow.rows); + ASSERT_EQ(GT.cols, flow.cols); + EXPECT_LE(calcRMSE(GT, flow), 0.36f); + + algo->setInterpolation(DenseRLOFOpticalFlow::INTERP_GEO); + algo->calc(frame1, frame2, flow); + + ASSERT_EQ(GT.rows, flow.rows); + ASSERT_EQ(GT.cols, flow.cols); + EXPECT_LE(calcRMSE(GT, flow), 0.49f); + +} + TEST(DenseOpticalFlow_SparseToDenseFlow, ReferenceAccuracy) { Mat frame1, frame2, GT; From 69ef7842458d0f9fefcbbcfa998b653e40ac4cc5 Mon Sep 17 00:00:00 2001 From: senst Date: Mon, 10 Dec 2018 20:50:19 +0100 Subject: [PATCH 02/25] - exchange tabs with spaces - fix optflow.bib indentation - remove optflow_o.hpp - change RLOFOpticalFlowParameter interfaces to Ptr to remove error on building. 
Fix warnings --- modules/optflow/doc/optflow.bib | 52 +- .../include/opencv2/optflow/rlofflow.hpp | 873 ++-- modules/optflow/include/opencv2/optflow_o.hpp | 309 -- modules/optflow/perf/perf_rlof.cpp | 159 +- modules/optflow/src/rlof/berlof_invoker.hpp | 4328 ++++++++--------- .../optflow/src/rlof/geo_interpolation.cpp | 996 ++-- .../optflow/src/rlof/geo_interpolation.hpp | 94 +- modules/optflow/src/rlof/plk_invoker.hpp | 1341 +++-- modules/optflow/src/rlof/rlof_invoker.hpp | 2865 ++++++----- modules/optflow/src/rlof/rlof_invokerbase.hpp | 498 +- modules/optflow/src/rlof/rlof_localflow.cpp | 1082 ++--- modules/optflow/src/rlof/rlof_localflow.h | 232 +- modules/optflow/src/rlofflow.cpp | 712 ++- modules/optflow/test/test_OF_accuracy.cpp | 216 +- 14 files changed, 6491 insertions(+), 7266 deletions(-) delete mode 100644 modules/optflow/include/opencv2/optflow_o.hpp diff --git a/modules/optflow/doc/optflow.bib b/modules/optflow/doc/optflow.bib index 616686e2698..746cecb6732 100644 --- a/modules/optflow/doc/optflow.bib +++ b/modules/optflow/doc/optflow.bib @@ -63,36 +63,36 @@ @inproceedings{Wang_2016_CVPR } @inproceedings{Geistert2016, - author = {Jonas Geistert and Tobias Senst and Thomas Sikora}, - title = {Robust Local Optical Flow: Dense Motion Vector Field Interpolation}, - booktitle = {Picture Coding Symposium}, - year = {2016}, - pages = {1--5}, + author = {Jonas Geistert and Tobias Senst and Thomas Sikora}, + title = {Robust Local Optical Flow: Dense Motion Vector Field Interpolation}, + booktitle = {Picture Coding Symposium}, + year = {2016}, + pages = {1--5}, } -@inproceedings{Senst2016, - author = {Tobias Senst and Jonas Geistert and Thomas Sikora}, - title = {Robust local optical flow: Long-range motions and varying illuminations}, - booktitle = {IEEE International Conference on Image Processing}, - year = {2016}, - pages = {4478--4482}, -} +@inproceedings{Senst2016, + author = {Tobias Senst and Jonas Geistert and Thomas Sikora}, + title = {Robust local 
optical flow: Long-range motions and varying illuminations}, + booktitle = {IEEE International Conference on Image Processing}, + year = {2016}, + pages = {4478--4482}, +} -@inproceedings{Senst2014, - author = {Tobias Senst and Thilo Borgmann and Ivo Keller and Thomas Sikora}, - title = {Cross based Robust Local Optical Flow}, - booktitle = {21th IEEE International Conference on Image Processing}, - year = {2014}, - pages = {1967--1971}, -} +@inproceedings{Senst2014, + author = {Tobias Senst and Thilo Borgmann and Ivo Keller and Thomas Sikora}, + title = {Cross based Robust Local Optical Flow}, + booktitle = {21th IEEE International Conference on Image Processing}, + year = {2014}, + pages = {1967--1971}, +} -@inproceedings{Senst2013, - author = {Tobias Senst and Jonas Geistert and Ivo Keller and Thomas Sikora}, - title = {Robust Local Optical Flow Estimation using Bilinear Equations for Sparse Motion Estimation}, - booktitle = {20th IEEE International Conference on Image Processing}, - year = {2013}, - pages = {2499--2503}, -} +@inproceedings{Senst2013, + author = {Tobias Senst and Jonas Geistert and Ivo Keller and Thomas Sikora}, + title = {Robust Local Optical Flow Estimation using Bilinear Equations for Sparse Motion Estimation}, + booktitle = {20th IEEE International Conference on Image Processing}, + year = {2013}, + pages = {2499--2503}, +} @article{Senst2012, author = {Tobias Senst and Volker Eiselein and Thomas Sikora}, diff --git a/modules/optflow/include/opencv2/optflow/rlofflow.hpp b/modules/optflow/include/opencv2/optflow/rlofflow.hpp index e9accdab5c4..d1a3d6f53ad 100644 --- a/modules/optflow/include/opencv2/optflow/rlofflow.hpp +++ b/modules/optflow/include/opencv2/optflow/rlofflow.hpp @@ -1,57 +1,6 @@ -/* -By downloading, copying, installing or using the software you agree to this -license. If you do not agree to this license, do not download, install, -copy or use the software. 
- - - License Agreement - For Open Source Computer Vision Library - (3-clause BSD License) - -Copyright (C) 2018, OpenCV Foundation, all rights reserved. -Third party copyrights are property of their respective owners. - -Redistribution and use in source and binary forms, with or without modification, -are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - - * Neither the names of the copyright holders nor the names of the contributors - may be used to endorse or promote products derived from this software - without specific prior written permission. - -This software is provided by the copyright holders and contributors "as is" and -any express or implied warranties, including, but not limited to, the implied -warranties of merchantability and fitness for a particular purpose are -disclaimed. In no event shall copyright holders or contributors be liable for -any direct, indirect, incidental, special, exemplary, or consequential damages -(including, but not limited to, procurement of substitute goods or services; -loss of use, data, or profits; or business interruption) however caused -and on any theory of liability, whether in contract, strict liability, -or tort (including negligence or otherwise) arising in any way out of -the use of this software, even if advised of the possibility of such damage. 
-*/ - -/** - * @file rloflow.hpp - * @author Tobias Senst - * @brief Implementation of the robust local optical flow algorithm for sparse and dense motion - * estimation from the following papers: - * http://elvera.nue.tu-berlin.de/files/1349Senst2012.pdf - * http://elvera.nue.tu-berlin.de/files/1422Senst2013.pdf - * http://elvera.nue.tu-berlin.de/files/1448Senst2014.pdf - * http://elvera.nue.tu-berlin.de/files/1498Geistert2016.pdf - * http://elvera.nue.tu-berlin.de/files/1496Senst2016.pdf - * - * - * @cite Senst2012, @cite Senst2013, @cite Senst2014, @cite Senst2016 or @cite Geistert2016 - * -*/ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. #ifndef __OPENCV_OPTFLOW_RLOFFLOW_HPP__ #define __OPENCV_OPTFLOW_RLOFFLOW_HPP__ @@ -60,423 +9,425 @@ the use of this software, even if advised of the possibility of such damage. namespace cv { - namespace optflow +namespace optflow +{ + //! @addtogroup optflow + //! @{ + + /** @brief This is used store and set up the parameters of the robust local optical flow (RLOF) algoritm. + * + * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as + * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). + * This RLOF implementation can be seen as an improved pyramidal iterative Lucas-Kanade and includes + * a set of improving modules. The main improvements in respect to the pyramidal iterative Lucas-Kanade + * are: + * - A more robust redecending M-estimator framework (see @cite Senst2012) to improve the accuracy at + * motion boundaries and appearing and disappearing pixels. + * - an adaptive support region strategies to improve the accuracy at motion boundaries to reduce the + * corona effect, i.e oversmoothing of the PLK at motion/object boundaries. 
The cross-based segementation + * strategy (SR_CROSS) proposed in @cite Senst2014 uses a simple segmenation approach to obtain the optimal + * shape of the support region. + * - To deal with illumination changes (outdoor sequences and shadow) the intensity constancy assumption + * based optical flow equation has been adopt with the Gennert and Negahdaripour illumination model + * (see @cite Senst2016). This model can be switched on/off with the useIlluminationModel variable. + * - By using a global motion prior initialization (see @cite Senst2016) of the iterative refinement + * the accuracy could be significantly improved for large displacements. This initialization can be + * switched on and of with useGlobalMotionPrior variable. + * + * The RLOF can be computed with the SparseOpticalFlow class or function interface to track a set of features + * or with the DenseOpticalFlow class or function interface to compute dense optical flow. + * + * @see optflow::DenseRLOFOpticalFlow, optflow::calcOpticalFlowDenseRLOF(), optflow::SparseRLOFOpticalFlow, optflow::calcOpticalFlowSparseRLOF() + */ + enum SupportRegionType { + SR_FIXED = 0, /**< Apply a constant support region */ + SR_CROSS = 1 /**< Apply a adaptive support region obtained by cross-based segmentation + * as described in @cite Senst2014 + */ + }; + enum SolverType { + ST_STANDART = 0, /**< Apply standard iterative refinement */ + ST_BILINEAR = 1 /**< Apply optimized iterative refinement based bilinear equation solutions + * as described in @cite Senst2013 + */ + }; + + enum InterpolationType { - //! @addtogroup optflow - //! @{ + INTERP_GEO = 0, /**< Fast geodesic interpolation, see @cite Geister2016 */ + INTERP_EPIC = 1, /**< Edge-preserving interpolation, see @cite Revaud2015,Geistert2016. 
*/ + }; + + class CV_EXPORTS_W RLOFOpticalFlowParameter{ + public: + RLOFOpticalFlowParameter() + :solverType(ST_BILINEAR) + ,supportRegionType(SR_CROSS) + ,normSigma0(3.2f) + ,normSigma1(7.f) + ,smallWinSize(9) + ,largeWinSize(21) + ,crossSegmentationThreshold(25) + ,maxLevel(3) + ,useInitialFlow(false) + ,useIlluminationModel(true) + ,useGlobalMotionPrior(true) + ,maxIteration(30) + ,minEigenValue(0.0001f) + ,globalMotionRansacThreshold(10) + {} + SolverType solverType; + /**< Variable specifies the iterative refinement strategy. Please consider citing @cite Senst2013 when + * using ST_BILINEAR. + */ - /** @brief This is used store and set up the parameters of the robust local optical flow (RLOF) algoritm. - * - * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as - * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). - * This RLOF implementation can be seen as an improved pyramidal iterative Lucas-Kanade and includes - * a set of improving modules. The main improvements in respect to the pyramidal iterative Lucas-Kanade - * are: - * - A more robust redecending M-estimator framework (see @cite Senst2012) to improve the accuracy at - * motion boundaries and appearing and disappearing pixels. - * - an adaptive support region strategies to improve the accuracy at motion boundaries to reduce the - * corona effect, i.e oversmoothing of the PLK at motion/object boundaries. The cross-based segementation - * strategy (SR_CROSS) proposed in @cite Senst2014 uses a simple segmenation approach to obtain the optimal - * shape of the support region. - * - To deal with illumination changes (outdoor sequences and shadow) the intensity constancy assumption - * based optical flow equation has been adopt with the Gennert and Negahdaripour illumination model - * (see @cite Senst2016). This model can be switched on/off with the useIlluminationModel variable. 
- * - By using a global motion prior initialization (see @cite Senst2016) of the iterative refinement - * the accuracy could be significantly improved for large displacements. This initialization can be - * switched on and of with useGlobalMotionPrior variable. - * - * The RLOF can be computed with the SparseOpticalFlow class or function interface to track a set of features - * or with the DenseOpticalFlow class or function interface to compute dense optical flow. - * - * @see optflow::DenseRLOFOpticalFlow, optflow::calcOpticalFlowDenseRLOF(), optflow::SparseRLOFOpticalFlow, optflow::calcOpticalFlowSparseRLOF() - */ - class RLOFOpticalFlowParameter { - public: - RLOFOpticalFlowParameter() - :solverType(ST_BILINEAR) - ,supportRegionType(SR_CROSS) - ,normSigma0(3.2f) - ,normSigma1(7.f) - ,smallWinSize(9) - ,largeWinSize(21) - ,crossSegmentationThreshold(25) - ,useInitialFlow(false) - ,useIlluminationModel(true) - ,useGlobalMotionPrior(true) - ,maxLevel(3) - ,maxIteration(30) - ,minEigenValue(0.0001f) - ,globalMotionRansacThreshold(10) - {} - enum SupportRegionType { - SR_FIXED = 0, /**< Apply a constant support region */ - SR_CROSS /**< Apply a adaptive support region obtained by cross-based segmentation - * as described in @cite Senst2014 - */ - }; - enum SolverType { - ST_STANDART = 0, /**< Apply standard iterative refinement */ - ST_BILINEAR /**< Apply optimized iterative refinement based bilinear equation solutions - * as described in @cite Senst2013 - */ - }; - SolverType solverType; - /**< Variable specifies the iterative refinement strategy. Please consider citing @cite Senst2013 when - * using ST_BILINEAR. - */ - - SupportRegionType supportRegionType; - /**< Variable specifies the support region shape extraction or shrinking strategy. - */ - - float normSigma0; - /**< \$ \sigma_0 \$ paramter of the shrinked Hampel norm introduced in @cite Senst2012. 
If - * \$ \sigma_0 \$ = std::numeric_limist::max() the least-square estimator will be used - * instead of the M-estimator. Althoug M-estimator is more robust against outlier in the support - * region the least-square can be fast in computation. - */ - float normSigma1; - /**< \$ \sigma_1 \$ paramter of the shrinked Hampel norm introduced in @cite Senst2012. If - * \$ \sigma_1 \$ = std::numeric_limist::max() the least-square estimator will be used - * instead of the M-estimator. Althoug M-estimator is more robust against outlier in the support - * region the least-square can be fast in computation. - */ - int smallWinSize; - /**< Minimal window size of the support region. This parameter is only used if supportRegionType is SR_CROSS. - */ - int largeWinSize; - /**< Maximal window size of the support region. If supportRegionType is SR_FIXED this gives the exact support - * region size. The speed of the RLOF is related to the applied win sizes. The smaller the window size the lower is the runtime, - * but the more sensitive to noise is the method. - */ - int crossSegmentationThreshold; - /**< Color similarity threshold used by cross-based segmentation following @cite Senst2014 . - * (Only used if supportRegionType is SR_CROSS). With the cross-bassed segmentation - * motion boundaries can be computed more accurately. - */ - int maxLevel; - /**< Maximal number of pyramid level used. The large this value is the more likely it is - * to obtain accurate solutions for long-range motions. The runtime is linear related to - * this parameter. - */ - bool useInitialFlow; - /**< Use next point list as initial values. A good intialization can imporve the algortihm - * accuracy and reduce the runtime by a faster convergence of the iteration refinement. - */ - bool useIlluminationModel; - /**< Use the Gennert and Negahdaripour illumination model instead of the intensity brigthness - * constraint. 
(Probosed in @cite Senst2016 ) This model is defined as follow: - * \f[ I(\mathbf{x},t) + m \cdot I(\mathbf{x},t) + c = I(\mathbf{x},t+1) \f[ - * and contains with m and c a multiplicative and additive term which makes the estimate - * more robust against illumination changes. The computational complexity is increased by - * enabling the illumination model. - */ - bool useGlobalMotionPrior; - /**< Use global motion prior initialisation has been introduced in @cite Senst2016 . It - * allows to be more accurate for long-range motion. The computational complexity is - * slightly increased by enabling the global motion prior initialisation. - */ - int maxIteration; - /**< Number of maximal iterations used for the iterative refinement. Lower values can - * reduce the runtime but also the accuracy. - */ - float minEigenValue; - /**< Threshold for the minimal eigenvalue of the gradient matrix defines when to abort the - * iterative refinement. - */ - float globalMotionRansacThreshold; - /**< To apply the global motion prior motion vectors will be computed on a regulary sampled which - * are the basis for Homography estimation using RANSAC. The reprojection threshold is based on - * n-th percentil (given by this value [0 ... 100]) of the forward backward errors relating to - * the motion vectors. See @cite Senst2016 for more details. - */ - }; + SupportRegionType supportRegionType; + /**< Variable specifies the support region shape extraction or shrinking strategy. + */ - /** @brief Fast dense optical flow computation based on robust local optical flow (RLOF) algorithms and sparse-to-dense interpolation - * scheme. - * - * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as - * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). - * - * The sparse-to-dense interpolation scheme allows for fast computation of dense optical flow using RLOF (see @cite Geistert2016). 
- * For this scheme the following steps are applied: - * -# motion vector seeded at a regular sampled grid are computed. The sparsity of this grid can be configured with setGridStep - * -# (optinally) errornous motion vectors are filter based on the forward backward confidence. The threshold can be configured - * with setForwardBackward. The filter is only applied if the threshold >0 but than the runtime is doubled due to the estimation - * of the backward flow. - * -# Vector field interpolation is applied to the motion vector set to obtain a dense vector field. - * - * For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details. - * Parameters have been described in @cite Senst2012 @cite Senst2013 @cite Senst2014 and @cite Senst2016. - * - * @note If the grid size is set to (1,1) and the forward backward threshold <= 0 that the dense optical flow field is purely. - * computed with the RLOF. - * - * @see optflow::calcOpticalFlowDenseRLOF(), optflow::RLOFOpticalFlowParameter - */ - class CV_EXPORTS_W DenseRLOFOpticalFlow : public DenseOpticalFlow - { - public: - enum InterpolationType - { - INTERP_GEO = 0, /**< Fast geodesic interpolation, see @cite Geister2016 */ - INTERP_EPIC, /**< Edge-preserving interpolation, see @cite Revaud2015,Geistert2016. */ - }; - //! @brief Configuration of the RLOF alogrithm. - /** - @see optflow::RLOFOpticalFlowParameter, getRLOFOpticalFlowParameter - */ - CV_WRAP virtual void setRLOFOpticalFlowParameter(RLOFOpticalFlowParameter val) = 0; - /** @copybrief setRLOFOpticalFlowParameter - @see optflow::RLOFOpticalFlowParameter, setRLOFOpticalFlowParameter - */ - CV_WRAP virtual RLOFOpticalFlowParameter getRLOFOpticalFlowParameter() const = 0; - //! @brief Threshold for the forward backward confidence check - /**For each grid point \f$ \mathbf{x} \f$ a motion vector \f$ d_{I0,I1}(\mathbf{x}) \f$ is computed. 
- * If the forward backward error \f[ EP_{FB} = || d_{I0,I1} + d_{I1,I0} || \f] - * is larger than threshold given by this function then the motion vector will not be used by the following - * vector field interpolation. \f$ d_{I1,I0} \f$ denotes the backward flow. Note, the forward backward test - * will only be applied if the threshold > 0. This may results into a doubled runtime for the motion estimation. - * @see getForwardBackward, setGridStep - */ - CV_WRAP virtual void setForwardBackward(float val) = 0; - /** @copybrief setForwardBackward - @see setForwardBackward - */ - CV_WRAP virtual float getForwardBackward() const = 0; - //! @brief Size of the grid to spawn the motion vectors. - /** For each grid point a motion vector is computed. Some motion vectors will be removed due to the forwatd backward - * threshold (if set >0). The rest will be the base of the vector field interpolation. - * @see getForwardBackward, setGridStep - */ - CV_WRAP virtual Size getGridStep() const = 0; - /** @copybrief getGridStep - * @see getGridStep - */ - CV_WRAP virtual void setGridStep(Size val) = 0; + float normSigma0; + /**< \$ \sigma_0 \$ parameter of the shrunken Hampel norm introduced in @cite Senst2012. If + * \$ \sigma_0 \$ = std::numeric_limits::max() the least-square estimator will be used + * instead of the M-estimator. Although M-estimator is more robust against outliers in the support + * region the least-square can be faster in computation. + */ + float normSigma1; + /**< \$ \sigma_1 \$ parameter of the shrunken Hampel norm introduced in @cite Senst2012. If + * \$ \sigma_1 \$ = std::numeric_limits::max() the least-square estimator will be used + * instead of the M-estimator. Although M-estimator is more robust against outliers in the support + * region the least-square can be faster in computation. + */ + int smallWinSize; + /**< Minimal window size of the support region. This parameter is only used if supportRegionType is SR_CROSS.
+ */ + int largeWinSize; + /**< Maximal window size of the support region. If supportRegionType is SR_FIXED this gives the exact support + * region size. The speed of the RLOF is related to the applied win sizes. The smaller the window size the lower is the runtime, + * but the more sensitive to noise is the method. + */ + int crossSegmentationThreshold; + /**< Color similarity threshold used by cross-based segmentation following @cite Senst2014 . + * (Only used if supportRegionType is SR_CROSS). With the cross-based segmentation + * motion boundaries can be computed more accurately. + */ + int maxLevel; + /**< Maximal number of pyramid levels used. The larger this value is the more likely it is + * to obtain accurate solutions for long-range motions. The runtime is linearly related to + * this parameter. + */ + bool useInitialFlow; + /**< Use next point list as initial values. A good initialization can improve the algorithm + * accuracy and reduce the runtime by a faster convergence of the iteration refinement. + */ + bool useIlluminationModel; + /**< Use the Gennert and Negahdaripour illumination model instead of the intensity brightness + * constraint. (Proposed in @cite Senst2016 ) This model is defined as follows: + * \f[ I(\mathbf{x},t) + m \cdot I(\mathbf{x},t) + c = I(\mathbf{x},t+1) \f] + * and contains with m and c a multiplicative and additive term which makes the estimate + * more robust against illumination changes. The computational complexity is increased by + * enabling the illumination model. + */ + bool useGlobalMotionPrior; + /**< Use global motion prior initialisation has been introduced in @cite Senst2016 . It + * allows to be more accurate for long-range motion. The computational complexity is + * slightly increased by enabling the global motion prior initialisation. + */ + int maxIteration; + /**< Number of maximal iterations used for the iterative refinement. Lower values can + * reduce the runtime but also the accuracy.
+ */ + float minEigenValue; + /**< Threshold for the minimal eigenvalue of the gradient matrix defines when to abort the + * iterative refinement. + */ + float globalMotionRansacThreshold; + /**< To apply the global motion prior motion vectors will be computed on a regularly sampled grid which + * are the basis for Homography estimation using RANSAC. The reprojection threshold is based on + * n-th percentile (given by this value [0 ... 100]) of the forward backward errors relating to + * the motion vectors. See @cite Senst2016 for more details. + */ + }; - //! @brief Interpolation used to compute the dense optical flow. - /** Two interpolation algorithms have been supported - * - **INTERP_GEO** applies the fast geodesic interpolation, see @cite Geister2016. - * - **INTERP_EPIC_RESIDUAL** applies the edge-preserving interpolation, see @cite Revaud2015,Geistert2016. - * @see ximgproc::EdgeAwareInterpolator, getInterpolation - */ - CV_WRAP virtual void setInterpolation(InterpolationType val) = 0; - /** @copybrief setInterpolation - * @see ximgproc::EdgeAwareInterpolator, setInterpolation - */ - CV_WRAP virtual InterpolationType getInterpolation() const = 0; - //! @brief see ximgproc::EdgeAwareInterpolator() K value. - /** K is a number of nearest-neighbor matches considered, when fitting a locally affine - * model. Usually it should be around 128. However, lower values would make the interpolation noticeably faster. - * @see ximgproc::EdgeAwareInterpolator, setEPICK - */ - CV_WRAP virtual int getEPICK() const = 0; - /** @copybrief getEPICK - * @see ximgproc::EdgeAwareInterpolator, getEPICK - */ - CV_WRAP virtual void setEPICK(int val) = 0; - //! @brief see ximgproc::EdgeAwareInterpolator() sigma value. - /** Sigma is a parameter defining how fast the weights decrease in the locally-weighted affine - * fitting. Higher values can help preserve fine details, lower values can help to get rid of noise in the - * output flow.
- * @see ximgproc::EdgeAwareInterpolator, setEPICSigma - */ - CV_WRAP virtual float getEPICSigma() const = 0; - /** @copybrief getEPICSigma - * @see ximgproc::EdgeAwareInterpolator, getEPICSigma - */ - CV_WRAP virtual void setEPICSigma(float val) = 0; - //! @brief see ximgproc::EdgeAwareInterpolator() lambda value. - /** Lambda is a parameter defining the weight of the edge-aware term in geodesic distance, - * should be in the range of 0 to 1000. - * @see ximgproc::EdgeAwareInterpolator, setEPICSigma - */ - CV_WRAP virtual float getEPICLambda() const = 0; - /** @copybrief getEPICLambda - * @see ximgproc::EdgeAwareInterpolator, getEPICLambda - */ - CV_WRAP virtual void setEPICLambda(float val) = 0; - //! @brief see ximgproc::EdgeAwareInterpolator(). - /** Sets the respective fastGlobalSmootherFilter() parameter. - * @see ximgproc::EdgeAwareInterpolator, setFgsLambda - */ - CV_WRAP virtual float getFgsLambda() const = 0; - /** @copybrief getFgsLambda - * @see ximgproc::EdgeAwareInterpolator, ximgproc::fastGlobalSmootherFilter, getFgsLambda - */ - CV_WRAP virtual void setFgsLambda(float val) = 0; - //! @brief see ximgproc::EdgeAwareInterpolator(). - /** Sets the respective fastGlobalSmootherFilter() parameter. - * @see ximgproc::EdgeAwareInterpolator, ximgproc::fastGlobalSmootherFilter, setFgsSigma - */ - CV_WRAP virtual float getFgsSigma() const = 0; - /** @copybrief getFgsSigma - * @see ximgproc::EdgeAwareInterpolator, ximgproc::fastGlobalSmootherFilter, getFgsSigma - */ - CV_WRAP virtual void setFgsSigma(float val) = 0; - //! @brief enables ximgproc::fastGlobalSmootherFilter - /** - * @see getUsePostProc - */ - CV_WRAP virtual void setUsePostProc(bool val) = 0; - /** @copybrief setUsePostProc - * @see ximgproc::fastGlobalSmootherFilter, setUsePostProc - */ - CV_WRAP virtual bool getUsePostProc() const = 0; - //! 
@brief Creates instance of optflow::DenseRLOFOpticalFlow - /** - * @param rlofParam see optflow::RLOFOpticalFlowParameter - * @param forwardBackwardThreshold see setForwardBackward - * @param gridStep see setGridStep - * @param epicK see setEPICK - * @param epicSigma see setEPICSigma - * @param epicLambda see setEPICLambda - * @param fgsLambda see setFgsLambda - * @param fgsSigma see setFgsSigma - */ - CV_WRAP static Ptr create( - const RLOFOpticalFlowParameter & rlofParam = RLOFOpticalFlowParameter(), - float forwardBackwardThreshold = 1.f, - Size gridStep = Size(6, 6), - InterpolationType interp_type = InterpolationType::INTERP_EPIC, - int epicK = 128, - float epicSigma = 0.05f, - float epicLambda = 999.0f, - float fgsLambda = 500.0f, - float fgsSigma = 1.5f); - }; + /** @brief Fast dense optical flow computation based on robust local optical flow (RLOF) algorithms and sparse-to-dense interpolation + * scheme. + * + * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as + * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). + * + * The sparse-to-dense interpolation scheme allows for fast computation of dense optical flow using RLOF (see @cite Geistert2016). + * For this scheme the following steps are applied: + * -# motion vector seeded at a regular sampled grid are computed. The sparsity of this grid can be configured with setGridStep + * -# (optinally) errornous motion vectors are filter based on the forward backward confidence. The threshold can be configured + * with setForwardBackward. The filter is only applied if the threshold >0 but than the runtime is doubled due to the estimation + * of the backward flow. + * -# Vector field interpolation is applied to the motion vector set to obtain a dense vector field. + * + * For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details. 
+ * Parameters have been described in @cite Senst2012 @cite Senst2013 @cite Senst2014 and @cite Senst2016. + * + * @note If the grid size is set to (1,1) and the forward backward threshold <= 0 then the dense optical flow field is purely + * computed with the RLOF. + * + * @see optflow::calcOpticalFlowDenseRLOF(), optflow::RLOFOpticalFlowParameter + */ + class CV_EXPORTS_W DenseRLOFOpticalFlow : public DenseOpticalFlow + { + public: + //! @brief Configuration of the RLOF algorithm. + /** + @see optflow::RLOFOpticalFlowParameter, getRLOFOpticalFlowParameter + */ + CV_WRAP virtual void setRLOFOpticalFlowParameter(Ptr val) = 0; + /** @copybrief setRLOFOpticalFlowParameter + @see optflow::RLOFOpticalFlowParameter, setRLOFOpticalFlowParameter + */ + CV_WRAP virtual Ptr getRLOFOpticalFlowParameter() const = 0; + //! @brief Threshold for the forward backward confidence check + /**For each grid point \f$ \mathbf{x} \f$ a motion vector \f$ d_{I0,I1}(\mathbf{x}) \f$ is computed. + * If the forward backward error \f[ EP_{FB} = || d_{I0,I1} + d_{I1,I0} || \f] + * is larger than threshold given by this function then the motion vector will not be used by the following + * vector field interpolation. \f$ d_{I1,I0} \f$ denotes the backward flow. Note, the forward backward test + * will only be applied if the threshold > 0. This may result in a doubled runtime for the motion estimation. + * @see getForwardBackward, setGridStep + */ + CV_WRAP virtual void setForwardBackward(float val) = 0; + /** @copybrief setForwardBackward + @see setForwardBackward + */ + CV_WRAP virtual float getForwardBackward() const = 0; + //! @brief Size of the grid to spawn the motion vectors. + /** For each grid point a motion vector is computed. Some motion vectors will be removed due to the forward backward + * threshold (if set >0). The rest will be the base of the vector field interpolation.
+ * @see getForwardBackward, setGridStep + */ + CV_WRAP virtual Size getGridStep() const = 0; + /** @copybrief getGridStep + * @see getGridStep + */ + CV_WRAP virtual void setGridStep(Size val) = 0; + + //! @brief Interpolation used to compute the dense optical flow. + /** Two interpolation algorithms have been supported + * - **INTERP_GEO** applies the fast geodesic interpolation, see @cite Geister2016. + * - **INTERP_EPIC_RESIDUAL** applies the edge-preserving interpolation, see @cite Revaud2015,Geistert2016. + * @see ximgproc::EdgeAwareInterpolator, getInterpolation + */ + CV_WRAP virtual void setInterpolation(InterpolationType val) = 0; + /** @copybrief setInterpolation + * @see ximgproc::EdgeAwareInterpolator, setInterpolation + */ + CV_WRAP virtual InterpolationType getInterpolation() const = 0; + //! @brief see ximgproc::EdgeAwareInterpolator() K value. + /** K is a number of nearest-neighbor matches considered, when fitting a locally affine + * model. Usually it should be around 128. However, lower values would make the interpolation noticeably faster. + * @see ximgproc::EdgeAwareInterpolator, setEPICK + */ + CV_WRAP virtual int getEPICK() const = 0; + /** @copybrief getEPICK + * @see ximgproc::EdgeAwareInterpolator, getEPICK + */ + CV_WRAP virtual void setEPICK(int val) = 0; + //! @brief see ximgproc::EdgeAwareInterpolator() sigma value. + /** Sigma is a parameter defining how fast the weights decrease in the locally-weighted affine + * fitting. Higher values can help preserve fine details, lower values can help to get rid of noise in the + * output flow. + * @see ximgproc::EdgeAwareInterpolator, setEPICSigma + */ + CV_WRAP virtual float getEPICSigma() const = 0; + /** @copybrief getEPICSigma + * @see ximgproc::EdgeAwareInterpolator, getEPICSigma + */ + CV_WRAP virtual void setEPICSigma(float val) = 0; + //! @brief see ximgproc::EdgeAwareInterpolator() lambda value. 
+ /** Lambda is a parameter defining the weight of the edge-aware term in geodesic distance, + * should be in the range of 0 to 1000. + * @see ximgproc::EdgeAwareInterpolator, setEPICSigma + */ + CV_WRAP virtual float getEPICLambda() const = 0; + /** @copybrief getEPICLambda + * @see ximgproc::EdgeAwareInterpolator, getEPICLambda + */ + CV_WRAP virtual void setEPICLambda(float val) = 0; + //! @brief see ximgproc::EdgeAwareInterpolator(). + /** Sets the respective fastGlobalSmootherFilter() parameter. + * @see ximgproc::EdgeAwareInterpolator, setFgsLambda + */ + CV_WRAP virtual float getFgsLambda() const = 0; + /** @copybrief getFgsLambda + * @see ximgproc::EdgeAwareInterpolator, ximgproc::fastGlobalSmootherFilter, getFgsLambda + */ + CV_WRAP virtual void setFgsLambda(float val) = 0; + //! @brief see ximgproc::EdgeAwareInterpolator(). + /** Sets the respective fastGlobalSmootherFilter() parameter. + * @see ximgproc::EdgeAwareInterpolator, ximgproc::fastGlobalSmootherFilter, setFgsSigma + */ + CV_WRAP virtual float getFgsSigma() const = 0; + /** @copybrief getFgsSigma + * @see ximgproc::EdgeAwareInterpolator, ximgproc::fastGlobalSmootherFilter, getFgsSigma + */ + CV_WRAP virtual void setFgsSigma(float val) = 0; + //! @brief enables ximgproc::fastGlobalSmootherFilter + /** + * @see getUsePostProc + */ + CV_WRAP virtual void setUsePostProc(bool val) = 0; + /** @copybrief setUsePostProc + * @see ximgproc::fastGlobalSmootherFilter, setUsePostProc + */ + CV_WRAP virtual bool getUsePostProc() const = 0; + //! 
@brief Creates instance of optflow::DenseRLOFOpticalFlow + /** + * @param rlofParam see optflow::RLOFOpticalFlowParameter + * @param forwardBackwardThreshold see setForwardBackward + * @param gridStep see setGridStep + * @param epicK see setEPICK + * @param epicSigma see setEPICSigma + * @param epicLambda see setEPICLambda + * @param fgsLambda see setFgsLambda + * @param fgsSigma see setFgsSigma + */ + CV_WRAP static Ptr create( + Ptr rlofParam = Ptr(), + float forwardBackwardThreshold = 1.f, + Size gridStep = Size(6, 6), + InterpolationType interp_type = InterpolationType::INTERP_EPIC, + int epicK = 128, + float epicSigma = 0.05f, + float epicLambda = 999.0f, + float fgsLambda = 500.0f, + float fgsSigma = 1.5f); + }; - /** @brief Class used for calculation sparse optical flow and feature tracking with robust local optical flow (RLOF) algorithms. - * - * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as - * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). - * - * For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details. - * Parameters have been described in @cite Senst2012, @cite Senst2013, @cite Senst2014 and @cite Senst2016. - @sa optflow::calcOpticalFlowSparseRLOF(), optflow::RLOFOpticalFlowParameter + /** @brief Class used for calculation sparse optical flow and feature tracking with robust local optical flow (RLOF) algorithms. + * + * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as + * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). + * + * For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details. + * Parameters have been described in @cite Senst2012, @cite Senst2013, @cite Senst2014 and @cite Senst2016. 
+ @sa optflow::calcOpticalFlowSparseRLOF(), optflow::RLOFOpticalFlowParameter - */ - class CV_EXPORTS_W SparseRLOFOpticalFlow : public SparseOpticalFlow - { - public: - /** @copydoc DenseRLOFOpticalFlow::setRLOFOpticalFlowParameter - */ - CV_WRAP virtual void setRLOFOpticalFlowParameter(RLOFOpticalFlowParameter val) = 0; - /** @copybrief setRLOFOpticalFlowParameter - * @see setRLOFOpticalFlowParameter - */ - CV_WRAP virtual RLOFOpticalFlowParameter getRLOFOpticalFlowParameter() const = 0; - //! @brief Threshold for the forward backward confidence check - /** For each feature point a motion vector \f$ d_{I0,I1}(\mathbf{x}) \f$ is computed. - * If the forward backward error \f[ EP_{FB} = || d_{I0,I1} + d_{I1,I0} || \f] - * is larger than threshold given by this function then the status will not be used by the following - * vector field interpolation. \f$ d_{I1,I0} \f$ denotes the backward flow. Note, the forward backward test - * will only be applied if the threshold > 0. This may results into a doubled runtime for the motion estimation. - * @see setForwardBackward - */ - CV_WRAP virtual void setForwardBackward(float val) = 0; - /** @copybrief setForwardBackward - * @see setForwardBackward - */ - CV_WRAP virtual float getForwardBackward() const = 0; - - //! @brief Creates instance of SparseRLOFOpticalFlow - /** - * @param rlofParam see setRLOFOpticalFlowParameter - * @param forwardBackwardThreshold see setForwardBackward - */ - CV_WRAP static Ptr create( - const RLOFOpticalFlowParameter & rlofParam = RLOFOpticalFlowParameter(), - float forwardBackwardThreshold = 1.f); + */ + class CV_EXPORTS_W SparseRLOFOpticalFlow : public SparseOpticalFlow + { + public: + /** @copydoc DenseRLOFOpticalFlow::setRLOFOpticalFlowParameter + */ + CV_WRAP virtual void setRLOFOpticalFlowParameter(Ptr val) = 0; + /** @copybrief setRLOFOpticalFlowParameter + * @see setRLOFOpticalFlowParameter + */ + CV_WRAP virtual Ptr getRLOFOpticalFlowParameter() const = 0; + //! 
@brief Threshold for the forward backward confidence check + /** For each feature point a motion vector \f$ d_{I0,I1}(\mathbf{x}) \f$ is computed. + * If the forward backward error \f[ EP_{FB} = || d_{I0,I1} + d_{I1,I0} || \f] + * is larger than threshold given by this function then the status will not be used by the following + * vector field interpolation. \f$ d_{I1,I0} \f$ denotes the backward flow. Note, the forward backward test + * will only be applied if the threshold > 0. This may results into a doubled runtime for the motion estimation. + * @see setForwardBackward + */ + CV_WRAP virtual void setForwardBackward(float val) = 0; + /** @copybrief setForwardBackward + * @see setForwardBackward + */ + CV_WRAP virtual float getForwardBackward() const = 0; + + //! @brief Creates instance of SparseRLOFOpticalFlow + /** + * @param rlofParam see setRLOFOpticalFlowParameter + * @param forwardBackwardThreshold see setForwardBackward + */ + CV_WRAP static Ptr create( + Ptr rlofParam = Ptr(), + float forwardBackwardThreshold = 1.f); - }; + }; - /** @brief Fast dense optical flow computation based on robust local optical flow (RLOF) algorithms and sparse-to-dense interpolation scheme. - * - * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as - * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). - * - * The sparse-to-dense interpolation scheme allows for fast computation of dense optical flow using RLOF (see @cite Geistert2016). - * For this scheme the following steps are applied: - * -# motion vector seeded at a regular sampled grid are computed. The sparsity of this grid can be configured with setGridStep - * -# (optinally) errornous motion vectors are filter based on the forward backward confidence. The threshold can be configured - * with setForwardBackward. 
The filter is only applied if the threshold >0 but than the runtime is doubled due to the estimation - * of the backward flow. - * -# Vector field interpolation is applied to the motion vector set to obtain a dense vector field. - * - * @param I0 first 8-bit input image. If The cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType - * = SupportRegionType::SR_CROSS) image has to be a 8-bit 3 channel image. - * @param I1 second 8-bit input image. If The cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType - * = SupportRegionType::SR_CROSS) image has to be a 8-bit 3 channel image. - * @param flow computed flow image that has the same size as I0 and type CV_32FC2. - * @param rlofParam see optflow::RLOFOpticalFlowParameter - * @param forwardBackwardThreshold Threshold for the forward backward confidence check. - * For each grid point \f$ \mathbf{x} \f$ a motion vector \f$ d_{I0,I1}(\mathbf{x}) \f$ is computed. - * If the forward backward error \f[ EP_{FB} = || d_{I0,I1} + d_{I1,I0} || \f] - * is larger than threshold given by this function then the motion vector will not be used by the following - * vector field interpolation. \f$ d_{I1,I0} \f$ denotes the backward flow. Note, the forward backward test - * will only be applied if the threshold > 0. This may results into a doubled runtime for the motion estimation. - * @param gridStep Size of the grid to spawn the motion vectors. For each grid point a motion vector is computed. - * Some motion vectors will be removed due to the forwatd backward threshold (if set >0). The rest will be the - * base of the vector field interpolation. - * @param epicK see ximgproc::EdgeAwareInterpolator() sets the respective parameter. - * @param epicSigma see ximgproc::EdgeAwareInterpolator() sets the respective parameter. - * @param epicLambda see ximgproc::EdgeAwareInterpolator() sets the respective parameter. 
- * @param use_post_proc enables ximgproc::fastGlobalSmootherFilter() parameter. - * @param fgsLambda sets the respective ximgproc::fastGlobalSmootherFilter() parameter. - * @param fgsSigma sets the respective ximgproc::fastGlobalSmootherFilter() parameter. - * - * Parameters have been described in @cite Senst2012, @cite Senst2013, @cite Senst2014, @cite Senst2016. - * For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details. - * @note If the grid size is set to (1,1) and the forward backward threshold <= 0 that the dense optical flow field is purely - * computed with the RLOF. - * - * @sa optflow::DenseRLOFOpticalFlow, optflow::RLOFOpticalFlowParameter - */ - CV_EXPORTS_W void calcOpticalFlowDenseRLOF(InputArray I0, InputArray I1, InputOutputArray flow, - RLOFOpticalFlowParameter rlofParam = RLOFOpticalFlowParameter(), - float forewardBackwardThreshold = 0, Size gridStep = Size(6, 6), - DenseRLOFOpticalFlow::InterpolationType interp_type = DenseRLOFOpticalFlow::InterpolationType::INTERP_EPIC, - int epicK = 128, float epicSigma = 0.05f, - bool use_post_proc = true, float fgsLambda = 500.0f, float fgsSigma = 1.5f); + /** @brief Fast dense optical flow computation based on robust local optical flow (RLOF) algorithms and sparse-to-dense interpolation scheme. + * + * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as + * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). + * + * The sparse-to-dense interpolation scheme allows for fast computation of dense optical flow using RLOF (see @cite Geistert2016). + * For this scheme the following steps are applied: + * -# motion vector seeded at a regular sampled grid are computed. The sparsity of this grid can be configured with setGridStep + * -# (optinally) errornous motion vectors are filter based on the forward backward confidence. The threshold can be configured + * with setForwardBackward. 
The filter is only applied if the threshold >0 but than the runtime is doubled due to the estimation + * of the backward flow. + * -# Vector field interpolation is applied to the motion vector set to obtain a dense vector field. + * + * @param I0 first 8-bit input image. If The cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType + * = SupportRegionType::SR_CROSS) image has to be a 8-bit 3 channel image. + * @param I1 second 8-bit input image. If The cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType + * = SupportRegionType::SR_CROSS) image has to be a 8-bit 3 channel image. + * @param flow computed flow image that has the same size as I0 and type CV_32FC2. + * @param rlofParam see optflow::RLOFOpticalFlowParameter + * @param forwardBackwardThreshold Threshold for the forward backward confidence check. + * For each grid point \f$ \mathbf{x} \f$ a motion vector \f$ d_{I0,I1}(\mathbf{x}) \f$ is computed. + * If the forward backward error \f[ EP_{FB} = || d_{I0,I1} + d_{I1,I0} || \f] + * is larger than threshold given by this function then the motion vector will not be used by the following + * vector field interpolation. \f$ d_{I1,I0} \f$ denotes the backward flow. Note, the forward backward test + * will only be applied if the threshold > 0. This may results into a doubled runtime for the motion estimation. + * @param gridStep Size of the grid to spawn the motion vectors. For each grid point a motion vector is computed. + * Some motion vectors will be removed due to the forwatd backward threshold (if set >0). The rest will be the + * base of the vector field interpolation. + * @param epicK see ximgproc::EdgeAwareInterpolator() sets the respective parameter. + * @param epicSigma see ximgproc::EdgeAwareInterpolator() sets the respective parameter. + * @param epicLambda see ximgproc::EdgeAwareInterpolator() sets the respective parameter. 
+ * @param use_post_proc enables ximgproc::fastGlobalSmootherFilter() parameter. + * @param fgsLambda sets the respective ximgproc::fastGlobalSmootherFilter() parameter. + * @param fgsSigma sets the respective ximgproc::fastGlobalSmootherFilter() parameter. + * + * Parameters have been described in @cite Senst2012, @cite Senst2013, @cite Senst2014, @cite Senst2016. + * For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details. + * @note If the grid size is set to (1,1) and the forward backward threshold <= 0 that the dense optical flow field is purely + * computed with the RLOF. + * + * @sa optflow::DenseRLOFOpticalFlow, optflow::RLOFOpticalFlowParameter + */ + CV_EXPORTS_W void calcOpticalFlowDenseRLOF(InputArray I0, InputArray I1, InputOutputArray flow, + Ptr rlofParam = Ptr(), + float forewardBackwardThreshold = 0, Size gridStep = Size(6, 6), + InterpolationType interp_type = InterpolationType::INTERP_EPIC, + int epicK = 128, float epicSigma = 0.05f, + bool use_post_proc = true, float fgsLambda = 500.0f, float fgsSigma = 1.5f); - /** @brief Calculates fast optical flow for a sparse feature set using the robust local optical flow (RLOF) similar - * to optflow::calcOpticalFlowPyrLK(). - * - * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as - * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). - * - * @param I0 first 8-bit input image. If The cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType - * = SupportRegionType::SR_CROSS) image has to be a 8-bit 3 channel image. - * @param I1 second 8-bit input image. If The cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType - * = SupportRegionType::SR_CROSS) image has to be a 8-bit 3 channel image. 
- * @param prevPts vector of 2D points for which the flow needs to be found; point coordinates must be single-precision - * floating-point numbers. - * @param nextPts output vector of 2D points (with single-precision floating-point coordinates) containing the calculated - * new positions of input features in the second image; when optflow::RLOFOpticalFlowParameter::useInitialFlow variable is true the vector must - * have the same size as in the input and contain the initialization point correspondences. - * @param status output status vector (of unsigned chars); each element of the vector is set to 1 if the flow for the - * corresponding features has passed the forward backward check. - * @param err output vector of errors; each element of the vector is set to the forward backward error for the corresponding feature. - * @param rlofParam see optflow::RLOFOpticalFlowParameter - * @param forwardBackwardThreshold Threshold for the forward backward confidence check. If forewardBackwardThreshold <=0 the forward - * - * Parameters have been described in @cite Senst2012, @cite Senst2013, @cite Senst2014 and @cite Senst2016. - * For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details. - */ - CV_EXPORTS_W void calcOpticalFlowSparseRLOF(InputArray prevImg, InputArray nextImg, - InputArray prevPts, InputOutputArray nextPts, - OutputArray status, OutputArray err, - RLOFOpticalFlowParameter rlofParam = RLOFOpticalFlowParameter(), - float forewardBackwardThreshold = 0); + /** @brief Calculates fast optical flow for a sparse feature set using the robust local optical flow (RLOF) similar + * to optflow::calcOpticalFlowPyrLK(). + * + * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as + * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). + * + * @param I0 first 8-bit input image. 
If The cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType + * = SupportRegionType::SR_CROSS) image has to be a 8-bit 3 channel image. + * @param I1 second 8-bit input image. If The cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType + * = SupportRegionType::SR_CROSS) image has to be a 8-bit 3 channel image. + * @param prevPts vector of 2D points for which the flow needs to be found; point coordinates must be single-precision + * floating-point numbers. + * @param nextPts output vector of 2D points (with single-precision floating-point coordinates) containing the calculated + * new positions of input features in the second image; when optflow::RLOFOpticalFlowParameter::useInitialFlow variable is true the vector must + * have the same size as in the input and contain the initialization point correspondences. + * @param status output status vector (of unsigned chars); each element of the vector is set to 1 if the flow for the + * corresponding features has passed the forward backward check. + * @param err output vector of errors; each element of the vector is set to the forward backward error for the corresponding feature. + * @param rlofParam see optflow::RLOFOpticalFlowParameter + * @param forwardBackwardThreshold Threshold for the forward backward confidence check. If forewardBackwardThreshold <=0 the forward + * + * Parameters have been described in @cite Senst2012, @cite Senst2013, @cite Senst2014 and @cite Senst2016. + * For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details. + */ + CV_EXPORTS_W void calcOpticalFlowSparseRLOF(InputArray prevImg, InputArray nextImg, + InputArray prevPts, InputOutputArray nextPts, + OutputArray status, OutputArray err, + Ptr rlofParam = Ptr(), + float forewardBackwardThreshold = 0); - //! 
Additional interface to the Dense RLOF algorithm - optflow::calcOpticalFlowDenseRLOF() - CV_EXPORTS_W Ptr createOptFlow_DenseRLOF(); + //! Additional interface to the Dense RLOF algorithm - optflow::calcOpticalFlowDenseRLOF() + CV_EXPORTS_W Ptr createOptFlow_DenseRLOF(); - //! Additional interface to the Sparse RLOF algorithm - optflow::calcOpticalFlowSparseRLOF() - CV_EXPORTS_W Ptr createOptFlow_SparseRLOF(); - //! @} - } + //! Additional interface to the Sparse RLOF algorithm - optflow::calcOpticalFlowSparseRLOF() + CV_EXPORTS_W Ptr createOptFlow_SparseRLOF(); + //! @} +} } #endif diff --git a/modules/optflow/include/opencv2/optflow_o.hpp b/modules/optflow/include/opencv2/optflow_o.hpp deleted file mode 100644 index 65802750d1a..00000000000 --- a/modules/optflow/include/opencv2/optflow_o.hpp +++ /dev/null @@ -1,309 +0,0 @@ -/* -By downloading, copying, installing or using the software you agree to this -license. If you do not agree to this license, do not download, install, -copy or use the software. - - - License Agreement - For Open Source Computer Vision Library - (3-clause BSD License) - -Copyright (C) 2013, OpenCV Foundation, all rights reserved. -Third party copyrights are property of their respective owners. - -Redistribution and use in source and binary forms, with or without modification, -are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - - * Neither the names of the copyright holders nor the names of the contributors - may be used to endorse or promote products derived from this software - without specific prior written permission. 
- -This software is provided by the copyright holders and contributors "as is" and -any express or implied warranties, including, but not limited to, the implied -warranties of merchantability and fitness for a particular purpose are -disclaimed. In no event shall copyright holders or contributors be liable for -any direct, indirect, incidental, special, exemplary, or consequential damages -(including, but not limited to, procurement of substitute goods or services; -loss of use, data, or profits; or business interruption) however caused -and on any theory of liability, whether in contract, strict liability, -or tort (including negligence or otherwise) arising in any way out of -the use of this software, even if advised of the possibility of such damage. -*/ - -#ifndef __OPENCV_OPTFLOW_HPP__ -#define __OPENCV_OPTFLOW_HPP__ - -#include "opencv2/core.hpp" -#include "opencv2/video.hpp" - -/** -@defgroup optflow Optical Flow Algorithms - -Dense optical flow algorithms compute motion for each point: - -- cv::optflow::calcOpticalFlowSF -- cv::optflow::createOptFlow_DeepFlow - -Motion templates is alternative technique for detecting motion and computing its direction. -See samples/motempl.py. - -- cv::motempl::updateMotionHistory -- cv::motempl::calcMotionGradient -- cv::motempl::calcGlobalOrientation -- cv::motempl::segmentMotion - -Functions reading and writing .flo files in "Middlebury" format, see: - -- cv::optflow::readOpticalFlow -- cv::optflow::writeOpticalFlow - - */ - -#include "opencv2/optflow/pcaflow.hpp" -#include "opencv2/optflow/sparse_matching_gpc.hpp" - -namespace cv -{ -namespace optflow -{ - -//! @addtogroup optflow -//! @{ - -/** @overload */ -CV_EXPORTS_W void calcOpticalFlowSF( InputArray from, InputArray to, OutputArray flow, - int layers, int averaging_block_size, int max_flow); - -/** @brief Calculate an optical flow using "SimpleFlow" algorithm. - -@param from First 8-bit 3-channel image. 
-@param to Second 8-bit 3-channel image of the same size as prev -@param flow computed flow image that has the same size as prev and type CV_32FC2 -@param layers Number of layers -@param averaging_block_size Size of block through which we sum up when calculate cost function -for pixel -@param max_flow maximal flow that we search at each level -@param sigma_dist vector smooth spatial sigma parameter -@param sigma_color vector smooth color sigma parameter -@param postprocess_window window size for postprocess cross bilateral filter -@param sigma_dist_fix spatial sigma for postprocess cross bilateralf filter -@param sigma_color_fix color sigma for postprocess cross bilateral filter -@param occ_thr threshold for detecting occlusions -@param upscale_averaging_radius window size for bilateral upscale operation -@param upscale_sigma_dist spatial sigma for bilateral upscale operation -@param upscale_sigma_color color sigma for bilateral upscale operation -@param speed_up_thr threshold to detect point with irregular flow - where flow should be -recalculated after upscale - -See @cite Tao2012 . And site of project - . - -@note - - An example using the simpleFlow algorithm can be found at samples/simpleflow_demo.cpp - */ -CV_EXPORTS_W void calcOpticalFlowSF( InputArray from, InputArray to, OutputArray flow, int layers, - int averaging_block_size, int max_flow, - double sigma_dist, double sigma_color, int postprocess_window, - double sigma_dist_fix, double sigma_color_fix, double occ_thr, - int upscale_averaging_radius, double upscale_sigma_dist, - double upscale_sigma_color, double speed_up_thr ); - -/** @brief Fast dense optical flow based on PyrLK sparse matches interpolation. - -@param from first 8-bit 3-channel or 1-channel image. -@param to second 8-bit 3-channel or 1-channel image of the same size as from -@param flow computed flow image that has the same size as from and CV_32FC2 type -@param grid_step stride used in sparse match computation. 
Lower values usually - result in higher quality but slow down the algorithm. -@param k number of nearest-neighbor matches considered, when fitting a locally affine - model. Lower values can make the algorithm noticeably faster at the cost of - some quality degradation. -@param sigma parameter defining how fast the weights decrease in the locally-weighted affine - fitting. Higher values can help preserve fine details, lower values can help to get rid - of the noise in the output flow. -@param use_post_proc defines whether the ximgproc::fastGlobalSmootherFilter() is used - for post-processing after interpolation -@param fgs_lambda see the respective parameter of the ximgproc::fastGlobalSmootherFilter() -@param fgs_sigma see the respective parameter of the ximgproc::fastGlobalSmootherFilter() - */ -CV_EXPORTS_W void calcOpticalFlowSparseToDense ( InputArray from, InputArray to, OutputArray flow, - int grid_step = 8, int k = 128, float sigma = 0.05f, - bool use_post_proc = true, float fgs_lambda = 500.0f, - float fgs_sigma = 1.5f ); - -/** @brief DeepFlow optical flow algorithm implementation. - -The class implements the DeepFlow optical flow algorithm described in @cite Weinzaepfel2013 . See -also . 
-Parameters - class fields - that may be modified after creating a class instance: -- member float alpha -Smoothness assumption weight -- member float delta -Color constancy assumption weight -- member float gamma -Gradient constancy weight -- member float sigma -Gaussian smoothing parameter -- member int minSize -Minimal dimension of an image in the pyramid (next, smaller images in the pyramid are generated -until one of the dimensions reaches this size) -- member float downscaleFactor -Scaling factor in the image pyramid (must be \< 1) -- member int fixedPointIterations -How many iterations on each level of the pyramid -- member int sorIterations -Iterations of Succesive Over-Relaxation (solver) -- member float omega -Relaxation factor in SOR - */ -CV_EXPORTS_W Ptr createOptFlow_DeepFlow(); - -//! Additional interface to the SimpleFlow algorithm - calcOpticalFlowSF() -CV_EXPORTS_W Ptr createOptFlow_SimpleFlow(); - -//! Additional interface to the Farneback's algorithm - calcOpticalFlowFarneback() -CV_EXPORTS_W Ptr createOptFlow_Farneback(); - -//! Additional interface to the SparseToDenseFlow algorithm - calcOpticalFlowSparseToDense() -CV_EXPORTS_W Ptr createOptFlow_SparseToDense(); - -/** @brief "Dual TV L1" Optical Flow Algorithm. - -The class implements the "Dual TV L1" optical flow algorithm described in @cite Zach2007 and -@cite Javier2012 . -Here are important members of the class that control the algorithm, which you can set after -constructing the class instance: - -- member double tau - Time step of the numerical scheme. - -- member double lambda - Weight parameter for the data term, attachment parameter. This is the most relevant - parameter, which determines the smoothness of the output. The smaller this parameter is, - the smoother the solutions we obtain. It depends on the range of motions of the images, so - its value should be adapted to each image sequence. - -- member double theta - Weight parameter for (u - v)\^2, tightness parameter. 
It serves as a link between the - attachment and the regularization terms. In theory, it should have a small value in order - to maintain both parts in correspondence. The method is stable for a large range of values - of this parameter. - -- member int nscales - Number of scales used to create the pyramid of images. - -- member int warps - Number of warpings per scale. Represents the number of times that I1(x+u0) and grad( - I1(x+u0) ) are computed per scale. This is a parameter that assures the stability of the - method. It also affects the running time, so it is a compromise between speed and - accuracy. - -- member double epsilon - Stopping criterion threshold used in the numerical scheme, which is a trade-off between - precision and running time. A small value will yield more accurate solutions at the - expense of a slower convergence. - -- member int iterations - Stopping criterion iterations number used in the numerical scheme. - -C. Zach, T. Pock and H. Bischof, "A Duality Based Approach for Realtime TV-L1 Optical Flow". -Javier Sanchez, Enric Meinhardt-Llopis and Gabriele Facciolo. "TV-L1 Optical Flow Estimation". -*/ -class CV_EXPORTS_W DualTVL1OpticalFlow : public DenseOpticalFlow -{ -public: - //! @brief Time step of the numerical scheme - /** @see setTau */ - CV_WRAP virtual double getTau() const = 0; - /** @copybrief getTau @see getTau */ - CV_WRAP virtual void setTau(double val) = 0; - //! @brief Weight parameter for the data term, attachment parameter - /** @see setLambda */ - CV_WRAP virtual double getLambda() const = 0; - /** @copybrief getLambda @see getLambda */ - CV_WRAP virtual void setLambda(double val) = 0; - //! @brief Weight parameter for (u - v)^2, tightness parameter - /** @see setTheta */ - CV_WRAP virtual double getTheta() const = 0; - /** @copybrief getTheta @see getTheta */ - CV_WRAP virtual void setTheta(double val) = 0; - //! 
@brief coefficient for additional illumination variation term - /** @see setGamma */ - CV_WRAP virtual double getGamma() const = 0; - /** @copybrief getGamma @see getGamma */ - CV_WRAP virtual void setGamma(double val) = 0; - //! @brief Number of scales used to create the pyramid of images - /** @see setScalesNumber */ - CV_WRAP virtual int getScalesNumber() const = 0; - /** @copybrief getScalesNumber @see getScalesNumber */ - CV_WRAP virtual void setScalesNumber(int val) = 0; - //! @brief Number of warpings per scale - /** @see setWarpingsNumber */ - CV_WRAP virtual int getWarpingsNumber() const = 0; - /** @copybrief getWarpingsNumber @see getWarpingsNumber */ - CV_WRAP virtual void setWarpingsNumber(int val) = 0; - //! @brief Stopping criterion threshold used in the numerical scheme, which is a trade-off between precision and running time - /** @see setEpsilon */ - CV_WRAP virtual double getEpsilon() const = 0; - /** @copybrief getEpsilon @see getEpsilon */ - CV_WRAP virtual void setEpsilon(double val) = 0; - //! @brief Inner iterations (between outlier filtering) used in the numerical scheme - /** @see setInnerIterations */ - CV_WRAP virtual int getInnerIterations() const = 0; - /** @copybrief getInnerIterations @see getInnerIterations */ - CV_WRAP virtual void setInnerIterations(int val) = 0; - //! @brief Outer iterations (number of inner loops) used in the numerical scheme - /** @see setOuterIterations */ - CV_WRAP virtual int getOuterIterations() const = 0; - /** @copybrief getOuterIterations @see getOuterIterations */ - CV_WRAP virtual void setOuterIterations(int val) = 0; - //! @brief Use initial flow - /** @see setUseInitialFlow */ - CV_WRAP virtual bool getUseInitialFlow() const = 0; - /** @copybrief getUseInitialFlow @see getUseInitialFlow */ - CV_WRAP virtual void setUseInitialFlow(bool val) = 0; - //! 
@brief Step between scales (<1) - /** @see setScaleStep */ - CV_WRAP virtual double getScaleStep() const = 0; - /** @copybrief getScaleStep @see getScaleStep */ - CV_WRAP virtual void setScaleStep(double val) = 0; - //! @brief Median filter kernel size (1 = no filter) (3 or 5) - /** @see setMedianFiltering */ - CV_WRAP virtual int getMedianFiltering() const = 0; - /** @copybrief getMedianFiltering @see getMedianFiltering */ - CV_WRAP virtual void setMedianFiltering(int val) = 0; - - /** @brief Creates instance of cv::DualTVL1OpticalFlow*/ - CV_WRAP static Ptr create( - double tau = 0.25, - double lambda = 0.15, - double theta = 0.3, - int nscales = 5, - int warps = 5, - double epsilon = 0.01, - int innnerIterations = 30, - int outerIterations = 10, - double scaleStep = 0.8, - double gamma = 0.0, - int medianFiltering = 5, - bool useInitialFlow = false); -}; - -/** @brief Creates instance of cv::DenseOpticalFlow -*/ -CV_EXPORTS_W Ptr createOptFlow_DualTVL1(); - -//! @} - -} //optflow -} - -#include "opencv2/optflow/motempl.hpp" - -#endif diff --git a/modules/optflow/perf/perf_rlof.cpp b/modules/optflow/perf/perf_rlof.cpp index ea48a62be59..4c87f51ef78 100644 --- a/modules/optflow/perf/perf_rlof.cpp +++ b/modules/optflow/perf/perf_rlof.cpp @@ -1,89 +1,88 @@ #include "perf_precomp.hpp" namespace opencv_test { - namespace { - - typedef tuple ST_SR_IM_Sparse_t; - typedef TestBaseWithParam ST_SR_IM_Sparse; - PERF_TEST_P(ST_SR_IM_Sparse, OpticalFlow_SparseRLOF, - testing::Combine( - testing::Values("ST_BILINEAR", "ST_STANDART"), - testing::Values("SR_CROSS", "SR_FIXED"), - testing::Values(true, false)) - ) - { - Size sz(640,480); - string frame1_path = "B:/workspace/opencv_extra/testdata/cv/optflow/RubberWhale1.png"; - string frame2_path = "B:/workspace/opencv_extra/testdata/cv/optflow/RubberWhale2.png"; - frame1_path.erase(std::remove_if(frame1_path.begin(), frame1_path.end(), isspace), frame1_path.end()); - frame2_path.erase(std::remove_if(frame2_path.begin(), 
frame2_path.end(), isspace), frame2_path.end()); - Mat frame1 = imread(frame1_path); - Mat frame2 = imread(frame2_path); - ASSERT_FALSE(frame1.empty()); - ASSERT_FALSE(frame2.empty()); - vector prevPts, currPts; - for (int r = 0; r < frame1.rows; r += 10) - { - for (int c = 0; c < frame1.cols; c += 10) - { - prevPts.push_back(Point2f(static_cast(c), static_cast(r))); - } - } - vector status(prevPts.size()); - vector err(prevPts.size()); + namespace { + + typedef tuple ST_SR_IM_Sparse_t; + typedef TestBaseWithParam ST_SR_IM_Sparse; + PERF_TEST_P(ST_SR_IM_Sparse, OpticalFlow_SparseRLOF, + testing::Combine( + testing::Values("ST_BILINEAR", "ST_STANDART"), + testing::Values("SR_CROSS", "SR_FIXED"), + testing::Values(true, false)) + ) + { + string frame1_path = TS::ptr()->get_data_path() + "/cv/optflow/RubberWhale1.png"; + string frame2_path = TS::ptr()->get_data_path() + "/cv/optflow/RubberWhale2.png"; + frame1_path.erase(std::remove_if(frame1_path.begin(), frame1_path.end(), isspace), frame1_path.end()); + frame2_path.erase(std::remove_if(frame2_path.begin(), frame2_path.end(), isspace), frame2_path.end()); + Mat frame1 = imread(frame1_path); + Mat frame2 = imread(frame2_path); + ASSERT_FALSE(frame1.empty()); + ASSERT_FALSE(frame2.empty()); + vector prevPts, currPts; + for (int r = 0; r < frame1.rows; r += 10) + { + for (int c = 0; c < frame1.cols; c += 10) + { + prevPts.push_back(Point2f(static_cast(c), static_cast(r))); + } + } + vector status(prevPts.size()); + vector err(prevPts.size()); - RLOFOpticalFlowParameter param; - if (get<0>(GetParam()) == "ST_BILINEAR") - param.solverType = RLOFOpticalFlowParameter::ST_BILINEAR; - if (get<0>(GetParam()) == "ST_STANDART") - param.solverType = RLOFOpticalFlowParameter::ST_STANDART; - if (get<1>(GetParam()) == "SR_CROSS") - param.supportRegionType = RLOFOpticalFlowParameter::SR_CROSS; - if (get<1>(GetParam()) == "SR_FIXED") - param.supportRegionType = RLOFOpticalFlowParameter::SR_FIXED; - param.useIlluminationModel = 
get<2>(GetParam()); + Ptr param; + if (get<0>(GetParam()) == "ST_BILINEAR") + param->solverType = ST_BILINEAR; + if (get<0>(GetParam()) == "ST_STANDART") + param->solverType = ST_STANDART; + if (get<1>(GetParam()) == "SR_CROSS") + param->supportRegionType = SR_CROSS; + if (get<1>(GetParam()) == "SR_FIXED") + param->supportRegionType = SR_FIXED; + param->useIlluminationModel = get<2>(GetParam()); - TEST_CYCLE_N(1) - { - calcOpticalFlowSparseRLOF(frame1, frame2, prevPts, currPts, status, err, param, 1.f); - } + TEST_CYCLE_N(1) + { + calcOpticalFlowSparseRLOF(frame1, frame2, prevPts, currPts, status, err, param, 1.f); + } - SANITY_CHECK_NOTHING(); - } + SANITY_CHECK_NOTHING(); + } - typedef tuple INTERP_GRID_Dense_t; - typedef TestBaseWithParam INTERP_GRID_Dense; - PERF_TEST_P(INTERP_GRID_Dense, OpticalFlow_DenseRLOF, - testing::Combine( - testing::Values("INTERP_EPIC", "INTERP_GEO"), - testing::Values(4,10,50)) - ) - { - Mat flow; - string frame1_path = TS::ptr()->get_data_path() + "/cv/optflow/RubberWhale1.png"; - string frame2_path = TS::ptr()->get_data_path() + "/cv/optflow/RubberWhale2.png"; - // removing space may be an issue on windows machines - frame1_path.erase(std::remove_if(frame1_path.begin(), frame1_path.end(), isspace), frame1_path.end()); - frame2_path.erase(std::remove_if(frame2_path.begin(), frame2_path.end(), isspace), frame2_path.end()); - Mat frame1 = imread(frame1_path); - Mat frame2 = imread(frame2_path); - ASSERT_FALSE(frame1.empty()); - ASSERT_FALSE(frame2.empty()); - - RLOFOpticalFlowParameter param; - Ptr< DenseRLOFOpticalFlow> algo = DenseRLOFOpticalFlow::create(); - DenseRLOFOpticalFlow::InterpolationType interp_type; - if (get<0>(GetParam()) == "INTERP_EPIC") - interp_type = DenseRLOFOpticalFlow::InterpolationType::INTERP_EPIC; - if (get<0>(GetParam()) == "INTERP_GEO") - interp_type = DenseRLOFOpticalFlow::InterpolationType::INTERP_GEO; - + typedef tuple INTERP_GRID_Dense_t; + typedef TestBaseWithParam INTERP_GRID_Dense; + 
PERF_TEST_P(INTERP_GRID_Dense, OpticalFlow_DenseRLOF, + testing::Combine( + testing::Values("INTERP_EPIC", "INTERP_GEO"), + testing::Values(4,10,50)) + ) + { + Mat flow; + string frame1_path = TS::ptr()->get_data_path() + "/cv/optflow/RubberWhale1.png"; + string frame2_path = TS::ptr()->get_data_path() + "/cv/optflow/RubberWhale2.png"; + // removing space may be an issue on windows machines + frame1_path.erase(std::remove_if(frame1_path.begin(), frame1_path.end(), isspace), frame1_path.end()); + frame2_path.erase(std::remove_if(frame2_path.begin(), frame2_path.end(), isspace), frame2_path.end()); + Mat frame1 = imread(frame1_path); + Mat frame2 = imread(frame2_path); + ASSERT_FALSE(frame1.empty()); + ASSERT_FALSE(frame2.empty()); + + Ptr param; + Ptr< DenseRLOFOpticalFlow> algo = DenseRLOFOpticalFlow::create(); + InterpolationType interp_type; + if (get<0>(GetParam()) == "INTERP_EPIC") + interp_type = INTERP_EPIC; + if (get<0>(GetParam()) == "INTERP_GEO") + interp_type = INTERP_GEO; + - TEST_CYCLE_N(5) - { - calcOpticalFlowDenseRLOF(frame1, frame2,flow, param, 1.0f, Size(get<1>(GetParam()), get<1>(GetParam())), interp_type); - } + TEST_CYCLE_N(5) + { + calcOpticalFlowDenseRLOF(frame1, frame2,flow, param, 1.0f, Size(get<1>(GetParam()), get<1>(GetParam())), interp_type); + } - SANITY_CHECK_NOTHING(); - } - } + SANITY_CHECK_NOTHING(); + } + } } // namespace diff --git a/modules/optflow/src/rlof/berlof_invoker.hpp b/modules/optflow/src/rlof/berlof_invoker.hpp index d903d1733db..437e807d810 100644 --- a/modules/optflow/src/rlof/berlof_invoker.hpp +++ b/modules/optflow/src/rlof/berlof_invoker.hpp @@ -1,44 +1,6 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// - // - // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. - // - // By downloading, copying, installing or using the software you agree to this license. - // If you do not agree to this license, do not download, install, - // copy or use the software. 
- // - // - // License Agreement - // For Open Source Computer Vision Library - // - // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. - // Copyright (C) 2009, Willow Garage Inc., all rights reserved. - // Third party copyrights are property of their respective owners. - // - // Redistribution and use in source and binary forms, with or without modification, - // are permitted provided that the following conditions are met: - // - // * Redistribution's of source code must retain the above copyright notice, - // this list of conditions and the following disclaimer. - // - // * Redistribution's in binary form must reproduce the above copyright notice, - // this list of conditions and the following disclaimer in the documentation - // and/or other materials provided with the distribution. - // - // * The name of the copyright holders may not be used to endorse or promote products - // derived from this software without specific prior written permission. - // - // This software is provided by the copyright holders and contributors "as is" and - // any express or implied warranties, including, but not limited to, the implied - // warranties of merchantability and fitness for a particular purpose are disclaimed. - // In no event shall the Intel Corporation or contributors be liable for any direct, - // indirect, incidental, special, exemplary, or consequential damages - // (including, but not limited to, procurement of substitute goods or services; - // loss of use, data, or profits; or business interruption) however caused - // and on any theory of liability, whether in contract, strict liability, - // or tort (including negligence or otherwise) arising in any way out of - // the use of this software, even if advised of the possibility of such damage. - // - //M*/ +// This file is part of OpenCV project. 
+// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. #ifndef _BERLOF_INVOKER_HPP_ #define _BERLOF_INVOKER_HPP_ #include "rlof_invokerbase.hpp" @@ -46,2184 +8,2122 @@ namespace cv{ namespace optflow{ - - bool checkSolution(float x, float y, float * c ) - { - float _a = x - 0.002f; - float _b = y - 0.002f; - cv::Point2f tl( c[0] * _a * _b + c[1] * _a + c[2] * _b + c[3], c[4] * _a * _b + c[5] * _a + c[6] * _b + c[7]); - _a = x + 0.002f; - cv::Point2f tr( c[0] * _a * _b + c[1] * _a + c[2] * _b + c[3], c[4] * _a * _b + c[5] * _a + c[6] * _b + c[7]); - _b = y + 0.002f; - cv::Point2f br( c[0] * _a * _b + c[1] * _a + c[2] * _b + c[3], c[4] * _a * _b + c[5] * _a + c[6] * _b + c[7]); - _a = x - 0.002f; - cv::Point2f bl( c[0] * _a * _b + c[1] * _a + c[2] * _b + c[3], c[4] * _a * _b + c[5] * _a + c[6] * _b + c[7]); - return (tl.x >= 0 && tl.y >= 0) && (tr.x <= 0 && tr.y >= 0) - && (bl.x >= 0 && bl.y <= 0) && (br.x <= 0 && br.y <= 0); - } - - inline cv::Point2f est_DeltaGain( const cv::Mat & src, const cv::Vec4f & val) - { - return cv::Point2f( - src.at(2,0) * val[0] + src.at(2,1) * val[1] + src.at(2,2) * val[2] + src.at(2,3) * val[3], - src.at(3,0) * val[0] + src.at(3,1) * val[1] + src.at(3,2) * val[2] + src.at(3,3) * val[3]); - } - inline void est_Result( const cv::Mat & src, const cv::Vec4f & val, cv::Point2f & delta, cv::Point2f & gain) - { - - delta = cv::Point2f( - -(src.at(0,0) * val[0] + src.at(0,1) * val[1] + src.at(0,2) * val[2] + src.at(0,3) * val[3]), - -(src.at(1,0) * val[0] + src.at(1,1) * val[1] + src.at(1,2) * val[2] + src.at(1,3) * val[3])); - - gain = cv::Point2f( - src.at(2,0) * val[0] + src.at(2,1) * val[1] + src.at(2,2) * val[2] + src.at(2,3) * val[3], - src.at(3,0) * val[0] + src.at(3,1) * val[1] + src.at(3,2) * val[2] + src.at(3,3) * val[3]); - } - - namespace berlof - { - - namespace ica - { - - class TrackerInvoker : public cv::ParallelLoopBody - { - 
public: - TrackerInvoker( - const Mat& _prevImg, - const Mat& _prevDeriv, - const Mat& _nextImg, - const Mat& _rgbPrevImg, - const Mat& _rgbNextImg, - const Point2f* _prevPts, - Point2f* _nextPts, - uchar* _status, - float* _err, - int _level, - int _maxLevel, - int _winSize[2], - int _maxIteration, - bool _useInitialFlow, - int _supportRegionType, - int _crossSegmentationThreshold, - std::vector _normSigmas, - float _minEigenValue) - { - prevImg = &_prevImg; - prevDeriv = &_prevDeriv; - nextImg = &_nextImg; - rgbPrevImg = &_rgbPrevImg; - rgbNextImg = &_rgbNextImg; - prevPts = _prevPts; - nextPts = _nextPts; - status = _status; - err = _err; - minWinSize = _winSize[0]; - maxWinSize = _winSize[1]; - criteria.maxCount = _maxIteration; - criteria.epsilon = 0.01; - level = _level; - maxLevel = _maxLevel; - windowType = _supportRegionType; - minEigThreshold = _minEigenValue; - paramReset = _normSigmas; - useInitialFlow = _useInitialFlow; - crossSegmentationThreshold = _crossSegmentationThreshold; - } - - TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; - void operator()(const cv::Range& range) const - { - - Point2f halfWin; - cv::Size winSize; - const Mat& I = *prevImg; - const Mat& J = *nextImg; - const Mat& derivI = *prevDeriv; - const Mat& BI = *rgbPrevImg; - - - const float FLT_SCALE = (1.f/(1 << 16)); - - winSize = cv::Size(maxWinSize,maxWinSize); - int winMaskwidth = static_cast(ceil(winSize.width / 16.f)) * 16; - cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); - winMaskMatBuf.setTo(1); - - std::vector param = paramReset; - int j, cn = I.channels(), cn2 = cn*2; - int winbufwidth = static_cast(ceil(winSize.width / 16.f)) * 16; - cv::Size winBufSize(winbufwidth,winbufwidth); - - cv::AutoBuffer _buf(winBufSize.area()*(cn + cn2)); - int derivDepth = DataType::depth; - - Mat IWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn), (deriv_type*)_buf.data()); - Mat derivIWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn2), (deriv_type*)_buf.data() 
+ winBufSize.area()*cn); - - - for( int ptidx = range.start; ptidx < range.end; ptidx++ ) - { - Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level)); - Point2f nextPt; - if( level == maxLevel ) - { - if( useInitialFlow ) - nextPt = nextPts[ptidx]*(float)(1./(1 << level)); - else - nextPt = prevPt; - } - else - nextPt = nextPts[ptidx]*2.f; - nextPts[ptidx] = nextPt; - - Point2i iprevPt, inextPt; - iprevPt.x = cvFloor(prevPt.x); - iprevPt.y = cvFloor(prevPt.y); - int winArea = maxWinSize * maxWinSize; - cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0,0, maxWinSize,maxWinSize)); - winMaskMatBuf.setTo(0); - if( calcWinMaskMat(BI, windowType, iprevPt, - winMaskMat,winSize,halfWin,winArea, - minWinSize,maxWinSize, crossSegmentationThreshold) == false) - continue; - - prevPt -= halfWin; - iprevPt.x = cvFloor(prevPt.x); - iprevPt.y = cvFloor(prevPt.y); - - if( iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols || - iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows ) - { - if( level == 0 ) - { - if( status ) - status[ptidx] = 3; - if( err ) - err[ptidx] = 0; - } - continue; - } - - float a = prevPt.x - iprevPt.x; - float b = prevPt.y - iprevPt.y; - const int W_BITS = 14, W_BITS1 = 14; - - int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); - int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); - int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); - int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - - int dstep = (int)(derivI.step/derivI.elemSize1()); - int step = (int)(I.step/I.elemSize1()); - int mStep = (int)(winMaskMat.step/winMaskMat.elemSize1()); - CV_Assert( step == (int)(J.step/J.elemSize1()) ); - - float A11 = 0, A12 = 0, A22 = 0; - float b1 = 0, b2 = 0; - float D = 0; - - // extract the patch from the first image, compute covariation matrix of derivatives - int x, y; - for( y = 0; y < winSize.height; y++ ) - { - const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; - const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + 
iprevPt.x*cn2; - - short* Iptr = (short*)(IWinBuf.data + y*IWinBuf.step); - short* dIptr = (short*)(derivIWinBuf.data + y*derivIWinBuf.step); - x = 0; - for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2 ) - { - if( winMaskMat.at(y,x) == 0) - { - dIptr[0] = 0; - dIptr[1] = 0; - continue; - } - int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + - src[x+step]*iw10 + src[x+step+cn]*iw11, W_BITS1-5); - int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + - dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1); - int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 + - dsrc[dstep+cn2+1]*iw11, W_BITS1); - Iptr[x] = (short)ival; - dIptr[0] = (short)ixval; - dIptr[1] = (short)iyval; - } - } - - cv::Mat residualMat = cv::Mat::zeros(winSize.height * (winSize.width + 8) * cn, 1, CV_16SC1); - cv::Point2f backUpNextPt = nextPt; - nextPt -= halfWin; - Point2f prevDelta(0,0); //denotes h(t-1) - cv::Size _winSize = winSize; - __m128i mmMask0, mmMask1, mmMask; - getWBitMask(_winSize.width, mmMask0, mmMask1, mmMask); - __m128 mmOnes = _mm_set1_ps(1.f ); - __m128i mmOnes_epi16 = _mm_set1_epi16(1 ); - __m128i mmZeros_epi16 = _mm_set1_epi16(0); - float MEstimatorScale = 1; - int buffIdx = 0; - float c[8]; - cv::Mat GMc0, GMc1, GMc2, GMc3; - cv::Vec2f Mc0, Mc1, Mc2, Mc3; - int noIteration = 0; - int noReusedIteration = 0; - int noSolvedIteration = 0; - for( j = 0; j < criteria.maxCount; j++ ) - { - cv::Point2f delta(0,0); - cv::Point2f deltaGain(0,0); - bool hasSolved = false; - a = nextPt.x - inextPt.x; - b = nextPt.y - inextPt.y; - float ab = a * b; - if( j == 0 - || ( inextPt.x != cvFloor(nextPt.x) || inextPt.y != cvFloor(nextPt.y) || j % 2 != 0 )) - { - inextPt.x = cvFloor(nextPt.x); - inextPt.y = cvFloor(nextPt.y); - - if( inextPt.x < -winSize.width || inextPt.x >= J.cols || - inextPt.y < -winSize.height || inextPt.y >= J.rows ) - { - if( level == 0 && status ) - status[ptidx] = 3; - break; - } - - - a = nextPt.x - inextPt.x; - b = nextPt.y - inextPt.y; 
- ab = a * b; - iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); - iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); - iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); - iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - // mismatch - b1 = 0; - b2 = 0; - if( j == 0 ) - { - A11 = 0; - A12 = 0; - A22 = 0; - } - - if ( j == 0 ) - { - buffIdx = 0; - for( y = 0; y < winSize.height; y++ ) - { - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y )*step + (inextPt.x )*cn; - const short* Iptr = (const short*)(IWinBuf.data + (y)*IWinBuf.step); - const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - x = 0; - for( ; x < winSize.width*cn; x++, dIptr += 2) - { - if( maskPtr[x] == 0) - continue; - int diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, W_BITS1-5) - - Iptr[x]; - residualMat.at(buffIdx++) = static_cast(diff); - } - } - /*! Estimation for the residual */ - cv::Mat residualMatRoi(residualMat, cv::Rect(0,0,1, buffIdx)); - MEstimatorScale = (buffIdx == 0) ? 
1.f : estimateScale(residualMatRoi); - } - - float eta = 1.f / winArea; - float fParam0 = param[0] * 32.f; - float fParam1 = param[1] * 32.f; - fParam0 = param[0] * MEstimatorScale; - fParam1 = param[1] * MEstimatorScale; - - - buffIdx = 0; - float _b0[4] = {0,0,0,0}; - float _b1[4] = {0,0,0,0}; - - /* - */ - for( y = 0; y < _winSize.height; y++ ) - { - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x )*cn; - const short* Iptr = (const short*)(IWinBuf.data + y*IWinBuf.step); - const short* dIptr = (const short*)(derivIWinBuf.data + y*derivIWinBuf.step); - - x = 0; - for( ; x < _winSize.width*cn; x++, dIptr += 2 ) - { - if( maskPtr[x] == 0) - continue; - int illValue = - Iptr[x]; - - float It[4] = {static_cast((Jptr[x+step+cn]<< 5) + illValue), - static_cast((Jptr[x+cn]<< 5) + illValue), - static_cast((Jptr[x+step]<< 5) + illValue), - static_cast((Jptr[x] << 5) + illValue)}; - - - - int J_val = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + - Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, - W_BITS1-5); - - - int diff = J_val + illValue; - - - MEstimatorScale += (diff < MEstimatorScale) ? -eta : eta; - - int abss = (diff < 0) ? 
-diff : diff; - - // compute the missmatch vector - if( j >= 0) - { - if( abss > fParam1) - { - It[0] = 0; - It[1] = 0; - It[2] = 0; - It[3] = 0; - } - else if( abss > fParam0 && diff >= 0 ) - { - It[0] = param[2] * (It[0] - fParam1); - It[1] = param[2] * (It[1] - fParam1); - It[2] = param[2] * (It[2] - fParam1); - It[3] = param[2] * (It[3] - fParam1); - } - else if( abss > fParam0 && diff < 0 ) - { - It[0] = param[2] * (It[0] + fParam1); - It[1] = param[2] * (It[1] + fParam1); - It[2] = param[2] * (It[2] + fParam1); - It[3] = param[2] * (It[3] + fParam1); - } - } - - float It0 = It[0]; - float It1 = It[1]; - float It2 = It[2]; - float It3 = It[3]; - - float ixval = static_cast(dIptr[0]); - float iyval = static_cast(dIptr[1]); - _b0[0] += It0 * ixval; - _b0[1] += It1 * ixval; - _b0[2] += It2 * ixval; - _b0[3] += It3 * ixval; - - - _b1[0] += It0*iyval; - _b1[1] += It1*iyval; - _b1[2] += It2*iyval; - _b1[3] += It3*iyval; - - - // compute the Gradient Matrice - if( j == 0) - { - float tale = param[2] * FLT_RESCALE; - if( abss < fParam0 || j < 0) - { - tale = FLT_RESCALE; - } - else if( abss > fParam1) - { - tale *= 0.01f; - } - else - { - tale *= param[2]; - } - - A11 += (float)(ixval*ixval)*tale; - A12 += (float)(ixval*iyval)*tale; - A22 += (float)(iyval*iyval)*tale; - - } - } - } - - if( j == 0 ) - { - - A11 *= FLT_SCALE; // 54866744. 
- A12 *= FLT_SCALE; // -628764.00 - A22 *= FLT_SCALE; // 19730.000 - - D = A11 * A22 - A12 * A12; - float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + - 4.f*A12*A12))/(2*winArea); - - if( minEig < minEigThreshold || std::abs(D) < FLT_EPSILON) - { - if( level == 0 && status ) - status[ptidx] = 0; - if( level > 0) - { - nextPts[ptidx] = backUpNextPt; - } - noIteration++; - break; - } - - D = (1.f / D); - - } - - _b0[0] *= FLT_SCALE;_b0[1] *= FLT_SCALE;_b0[2] *= FLT_SCALE;_b0[3] *= FLT_SCALE; - _b1[0] *= FLT_SCALE;_b1[1] *= FLT_SCALE;_b1[2] *= FLT_SCALE;_b1[3] *= FLT_SCALE; - - - Mc0[0] = _b0[0] - _b0[1] - _b0[2] + _b0[3]; - Mc0[1] = _b1[0] - _b1[1] - _b1[2] + _b1[3]; - - Mc1[0] = _b0[1] - _b0[3]; - Mc1[1] = _b1[1] - _b1[3]; - - - Mc2[0] = _b0[2] - _b0[3]; - Mc2[1] = _b1[2] - _b1[3]; - - - Mc3[0] = _b0[3]; - Mc3[1] = _b1[3]; - - c[0] = -Mc0[0]; - c[1] = -Mc1[0]; - c[2] = -Mc2[0]; - c[3] = -Mc3[0]; - c[4] = -Mc0[1]; - c[5] = -Mc1[1]; - c[6] = -Mc2[1]; - c[7] = -Mc3[1]; - - float e0 = 1.f / (c[6] * c[0] - c[4] * c[2]); - float e1 = e0 * 0.5f * (c[6] * c[1] + c[7] * c[0] - c[5] * c[2] - c[4] * c[3]); - float e2 = e0 * (c[1] * c[7] -c[3] * c[5]); - e0 = e1 * e1 - e2; - hasSolved = false; - if ( e0 > 0) - { - e0 = sqrt(e0); - float _y[2] = {-e1 - e0, e0 - e1}; - float c0yc1[2] = {c[0] * _y[0] + c[1], - c[0] * _y[1] + c[1]}; - float _x[2] = {-(c[2] * _y[0] + c[3]) / c0yc1[0], - -(c[2] * _y[1] + c[3]) / c0yc1[1]}; - float c0yc2[2] = {c[0] * _y[0] + c[2], - c[0] * _y[1] + c[2]}; - - float c4yc6[2] = {c[4] * _y[0] + c[6], - c[4] * _y[1] + c[6]}; - - float c0xc1[2] = {c[0] * _x[0] + c[1], - c[0] * _x[1] + c[1]}; - - float c4xc5[2] = {c[4] * _x[0] + c[5], - c[4] * _x[1] + c[5]}; - bool isIn1 = (_x[0] >=0 && _x[0] <=1 && _y[0] >= 0 && _y[0] <= 1); - bool isIn2 = (_x[1] >=0 && _x[1] <=1 && _y[1] >= 0 && _y[1] <= 1); - - bool isSolution1 = checkSolution(_x[0], _y[0], c ); - bool isSolution2 = checkSolution(_x[1], _y[1], c ); - bool isSink1 = isIn1 && isSolution1; - bool 
isSink2 = isIn2 && isSolution2; - - if ( isSink1 != isSink2) - { - a = isSink1 ? _x[0] : _x[1]; - b = isSink1 ? _y[0] : _y[1]; - ab = a * b; - hasSolved = true; - delta.x = inextPt.x + a - nextPt.x; - delta.y = inextPt.y + b - nextPt.y; - } // isIn1 != isIn2 - } - if( hasSolved == false) - noIteration++; - } - else - { - hasSolved = false; - noReusedIteration++; - } - if( hasSolved == false ) - { - - cv::Vec2f mismatchVec = ab * Mc0 + Mc1 *a + Mc2 * b + Mc3; - delta.x = (A12 * mismatchVec.val[1] - A22 * mismatchVec.val[0]) * D; - delta.y = (A12 * mismatchVec.val[0] - A11 * mismatchVec.val[1]) * D; - delta.x = MAX(-1.f, MIN(1.f, delta.x)); - delta.y = MAX(-1.f, MIN(1.f, delta.y)); - nextPt += delta; - nextPts[ptidx] = nextPt + halfWin; - } - else - { - nextPt += delta; - nextPts[ptidx] = nextPt + halfWin; - noSolvedIteration++; - break; - } - - delta.x = ( delta.x != delta.x) ? 0 : delta.x; - delta.y = ( delta.y != delta.y) ? 0 : delta.y; - - - cv::Point2f totalDistance = backUpNextPt - nextPts[ptidx]; - - if(j > 0 && ( - (std::abs(delta.x - prevDelta.x) < 0.01 && std::abs(delta.y - prevDelta.y) < 0.01))) - { - nextPts[ptidx] -= delta*0.5f; - break; - } - - - prevDelta = delta; - } - - } - - } - - const Mat* prevImg; - const Mat* nextImg; - const Mat* prevDeriv; - const Mat* rgbPrevImg; - const Mat* rgbNextImg; - const Point2f* prevPts; - Point2f* nextPts; - uchar* status; - float* err; - int maxWinSize; - int minWinSize; - TermCriteria criteria; - int level; - int maxLevel; - int windowType; - float minEigThreshold; - bool useInitialFlow; - std::vector paramReset; - int crossSegmentationThreshold; - - }; - } - namespace radial - { - class TrackerInvoker : public cv::ParallelLoopBody - { - public: - TrackerInvoker( - const Mat& _prevImg, - const Mat& _prevDeriv, - const Mat& _nextImg, - const Mat& _rgbPrevImg, - const Mat& _rgbNextImg, - const Point2f* _prevPts, - Point2f* _nextPts, - uchar* _status, - float* _err, - Point2f* _gainVecs, - int _level, - int 
_maxLevel, - int _winSize[2], - int _maxIteration, - bool _useInitialFlow, - int _supportRegionType, - int _crossSegmentationThreshold, - std::vector _normSigmas, - float _minEigenValue) - { - prevImg = &_prevImg; - prevDeriv = &_prevDeriv; - nextImg = &_nextImg; - rgbPrevImg = &_rgbPrevImg; - rgbNextImg = &_rgbNextImg; - prevPts = _prevPts; - nextPts = _nextPts; - status = _status; - err = _err; - gainVecs = _gainVecs; - minWinSize = _winSize[0]; - maxWinSize = _winSize[1]; - criteria.maxCount = _maxIteration; - criteria.epsilon = 0.01; - level = _level; - maxLevel = _maxLevel; - windowType = _supportRegionType; - minEigThreshold = _minEigenValue; - paramReset = _normSigmas; - useInitialFlow = _useInitialFlow; - crossSegmentationThreshold = _crossSegmentationThreshold; - } - - TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; - void operator()(const cv::Range& range) const - { - Point2f halfWin; - cv::Size winSize; - const Mat& I = *prevImg; - const Mat& J = *nextImg; - const Mat& derivI = *prevDeriv; - const Mat& BI = *rgbPrevImg; - const float FLT_SCALE = (1.f/(1 << 16));//(1.f/(1 << 20)); // 20 - winSize = cv::Size(maxWinSize,maxWinSize); - int winMaskwidth = static_cast(ceil(winSize.width / 16.f)) * 16; - cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); - winMaskMatBuf.setTo(1); - - std::vector param = paramReset; - int cn = I.channels(), cn2 = cn*2; - int winbufwidth = static_cast(ceil(winSize.width / 16.f)) * 16; - cv::Size winBufSize(winbufwidth,winbufwidth); - - - cv::Mat invTensorMat(4,4, CV_32FC1); - cv::Mat mismatchMat(4, 1, CV_32FC1); - cv::Mat resultMat(4, 1, CV_32FC1); - - cv::AutoBuffer _buf(winBufSize.area()*(cn + cn2)); - int derivDepth = DataType::depth; - - Mat IWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn), (deriv_type*)_buf.data()); - Mat derivIWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn2), (deriv_type*)_buf.data() + winBufSize.area()*cn); - - for( int ptidx = range.start; ptidx < range.end; ptidx++ ) - { - 
Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level)); - Point2f nextPt; - if( level == maxLevel ) - { - if( useInitialFlow ) - nextPt = nextPts[ptidx]*(float)(1./(1 << level)); - else - nextPt = prevPt; - } - else - nextPt = nextPts[ptidx]*2.f; - nextPts[ptidx] = nextPt; - - Point2i iprevPt, inextPt; - iprevPt.x = cvFloor(prevPt.x); - iprevPt.y = cvFloor(prevPt.y); - int winArea = maxWinSize * maxWinSize; - cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0,0, maxWinSize,maxWinSize)); - winMaskMatBuf.setTo(0); - if( calcWinMaskMat(BI, windowType, iprevPt, - winMaskMat,winSize,halfWin,winArea, - minWinSize,maxWinSize, crossSegmentationThreshold) == false) - continue; - - prevPt -= halfWin; - iprevPt.x = cvFloor(prevPt.x); - iprevPt.y = cvFloor(prevPt.y); - - if( iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols || - iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows ) - { - if( level == 0 ) - { - if( status ) - status[ptidx] = 3; - if( err ) - err[ptidx] = 0; - } - continue; - } - - float a = prevPt.x - iprevPt.x; - float b = prevPt.y - iprevPt.y; - const int W_BITS = 14, W_BITS1 = 14; - - int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); - int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); - int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); - int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - - int dstep = (int)(derivI.step/derivI.elemSize1()); - int step = (int)(I.step/I.elemSize1()); - int mStep = (int)(winMaskMat.step/winMaskMat.elemSize1()); - CV_Assert( step == (int)(J.step/J.elemSize1()) ); - - float A11 = 0, A12 = 0, A22 = 0; - float b1 = 0, b2 = 0, b3 = 0, b4 = 0; - - // tensor - float sumIx = 0; - float sumIy = 0; - float sumI = 0; - float sumW = 0; - float w1 = 0, w2 = 0; // -IyI - float dI = 0; // I^2 - float D = 0; - - #ifdef CV_SSE2 - __m128i rightMask = _mm_set_epi16(0, std::numeric_limits::max(), 0, std::numeric_limits::max(), 0, std::numeric_limits::max(), 0, std::numeric_limits::max()); - __m128i leftMask = 
_mm_set_epi16(std::numeric_limits::max(), 0, std::numeric_limits::max(), 0, std::numeric_limits::max(), 0, std::numeric_limits::max(), 0); - - __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); - __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); - __m128i z = _mm_setzero_si128(); - __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1-1)); - __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1-5-1)); - __m128i mmMask4_epi32; - __m128i mmMaskSet_epi16 = _mm_set1_epi16(std::numeric_limits::max()); - get4BitMask(winSize.width, mmMask4_epi32); - __m128 qA11 = _mm_setzero_ps(), qA12 = _mm_setzero_ps(), qA22 = _mm_setzero_ps(); - #endif - - // extract the patch from the first image, compute covariation matrix of derivatives - int x, y; - for( y = 0; y < winSize.height; y++ ) - { - x = 0; - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; - const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; - - short* Iptr = (short*)(IWinBuf.data + y*IWinBuf.step); - short* dIptr = (short*)(derivIWinBuf.data + y*derivIWinBuf.step); - - - - #ifdef CV_SSE2 - for( ; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 ) - { - __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16); - __m128i mask_0_3_epi16 = _mm_unpacklo_epi16(mask_0_7_epi16, mask_0_7_epi16); - - - __m128i v00, v01, v10, v11, t0, t1; - v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); - v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); - v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); - v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); - - t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); - t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), 
W_BITS1-5); - if( x + 4 > winSize.width) - { - t0 = _mm_and_si128(t0, mmMask4_epi32); - } - t0 = _mm_and_si128(t0, mask_0_3_epi16); - _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0,t0)); - - - v00 = _mm_loadu_si128((const __m128i*)(dsrc)); - v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); - v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep)); - v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); - - t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); - t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); - t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); - t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); - v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... - if( x + 4 > winSize.width) - { - v00 = _mm_and_si128(v00, mmMask4_epi32); - } - v00 = _mm_and_si128(v00, mask_0_3_epi16); - _mm_storeu_si128((__m128i*)dIptr, v00); - } - #else - - for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2 ) - { - if( winMaskMat.at(y,x) == 0) - { - dIptr[0] = 0; - dIptr[1] = 0; - continue; - } - int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + - src[x+step]*iw10 + src[x+step+cn]*iw11, W_BITS1-5); - int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + - dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1); - int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 + - dsrc[dstep+cn2+1]*iw11, W_BITS1); - - Iptr[x] = (short)ival; - dIptr[0] = (short)ixval; - dIptr[1] = (short)iyval; - - } - #endif - } - - cv::Mat residualMat = cv::Mat::zeros(winSize.height * (winSize.width + 8) * cn, 1, CV_16SC1); - - cv::Point2f backUpNextPt = nextPt; - nextPt -= halfWin; - Point2f prevDelta(0,0); //relates to h(t-1) - Point2f prevGain(1,0); - cv::Point2f gainVec = gainVecs[ptidx]; - cv::Point2f backUpGain = gainVec; - cv::Size _winSize = winSize; - int j; - __m128i mmMask0, mmMask1, 
mmMask; - getWBitMask(_winSize.width, mmMask0, mmMask1, mmMask); - __m128 mmOnes = _mm_set1_ps(1.f ); - __m128i mmOnes_epi16 = _mm_set1_epi16(1 ); - __m128i mmZeros_epi16 = _mm_set1_epi16(0); - float MEstimatorScale = 1; - int buffIdx = 0; - float c[8]; - cv::Mat GMc0, GMc1, GMc2, GMc3; - cv::Vec4f Mc0, Mc1, Mc2, Mc3; - int noIteration = 0; - int noReusedIteration = 0; - int noSolvedIteration = 0; - for( j = 0; j < criteria.maxCount; j++ ) - { - cv::Point2f delta(0,0); - cv::Point2f deltaGain(0,0); - bool hasSolved = false; - a = nextPt.x - inextPt.x; - b = nextPt.y - inextPt.y; - float ab = a * b; - if( j == 0 - || (inextPt.x != cvFloor(nextPt.x) || inextPt.y != cvFloor(nextPt.y) || j % 2 != 0) ) - { - inextPt.x = cvFloor(nextPt.x); - inextPt.y = cvFloor(nextPt.y); - - if( inextPt.x < -winSize.width || inextPt.x >= J.cols || - inextPt.y < -winSize.height || inextPt.y >= J.rows ) - { - if( level == 0 && status ) - status[ptidx] = 3; - if (level > 0) - { - nextPts[ptidx] = backUpNextPt; - gainVecs[ptidx] = backUpGain; - } - noIteration++; - break; - } - - - a = nextPt.x - inextPt.x; - b = nextPt.y - inextPt.y; - ab = a * b; - iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); - iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); - iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); - iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - // mismatch - b1 = 0; - b2 = 0; - b3 = 0; - b4 = 0; - if( j == 0) - { - // tensor - w1 = 0; // -IxI - w2 = 0; // -IyI - dI = 0; // I^2 - sumIx = 0; - sumIy = 0; - sumI = 0; - sumW = 0; - A11 = 0; - A12 = 0; - A22 = 0; - } - - if ( j == 0 ) - { - buffIdx = 0; - for( y = 0; y < winSize.height; y++ ) - { - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y )*step + (inextPt.x )*cn; - const short* Iptr = (const short*)(IWinBuf.data + (y)*IWinBuf.step); - const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - x = 0; - for( ; x < winSize.width*cn; x++, dIptr 
+= 2) - { - if( maskPtr[x] == 0) - continue; - int diff = static_cast(CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, W_BITS1-5) - - Iptr[x] + Iptr[x] * gainVec.x +gainVec.y); - residualMat.at(buffIdx++) = static_cast(diff); - } - } - /*! Estimation for the residual */ - cv::Mat residualMatRoi(residualMat, cv::Rect(0,0,1, buffIdx)); - MEstimatorScale = (buffIdx == 0) ? 1.f : estimateScale(residualMatRoi); - } - - float eta = 1.f / winArea; - float fParam0 = param[0] * 32.f; - float fParam1 = param[1] * 32.f; - fParam0 = param[0] * MEstimatorScale; - fParam1 = param[1] * MEstimatorScale; - -#ifdef CV_SSE2 - - qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); - qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); - __m128 qb0[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; - __m128 qb1[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; - __m128 qb2[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; - __m128 qb3[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; - __m128 mmSumW1 = _mm_setzero_ps(), mmSumW2 = _mm_setzero_ps(); - __m128 mmSumI = _mm_setzero_ps(), mmSumW = _mm_setzero_ps(), mmSumDI = _mm_setzero_ps(); - __m128 mmSumIy = _mm_setzero_ps(), mmSumIx = _mm_setzero_ps(); - __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps(); - __m128i mmParam0 = _mm_set1_epi16(MIN(std::numeric_limits::max() -1, static_cast(fParam0))); - __m128i mmParam1 = _mm_set1_epi16(MIN(std::numeric_limits::max()- 1, static_cast(fParam1))); - - - float s2Val = param[2] > 0 ? param[2] : -param[2]; - int s2bitShift = param[2] == 0 ? 
1 : static_cast(ceil(log(200.f / s2Val) / log(2.f))); - __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast(param[2] * (float)(1 << s2bitShift))); - __m128i mmOness_epi16 = _mm_set1_epi16(1 << s2bitShift); - __m128 mmParam2 = _mm_set1_ps(param[2]); - __m128 mmParam2s = _mm_set1_ps(0.01f * param[2]); - __m128 mmParam2s2 = _mm_set1_ps(param[2] * param[2]); - float gainVal = gainVec.x > 0 ? gainVec.x : -gainVec.x; - int bitShift = gainVec.x == 0 ? 1 : static_cast(ceil(log(200.f / gainVal) / log(2.f))); - __m128i mmGainValue_epi16 = _mm_set1_epi16(static_cast(gainVec.x * (float)(1 << bitShift))); - __m128i mmConstValue_epi16 = _mm_set1_epi16(static_cast(gainVec.y)); - __m128i mmEta = _mm_setzero_si128(); - __m128i mmEtaNeg = _mm_set1_epi16(-1); - __m128i mmScale = _mm_set1_epi16(static_cast(MEstimatorScale)); - + + +inline bool checkSolution(float x, float y, float * c ) +{ + float _a = x - 0.002f; + float _b = y - 0.002f; + cv::Point2f tl( c[0] * _a * _b + c[1] * _a + c[2] * _b + c[3], c[4] * _a * _b + c[5] * _a + c[6] * _b + c[7]); + _a = x + 0.002f; + cv::Point2f tr( c[0] * _a * _b + c[1] * _a + c[2] * _b + c[3], c[4] * _a * _b + c[5] * _a + c[6] * _b + c[7]); + _b = y + 0.002f; + cv::Point2f br( c[0] * _a * _b + c[1] * _a + c[2] * _b + c[3], c[4] * _a * _b + c[5] * _a + c[6] * _b + c[7]); + _a = x - 0.002f; + cv::Point2f bl( c[0] * _a * _b + c[1] * _a + c[2] * _b + c[3], c[4] * _a * _b + c[5] * _a + c[6] * _b + c[7]); + return (tl.x >= 0 && tl.y >= 0) && (tr.x <= 0 && tr.y >= 0) + && (bl.x >= 0 && bl.y <= 0) && (br.x <= 0 && br.y <= 0); +} + +inline cv::Point2f est_DeltaGain( const cv::Mat & src, const cv::Vec4f & val) +{ + return cv::Point2f( + src.at(2,0) * val[0] + src.at(2,1) * val[1] + src.at(2,2) * val[2] + src.at(2,3) * val[3], + src.at(3,0) * val[0] + src.at(3,1) * val[1] + src.at(3,2) * val[2] + src.at(3,3) * val[3]); +} +inline void est_Result( const cv::Mat & src, const cv::Vec4f & val, cv::Point2f & delta, cv::Point2f & gain) +{ + + delta = cv::Point2f( 
+ -(src.at(0,0) * val[0] + src.at(0,1) * val[1] + src.at(0,2) * val[2] + src.at(0,3) * val[3]), + -(src.at(1,0) * val[0] + src.at(1,1) * val[1] + src.at(1,2) * val[2] + src.at(1,3) * val[3])); + + gain = cv::Point2f( + src.at(2,0) * val[0] + src.at(2,1) * val[1] + src.at(2,2) * val[2] + src.at(2,3) * val[3], + src.at(3,0) * val[0] + src.at(3,1) * val[1] + src.at(3,2) * val[2] + src.at(3,3) * val[3]); +} + +namespace berlof +{ + + namespace ica + { + + class TrackerInvoker : public cv::ParallelLoopBody + { + public: + TrackerInvoker( + const Mat& _prevImg, + const Mat& _prevDeriv, + const Mat& _nextImg, + const Mat& _rgbPrevImg, + const Mat& _rgbNextImg, + const Point2f* _prevPts, + Point2f* _nextPts, + uchar* _status, + float* _err, + int _level, + int _maxLevel, + int _winSize[2], + int _maxIteration, + bool _useInitialFlow, + int _supportRegionType, + int _crossSegmentationThreshold, + std::vector _normSigmas, + float _minEigenValue) + { + prevImg = &_prevImg; + prevDeriv = &_prevDeriv; + nextImg = &_nextImg; + rgbPrevImg = &_rgbPrevImg; + rgbNextImg = &_rgbNextImg; + prevPts = _prevPts; + nextPts = _nextPts; + status = _status; + err = _err; + minWinSize = _winSize[0]; + maxWinSize = _winSize[1]; + criteria.maxCount = _maxIteration; + criteria.epsilon = 0.01; + level = _level; + maxLevel = _maxLevel; + windowType = _supportRegionType; + minEigThreshold = _minEigenValue; + paramReset = _normSigmas; + useInitialFlow = _useInitialFlow; + crossSegmentationThreshold = _crossSegmentationThreshold; + } + + TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; + void operator()(const cv::Range& range) const + { + + Point2f halfWin; + cv::Size winSize; + const Mat& I = *prevImg; + const Mat& J = *nextImg; + const Mat& derivI = *prevDeriv; + const Mat& BI = *rgbPrevImg; + + + const float FLT_SCALE = (1.f/(1 << 16)); + + winSize = cv::Size(maxWinSize,maxWinSize); + int winMaskwidth = static_cast(ceil(winSize.width / 16.f)) * 16; + cv::Mat 
winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); + winMaskMatBuf.setTo(1); + + std::vector param = paramReset; + int j, cn = I.channels(), cn2 = cn*2; + int winbufwidth = static_cast(ceil(winSize.width / 16.f)) * 16; + cv::Size winBufSize(winbufwidth,winbufwidth); + + cv::AutoBuffer _buf(winBufSize.area()*(cn + cn2)); + int derivDepth = DataType::depth; + + Mat IWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn), (deriv_type*)_buf.data()); + Mat derivIWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn2), (deriv_type*)_buf.data() + winBufSize.area()*cn); + + + for( int ptidx = range.start; ptidx < range.end; ptidx++ ) + { + Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level)); + Point2f nextPt; + if( level == maxLevel ) + { + if( useInitialFlow ) + nextPt = nextPts[ptidx]*(float)(1./(1 << level)); + else + nextPt = prevPt; + } + else + nextPt = nextPts[ptidx]*2.f; + nextPts[ptidx] = nextPt; + + Point2i iprevPt, inextPt; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + int winArea = maxWinSize * maxWinSize; + cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0,0, maxWinSize,maxWinSize)); + winMaskMatBuf.setTo(0); + if( calcWinMaskMat(BI, windowType, iprevPt, + winMaskMat,winSize,halfWin,winArea, + minWinSize,maxWinSize, crossSegmentationThreshold) == false) + continue; + + prevPt -= halfWin; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + + if( iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols || + iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows ) + { + if( level == 0 ) + { + if( status ) + status[ptidx] = 3; + if( err ) + err[ptidx] = 0; + } + continue; + } + + float a = prevPt.x - iprevPt.x; + float b = prevPt.y - iprevPt.y; + const int W_BITS = 14, W_BITS1 = 14; + + int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + + int dstep = (int)(derivI.step/derivI.elemSize1()); 
+ int step = (int)(I.step/I.elemSize1()); + int mStep = (int)(winMaskMat.step/winMaskMat.elemSize1()); + CV_Assert( step == (int)(J.step/J.elemSize1()) ); + + float A11 = 0, A12 = 0, A22 = 0; + float D = 0; + + // extract the patch from the first image, compute covariation matrix of derivatives + int x, y; + for( y = 0; y < winSize.height; y++ ) + { + const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; + const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; + + short* Iptr = (short*)(IWinBuf.data + y*IWinBuf.step); + short* dIptr = (short*)(derivIWinBuf.data + y*derivIWinBuf.step); + x = 0; + for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2 ) + { + if( winMaskMat.at(y,x) == 0) + { + dIptr[0] = 0; + dIptr[1] = 0; + continue; + } + int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + + src[x+step]*iw10 + src[x+step+cn]*iw11, W_BITS1-5); + int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + + dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 + + dsrc[dstep+cn2+1]*iw11, W_BITS1); + Iptr[x] = (short)ival; + dIptr[0] = (short)ixval; + dIptr[1] = (short)iyval; + } + } + + cv::Mat residualMat = cv::Mat::zeros(winSize.height * (winSize.width + 8) * cn, 1, CV_16SC1); + cv::Point2f backUpNextPt = nextPt; + nextPt -= halfWin; + Point2f prevDelta(0,0); //denotes h(t-1) + cv::Size _winSize = winSize; + __m128i mmMask0, mmMask1, mmMask; + getWBitMask(_winSize.width, mmMask0, mmMask1, mmMask); + float MEstimatorScale = 1; + int buffIdx = 0; + float c[8]; + cv::Mat GMc0, GMc1, GMc2, GMc3; + cv::Vec2f Mc0, Mc1, Mc2, Mc3; + int noIteration = 0; + int noReusedIteration = 0; + int noSolvedIteration = 0; + for( j = 0; j < criteria.maxCount; j++ ) + { + cv::Point2f delta(0,0); + cv::Point2f deltaGain(0,0); + bool hasSolved = false; + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + float ab = a * b; + if( j == 0 + || ( inextPt.x != 
cvFloor(nextPt.x) || inextPt.y != cvFloor(nextPt.y) || j % 2 != 0 )) + { + inextPt.x = cvFloor(nextPt.x); + inextPt.y = cvFloor(nextPt.y); + + if( inextPt.x < -winSize.width || inextPt.x >= J.cols || + inextPt.y < -winSize.height || inextPt.y >= J.rows ) + { + if( level == 0 && status ) + status[ptidx] = 3; + break; + } + + + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + ab = a * b; + iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + // mismatch + if( j == 0 ) + { + A11 = 0; + A12 = 0; + A22 = 0; + } + + if ( j == 0 ) + { + buffIdx = 0; + for( y = 0; y < winSize.height; y++ ) + { + const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y )*step + (inextPt.x )*cn; + const short* Iptr = (const short*)(IWinBuf.data + (y)*IWinBuf.step); + const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + x = 0; + for( ; x < winSize.width*cn; x++, dIptr += 2) + { + if( maskPtr[x] == 0) + continue; + int diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, W_BITS1-5) + - Iptr[x]; + residualMat.at(buffIdx++) = static_cast(diff); + } + } + /*! Estimation for the residual */ + cv::Mat residualMatRoi(residualMat, cv::Rect(0,0,1, buffIdx)); + MEstimatorScale = (buffIdx == 0) ? 
1.f : estimateScale(residualMatRoi); + } + + float eta = 1.f / winArea; + float fParam0 = param[0] * 32.f; + float fParam1 = param[1] * 32.f; + fParam0 = param[0] * MEstimatorScale; + fParam1 = param[1] * MEstimatorScale; + + + buffIdx = 0; + float _b0[4] = {0,0,0,0}; + float _b1[4] = {0,0,0,0}; + + /* + */ + for( y = 0; y < _winSize.height; y++ ) + { + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x )*cn; + const short* Iptr = (const short*)(IWinBuf.data + y*IWinBuf.step); + const short* dIptr = (const short*)(derivIWinBuf.data + y*derivIWinBuf.step); + + x = 0; + for( ; x < _winSize.width*cn; x++, dIptr += 2 ) + { + if( maskPtr[x] == 0) + continue; + int illValue = - Iptr[x]; + + float It[4] = {static_cast((Jptr[x+step+cn]<< 5) + illValue), + static_cast((Jptr[x+cn]<< 5) + illValue), + static_cast((Jptr[x+step]<< 5) + illValue), + static_cast((Jptr[x] << 5) + illValue)}; + + + + int J_val = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + + Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, + W_BITS1-5); + + + int diff = J_val + illValue; + + + MEstimatorScale += (diff < MEstimatorScale) ? -eta : eta; + + int abss = (diff < 0) ? 
-diff : diff; + + // compute the missmatch vector + if( j >= 0) + { + if( abss > fParam1) + { + It[0] = 0; + It[1] = 0; + It[2] = 0; + It[3] = 0; + } + else if( abss > fParam0 && diff >= 0 ) + { + It[0] = param[2] * (It[0] - fParam1); + It[1] = param[2] * (It[1] - fParam1); + It[2] = param[2] * (It[2] - fParam1); + It[3] = param[2] * (It[3] - fParam1); + } + else if( abss > fParam0 && diff < 0 ) + { + It[0] = param[2] * (It[0] + fParam1); + It[1] = param[2] * (It[1] + fParam1); + It[2] = param[2] * (It[2] + fParam1); + It[3] = param[2] * (It[3] + fParam1); + } + } + + float It0 = It[0]; + float It1 = It[1]; + float It2 = It[2]; + float It3 = It[3]; + + float ixval = static_cast(dIptr[0]); + float iyval = static_cast(dIptr[1]); + _b0[0] += It0 * ixval; + _b0[1] += It1 * ixval; + _b0[2] += It2 * ixval; + _b0[3] += It3 * ixval; + + + _b1[0] += It0*iyval; + _b1[1] += It1*iyval; + _b1[2] += It2*iyval; + _b1[3] += It3*iyval; + + + // compute the Gradient Matrice + if( j == 0) + { + float tale = param[2] * FLT_RESCALE; + if( abss < fParam0 || j < 0) + { + tale = FLT_RESCALE; + } + else if( abss > fParam1) + { + tale *= 0.01f; + } + else + { + tale *= param[2]; + } + + A11 += (float)(ixval*ixval)*tale; + A12 += (float)(ixval*iyval)*tale; + A22 += (float)(iyval*iyval)*tale; + + } + } + } + + if( j == 0 ) + { + + A11 *= FLT_SCALE; // 54866744. 
+ A12 *= FLT_SCALE; // -628764.00 + A22 *= FLT_SCALE; // 19730.000 + + D = A11 * A22 - A12 * A12; + float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + + 4.f*A12*A12))/(2*winArea); + + if( minEig < minEigThreshold || std::abs(D) < FLT_EPSILON) + { + if( level == 0 && status ) + status[ptidx] = 0; + if( level > 0) + { + nextPts[ptidx] = backUpNextPt; + } + noIteration++; + break; + } + + D = (1.f / D); + + } + + _b0[0] *= FLT_SCALE;_b0[1] *= FLT_SCALE;_b0[2] *= FLT_SCALE;_b0[3] *= FLT_SCALE; + _b1[0] *= FLT_SCALE;_b1[1] *= FLT_SCALE;_b1[2] *= FLT_SCALE;_b1[3] *= FLT_SCALE; + + + Mc0[0] = _b0[0] - _b0[1] - _b0[2] + _b0[3]; + Mc0[1] = _b1[0] - _b1[1] - _b1[2] + _b1[3]; + + Mc1[0] = _b0[1] - _b0[3]; + Mc1[1] = _b1[1] - _b1[3]; + + + Mc2[0] = _b0[2] - _b0[3]; + Mc2[1] = _b1[2] - _b1[3]; + + + Mc3[0] = _b0[3]; + Mc3[1] = _b1[3]; + + c[0] = -Mc0[0]; + c[1] = -Mc1[0]; + c[2] = -Mc2[0]; + c[3] = -Mc3[0]; + c[4] = -Mc0[1]; + c[5] = -Mc1[1]; + c[6] = -Mc2[1]; + c[7] = -Mc3[1]; + + float e0 = 1.f / (c[6] * c[0] - c[4] * c[2]); + float e1 = e0 * 0.5f * (c[6] * c[1] + c[7] * c[0] - c[5] * c[2] - c[4] * c[3]); + float e2 = e0 * (c[1] * c[7] -c[3] * c[5]); + e0 = e1 * e1 - e2; + hasSolved = false; + if ( e0 > 0) + { + e0 = sqrt(e0); + float _y[2] = {-e1 - e0, e0 - e1}; + float c0yc1[2] = {c[0] * _y[0] + c[1], + c[0] * _y[1] + c[1]}; + float _x[2] = {-(c[2] * _y[0] + c[3]) / c0yc1[0], + -(c[2] * _y[1] + c[3]) / c0yc1[1]}; + bool isIn1 = (_x[0] >=0 && _x[0] <=1 && _y[0] >= 0 && _y[0] <= 1); + bool isIn2 = (_x[1] >=0 && _x[1] <=1 && _y[1] >= 0 && _y[1] <= 1); + + bool isSolution1 = checkSolution(_x[0], _y[0], c ); + bool isSolution2 = checkSolution(_x[1], _y[1], c ); + bool isSink1 = isIn1 && isSolution1; + bool isSink2 = isIn2 && isSolution2; + + if ( isSink1 != isSink2) + { + a = isSink1 ? _x[0] : _x[1]; + b = isSink1 ? 
_y[0] : _y[1]; + ab = a * b; + hasSolved = true; + delta.x = inextPt.x + a - nextPt.x; + delta.y = inextPt.y + b - nextPt.y; + } // isIn1 != isIn2 + } + if( hasSolved == false) + noIteration++; + } + else + { + hasSolved = false; + noReusedIteration++; + } + if( hasSolved == false ) + { + + cv::Vec2f mismatchVec = ab * Mc0 + Mc1 *a + Mc2 * b + Mc3; + delta.x = (A12 * mismatchVec.val[1] - A22 * mismatchVec.val[0]) * D; + delta.y = (A12 * mismatchVec.val[0] - A11 * mismatchVec.val[1]) * D; + delta.x = MAX(-1.f, MIN(1.f, delta.x)); + delta.y = MAX(-1.f, MIN(1.f, delta.y)); + nextPt += delta; + nextPts[ptidx] = nextPt + halfWin; + } + else + { + nextPt += delta; + nextPts[ptidx] = nextPt + halfWin; + noSolvedIteration++; + break; + } + + delta.x = ( delta.x != delta.x) ? 0 : delta.x; + delta.y = ( delta.y != delta.y) ? 0 : delta.y; + + if(j > 0 && ( + (std::abs(delta.x - prevDelta.x) < 0.01 && std::abs(delta.y - prevDelta.y) < 0.01))) + { + nextPts[ptidx] -= delta*0.5f; + break; + } + + + prevDelta = delta; + } + + } + + } + + const Mat* prevImg; + const Mat* nextImg; + const Mat* prevDeriv; + const Mat* rgbPrevImg; + const Mat* rgbNextImg; + const Point2f* prevPts; + Point2f* nextPts; + uchar* status; + float* err; + int maxWinSize; + int minWinSize; + TermCriteria criteria; + int level; + int maxLevel; + int windowType; + float minEigThreshold; + bool useInitialFlow; + std::vector paramReset; + int crossSegmentationThreshold; + + }; + } + namespace radial + { + class TrackerInvoker : public cv::ParallelLoopBody + { + public: + TrackerInvoker( + const Mat& _prevImg, + const Mat& _prevDeriv, + const Mat& _nextImg, + const Mat& _rgbPrevImg, + const Mat& _rgbNextImg, + const Point2f* _prevPts, + Point2f* _nextPts, + uchar* _status, + float* _err, + Point2f* _gainVecs, + int _level, + int _maxLevel, + int _winSize[2], + int _maxIteration, + bool _useInitialFlow, + int _supportRegionType, + int _crossSegmentationThreshold, + std::vector _normSigmas, + float _minEigenValue) 
+ { + prevImg = &_prevImg; + prevDeriv = &_prevDeriv; + nextImg = &_nextImg; + rgbPrevImg = &_rgbPrevImg; + rgbNextImg = &_rgbNextImg; + prevPts = _prevPts; + nextPts = _nextPts; + status = _status; + err = _err; + gainVecs = _gainVecs; + minWinSize = _winSize[0]; + maxWinSize = _winSize[1]; + criteria.maxCount = _maxIteration; + criteria.epsilon = 0.01; + level = _level; + maxLevel = _maxLevel; + windowType = _supportRegionType; + minEigThreshold = _minEigenValue; + paramReset = _normSigmas; + useInitialFlow = _useInitialFlow; + crossSegmentationThreshold = _crossSegmentationThreshold; + } + + TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; + void operator()(const cv::Range& range) const + { + Point2f halfWin; + cv::Size winSize; + const Mat& I = *prevImg; + const Mat& J = *nextImg; + const Mat& derivI = *prevDeriv; + const Mat& BI = *rgbPrevImg; + const float FLT_SCALE = (1.f/(1 << 16));//(1.f/(1 << 20)); // 20 + winSize = cv::Size(maxWinSize,maxWinSize); + int winMaskwidth = static_cast(ceil(winSize.width / 16.f)) * 16; + cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); + winMaskMatBuf.setTo(1); + + std::vector param = paramReset; + int cn = I.channels(), cn2 = cn*2; + int winbufwidth = static_cast(ceil(winSize.width / 16.f)) * 16; + cv::Size winBufSize(winbufwidth,winbufwidth); + + + cv::Mat invTensorMat(4,4, CV_32FC1); + cv::Mat mismatchMat(4, 1, CV_32FC1); + cv::Mat resultMat(4, 1, CV_32FC1); + + cv::AutoBuffer _buf(winBufSize.area()*(cn + cn2)); + int derivDepth = DataType::depth; + + Mat IWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn), (deriv_type*)_buf.data()); + Mat derivIWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn2), (deriv_type*)_buf.data() + winBufSize.area()*cn); + + for( int ptidx = range.start; ptidx < range.end; ptidx++ ) + { + Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level)); + Point2f nextPt; + if( level == maxLevel ) + { + if( useInitialFlow ) + nextPt = nextPts[ptidx]*(float)(1./(1 << level)); + else 
+ nextPt = prevPt; + } + else + nextPt = nextPts[ptidx]*2.f; + nextPts[ptidx] = nextPt; + + Point2i iprevPt, inextPt; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + int winArea = maxWinSize * maxWinSize; + cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0,0, maxWinSize,maxWinSize)); + winMaskMatBuf.setTo(0); + if( calcWinMaskMat(BI, windowType, iprevPt, + winMaskMat,winSize,halfWin,winArea, + minWinSize,maxWinSize, crossSegmentationThreshold) == false) + continue; + + prevPt -= halfWin; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + + if( iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols || + iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows ) + { + if( level == 0 ) + { + if( status ) + status[ptidx] = 3; + if( err ) + err[ptidx] = 0; + } + continue; + } + + float a = prevPt.x - iprevPt.x; + float b = prevPt.y - iprevPt.y; + const int W_BITS = 14, W_BITS1 = 14; + + int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + + int dstep = (int)(derivI.step/derivI.elemSize1()); + int step = (int)(I.step/I.elemSize1()); + int mStep = (int)(winMaskMat.step/winMaskMat.elemSize1()); + CV_Assert( step == (int)(J.step/J.elemSize1()) ); + + float A11 = 0, A12 = 0, A22 = 0; + + // tensor + float sumIx = 0; + float sumIy = 0; + float sumI = 0; + float sumW = 0; + float w1 = 0, w2 = 0; // -IyI + float dI = 0; // I^2 + float D = 0; + + #ifdef CV_SSE4_1 + + __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128i z = _mm_setzero_si128(); + __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1-1)); + __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1-5-1)); + __m128i mmMask4_epi32; + __m128i mmMaskSet_epi16 = _mm_set1_epi16(std::numeric_limits::max()); + get4BitMask(winSize.width, mmMask4_epi32); + #endif + + // extract the patch from the first 
image, compute covariation matrix of derivatives + int x, y; + for( y = 0; y < winSize.height; y++ ) + { + x = 0; + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; + const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; + + short* Iptr = (short*)(IWinBuf.data + y*IWinBuf.step); + short* dIptr = (short*)(derivIWinBuf.data + y*derivIWinBuf.step); + + + + #ifdef CV_SSE4_1 + for( ; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 ) + { + __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16); + __m128i mask_0_3_epi16 = _mm_unpacklo_epi16(mask_0_7_epi16, mask_0_7_epi16); + + + __m128i v00, v01, v10, v11, t0, t1; + v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); + v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); + v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); + v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); + if( x + 4 > winSize.width) + { + t0 = _mm_and_si128(t0, mmMask4_epi32); + } + t0 = _mm_and_si128(t0, mask_0_3_epi16); + _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0,t0)); + + + v00 = _mm_loadu_si128((const __m128i*)(dsrc)); + v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); + v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep)); + v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, 
v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); + v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... + if( x + 4 > winSize.width) + { + v00 = _mm_and_si128(v00, mmMask4_epi32); + } + v00 = _mm_and_si128(v00, mask_0_3_epi16); + _mm_storeu_si128((__m128i*)dIptr, v00); + } + #else + + for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2 ) + { + if( winMaskMat.at(y,x) == 0) + { + dIptr[0] = 0; + dIptr[1] = 0; + continue; + } + int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + + src[x+step]*iw10 + src[x+step+cn]*iw11, W_BITS1-5); + int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + + dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 + + dsrc[dstep+cn2+1]*iw11, W_BITS1); + + Iptr[x] = (short)ival; + dIptr[0] = (short)ixval; + dIptr[1] = (short)iyval; + + } + #endif + } + + cv::Mat residualMat = cv::Mat::zeros(winSize.height * (winSize.width + 8) * cn, 1, CV_16SC1); + + cv::Point2f backUpNextPt = nextPt; + nextPt -= halfWin; + Point2f prevDelta(0,0); //relates to h(t-1) + Point2f prevGain(1,0); + cv::Point2f gainVec = gainVecs[ptidx]; + cv::Point2f backUpGain = gainVec; + cv::Size _winSize = winSize; + int j; + __m128i mmMask0, mmMask1, mmMask; + getWBitMask(_winSize.width, mmMask0, mmMask1, mmMask); + __m128 mmOnes = _mm_set1_ps(1.f ); + float MEstimatorScale = 1; + int buffIdx = 0; + float c[8]; + cv::Mat GMc0, GMc1, GMc2, GMc3; + cv::Vec4f Mc0, Mc1, Mc2, Mc3; + int noIteration = 0; + int noReusedIteration = 0; + int noSolvedIteration = 0; + for( j = 0; j < criteria.maxCount; j++ ) + { + cv::Point2f delta(0,0); + cv::Point2f deltaGain(0,0); + bool hasSolved = false; + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + float ab = a * b; + if( j == 0 + || (inextPt.x != cvFloor(nextPt.x) || inextPt.y != cvFloor(nextPt.y) || j % 2 != 0) ) + { + inextPt.x = cvFloor(nextPt.x); + inextPt.y = 
cvFloor(nextPt.y); + + if( inextPt.x < -winSize.width || inextPt.x >= J.cols || + inextPt.y < -winSize.height || inextPt.y >= J.rows ) + { + if( level == 0 && status ) + status[ptidx] = 3; + if (level > 0) + { + nextPts[ptidx] = backUpNextPt; + gainVecs[ptidx] = backUpGain; + } + noIteration++; + break; + } + + + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + ab = a * b; + iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + // mismatch + + if( j == 0) + { + // tensor + w1 = 0; // -IxI + w2 = 0; // -IyI + dI = 0; // I^2 + sumIx = 0; + sumIy = 0; + sumI = 0; + sumW = 0; + A11 = 0; + A12 = 0; + A22 = 0; + } + + if ( j == 0 ) + { + buffIdx = 0; + for( y = 0; y < winSize.height; y++ ) + { + const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y )*step + (inextPt.x )*cn; + const short* Iptr = (const short*)(IWinBuf.data + (y)*IWinBuf.step); + const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + x = 0; + for( ; x < winSize.width*cn; x++, dIptr += 2) + { + if( maskPtr[x] == 0) + continue; + int diff = static_cast(CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, W_BITS1-5) + - Iptr[x] + Iptr[x] * gainVec.x +gainVec.y); + residualMat.at(buffIdx++) = static_cast(diff); + } + } + /*! Estimation for the residual */ + cv::Mat residualMatRoi(residualMat, cv::Rect(0,0,1, buffIdx)); + MEstimatorScale = (buffIdx == 0) ? 
1.f : estimateScale(residualMatRoi); + } + + float eta = 1.f / winArea; + float fParam0 = param[0] * 32.f; + float fParam1 = param[1] * 32.f; + fParam0 = param[0] * MEstimatorScale; + fParam1 = param[1] * MEstimatorScale; + +#ifdef CV_SSE4_1 + + qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128 qb0[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; + __m128 qb1[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; + __m128 qb2[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; + __m128 qb3[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; + __m128 mmSumW1 = _mm_setzero_ps(), mmSumW2 = _mm_setzero_ps(); + __m128 mmSumI = _mm_setzero_ps(), mmSumW = _mm_setzero_ps(), mmSumDI = _mm_setzero_ps(); + __m128 mmSumIy = _mm_setzero_ps(), mmSumIx = _mm_setzero_ps(); + __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps(); + __m128i mmParam0 = _mm_set1_epi16(MIN(std::numeric_limits::max() -1, static_cast(fParam0))); + __m128i mmParam1 = _mm_set1_epi16(MIN(std::numeric_limits::max()- 1, static_cast(fParam1))); + + + float s2Val = param[2] > 0 ? param[2] : -param[2]; + int s2bitShift = param[2] == 0 ? 1 : static_cast(ceil(log(200.f / s2Val) / log(2.f))); + __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast(param[2] * (float)(1 << s2bitShift))); + __m128i mmOness_epi16 = _mm_set1_epi16(1 << s2bitShift); + __m128 mmParam2s = _mm_set1_ps(0.01f * param[2]); + __m128 mmParam2s2 = _mm_set1_ps(param[2] * param[2]); + float gainVal = gainVec.x > 0 ? gainVec.x : -gainVec.x; + int bitShift = gainVec.x == 0 ? 
1 : static_cast(ceil(log(200.f / gainVal) / log(2.f))); + __m128i mmGainValue_epi16 = _mm_set1_epi16(static_cast(gainVec.x * (float)(1 << bitShift))); + __m128i mmConstValue_epi16 = _mm_set1_epi16(static_cast(gainVec.y)); + __m128i mmEta = _mm_setzero_si128(); + __m128i mmScale = _mm_set1_epi16(static_cast(MEstimatorScale)); + #endif - - buffIdx = 0; - float _b0[4] = {0,0,0,0}; - float _b1[4] = {0,0,0,0}; - float _b2[4] = {0,0,0,0}; - float _b3[4] = {0,0,0,0}; - float _bm[4] = {0,0,0,0}; - float _bc[4] = {0,0,0,0}; - /* - */ - for( y = 0; y < _winSize.height; y++ ) - { - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x )*cn; - const short* Iptr = (const short*)(IWinBuf.data + y*IWinBuf.step); - const short* dIptr = (const short*)(derivIWinBuf.data + y*derivIWinBuf.step); - - x = 0; -#ifdef CV_SSE2 - for( ; x <= _winSize.width*cn; x += 8, dIptr += 8*2 ) - { - __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16); - __m128i I_0_7_epi16 = _mm_loadu_si128((const __m128i*)(Iptr + x)); - - __m128i v00 = _mm_unpacklo_epi8( - _mm_loadl_epi64((const __m128i*)(Jptr + x)), z); - __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); - __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); - __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); - - __m128i t0 = _mm_add_epi32 - (_mm_madd_epi16( - _mm_unpacklo_epi16(v00, v01), - qw0), - _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); - __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); - - t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); - t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1-5); - - __m128i lo = _mm_mullo_epi16(mmGainValue_epi16, I_0_7_epi16); - 
__m128i hi = _mm_mulhi_epi16(mmGainValue_epi16, I_0_7_epi16); - - __m128i Igain_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), bitShift); - __m128i Igain_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), bitShift); - __m128i Igain_epi16 = _mm_packs_epi32(Igain_0_3_epi32, Igain_4_7_epi32); - - - __m128i diffValue = _mm_subs_epi16(_mm_add_epi16(Igain_epi16, mmConstValue_epi16), I_0_7_epi16); - __m128i mmDiffc_epi16[4] = - { - _mm_add_epi16(_mm_slli_epi16(v11, 5), diffValue), - _mm_add_epi16(_mm_slli_epi16(v01, 5), diffValue), - _mm_add_epi16(_mm_slli_epi16(v10, 5), diffValue), - _mm_add_epi16(_mm_slli_epi16(v00, 5), diffValue) - }; - - __m128i mmDiff_epi16 = _mm_add_epi16( _mm_packs_epi32(t0, t1), diffValue); - - - mmDiff_epi16 = _mm_and_si128(mmDiff_epi16, mask_0_7_epi16); - - __m128i scalediffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, mmScale); - mmEta = _mm_add_epi16(mmEta, _mm_add_epi16(_mm_and_si128(scalediffIsPos_epi16, _mm_set1_epi16(2)), _mm_set1_epi16(-1))); - - - __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); - __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); - - - __m128i abs_epi16 = _mm_abs_epi16(mmDiff_epi16); - __m128i bSet2_epi16, bSet1_epi16; - // |It| < sigma1 ? - bSet2_epi16 = _mm_cmplt_epi16(abs_epi16, mmParam1); - // It > 0 ? - __m128i diffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, _mm_setzero_si128()); - // sigma0 < |It| < sigma1 ? - bSet1_epi16 = _mm_and_si128(bSet2_epi16, _mm_cmpgt_epi16(abs_epi16, mmParam0)); - // val = |It| -/+ sigma1 - __m128i tmpParam1_epi16 = _mm_add_epi16(_mm_and_si128(diffIsPos_epi16, _mm_sub_epi16(mmDiff_epi16, mmParam1)), - _mm_andnot_si128(diffIsPos_epi16, _mm_add_epi16(mmDiff_epi16, mmParam1))); - // It == 0 ? 
|It| > sigma13 - mmDiff_epi16 = _mm_and_si128(bSet2_epi16, mmDiff_epi16); - - for( unsigned int mmi = 0; mmi < 4; mmi++) - { - __m128i mmDiffc_epi16_t = _mm_and_si128(mmDiffc_epi16[mmi], mask_0_7_epi16); - mmDiffc_epi16_t = _mm_and_si128(bSet2_epi16, mmDiffc_epi16_t); - - // It == val ? sigma0 < |It| < sigma1 - mmDiffc_epi16_t = _mm_blendv_epi8(mmDiffc_epi16_t, tmpParam1_epi16, bSet1_epi16); - __m128i tale_epi16_ = _mm_blendv_epi8(mmOness_epi16, mmParam2_epi16, bSet1_epi16); // mask for 0 - 3 - // diff = diff * sigma2 - lo = _mm_mullo_epi16(tale_epi16_, mmDiffc_epi16_t); - hi = _mm_mulhi_epi16(tale_epi16_, mmDiffc_epi16_t); - __m128i diff_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), s2bitShift); - __m128i diff_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), s2bitShift); - - mmDiffc_epi16_t = _mm_packs_epi32(diff_0_3_epi32, diff_4_7_epi32); - __m128i diff1 = _mm_unpackhi_epi16(mmDiffc_epi16_t, mmDiffc_epi16_t); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13... - __m128i diff0 = _mm_unpacklo_epi16(mmDiffc_epi16_t, mmDiffc_epi16_t); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9... - - // Ix * diff / Iy * diff - v10 = _mm_mullo_epi16(Ixy_0, diff0); - v11 = _mm_mulhi_epi16(Ixy_0, diff0); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - - qb0[mmi] = _mm_add_ps(qb0[mmi], _mm_cvtepi32_ps(v00)); - qb1[mmi] = _mm_add_ps(qb1[mmi], _mm_cvtepi32_ps(v10)); - // It * Ix It * Iy [4 ... 7] - // for set 1 hi sigma 1 - v10 = _mm_mullo_epi16(Ixy_1, diff1); - v11 = _mm_mulhi_epi16(Ixy_1, diff1); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qb0[mmi] = _mm_add_ps(qb0[mmi], _mm_cvtepi32_ps(v00)); - qb1[mmi] = _mm_add_ps(qb1[mmi], _mm_cvtepi32_ps(v10)); - // diff * J [0 ... 
7] - // for set 1 sigma 1 - // b3 += diff * Iptr[x] - v10 = _mm_mullo_epi16(mmDiffc_epi16_t, I_0_7_epi16); - v11 = _mm_mulhi_epi16(mmDiffc_epi16_t, I_0_7_epi16); - v00 = _mm_unpacklo_epi16(v10, v11); - - v10 = _mm_unpackhi_epi16(v10, v11); - qb2[mmi] = _mm_add_ps(qb2[mmi], _mm_cvtepi32_ps(v00)); - qb2[mmi] = _mm_add_ps(qb2[mmi], _mm_cvtepi32_ps(v10)); - qb3[mmi] = _mm_add_ps(qb3[mmi], _mm_cvtepi32_ps(diff_0_3_epi32)); - qb3[mmi] = _mm_add_ps(qb3[mmi], _mm_cvtepi32_ps(diff_4_7_epi32)); - } - - if( j == 0 ) - { - __m128 bSet1_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet1_epi16)); - __m128 bSet1_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet1_epi16,bSet1_epi16), 16)); - __m128 mask_0_4_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mask_0_7_epi16)); - __m128 mask_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mask_0_7_epi16, mask_0_7_epi16),16))); - - __m128 bSet2_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet2_epi16)); - __m128 bSet2_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet2_epi16, bSet2_epi16),16)); - - __m128 tale_0_3_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_0_3_ps); - __m128 tale_4_7_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_4_7_ps); - tale_0_3_ps = _mm_blendv_ps(mmParam2s, tale_0_3_ps, bSet2_0_3_ps); - tale_4_7_ps = _mm_blendv_ps(mmParam2s, tale_4_7_ps, bSet2_4_7_ps); - - tale_0_3_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_0_3_ps, mask_0_4_ps); - tale_4_7_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_4_7_ps, mask_4_7_ps); - - t0 = _mm_srai_epi32(Ixy_0, 16); // Iy0 Iy1 Iy2 Iy3 - t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16); // Ix0 Ix1 Ix2 Ix3 - - __m128 fy = _mm_cvtepi32_ps(t0); - __m128 fx = _mm_cvtepi32_ps(t1); - - // 0 ... 
3 - __m128 I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(I_0_7_epi16, I_0_7_epi16), 16)); - - // A11 - A22 - __m128 fxtale = _mm_mul_ps(fx, tale_0_3_ps); - __m128 fytale = _mm_mul_ps(fy, tale_0_3_ps); - - mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); - mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); - mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); - - // sumIx und sumIy - mmSumIx = _mm_add_ps(mmSumIx, fxtale); - mmSumIy = _mm_add_ps(mmSumIy, fytale); - - mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fxtale)); - mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fytale)); - - // sumI - __m128 I_tale_ps = _mm_mul_ps(I_ps, tale_0_3_ps); - mmSumI = _mm_add_ps(mmSumI,I_tale_ps); - - // sumW - mmSumW = _mm_add_ps(mmSumW, tale_0_3_ps); - - // sumDI - mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps( I_ps, I_tale_ps)); - - - t0 = _mm_srai_epi32(Ixy_1, 16); // Iy8 Iy9 Iy10 Iy11 - t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_1, 16), 16); // Ix0 Ix1 Ix2 Ix3 - - fy = _mm_cvtepi32_ps(t0); - fx = _mm_cvtepi32_ps(t1); - - // 4 ... 
7 - I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(I_0_7_epi16, I_0_7_epi16), 16)); - - // A11 - A22 - fxtale = _mm_mul_ps(fx, tale_4_7_ps); - fytale = _mm_mul_ps(fy, tale_4_7_ps); - - mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); - mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); - mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); - - // sumIx und sumIy - mmSumIx = _mm_add_ps(mmSumIx, fxtale); - mmSumIy = _mm_add_ps(mmSumIy, fytale); - - mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fxtale)); - mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fytale)); - - // sumI - I_tale_ps = _mm_mul_ps(I_ps, tale_4_7_ps); - mmSumI = _mm_add_ps(mmSumI, I_tale_ps); - - // sumW - mmSumW = _mm_add_ps(mmSumW, tale_4_7_ps); - - // sumDI - mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps( I_ps, I_tale_ps)); - } - - } + + buffIdx = 0; + float _b0[4] = {0,0,0,0}; + float _b1[4] = {0,0,0,0}; + float _b2[4] = {0,0,0,0}; + float _b3[4] = {0,0,0,0}; + /* + */ + for( y = 0; y < _winSize.height; y++ ) + { + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x )*cn; + const short* Iptr = (const short*)(IWinBuf.data + y*IWinBuf.step); + const short* dIptr = (const short*)(derivIWinBuf.data + y*derivIWinBuf.step); + + x = 0; +#ifdef CV_SSE4_1 + for( ; x <= _winSize.width*cn; x += 8, dIptr += 8*2 ) + { + __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16); + __m128i I_0_7_epi16 = _mm_loadu_si128((const __m128i*)(Iptr + x)); + + __m128i v00 = _mm_unpacklo_epi8( + _mm_loadl_epi64((const __m128i*)(Jptr + x)), z); + __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); + __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); + __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); + + __m128i t0 = _mm_add_epi32 + 
(_mm_madd_epi16( + _mm_unpacklo_epi16(v00, v01), + qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1-5); + + __m128i lo = _mm_mullo_epi16(mmGainValue_epi16, I_0_7_epi16); + __m128i hi = _mm_mulhi_epi16(mmGainValue_epi16, I_0_7_epi16); + + __m128i Igain_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), bitShift); + __m128i Igain_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), bitShift); + __m128i Igain_epi16 = _mm_packs_epi32(Igain_0_3_epi32, Igain_4_7_epi32); + + + __m128i diffValue = _mm_subs_epi16(_mm_add_epi16(Igain_epi16, mmConstValue_epi16), I_0_7_epi16); + __m128i mmDiffc_epi16[4] = + { + _mm_add_epi16(_mm_slli_epi16(v11, 5), diffValue), + _mm_add_epi16(_mm_slli_epi16(v01, 5), diffValue), + _mm_add_epi16(_mm_slli_epi16(v10, 5), diffValue), + _mm_add_epi16(_mm_slli_epi16(v00, 5), diffValue) + }; + + __m128i mmDiff_epi16 = _mm_add_epi16( _mm_packs_epi32(t0, t1), diffValue); + + + mmDiff_epi16 = _mm_and_si128(mmDiff_epi16, mask_0_7_epi16); + + __m128i scalediffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, mmScale); + mmEta = _mm_add_epi16(mmEta, _mm_add_epi16(_mm_and_si128(scalediffIsPos_epi16, _mm_set1_epi16(2)), _mm_set1_epi16(-1))); + + + __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); + __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); + + + __m128i abs_epi16 = _mm_abs_epi16(mmDiff_epi16); + __m128i bSet2_epi16, bSet1_epi16; + // |It| < sigma1 ? + bSet2_epi16 = _mm_cmplt_epi16(abs_epi16, mmParam1); + // It > 0 ? + __m128i diffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, _mm_setzero_si128()); + // sigma0 < |It| < sigma1 ? 
+ bSet1_epi16 = _mm_and_si128(bSet2_epi16, _mm_cmpgt_epi16(abs_epi16, mmParam0)); + // val = |It| -/+ sigma1 + __m128i tmpParam1_epi16 = _mm_add_epi16(_mm_and_si128(diffIsPos_epi16, _mm_sub_epi16(mmDiff_epi16, mmParam1)), + _mm_andnot_si128(diffIsPos_epi16, _mm_add_epi16(mmDiff_epi16, mmParam1))); + // It == 0 ? |It| > sigma13 + mmDiff_epi16 = _mm_and_si128(bSet2_epi16, mmDiff_epi16); + + for( unsigned int mmi = 0; mmi < 4; mmi++) + { + __m128i mmDiffc_epi16_t = _mm_and_si128(mmDiffc_epi16[mmi], mask_0_7_epi16); + mmDiffc_epi16_t = _mm_and_si128(bSet2_epi16, mmDiffc_epi16_t); + + // It == val ? sigma0 < |It| < sigma1 + mmDiffc_epi16_t = _mm_blendv_epi8(mmDiffc_epi16_t, tmpParam1_epi16, bSet1_epi16); + __m128i tale_epi16_ = _mm_blendv_epi8(mmOness_epi16, mmParam2_epi16, bSet1_epi16); // mask for 0 - 3 + // diff = diff * sigma2 + lo = _mm_mullo_epi16(tale_epi16_, mmDiffc_epi16_t); + hi = _mm_mulhi_epi16(tale_epi16_, mmDiffc_epi16_t); + __m128i diff_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), s2bitShift); + __m128i diff_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), s2bitShift); + + mmDiffc_epi16_t = _mm_packs_epi32(diff_0_3_epi32, diff_4_7_epi32); + __m128i diff1 = _mm_unpackhi_epi16(mmDiffc_epi16_t, mmDiffc_epi16_t); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13... + __m128i diff0 = _mm_unpacklo_epi16(mmDiffc_epi16_t, mmDiffc_epi16_t); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9... + + // Ix * diff / Iy * diff + v10 = _mm_mullo_epi16(Ixy_0, diff0); + v11 = _mm_mulhi_epi16(Ixy_0, diff0); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + + qb0[mmi] = _mm_add_ps(qb0[mmi], _mm_cvtepi32_ps(v00)); + qb1[mmi] = _mm_add_ps(qb1[mmi], _mm_cvtepi32_ps(v10)); + // It * Ix It * Iy [4 ... 
7] + // for set 1 hi sigma 1 + v10 = _mm_mullo_epi16(Ixy_1, diff1); + v11 = _mm_mulhi_epi16(Ixy_1, diff1); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb0[mmi] = _mm_add_ps(qb0[mmi], _mm_cvtepi32_ps(v00)); + qb1[mmi] = _mm_add_ps(qb1[mmi], _mm_cvtepi32_ps(v10)); + // diff * J [0 ... 7] + // for set 1 sigma 1 + // b3 += diff * Iptr[x] + v10 = _mm_mullo_epi16(mmDiffc_epi16_t, I_0_7_epi16); + v11 = _mm_mulhi_epi16(mmDiffc_epi16_t, I_0_7_epi16); + v00 = _mm_unpacklo_epi16(v10, v11); + + v10 = _mm_unpackhi_epi16(v10, v11); + qb2[mmi] = _mm_add_ps(qb2[mmi], _mm_cvtepi32_ps(v00)); + qb2[mmi] = _mm_add_ps(qb2[mmi], _mm_cvtepi32_ps(v10)); + qb3[mmi] = _mm_add_ps(qb3[mmi], _mm_cvtepi32_ps(diff_0_3_epi32)); + qb3[mmi] = _mm_add_ps(qb3[mmi], _mm_cvtepi32_ps(diff_4_7_epi32)); + } + + if( j == 0 ) + { + __m128 bSet1_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet1_epi16)); + __m128 bSet1_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet1_epi16,bSet1_epi16), 16)); + __m128 mask_0_4_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mask_0_7_epi16)); + __m128 mask_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mask_0_7_epi16, mask_0_7_epi16),16))); + + __m128 bSet2_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet2_epi16)); + __m128 bSet2_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet2_epi16, bSet2_epi16),16)); + + __m128 tale_0_3_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_0_3_ps); + __m128 tale_4_7_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_4_7_ps); + tale_0_3_ps = _mm_blendv_ps(mmParam2s, tale_0_3_ps, bSet2_0_3_ps); + tale_4_7_ps = _mm_blendv_ps(mmParam2s, tale_4_7_ps, bSet2_4_7_ps); + + tale_0_3_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_0_3_ps, mask_0_4_ps); + tale_4_7_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_4_7_ps, mask_4_7_ps); + + t0 = _mm_srai_epi32(Ixy_0, 16); // Iy0 Iy1 Iy2 Iy3 + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16); // Ix0 Ix1 Ix2 Ix3 + + __m128 fy = _mm_cvtepi32_ps(t0); + __m128 fx = 
_mm_cvtepi32_ps(t1); + + // 0 ... 3 + __m128 I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(I_0_7_epi16, I_0_7_epi16), 16)); + + // A11 - A22 + __m128 fxtale = _mm_mul_ps(fx, tale_0_3_ps); + __m128 fytale = _mm_mul_ps(fy, tale_0_3_ps); + + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); + mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); + mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); + + // sumIx und sumIy + mmSumIx = _mm_add_ps(mmSumIx, fxtale); + mmSumIy = _mm_add_ps(mmSumIy, fytale); + + mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fxtale)); + mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fytale)); + + // sumI + __m128 I_tale_ps = _mm_mul_ps(I_ps, tale_0_3_ps); + mmSumI = _mm_add_ps(mmSumI,I_tale_ps); + + // sumW + mmSumW = _mm_add_ps(mmSumW, tale_0_3_ps); + + // sumDI + mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps( I_ps, I_tale_ps)); + + + t0 = _mm_srai_epi32(Ixy_1, 16); // Iy8 Iy9 Iy10 Iy11 + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_1, 16), 16); // Ix0 Ix1 Ix2 Ix3 + + fy = _mm_cvtepi32_ps(t0); + fx = _mm_cvtepi32_ps(t1); + + // 4 ... 
7 + I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(I_0_7_epi16, I_0_7_epi16), 16)); + + // A11 - A22 + fxtale = _mm_mul_ps(fx, tale_4_7_ps); + fytale = _mm_mul_ps(fy, tale_4_7_ps); + + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); + mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); + mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); + + // sumIx und sumIy + mmSumIx = _mm_add_ps(mmSumIx, fxtale); + mmSumIy = _mm_add_ps(mmSumIy, fytale); + + mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fxtale)); + mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fytale)); + + // sumI + I_tale_ps = _mm_mul_ps(I_ps, tale_4_7_ps); + mmSumI = _mm_add_ps(mmSumI, I_tale_ps); + + // sumW + mmSumW = _mm_add_ps(mmSumW, tale_4_7_ps); + + // sumDI + mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps( I_ps, I_tale_ps)); + } + + } #else - for( ; x < _winSize.width*cn; x++, dIptr += 2 ) - { - if( maskPtr[x] == 0) - continue; - - short ixval = static_cast(dIptr[0]); - short iyval = static_cast(dIptr[1]); - int illValue = Iptr[x] * gainVec.x + gainVec.y - Iptr[x]; - - int It[4] = {(Jptr[x+step+cn]<< 5) + illValue, - (Jptr[x+cn]<< 5) + illValue, - (Jptr[x+step]<< 5) + illValue, - (Jptr[x] << 5) + illValue}; - - - float J = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + - Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, - W_BITS1-5); - - int diff = J + illValue; - - - - MEstimatorScale += (diff < MEstimatorScale) ? -eta : eta; - - int abss = (diff < 0) ? 
-diff : diff; - - - // compute the missmatch vector - if( j >= 0) - { - if( abss > fParam1) - { - It[0] = 0; - It[1] = 0; - It[2] = 0; - It[3] = 0; - } - else if( abss > fParam0 && diff >= 0 ) - { - It[0] = param[2] * (It[0] - fParam1); - It[1] = param[2] * (It[1] - fParam1); - It[2] = param[2] * (It[2] - fParam1); - It[3] = param[2] * (It[3] - fParam1); - } - else if( abss > fParam0 && diff < 0 ) - { - It[0] = param[2] * (It[0] + fParam1); - It[1] = param[2] * (It[1] + fParam1); - It[2] = param[2] * (It[2] + fParam1); - It[3] = param[2] * (It[3] + fParam1); - } - } - _b0[0] += (float)(It[0]*dIptr[0]) ; - _b0[1] += (float)(It[1]*dIptr[0]) ; - _b0[2] += (float)(It[2]*dIptr[0]) ; - _b0[3] += (float)(It[3]*dIptr[0]) ; - - - _b1[0] += (float)(It[0]*dIptr[1]) ; - _b1[1] += (float)(It[1]*dIptr[1]) ; - _b1[2] += (float)(It[2]*dIptr[1]) ; - _b1[3] += (float)(It[3]*dIptr[1]) ; - - _b2[0] += (float)(It[0])*Iptr[x] ; - _b2[1] += (float)(It[1])*Iptr[x] ; - _b2[2] += (float)(It[2])*Iptr[x] ; - _b2[3] += (float)(It[3])*Iptr[x] ; - - _b3[0] += (float)(It[0]); - _b3[1] += (float)(It[1]); - _b3[2] += (float)(It[2]); - _b3[3] += (float)(It[3]); - - // compute the Gradient Matrice - if( j == 0) - { - float tale = param[2] * FLT_RESCALE; - if( abss < fParam0 || j < 0) - { - tale = FLT_RESCALE; - } - else if( abss > fParam1) - { - tale *= 0.01f; - } - else - { - tale *= param[2]; - } - if( j == 0 ) - { - A11 += (float)(ixval*ixval)*tale; - A12 += (float)(ixval*iyval)*tale; - A22 += (float)(iyval*iyval)*tale; - } - - dI += Iptr[x] * Iptr[x] * tale; - float dx = static_cast(dIptr[0]) * tale; - float dy = static_cast(dIptr[1]) * tale; - sumIx += dx; - sumIy += dy; - w1 += dx * Iptr[x]; - w2 += dy * Iptr[x]; - sumI += Iptr[x] * tale; - sumW += tale; - } - - } + for( ; x < _winSize.width*cn; x++, dIptr += 2 ) + { + if( maskPtr[x] == 0) + continue; + + short ixval = static_cast(dIptr[0]); + short iyval = static_cast(dIptr[1]); + int illValue = Iptr[x] * gainVec.x + gainVec.y - Iptr[x]; + + 
int It[4] = {(Jptr[x+step+cn]<< 5) + illValue, + (Jptr[x+cn]<< 5) + illValue, + (Jptr[x+step]<< 5) + illValue, + (Jptr[x] << 5) + illValue}; + + + float J = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + + Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, + W_BITS1-5); + + int diff = J + illValue; + + + + MEstimatorScale += (diff < MEstimatorScale) ? -eta : eta; + + int abss = (diff < 0) ? -diff : diff; + + + // compute the missmatch vector + if( j >= 0) + { + if( abss > fParam1) + { + It[0] = 0; + It[1] = 0; + It[2] = 0; + It[3] = 0; + } + else if( abss > fParam0 && diff >= 0 ) + { + It[0] = param[2] * (It[0] - fParam1); + It[1] = param[2] * (It[1] - fParam1); + It[2] = param[2] * (It[2] - fParam1); + It[3] = param[2] * (It[3] - fParam1); + } + else if( abss > fParam0 && diff < 0 ) + { + It[0] = param[2] * (It[0] + fParam1); + It[1] = param[2] * (It[1] + fParam1); + It[2] = param[2] * (It[2] + fParam1); + It[3] = param[2] * (It[3] + fParam1); + } + } + _b0[0] += (float)(It[0]*dIptr[0]) ; + _b0[1] += (float)(It[1]*dIptr[0]) ; + _b0[2] += (float)(It[2]*dIptr[0]) ; + _b0[3] += (float)(It[3]*dIptr[0]) ; + + + _b1[0] += (float)(It[0]*dIptr[1]) ; + _b1[1] += (float)(It[1]*dIptr[1]) ; + _b1[2] += (float)(It[2]*dIptr[1]) ; + _b1[3] += (float)(It[3]*dIptr[1]) ; + + _b2[0] += (float)(It[0])*Iptr[x] ; + _b2[1] += (float)(It[1])*Iptr[x] ; + _b2[2] += (float)(It[2])*Iptr[x] ; + _b2[3] += (float)(It[3])*Iptr[x] ; + + _b3[0] += (float)(It[0]); + _b3[1] += (float)(It[1]); + _b3[2] += (float)(It[2]); + _b3[3] += (float)(It[3]); + + // compute the Gradient Matrice + if( j == 0) + { + float tale = param[2] * FLT_RESCALE; + if( abss < fParam0 || j < 0) + { + tale = FLT_RESCALE; + } + else if( abss > fParam1) + { + tale *= 0.01f; + } + else + { + tale *= param[2]; + } + if( j == 0 ) + { + A11 += (float)(ixval*ixval)*tale; + A12 += (float)(ixval*iyval)*tale; + A22 += (float)(iyval*iyval)*tale; + } + + dI += Iptr[x] * Iptr[x] * tale; + float dx = static_cast(dIptr[0]) * tale; + float dy = 
static_cast(dIptr[1]) * tale; + sumIx += dx; + sumIy += dy; + w1 += dx * Iptr[x]; + w2 += dy * Iptr[x]; + sumI += Iptr[x] * tale; + sumW += tale; + } + + } #endif - } - -#ifdef CV_SSE2 - short etaValues[8]; - _mm_storeu_si128((__m128i*)(etaValues), mmEta); - MEstimatorScale += eta * (etaValues[0] + etaValues[1] + etaValues[2] + etaValues[3] - + etaValues[4] + etaValues[5] + etaValues[6] + etaValues[7]); - -#endif - float CV_DECL_ALIGNED(32) wbuf[4];// - if( j == 0 ) - { -#ifdef CV_SSE2 - _mm_store_ps(wbuf, mmSumW1); - w1 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumW2); - w2 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumDI); - dI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumI); - sumI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumIx); - sumIx = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumIy); - sumIy = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumW); - sumW = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + } + +#ifdef CV_SSE4_1 + short etaValues[8]; + _mm_storeu_si128((__m128i*)(etaValues), mmEta); + MEstimatorScale += eta * (etaValues[0] + etaValues[1] + etaValues[2] + etaValues[3] + + etaValues[4] + etaValues[5] + etaValues[6] + etaValues[7]); + +#endif + float CV_DECL_ALIGNED(32) wbuf[4];// + if( j == 0 ) + { +#ifdef CV_SSE4_1 + _mm_store_ps(wbuf, mmSumW1); + w1 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumW2); + w2 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumDI); + dI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumI); + sumI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumIx); + sumIx = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumIy); + sumIy = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumW); + sumW = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; #endif - sumIx *= -FLT_SCALE; - sumIy *= -FLT_SCALE; - sumI *=FLT_SCALE; - sumW *= 
FLT_SCALE; - w1 *= -FLT_SCALE; - w2 *= -FLT_SCALE; - dI *= FLT_SCALE; - - -#ifdef CV_SSE2 - float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];// - - _mm_store_ps(A11buf, mmAxx); - _mm_store_ps(A12buf, mmAxy); - _mm_store_ps(A22buf, mmAyy); - - - A11 = A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; - A12 = A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; - A22 = A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; + sumIx *= -FLT_SCALE; + sumIy *= -FLT_SCALE; + sumI *=FLT_SCALE; + sumW *= FLT_SCALE; + w1 *= -FLT_SCALE; + w2 *= -FLT_SCALE; + dI *= FLT_SCALE; + + +#ifdef CV_SSE4_1 + float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];// + + _mm_store_ps(A11buf, mmAxx); + _mm_store_ps(A12buf, mmAxy); + _mm_store_ps(A22buf, mmAyy); + + + A11 = A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; + A12 = A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; + A22 = A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; #endif - A11 *= FLT_SCALE; // 54866744. - A12 *= FLT_SCALE; // -628764.00 - A22 *= FLT_SCALE; // 19730.000 - - D = - A12*A12*sumI*sumI + dI*sumW*A12*A12 + 2*A12*sumI*sumIx*w2 + 2*A12*sumI*sumIy*w1 - - 2*dI*A12*sumIx*sumIy - 2*sumW*A12*w1*w2 + A11*A22*sumI*sumI - 2*A22*sumI*sumIx*w1 - - 2*A11*sumI*sumIy*w2 - sumIx*sumIx*w2*w2 + A22*dI*sumIx*sumIx + 2*sumIx*sumIy*w1*w2 - - sumIy*sumIy*w1*w1 + A11*dI*sumIy*sumIy + A22*sumW*w1*w1 + A11*sumW*w2*w2 - A11*A22*dI*sumW; - - float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + - 4.f*A12*A12))/(2*winArea); - if( minEig < minEigThreshold || std::abs(D) < FLT_EPSILON ) - { - if( level == 0 && status ) - status[ptidx] = 0; - if( level > 0) - { - nextPts[ptidx] = backUpNextPt; - gainVecs[ptidx] = backUpGain; - } - noIteration++; - break; - } - - - D = (1.f / D); - - invTensorMat.at(0,0) = (A22*sumI*sumI - 2*sumI*sumIy*w2 + dI*sumIy*sumIy + sumW*w2*w2 - A22*dI*sumW)* D; - invTensorMat.at(0,1) = (A12*dI*sumW - A12*sumI * sumI - dI*sumIx*sumIy + sumI*sumIx*w2 + sumI*sumIy*w1 - sumW*w1*w2)* D; - invTensorMat.at(0,2) = 
(A12*sumI*sumIy - sumIy*sumIy*w1 - A22*sumI*sumIx - A12*sumW*w2 + A22*sumW*w1 + sumIx*sumIy*w2)* D; - invTensorMat.at(0,3) = (A22*dI*sumIx - A12*dI*sumIy - sumIx*w2*w2 + A12*sumI*w2 - A22*sumI*w1 + sumIy*w1*w2) * D; - invTensorMat.at(1,0) = invTensorMat.at(0,1); - invTensorMat.at(1,1) = (A11*sumI * sumI - 2*sumI*sumIx*w1 + dI*sumIx * sumIx + sumW*w1*w1 - A11*dI*sumW) * D; - invTensorMat.at(1,2) = (A12*sumI*sumIx - A11*sumI*sumIy - sumIx * sumIx*w2 + A11*sumW*w2 - A12*sumW*w1 + sumIx*sumIy*w1) * D; - invTensorMat.at(1,3) = (A11*dI*sumIy - sumIy*w1*w1 - A12*dI*sumIx - A11*sumI*w2 + A12*sumI*w1 + sumIx*w1*w2)* D; - invTensorMat.at(2,0) = invTensorMat.at(0,2); - invTensorMat.at(2,1) = invTensorMat.at(1,2); - invTensorMat.at(2,2) = (sumW*A12*A12 - 2*A12*sumIx*sumIy + A22*sumIx*sumIx + A11*sumIy*sumIy - A11*A22*sumW)* D; - invTensorMat.at(2,3) = (A11*A22*sumI - A12*A12*sumI - A11*sumIy*w2 + A12*sumIx*w2 + A12*sumIy*w1 - A22*sumIx*w1)* D; - invTensorMat.at(3,0) = invTensorMat.at(0,3); - invTensorMat.at(3,1) = invTensorMat.at(1,3); - invTensorMat.at(3,2) = invTensorMat.at(2,3); - invTensorMat.at(3,3) = (dI*A12*A12 - 2*A12*w1*w2 + A22*w1*w1 + A11*w2*w2 - A11*A22*dI)* D; - } - - - #ifdef CV_SSE2 - float CV_DECL_ALIGNED(16) bbuf[4]; - for(int mmi = 0; mmi < 4; mmi++) - { - - _mm_store_ps(bbuf, _mm_add_ps(qb0[mmi], qb1[mmi])); - _b0[mmi] = bbuf[0] + bbuf[2]; - _b1[mmi] = bbuf[1] + bbuf[3]; - _mm_store_ps(bbuf, qb2[mmi]); - _b2[mmi] = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; - _mm_store_ps(bbuf, qb3[mmi]); - _b3[mmi] = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; - - } - #endif - - _b0[0] *= FLT_SCALE;_b0[1] *= FLT_SCALE;_b0[2] *= FLT_SCALE;_b0[3] *= FLT_SCALE; - _b1[0] *= FLT_SCALE;_b1[1] *= FLT_SCALE;_b1[2] *= FLT_SCALE;_b1[3] *= FLT_SCALE; - _b2[0] *= FLT_SCALE;_b2[1] *= FLT_SCALE;_b2[2] *= FLT_SCALE;_b2[3] *= FLT_SCALE; - _b3[0] *= FLT_SCALE;_b3[1] *= FLT_SCALE;_b3[2] *= FLT_SCALE;_b3[3] *= FLT_SCALE; - - - Mc0[0] = _b0[0] - _b0[1] - _b0[2] + _b0[3]; - Mc0[1] = _b1[0] - _b1[1] - 
_b1[2] + _b1[3]; - Mc0[2] = -(_b2[0] - _b2[1] - _b2[2] + _b2[3]); - Mc0[3] = -(_b3[0] - _b3[1] - _b3[2] + _b3[3]); - - Mc1[0] = _b0[1] - _b0[3]; - Mc1[1] = _b1[1] - _b1[3]; - Mc1[2] = -(_b2[1] - _b2[3]); - Mc1[3] = -(_b3[1] - _b3[3]); - - - Mc2[0] = _b0[2] - _b0[3]; - Mc2[1] = _b1[2] - _b1[3]; - Mc2[2] = -(_b2[2] - _b2[3]); - Mc2[3] = -(_b3[2] - _b3[3]); - - - Mc3[0] = _b0[3]; - Mc3[1] = _b1[3]; - Mc3[2] = -_b2[3]; - Mc3[3] = -_b3[3]; - - // - c[0] = -Mc0[0]; - c[1] = -Mc1[0]; - c[2] = -Mc2[0]; - c[3] = -Mc3[0]; - c[4] = -Mc0[1]; - c[5] = -Mc1[1]; - c[6] = -Mc2[1]; - c[7] = -Mc3[1]; - - float e0 = 1.f / (c[6] * c[0] - c[4] * c[2]); - float e1 = e0 * 0.5f * (c[6] * c[1] + c[7] * c[0] - c[5] * c[2] - c[4] * c[3]); - float e2 = e0 * (c[1] * c[7] -c[3] * c[5]); - e0 = e1 * e1 - e2; - hasSolved = false; - if ( e0 > 0) - { - e0 = sqrt(e0); - float _y[2] = {-e1 - e0, e0 - e1}; - float c0yc1[2] = {c[0] * _y[0] + c[1], - c[0] * _y[1] + c[1]}; - float _x[2] = {-(c[2] * _y[0] + c[3]) / c0yc1[0], - -(c[2] * _y[1] + c[3]) / c0yc1[1]}; - float c0yc2[2] = {c[0] * _y[0] + c[2], - c[0] * _y[1] + c[2]}; - - float c4yc6[2] = {c[4] * _y[0] + c[6], - c[4] * _y[1] + c[6]}; - - float c0xc1[2] = {c[0] * _x[0] + c[1], - c[0] * _x[1] + c[1]}; - - float c4xc5[2] = {c[4] * _x[0] + c[5], - c[4] * _x[1] + c[5]}; - bool isIn1 = (_x[0] >=0 && _x[0] <=1 && _y[0] >= 0 && _y[0] <= 1); - bool isIn2 = (_x[1] >=0 && _x[1] <=1 && _y[1] >= 0 && _y[1] <= 1); - - bool isSolution1 = checkSolution(_x[0], _y[0], c ); - bool isSolution2 = checkSolution(_x[1], _y[1], c ); - bool isSink1 = isIn1 && isSolution1; - bool isSink2 = isIn2 && isSolution2; - - if ( isSink1 != isSink2) - { - a = isSink1 ? _x[0] : _x[1]; - b = isSink1 ? 
_y[0] : _y[1]; - ab = a * b; - hasSolved = true; - delta.x = inextPt.x + a - nextPt.x; - delta.y = inextPt.y + b - nextPt.y; - - cv::Vec4f mismatchVec = ab * Mc0 + Mc1 *a + Mc2 * b + Mc3; - deltaGain = est_DeltaGain(invTensorMat, mismatchVec); - - } // isIn1 != isIn2 - } - if( hasSolved == false) - noIteration++; - } - else - { - hasSolved = false; - noReusedIteration++; - } - if( hasSolved == false ) - { - - cv::Vec4f mismatchVec = ab * Mc0 + Mc1 *a + Mc2 * b + Mc3; - est_Result(invTensorMat, mismatchVec, delta, deltaGain); - - delta.x = MAX(-1.f, MIN( 1.f , delta.x)); - delta.y = MAX(-1.f, MIN( 1.f , delta.y)); - - - if( j == 0) - prevGain = deltaGain; - gainVec += deltaGain; - nextPt += delta ; - nextPts[ptidx] = nextPt + halfWin; - gainVecs[ptidx]= gainVec; - - } - else - { - nextPt += delta; - nextPts[ptidx] = nextPt + halfWin; - gainVecs[ptidx]= gainVec + deltaGain; - noSolvedIteration++; - break; - } - - - cv::Point2f totalDistance = backUpNextPt - nextPts[ptidx]; - - if(j > 0 && ( - (std::abs(delta.x - prevDelta.x) < 0.01 && std::abs(delta.y - prevDelta.y) < 0.01) - || ((delta.ddot(delta) <= 0.001) && std::abs(prevGain.x - deltaGain.x) < 0.01))) - { - nextPts[ptidx] -= delta*0.5f; - gainVecs[ptidx] -= deltaGain* 0.5f; - break; - } - - - prevDelta = delta; - prevGain = deltaGain; - } - - } - - } - - const Mat* prevImg; - const Mat* nextImg; - const Mat* prevDeriv; - const Mat* rgbPrevImg; - const Mat* rgbNextImg; - const Point2f* prevPts; - Point2f* nextPts; - uchar* status; - cv::Point2f* gainVecs; // gain vector x -> multiplier y -> offset - float* err; - int maxWinSize; - int minWinSize; - TermCriteria criteria; - int level; - int maxLevel; - int windowType; - float minEigThreshold; - bool useInitialFlow; - std::vector paramReset; - int crossSegmentationThreshold; - - }; - } - } - namespace beplk - { - namespace ica{ - class TrackerInvoker : public cv::ParallelLoopBody - { - public: - TrackerInvoker(const Mat& _prevImg, - const Mat& _prevDeriv, - const 
Mat& _nextImg, - const Mat& _rgbPrevImg, - const Mat& _rgbNextImg, - const Point2f* _prevPts, - Point2f* _nextPts, - uchar* _status, - float* _err, - int _level, - int _maxLevel, - int _winSize[2], - int _maxIteration, - bool _useInitialFlow, - int _supportRegionType, - int _crossSegmentationThreshold, - float _minEigenValue) - { - prevImg = &_prevImg; - prevDeriv = &_prevDeriv; - nextImg = &_nextImg; - rgbPrevImg = &_rgbPrevImg; - rgbNextImg = &_rgbNextImg; - prevPts = _prevPts; - nextPts = _nextPts; - status = _status; - err = _err; - minWinSize = _winSize[0]; - maxWinSize = _winSize[1]; - criteria.maxCount = _maxIteration; - criteria.epsilon = 0.01; - level = _level; - maxLevel = _maxLevel; - windowType = _supportRegionType; - minEigThreshold = _minEigenValue; - useInitialFlow = _useInitialFlow; - crossSegmentationThreshold = _crossSegmentationThreshold; - } - - TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; - - void operator()(const cv::Range& range) const - { - cv::Size winSize; - cv::Point2f halfWin; - - const Mat& I = *prevImg; - const Mat& J = *nextImg; - const Mat& derivI = *prevDeriv; - const Mat& BI = *rgbPrevImg; - - winSize = cv::Size(maxWinSize,maxWinSize); - int winMaskwidth = static_cast(ceil(winSize.width / 8.f)) * 16; - cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); - winMaskMatBuf.setTo(1); - - const float FLT_SCALE = (1.f/(1 << 20)); // 20 - - int j, cn = I.channels(), cn2 = cn*2; - int winbufwidth = static_cast(ceil(winSize.width / 8.f)) * 8; - cv::Size winBufSize(winbufwidth,winbufwidth); - - std::vector _buf(winBufSize.area()*(cn + cn2)); - Mat IWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn), &_buf[0]); - Mat derivIWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn2), &_buf[winBufSize.area()*cn]); - - for( int ptidx = range.start; ptidx < range.end; ptidx++ ) - { - Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level)); - Point2f nextPt; - if( level == maxLevel ) - { - if( useInitialFlow ) - nextPt = 
nextPts[ptidx]*(float)(1./(1 << level)); - else - nextPt = prevPt; - } - else - nextPt = nextPts[ptidx]*2.f; - nextPts[ptidx] = nextPt; - - Point2i iprevPt, inextPt; - iprevPt.x = cvFloor(prevPt.x); - iprevPt.y = cvFloor(prevPt.y); - int winArea = maxWinSize * maxWinSize; - cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0,0, maxWinSize,maxWinSize)); - - if( calcWinMaskMat(BI, windowType, iprevPt, - winMaskMat,winSize,halfWin,winArea, - minWinSize,maxWinSize, crossSegmentationThreshold) == false) - continue; - - prevPt -= halfWin; - iprevPt.x = cvFloor(prevPt.x); - iprevPt.y = cvFloor(prevPt.y); - - if( iprevPt.x + halfWin.x < 0 || iprevPt.x + halfWin.x>= derivI.cols || - iprevPt.y + halfWin.y < 0 || iprevPt.y + halfWin.y >= derivI.rows ) - { - if( level == 0 ) - { - if( status ) - status[ptidx] = 3; - if( err ) - err[ptidx] = 0; - } - continue; - } - - float a = prevPt.x - iprevPt.x; - float b = prevPt.y - iprevPt.y; - const int W_BITS = 14, W_BITS1 = 14; - - int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); - int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); - int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); - int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - - int dstep = (int)(derivI.step/derivI.elemSize1()); - int step = (int)(I.step/I.elemSize1()); - int mStep = (int)(winMaskMat.step/winMaskMat.elemSize1()); - CV_Assert( step == (int)(J.step/J.elemSize1()) ); - float A11 = 0, A12 = 0, A22 = 0; - - #ifdef CV_SSE2 - __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); - __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); - __m128i z = _mm_setzero_si128(); - __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1-1)); - __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1-5-1)); - __m128 qA11 = _mm_setzero_ps(), qA12 = _mm_setzero_ps(), qA22 = _mm_setzero_ps(); - __m128i mmMask4_epi32; - get4BitMask(winSize.width, mmMask4_epi32); - #endif - - - int x, y; - for( y = 0; y < winSize.height; y++ ) - { - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - const 
uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; - const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; - - short* Iptr = (short*)(IWinBuf.data + y*IWinBuf.step); - short* dIptr = (short*)(derivIWinBuf.data + y*derivIWinBuf.step); - - x = 0; - - #ifdef CV_SSE2 - for( ; x < winSize.width*cn; x += 4, dsrc += 4*2, dIptr += 4*2 ) - { - __m128i wMask = _mm_set_epi32(MaskSet * maskPtr[x+3], - MaskSet * maskPtr[x+2], - MaskSet * maskPtr[x+1], - MaskSet * maskPtr[x]); - __m128i v00, v01, v10, v11, t0, t1; - v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); - v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); - v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); - v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); - - t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); - - t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); - _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0,t0)); - - v00 = _mm_loadu_si128((const __m128i*)(dsrc)); - v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); - v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep)); - v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); - - t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); - t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); - t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); - t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); - v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... 
- - if( x + 4 > winSize.width) - { - v00 = _mm_and_si128(v00, mmMask4_epi32); - } - v00 = _mm_and_si128(v00, wMask); - - _mm_storeu_si128((__m128i*)dIptr, v00); - t0 = _mm_srai_epi32(v00, 16); // Iy0 Iy1 Iy2 Iy3 - t1 = _mm_srai_epi32(_mm_slli_epi32(v00, 16), 16); // Ix0 Ix1 Ix2 Ix3 - - __m128 fy = _mm_cvtepi32_ps(t0); - __m128 fx = _mm_cvtepi32_ps(t1); - - qA22 = _mm_add_ps(qA22, _mm_mul_ps(fy, fy)); - qA12 = _mm_add_ps(qA12, _mm_mul_ps(fx, fy)); - qA11 = _mm_add_ps(qA11, _mm_mul_ps(fx, fx)); - } - #else - - for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2) - { - if( maskPtr[x] == 0) - { - dIptr[0] = (short)0; - dIptr[1] = (short)0; - - continue; - } - int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + - src[x+step]*iw10 + src[x+step+cn]*iw11, W_BITS1-5); - int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + - dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1); - int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 + - dsrc[dstep+cn2+1]*iw11, W_BITS1); - - Iptr[x] = (short)ival; - dIptr[0] = (short)ixval; - dIptr[1] = (short)iyval; - A11 += (float)(ixval*ixval); - A12 += (float)(ixval*iyval); - A22 += (float)(iyval*iyval); - - } - - - #endif - - } - - #ifdef CV_SSE2 - float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4]; - _mm_store_ps(A11buf, qA11); - _mm_store_ps(A12buf, qA12); - _mm_store_ps(A22buf, qA22); - A11 += A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; - A12 += A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; - A22 += A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; - #endif - - A11 *= FLT_SCALE; - A12 *= FLT_SCALE; - A22 *= FLT_SCALE; - - - float D = A11*A22 - A12*A12; - float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + - 4.f*A12*A12))/(2 * winArea); - - if( err ) - err[ptidx] = (float)minEig; - - if( D < FLT_EPSILON ) - { - if( level == 0 && status ) - status[ptidx] = 0; - continue; - } - - D = 1.f/D; - - cv::Point2f backUpNextPt = nextPt; - nextPt -= halfWin; - Point2f prevDelta(0,0); - - float c[8]; - // 
init the maske - __m128i mmMask_epi32 = _mm_set_epi32(0, std::numeric_limits::max(), 0, std::numeric_limits::max()); - __m128i mmMask_epi16 = _mm_set_epi16(0, std::numeric_limits::max(), 0, std::numeric_limits::max(), - 0, std::numeric_limits::max(), 0, std::numeric_limits::max()); - __m128i mmMask0, mmMask1, mmMask; - getWBitMask(winSize.width, mmMask0, mmMask1, mmMask); - for( j = 0; j < criteria.maxCount; j++ ) - { - cv::Point2f delta; - bool hasSolved = false; - a = nextPt.x - cvFloor(nextPt.x); - b = nextPt.y - cvFloor(nextPt.y); - float ab = a * b; - - - if( (inextPt.x != cvFloor(nextPt.x) || inextPt.y != cvFloor(nextPt.y) || j == 0)) - { - inextPt.x = cvFloor(nextPt.x); - inextPt.y = cvFloor(nextPt.y); - - - if( inextPt.x < -winSize.width || inextPt.x >= J.cols || - inextPt.y < -winSize.height || inextPt.y >= J.rows ) - { - if( level == 0 && status ) - status[ptidx] = 3; - if (level > 0) - nextPts[ptidx] = backUpNextPt; - break; - } - - - iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); - iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); - iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); - iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - - float _b1[4] = {0,0,0,0}; - float _b2[4] = {0,0,0,0}; - #ifdef CV_SSE2 - __m128 qbc0[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; - __m128 qbc1[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; - qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); - qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); - __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps(); - #endif - for( y = 0; y < winSize.height; y++ ) - { - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x )*cn; - const short* Iptr = (const short*)(IWinBuf.data + ( y)*IWinBuf.step); - const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); - x = 0; - #ifdef CV_SSE2 - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - for( ; x <= 
winSize.width*cn; x += 8, dIptr += 8*2 ) - { - if( maskPtr[x ] == 0 && maskPtr[x+1] == 0 && maskPtr[x+2] == 0 && maskPtr[x+3] == 0 - && maskPtr[x+4] == 0 && maskPtr[x+5] == 0 && maskPtr[x+6] == 0 && maskPtr[x+7] == 0) - continue; - __m128i diff0 = _mm_loadu_si128((const __m128i*)(Iptr + x)); - __m128i v00 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x)), z); - __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); - __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); - __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); - - __m128i mmDiffc_epi16[4] = - { _mm_subs_epi16(_mm_slli_epi16(v00, 5), diff0), - _mm_subs_epi16(_mm_slli_epi16(v01, 5), diff0), - _mm_subs_epi16(_mm_slli_epi16(v10, 5), diff0), - _mm_subs_epi16(_mm_slli_epi16(v11, 5), diff0) - }; - - __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... - __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); - - if( x > winSize.width*cn - 8) - { - Ixy_0 = _mm_and_si128(Ixy_0, mmMask0); - Ixy_1 = _mm_and_si128(Ixy_1, mmMask1); - } - - __m128i diffc1[4] = - {_mm_unpackhi_epi16(mmDiffc_epi16[0],mmDiffc_epi16[0]), - _mm_unpackhi_epi16(mmDiffc_epi16[1],mmDiffc_epi16[1]), - _mm_unpackhi_epi16(mmDiffc_epi16[2],mmDiffc_epi16[2]), - _mm_unpackhi_epi16(mmDiffc_epi16[3],mmDiffc_epi16[3]) - }; - - __m128i diffc0[4] = - {_mm_unpacklo_epi16(mmDiffc_epi16[0],mmDiffc_epi16[0]), - _mm_unpacklo_epi16(mmDiffc_epi16[1],mmDiffc_epi16[1]), - _mm_unpacklo_epi16(mmDiffc_epi16[2],mmDiffc_epi16[2]), - _mm_unpacklo_epi16(mmDiffc_epi16[3],mmDiffc_epi16[3]) - }; - - - // It * Ix It * Iy [0 ... 
3] - //mask split for multiplication - // for set 1 lo sigma 1 - - - v10 = _mm_mullo_epi16(Ixy_0, diffc0[0]); - v11 = _mm_mulhi_epi16(Ixy_0, diffc0[0]); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qbc0[0] = _mm_add_ps(qbc0[0], _mm_cvtepi32_ps(v00)); - qbc1[0] = _mm_add_ps(qbc1[0], _mm_cvtepi32_ps(v10)); - - v10 = _mm_mullo_epi16(Ixy_0, diffc0[1]); - v11 = _mm_mulhi_epi16(Ixy_0, diffc0[1]); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qbc0[1] = _mm_add_ps(qbc0[1], _mm_cvtepi32_ps(v00)); - qbc1[1] = _mm_add_ps(qbc1[1], _mm_cvtepi32_ps(v10)); - - v10 = _mm_mullo_epi16(Ixy_0, diffc0[2]); - v11 = _mm_mulhi_epi16(Ixy_0, diffc0[2]); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qbc0[2] = _mm_add_ps(qbc0[2], _mm_cvtepi32_ps(v00)); - qbc1[2] = _mm_add_ps(qbc1[2], _mm_cvtepi32_ps(v10)); - - v10 = _mm_mullo_epi16(Ixy_0, diffc0[3]); - v11 = _mm_mulhi_epi16(Ixy_0, diffc0[3]); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qbc0[3] = _mm_add_ps(qbc0[3], _mm_cvtepi32_ps(v00)); - qbc1[3] = _mm_add_ps(qbc1[3], _mm_cvtepi32_ps(v10)); - // It * Ix It * Iy [4 ... 
7] - // for set 1 hi sigma 1 - - v10 = _mm_mullo_epi16(Ixy_1, diffc1[0]); - v11 = _mm_mulhi_epi16(Ixy_1, diffc1[0]); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qbc0[0] = _mm_add_ps(qbc0[0], _mm_cvtepi32_ps(v00)); - qbc1[0] = _mm_add_ps(qbc1[0], _mm_cvtepi32_ps(v10)); - - v10 = _mm_mullo_epi16(Ixy_1, diffc1[1]); - v11 = _mm_mulhi_epi16(Ixy_1, diffc1[1]); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qbc0[1] = _mm_add_ps(qbc0[1], _mm_cvtepi32_ps(v00)); - qbc1[1] = _mm_add_ps(qbc1[1], _mm_cvtepi32_ps(v10)); - - v10 = _mm_mullo_epi16(Ixy_1, diffc1[2]); - v11 = _mm_mulhi_epi16(Ixy_1, diffc1[2]); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qbc0[2] = _mm_add_ps(qbc0[2], _mm_cvtepi32_ps(v00)); - qbc1[2] = _mm_add_ps(qbc1[2], _mm_cvtepi32_ps(v10)); - - v10 = _mm_mullo_epi16(Ixy_1, diffc1[3]); - v11 = _mm_mulhi_epi16(Ixy_1, diffc1[3]); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qbc0[3] = _mm_add_ps(qbc0[3], _mm_cvtepi32_ps(v00)); - qbc1[3] = _mm_add_ps(qbc1[3], _mm_cvtepi32_ps(v10)); - - } - #else - for( ; x < winSize.width*cn; x++, dIptr += 2 ) - { - if( dIptr[0] == 0 && dIptr[1] == 0) - continue; - /*short It[4] = {CV_DESCALE((Jptr[x]) *(1 << W_BITS),W_BITS1-5) - Iptr[x], - CV_DESCALE((Jptr[x+cn]) *(1 << W_BITS),W_BITS1-5) - Iptr[x], - CV_DESCALE((Jptr[x+step]) *(1 << W_BITS),W_BITS1-5) - Iptr[x], - CV_DESCALE((Jptr[x+step+cn])*(1 << W_BITS),W_BITS1-5) - Iptr[x]};*/ - short It[4] = {(Jptr[x] << 5) - Iptr[x], - (Jptr[x+cn]<< 5) - Iptr[x], - (Jptr[x+step]<< 5) - Iptr[x], - (Jptr[x+step+cn]<< 5) - Iptr[x]}; - - _b1[0] += (float)(It[0]*dIptr[0]); - _b1[1] += (float)(It[1]*dIptr[0]); - _b1[2] += (float)(It[2]*dIptr[0]); - _b1[3] += (float)(It[3]*dIptr[0]); - - _b2[0] += (float)(It[0]*dIptr[1]); - _b2[1] += (float)(It[1]*dIptr[1]); - _b2[2] += (float)(It[2]*dIptr[1]); - _b2[3] += (float)(It[3]*dIptr[1]); - } - #endif - - } - - #ifdef CV_SSE2 - 
float CV_DECL_ALIGNED(16) bbuf[4]; - _mm_store_ps(bbuf, _mm_add_ps(qbc0[0], qbc1[0])); - _b1[0] += bbuf[0] + bbuf[2]; - _b2[0] += bbuf[1] + bbuf[3]; - - _mm_store_ps(bbuf, _mm_add_ps(qbc0[1], qbc1[1])); - _b1[1] += bbuf[0] + bbuf[2]; - _b2[1] += bbuf[1] + bbuf[3]; - - _mm_store_ps(bbuf, _mm_add_ps(qbc0[2], qbc1[2])); - _b1[2] += bbuf[0] + bbuf[2]; - _b2[2] += bbuf[1] + bbuf[3]; - - _mm_store_ps(bbuf, _mm_add_ps(qbc0[3], qbc1[3])); - _b1[3] += bbuf[0] + bbuf[2]; - _b2[3] += bbuf[1] + bbuf[3]; - - #endif - _b1[0] *= FLT_SCALE; - _b1[1] *= FLT_SCALE; - _b1[2] *= FLT_SCALE; - _b1[3] *= FLT_SCALE; - _b2[0] *= FLT_SCALE; - _b2[1] *= FLT_SCALE; - _b2[2] *= FLT_SCALE; - _b2[3] *= FLT_SCALE; - - float c0[4] = { _b1[3] + _b1[0] - _b1[2] - _b1[1], - _b1[1] - _b1[0], - _b1[2] - _b1[0], - _b1[0]}; - float c1[4] = { _b2[3] + _b2[0] - _b2[2] - _b2[1], - _b2[1] - _b2[0], - _b2[2] - _b2[0], - _b2[0]}; - - float DA12 = A12 * D ; - float DA22 = A22 * D ; - float DA11 = A11 * D ; - c[0] = DA12 * c1[0] - DA22 * c0[0]; - c[1] = DA12 * c1[1] - DA22 * c0[1]; - c[2] = DA12 * c1[2] - DA22 * c0[2]; - c[3] = DA12 * c1[3] - DA22 * c0[3]; - c[4] = DA12 * c0[0] - DA11 * c1[0]; - c[5] = DA12 * c0[1] - DA11 * c1[1]; - c[6] = DA12 * c0[2] - DA11 * c1[2]; - c[7] = DA12 * c0[3] - DA11 * c1[3]; - - float e0 = 1.f / (c[6] * c[0] - c[4] * c[2]); - float e1 = e0 * 0.5f * (c[6] * c[1] + c[7] * c[0] - c[5] * c[2] - c[4] * c[3]); - float e2 = e0 * (c[1] * c[7] -c[3] * c[5]); - e0 = e1 * e1 - e2; - if ( e0 >= 0) - { - e0 = sqrt(e0); - float _y[2] = {-e1 - e0, e0 - e1}; - float c0yc1[2] = {c[0] * _y[0] + c[1], - c[0] * _y[1] + c[1]}; - - float _x[2] = {-(c[2] * _y[0] + c[3]) / c0yc1[0], - -(c[2] * _y[1] + c[3]) / c0yc1[1]}; - - float c0yc2[2] = {c[0] * _y[0] + c[2], - c[0] * _y[1] + c[2]}; - - float c4yc6[2] = {c[4] * _y[0] + c[6], - c[4] * _y[1] + c[6]}; - - float c0xc1[2] = {c[0] * _x[0] + c[1], - c[0] * _x[1] + c[1]}; - - float c4xc5[2] = {c[4] * _x[0] + c[5], - c[4] * _x[1] + c[5]}; - bool isIn1 = (_x[0] 
>=0 && _x[0] <=1 && _y[0] >= 0 && _y[0] <= 1); - bool isIn2 = (_x[1] >=0 && _x[1] <=1 && _y[1] >= 0 && _y[1] <= 1); - - - bool isSink1 = isIn1 && checkSolution(_x[0], _y[0], c ); - bool isSink2 = isIn2 && checkSolution(_x[1], _y[1], c ); - - - //if( isSink1 != isSink2 ) - { - if( isSink1 ) - { - hasSolved = true; - delta.x = inextPt.x + _x[0] - nextPt.x; - delta.y = inextPt.y + _y[0] - nextPt.y; - } - if (isSink2 ) - { - hasSolved = true; - delta.x = inextPt.x + _x[1] - nextPt.x; - delta.y = inextPt.y + _y[1] - nextPt.y; - } - } // isIn1 != isIn2 - } // e0 >= 0 - } - else - hasSolved = false; - if(hasSolved == false) - { - delta = Point2f( c[0] * ab + c[1] * a + c[2] * b + c[3], - c[4] * ab + c[5] * a + c[6] * b + c[7]); - nextPt += 0.7 * delta; - nextPts[ptidx] = nextPt + halfWin; - } - else - { - nextPt += delta; - nextPts[ptidx] = nextPt + halfWin; - break; - } - - - if( delta.ddot(delta) <= criteria.epsilon) - break; - - if(j > 0 && std::abs(delta.x - prevDelta.x) < 0.01 && - std::abs(delta.y - prevDelta.y) < 0.01) - { - nextPts[ptidx] -= delta*0.35f; - break; - } - - prevDelta = delta; - - } - - } - } - - - const Mat* prevImg; - const Mat* nextImg; - const Mat* prevDeriv; - const Mat* rgbPrevImg; - const Mat* rgbNextImg; - const Point2f* prevPts; - Point2f* nextPts; - uchar* status; - float* err; - int maxWinSize; - int minWinSize; - TermCriteria criteria; - int level; - int maxLevel; - int windowType; - float minEigThreshold; - bool useInitialFlow; - int crossSegmentationThreshold; - }; + A11 *= FLT_SCALE; // 54866744. 
+ A12 *= FLT_SCALE; // -628764.00 + A22 *= FLT_SCALE; // 19730.000 + + D = - A12*A12*sumI*sumI + dI*sumW*A12*A12 + 2*A12*sumI*sumIx*w2 + 2*A12*sumI*sumIy*w1 + - 2*dI*A12*sumIx*sumIy - 2*sumW*A12*w1*w2 + A11*A22*sumI*sumI - 2*A22*sumI*sumIx*w1 + - 2*A11*sumI*sumIy*w2 - sumIx*sumIx*w2*w2 + A22*dI*sumIx*sumIx + 2*sumIx*sumIy*w1*w2 + - sumIy*sumIy*w1*w1 + A11*dI*sumIy*sumIy + A22*sumW*w1*w1 + A11*sumW*w2*w2 - A11*A22*dI*sumW; + + float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + + 4.f*A12*A12))/(2*winArea); + if( minEig < minEigThreshold || std::abs(D) < FLT_EPSILON ) + { + if( level == 0 && status ) + status[ptidx] = 0; + if( level > 0) + { + nextPts[ptidx] = backUpNextPt; + gainVecs[ptidx] = backUpGain; + } + noIteration++; + break; + } + + + D = (1.f / D); + + invTensorMat.at(0,0) = (A22*sumI*sumI - 2*sumI*sumIy*w2 + dI*sumIy*sumIy + sumW*w2*w2 - A22*dI*sumW)* D; + invTensorMat.at(0,1) = (A12*dI*sumW - A12*sumI * sumI - dI*sumIx*sumIy + sumI*sumIx*w2 + sumI*sumIy*w1 - sumW*w1*w2)* D; + invTensorMat.at(0,2) = (A12*sumI*sumIy - sumIy*sumIy*w1 - A22*sumI*sumIx - A12*sumW*w2 + A22*sumW*w1 + sumIx*sumIy*w2)* D; + invTensorMat.at(0,3) = (A22*dI*sumIx - A12*dI*sumIy - sumIx*w2*w2 + A12*sumI*w2 - A22*sumI*w1 + sumIy*w1*w2) * D; + invTensorMat.at(1,0) = invTensorMat.at(0,1); + invTensorMat.at(1,1) = (A11*sumI * sumI - 2*sumI*sumIx*w1 + dI*sumIx * sumIx + sumW*w1*w1 - A11*dI*sumW) * D; + invTensorMat.at(1,2) = (A12*sumI*sumIx - A11*sumI*sumIy - sumIx * sumIx*w2 + A11*sumW*w2 - A12*sumW*w1 + sumIx*sumIy*w1) * D; + invTensorMat.at(1,3) = (A11*dI*sumIy - sumIy*w1*w1 - A12*dI*sumIx - A11*sumI*w2 + A12*sumI*w1 + sumIx*w1*w2)* D; + invTensorMat.at(2,0) = invTensorMat.at(0,2); + invTensorMat.at(2,1) = invTensorMat.at(1,2); + invTensorMat.at(2,2) = (sumW*A12*A12 - 2*A12*sumIx*sumIy + A22*sumIx*sumIx + A11*sumIy*sumIy - A11*A22*sumW)* D; + invTensorMat.at(2,3) = (A11*A22*sumI - A12*A12*sumI - A11*sumIy*w2 + A12*sumIx*w2 + A12*sumIy*w1 - A22*sumIx*w1)* D; + 
invTensorMat.at(3,0) = invTensorMat.at(0,3); + invTensorMat.at(3,1) = invTensorMat.at(1,3); + invTensorMat.at(3,2) = invTensorMat.at(2,3); + invTensorMat.at(3,3) = (dI*A12*A12 - 2*A12*w1*w2 + A22*w1*w1 + A11*w2*w2 - A11*A22*dI)* D; + } + + + #ifdef CV_SSE4_1 + float CV_DECL_ALIGNED(16) bbuf[4]; + for(int mmi = 0; mmi < 4; mmi++) + { + + _mm_store_ps(bbuf, _mm_add_ps(qb0[mmi], qb1[mmi])); + _b0[mmi] = bbuf[0] + bbuf[2]; + _b1[mmi] = bbuf[1] + bbuf[3]; + _mm_store_ps(bbuf, qb2[mmi]); + _b2[mmi] = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; + _mm_store_ps(bbuf, qb3[mmi]); + _b3[mmi] = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; + + } + #endif + + _b0[0] *= FLT_SCALE;_b0[1] *= FLT_SCALE;_b0[2] *= FLT_SCALE;_b0[3] *= FLT_SCALE; + _b1[0] *= FLT_SCALE;_b1[1] *= FLT_SCALE;_b1[2] *= FLT_SCALE;_b1[3] *= FLT_SCALE; + _b2[0] *= FLT_SCALE;_b2[1] *= FLT_SCALE;_b2[2] *= FLT_SCALE;_b2[3] *= FLT_SCALE; + _b3[0] *= FLT_SCALE;_b3[1] *= FLT_SCALE;_b3[2] *= FLT_SCALE;_b3[3] *= FLT_SCALE; + + + Mc0[0] = _b0[0] - _b0[1] - _b0[2] + _b0[3]; + Mc0[1] = _b1[0] - _b1[1] - _b1[2] + _b1[3]; + Mc0[2] = -(_b2[0] - _b2[1] - _b2[2] + _b2[3]); + Mc0[3] = -(_b3[0] - _b3[1] - _b3[2] + _b3[3]); + + Mc1[0] = _b0[1] - _b0[3]; + Mc1[1] = _b1[1] - _b1[3]; + Mc1[2] = -(_b2[1] - _b2[3]); + Mc1[3] = -(_b3[1] - _b3[3]); + + + Mc2[0] = _b0[2] - _b0[3]; + Mc2[1] = _b1[2] - _b1[3]; + Mc2[2] = -(_b2[2] - _b2[3]); + Mc2[3] = -(_b3[2] - _b3[3]); + + + Mc3[0] = _b0[3]; + Mc3[1] = _b1[3]; + Mc3[2] = -_b2[3]; + Mc3[3] = -_b3[3]; + + // + c[0] = -Mc0[0]; + c[1] = -Mc1[0]; + c[2] = -Mc2[0]; + c[3] = -Mc3[0]; + c[4] = -Mc0[1]; + c[5] = -Mc1[1]; + c[6] = -Mc2[1]; + c[7] = -Mc3[1]; + + float e0 = 1.f / (c[6] * c[0] - c[4] * c[2]); + float e1 = e0 * 0.5f * (c[6] * c[1] + c[7] * c[0] - c[5] * c[2] - c[4] * c[3]); + float e2 = e0 * (c[1] * c[7] -c[3] * c[5]); + e0 = e1 * e1 - e2; + hasSolved = false; + if ( e0 > 0) + { + e0 = sqrt(e0); + float _y[2] = {-e1 - e0, e0 - e1}; + float c0yc1[2] = {c[0] * _y[0] + c[1], + c[0] * _y[1] + c[1]}; 
+ float _x[2] = {-(c[2] * _y[0] + c[3]) / c0yc1[0], + -(c[2] * _y[1] + c[3]) / c0yc1[1]}; + bool isIn1 = (_x[0] >=0 && _x[0] <=1 && _y[0] >= 0 && _y[0] <= 1); + bool isIn2 = (_x[1] >=0 && _x[1] <=1 && _y[1] >= 0 && _y[1] <= 1); + + bool isSolution1 = checkSolution(_x[0], _y[0], c ); + bool isSolution2 = checkSolution(_x[1], _y[1], c ); + bool isSink1 = isIn1 && isSolution1; + bool isSink2 = isIn2 && isSolution2; + + if ( isSink1 != isSink2) + { + a = isSink1 ? _x[0] : _x[1]; + b = isSink1 ? _y[0] : _y[1]; + ab = a * b; + hasSolved = true; + delta.x = inextPt.x + a - nextPt.x; + delta.y = inextPt.y + b - nextPt.y; + + cv::Vec4f mismatchVec = ab * Mc0 + Mc1 *a + Mc2 * b + Mc3; + deltaGain = est_DeltaGain(invTensorMat, mismatchVec); + + } // isIn1 != isIn2 + } + if( hasSolved == false) + noIteration++; + } + else + { + hasSolved = false; + noReusedIteration++; + } + if( hasSolved == false ) + { + + cv::Vec4f mismatchVec = ab * Mc0 + Mc1 *a + Mc2 * b + Mc3; + est_Result(invTensorMat, mismatchVec, delta, deltaGain); + + delta.x = MAX(-1.f, MIN( 1.f , delta.x)); + delta.y = MAX(-1.f, MIN( 1.f , delta.y)); + + + if( j == 0) + prevGain = deltaGain; + gainVec += deltaGain; + nextPt += delta ; + nextPts[ptidx] = nextPt + halfWin; + gainVecs[ptidx]= gainVec; + + } + else + { + nextPt += delta; + nextPts[ptidx] = nextPt + halfWin; + gainVecs[ptidx]= gainVec + deltaGain; + noSolvedIteration++; + break; + } + + if(j > 0 && ( + (std::abs(delta.x - prevDelta.x) < 0.01 && std::abs(delta.y - prevDelta.y) < 0.01) + || ((delta.ddot(delta) <= 0.001) && std::abs(prevGain.x - deltaGain.x) < 0.01))) + { + nextPts[ptidx] -= delta*0.5f; + gainVecs[ptidx] -= deltaGain* 0.5f; + break; + } + + + prevDelta = delta; + prevGain = deltaGain; + } + + } + + } + + const Mat* prevImg; + const Mat* nextImg; + const Mat* prevDeriv; + const Mat* rgbPrevImg; + const Mat* rgbNextImg; + const Point2f* prevPts; + Point2f* nextPts; + uchar* status; + cv::Point2f* gainVecs; // gain vector x -> multiplier y -> 
offset + float* err; + int maxWinSize; + int minWinSize; + TermCriteria criteria; + int level; + int maxLevel; + int windowType; + float minEigThreshold; + bool useInitialFlow; + std::vector paramReset; + int crossSegmentationThreshold; + + }; + } +} +namespace beplk +{ + namespace ica{ + class TrackerInvoker : public cv::ParallelLoopBody + { + public: + TrackerInvoker(const Mat& _prevImg, + const Mat& _prevDeriv, + const Mat& _nextImg, + const Mat& _rgbPrevImg, + const Mat& _rgbNextImg, + const Point2f* _prevPts, + Point2f* _nextPts, + uchar* _status, + float* _err, + int _level, + int _maxLevel, + int _winSize[2], + int _maxIteration, + bool _useInitialFlow, + int _supportRegionType, + int _crossSegmentationThreshold, + float _minEigenValue) + { + prevImg = &_prevImg; + prevDeriv = &_prevDeriv; + nextImg = &_nextImg; + rgbPrevImg = &_rgbPrevImg; + rgbNextImg = &_rgbNextImg; + prevPts = _prevPts; + nextPts = _nextPts; + status = _status; + err = _err; + minWinSize = _winSize[0]; + maxWinSize = _winSize[1]; + criteria.maxCount = _maxIteration; + criteria.epsilon = 0.01; + level = _level; + maxLevel = _maxLevel; + windowType = _supportRegionType; + minEigThreshold = _minEigenValue; + useInitialFlow = _useInitialFlow; + crossSegmentationThreshold = _crossSegmentationThreshold; + } + + TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; + + void operator()(const cv::Range& range) const + { + cv::Size winSize; + cv::Point2f halfWin; + + const Mat& I = *prevImg; + const Mat& J = *nextImg; + const Mat& derivI = *prevDeriv; + const Mat& BI = *rgbPrevImg; + + winSize = cv::Size(maxWinSize,maxWinSize); + int winMaskwidth = static_cast(ceil(winSize.width / 8.f)) * 16; + cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); + winMaskMatBuf.setTo(1); + + const float FLT_SCALE = (1.f/(1 << 20)); // 20 + + int j, cn = I.channels(), cn2 = cn*2; + int winbufwidth = static_cast(ceil(winSize.width / 8.f)) * 8; + cv::Size winBufSize(winbufwidth,winbufwidth); 
+ + std::vector _buf(winBufSize.area()*(cn + cn2)); + Mat IWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn), &_buf[0]); + Mat derivIWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn2), &_buf[winBufSize.area()*cn]); + + for( int ptidx = range.start; ptidx < range.end; ptidx++ ) + { + Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level)); + Point2f nextPt; + if( level == maxLevel ) + { + if( useInitialFlow ) + nextPt = nextPts[ptidx]*(float)(1./(1 << level)); + else + nextPt = prevPt; + } + else + nextPt = nextPts[ptidx]*2.f; + nextPts[ptidx] = nextPt; + + Point2i iprevPt, inextPt; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + int winArea = maxWinSize * maxWinSize; + cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0,0, maxWinSize,maxWinSize)); + + if( calcWinMaskMat(BI, windowType, iprevPt, + winMaskMat,winSize,halfWin,winArea, + minWinSize,maxWinSize, crossSegmentationThreshold) == false) + continue; + + prevPt -= halfWin; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + + if( iprevPt.x + halfWin.x < 0 || iprevPt.x + halfWin.x>= derivI.cols || + iprevPt.y + halfWin.y < 0 || iprevPt.y + halfWin.y >= derivI.rows ) + { + if( level == 0 ) + { + if( status ) + status[ptidx] = 3; + if( err ) + err[ptidx] = 0; + } + continue; + } + + float a = prevPt.x - iprevPt.x; + float b = prevPt.y - iprevPt.y; + const int W_BITS = 14, W_BITS1 = 14; + + int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + + int dstep = (int)(derivI.step/derivI.elemSize1()); + int step = (int)(I.step/I.elemSize1()); + int mStep = (int)(winMaskMat.step/winMaskMat.elemSize1()); + CV_Assert( step == (int)(J.step/J.elemSize1()) ); + float A11 = 0, A12 = 0, A22 = 0; + + #ifdef CV_SSE4_1 + __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128i z = _mm_setzero_si128(); + __m128i 
qdelta_d = _mm_set1_epi32(1 << (W_BITS1-1)); + __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1-5-1)); + __m128 qA11 = _mm_setzero_ps(), qA12 = _mm_setzero_ps(), qA22 = _mm_setzero_ps(); + __m128i mmMask4_epi32; + get4BitMask(winSize.width, mmMask4_epi32); + #endif + + + int x, y; + for( y = 0; y < winSize.height; y++ ) + { + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; + const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; + + short* Iptr = (short*)(IWinBuf.data + y*IWinBuf.step); + short* dIptr = (short*)(derivIWinBuf.data + y*derivIWinBuf.step); + + x = 0; + + #ifdef CV_SSE4_1 + for( ; x < winSize.width*cn; x += 4, dsrc += 4*2, dIptr += 4*2 ) + { + __m128i wMask = _mm_set_epi32(MaskSet * maskPtr[x+3], + MaskSet * maskPtr[x+2], + MaskSet * maskPtr[x+1], + MaskSet * maskPtr[x]); + __m128i v00, v01, v10, v11, t0, t1; + v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); + v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); + v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); + v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); + _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0,t0)); + + v00 = _mm_loadu_si128((const __m128i*)(dsrc)); + v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); + v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep)); + v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + 
_mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); + v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... + + if( x + 4 > winSize.width) + { + v00 = _mm_and_si128(v00, mmMask4_epi32); + } + v00 = _mm_and_si128(v00, wMask); + + _mm_storeu_si128((__m128i*)dIptr, v00); + t0 = _mm_srai_epi32(v00, 16); // Iy0 Iy1 Iy2 Iy3 + t1 = _mm_srai_epi32(_mm_slli_epi32(v00, 16), 16); // Ix0 Ix1 Ix2 Ix3 + + __m128 fy = _mm_cvtepi32_ps(t0); + __m128 fx = _mm_cvtepi32_ps(t1); + + qA22 = _mm_add_ps(qA22, _mm_mul_ps(fy, fy)); + qA12 = _mm_add_ps(qA12, _mm_mul_ps(fx, fy)); + qA11 = _mm_add_ps(qA11, _mm_mul_ps(fx, fx)); + } + #else + + for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2) + { + if( maskPtr[x] == 0) + { + dIptr[0] = (short)0; + dIptr[1] = (short)0; + + continue; + } + int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + + src[x+step]*iw10 + src[x+step+cn]*iw11, W_BITS1-5); + int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + + dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 + + dsrc[dstep+cn2+1]*iw11, W_BITS1); + + Iptr[x] = (short)ival; + dIptr[0] = (short)ixval; + dIptr[1] = (short)iyval; + A11 += (float)(ixval*ixval); + A12 += (float)(ixval*iyval); + A22 += (float)(iyval*iyval); + + } + + + #endif + + } + + #ifdef CV_SSE4_1 + float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4]; + _mm_store_ps(A11buf, qA11); + _mm_store_ps(A12buf, qA12); + _mm_store_ps(A22buf, qA22); + A11 += A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; + A12 += A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; + A22 += A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; + #endif + + A11 *= FLT_SCALE; + A12 *= FLT_SCALE; + A22 *= FLT_SCALE; + + + float D = A11*A22 - A12*A12; + float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + + 4.f*A12*A12))/(2 * winArea); + + if( err ) + err[ptidx] = 
(float)minEig; + + if( D < FLT_EPSILON ) + { + if( level == 0 && status ) + status[ptidx] = 0; + continue; + } + + D = 1.f/D; + + cv::Point2f backUpNextPt = nextPt; + nextPt -= halfWin; + Point2f prevDelta(0,0); + + float c[8]; + __m128i mmMask0, mmMask1, mmMask; + getWBitMask(winSize.width, mmMask0, mmMask1, mmMask); + for( j = 0; j < criteria.maxCount; j++ ) + { + cv::Point2f delta; + bool hasSolved = false; + a = nextPt.x - cvFloor(nextPt.x); + b = nextPt.y - cvFloor(nextPt.y); + float ab = a * b; + + + if( (inextPt.x != cvFloor(nextPt.x) || inextPt.y != cvFloor(nextPt.y) || j == 0)) + { + inextPt.x = cvFloor(nextPt.x); + inextPt.y = cvFloor(nextPt.y); + + + if( inextPt.x < -winSize.width || inextPt.x >= J.cols || + inextPt.y < -winSize.height || inextPt.y >= J.rows ) + { + if( level == 0 && status ) + status[ptidx] = 3; + if (level > 0) + nextPts[ptidx] = backUpNextPt; + break; + } + + + iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + + float _b1[4] = {0,0,0,0}; + float _b2[4] = {0,0,0,0}; + #ifdef CV_SSE4_1 + __m128 qbc0[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; + __m128 qbc1[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; + qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + #endif + for( y = 0; y < winSize.height; y++ ) + { + const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x )*cn; + const short* Iptr = (const short*)(IWinBuf.data + ( y)*IWinBuf.step); + const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); + x = 0; + #ifdef CV_SSE4_1 + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + for( ; x <= winSize.width*cn; x += 8, dIptr += 8*2 ) + { + if( maskPtr[x ] == 0 && maskPtr[x+1] == 0 && maskPtr[x+2] == 0 && maskPtr[x+3] == 0 + && maskPtr[x+4] == 
0 && maskPtr[x+5] == 0 && maskPtr[x+6] == 0 && maskPtr[x+7] == 0) + continue; + __m128i diff0 = _mm_loadu_si128((const __m128i*)(Iptr + x)); + __m128i v00 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x)), z); + __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); + __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); + __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); + + __m128i mmDiffc_epi16[4] = + { _mm_subs_epi16(_mm_slli_epi16(v00, 5), diff0), + _mm_subs_epi16(_mm_slli_epi16(v01, 5), diff0), + _mm_subs_epi16(_mm_slli_epi16(v10, 5), diff0), + _mm_subs_epi16(_mm_slli_epi16(v11, 5), diff0) + }; + + __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... + __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); + + if( x > winSize.width*cn - 8) + { + Ixy_0 = _mm_and_si128(Ixy_0, mmMask0); + Ixy_1 = _mm_and_si128(Ixy_1, mmMask1); + } + + __m128i diffc1[4] = + {_mm_unpackhi_epi16(mmDiffc_epi16[0],mmDiffc_epi16[0]), + _mm_unpackhi_epi16(mmDiffc_epi16[1],mmDiffc_epi16[1]), + _mm_unpackhi_epi16(mmDiffc_epi16[2],mmDiffc_epi16[2]), + _mm_unpackhi_epi16(mmDiffc_epi16[3],mmDiffc_epi16[3]) + }; + + __m128i diffc0[4] = + {_mm_unpacklo_epi16(mmDiffc_epi16[0],mmDiffc_epi16[0]), + _mm_unpacklo_epi16(mmDiffc_epi16[1],mmDiffc_epi16[1]), + _mm_unpacklo_epi16(mmDiffc_epi16[2],mmDiffc_epi16[2]), + _mm_unpacklo_epi16(mmDiffc_epi16[3],mmDiffc_epi16[3]) + }; + + + // It * Ix It * Iy [0 ... 
3] + //mask split for multiplication + // for set 1 lo sigma 1 + + + v10 = _mm_mullo_epi16(Ixy_0, diffc0[0]); + v11 = _mm_mulhi_epi16(Ixy_0, diffc0[0]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[0] = _mm_add_ps(qbc0[0], _mm_cvtepi32_ps(v00)); + qbc1[0] = _mm_add_ps(qbc1[0], _mm_cvtepi32_ps(v10)); + + v10 = _mm_mullo_epi16(Ixy_0, diffc0[1]); + v11 = _mm_mulhi_epi16(Ixy_0, diffc0[1]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[1] = _mm_add_ps(qbc0[1], _mm_cvtepi32_ps(v00)); + qbc1[1] = _mm_add_ps(qbc1[1], _mm_cvtepi32_ps(v10)); + + v10 = _mm_mullo_epi16(Ixy_0, diffc0[2]); + v11 = _mm_mulhi_epi16(Ixy_0, diffc0[2]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[2] = _mm_add_ps(qbc0[2], _mm_cvtepi32_ps(v00)); + qbc1[2] = _mm_add_ps(qbc1[2], _mm_cvtepi32_ps(v10)); + + v10 = _mm_mullo_epi16(Ixy_0, diffc0[3]); + v11 = _mm_mulhi_epi16(Ixy_0, diffc0[3]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[3] = _mm_add_ps(qbc0[3], _mm_cvtepi32_ps(v00)); + qbc1[3] = _mm_add_ps(qbc1[3], _mm_cvtepi32_ps(v10)); + // It * Ix It * Iy [4 ... 
7] + // for set 1 hi sigma 1 + + v10 = _mm_mullo_epi16(Ixy_1, diffc1[0]); + v11 = _mm_mulhi_epi16(Ixy_1, diffc1[0]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[0] = _mm_add_ps(qbc0[0], _mm_cvtepi32_ps(v00)); + qbc1[0] = _mm_add_ps(qbc1[0], _mm_cvtepi32_ps(v10)); + + v10 = _mm_mullo_epi16(Ixy_1, diffc1[1]); + v11 = _mm_mulhi_epi16(Ixy_1, diffc1[1]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[1] = _mm_add_ps(qbc0[1], _mm_cvtepi32_ps(v00)); + qbc1[1] = _mm_add_ps(qbc1[1], _mm_cvtepi32_ps(v10)); + + v10 = _mm_mullo_epi16(Ixy_1, diffc1[2]); + v11 = _mm_mulhi_epi16(Ixy_1, diffc1[2]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[2] = _mm_add_ps(qbc0[2], _mm_cvtepi32_ps(v00)); + qbc1[2] = _mm_add_ps(qbc1[2], _mm_cvtepi32_ps(v10)); + + v10 = _mm_mullo_epi16(Ixy_1, diffc1[3]); + v11 = _mm_mulhi_epi16(Ixy_1, diffc1[3]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[3] = _mm_add_ps(qbc0[3], _mm_cvtepi32_ps(v00)); + qbc1[3] = _mm_add_ps(qbc1[3], _mm_cvtepi32_ps(v10)); + + } + #else + for( ; x < winSize.width*cn; x++, dIptr += 2 ) + { + if( dIptr[0] == 0 && dIptr[1] == 0) + continue; + /*short It[4] = {CV_DESCALE((Jptr[x]) *(1 << W_BITS),W_BITS1-5) - Iptr[x], + CV_DESCALE((Jptr[x+cn]) *(1 << W_BITS),W_BITS1-5) - Iptr[x], + CV_DESCALE((Jptr[x+step]) *(1 << W_BITS),W_BITS1-5) - Iptr[x], + CV_DESCALE((Jptr[x+step+cn])*(1 << W_BITS),W_BITS1-5) - Iptr[x]};*/ + short It[4] = {(Jptr[x] << 5) - Iptr[x], + (Jptr[x+cn]<< 5) - Iptr[x], + (Jptr[x+step]<< 5) - Iptr[x], + (Jptr[x+step+cn]<< 5) - Iptr[x]}; + + _b1[0] += (float)(It[0]*dIptr[0]); + _b1[1] += (float)(It[1]*dIptr[0]); + _b1[2] += (float)(It[2]*dIptr[0]); + _b1[3] += (float)(It[3]*dIptr[0]); + + _b2[0] += (float)(It[0]*dIptr[1]); + _b2[1] += (float)(It[1]*dIptr[1]); + _b2[2] += (float)(It[2]*dIptr[1]); + _b2[3] += (float)(It[3]*dIptr[1]); + } + #endif + + } + + #ifdef CV_SSE4_1 + 
float CV_DECL_ALIGNED(16) bbuf[4]; + _mm_store_ps(bbuf, _mm_add_ps(qbc0[0], qbc1[0])); + _b1[0] += bbuf[0] + bbuf[2]; + _b2[0] += bbuf[1] + bbuf[3]; + + _mm_store_ps(bbuf, _mm_add_ps(qbc0[1], qbc1[1])); + _b1[1] += bbuf[0] + bbuf[2]; + _b2[1] += bbuf[1] + bbuf[3]; + + _mm_store_ps(bbuf, _mm_add_ps(qbc0[2], qbc1[2])); + _b1[2] += bbuf[0] + bbuf[2]; + _b2[2] += bbuf[1] + bbuf[3]; + + _mm_store_ps(bbuf, _mm_add_ps(qbc0[3], qbc1[3])); + _b1[3] += bbuf[0] + bbuf[2]; + _b2[3] += bbuf[1] + bbuf[3]; + + #endif + _b1[0] *= FLT_SCALE; + _b1[1] *= FLT_SCALE; + _b1[2] *= FLT_SCALE; + _b1[3] *= FLT_SCALE; + _b2[0] *= FLT_SCALE; + _b2[1] *= FLT_SCALE; + _b2[2] *= FLT_SCALE; + _b2[3] *= FLT_SCALE; + + float c0[4] = { _b1[3] + _b1[0] - _b1[2] - _b1[1], + _b1[1] - _b1[0], + _b1[2] - _b1[0], + _b1[0]}; + float c1[4] = { _b2[3] + _b2[0] - _b2[2] - _b2[1], + _b2[1] - _b2[0], + _b2[2] - _b2[0], + _b2[0]}; + + float DA12 = A12 * D ; + float DA22 = A22 * D ; + float DA11 = A11 * D ; + c[0] = DA12 * c1[0] - DA22 * c0[0]; + c[1] = DA12 * c1[1] - DA22 * c0[1]; + c[2] = DA12 * c1[2] - DA22 * c0[2]; + c[3] = DA12 * c1[3] - DA22 * c0[3]; + c[4] = DA12 * c0[0] - DA11 * c1[0]; + c[5] = DA12 * c0[1] - DA11 * c1[1]; + c[6] = DA12 * c0[2] - DA11 * c1[2]; + c[7] = DA12 * c0[3] - DA11 * c1[3]; + + float e0 = 1.f / (c[6] * c[0] - c[4] * c[2]); + float e1 = e0 * 0.5f * (c[6] * c[1] + c[7] * c[0] - c[5] * c[2] - c[4] * c[3]); + float e2 = e0 * (c[1] * c[7] -c[3] * c[5]); + e0 = e1 * e1 - e2; + if ( e0 >= 0) + { + e0 = sqrt(e0); + float _y[2] = {-e1 - e0, e0 - e1}; + float c0yc1[2] = {c[0] * _y[0] + c[1], + c[0] * _y[1] + c[1]}; + + float _x[2] = {-(c[2] * _y[0] + c[3]) / c0yc1[0], + -(c[2] * _y[1] + c[3]) / c0yc1[1]}; + + bool isIn1 = (_x[0] >=0 && _x[0] <=1 && _y[0] >= 0 && _y[0] <= 1); + bool isIn2 = (_x[1] >=0 && _x[1] <=1 && _y[1] >= 0 && _y[1] <= 1); + + + bool isSink1 = isIn1 && checkSolution(_x[0], _y[0], c ); + bool isSink2 = isIn2 && checkSolution(_x[1], _y[1], c ); + + + //if( isSink1 != 
isSink2 ) + { + if( isSink1 ) + { + hasSolved = true; + delta.x = inextPt.x + _x[0] - nextPt.x; + delta.y = inextPt.y + _y[0] - nextPt.y; + } + if (isSink2 ) + { + hasSolved = true; + delta.x = inextPt.x + _x[1] - nextPt.x; + delta.y = inextPt.y + _y[1] - nextPt.y; + } + } // isIn1 != isIn2 + } // e0 >= 0 + } + else + hasSolved = false; + if(hasSolved == false) + { + delta = Point2f( c[0] * ab + c[1] * a + c[2] * b + c[3], + c[4] * ab + c[5] * a + c[6] * b + c[7]); + nextPt += 0.7 * delta; + nextPts[ptidx] = nextPt + halfWin; + } + else + { + nextPt += delta; + nextPts[ptidx] = nextPt + halfWin; + break; + } + + + if( delta.ddot(delta) <= criteria.epsilon) + break; + + if(j > 0 && std::abs(delta.x - prevDelta.x) < 0.01 && + std::abs(delta.y - prevDelta.y) < 0.01) + { + nextPts[ptidx] -= delta*0.35f; + break; + } + + prevDelta = delta; + + } + + } + } + + + const Mat* prevImg; + const Mat* nextImg; + const Mat* prevDeriv; + const Mat* rgbPrevImg; + const Mat* rgbNextImg; + const Point2f* prevPts; + Point2f* nextPts; + uchar* status; + float* err; + int maxWinSize; + int minWinSize; + TermCriteria criteria; + int level; + int maxLevel; + int windowType; + float minEigThreshold; + bool useInitialFlow; + int crossSegmentationThreshold; + }; } } } diff --git a/modules/optflow/src/rlof/geo_interpolation.cpp b/modules/optflow/src/rlof/geo_interpolation.cpp index 47e92c96b0d..823070bf5e9 100644 --- a/modules/optflow/src/rlof/geo_interpolation.cpp +++ b/modules/optflow/src/rlof/geo_interpolation.cpp @@ -1,44 +1,6 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// - // - // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. - // - // By downloading, copying, installing or using the software you agree to this license. - // If you do not agree to this license, do not download, install, - // copy or use the software. 
- // - // - // License Agreement - // For Open Source Computer Vision Library - // - // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. - // Copyright (C) 2009, Willow Garage Inc., all rights reserved. - // Third party copyrights are property of their respective owners. - // - // Redistribution and use in source and binary forms, with or without modification, - // are permitted provided that the following conditions are met: - // - // * Redistribution's of source code must retain the above copyright notice, - // this list of conditions and the following disclaimer. - // - // * Redistribution's in binary form must reproduce the above copyright notice, - // this list of conditions and the following disclaimer in the documentation - // and/or other materials provided with the distribution. - // - // * The name of the copyright holders may not be used to endorse or promote products - // derived from this software without specific prior written permission. - // - // This software is provided by the copyright holders and contributors "as is" and - // any express or implied warranties, including, but not limited to, the implied - // warranties of merchantability and fitness for a particular purpose are disclaimed. - // In no event shall the Intel Corporation or contributors be liable for any direct, - // indirect, incidental, special, exemplary, or consequential damages - // (including, but not limited to, procurement of substitute goods or services; - // loss of use, data, or profits; or business interruption) however caused - // and on any theory of liability, whether in contract, strict liability, - // or tort (including negligence or otherwise) arising in any way out of - // the use of this software, even if advised of the possibility of such damage. - // - //M*/ +// This file is part of OpenCV project. 
+// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. // This functions have been contributed by Jonas Geisters #include "precomp.hpp" #include "geo_interpolation.hpp" @@ -46,480 +8,480 @@ #include namespace cv { - namespace optflow - { - - double K_h1(float x, double h2) - { - return exp(-(0.5 / (h2)) * x); - } - - struct Graph_helper { - int * mem; - int e_size; - Graph_helper(int k, int num_nodes) { - e_size = (2 * k + 1); - mem = (int*)malloc(sizeof(int) * e_size * num_nodes); - memset(mem, 0, sizeof(int) * e_size * num_nodes); - } - inline int size(int id) { - int r_addr = id * (e_size); - return ((int*)mem)[r_addr]; - } - inline int * data(int id) { - int r_addr = id * (e_size)+1; - return mem + r_addr; - } - inline void add(int id, std::pair data) { - int r_addr = id * (e_size); - int size = ++((int*)mem)[r_addr]; - r_addr += 2 * size - 1;//== 1 + 2*(size-1); - ((float*)mem)[r_addr] = data.first; - ((int*)mem)[r_addr + 1] = data.second; - } - inline bool color_in_target(int id, int color) { - int r_addr = id * (e_size); - int size = (((int*)mem)[r_addr]); - r_addr += 2; - for (int i = 0; i < size; i++) { - if (((int*)mem)[r_addr] == color) { - return true; - } - r_addr += 2; - } - return false; - } - - }; - - Mat sgeo_dist(const Mat& gra, int y, int x, float max, Mat &prev) - { - std::vector points; - points.push_back(Point2f(static_cast(x), static_cast(y))); - return sgeo_dist(gra, points, max, prev); - } - Mat sgeo_dist(const Mat& gra, const std::vector & points, float max, Mat &prev) - { - int Dx[] = { -1,0,1,-1,1,-1,0,1 }; - int Dy[] = { -1,-1,-1,0,0,1,1,1 }; - Mat dm(gra.rows, gra.cols, CV_32F, Scalar(max)); - prev = Mat(gra.rows, gra.cols, CV_8U, Scalar(255)); - - std::multimap not_visited_with_value; - - for (auto i = points.begin(); i != points.end(); i++) - { - int y = static_cast(i->y); - int x = static_cast(i->x); - 
not_visited_with_value.insert(std::pair(0.f, Vec2i(y, x))); - dm.at(y, x) = 0; - } - - bool done = false; - while (!done) - { - if (not_visited_with_value.begin() == not_visited_with_value.end()) { - done = true; - break; - } - std::multimap::iterator current_it = not_visited_with_value.begin(); - std::pair current_p = *current_it; - not_visited_with_value.erase(current_it); - - int y = current_p.second[0]; - int x = current_p.second[1]; - float cur_d = current_p.first; - - if (dm.at(y, x) != cur_d) { - continue; - } - - Vec8f gra_e = gra.at(y, x); - - for (int i = 0; i < 8; i++) { - if (gra_e[i] < 0) { - continue; - } - int dx = Dx[i]; - int dy = Dy[i]; - - if (dm.at(y + dy, x + dx) > cur_d + gra_e[i]) { - dm.at(y + dy, x + dx) = cur_d + gra_e[i]; - prev.at(y + dy, x + dx) = static_cast(7 - i); - not_visited_with_value.insert(std::pair(cur_d + gra_e[i], Vec2i(y + dy, x + dx))); - } - - } - } - - - - - return dm; - } - - Mat interpolate_irregular_nn_raster(const std::vector & prevPoints, - const std::vector & nextPoints, - const std::vector & status, - const Mat & i1) - { - Mat gra = getGraph(i1, 0.1f); - int Dx[] = { -1,0,1,-1,1,-1,0,1 }; - int Dy[] = { -1,-1,-1,0,0,1,1,1 }; - int max_rounds = 10; - Mat dirt = Mat(gra.rows, gra.cols, CV_8U, Scalar(0)); - Mat quellknoten = Mat(gra.rows, gra.cols, CV_32S, Scalar(-1)); - Mat dist = Mat(gra.rows, gra.cols, CV_32F, Scalar(std::numeric_limits::max())); - /* - * assign quellknoten ids. 
- */ - for (int i = 0; i < prevPoints.size(); i++) - { - int x = (int)prevPoints[i].x; - int y = (int)prevPoints[i].y; - if (status[i] == 0) - continue; - dirt.at(y, x) = 1; - dist.at(y, x) = 0; - quellknoten.at(y, x) = i; - } - - bool clean = true; - bool done = false; - int x = 0; - int y = 0; - int rounds = 0; - while (!done) { - /* - * Update x and y - * on even rounds go rasterscanorder , on odd round inverse rasterscanorder - */ - if (rounds % 2 == 0) { - x++; - if (x >= gra.cols) { - x = 0; - y++; - if (y >= gra.rows) { - y = 0; - rounds++; - y = gra.rows - 1; - x = gra.cols - 1; - if (rounds >= max_rounds || clean) { - done = true; - break; - } - } - } - } - else { - x--; - if (x < 0) { - x = gra.cols - 1; - y--; - if (y < 0) { - y = gra.rows - 1; - rounds++; - y = 0; - x = 0; - if (rounds >= max_rounds || clean) { - done = true; - break; - } - } - } - } - if (dirt.at(y, x) == 0) { - continue; - } - dirt.at(y, x) = 0; - - float c_dist = dist.at(y, x); - Vec8f gra_e = gra.at(y, x); - - for (int i = 0; i < 8; i++) { - int tx = Dx[i]; - int ty = Dy[i]; - if (ty == 0 && tx == 0) { - continue; - } - if (x + tx < 0 || x + tx >= gra.cols) { - continue; - } - if (y + ty < 0 || y + ty >= gra.rows) { - continue; - } - if (c_dist > dist.at(y + ty, x + tx)) { - if (c_dist > dist.at(y + ty, x + tx) + gra_e[i]) { - quellknoten.at(y, x) = quellknoten.at(y + ty, x + tx); - dist.at(y, x) = dist.at(y + ty, x + tx) + gra_e[i]; - dirt.at(y, x) = 1; - clean = false; - } - } - else { - if (c_dist + gra_e[i] < dist.at(y + ty, x + tx)) { - quellknoten.at(y + ty, x + tx) = quellknoten.at(y, x); - dist.at(y + ty, x + tx) = dist.at(y, x) + gra_e[i]; - dirt.at(y + ty, x + tx) = 1; - clean = false; - } - } - } - - - - } - - Mat nnFlow(i1.rows, i1.cols, CV_32FC2, Scalar(0)); - for (y = 0; y < i1.rows; y++) { - for (x = 0; x < i1.cols; x++) { - - int id = quellknoten.at(y, x); - if (id != -1) - { - nnFlow.at(y, x) = nextPoints[id] - prevPoints[id]; - } - } - } - return nnFlow; - } - - 
Mat interpolate_irregular_knn( - const std::vector & _prevPoints, - const std::vector & _nextPoints, - const std::vector & status, - const Mat &color_img, - int k, - float pixeldistance) - { - Mat in(color_img.rows, color_img.cols, CV_32FC2); - Mat mask = Mat::zeros(color_img.rows, color_img.cols, CV_8UC1); - - for (unsigned n = 0; n < _prevPoints.size(); n++) - { - if (_prevPoints[n].x >= 0 && _prevPoints[n].y >= 0 && _prevPoints[n].x < color_img.cols && _prevPoints[n].y < color_img.rows) - { - in.at(_prevPoints[n]) = _nextPoints[n] - _prevPoints[n]; - mask.at(_prevPoints[n]) = status[n]; - } - - } - int Dx[] = { -1,0,1,-1,1,-1,0,1 }; - int Dy[] = { -1,-1,-1,0,0,1,1,1 }; - Mat gra = getGraph(color_img, pixeldistance); - Mat nnFlow(in.rows, in.cols, CV_32FC2, Scalar(0)); - - std::multimap my_agents; // > - Graph_helper graph_helper(k, in.rows*in.cols); //< arrivaltime, color> - - - int color = 0; - std::vector flow_point_list; - for (int y = 0; y < in.rows; y++) { - for (int x = 0; x < in.cols; x++) { - if (mask.at(y, x) > 0) { - flow_point_list.push_back(Vec2i(y, x)); - nnFlow.at(y, x) = in.at(y, x); - - int v_id = (y * in.cols + x); - graph_helper.add(v_id, std::pair(0.f, color)); - - - Vec8f gra_e = gra.at(y, x); - for (int i = 0; i < 8; i++) { - if (gra_e[i] < 0) - continue; - int dx = Dx[i]; - int dy = Dy[i]; - int target = (y + dy) * in.cols + (x + dx); - Vec2i agent(target, color); - my_agents.insert(std::pair(gra_e[i], agent)); - - } - color++; - } - } - } - - int global_time = 0; - - bool done = false; - while (!done) { - if (my_agents.size() == 0) { - done = true; - break; - } - global_time++; - - std::multimap::iterator current_it = my_agents.begin(); - std::pair current_p = *current_it; - my_agents.erase(current_it); - - int target = current_p.second[0]; - color = current_p.second[1]; - float arriv_time = current_p.first; - - Vec8f gra_e = gra.at(target);// (y*cols+x) - if (graph_helper.size(target) >= k) { - continue; - } - - bool color_found_in_target 
= graph_helper.color_in_target(target, color); - if (color_found_in_target) { - continue; - } - graph_helper.add(target, std::pair(arriv_time, color)); - - - for (int i = 0; i < 8; i++) { - if (gra_e[i] < 0) - continue; - - int dx = Dx[i]; - int dy = Dy[i]; - int new_target = target + dx + (dy*in.cols); - if (graph_helper.size(new_target) >= k) { - continue; - } - color_found_in_target = graph_helper.color_in_target(new_target, color); - if (color_found_in_target) { - continue; - } - Vec2i new_agent(new_target, color); - my_agents.insert(std::pair(arriv_time + gra_e[i], new_agent)); - - } - } - - Mat ret(in.rows, in.cols*k, CV_32FC2); - for (int y = 0; y < in.rows; y++) { - for (int x = 0; x < in.cols; x++) { - for (int i = 0; i < k; i++) { - float dist = *((float*)(graph_helper.data(y*in.cols + x) + 2 * i)); - float id = *((float*)(graph_helper.data(y*in.cols + x) + 2 * i + 1)); - ret.at(y, k*x + i) = Vec2f(dist, id); - } - } - } - return ret; - - for (int y = 0; y < in.rows; y++) { - for (int x = 0; x < in.cols; x++) { - int v_id = y * in.cols + x; - std::vector > flow_s(k); - - for (int i = 0; i < k; i++) { - color = *((int*)(graph_helper.data(v_id) + 1 + 2 * i)); - int cy = flow_point_list[color][0]; - int cx = flow_point_list[color][1]; - Vec2f flow = in.at(cy, cx); - flow_s[i] = std::make_pair(static_cast(norm(flow)), flow); - - - - } - - nnFlow.at(y, x) = std::min_element(flow_s.begin(), flow_s.end(), [](const std::pair &left, const std::pair &right) { - return left.first < right.first; - })->second; - - - } - } - free(graph_helper.mem); - return nnFlow; - - - } - Mat getGraph(const Mat &image, float edge_length) - { - - int Dx[] = { -1,0,1,-1,1,-1,0,1 }; - int Dy[] = { -1,-1,-1,0,0,1,1,1 }; - Mat gra(image.rows, image.cols, CV_32FC(8)); - - for (int y = 0; y < gra.rows; y++) { - for (int x = 0; x < gra.cols; x++) { - - for (int i = 0; i < 8; i++) { - int dx = Dx[i]; - int dy = Dy[i]; - gra.at(y, x)[i] = -1; - - if (x + dx < 0 || y + dy < 0 || x + dx >= 
gra.cols || y + dy >= gra.rows) { - continue; - } - - if (i < 4) { - int si = 7 - i; - gra.at(y, x)[i] = gra.at(y + dy, x + dx)[si]; - } - else { - float p1 = dx * dx*edge_length*edge_length + dy * dy*edge_length*edge_length; - float p2 = static_cast(image.at(y, x)[0] - image.at(y + dy, x + dx)[0]); - float p3 = static_cast(image.at(y, x)[1] - image.at(y + dy, x + dx)[1]); - float p4 = static_cast(image.at(y, x)[2] - image.at(y + dy, x + dx)[2]); - gra.at(y, x)[i] = sqrt(p1 + p2 * p2 + p3 * p3 + p4 * p4); - } - - } - - } - } - - return gra; - } - - - Mat interpolate_irregular_nn( - const std::vector & _prevPoints, - const std::vector & _nextPoints, - const std::vector & status, - const Mat &color_img, - float pixeldistance) - { - int Dx[] = { -1,0,1,-1,1,-1,0,1 }; - int Dy[] = { -1,-1,-1,0,0,1,1,1 }; - std::vector prevPoints, nextPoints; - std::map, std::pair> flowMap; - for (unsigned n = 0; n < _prevPoints.size(); n++) - { - if (status[n] != 0) - { - flowMap.insert(std::make_pair( - std::make_pair(_prevPoints[n].x, _prevPoints[n].y), - std::make_pair(_nextPoints[n].x, _nextPoints[n].y))); - prevPoints.push_back(_prevPoints[n]); - nextPoints.push_back(_nextPoints[n]); - } - - } - - Mat gra = getGraph(color_img, pixeldistance); - - Mat prev; - Mat geo_dist = sgeo_dist(gra, prevPoints, std::numeric_limits::max(), prev); - - - Mat nnFlow = Mat::zeros(color_img.size(), CV_32FC2); - for (int y = 0; y < nnFlow.rows; y++) - { - for (int x = 0; x < nnFlow.cols; x++) - { - int cx = x; - int cy = y; - while (prev.at(cy, cx) != 255) - { - int i = prev.at(cy, cx); - cx += Dx[i]; - cy += Dy[i]; - } - auto val = flowMap[std::make_pair(static_cast(cx), static_cast(cy))]; - nnFlow.at(y, x) = Vec2f(val.first - cx, val.second - cy); - } - } - return nnFlow; - } - - } +namespace optflow +{ + + double K_h1(float x, double h2) + { + return exp(-(0.5 / (h2)) * x); + } + + struct Graph_helper { + int * mem; + int e_size; + Graph_helper(int k, int num_nodes) { + e_size = (2 * k + 1); + 
mem = (int*)malloc(sizeof(int) * e_size * num_nodes); + memset(mem, 0, sizeof(int) * e_size * num_nodes); + } + inline int size(int id) { + int r_addr = id * (e_size); + return ((int*)mem)[r_addr]; + } + inline int * data(int id) { + int r_addr = id * (e_size)+1; + return mem + r_addr; + } + inline void add(int id, std::pair data) { + int r_addr = id * (e_size); + int size = ++((int*)mem)[r_addr]; + r_addr += 2 * size - 1;//== 1 + 2*(size-1); + ((float*)mem)[r_addr] = data.first; + ((int*)mem)[r_addr + 1] = data.second; + } + inline bool color_in_target(int id, int color) { + int r_addr = id * (e_size); + int size = (((int*)mem)[r_addr]); + r_addr += 2; + for (int i = 0; i < size; i++) { + if (((int*)mem)[r_addr] == color) { + return true; + } + r_addr += 2; + } + return false; + } + + }; + + Mat sgeo_dist(const Mat& gra, int y, int x, float max, Mat &prev) + { + std::vector points; + points.push_back(Point2f(static_cast(x), static_cast(y))); + return sgeo_dist(gra, points, max, prev); + } + Mat sgeo_dist(const Mat& gra, const std::vector & points, float max, Mat &prev) + { + int Dx[] = { -1,0,1,-1,1,-1,0,1 }; + int Dy[] = { -1,-1,-1,0,0,1,1,1 }; + Mat dm(gra.rows, gra.cols, CV_32F, Scalar(max)); + prev = Mat(gra.rows, gra.cols, CV_8U, Scalar(255)); + + std::multimap not_visited_with_value; + + for (auto i = points.begin(); i != points.end(); i++) + { + int y = static_cast(i->y); + int x = static_cast(i->x); + not_visited_with_value.insert(std::pair(0.f, Vec2i(y, x))); + dm.at(y, x) = 0; + } + + bool done = false; + while (!done) + { + if (not_visited_with_value.begin() == not_visited_with_value.end()) { + done = true; + break; + } + std::multimap::iterator current_it = not_visited_with_value.begin(); + std::pair current_p = *current_it; + not_visited_with_value.erase(current_it); + + int y = current_p.second[0]; + int x = current_p.second[1]; + float cur_d = current_p.first; + + if (dm.at(y, x) != cur_d) { + continue; + } + + Vec8f gra_e = gra.at(y, x); + + for 
(int i = 0; i < 8; i++) { + if (gra_e[i] < 0) { + continue; + } + int dx = Dx[i]; + int dy = Dy[i]; + + if (dm.at(y + dy, x + dx) > cur_d + gra_e[i]) { + dm.at(y + dy, x + dx) = cur_d + gra_e[i]; + prev.at(y + dy, x + dx) = static_cast(7 - i); + not_visited_with_value.insert(std::pair(cur_d + gra_e[i], Vec2i(y + dy, x + dx))); + } + + } + } + + + + + return dm; + } + + Mat interpolate_irregular_nn_raster(const std::vector & prevPoints, + const std::vector & nextPoints, + const std::vector & status, + const Mat & i1) + { + Mat gra = getGraph(i1, 0.1f); + int Dx[] = { -1,0,1,-1,1,-1,0,1 }; + int Dy[] = { -1,-1,-1,0,0,1,1,1 }; + int max_rounds = 10; + Mat dirt = Mat(gra.rows, gra.cols, CV_8U, Scalar(0)); + Mat quellknoten = Mat(gra.rows, gra.cols, CV_32S, Scalar(-1)); + Mat dist = Mat(gra.rows, gra.cols, CV_32F, Scalar(std::numeric_limits::max())); + /* + * assign quellknoten ids. + */ + for (int i = 0; i < prevPoints.size(); i++) + { + int x = (int)prevPoints[i].x; + int y = (int)prevPoints[i].y; + if (status[i] == 0) + continue; + dirt.at(y, x) = 1; + dist.at(y, x) = 0; + quellknoten.at(y, x) = i; + } + + bool clean = true; + bool done = false; + int x = 0; + int y = 0; + int rounds = 0; + while (!done) { + /* + * Update x and y + * on even rounds go rasterscanorder , on odd round inverse rasterscanorder + */ + if (rounds % 2 == 0) { + x++; + if (x >= gra.cols) { + x = 0; + y++; + if (y >= gra.rows) { + y = 0; + rounds++; + y = gra.rows - 1; + x = gra.cols - 1; + if (rounds >= max_rounds || clean) { + done = true; + break; + } + } + } + } + else { + x--; + if (x < 0) { + x = gra.cols - 1; + y--; + if (y < 0) { + y = gra.rows - 1; + rounds++; + y = 0; + x = 0; + if (rounds >= max_rounds || clean) { + done = true; + break; + } + } + } + } + if (dirt.at(y, x) == 0) { + continue; + } + dirt.at(y, x) = 0; + + float c_dist = dist.at(y, x); + Vec8f gra_e = gra.at(y, x); + + for (int i = 0; i < 8; i++) { + int tx = Dx[i]; + int ty = Dy[i]; + if (ty == 0 && tx == 0) { + 
continue; + } + if (x + tx < 0 || x + tx >= gra.cols) { + continue; + } + if (y + ty < 0 || y + ty >= gra.rows) { + continue; + } + if (c_dist > dist.at(y + ty, x + tx)) { + if (c_dist > dist.at(y + ty, x + tx) + gra_e[i]) { + quellknoten.at(y, x) = quellknoten.at(y + ty, x + tx); + dist.at(y, x) = dist.at(y + ty, x + tx) + gra_e[i]; + dirt.at(y, x) = 1; + clean = false; + } + } + else { + if (c_dist + gra_e[i] < dist.at(y + ty, x + tx)) { + quellknoten.at(y + ty, x + tx) = quellknoten.at(y, x); + dist.at(y + ty, x + tx) = dist.at(y, x) + gra_e[i]; + dirt.at(y + ty, x + tx) = 1; + clean = false; + } + } + } + + + + } + + Mat nnFlow(i1.rows, i1.cols, CV_32FC2, Scalar(0)); + for (y = 0; y < i1.rows; y++) { + for (x = 0; x < i1.cols; x++) { + + int id = quellknoten.at(y, x); + if (id != -1) + { + nnFlow.at(y, x) = nextPoints[id] - prevPoints[id]; + } + } + } + return nnFlow; + } + + Mat interpolate_irregular_knn( + const std::vector & _prevPoints, + const std::vector & _nextPoints, + const std::vector & status, + const Mat &color_img, + int k, + float pixeldistance) + { + Mat in(color_img.rows, color_img.cols, CV_32FC2); + Mat mask = Mat::zeros(color_img.rows, color_img.cols, CV_8UC1); + + for (unsigned n = 0; n < _prevPoints.size(); n++) + { + if (_prevPoints[n].x >= 0 && _prevPoints[n].y >= 0 && _prevPoints[n].x < color_img.cols && _prevPoints[n].y < color_img.rows) + { + in.at(_prevPoints[n]) = _nextPoints[n] - _prevPoints[n]; + mask.at(_prevPoints[n]) = status[n]; + } + + } + int Dx[] = { -1,0,1,-1,1,-1,0,1 }; + int Dy[] = { -1,-1,-1,0,0,1,1,1 }; + Mat gra = getGraph(color_img, pixeldistance); + Mat nnFlow(in.rows, in.cols, CV_32FC2, Scalar(0)); + + std::multimap my_agents; // > + Graph_helper graph_helper(k, in.rows*in.cols); //< arrivaltime, color> + + + int color = 0; + std::vector flow_point_list; + for (int y = 0; y < in.rows; y++) { + for (int x = 0; x < in.cols; x++) { + if (mask.at(y, x) > 0) { + flow_point_list.push_back(Vec2i(y, x)); + nnFlow.at(y, x) = 
in.at(y, x); + + int v_id = (y * in.cols + x); + graph_helper.add(v_id, std::pair(0.f, color)); + + + Vec8f gra_e = gra.at(y, x); + for (int i = 0; i < 8; i++) { + if (gra_e[i] < 0) + continue; + int dx = Dx[i]; + int dy = Dy[i]; + int target = (y + dy) * in.cols + (x + dx); + Vec2i agent(target, color); + my_agents.insert(std::pair(gra_e[i], agent)); + + } + color++; + } + } + } + + int global_time = 0; + + bool done = false; + while (!done) { + if (my_agents.size() == 0) { + done = true; + break; + } + global_time++; + + std::multimap::iterator current_it = my_agents.begin(); + std::pair current_p = *current_it; + my_agents.erase(current_it); + + int target = current_p.second[0]; + color = current_p.second[1]; + float arriv_time = current_p.first; + + Vec8f gra_e = gra.at(target);// (y*cols+x) + if (graph_helper.size(target) >= k) { + continue; + } + + bool color_found_in_target = graph_helper.color_in_target(target, color); + if (color_found_in_target) { + continue; + } + graph_helper.add(target, std::pair(arriv_time, color)); + + + for (int i = 0; i < 8; i++) { + if (gra_e[i] < 0) + continue; + + int dx = Dx[i]; + int dy = Dy[i]; + int new_target = target + dx + (dy*in.cols); + if (graph_helper.size(new_target) >= k) { + continue; + } + color_found_in_target = graph_helper.color_in_target(new_target, color); + if (color_found_in_target) { + continue; + } + Vec2i new_agent(new_target, color); + my_agents.insert(std::pair(arriv_time + gra_e[i], new_agent)); + + } + } + + Mat ret(in.rows, in.cols*k, CV_32FC2); + for (int y = 0; y < in.rows; y++) { + for (int x = 0; x < in.cols; x++) { + for (int i = 0; i < k; i++) { + float dist = *((float*)(graph_helper.data(y*in.cols + x) + 2 * i)); + float id = *((float*)(graph_helper.data(y*in.cols + x) + 2 * i + 1)); + ret.at(y, k*x + i) = Vec2f(dist, id); + } + } + } + return ret; + + for (int y = 0; y < in.rows; y++) { + for (int x = 0; x < in.cols; x++) { + int v_id = y * in.cols + x; + std::vector > flow_s(k); + + for 
(int i = 0; i < k; i++) { + color = *((int*)(graph_helper.data(v_id) + 1 + 2 * i)); + int cy = flow_point_list[color][0]; + int cx = flow_point_list[color][1]; + Vec2f flow = in.at(cy, cx); + flow_s[i] = std::make_pair(static_cast(norm(flow)), flow); + + + + } + + nnFlow.at(y, x) = std::min_element(flow_s.begin(), flow_s.end(), [](const std::pair &left, const std::pair &right) { + return left.first < right.first; + })->second; + + + } + } + free(graph_helper.mem); + return nnFlow; + + + } + Mat getGraph(const Mat &image, float edge_length) + { + + int Dx[] = { -1,0,1,-1,1,-1,0,1 }; + int Dy[] = { -1,-1,-1,0,0,1,1,1 }; + Mat gra(image.rows, image.cols, CV_32FC(8)); + + for (int y = 0; y < gra.rows; y++) { + for (int x = 0; x < gra.cols; x++) { + + for (int i = 0; i < 8; i++) { + int dx = Dx[i]; + int dy = Dy[i]; + gra.at(y, x)[i] = -1; + + if (x + dx < 0 || y + dy < 0 || x + dx >= gra.cols || y + dy >= gra.rows) { + continue; + } + + if (i < 4) { + int si = 7 - i; + gra.at(y, x)[i] = gra.at(y + dy, x + dx)[si]; + } + else { + float p1 = dx * dx*edge_length*edge_length + dy * dy*edge_length*edge_length; + float p2 = static_cast(image.at(y, x)[0] - image.at(y + dy, x + dx)[0]); + float p3 = static_cast(image.at(y, x)[1] - image.at(y + dy, x + dx)[1]); + float p4 = static_cast(image.at(y, x)[2] - image.at(y + dy, x + dx)[2]); + gra.at(y, x)[i] = sqrt(p1 + p2 * p2 + p3 * p3 + p4 * p4); + } + + } + + } + } + + return gra; + } + + + Mat interpolate_irregular_nn( + const std::vector & _prevPoints, + const std::vector & _nextPoints, + const std::vector & status, + const Mat &color_img, + float pixeldistance) + { + int Dx[] = { -1,0,1,-1,1,-1,0,1 }; + int Dy[] = { -1,-1,-1,0,0,1,1,1 }; + std::vector prevPoints, nextPoints; + std::map, std::pair> flowMap; + for (unsigned n = 0; n < _prevPoints.size(); n++) + { + if (status[n] != 0) + { + flowMap.insert(std::make_pair( + std::make_pair(_prevPoints[n].x, _prevPoints[n].y), + std::make_pair(_nextPoints[n].x, _nextPoints[n].y))); 
+ prevPoints.push_back(_prevPoints[n]); + nextPoints.push_back(_nextPoints[n]); + } + + } + + Mat gra = getGraph(color_img, pixeldistance); + + Mat prev; + Mat geo_dist = sgeo_dist(gra, prevPoints, std::numeric_limits::max(), prev); + + + Mat nnFlow = Mat::zeros(color_img.size(), CV_32FC2); + for (int y = 0; y < nnFlow.rows; y++) + { + for (int x = 0; x < nnFlow.cols; x++) + { + int cx = x; + int cy = y; + while (prev.at(cy, cx) != 255) + { + int i = prev.at(cy, cx); + cx += Dx[i]; + cy += Dy[i]; + } + auto val = flowMap[std::make_pair(static_cast(cx), static_cast(cy))]; + nnFlow.at(y, x) = Vec2f(val.first - cx, val.second - cy); + } + } + return nnFlow; + } + +} } \ No newline at end of file diff --git a/modules/optflow/src/rlof/geo_interpolation.hpp b/modules/optflow/src/rlof/geo_interpolation.hpp index 2c9dfbd38c0..60aef39bedc 100644 --- a/modules/optflow/src/rlof/geo_interpolation.hpp +++ b/modules/optflow/src/rlof/geo_interpolation.hpp @@ -1,75 +1,37 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// - // - // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. - // - // By downloading, copying, installing or using the software you agree to this license. - // If you do not agree to this license, do not download, install, - // copy or use the software. - // - // - // License Agreement - // For Open Source Computer Vision Library - // - // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. - // Copyright (C) 2009, Willow Garage Inc., all rights reserved. - // Third party copyrights are property of their respective owners. - // - // Redistribution and use in source and binary forms, with or without modification, - // are permitted provided that the following conditions are met: - // - // * Redistribution's of source code must retain the above copyright notice, - // this list of conditions and the following disclaimer. 
- // - // * Redistribution's in binary form must reproduce the above copyright notice, - // this list of conditions and the following disclaimer in the documentation - // and/or other materials provided with the distribution. - // - // * The name of the copyright holders may not be used to endorse or promote products - // derived from this software without specific prior written permission. - // - // This software is provided by the copyright holders and contributors "as is" and - // any express or implied warranties, including, but not limited to, the implied - // warranties of merchantability and fitness for a particular purpose are disclaimed. - // In no event shall the Intel Corporation or contributors be liable for any direct, - // indirect, incidental, special, exemplary, or consequential damages - // (including, but not limited to, procurement of substitute goods or services; - // loss of use, data, or profits; or business interruption) however caused - // and on any theory of liability, whether in contract, strict liability, - // or tort (including negligence or otherwise) arising in any way out of - // the use of this software, even if advised of the possibility of such damage. - // - //M*/ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
#ifndef _GEO_INTERPOLATION_HPP_ #define _GEO_INTERPOLATION_HPP_ #include namespace cv { - namespace optflow - { - typedef Vec Vec8f; - Mat getGraph(const Mat & image, float edge_length); - Mat sgeo_dist(const Mat& gra, int y, int x, float max, Mat &prev); - Mat sgeo_dist(const Mat& gra, const std::vector & points, float max, Mat &prev); - Mat interpolate_irregular_nw(const Mat &in, const Mat &mask, const Mat &color_img, float max_d, float bandwidth, float pixeldistance); - Mat interpolate_irregular_nn( - const std::vector & prevPoints, - const std::vector & nextPoints, - const std::vector & status, - const Mat &color_img, - float pixeldistance); - Mat interpolate_irregular_knn( - const std::vector & _prevPoints, - const std::vector & _nextPoints, - const std::vector & status, - const Mat &color_img, - int k, - float pixeldistance); +namespace optflow +{ + typedef Vec Vec8f; + Mat getGraph(const Mat & image, float edge_length); + Mat sgeo_dist(const Mat& gra, int y, int x, float max, Mat &prev); + Mat sgeo_dist(const Mat& gra, const std::vector & points, float max, Mat &prev); + Mat interpolate_irregular_nw(const Mat &in, const Mat &mask, const Mat &color_img, float max_d, float bandwidth, float pixeldistance); + Mat interpolate_irregular_nn( + const std::vector & prevPoints, + const std::vector & nextPoints, + const std::vector & status, + const Mat &color_img, + float pixeldistance); + Mat interpolate_irregular_knn( + const std::vector & _prevPoints, + const std::vector & _nextPoints, + const std::vector & status, + const Mat &color_img, + int k, + float pixeldistance); - Mat interpolate_irregular_nn_raster(const std::vector & prevPoints, - const std::vector & nextPoints, - const std::vector & status, - const Mat & i1); + Mat interpolate_irregular_nn_raster(const std::vector & prevPoints, + const std::vector & nextPoints, + const std::vector & status, + const Mat & i1); - } +} } #endif diff --git a/modules/optflow/src/rlof/plk_invoker.hpp 
b/modules/optflow/src/rlof/plk_invoker.hpp index 1a8f5eec380..2637fa7fbe7 100644 --- a/modules/optflow/src/rlof/plk_invoker.hpp +++ b/modules/optflow/src/rlof/plk_invoker.hpp @@ -1,51 +1,13 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// - // - // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. - // - // By downloading, copying, installing or using the software you agree to this license. - // If you do not agree to this license, do not download, install, - // copy or use the software. - // - // - // License Agreement - // For Open Source Computer Vision Library - // - // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. - // Copyright (C) 2009, Willow Garage Inc., all rights reserved. - // Third party copyrights are property of their respective owners. - // - // Redistribution and use in source and binary forms, with or without modification, - // are permitted provided that the following conditions are met: - // - // * Redistribution's of source code must retain the above copyright notice, - // this list of conditions and the following disclaimer. - // - // * Redistribution's in binary form must reproduce the above copyright notice, - // this list of conditions and the following disclaimer in the documentation - // and/or other materials provided with the distribution. - // - // * The name of the copyright holders may not be used to endorse or promote products - // derived from this software without specific prior written permission. - // - // This software is provided by the copyright holders and contributors "as is" and - // any express or implied warranties, including, but not limited to, the implied - // warranties of merchantability and fitness for a particular purpose are disclaimed. 
- // In no event shall the Intel Corporation or contributors be liable for any direct, - // indirect, incidental, special, exemplary, or consequential damages - // (including, but not limited to, procurement of substitute goods or services; - // loss of use, data, or profits; or business interruption) however caused - // and on any theory of liability, whether in contract, strict liability, - // or tort (including negligence or otherwise) arising in any way out of - // the use of this software, even if advised of the possibility of such damage. - // - //M*/ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. #ifndef _PLK_INVOKER_HPP_ #define _PLK_INVOKER_HPP_ #include "rlof_invokerbase.hpp" namespace cv{ - namespace optflow{ +namespace optflow{ namespace plk { @@ -57,89 +19,89 @@ class TrackerInvoker : public cv::ParallelLoopBody public: TrackerInvoker( - const Mat& _prevImg, - const Mat& _prevDeriv, - const Mat& _nextImg, - const Mat& _rgbPrevImg, - const Mat& _rgbNextImg, - const Point2f* _prevPts, - Point2f* _nextPts, - uchar* _status, - float* _err, - Point2f* _gainVecs, - int _level, - int _maxLevel, - int _winSize[2], - int _maxIteration, - bool _useInitialFlow, - int _supportRegionType, - float _minEigenValue, - int _crossSegmentationThreshold) - { - prevImg = &_prevImg; - prevDeriv = &_prevDeriv; - nextImg = &_nextImg; - rgbPrevImg = &_rgbPrevImg; - rgbNextImg = &_rgbNextImg; - prevPts = _prevPts; - nextPts = _nextPts; - status = _status; - err = _err; - gainVecs = _gainVecs; - minWinSize = _winSize[0]; - maxWinSize = _winSize[1]; - criteria.maxCount = _maxIteration; - criteria.epsilon = 0.01; - level = _level; - maxLevel = _maxLevel; - windowType = _supportRegionType; - minEigThreshold = _minEigenValue; - useInitialFlow = _useInitialFlow; - crossSegmentationThreshold = _crossSegmentationThreshold; - } + const Mat& 
_prevImg, + const Mat& _prevDeriv, + const Mat& _nextImg, + const Mat& _rgbPrevImg, + const Mat& _rgbNextImg, + const Point2f* _prevPts, + Point2f* _nextPts, + uchar* _status, + float* _err, + Point2f* _gainVecs, + int _level, + int _maxLevel, + int _winSize[2], + int _maxIteration, + bool _useInitialFlow, + int _supportRegionType, + float _minEigenValue, + int _crossSegmentationThreshold) + { + prevImg = &_prevImg; + prevDeriv = &_prevDeriv; + nextImg = &_nextImg; + rgbPrevImg = &_rgbPrevImg; + rgbNextImg = &_rgbNextImg; + prevPts = _prevPts; + nextPts = _nextPts; + status = _status; + err = _err; + gainVecs = _gainVecs; + minWinSize = _winSize[0]; + maxWinSize = _winSize[1]; + criteria.maxCount = _maxIteration; + criteria.epsilon = 0.01; + level = _level; + maxLevel = _maxLevel; + windowType = _supportRegionType; + minEigThreshold = _minEigenValue; + useInitialFlow = _useInitialFlow; + crossSegmentationThreshold = _crossSegmentationThreshold; + } - TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; + TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; void operator()(const cv::Range& range) const { - cv::Size winSize; - cv::Point2f halfWin; + cv::Size winSize; + cv::Point2f halfWin; - const Mat& I = *prevImg; - const Mat& J = *nextImg; + const Mat& I = *prevImg; + const Mat& J = *nextImg; const Mat& derivI = *prevDeriv; - const Mat& BI = *rgbPrevImg; + const Mat& BI = *rgbPrevImg; - winSize = cv::Size(maxWinSize,maxWinSize); - int winMaskwidth = static_cast(ceil(winSize.width / 16.f)) * 16; - cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); - winMaskMatBuf.setTo(1); - - const float FLT_SCALE = (1.f/(1 << 16));//(1.f/(1 << 20)); // 20 + winSize = cv::Size(maxWinSize,maxWinSize); + int winMaskwidth = static_cast(ceil(winSize.width / 16.f)) * 16; + cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); + winMaskMatBuf.setTo(1); + + const float FLT_SCALE = (1.f/(1 << 16));//(1.f/(1 << 20)); // 20 int cn = 
I.channels(), cn2 = cn*2; - int winbufwidth = static_cast(ceil(winSize.width / 16.f)) * 16; - cv::Size winBufSize(winbufwidth,winbufwidth); + int winbufwidth = static_cast(ceil(winSize.width / 16.f)) * 16; + cv::Size winBufSize(winbufwidth,winbufwidth); - cv::Mat invTensorMat(4,4, CV_32FC1); - cv::Mat mismatchMat(4, 1, CV_32FC1); - cv::Mat resultMat(4, 1, CV_32FC1); + cv::Mat invTensorMat(4,4, CV_32FC1); + cv::Mat mismatchMat(4, 1, CV_32FC1); + cv::Mat resultMat(4, 1, CV_32FC1); - cv::AutoBuffer _buf(winBufSize.area()*(cn + cn2)); - int derivDepth = DataType::depth; + cv::AutoBuffer _buf(winBufSize.area()*(cn + cn2)); + int derivDepth = DataType::depth; - Mat IWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn), (deriv_type*)_buf.data()); - Mat derivIWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn2), (deriv_type*)_buf.data() + winBufSize.area()*cn); + Mat IWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn), (deriv_type*)_buf.data()); + Mat derivIWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn2), (deriv_type*)_buf.data() + winBufSize.area()*cn); for( int ptidx = range.start; ptidx < range.end; ptidx++ ) { - Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level)); + Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level)); Point2f nextPt; if( level == maxLevel ) { if( useInitialFlow ) - { + { nextPt = nextPts[ptidx]*(float)(1./(1 << level)); - } + } else nextPt = prevPt; } @@ -147,25 +109,25 @@ class TrackerInvoker : public cv::ParallelLoopBody nextPt = nextPts[ptidx]*2.f; nextPts[ptidx] = nextPt; - Point2i iprevPt, inextPt; - iprevPt.x = cvFloor(prevPt.x); - iprevPt.y = cvFloor(prevPt.y); - int winArea = maxWinSize * maxWinSize; - cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0,0, maxWinSize,maxWinSize)); - winMaskMatBuf.setTo(0); - if( calcWinMaskMat(BI, windowType, iprevPt, - winMaskMat,winSize,halfWin,winArea, - minWinSize,maxWinSize, crossSegmentationThreshold) == false) - continue; - - - prevPt -= halfWin; - iprevPt.x = cvFloor(prevPt.x); + Point2i iprevPt, inextPt; + 
iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + int winArea = maxWinSize * maxWinSize; + cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0,0, maxWinSize,maxWinSize)); + winMaskMatBuf.setTo(0); + if( calcWinMaskMat(BI, windowType, iprevPt, + winMaskMat,winSize,halfWin,winArea, + minWinSize,maxWinSize, crossSegmentationThreshold) == false) + continue; + + + prevPt -= halfWin; + iprevPt.x = cvFloor(prevPt.x); iprevPt.y = cvFloor(prevPt.y); - if( iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols || - iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows ) + if( iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols || + iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows ) { if( level == 0 ) { @@ -188,32 +150,32 @@ class TrackerInvoker : public cv::ParallelLoopBody int dstep = (int)(derivI.step/derivI.elemSize1()); int step = (int)(I.step/I.elemSize1()); - int mStep = (int)(winMaskMat.step/winMaskMat.elemSize1()); + int mStep = (int)(winMaskMat.step/winMaskMat.elemSize1()); CV_Assert( step == (int)(J.step/J.elemSize1()) ); float A11 = 0, A12 = 0, A22 = 0; // tensor - float sumIx = 0; - float sumIy = 0; - float sumI = 0; - float sumW = 0; - float w1 = 0, w2 = 0; // -IyI - float dI = 0; // I^2 - float D = 0; - #ifdef CV_SSE2 + float sumIx = 0; + float sumIy = 0; + float sumI = 0; + float sumW = 0; + float w1 = 0, w2 = 0; // -IyI + float dI = 0; // I^2 + float D = 0; + #ifdef CV_SSE4_1 __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); __m128i z = _mm_setzero_si128(); __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1-1)); __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1-5-1)); - __m128i mmMask4_epi32; - __m128i mmMaskSet_epi16 = _mm_set1_epi16(std::numeric_limits::max()); - get4BitMask(winSize.width, mmMask4_epi32); + __m128i mmMask4_epi32; + __m128i mmMaskSet_epi16 = _mm_set1_epi16(std::numeric_limits::max()); + get4BitMask(winSize.width, mmMask4_epi32); #endif // extract the patch from the first 
image, compute covariation matrix of derivatives int x, y; for( y = 0; y < winSize.height; y++ ) { - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; @@ -221,12 +183,12 @@ class TrackerInvoker : public cv::ParallelLoopBody short* dIptr = (short*)(derivIWinBuf.data + y*derivIWinBuf.step); x = 0; -#ifdef CV_SSE2 +#ifdef CV_SSE4_1 for( ; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 ) { - __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16); - __m128i mask_0_3_epi16 = _mm_unpacklo_epi16(mask_0_7_epi16, mask_0_7_epi16); - __m128i v00, v01, v10, v11, t0, t1; + __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16); + __m128i mask_0_3_epi16 = _mm_unpacklo_epi16(mask_0_7_epi16, mask_0_7_epi16); + __m128i v00, v01, v10, v11, t0, t1; v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); @@ -234,12 +196,12 @@ class TrackerInvoker : public cv::ParallelLoopBody t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); - t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); - if( x + 4 > winSize.width) - { - t0 = _mm_and_si128(t0, mmMask4_epi32); - } - t0 = _mm_and_si128(t0, mask_0_3_epi16); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); + if( x + 4 > winSize.width) + { + t0 = _mm_and_si128(t0, mmMask4_epi32); + } + t0 = _mm_and_si128(t0, mask_0_3_epi16); _mm_storel_epi64((__m128i*)(Iptr + x), 
_mm_packs_epi32(t0,t0)); @@ -256,57 +218,57 @@ class TrackerInvoker : public cv::ParallelLoopBody t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... if( x + 4 > winSize.width) - { - v00 = _mm_and_si128(v00, mmMask4_epi32); - } - v00 = _mm_and_si128(v00, mask_0_3_epi16); + { + v00 = _mm_and_si128(v00, mmMask4_epi32); + } + v00 = _mm_and_si128(v00, mask_0_3_epi16); _mm_storeu_si128((__m128i*)dIptr, v00); } #else for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2 ) { - if( maskPtr[x] == 0) - { - dIptr[0] = 0; - dIptr[1] = 0; - continue; - } - int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + + if( maskPtr[x] == 0) + { + dIptr[0] = 0; + dIptr[1] = 0; + continue; + } + int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + src[x+step]*iw10 + src[x+step+cn]*iw11, W_BITS1-5); int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1); int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 + dsrc[dstep+cn2+1]*iw11, W_BITS1); - Iptr[x] = (short)ival; - dIptr[0] = (short)ixval; - dIptr[1] = (short)iyval; + Iptr[x] = (short)ival; + dIptr[0] = (short)ixval; + dIptr[1] = (short)iyval; } #endif } - cv::Point2f backUpNextPt = nextPt; - nextPt -= halfWin; - Point2f prevDelta(0,0); //relates to h(t-1) - Point2f prevGain(1,0); - cv::Point2f gainVec = gainVecs[ptidx]; - cv::Point2f backUpGain = gainVec; - int j; + cv::Point2f backUpNextPt = nextPt; + nextPt -= halfWin; + Point2f prevDelta(0,0); //relates to h(t-1) + Point2f prevGain(1,0); + cv::Point2f gainVec = gainVecs[ptidx]; + cv::Point2f backUpGain = gainVec; + int j; for( j = 0; j < criteria.maxCount; j++ ) { - status[ptidx] = static_cast(j); + status[ptidx] = static_cast(j); inextPt.x = cvFloor(nextPt.x); inextPt.y = cvFloor(nextPt.y); - - if( inextPt.x + halfWin.x < 0 || inextPt.x + halfWin.x>= derivI.cols || - inextPt.y + halfWin.y < 0 || inextPt.y + halfWin.y >= derivI.rows ) - { - if( level 
== 0 && status ) - status[ptidx] = 3; - break; - } + + if( inextPt.x + halfWin.x < 0 || inextPt.x + halfWin.x>= derivI.cols || + inextPt.y + halfWin.y < 0 || inextPt.y + halfWin.y >= derivI.rows ) + { + if( level == 0 && status ) + status[ptidx] = 3; + break; + } a = nextPt.x - inextPt.x; b = nextPt.y - inextPt.y; @@ -314,467 +276,467 @@ class TrackerInvoker : public cv::ParallelLoopBody iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - float b1,b2,b3,b4; + float b1,b2,b3,b4; b1 = 0; - b2 = 0; - b3 = 0; - b4 = 0; - - if( j == 0 ) - { - // tensor - w1 = 0; // -IxI - w2 = 0; // -IyI - dI = 0; // I^2 - sumIx = 0; - sumIy = 0; - sumI = 0; - sumW = 0; - A11 = 0; - A12 = 0; - A22 = 0; - } -#ifdef CV_SSE2 + b2 = 0; + b3 = 0; + b4 = 0; + + if( j == 0 ) + { + // tensor + w1 = 0; // -IxI + w2 = 0; // -IyI + dI = 0; // I^2 + sumIx = 0; + sumIy = 0; + sumI = 0; + sumW = 0; + A11 = 0; + A12 = 0; + A22 = 0; + } +#ifdef CV_SSE4_1 qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); __m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps(), qb2 = _mm_setzero_ps(); - __m128 qb3 = _mm_setzero_ps(); - __m128 mmSumW1 = _mm_setzero_ps(), mmSumW2 = _mm_setzero_ps(); - __m128 mmSumI = _mm_setzero_ps(), mmSumW = _mm_setzero_ps(), mmSumDI = _mm_setzero_ps(); - __m128 mmSumIy = _mm_setzero_ps(), mmSumIx = _mm_setzero_ps(); - __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps(); - float gainVal = gainVec.x > 0 ? gainVec.x : -gainVec.x; - int bitShift = gainVec.x == 0 ? 
1 : static_cast(ceil(log(200.f / gainVal) / log(2.f))); // zwei stellen nach den komma - __m128i mmGainValue_epi16 = _mm_set1_epi16(static_cast(gainVec.x * (float)(1 << bitShift))); - __m128i mmConstValue_epi16 = _mm_set1_epi16(static_cast(gainVec.y)); + __m128 qb3 = _mm_setzero_ps(); + __m128 mmSumW1 = _mm_setzero_ps(), mmSumW2 = _mm_setzero_ps(); + __m128 mmSumI = _mm_setzero_ps(), mmSumW = _mm_setzero_ps(), mmSumDI = _mm_setzero_ps(); + __m128 mmSumIy = _mm_setzero_ps(), mmSumIx = _mm_setzero_ps(); + __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps(); + float gainVal = gainVec.x > 0 ? gainVec.x : -gainVec.x; + int bitShift = gainVec.x == 0 ? 1 : static_cast(ceil(log(200.f / gainVal) / log(2.f))); // zwei stellen nach den komma + __m128i mmGainValue_epi16 = _mm_set1_epi16(static_cast(gainVec.x * (float)(1 << bitShift))); + __m128i mmConstValue_epi16 = _mm_set1_epi16(static_cast(gainVec.y)); #endif - for( y = 0; y < winSize.height; y++ ) + for( y = 0; y < winSize.height; y++ ) { - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x )*cn; const short* Iptr = (const short*)(IWinBuf.data + ( y)*IWinBuf.step); const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); x = 0; -#ifdef CV_SSE2 - +#ifdef CV_SSE4_1 + for( ; x <= winSize.width*cn; x += 8, dIptr += 8*2 ) { - // iw = iw << 14 (16384 short / 2) - // iq0 = iw01 |iw00, iw01 |iw00, iw01 |iw00, iw01 |iw00 16 bit - __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16); - - // I [0...8] + // iw = iw << 14 (16384 short / 2) + // iq0 = iw01 |iw00, iw01 |iw00, iw01 |iw00, iw01 |iw00 16 bit + __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16); + + // I 
[0...8] __m128i diff0, diff1; - __m128i I_0_7_epi16 = _mm_loadu_si128((const __m128i*)(Iptr + x)); // element 0 to 7 - + __m128i I_0_7_epi16 = _mm_loadu_si128((const __m128i*)(Iptr + x)); // element 0 to 7 + __m128i v00 = _mm_unpacklo_epi8( - _mm_loadl_epi64((const __m128i*)(Jptr + x)) // J0, J1, J2, ..., J7, 64 bit nullen - , z); //J0 , 0, J1, 0, J2, 0 ... J7,0 + _mm_loadl_epi64((const __m128i*)(Jptr + x)) // J0, J1, J2, ..., J7, 64 bit nullen + , z); //J0 , 0, J1, 0, J2, 0 ... J7,0 __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); - - __m128i t0 = _mm_add_epi32 - (_mm_madd_epi16( - _mm_unpacklo_epi16(v00, v01), //J0, , J1, J1, J2, J2, ... J3 , J4 - qw0), + + __m128i t0 = _mm_add_epi32 + (_mm_madd_epi16( + _mm_unpacklo_epi16(v00, v01), //J0, , J1, J1, J2, J2, ... J3 , J4 + qw0), _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); - - __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + + __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); - t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1-5); - - - __m128i Ilo = _mm_mullo_epi16(mmGainValue_epi16, I_0_7_epi16); - __m128i Ihi = _mm_mulhi_epi16(mmGainValue_epi16, I_0_7_epi16); - - __m128i Igain_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(Ilo, Ihi), bitShift); - __m128i Igain_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(Ilo, Ihi), bitShift); - __m128i Igain_epi16 = _mm_packs_epi32(Igain_0_3_epi32, Igain_4_7_epi32); - - // diff = J - I + I*gain.x + gain.y - __m128i mmDiff_epi16 = _mm_add_epi16( - _mm_subs_epi16(_mm_packs_epi32(t0, t1), I_0_7_epi16), // J - I - 
_mm_add_epi16(Igain_epi16, mmConstValue_epi16)); - - - mmDiff_epi16 = _mm_and_si128(mmDiff_epi16, mask_0_7_epi16); - - - __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... + + + __m128i Ilo = _mm_mullo_epi16(mmGainValue_epi16, I_0_7_epi16); + __m128i Ihi = _mm_mulhi_epi16(mmGainValue_epi16, I_0_7_epi16); + + __m128i Igain_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(Ilo, Ihi), bitShift); + __m128i Igain_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(Ilo, Ihi), bitShift); + __m128i Igain_epi16 = _mm_packs_epi32(Igain_0_3_epi32, Igain_4_7_epi32); + + // diff = J - I + I*gain.x + gain.y + __m128i mmDiff_epi16 = _mm_add_epi16( + _mm_subs_epi16(_mm_packs_epi32(t0, t1), I_0_7_epi16), // J - I + _mm_add_epi16(Igain_epi16, mmConstValue_epi16)); + + + mmDiff_epi16 = _mm_and_si128(mmDiff_epi16, mask_0_7_epi16); + + + __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); - - diff1 = _mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13... + + diff1 = _mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13... diff0 = _mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9... 
- v10 = _mm_mullo_epi16(Ixy_0, diff0); + v10 = _mm_mullo_epi16(Ixy_0, diff0); v11 = _mm_mulhi_epi16(Ixy_0, diff0); v00 = _mm_unpacklo_epi16(v10, v11); v10 = _mm_unpackhi_epi16(v10, v11); - qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); - qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); + qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); + qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); - v10 = _mm_mullo_epi16(Ixy_1, diff1); - v11 = _mm_mulhi_epi16(Ixy_1, diff1); + v10 = _mm_mullo_epi16(Ixy_1, diff1); + v11 = _mm_mulhi_epi16(Ixy_1, diff1); v00 = _mm_unpacklo_epi16(v10, v11); v10 = _mm_unpackhi_epi16(v10, v11); - qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); - qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); - // for set 1 sigma 1 - // b3 += diff * Iptr[x] - Ilo = _mm_mullo_epi16(mmDiff_epi16, I_0_7_epi16); - Ihi = _mm_mulhi_epi16(mmDiff_epi16, I_0_7_epi16); - v00 = _mm_unpacklo_epi16(Ilo, Ihi); - v10 = _mm_unpackhi_epi16(Ilo, Ihi); - qb2 = _mm_add_ps(qb2, _mm_cvtepi32_ps(v00)); - qb2 = _mm_add_ps(qb2, _mm_cvtepi32_ps(v10)); - - // b4+= diff - __m128 mmDiff_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mmDiff_epi16)); // (_mm_srai_epi32(_mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16),16)); - __m128 mmDiff_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16),16))); - qb3 = _mm_add_ps(qb3, mmDiff_0_3_ps); - qb3 = _mm_add_ps(qb3, mmDiff_4_7_ps); - if( j == 0 ) - { - __m128 mask_0_4_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mask_0_7_epi16)); - __m128 mask_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mask_0_7_epi16, mask_0_7_epi16), 16))); - - t0 = _mm_srai_epi32(Ixy_0, 16); // Iy0 Iy1 Iy2 Iy3 - t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16); // Ix0 Ix1 Ix2 Ix3 - - __m128 fy = _mm_blendv_ps(_mm_set1_ps(0), _mm_cvtepi32_ps(t0), mask_0_4_ps); - __m128 fx = _mm_blendv_ps(_mm_set1_ps(0), _mm_cvtepi32_ps(t1), mask_0_4_ps); - - // 0 ... 
3 - __m128 I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(I_0_7_epi16, I_0_7_epi16), 16)); - - // A11 - A22 - mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fy)); - mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fy)); - mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fx)); - - // sumIx und sumIy - mmSumIx = _mm_add_ps(mmSumIx, fx); - mmSumIy = _mm_add_ps(mmSumIy, fy); - - mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fx)); - mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fy)); - - // sumI - I_ps = _mm_blendv_ps(_mm_set1_ps(0), I_ps, mask_0_4_ps); - mmSumI = _mm_add_ps(mmSumI,I_ps); - - // sumDI - mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps( I_ps, I_ps)); - - t0 = _mm_srai_epi32(Ixy_1, 16); // Iy8 Iy9 Iy10 Iy11 - t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_1, 16), 16); // Ix0 Ix1 Ix2 Ix3 - - fy = _mm_blendv_ps(_mm_set1_ps(0), _mm_cvtepi32_ps(t0), mask_4_7_ps); - fx = _mm_blendv_ps(_mm_set1_ps(0), _mm_cvtepi32_ps(t1), mask_4_7_ps); - - // 4 ... 7 - I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(I_0_7_epi16, I_0_7_epi16), 16)); - - - - mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fy)); - mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fy)); - mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fx)); - - // sumIx und sumIy - mmSumIx = _mm_add_ps(mmSumIx, fx); - mmSumIy = _mm_add_ps(mmSumIy, fy); - - mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fx)); - mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fy)); - - // sumI - I_ps = _mm_blendv_ps(_mm_set1_ps(0), I_ps, mask_4_7_ps); - mmSumI = _mm_add_ps(mmSumI, I_ps); - - // sumW - // sumDI - mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps( I_ps, I_ps)); - } - - } + qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); + qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); + // for set 1 sigma 1 + // b3 += diff * Iptr[x] + Ilo = _mm_mullo_epi16(mmDiff_epi16, I_0_7_epi16); + Ihi = _mm_mulhi_epi16(mmDiff_epi16, I_0_7_epi16); + v00 = _mm_unpacklo_epi16(Ilo, Ihi); + v10 = _mm_unpackhi_epi16(Ilo, Ihi); + qb2 = _mm_add_ps(qb2, _mm_cvtepi32_ps(v00)); + qb2 = _mm_add_ps(qb2, 
_mm_cvtepi32_ps(v10)); + + // b4+= diff + __m128 mmDiff_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mmDiff_epi16)); // (_mm_srai_epi32(_mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16),16)); + __m128 mmDiff_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16),16))); + qb3 = _mm_add_ps(qb3, mmDiff_0_3_ps); + qb3 = _mm_add_ps(qb3, mmDiff_4_7_ps); + if( j == 0 ) + { + __m128 mask_0_4_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mask_0_7_epi16)); + __m128 mask_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mask_0_7_epi16, mask_0_7_epi16), 16))); + + t0 = _mm_srai_epi32(Ixy_0, 16); // Iy0 Iy1 Iy2 Iy3 + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16); // Ix0 Ix1 Ix2 Ix3 + + __m128 fy = _mm_blendv_ps(_mm_set1_ps(0), _mm_cvtepi32_ps(t0), mask_0_4_ps); + __m128 fx = _mm_blendv_ps(_mm_set1_ps(0), _mm_cvtepi32_ps(t1), mask_0_4_ps); + + // 0 ... 3 + __m128 I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(I_0_7_epi16, I_0_7_epi16), 16)); + + // A11 - A22 + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fy)); + mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fy)); + mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fx)); + + // sumIx und sumIy + mmSumIx = _mm_add_ps(mmSumIx, fx); + mmSumIy = _mm_add_ps(mmSumIy, fy); + + mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fx)); + mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fy)); + + // sumI + I_ps = _mm_blendv_ps(_mm_set1_ps(0), I_ps, mask_0_4_ps); + mmSumI = _mm_add_ps(mmSumI,I_ps); + + // sumDI + mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps( I_ps, I_ps)); + + t0 = _mm_srai_epi32(Ixy_1, 16); // Iy8 Iy9 Iy10 Iy11 + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_1, 16), 16); // Ix0 Ix1 Ix2 Ix3 + + fy = _mm_blendv_ps(_mm_set1_ps(0), _mm_cvtepi32_ps(t0), mask_4_7_ps); + fx = _mm_blendv_ps(_mm_set1_ps(0), _mm_cvtepi32_ps(t1), mask_4_7_ps); + + // 4 ... 
7 + I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(I_0_7_epi16, I_0_7_epi16), 16)); + + + + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fy)); + mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fy)); + mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fx)); + + // sumIx und sumIy + mmSumIx = _mm_add_ps(mmSumIx, fx); + mmSumIy = _mm_add_ps(mmSumIy, fy); + + mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fx)); + mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fy)); + + // sumI + I_ps = _mm_blendv_ps(_mm_set1_ps(0), I_ps, mask_4_7_ps); + mmSumI = _mm_add_ps(mmSumI, I_ps); + + // sumW + // sumDI + mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps( I_ps, I_ps)); + } + + } #else for( ; x < winSize.width*cn; x++, dIptr += 2 ) { - if( maskPtr[x] == 0) - continue; + if( maskPtr[x] == 0) + continue; int J = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, W_BITS1-5); - int diff = J - Iptr[x] + static_cast(Iptr[x]) * gainVec.x + gainVec.y; - - b1 += (float)(diff*dIptr[0]) * FLT_RESCALE; + int diff = J - Iptr[x] + static_cast(Iptr[x]) * gainVec.x + gainVec.y; + + b1 += (float)(diff*dIptr[0]) * FLT_RESCALE; b2 += (float)(diff*dIptr[1]) * FLT_RESCALE; - b3 += (float)(diff) * Iptr[x] * FLT_RESCALE; - b4 += (float)(diff); - - // compute the Gradient Matrice - if( j == 0 ) - { - A11 += (float)(dIptr[0]*dIptr[0]); - A12 += (float)(dIptr[0]*dIptr[1]); - A22 += (float)(dIptr[1]*dIptr[1]); - - - dI += Iptr[x] * Iptr[x] * FLT_RESCALE; - float dx = static_cast(dIptr[0]) * FLT_RESCALE; - float dy = static_cast(dIptr[1]) * FLT_RESCALE; - sumIx += dx; - sumIy += dy; - w1 += dx * Iptr[x]; - w2 += dy * Iptr[x]; - sumI += Iptr[x] * FLT_RESCALE; - //sumW += FLT_RESCALE; - - } + b3 += (float)(diff) * Iptr[x] * FLT_RESCALE; + b4 += (float)(diff); + + // compute the Gradient Matrice + if( j == 0 ) + { + A11 += (float)(dIptr[0]*dIptr[0]); + A12 += (float)(dIptr[0]*dIptr[1]); + A22 += (float)(dIptr[1]*dIptr[1]); + + + dI += Iptr[x] * Iptr[x] * FLT_RESCALE; + float dx = 
static_cast(dIptr[0]) * FLT_RESCALE; + float dy = static_cast(dIptr[1]) * FLT_RESCALE; + sumIx += dx; + sumIy += dy; + w1 += dx * Iptr[x]; + w2 += dy * Iptr[x]; + sumI += Iptr[x] * FLT_RESCALE; + //sumW += FLT_RESCALE; + + } } #endif } - - float CV_DECL_ALIGNED(16) wbuf[4];// - if( j == 0 ) - { -#ifdef CV_SSE2 - _mm_store_ps(wbuf, mmSumW1); - w1 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumW2); - w2 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumDI); - dI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumI); - sumI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumIx); - sumIx = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumIy); - sumIy = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumW); - sumW = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + + float CV_DECL_ALIGNED(16) wbuf[4];// + if( j == 0 ) + { +#ifdef CV_SSE4_1 + _mm_store_ps(wbuf, mmSumW1); + w1 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumW2); + w2 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumDI); + dI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumI); + sumI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumIx); + sumIx = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumIy); + sumIy = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumW); + sumW = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; #endif - sumIx *= -FLT_SCALE; - sumIy *= -FLT_SCALE; - sumI *=FLT_SCALE; - sumW = winArea * FLT_SCALE; - w1 *= -FLT_SCALE; - w2 *= -FLT_SCALE; - dI *= FLT_SCALE; -#ifdef CV_SSE2 - float CV_DECL_ALIGNED(32) A11buf[4], A12buf[4], A22buf[4];// - - _mm_store_ps(A11buf, mmAxx); - _mm_store_ps(A12buf, mmAxy); - _mm_store_ps(A22buf, mmAyy); - - A11 = A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; - A12 = A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; - A22 = A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; + sumIx *= 
-FLT_SCALE; + sumIy *= -FLT_SCALE; + sumI *=FLT_SCALE; + sumW = winArea * FLT_SCALE; + w1 *= -FLT_SCALE; + w2 *= -FLT_SCALE; + dI *= FLT_SCALE; +#ifdef CV_SSE4_1 + float CV_DECL_ALIGNED(32) A11buf[4], A12buf[4], A22buf[4];// + + _mm_store_ps(A11buf, mmAxx); + _mm_store_ps(A12buf, mmAxy); + _mm_store_ps(A22buf, mmAyy); + + A11 = A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; + A12 = A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; + A22 = A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; #endif - A11 *= FLT_SCALE; - A12 *= FLT_SCALE; - A22 *= FLT_SCALE; - } + A11 *= FLT_SCALE; + A12 *= FLT_SCALE; + A22 *= FLT_SCALE; + } -#ifdef CV_SSE2 +#ifdef CV_SSE4_1 float CV_DECL_ALIGNED(16) bbuf[4]; _mm_store_ps(bbuf, _mm_add_ps(qb0, qb1)); - b1 = bbuf[0] + bbuf[2]; - b2 = bbuf[1] + bbuf[3]; - _mm_store_ps(bbuf, qb2); - b3 = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; - _mm_store_ps(bbuf, qb3); - b4 = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; + b1 = bbuf[0] + bbuf[2]; + b2 = bbuf[1] + bbuf[3]; + _mm_store_ps(bbuf, qb2); + b3 = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; + _mm_store_ps(bbuf, qb3); + b4 = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; #endif - mismatchMat.at(0,0) = b1 * FLT_SCALE; - mismatchMat.at(1,0) = b2 * FLT_SCALE; - mismatchMat.at(2,0) = -b3 * FLT_SCALE; - mismatchMat.at(3,0) = -b4 * FLT_SCALE; - - D = - A12*A12*sumI*sumI + dI*sumW*A12*A12 + 2*A12*sumI*sumIx*w2 + 2*A12*sumI*sumIy*w1 - - 2*dI*A12*sumIx*sumIy - 2*sumW*A12*w1*w2 + A11*A22*sumI*sumI - 2*A22*sumI*sumIx*w1 - - 2*A11*sumI*sumIy*w2 - sumIx*sumIx*w2*w2 + A22*dI*sumIx*sumIx + 2*sumIx*sumIy*w1*w2 - - sumIy*sumIy*w1*w1 + A11*dI*sumIy*sumIy + A22*sumW*w1*w1 + A11*sumW*w2*w2 - A11*A22*dI*sumW; - - - float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + + mismatchMat.at(0,0) = b1 * FLT_SCALE; + mismatchMat.at(1,0) = b2 * FLT_SCALE; + mismatchMat.at(2,0) = -b3 * FLT_SCALE; + mismatchMat.at(3,0) = -b4 * FLT_SCALE; + + D = - A12*A12*sumI*sumI + dI*sumW*A12*A12 + 2*A12*sumI*sumIx*w2 + 2*A12*sumI*sumIy*w1 + - 2*dI*A12*sumIx*sumIy 
- 2*sumW*A12*w1*w2 + A11*A22*sumI*sumI - 2*A22*sumI*sumIx*w1 + - 2*A11*sumI*sumIy*w2 - sumIx*sumIx*w2*w2 + A22*dI*sumIx*sumIx + 2*sumIx*sumIy*w1*w2 + - sumIy*sumIy*w1*w1 + A11*dI*sumIy*sumIy + A22*sumW*w1*w1 + A11*sumW*w2*w2 - A11*A22*dI*sumW; + + + float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + 4.f*A12*A12))/(2*winArea); - if( minEig < minEigThreshold ) - { - if( level == 0 && status || std::abs(D) < FLT_EPSILON) - status[ptidx] = 0; - if( level > 0) - { - nextPts[ptidx] = backUpNextPt; - gainVecs[ptidx] = backUpGain; - } - break; - } - - - - D = (1.f / D); - invTensorMat.at(0,0) = (A22*sumI*sumI - 2*sumI*sumIy*w2 + dI*sumIy*sumIy + sumW*w2*w2 - A22*dI*sumW)* D; invTensorMat.at(0,1) = (A12*dI*sumW - A12*sumI * sumI - dI*sumIx*sumIy + sumI*sumIx*w2 + sumI*sumIy*w1 - sumW*w1*w2)* D; - invTensorMat.at(0,2) = (A12*sumI*sumIy - sumIy*sumIy*w1 - A22*sumI*sumIx - A12*sumW*w2 + A22*sumW*w1 + sumIx*sumIy*w2)* D; - invTensorMat.at(0,3) = (A22*dI*sumIx - A12*dI*sumIy - sumIx*w2*w2 + A12*sumI*w2 - A22*sumI*w1 + sumIy*w1*w2) * D; - invTensorMat.at(1,0) = invTensorMat.at(0,1); - invTensorMat.at(1,1) = (A11*sumI * sumI - 2*sumI*sumIx*w1 + dI*sumIx * sumIx + sumW*w1*w1 - A11*dI*sumW) * D; - invTensorMat.at(1,2) = (A12*sumI*sumIx - A11*sumI*sumIy - sumIx * sumIx*w2 + A11*sumW*w2 - A12*sumW*w1 + sumIx*sumIy*w1) * D; - invTensorMat.at(1,3) = (A11*dI*sumIy - sumIy*w1*w1 - A12*dI*sumIx - A11*sumI*w2 + A12*sumI*w1 + sumIx*w1*w2)* D; - invTensorMat.at(2,0) = invTensorMat.at(0,2); - invTensorMat.at(2,1) = invTensorMat.at(1,2); - invTensorMat.at(2,2) = (sumW*A12*A12 - 2*A12*sumIx*sumIy + A22*sumIx*sumIx + A11*sumIy*sumIy - A11*A22*sumW)* D; - invTensorMat.at(2,3) = (A11*A22*sumI - A12*A12*sumI - A11*sumIy*w2 + A12*sumIx*w2 + A12*sumIy*w1 - A22*sumIx*w1)* D; - invTensorMat.at(3,0) = invTensorMat.at(0,3); - invTensorMat.at(3,1) = invTensorMat.at(1,3); - invTensorMat.at(3,2) = invTensorMat.at(2,3); - invTensorMat.at(3,3) = (dI*A12*A12 - 2*A12*w1*w2 + A22*w1*w1 + A11*w2*w2 - 
A11*A22*dI)* D; - - resultMat = invTensorMat * mismatchMat; + if( minEig < minEigThreshold ) + { + if( (level == 0 && status) || std::abs(D) < FLT_EPSILON) + status[ptidx] = 0; + if( level > 0) + { + nextPts[ptidx] = backUpNextPt; + gainVecs[ptidx] = backUpGain; + } + break; + } + + + + D = (1.f / D); + invTensorMat.at(0,0) = (A22*sumI*sumI - 2*sumI*sumIy*w2 + dI*sumIy*sumIy + sumW*w2*w2 - A22*dI*sumW)* D; invTensorMat.at(0,1) = (A12*dI*sumW - A12*sumI * sumI - dI*sumIx*sumIy + sumI*sumIx*w2 + sumI*sumIy*w1 - sumW*w1*w2)* D; + invTensorMat.at(0,2) = (A12*sumI*sumIy - sumIy*sumIy*w1 - A22*sumI*sumIx - A12*sumW*w2 + A22*sumW*w1 + sumIx*sumIy*w2)* D; + invTensorMat.at(0,3) = (A22*dI*sumIx - A12*dI*sumIy - sumIx*w2*w2 + A12*sumI*w2 - A22*sumI*w1 + sumIy*w1*w2) * D; + invTensorMat.at(1,0) = invTensorMat.at(0,1); + invTensorMat.at(1,1) = (A11*sumI * sumI - 2*sumI*sumIx*w1 + dI*sumIx * sumIx + sumW*w1*w1 - A11*dI*sumW) * D; + invTensorMat.at(1,2) = (A12*sumI*sumIx - A11*sumI*sumIy - sumIx * sumIx*w2 + A11*sumW*w2 - A12*sumW*w1 + sumIx*sumIy*w1) * D; + invTensorMat.at(1,3) = (A11*dI*sumIy - sumIy*w1*w1 - A12*dI*sumIx - A11*sumI*w2 + A12*sumI*w1 + sumIx*w1*w2)* D; + invTensorMat.at(2,0) = invTensorMat.at(0,2); + invTensorMat.at(2,1) = invTensorMat.at(1,2); + invTensorMat.at(2,2) = (sumW*A12*A12 - 2*A12*sumIx*sumIy + A22*sumIx*sumIx + A11*sumIy*sumIy - A11*A22*sumW)* D; + invTensorMat.at(2,3) = (A11*A22*sumI - A12*A12*sumI - A11*sumIy*w2 + A12*sumIx*w2 + A12*sumIy*w1 - A22*sumIx*w1)* D; + invTensorMat.at(3,0) = invTensorMat.at(0,3); + invTensorMat.at(3,1) = invTensorMat.at(1,3); + invTensorMat.at(3,2) = invTensorMat.at(2,3); + invTensorMat.at(3,3) = (dI*A12*A12 - 2*A12*w1*w2 + A22*w1*w1 + A11*w2*w2 - A11*A22*dI)* D; + + resultMat = invTensorMat * mismatchMat; // 0.08 -12.10 - Point2f delta(-resultMat.at(0), -resultMat.at(1)); - Point2f deltaGain(-resultMat.at(2), -resultMat.at(3)); - - - if( j == 0) - prevGain = deltaGain; - nextPt += delta; + Point2f delta(-resultMat.at(0), 
-resultMat.at(1)); + Point2f deltaGain(-resultMat.at(2), -resultMat.at(3)); + + + if( j == 0) + prevGain = deltaGain; + nextPt += delta; nextPts[ptidx] = nextPt + halfWin; - gainVecs[ptidx]= gainVec; + gainVecs[ptidx]= gainVec; if( delta.ddot(delta) <= criteria.epsilon) - break; - if( ( std::abs(delta.x - prevDelta.x) < 0.01 - && std::abs(delta.y - prevDelta.y) < 0.01) - || ((delta.ddot(delta) <= 0.001) - && std::abs(prevGain.x - deltaGain.x) < 0.01) - ) - { - nextPts[ptidx] -= delta*0.5f; - gainVecs[ptidx] -= deltaGain* 0.5f; - break; - } - - if(j > 0 && std::abs(delta.x - prevDelta.x) < 0.01 && - std::abs(delta.y - prevDelta.y) < 0.01) - { - nextPts[ptidx] -= delta*0.5f; - break; - } - - prevDelta = delta; - prevGain = deltaGain; - + break; + if( ( std::abs(delta.x - prevDelta.x) < 0.01 + && std::abs(delta.y - prevDelta.y) < 0.01) + || ((delta.ddot(delta) <= 0.001) + && std::abs(prevGain.x - deltaGain.x) < 0.01) + ) + { + nextPts[ptidx] -= delta*0.5f; + gainVecs[ptidx] -= deltaGain* 0.5f; + break; + } + + if(j > 0 && std::abs(delta.x - prevDelta.x) < 0.01 && + std::abs(delta.y - prevDelta.y) < 0.01) + { + nextPts[ptidx] -= delta*0.5f; + break; + } + + prevDelta = delta; + prevGain = deltaGain; + } - - + + } } - const Mat* prevImg; - const Mat* nextImg; - const Mat* prevDeriv; - const Mat* rgbPrevImg; - const Mat* rgbNextImg; - const Point2f* prevPts; - Point2f* nextPts; - uchar* status; - cv::Point2f* gainVecs; // gain vector x -> multiplier y -> offset - float* err; - int maxWinSize; - int minWinSize; - TermCriteria criteria; - int level; - int maxLevel; - int windowType; - float minEigThreshold; - bool useInitialFlow; - int crossSegmentationThreshold; + const Mat* prevImg; + const Mat* nextImg; + const Mat* prevDeriv; + const Mat* rgbPrevImg; + const Mat* rgbNextImg; + const Point2f* prevPts; + Point2f* nextPts; + uchar* status; + cv::Point2f* gainVecs; // gain vector x -> multiplier y -> offset + float* err; + int maxWinSize; + int minWinSize; + TermCriteria 
criteria; + int level; + int maxLevel; + int windowType; + float minEigThreshold; + bool useInitialFlow; + int crossSegmentationThreshold; }; } namespace ica{ class TrackerInvoker : public cv::ParallelLoopBody { public: - TrackerInvoker(const Mat& _prevImg, - const Mat& _prevDeriv, - const Mat& _nextImg, - const Mat& _rgbPrevImg, - const Mat& _rgbNextImg, - const Point2f* _prevPts, - Point2f* _nextPts, - uchar* _status, - float* _err, - int _level, - int _maxLevel, - int _winSize[2], - int _maxIteration, - bool _useInitialFlow, - int _supportRegionType, - int _crossSegmentationThreshold, - float _minEigenValue) - { - prevImg = &_prevImg; - prevDeriv = &_prevDeriv; - nextImg = &_nextImg; - rgbPrevImg = &_rgbPrevImg; - rgbNextImg = &_rgbNextImg; - prevPts = _prevPts; - nextPts = _nextPts; - status = _status; - err = _err; - minWinSize = _winSize[0]; - maxWinSize = _winSize[1]; - criteria.maxCount = _maxIteration; - criteria.epsilon = 0.01; - level = _level; - maxLevel = _maxLevel; - windowType = _supportRegionType; - minEigThreshold = _minEigenValue; - useInitialFlow = _useInitialFlow; - crossSegmentationThreshold = _crossSegmentationThreshold; - } + TrackerInvoker(const Mat& _prevImg, + const Mat& _prevDeriv, + const Mat& _nextImg, + const Mat& _rgbPrevImg, + const Mat& _rgbNextImg, + const Point2f* _prevPts, + Point2f* _nextPts, + uchar* _status, + float* _err, + int _level, + int _maxLevel, + int _winSize[2], + int _maxIteration, + bool _useInitialFlow, + int _supportRegionType, + int _crossSegmentationThreshold, + float _minEigenValue) + { + prevImg = &_prevImg; + prevDeriv = &_prevDeriv; + nextImg = &_nextImg; + rgbPrevImg = &_rgbPrevImg; + rgbNextImg = &_rgbNextImg; + prevPts = _prevPts; + nextPts = _nextPts; + status = _status; + err = _err; + minWinSize = _winSize[0]; + maxWinSize = _winSize[1]; + criteria.maxCount = _maxIteration; + criteria.epsilon = 0.01; + level = _level; + maxLevel = _maxLevel; + windowType = _supportRegionType; + minEigThreshold = 
_minEigenValue; + useInitialFlow = _useInitialFlow; + crossSegmentationThreshold = _crossSegmentationThreshold; + } - TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; + TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; void operator()(const cv::Range& range) const { - cv::Size winSize; - cv::Point2f halfWin; + cv::Size winSize; + cv::Point2f halfWin; - const Mat& I = *prevImg; - const Mat& J = *nextImg; + const Mat& I = *prevImg; + const Mat& J = *nextImg; const Mat& derivI = *prevDeriv; - const Mat& BI = *rgbPrevImg; + const Mat& BI = *rgbPrevImg; - winSize = cv::Size(maxWinSize,maxWinSize); - int winMaskwidth = static_cast(ceil(winSize.width / 8.f)) * 16; - cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); - winMaskMatBuf.setTo(1); - - + winSize = cv::Size(maxWinSize,maxWinSize); + int winMaskwidth = static_cast(ceil(winSize.width / 8.f)) * 16; + cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); + winMaskMatBuf.setTo(1); + + - const float FLT_SCALE = (1.f/(1 << 20)); // 20 + const float FLT_SCALE = (1.f/(1 << 20)); // 20 int j, cn = I.channels(), cn2 = cn*2; - int winbufwidth = static_cast(ceil(winSize.width / 8.f)) * 8; - cv::Size winBufSize(winbufwidth,winbufwidth); + int winbufwidth = static_cast(ceil(winSize.width / 8.f)) * 8; + cv::Size winBufSize(winbufwidth,winbufwidth); - std::vector _buf(winBufSize.area()*(cn + cn2)); - Mat IWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn), &_buf[0]); - Mat derivIWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn2), &_buf[winBufSize.area()*cn]); + std::vector _buf(winBufSize.area()*(cn + cn2)); + Mat IWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn), &_buf[0]); + Mat derivIWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn2), &_buf[winBufSize.area()*cn]); for( int ptidx = range.start; ptidx < range.end; ptidx++ ) { - Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level)); + Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level)); Point2f nextPt; if( level == maxLevel ) { @@ -787,25 
+749,25 @@ class TrackerInvoker : public cv::ParallelLoopBody nextPt = nextPts[ptidx]*2.f; nextPts[ptidx] = nextPt; - Point2i iprevPt, inextPt; - iprevPt.x = cvFloor(prevPt.x); - iprevPt.y = cvFloor(prevPt.y); - int winArea = maxWinSize * maxWinSize; - cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0,0, maxWinSize,maxWinSize)); + Point2i iprevPt, inextPt; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + int winArea = maxWinSize * maxWinSize; + cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0,0, maxWinSize,maxWinSize)); - if( calcWinMaskMat(BI, windowType, iprevPt, - winMaskMat,winSize,halfWin,winArea, - minWinSize,maxWinSize, crossSegmentationThreshold) == false) - continue; + if( calcWinMaskMat(BI, windowType, iprevPt, + winMaskMat,winSize,halfWin,winArea, + minWinSize,maxWinSize, crossSegmentationThreshold) == false) + continue; - - prevPt -= halfWin; - iprevPt.x = cvFloor(prevPt.x); + + prevPt -= halfWin; + iprevPt.x = cvFloor(prevPt.x); iprevPt.y = cvFloor(prevPt.y); - if( iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols || - iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows ) + if( iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols || + iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows ) { if( level == 0 ) { @@ -828,26 +790,26 @@ class TrackerInvoker : public cv::ParallelLoopBody int dstep = (int)(derivI.step/derivI.elemSize1()); int step = (int)(I.step/I.elemSize1()); - int mStep = (int)(winMaskMat.step/winMaskMat.elemSize1()); + int mStep = (int)(winMaskMat.step/winMaskMat.elemSize1()); CV_Assert( step == (int)(J.step/J.elemSize1()) ); float A11 = 0, A12 = 0, A22 = 0; -#ifdef CV_SSE2 +#ifdef CV_SSE4_1 __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); __m128i z = _mm_setzero_si128(); __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1-1)); __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1-5-1)); __m128 qA11 = _mm_setzero_ps(), qA12 = _mm_setzero_ps(), qA22 = _mm_setzero_ps(); - 
__m128i mmMask4_epi32; - get4BitMask(winSize.width, mmMask4_epi32); + __m128i mmMask4_epi32; + get4BitMask(winSize.width, mmMask4_epi32); #endif // extract the patch from the first image, compute covariation matrix of derivatives int x, y; for( y = 0; y < winSize.height; y++ ) { - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; @@ -856,13 +818,13 @@ class TrackerInvoker : public cv::ParallelLoopBody x = 0; -#ifdef CV_SSE2 +#ifdef CV_SSE4_1 for( ; x <= winSize.width*cn; x += 4, dsrc += 4*2, dIptr += 4*2 ) { - __m128i wMask = _mm_set_epi32(MaskSet * maskPtr[x+3], - MaskSet * maskPtr[x+2], - MaskSet * maskPtr[x+1], - MaskSet * maskPtr[x]); + __m128i wMask = _mm_set_epi32(MaskSet * maskPtr[x+3], + MaskSet * maskPtr[x+2], + MaskSet * maskPtr[x+1], + MaskSet * maskPtr[x]); __m128i v00, v01, v10, v11, t0, t1; v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); @@ -872,13 +834,13 @@ class TrackerInvoker : public cv::ParallelLoopBody t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); - - + + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0,t0)); - + v00 = _mm_loadu_si128((const __m128i*)(dsrc)); v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); @@ -893,11 +855,11 @@ class TrackerInvoker : public cv::ParallelLoopBody t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... 
- if( x + 4 > winSize.width) - { - v00 = _mm_and_si128(v00, mmMask4_epi32); - } - v00 = _mm_and_si128(v00, wMask); + if( x + 4 > winSize.width) + { + v00 = _mm_and_si128(v00, mmMask4_epi32); + } + v00 = _mm_and_si128(v00, wMask); _mm_storeu_si128((__m128i*)dIptr, v00); t0 = _mm_srai_epi32(v00, 16); // Iy0 Iy1 Iy2 Iy3 @@ -906,8 +868,8 @@ class TrackerInvoker : public cv::ParallelLoopBody __m128 fy = _mm_cvtepi32_ps(t0); __m128 fx = _mm_cvtepi32_ps(t1); - - qA22 = _mm_add_ps(qA22, _mm_mul_ps(fy, fy)); + + qA22 = _mm_add_ps(qA22, _mm_mul_ps(fy, fy)); qA12 = _mm_add_ps(qA12, _mm_mul_ps(fx, fy)); qA11 = _mm_add_ps(qA11, _mm_mul_ps(fx, fx)); } @@ -915,30 +877,30 @@ class TrackerInvoker : public cv::ParallelLoopBody for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2 ) { - if( maskPtr[x] == 0) - { - dIptr[0] = 0; - dIptr[1] = 0; - continue; - } - int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + + if( maskPtr[x] == 0) + { + dIptr[0] = 0; + dIptr[1] = 0; + continue; + } + int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + src[x+step]*iw10 + src[x+step+cn]*iw11, W_BITS1-5); int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1); int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 + dsrc[dstep+cn2+1]*iw11, W_BITS1); - Iptr[x] = (short)ival; - dIptr[0] = (short)ixval; - dIptr[1] = (short)iyval; - A11 += (float)(ixval*ixval); - A12 += (float)(ixval*iyval); - A22 += (float)(iyval*iyval); + Iptr[x] = (short)ival; + dIptr[0] = (short)ixval; + dIptr[1] = (short)iyval; + A11 += (float)(ixval*ixval); + A12 += (float)(ixval*iyval); + A22 += (float)(iyval*iyval); } #endif } -#ifdef CV_SSE2 +#ifdef CV_SSE4_1 float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4]; _mm_store_ps(A11buf, qA11); _mm_store_ps(A12buf, qA12); @@ -948,18 +910,18 @@ class TrackerInvoker : public cv::ParallelLoopBody A22 += A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; #endif - A11 *= FLT_SCALE; + A11 *= FLT_SCALE; A12 *= 
FLT_SCALE; A22 *= FLT_SCALE; float D = A11*A22 - A12*A12; float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + - 4.f*A12*A12))/(2 * winArea); + 4.f*A12*A12))/(2 * winArea); if( err ) err[ptidx] = (float)minEig; - if( minEig < minEigThreshold || D < FLT_EPSILON ) + if( minEig < minEigThreshold || D < FLT_EPSILON ) { if( level == 0 && status ) status[ptidx] = 0; @@ -969,23 +931,23 @@ class TrackerInvoker : public cv::ParallelLoopBody D = 1.f/D; nextPt -= halfWin; - Point2f prevDelta(0,0); //relates to h(t-1) - __m128i mmMask0, mmMask1, mmMask; - getWBitMask(winSize.width, mmMask0, mmMask1, mmMask); + Point2f prevDelta(0,0); //relates to h(t-1) + __m128i mmMask0, mmMask1, mmMask; + getWBitMask(winSize.width, mmMask0, mmMask1, mmMask); for( j = 0; j < criteria.maxCount; j++ ) { - status[ptidx] = static_cast(j); + status[ptidx] = static_cast(j); inextPt.x = cvFloor(nextPt.x); inextPt.y = cvFloor(nextPt.y); - - if( inextPt.x + halfWin.x < 0 || inextPt.x + halfWin.x>= derivI.cols || - inextPt.y + halfWin.y < 0 || inextPt.y + halfWin.y >= derivI.rows ) - { - if( level == 0 && status ) - status[ptidx] = 3; - break; - } + + if( inextPt.x + halfWin.x < 0 || inextPt.x + halfWin.x>= derivI.cols || + inextPt.y + halfWin.y < 0 || inextPt.y + halfWin.y >= derivI.rows ) + { + if( level == 0 && status ) + status[ptidx] = 3; + break; + } a = nextPt.x - inextPt.x; b = nextPt.y - inextPt.y; @@ -994,30 +956,29 @@ class TrackerInvoker : public cv::ParallelLoopBody iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; float b1 = 0, b2 = 0; -#ifdef CV_SSE2 +#ifdef CV_SSE4_1 qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); __m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps(); - __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps(); - + #endif - for( y = 0; y < winSize.height; y++ ) + for( y = 0; y < winSize.height; y++ ) { const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step 
+ (inextPt.x )*cn; const short* Iptr = (const short*)(IWinBuf.data + ( y)*IWinBuf.step); const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); x = 0; -#ifdef CV_SSE2 - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; +#ifdef CV_SSE4_1 + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; for( ; x <= winSize.width*cn; x += 8, dIptr += 8*2 ) { - if( maskPtr[x ] == 0 && maskPtr[x+1] == 0 && maskPtr[x+2] == 0 && maskPtr[x+3] == 0 - && maskPtr[x+4] == 0 && maskPtr[x+5] == 0 && maskPtr[x+6] == 0 && maskPtr[x+7] == 0) - continue; + if( maskPtr[x ] == 0 && maskPtr[x+1] == 0 && maskPtr[x+2] == 0 && maskPtr[x+3] == 0 + && maskPtr[x+4] == 0 && maskPtr[x+5] == 0 && maskPtr[x+6] == 0 && maskPtr[x+7] == 0) + continue; __m128i diff0 = _mm_loadu_si128((const __m128i*)(Iptr + x)), diff1; - __m128i v00 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x)), z); + __m128i v00 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x)), z); __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); @@ -1029,44 +990,44 @@ class TrackerInvoker : public cv::ParallelLoopBody t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1-5); - __m128i mmDiff_epi16 = _mm_subs_epi16(_mm_packs_epi32(t0, t1), diff0); + __m128i mmDiff_epi16 = _mm_subs_epi16(_mm_packs_epi32(t0, t1), diff0); __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... 
__m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); - - if( x > winSize.width*cn - 8) - { - Ixy_0 = _mm_and_si128(Ixy_0, mmMask0); - Ixy_1 = _mm_and_si128(Ixy_1, mmMask1); - } - - diff1 = _mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13... + if( x > winSize.width*cn - 8) + { + Ixy_0 = _mm_and_si128(Ixy_0, mmMask0); + Ixy_1 = _mm_and_si128(Ixy_1, mmMask1); + } + + + diff1 = _mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13... diff0 = _mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9... - + - v10 = _mm_mullo_epi16(Ixy_0, diff0); + v10 = _mm_mullo_epi16(Ixy_0, diff0); v11 = _mm_mulhi_epi16(Ixy_0, diff0); v00 = _mm_unpacklo_epi16(v10, v11); v10 = _mm_unpackhi_epi16(v10, v11); - qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); - qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); - - // It * Ix It * Iy [4 ... 7] - // for set 1 hi sigma 1 - - v10 = _mm_mullo_epi16(Ixy_1, diff1); - v11 = _mm_mulhi_epi16(Ixy_1, diff1); + qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); + qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); + + // It * Ix It * Iy [4 ... 
7] + // for set 1 hi sigma 1 + + v10 = _mm_mullo_epi16(Ixy_1, diff1); + v11 = _mm_mulhi_epi16(Ixy_1, diff1); v00 = _mm_unpacklo_epi16(v10, v11); v10 = _mm_unpackhi_epi16(v10, v11); - qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); - qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); - } + qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); + qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); + } #else for( ; x < winSize.width*cn; x++, dIptr += 2 ) { - if( dIptr[0] == 0 && dIptr[1] == 0) - continue; + if( dIptr[0] == 0 && dIptr[1] == 0) + continue; int diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, W_BITS1-5) - Iptr[x]; @@ -1076,15 +1037,15 @@ class TrackerInvoker : public cv::ParallelLoopBody } #endif } - -#ifdef CV_SSE2 + +#ifdef CV_SSE4_1 float CV_DECL_ALIGNED(16) bbuf[4]; _mm_store_ps(bbuf, _mm_add_ps(qb0, qb1)); b1 += bbuf[0] + bbuf[2]; b2 += bbuf[1] + bbuf[3]; #endif - b1 *= FLT_SCALE; + b1 *= FLT_SCALE; b2 *= FLT_SCALE; Point2f delta( (float)((A12*b2 - A22*b1) * D), @@ -1096,44 +1057,44 @@ class TrackerInvoker : public cv::ParallelLoopBody nextPts[ptidx] = nextPt + halfWin; if( delta.ddot(delta) <= criteria.epsilon) - break; - if(j > 0 && std::abs(delta.x - prevDelta.x) < 0.01 && - std::abs(delta.y - prevDelta.y) < 0.01) - { - nextPts[ptidx] -= delta*0.5f; - break; - } - - prevDelta = delta; - + break; + if(j > 0 && std::abs(delta.x - prevDelta.x) < 0.01 && + std::abs(delta.y - prevDelta.y) < 0.01) + { + nextPts[ptidx] -= delta*0.5f; + break; + } + + prevDelta = delta; + } - - + + - - + + } } - const Mat* prevImg; - const Mat* nextImg; - const Mat* prevDeriv; - const Mat* rgbPrevImg; - const Mat* rgbNextImg; - const Point2f* prevPts; - Point2f* nextPts; - uchar* status; - float* err; - int maxWinSize; - int minWinSize; - TermCriteria criteria; - int level; - int maxLevel; - int windowType; - float minEigThreshold; - bool useInitialFlow; - int crossSegmentationThreshold; + const Mat* prevImg; + const Mat* nextImg; + const Mat* 
prevDeriv; + const Mat* rgbPrevImg; + const Mat* rgbNextImg; + const Point2f* prevPts; + Point2f* nextPts; + uchar* status; + float* err; + int maxWinSize; + int minWinSize; + TermCriteria criteria; + int level; + int maxLevel; + int windowType; + float minEigThreshold; + bool useInitialFlow; + int crossSegmentationThreshold; }; } } diff --git a/modules/optflow/src/rlof/rlof_invoker.hpp b/modules/optflow/src/rlof/rlof_invoker.hpp index 6dbdfef441e..fb1ad16f353 100644 --- a/modules/optflow/src/rlof/rlof_invoker.hpp +++ b/modules/optflow/src/rlof/rlof_invoker.hpp @@ -1,44 +1,6 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// - // - // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. - // - // By downloading, copying, installing or using the software you agree to this license. - // If you do not agree to this license, do not download, install, - // copy or use the software. - // - // - // License Agreement - // For Open Source Computer Vision Library - // - // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. - // Copyright (C) 2009, Willow Garage Inc., all rights reserved. - // Third party copyrights are property of their respective owners. - // - // Redistribution and use in source and binary forms, with or without modification, - // are permitted provided that the following conditions are met: - // - // * Redistribution's of source code must retain the above copyright notice, - // this list of conditions and the following disclaimer. - // - // * Redistribution's in binary form must reproduce the above copyright notice, - // this list of conditions and the following disclaimer in the documentation - // and/or other materials provided with the distribution. - // - // * The name of the copyright holders may not be used to endorse or promote products - // derived from this software without specific prior written permission. 
- // - // This software is provided by the copyright holders and contributors "as is" and - // any express or implied warranties, including, but not limited to, the implied - // warranties of merchantability and fitness for a particular purpose are disclaimed. - // In no event shall the Intel Corporation or contributors be liable for any direct, - // indirect, incidental, special, exemplary, or consequential damages - // (including, but not limited to, procurement of substitute goods or services; - // loss of use, data, or profits; or business interruption) however caused - // and on any theory of liability, whether in contract, strict liability, - // or tort (including negligence or otherwise) arising in any way out of - // the use of this software, even if advised of the possibility of such damage. - // - //M*/ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. #ifndef _RLOF_INVOKER_HPP_ #define _RLOF_INVOKER_HPP_ #include "rlof_invokerbase.hpp" @@ -50,1462 +12,1447 @@ namespace rlof { namespace ica { - int calc_diff(int diff,float fParam0,float fParam1,float param_2) - { - int abss = (diff < 0) ? 
-diff : diff; - if (abss > fParam1) - { - diff = 0; - } - else if (abss > fParam0 && diff >= 0) - { - //diff = fParam1 * (diff - fParam1); - diff = static_cast(param_2 * (diff - fParam1)); - } - else if (abss > fParam0 && diff < 0) - { - //diff = fParam1 * (diff + fParam1); - diff = static_cast(param_2 * (diff + fParam1)); - - } - return diff; - } - class TrackerInvoker : public cv::ParallelLoopBody - { - public: - TrackerInvoker( - const Mat& _prevImg, - const Mat& _prevDeriv, - const Mat& _nextImg, - const Mat& _rgbPrevImg, - const Mat& _rgbNextImg, - const Point2f* _prevPts, - Point2f* _nextPts, - uchar* _status, - float* _err, - int _level, - int _maxLevel, - int _winSize[2], - int _maxIteration, - bool _useInitialFlow, - int _supportRegionType, - std::vector _normSigmas, - float _minEigenValue, - int _crossSegmentationThreshold) - { - prevImg = &_prevImg; - prevDeriv = &_prevDeriv; - nextImg = &_nextImg; - rgbPrevImg = &_rgbPrevImg; - rgbNextImg = &_rgbNextImg; - prevPts = _prevPts; - nextPts = _nextPts; - status = _status; - err = _err; - minWinSize = _winSize[0]; - maxWinSize = _winSize[1]; - criteria.maxCount = _maxIteration; - criteria.epsilon = 0.01; - level = _level; - maxLevel = _maxLevel; - windowType = _supportRegionType; - minEigThreshold = _minEigenValue; - paramReset = _normSigmas; - useInitialFlow = _useInitialFlow; - crossSegmentationThreshold = _crossSegmentationThreshold; - } - TrackerInvoker & operator=(const TrackerInvoker &) { return *this; }; - void operator()(const cv::Range& range) const - { - cv::Size winSize; - cv::Point2f halfWin; - - const Mat& I = *prevImg; - const Mat& J = *nextImg; - const Mat& derivI = *prevDeriv; - const Mat& BI = *rgbPrevImg; - - winSize = cv::Size(maxWinSize, maxWinSize); - int winMaskwidth = static_cast(ceil(winSize.width / 8.f)) * 16; - cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); - winMaskMatBuf.setTo(1); - - const float FLT_SCALE = (1.f / (1 << 20)); // 20 - - std::vector param = 
paramReset; - int cn = I.channels(), cn2 = cn * 2; - int winbufwidth = static_cast(ceil(winSize.width / 8.f)) * 8; - cv::Size winBufSize(winbufwidth, winbufwidth); - - std::vector _buf(winBufSize.area()*(cn + cn2)); - Mat IWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn), &_buf[0]); - Mat derivIWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn2), &_buf[winBufSize.area()*cn]); - - for (int ptidx = range.start; ptidx < range.end; ptidx++) - { - Point2f prevPt = prevPts[ptidx] * (float)(1. / (1 << level)); - Point2f nextPt; - if (level == maxLevel) - { - if ( useInitialFlow) - nextPt = nextPts[ptidx] * (float)(1. / (1 << level)); - else - nextPt = prevPt; - } - else - nextPt = nextPts[ptidx] * 2.f; - nextPts[ptidx] = nextPt; - - Point2i iprevPt, inextPt; - iprevPt.x = cvFloor(prevPt.x); - iprevPt.y = cvFloor(prevPt.y); - int winArea = maxWinSize * maxWinSize; - cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0, 0, maxWinSize, maxWinSize)); - - if (calcWinMaskMat(BI, windowType, iprevPt, - winMaskMat, winSize, halfWin, winArea, - minWinSize, maxWinSize, crossSegmentationThreshold) == false) - continue; - - - prevPt -= halfWin; - iprevPt.x = cvFloor(prevPt.x); - iprevPt.y = cvFloor(prevPt.y); - - if (iprevPt.x + halfWin.x < 0 || iprevPt.x + halfWin.x >= derivI.cols || - iprevPt.y + halfWin.y < 0 || iprevPt.y + halfWin.y >= derivI.rows) - { - if (level == 0) - { - if (status) - status[ptidx] = 3; - if (err) - err[ptidx] = 0; - } - continue; - } - - float a = prevPt.x - iprevPt.x; - float b = prevPt.y - iprevPt.y; - const int W_BITS = 14, W_BITS1 = 14; - - int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); - int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); - int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); - int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - - int dstep = (int)(derivI.step / derivI.elemSize1()); - int step = (int)(I.step / I.elemSize1()); - int mStep = (int)(winMaskMat.step / winMaskMat.elemSize1()); - CV_Assert(step == (int)(J.step / J.elemSize1())); - float A11 = 0, A12 = 
0, A22 = 0; - float b1 = 0, b2 = 0; - float D = 0; - float minEig; - -#ifdef CV_SSE2 - __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); - __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); - __m128i z = _mm_setzero_si128(); - __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1 - 1)); - __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1 - 5 - 1)); - __m128 qA11 = _mm_setzero_ps(), qA12 = _mm_setzero_ps(), qA22 = _mm_setzero_ps(); - __m128i mmMask4_epi32; - __m128i mmMaskSet_epi16 = _mm_set1_epi16(std::numeric_limits::max()); - get4BitMask(winSize.width, mmMask4_epi32); + inline int calc_diff(int diff,float fParam0,float fParam1,float param_2) + { + int abss = (diff < 0) ? -diff : diff; + if (abss > fParam1) + { + diff = 0; + } + else if (abss > fParam0 && diff >= 0) + { + //diff = fParam1 * (diff - fParam1); + diff = static_cast(param_2 * (diff - fParam1)); + } + else if (abss > fParam0 && diff < 0) + { + //diff = fParam1 * (diff + fParam1); + diff = static_cast(param_2 * (diff + fParam1)); + + } + return diff; + } + class TrackerInvoker : public cv::ParallelLoopBody + { + public: + TrackerInvoker( + const Mat& _prevImg, + const Mat& _prevDeriv, + const Mat& _nextImg, + const Mat& _rgbPrevImg, + const Mat& _rgbNextImg, + const Point2f* _prevPts, + Point2f* _nextPts, + uchar* _status, + float* _err, + int _level, + int _maxLevel, + int _winSize[2], + int _maxIteration, + bool _useInitialFlow, + int _supportRegionType, + std::vector _normSigmas, + float _minEigenValue, + int _crossSegmentationThreshold) + { + prevImg = &_prevImg; + prevDeriv = &_prevDeriv; + nextImg = &_nextImg; + rgbPrevImg = &_rgbPrevImg; + rgbNextImg = &_rgbNextImg; + prevPts = _prevPts; + nextPts = _nextPts; + status = _status; + err = _err; + minWinSize = _winSize[0]; + maxWinSize = _winSize[1]; + criteria.maxCount = _maxIteration; + criteria.epsilon = 0.01; + level = _level; + maxLevel = _maxLevel; + windowType = _supportRegionType; + minEigThreshold = _minEigenValue; + paramReset = _normSigmas; + 
useInitialFlow = _useInitialFlow; + crossSegmentationThreshold = _crossSegmentationThreshold; + } + TrackerInvoker & operator=(const TrackerInvoker &) { return *this; }; + void operator()(const cv::Range& range) const + { + cv::Size winSize; + cv::Point2f halfWin; + + const Mat& I = *prevImg; + const Mat& J = *nextImg; + const Mat& derivI = *prevDeriv; + const Mat& BI = *rgbPrevImg; + + winSize = cv::Size(maxWinSize, maxWinSize); + int winMaskwidth = static_cast(ceil(winSize.width / 8.f)) * 16; + cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); + winMaskMatBuf.setTo(1); + + const float FLT_SCALE = (1.f / (1 << 20)); // 20 + + std::vector param = paramReset; + int cn = I.channels(), cn2 = cn * 2; + int winbufwidth = static_cast(ceil(winSize.width / 8.f)) * 8; + cv::Size winBufSize(winbufwidth, winbufwidth); + + std::vector _buf(winBufSize.area()*(cn + cn2)); + Mat IWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn), &_buf[0]); + Mat derivIWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn2), &_buf[winBufSize.area()*cn]); + + for (int ptidx = range.start; ptidx < range.end; ptidx++) + { + Point2f prevPt = prevPts[ptidx] * (float)(1. / (1 << level)); + Point2f nextPt; + if (level == maxLevel) + { + if ( useInitialFlow) + nextPt = nextPts[ptidx] * (float)(1. 
/ (1 << level)); + else + nextPt = prevPt; + } + else + nextPt = nextPts[ptidx] * 2.f; + nextPts[ptidx] = nextPt; + + Point2i iprevPt, inextPt; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + int winArea = maxWinSize * maxWinSize; + cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0, 0, maxWinSize, maxWinSize)); + + if (calcWinMaskMat(BI, windowType, iprevPt, + winMaskMat, winSize, halfWin, winArea, + minWinSize, maxWinSize, crossSegmentationThreshold) == false) + continue; + + + prevPt -= halfWin; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + + if (iprevPt.x + halfWin.x < 0 || iprevPt.x + halfWin.x >= derivI.cols || + iprevPt.y + halfWin.y < 0 || iprevPt.y + halfWin.y >= derivI.rows) + { + if (level == 0) + { + if (status) + status[ptidx] = 3; + if (err) + err[ptidx] = 0; + } + continue; + } + + float a = prevPt.x - iprevPt.x; + float b = prevPt.y - iprevPt.y; + const int W_BITS = 14, W_BITS1 = 14; + + int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + + int dstep = (int)(derivI.step / derivI.elemSize1()); + int step = (int)(I.step / I.elemSize1()); + int mStep = (int)(winMaskMat.step / winMaskMat.elemSize1()); + CV_Assert(step == (int)(J.step / J.elemSize1())); + float A11 = 0, A12 = 0, A22 = 0; + float b1 = 0, b2 = 0; + float D = 0; + float minEig; + +#ifdef CV_SSE4_1 + __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128i z = _mm_setzero_si128(); + __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1 - 1)); + __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1 - 5 - 1)); + __m128i mmMask4_epi32; + __m128i mmMaskSet_epi16 = _mm_set1_epi16(std::numeric_limits::max()); + get4BitMask(winSize.width, mmMask4_epi32); #endif - // extract the patch from the first image, compute covariation matrix of derivatives - int x, y; - for (y = 0; 
y < winSize.height; y++) - { - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; - const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; - - short* Iptr = (short*)(IWinBuf.data + y * IWinBuf.step); - short* dIptr = (short*)(derivIWinBuf.data + y * derivIWinBuf.step); - - x = 0; - -#ifdef CV_SSE2 - - for (; x <= winSize.width*cn; x += 4, dsrc += 4 * 2, dIptr += 4 * 2) - { - __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16); - __m128i mask_0_3_epi16 = _mm_unpacklo_epi16(mask_0_7_epi16, mask_0_7_epi16); - - __m128i v00, v01, v10, v11, t0, t1; - v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); - v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); - v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); - v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); - - t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); - t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1 - 5); - if (x + 4 > winSize.width) - { - t0 = _mm_and_si128(t0, mmMask4_epi32); - } - t0 = _mm_and_si128(t0, mask_0_3_epi16); - _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0, t0)); - - v00 = _mm_loadu_si128((const __m128i*)(dsrc)); - v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); - v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep)); - v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); - - t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); - t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); - t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); - t1 
= _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); - v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... - if (x + 4 > winSize.width) - { - v00 = _mm_and_si128(v00, mmMask4_epi32); - } - v00 = _mm_and_si128(v00, mask_0_3_epi16); - _mm_storeu_si128((__m128i*)dIptr, v00); - } + // extract the patch from the first image, compute covariation matrix of derivatives + int x, y; + for (y = 0; y < winSize.height; y++) + { + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; + const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; + + short* Iptr = (short*)(IWinBuf.data + y * IWinBuf.step); + short* dIptr = (short*)(derivIWinBuf.data + y * derivIWinBuf.step); + + x = 0; + +#ifdef CV_SSE4_1 + + for (; x <= winSize.width*cn; x += 4, dsrc += 4 * 2, dIptr += 4 * 2) + { + __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16); + __m128i mask_0_3_epi16 = _mm_unpacklo_epi16(mask_0_7_epi16, mask_0_7_epi16); + + __m128i v00, v01, v10, v11, t0, t1; + v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); + v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); + v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); + v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1 - 5); + if (x + 4 > winSize.width) + { + t0 = _mm_and_si128(t0, mmMask4_epi32); + } + t0 = _mm_and_si128(t0, mask_0_3_epi16); + _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0, t0)); + + v00 = _mm_loadu_si128((const __m128i*)(dsrc)); + v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); + v10 = _mm_loadu_si128((const 
__m128i*)(dsrc + dstep)); + v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); + v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... + if (x + 4 > winSize.width) + { + v00 = _mm_and_si128(v00, mmMask4_epi32); + } + v00 = _mm_and_si128(v00, mask_0_3_epi16); + _mm_storeu_si128((__m128i*)dIptr, v00); + } #else - for (; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2) - { - if (maskPtr[x] == 0) - { - dIptr[0] = 0; - dIptr[1] = 0; - continue; - } - int ival = CV_DESCALE(src[x] * iw00 + src[x + cn] * iw01 + - src[x + step] * iw10 + src[x + step + cn] * iw11, W_BITS1 - 5); - int ixval = CV_DESCALE(dsrc[0] * iw00 + dsrc[cn2] * iw01 + - dsrc[dstep] * iw10 + dsrc[dstep + cn2] * iw11, W_BITS1); - int iyval = CV_DESCALE(dsrc[1] * iw00 + dsrc[cn2 + 1] * iw01 + dsrc[dstep + 1] * iw10 + - dsrc[dstep + cn2 + 1] * iw11, W_BITS1); - - Iptr[x] = (short)ival; - dIptr[0] = (short)ixval; - dIptr[1] = (short)iyval; - } + for (; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2) + { + if (maskPtr[x] == 0) + { + dIptr[0] = 0; + dIptr[1] = 0; + continue; + } + int ival = CV_DESCALE(src[x] * iw00 + src[x + cn] * iw01 + + src[x + step] * iw10 + src[x + step + cn] * iw11, W_BITS1 - 5); + int ixval = CV_DESCALE(dsrc[0] * iw00 + dsrc[cn2] * iw01 + + dsrc[dstep] * iw10 + dsrc[dstep + cn2] * iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1] * iw00 + dsrc[cn2 + 1] * iw01 + dsrc[dstep + 1] * iw10 + + dsrc[dstep + cn2 + 1] * iw11, W_BITS1); + + Iptr[x] = (short)ival; + dIptr[0] = (short)ixval; + dIptr[1] = (short)iyval; + } #endif - } - - cv::Mat residualMat = cv::Mat::zeros(winSize.height * (winSize.width + 8) * cn, 1, 
CV_16SC1); - - cv::Point2f backUpNextPt = nextPt; - nextPt -= halfWin; - int j; - __m128i mmMask0, mmMask1, mmMask; - getWBitMask(winSize.width, mmMask0, mmMask1, mmMask); - float MEstimatorScale = 1; - int buffIdx = 0; - cv::Point2f prevDelta(0, 0); - - for (j = 0; j < criteria.maxCount; j++) - { - inextPt.x = cvFloor(nextPt.x); - inextPt.y = cvFloor(nextPt.y); - - if (inextPt.x < -winSize.width || inextPt.x >= J.cols || - inextPt.y < -winSize.height || inextPt.y >= J.rows) - { - if (level == 0 && status) - status[ptidx] = 3; - if (level > 0) - nextPts[ptidx] = backUpNextPt; - break; - } - - - a = nextPt.x - inextPt.x; - b = nextPt.y - inextPt.y; - iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); - iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); - iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); - iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - b1 = 0; - b2 = 0; - - if (j == 0 ) - { - A11 = 0; - A12 = 0; - A22 = 0; - } - - if (j == 0 ) - { - buffIdx = 0; - for (y = 0; y < winSize.height; y++) - { - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x)*cn; - const short* Iptr = (const short*)(IWinBuf.data + (y)*IWinBuf.step); - const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - x = 0; - for (; x < winSize.width*cn; x++, dIptr += 2) - { - if (maskPtr[x] == 0) - continue; - int diff = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + Jptr[x + step] * iw10 + Jptr[x + step + cn] * iw11, W_BITS1 - 5) - Iptr[x]; - residualMat.at(buffIdx++) = static_cast(diff); - } - } - /*! Estimation for the residual */ - cv::Mat residualMatRoi(residualMat, cv::Rect(0, 0, 1, buffIdx)); - MEstimatorScale = (buffIdx == 0) ? 
1.f : estimateScale(residualMatRoi); - } - - float eta = 1.f / winArea; - float fParam0 = param[0] * 32.f; - float fParam1 = param[1] * 32.f; - fParam0 = param[0] * MEstimatorScale; - fParam1 = param[1] * MEstimatorScale; -#ifdef CV_SSE2 - qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); - qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); - __m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps(); - __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps(); - __m128i mmParam0 = _mm_set1_epi16(MIN(std::numeric_limits::max() - 1, static_cast(fParam0))); - __m128i mmParam1 = _mm_set1_epi16(MIN(std::numeric_limits::max() - 1, static_cast(fParam1))); - __m128 mmParam2 = _mm_set1_ps(param[2]); - float s2Val = param[2] > 0 ? param[2] : -param[2]; - int s2bitShift = param[2] == 0 ? 1 : static_cast(ceil(log(200.f / s2Val) / log(2.f))); // zwei stellen nach den komma - __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast(param[2] * (float)(1 << s2bitShift))); - __m128i mmOness_epi16 = _mm_set1_epi16(1 << s2bitShift); - __m128 mmParam2s = _mm_set1_ps(0.01f * param[2]); - __m128 mmParam2s2 = _mm_set1_ps(param[2] * param[2]); - __m128 mmOnes = _mm_set1_ps(1.f); - __m128 mmError0 = _mm_setzero_ps(); - __m128 mmError1 = _mm_setzero_ps(); - __m128i mmEta = _mm_setzero_si128(); - __m128i mmEtaNeg = _mm_set1_epi16(-1); - __m128i mmScale = _mm_set1_epi16(static_cast(MEstimatorScale)); + } + + cv::Mat residualMat = cv::Mat::zeros(winSize.height * (winSize.width + 8) * cn, 1, CV_16SC1); + + cv::Point2f backUpNextPt = nextPt; + nextPt -= halfWin; + int j; + __m128i mmMask0, mmMask1, mmMask; + getWBitMask(winSize.width, mmMask0, mmMask1, mmMask); + float MEstimatorScale = 1; + int buffIdx = 0; + cv::Point2f prevDelta(0, 0); + + for (j = 0; j < criteria.maxCount; j++) + { + inextPt.x = cvFloor(nextPt.x); + inextPt.y = cvFloor(nextPt.y); + + if (inextPt.x < -winSize.width || inextPt.x >= J.cols || + inextPt.y < -winSize.height || inextPt.y >= J.rows) + { + if (level == 0 && 
status) + status[ptidx] = 3; + if (level > 0) + nextPts[ptidx] = backUpNextPt; + break; + } + + + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + b1 = 0; + b2 = 0; + + if (j == 0 ) + { + A11 = 0; + A12 = 0; + A22 = 0; + } + + if (j == 0 ) + { + buffIdx = 0; + for (y = 0; y < winSize.height; y++) + { + const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x)*cn; + const short* Iptr = (const short*)(IWinBuf.data + (y)*IWinBuf.step); + const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + x = 0; + for (; x < winSize.width*cn; x++, dIptr += 2) + { + if (maskPtr[x] == 0) + continue; + int diff = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + Jptr[x + step] * iw10 + Jptr[x + step + cn] * iw11, W_BITS1 - 5) - Iptr[x]; + residualMat.at(buffIdx++) = static_cast(diff); + } + } + /*! Estimation for the residual */ + cv::Mat residualMatRoi(residualMat, cv::Rect(0, 0, 1, buffIdx)); + MEstimatorScale = (buffIdx == 0) ? 1.f : estimateScale(residualMatRoi); + } + + float eta = 1.f / winArea; + float fParam0 = param[0] * 32.f; + float fParam1 = param[1] * 32.f; + fParam0 = param[0] * MEstimatorScale; + fParam1 = param[1] * MEstimatorScale; +#ifdef CV_SSE4_1 + qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps(); + __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps(); + __m128i mmParam0 = _mm_set1_epi16(MIN(std::numeric_limits::max() - 1, static_cast(fParam0))); + __m128i mmParam1 = _mm_set1_epi16(MIN(std::numeric_limits::max() - 1, static_cast(fParam1))); + float s2Val = param[2] > 0 ? param[2] : -param[2]; + int s2bitShift = param[2] == 0 ? 
1 : static_cast(ceil(log(200.f / s2Val) / log(2.f))); // zwei stellen nach den komma + __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast(param[2] * (float)(1 << s2bitShift))); + __m128i mmOness_epi16 = _mm_set1_epi16(1 << s2bitShift); + __m128 mmParam2s = _mm_set1_ps(0.01f * param[2]); + __m128 mmParam2s2 = _mm_set1_ps(param[2] * param[2]); + __m128 mmOnes = _mm_set1_ps(1.f); + __m128i mmEta = _mm_setzero_si128(); + __m128i mmScale = _mm_set1_epi16(static_cast(MEstimatorScale)); #endif - - buffIdx = 0; - for (y = 0; y < winSize.height; y++) - { - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x)*cn; - const short* Iptr = (const short*)(IWinBuf.data + (y)*IWinBuf.step); - const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); - - x = 0; -#ifdef CV_SSE2 - for (; x <= winSize.width*cn; x += 8, dIptr += 8 * 2) - { - __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16); - __m128i diff0, diff1; - __m128i I_0_7_epi16 = _mm_loadu_si128((const __m128i*)(Iptr + x)); // von element 0 bis 7 - - __m128i v00 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x)), z); - __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); - __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); - __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); - - __m128i t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); - __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); - t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1 - 5); - t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1 - 5); - - __m128i mmDiff_epi16 = 
_mm_subs_epi16(_mm_packs_epi32(t0, t1), I_0_7_epi16); - - mmDiff_epi16 = _mm_and_si128(mmDiff_epi16, mask_0_7_epi16); - - - __m128i scalediffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, mmScale); - mmEta = _mm_add_epi16(mmEta, _mm_add_epi16(_mm_and_si128(scalediffIsPos_epi16, _mm_set1_epi16(2)), _mm_set1_epi16(-1))); - - - __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... - __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); - - __m128i abs_epi16 = _mm_abs_epi16(mmDiff_epi16); - __m128i bSet2_epi16, bSet1_epi16; - // |It| < sigma1 ? - bSet2_epi16 = _mm_cmplt_epi16(abs_epi16, mmParam1); - // It > 0 ? - __m128i diffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, _mm_setzero_si128()); - // sigma0 < |It| < sigma1 ? - bSet1_epi16 = _mm_and_si128(bSet2_epi16, _mm_cmpgt_epi16(abs_epi16, mmParam0)); - // val = |It| -/+ sigma1 - __m128i tmpParam1_epi16 = _mm_add_epi16(_mm_and_si128(diffIsPos_epi16, _mm_sub_epi16(mmDiff_epi16, mmParam1)), - _mm_andnot_si128(diffIsPos_epi16, _mm_add_epi16(mmDiff_epi16, mmParam1))); - // It == 0 ? |It| > sigma13 - mmDiff_epi16 = _mm_and_si128(bSet2_epi16, mmDiff_epi16); - // It == val ? sigma0 < |It| < sigma1 - mmDiff_epi16 = _mm_blendv_epi8(mmDiff_epi16, tmpParam1_epi16, bSet1_epi16); - - - __m128i tale_epi16_ = _mm_blendv_epi8(mmOness_epi16, mmParam2_epi16, bSet1_epi16); // mask for 0 - 3 - // diff = diff * sigma2 - __m128i lo = _mm_mullo_epi16(tale_epi16_, mmDiff_epi16); - __m128i hi = _mm_mulhi_epi16(tale_epi16_, mmDiff_epi16); - __m128i diff_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), s2bitShift); //diff 0_3_epi32 - __m128i diff_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), s2bitShift); // diff 4_7_epi32 - - mmDiff_epi16 = _mm_packs_epi32(diff_0_3_epi32, diff_4_7_epi32); - diff1 = _mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13... 
- diff0 = _mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9... - - // Ix * diff / Iy * diff - v10 = _mm_mullo_epi16(Ixy_0, diff0); - v11 = _mm_mulhi_epi16(Ixy_0, diff0); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); - qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); - - // It * Ix It * Iy [4 ... 7] - // for set 1 hi sigma 1 - - v10 = _mm_mullo_epi16(Ixy_1, diff1); - v11 = _mm_mulhi_epi16(Ixy_1, diff1); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); - qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); - - - if (j == 0) - { - __m128 bSet1_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet1_epi16)); - __m128 bSet1_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet1_epi16, bSet1_epi16), 16)); - __m128 mask_0_4_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mask_0_7_epi16)); - __m128 mask_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mask_0_7_epi16, mask_0_7_epi16), 16))); - - __m128 bSet2_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet2_epi16)); - __m128 bSet2_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet2_epi16, bSet2_epi16), 16)); - - __m128 tale_0_3_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_0_3_ps); - __m128 tale_4_7_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_4_7_ps); - tale_0_3_ps = _mm_blendv_ps(mmParam2s, tale_0_3_ps, bSet2_0_3_ps); - tale_4_7_ps = _mm_blendv_ps(mmParam2s, tale_4_7_ps, bSet2_4_7_ps); - - tale_0_3_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_0_3_ps, mask_0_4_ps); - tale_4_7_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_4_7_ps, mask_4_7_ps); - - t0 = _mm_srai_epi32(Ixy_0, 16); // Iy0 Iy1 Iy2 Iy3 - t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16); // Ix0 Ix1 Ix2 Ix3 - - __m128 fy = _mm_cvtepi32_ps(t0); - __m128 fx = _mm_cvtepi32_ps(t1); - - // A11 - A22 - __m128 fxtale = _mm_mul_ps(fx, tale_0_3_ps); - __m128 fytale = 
_mm_mul_ps(fy, tale_0_3_ps); - - mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); - mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); - mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); - - t0 = _mm_srai_epi32(Ixy_1, 16); // Iy8 Iy9 Iy10 Iy11 - t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_1, 16), 16); // Ix0 Ix1 Ix2 Ix3 - - fy = _mm_cvtepi32_ps(t0); - fx = _mm_cvtepi32_ps(t1); - - // A11 - A22 - fxtale = _mm_mul_ps(fx, tale_4_7_ps); - fytale = _mm_mul_ps(fy, tale_4_7_ps); - - mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); - mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); - mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); - } - } + + buffIdx = 0; + for (y = 0; y < winSize.height; y++) + { + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x)*cn; + const short* Iptr = (const short*)(IWinBuf.data + (y)*IWinBuf.step); + const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); + + x = 0; +#ifdef CV_SSE4_1 + for (; x <= winSize.width*cn; x += 8, dIptr += 8 * 2) + { + __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16); + __m128i diff0, diff1; + __m128i I_0_7_epi16 = _mm_loadu_si128((const __m128i*)(Iptr + x)); // von element 0 bis 7 + + __m128i v00 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x)), z); + __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); + __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); + __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); + + __m128i t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + t0 = 
_mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1 - 5); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1 - 5); + + __m128i mmDiff_epi16 = _mm_subs_epi16(_mm_packs_epi32(t0, t1), I_0_7_epi16); + + mmDiff_epi16 = _mm_and_si128(mmDiff_epi16, mask_0_7_epi16); + + + __m128i scalediffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, mmScale); + mmEta = _mm_add_epi16(mmEta, _mm_add_epi16(_mm_and_si128(scalediffIsPos_epi16, _mm_set1_epi16(2)), _mm_set1_epi16(-1))); + + + __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... + __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); + + __m128i abs_epi16 = _mm_abs_epi16(mmDiff_epi16); + __m128i bSet2_epi16, bSet1_epi16; + // |It| < sigma1 ? + bSet2_epi16 = _mm_cmplt_epi16(abs_epi16, mmParam1); + // It > 0 ? + __m128i diffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, _mm_setzero_si128()); + // sigma0 < |It| < sigma1 ? + bSet1_epi16 = _mm_and_si128(bSet2_epi16, _mm_cmpgt_epi16(abs_epi16, mmParam0)); + // val = |It| -/+ sigma1 + __m128i tmpParam1_epi16 = _mm_add_epi16(_mm_and_si128(diffIsPos_epi16, _mm_sub_epi16(mmDiff_epi16, mmParam1)), + _mm_andnot_si128(diffIsPos_epi16, _mm_add_epi16(mmDiff_epi16, mmParam1))); + // It == 0 ? |It| > sigma13 + mmDiff_epi16 = _mm_and_si128(bSet2_epi16, mmDiff_epi16); + // It == val ? 
sigma0 < |It| < sigma1 + mmDiff_epi16 = _mm_blendv_epi8(mmDiff_epi16, tmpParam1_epi16, bSet1_epi16); + + + __m128i tale_epi16_ = _mm_blendv_epi8(mmOness_epi16, mmParam2_epi16, bSet1_epi16); // mask for 0 - 3 + // diff = diff * sigma2 + __m128i lo = _mm_mullo_epi16(tale_epi16_, mmDiff_epi16); + __m128i hi = _mm_mulhi_epi16(tale_epi16_, mmDiff_epi16); + __m128i diff_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), s2bitShift); //diff 0_3_epi32 + __m128i diff_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), s2bitShift); // diff 4_7_epi32 + + mmDiff_epi16 = _mm_packs_epi32(diff_0_3_epi32, diff_4_7_epi32); + diff1 = _mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13... + diff0 = _mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9... + + // Ix * diff / Iy * diff + v10 = _mm_mullo_epi16(Ixy_0, diff0); + v11 = _mm_mulhi_epi16(Ixy_0, diff0); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); + qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); + + // It * Ix It * Iy [4 ... 
7] + // for set 1 hi sigma 1 + + v10 = _mm_mullo_epi16(Ixy_1, diff1); + v11 = _mm_mulhi_epi16(Ixy_1, diff1); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); + qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); + + + if (j == 0) + { + __m128 bSet1_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet1_epi16)); + __m128 bSet1_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet1_epi16, bSet1_epi16), 16)); + __m128 mask_0_4_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mask_0_7_epi16)); + __m128 mask_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mask_0_7_epi16, mask_0_7_epi16), 16))); + + __m128 bSet2_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet2_epi16)); + __m128 bSet2_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet2_epi16, bSet2_epi16), 16)); + + __m128 tale_0_3_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_0_3_ps); + __m128 tale_4_7_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_4_7_ps); + tale_0_3_ps = _mm_blendv_ps(mmParam2s, tale_0_3_ps, bSet2_0_3_ps); + tale_4_7_ps = _mm_blendv_ps(mmParam2s, tale_4_7_ps, bSet2_4_7_ps); + + tale_0_3_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_0_3_ps, mask_0_4_ps); + tale_4_7_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_4_7_ps, mask_4_7_ps); + + t0 = _mm_srai_epi32(Ixy_0, 16); // Iy0 Iy1 Iy2 Iy3 + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16); // Ix0 Ix1 Ix2 Ix3 + + __m128 fy = _mm_cvtepi32_ps(t0); + __m128 fx = _mm_cvtepi32_ps(t1); + + // A11 - A22 + __m128 fxtale = _mm_mul_ps(fx, tale_0_3_ps); + __m128 fytale = _mm_mul_ps(fy, tale_0_3_ps); + + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); + mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); + mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); + + t0 = _mm_srai_epi32(Ixy_1, 16); // Iy8 Iy9 Iy10 Iy11 + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_1, 16), 16); // Ix0 Ix1 Ix2 Ix3 + + fy = _mm_cvtepi32_ps(t0); + fx = _mm_cvtepi32_ps(t1); + + // A11 - A22 + fxtale = _mm_mul_ps(fx, 
tale_4_7_ps); + fytale = _mm_mul_ps(fy, tale_4_7_ps); + + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); + mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); + mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); + } + } #else - for (; x < winSize.width*cn; x++, dIptr += 2) - { - if (maskPtr[x] == 0) - continue; - int diff = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + - Jptr[x + step] * iw10 + Jptr[x + step + cn] * iw11, - W_BITS1 - 5) - Iptr[x]; - - - - - - if (diff > MEstimatorScale) - MEstimatorScale += eta; - if (diff < MEstimatorScale) - MEstimatorScale -= eta; - - calc_diff(diff, fParam0, fParam1, param[2]); - - - float ixval = (float)(dIptr[0]); - float iyval = (float)(dIptr[1]); - b1 += (float)(diff*ixval); - b2 += (float)(diff*iyval); - - if ( j == 0) - { - int abss = (diff < 0) ? -diff : diff; - float tale = param[2] * FLT_RESCALE; - //float tale = fParam1 * FLT_RESCALE; - if (abss < fParam0) - { - tale = FLT_RESCALE; - } - else if (abss > fParam1) - { - tale *= 0.01f; - } - else - { - tale *= param[2]; - } - A11 += (float)(ixval*ixval*tale); - A12 += (float)(ixval*iyval*tale); - A22 += (float)(iyval*iyval*tale); - } - - - } + for (; x < winSize.width*cn; x++, dIptr += 2) + { + if (maskPtr[x] == 0) + continue; + int diff = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + + Jptr[x + step] * iw10 + Jptr[x + step + cn] * iw11, + W_BITS1 - 5) - Iptr[x]; + + + + + + if (diff > MEstimatorScale) + MEstimatorScale += eta; + if (diff < MEstimatorScale) + MEstimatorScale -= eta; + + calc_diff(diff, fParam0, fParam1, param[2]); + + + float ixval = (float)(dIptr[0]); + float iyval = (float)(dIptr[1]); + b1 += (float)(diff*ixval); + b2 += (float)(diff*iyval); + + if ( j == 0) + { + int abss = (diff < 0) ? 
-diff : diff; + float tale = param[2] * FLT_RESCALE; + //float tale = fParam1 * FLT_RESCALE; + if (abss < fParam0) + { + tale = FLT_RESCALE; + } + else if (abss > fParam1) + { + tale *= 0.01f; + } + else + { + tale *= param[2]; + } + A11 += (float)(ixval*ixval*tale); + A12 += (float)(ixval*iyval*tale); + A22 += (float)(iyval*iyval*tale); + } + + + } #endif - } - -#ifdef CV_SSE2 - short etaValues[8]; - _mm_storeu_si128((__m128i*)(etaValues), mmEta); - MEstimatorScale += eta * (etaValues[0] + etaValues[1] + etaValues[2] + etaValues[3] - + etaValues[4] + etaValues[5] + etaValues[6] + etaValues[7]); - - + } + +#ifdef CV_SSE4_1 + short etaValues[8]; + _mm_storeu_si128((__m128i*)(etaValues), mmEta); + MEstimatorScale += eta * (etaValues[0] + etaValues[1] + etaValues[2] + etaValues[3] + + etaValues[4] + etaValues[5] + etaValues[6] + etaValues[7]); + + #endif - if (j == 0) - { -#ifdef CV_SSE2 - float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];// + if (j == 0) + { +#ifdef CV_SSE4_1 + float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];// - _mm_store_ps(A11buf, mmAxx); - _mm_store_ps(A12buf, mmAxy); - _mm_store_ps(A22buf, mmAyy); + _mm_store_ps(A11buf, mmAxx); + _mm_store_ps(A12buf, mmAxy); + _mm_store_ps(A22buf, mmAyy); - A11 = A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; - A12 = A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; - A22 = A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; + A11 = A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; + A12 = A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; + A22 = A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; #endif - A11 *= FLT_SCALE; - A12 *= FLT_SCALE; - A22 *= FLT_SCALE; - - - D = A11 * A22 - A12 * A12; - minEig = (A22 + A11 - std::sqrt((A11 - A22)*(A11 - A22) + - 4.f*A12*A12)) / (2 * winArea); - D = 1.f / D; - - if (minEig < minEigThreshold || std::abs(D) < FLT_EPSILON) - { - if (level == 0 && status) - status[ptidx] = 0; - if (level > 0) - { - nextPts[ptidx] = backUpNextPt; - } - break; - } - } - -#ifdef CV_SSE2 - 
float CV_DECL_ALIGNED(16) bbuf[4]; - _mm_store_ps(bbuf, _mm_add_ps(qb0, qb1)); - b1 += bbuf[0] + bbuf[2]; - b2 += bbuf[1] + bbuf[3]; + A11 *= FLT_SCALE; + A12 *= FLT_SCALE; + A22 *= FLT_SCALE; + + + D = A11 * A22 - A12 * A12; + minEig = (A22 + A11 - std::sqrt((A11 - A22)*(A11 - A22) + + 4.f*A12*A12)) / (2 * winArea); + D = 1.f / D; + + if (minEig < minEigThreshold || std::abs(D) < FLT_EPSILON) + { + if (level == 0 && status) + status[ptidx] = 0; + if (level > 0) + { + nextPts[ptidx] = backUpNextPt; + } + break; + } + } + +#ifdef CV_SSE4_1 + float CV_DECL_ALIGNED(16) bbuf[4]; + _mm_store_ps(bbuf, _mm_add_ps(qb0, qb1)); + b1 += bbuf[0] + bbuf[2]; + b2 += bbuf[1] + bbuf[3]; #endif - b1 *= FLT_SCALE; - b2 *= FLT_SCALE; - - - Point2f delta((float)((A12*b2 - A22 * b1) * D), (float)((A12*b1 - A11 * b2) * D)); - - delta.x = (delta.x != delta.x) ? 0 : delta.x; - delta.y = (delta.y != delta.y) ? 0 : delta.y; - - nextPt += delta * 0.7; - nextPts[ptidx] = nextPt + halfWin; - - if (j > 0 && std::abs(delta.x - prevDelta.x) < 0.01 && - std::abs(delta.y - prevDelta.y) < 0.01) - { - nextPts[ptidx] -= delta * 0.5f; - break; - } - - prevDelta = delta; - - } - - } - - } - - const Mat* prevImg; - const Mat* nextImg; - const Mat* prevDeriv; - const Mat* rgbPrevImg; - const Mat* rgbNextImg; - const Point2f* prevPts; - Point2f* nextPts; - uchar* status; - float* err; - int maxWinSize; - int minWinSize; - TermCriteria criteria; - int level; - int maxLevel; - int windowType; - float minEigThreshold; - bool useInitialFlow; - std::vector paramReset; - int crossSegmentationThreshold; - }; + b1 *= FLT_SCALE; + b2 *= FLT_SCALE; + + + Point2f delta((float)((A12*b2 - A22 * b1) * D), (float)((A12*b1 - A11 * b2) * D)); + + delta.x = (delta.x != delta.x) ? 0 : delta.x; + delta.y = (delta.y != delta.y) ? 
0 : delta.y; + + nextPt += delta * 0.7; + nextPts[ptidx] = nextPt + halfWin; + + if (j > 0 && std::abs(delta.x - prevDelta.x) < 0.01 && + std::abs(delta.y - prevDelta.y) < 0.01) + { + nextPts[ptidx] -= delta * 0.5f; + break; + } + + prevDelta = delta; + + } + + } + + } + + const Mat* prevImg; + const Mat* nextImg; + const Mat* prevDeriv; + const Mat* rgbPrevImg; + const Mat* rgbNextImg; + const Point2f* prevPts; + Point2f* nextPts; + uchar* status; + float* err; + int maxWinSize; + int minWinSize; + TermCriteria criteria; + int level; + int maxLevel; + int windowType; + float minEigThreshold; + bool useInitialFlow; + std::vector paramReset; + int crossSegmentationThreshold; + }; } namespace radial { - class TrackerInvoker : public cv::ParallelLoopBody - { - public: - TrackerInvoker( - const Mat& _prevImg, - const Mat& _prevDeriv, - const Mat& _nextImg, - const Mat& _rgbPrevImg, - const Mat& _rgbNextImg, - const Point2f* _prevPts, - Point2f* _nextPts, - uchar* _status, - float* _err, - Point2f* _gainVecs, - int _level, - int _maxLevel, - int _winSize[2], - int _maxIteration, - bool _useInitialFlow, - int _supportRegionType, - std::vector _normSigmas, - float _minEigenValue, - int _crossSegmentationThreshold) - { - prevImg = &_prevImg; - prevDeriv = &_prevDeriv; - nextImg = &_nextImg; - rgbPrevImg = &_rgbPrevImg; - rgbNextImg = &_rgbNextImg; - prevPts = _prevPts; - nextPts = _nextPts; - status = _status; - err = _err; - gainVecs = _gainVecs; - minWinSize = _winSize[0]; - maxWinSize = _winSize[1]; - criteria.maxCount = _maxIteration; - criteria.epsilon = 0.01; - level = _level; - maxLevel = _maxLevel; - windowType = _supportRegionType; - minEigThreshold = _minEigenValue; - paramReset = _normSigmas; - useInitialFlow = _useInitialFlow; - crossSegmentationThreshold = _crossSegmentationThreshold; - } - - TrackerInvoker & operator=(const TrackerInvoker &) { return *this; }; - void operator()(const cv::Range& range) const - { - Point2f halfWin; - cv::Size winSize; - const 
Mat& I = *prevImg; - const Mat& J = *nextImg; - const Mat& derivI = *prevDeriv; - const Mat& BI = *rgbPrevImg; - - - const float FLT_SCALE = (1.f / (1 << 16)); - - winSize = cv::Size(maxWinSize, maxWinSize); - int winMaskwidth = static_cast(ceil(winSize.width / 16.f)) * 16; - cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); - winMaskMatBuf.setTo(1); - - std::vector param = paramReset; - int cn = I.channels(), cn2 = cn * 2; - int winbufwidth = static_cast(ceil(winSize.width / 16.f)) * 16; - cv::Size winBufSize(winbufwidth, winbufwidth); - - cv::Mat invTensorMat(4, 4, CV_32FC1); - cv::Mat mismatchMat(4, 1, CV_32FC1); - cv::Mat resultMat(4, 1, CV_32FC1); - - - std::vector _buf(winBufSize.area()*(cn + cn2)); - Mat IWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn), &_buf[0]); - Mat derivIWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn2), &_buf[winBufSize.area()*cn]); - - for (int ptidx = range.start; ptidx < range.end; ptidx++) - { - - Point2f prevPt = prevPts[ptidx] * (float)(1. / (1 << level)); - Point2f nextPt; - if (level == maxLevel) - { - if (useInitialFlow) - { - nextPt = nextPts[ptidx] * (float)(1. 
/ (1 << level)); - } - else - nextPt = prevPt; - } - else - { - nextPt = nextPts[ptidx] * 2.f; - - } - nextPts[ptidx] = nextPt; - - Point2i iprevPt, inextPt; - iprevPt.x = cvFloor(prevPt.x); - iprevPt.y = cvFloor(prevPt.y); - int winArea = maxWinSize * maxWinSize; - cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0, 0, maxWinSize, maxWinSize)); - winMaskMatBuf.setTo(0); - if (calcWinMaskMat(BI, windowType, iprevPt, - winMaskMat, winSize, halfWin, winArea, - minWinSize, maxWinSize, crossSegmentationThreshold) == false) - continue; - - prevPt -= halfWin; - iprevPt.x = cvFloor(prevPt.x); - iprevPt.y = cvFloor(prevPt.y); - - if (iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols || - iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows) - { - if (level == 0) - { - if (status) - status[ptidx] = 3; - if (err) - err[ptidx] = 0; - } - continue; - } - - float a = prevPt.x - iprevPt.x; - float b = prevPt.y - iprevPt.y; - const int W_BITS = 14, W_BITS1 = 14; - - int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); - int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); - int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); - int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - - int dstep = (int)(derivI.step / derivI.elemSize1()); - int step = (int)(I.step / I.elemSize1()); - int mStep = (int)(winMaskMat.step / winMaskMat.elemSize1()); - CV_Assert(step == (int)(J.step / J.elemSize1())); - - float A11 = 0, A12 = 0, A22 = 0; - float b1 = 0, b2 = 0, b3 = 0, b4 = 0; - // tensor - float sumIx = 0; - float sumIy = 0; - float sumI = 0; - float sumW = 0; - float w1 = 0, w2 = 0; // -IyI - float dI = 0; // I^2 - float D = 0; - -#ifdef CV_SSE2 - __m128i rightMask = _mm_set_epi16(0, std::numeric_limits::max(), 0, std::numeric_limits::max(), 0, std::numeric_limits::max(), 0, std::numeric_limits::max()); - __m128i leftMask = _mm_set_epi16(std::numeric_limits::max(), 0, std::numeric_limits::max(), 0, std::numeric_limits::max(), 0, std::numeric_limits::max(), 0); - __m128i qw0 = _mm_set1_epi32(iw00 + 
(iw01 << 16)); - __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); - __m128i z = _mm_setzero_si128(); - __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1 - 1)); - __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1 - 5 - 1)); - __m128i mmMask4_epi32; - __m128i mmMaskSet_epi16 = _mm_set1_epi16(std::numeric_limits::max()); - get4BitMask(winSize.width, mmMask4_epi32); - __m128 qA11 = _mm_setzero_ps(), qA12 = _mm_setzero_ps(), qA22 = _mm_setzero_ps(); + class TrackerInvoker : public cv::ParallelLoopBody + { + public: + TrackerInvoker( + const Mat& _prevImg, + const Mat& _prevDeriv, + const Mat& _nextImg, + const Mat& _rgbPrevImg, + const Mat& _rgbNextImg, + const Point2f* _prevPts, + Point2f* _nextPts, + uchar* _status, + float* _err, + Point2f* _gainVecs, + int _level, + int _maxLevel, + int _winSize[2], + int _maxIteration, + bool _useInitialFlow, + int _supportRegionType, + std::vector _normSigmas, + float _minEigenValue, + int _crossSegmentationThreshold) + { + prevImg = &_prevImg; + prevDeriv = &_prevDeriv; + nextImg = &_nextImg; + rgbPrevImg = &_rgbPrevImg; + rgbNextImg = &_rgbNextImg; + prevPts = _prevPts; + nextPts = _nextPts; + status = _status; + err = _err; + gainVecs = _gainVecs; + minWinSize = _winSize[0]; + maxWinSize = _winSize[1]; + criteria.maxCount = _maxIteration; + criteria.epsilon = 0.01; + level = _level; + maxLevel = _maxLevel; + windowType = _supportRegionType; + minEigThreshold = _minEigenValue; + paramReset = _normSigmas; + useInitialFlow = _useInitialFlow; + crossSegmentationThreshold = _crossSegmentationThreshold; + } + + TrackerInvoker & operator=(const TrackerInvoker &) { return *this; }; + void operator()(const cv::Range& range) const + { + Point2f halfWin; + cv::Size winSize; + const Mat& I = *prevImg; + const Mat& J = *nextImg; + const Mat& derivI = *prevDeriv; + const Mat& BI = *rgbPrevImg; + + + const float FLT_SCALE = (1.f / (1 << 16)); + + winSize = cv::Size(maxWinSize, maxWinSize); + int winMaskwidth = static_cast(ceil(winSize.width / 
16.f)) * 16; + cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); + winMaskMatBuf.setTo(1); + + std::vector param = paramReset; + int cn = I.channels(), cn2 = cn * 2; + int winbufwidth = static_cast(ceil(winSize.width / 16.f)) * 16; + cv::Size winBufSize(winbufwidth, winbufwidth); + + cv::Mat invTensorMat(4, 4, CV_32FC1); + cv::Mat mismatchMat(4, 1, CV_32FC1); + cv::Mat resultMat(4, 1, CV_32FC1); + + + std::vector _buf(winBufSize.area()*(cn + cn2)); + Mat IWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn), &_buf[0]); + Mat derivIWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn2), &_buf[winBufSize.area()*cn]); + + for (int ptidx = range.start; ptidx < range.end; ptidx++) + { + + Point2f prevPt = prevPts[ptidx] * (float)(1. / (1 << level)); + Point2f nextPt; + if (level == maxLevel) + { + if (useInitialFlow) + { + nextPt = nextPts[ptidx] * (float)(1. / (1 << level)); + } + else + nextPt = prevPt; + } + else + { + nextPt = nextPts[ptidx] * 2.f; + + } + nextPts[ptidx] = nextPt; + + Point2i iprevPt, inextPt; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + int winArea = maxWinSize * maxWinSize; + cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0, 0, maxWinSize, maxWinSize)); + winMaskMatBuf.setTo(0); + if (calcWinMaskMat(BI, windowType, iprevPt, + winMaskMat, winSize, halfWin, winArea, + minWinSize, maxWinSize, crossSegmentationThreshold) == false) + continue; + + prevPt -= halfWin; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + + if (iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols || + iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows) + { + if (level == 0) + { + if (status) + status[ptidx] = 3; + if (err) + err[ptidx] = 0; + } + continue; + } + + float a = prevPt.x - iprevPt.x; + float b = prevPt.y - iprevPt.y; + const int W_BITS = 14, W_BITS1 = 14; + + int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + int iw11 = (1 << 
W_BITS) - iw00 - iw01 - iw10; + + int dstep = (int)(derivI.step / derivI.elemSize1()); + int step = (int)(I.step / I.elemSize1()); + int mStep = (int)(winMaskMat.step / winMaskMat.elemSize1()); + CV_Assert(step == (int)(J.step / J.elemSize1())); + + float A11 = 0, A12 = 0, A22 = 0; + float b1 = 0, b2 = 0, b3 = 0, b4 = 0; + // tensor + float sumIx = 0; + float sumIy = 0; + float sumI = 0; + float sumW = 0; + float w1 = 0, w2 = 0; // -IyI + float dI = 0; // I^2 + float D = 0; + +#ifdef CV_SSE4_1 + __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128i z = _mm_setzero_si128(); + __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1 - 1)); + __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1 - 5 - 1)); + __m128i mmMask4_epi32; + __m128i mmMaskSet_epi16 = _mm_set1_epi16(std::numeric_limits::max()); + get4BitMask(winSize.width, mmMask4_epi32); #endif - // extract the patch from the first image, compute covariation matrix of derivatives - int x, y; - for (y = 0; y < winSize.height; y++) - { - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; - const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; - - short* Iptr = (short*)(IWinBuf.data + y * IWinBuf.step); - short* dIptr = (short*)(derivIWinBuf.data + y * derivIWinBuf.step); - - x = 0; - -#ifdef CV_SSE2 - for (; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4 * 2, dIptr += 4 * 2) - { - __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16); - __m128i mask_0_3_epi16 = _mm_unpacklo_epi16(mask_0_7_epi16, mask_0_7_epi16); - __m128i v00, v01, v10, v11, t0, t1; - v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); - v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); - v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + 
step)), z); - v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); - - t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); - t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1 - 5); - if (x + 4 > winSize.width) - { - t0 = _mm_and_si128(t0, mmMask4_epi32); - } - t0 = _mm_and_si128(t0, mask_0_3_epi16); - _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0, t0)); - - - v00 = _mm_loadu_si128((const __m128i*)(dsrc)); - v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); - v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep)); - v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); - - t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); - t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); - t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); - t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); - v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... 
- if (x + 4 > winSize.width) - { - v00 = _mm_and_si128(v00, mmMask4_epi32); - } - v00 = _mm_and_si128(v00, mask_0_3_epi16); - _mm_storeu_si128((__m128i*)dIptr, v00); - } + // extract the patch from the first image, compute covariation matrix of derivatives + int x, y; + for (y = 0; y < winSize.height; y++) + { + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; + const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; + + short* Iptr = (short*)(IWinBuf.data + y * IWinBuf.step); + short* dIptr = (short*)(derivIWinBuf.data + y * derivIWinBuf.step); + + x = 0; + +#ifdef CV_SSE4_1 + for (; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4 * 2, dIptr += 4 * 2) + { + __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16); + __m128i mask_0_3_epi16 = _mm_unpacklo_epi16(mask_0_7_epi16, mask_0_7_epi16); + __m128i v00, v01, v10, v11, t0, t1; + v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); + v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); + v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); + v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1 - 5); + if (x + 4 > winSize.width) + { + t0 = _mm_and_si128(t0, mmMask4_epi32); + } + t0 = _mm_and_si128(t0, mask_0_3_epi16); + _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0, t0)); + + + v00 = _mm_loadu_si128((const __m128i*)(dsrc)); + v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); + v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep)); + v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); + + t0 = 
_mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); + v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... + if (x + 4 > winSize.width) + { + v00 = _mm_and_si128(v00, mmMask4_epi32); + } + v00 = _mm_and_si128(v00, mask_0_3_epi16); + _mm_storeu_si128((__m128i*)dIptr, v00); + } #else - for (; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2) - { - if (winMaskMat.at(y, x) == 0) - { - dIptr[0] = 0; - dIptr[1] = 0; - continue; - } - int ival = CV_DESCALE(src[x] * iw00 + src[x + cn] * iw01 + - src[x + step] * iw10 + src[x + step + cn] * iw11, W_BITS1 - 5); - int ixval = CV_DESCALE(dsrc[0] * iw00 + dsrc[cn2] * iw01 + - dsrc[dstep] * iw10 + dsrc[dstep + cn2] * iw11, W_BITS1); - int iyval = CV_DESCALE(dsrc[1] * iw00 + dsrc[cn2 + 1] * iw01 + dsrc[dstep + 1] * iw10 + - dsrc[dstep + cn2 + 1] * iw11, W_BITS1); - - Iptr[x] = (short)ival; - dIptr[0] = (short)ixval; - dIptr[1] = (short)iyval; - - } + for (; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2) + { + if (winMaskMat.at(y, x) == 0) + { + dIptr[0] = 0; + dIptr[1] = 0; + continue; + } + int ival = CV_DESCALE(src[x] * iw00 + src[x + cn] * iw01 + + src[x + step] * iw10 + src[x + step + cn] * iw11, W_BITS1 - 5); + int ixval = CV_DESCALE(dsrc[0] * iw00 + dsrc[cn2] * iw01 + + dsrc[dstep] * iw10 + dsrc[dstep + cn2] * iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1] * iw00 + dsrc[cn2 + 1] * iw01 + dsrc[dstep + 1] * iw10 + + dsrc[dstep + cn2 + 1] * iw11, W_BITS1); + + Iptr[x] = (short)ival; + dIptr[0] = (short)ixval; + dIptr[1] = (short)iyval; + + } #endif - } - - cv::Mat residualMat = cv::Mat::zeros(winSize.height * (winSize.width + 8) * cn, 1, CV_16SC1); - - cv::Point2f backUpNextPt = nextPt; - nextPt -= halfWin; - 
Point2f prevDelta(0, 0); //related to h(t-1) - Point2f prevGain(0, 0); - cv::Point2f gainVec = gainVecs[ptidx]; - cv::Point2f backUpGain = gainVec; - cv::Size _winSize = winSize; - int j; - __m128i mmMask0, mmMask1, mmMask; - getWBitMask(_winSize.width, mmMask0, mmMask1, mmMask); - __m128 mmOnes = _mm_set1_ps(1.f); - __m128i mmOnes_epi16 = _mm_set1_epi16(1); - __m128i mmZeros_epi16 = _mm_set1_epi16(0); - float MEstimatorScale = 1; - int buffIdx = 0; - float minEigValue, maxEigValue; - - for (j = 0; j < criteria.maxCount; j++) - { - - inextPt.x = cvFloor(nextPt.x); - inextPt.y = cvFloor(nextPt.y); - - if (inextPt.x < -winSize.width || inextPt.x >= J.cols || - inextPt.y < -winSize.height || inextPt.y >= J.rows) - { - if (level == 0 && status) - status[ptidx] = 3; - if (level > 0) - { - nextPts[ptidx] = backUpNextPt; - gainVecs[ptidx] = backUpGain; - } - break; - } - - a = nextPt.x - inextPt.x; - b = nextPt.y - inextPt.y; - iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); - iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); - iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); - iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - - // mismatch - b1 = 0; - b2 = 0; - b3 = 0; - b4 = 0; - if (j == 0) - { - // tensor - w1 = 0; // -IxI - w2 = 0; // -IyI - dI = 0; // I^2 - sumIx = 0; - sumIy = 0; - sumI = 0; - sumW = 0; - A11 = 0; - A12 = 0; - A22 = 0; - } - - if (j == 0 ) - { - buffIdx = 0; - for (y = 0; y < winSize.height; y++) - { - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x)*cn; - const short* Iptr = (const short*)(IWinBuf.data + (y)*IWinBuf.step); - const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); - x = 0; - for (; x < winSize.width*cn; x++, dIptr += 2) - { - if (dIptr[0] == 0 && dIptr[1] == 0) - continue; - int diff = static_cast(CV_DESCALE( Jptr[x] * iw00 + - Jptr[x + cn] * iw01 + - Jptr[x + step] * iw10 + - Jptr[x + step + cn] * iw11, W_BITS1 - 5) - - Iptr[x] + Iptr[x] * gainVec.x + gainVec.y); - residualMat.at(buffIdx++) = 
static_cast(diff); - } - } - /*! Estimation for the residual */ - cv::Mat residualMatRoi(residualMat, cv::Rect(0, 0, 1, buffIdx)); - MEstimatorScale = (buffIdx == 0) ? 1.f : estimateScale(residualMatRoi); - } - - float eta = 1.f / winArea; - float fParam0 = param[0] * 32.f; - float fParam1 = param[1] * 32.f; - fParam0 = param[0] * MEstimatorScale; - fParam1 = param[1] * MEstimatorScale; - -#ifdef CV_SSE2 - - qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); - qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); - __m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps(), qb2 = _mm_setzero_ps(); - __m128 qb3 = _mm_setzero_ps(); - __m128 mmSumW1 = _mm_setzero_ps(), mmSumW2 = _mm_setzero_ps(); - __m128 mmSumI = _mm_setzero_ps(), mmSumW = _mm_setzero_ps(), mmSumDI = _mm_setzero_ps(); - __m128 mmSumIy = _mm_setzero_ps(), mmSumIx = _mm_setzero_ps(); - __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps(); - __m128i mmParam0 = _mm_set1_epi16(MIN(std::numeric_limits::max() - 1, static_cast(fParam0))); - __m128i mmParam1 = _mm_set1_epi16(MIN(std::numeric_limits::max() - 1, static_cast(fParam1))); - - - float s2Val = param[2] > 0 ? param[2] : -param[2]; - int s2bitShift = param[2] == 0 ? 1 : static_cast(ceil(log(200.f / s2Val) / log(2.f))); // zwei stellen nach den komma - __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast(param[2] * (float)(1 << s2bitShift))); - __m128i mmOness_epi16 = _mm_set1_epi16(1 << s2bitShift); - __m128 mmParam2 = _mm_set1_ps(param[2]); - __m128 mmParam2s = _mm_set1_ps(0.01f * param[2]); - __m128 mmParam2s2 = _mm_set1_ps(param[2] * param[2]); - - float gainVal = gainVec.x > 0 ? gainVec.x : -gainVec.x; - int bitShift = gainVec.x == 0 ? 
1 : static_cast(ceil(log(200.f / gainVal) / log(2.f))); // zwei stellen nach den komma - - __m128i mmGainValue_epi16 = _mm_set1_epi16(static_cast(gainVec.x * (float)(1 << bitShift))); - __m128i mmConstValue_epi16 = _mm_set1_epi16(static_cast(gainVec.y)); - __m128i mmEta = _mm_setzero_si128(); - __m128i mmEtaNeg = _mm_set1_epi16(-1); - __m128i mmScale = _mm_set1_epi16(static_cast(MEstimatorScale)); + } + + cv::Mat residualMat = cv::Mat::zeros(winSize.height * (winSize.width + 8) * cn, 1, CV_16SC1); + + cv::Point2f backUpNextPt = nextPt; + nextPt -= halfWin; + Point2f prevDelta(0, 0); //related to h(t-1) + Point2f prevGain(0, 0); + cv::Point2f gainVec = gainVecs[ptidx]; + cv::Point2f backUpGain = gainVec; + cv::Size _winSize = winSize; + int j; + __m128i mmMask0, mmMask1, mmMask; + getWBitMask(_winSize.width, mmMask0, mmMask1, mmMask); + __m128 mmOnes = _mm_set1_ps(1.f); + float MEstimatorScale = 1; + int buffIdx = 0; + float minEigValue; + + for (j = 0; j < criteria.maxCount; j++) + { + + inextPt.x = cvFloor(nextPt.x); + inextPt.y = cvFloor(nextPt.y); + + if (inextPt.x < -winSize.width || inextPt.x >= J.cols || + inextPt.y < -winSize.height || inextPt.y >= J.rows) + { + if (level == 0 && status) + status[ptidx] = 3; + if (level > 0) + { + nextPts[ptidx] = backUpNextPt; + gainVecs[ptidx] = backUpGain; + } + break; + } + + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + + // mismatch + b1 = 0; + b2 = 0; + b3 = 0; + b4 = 0; + if (j == 0) + { + // tensor + w1 = 0; // -IxI + w2 = 0; // -IyI + dI = 0; // I^2 + sumIx = 0; + sumIy = 0; + sumI = 0; + sumW = 0; + A11 = 0; + A12 = 0; + A22 = 0; + } + + if (j == 0 ) + { + buffIdx = 0; + for (y = 0; y < winSize.height; y++) + { + const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x)*cn; + const short* Iptr = 
(const short*)(IWinBuf.data + (y)*IWinBuf.step); + const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); + x = 0; + for (; x < winSize.width*cn; x++, dIptr += 2) + { + if (dIptr[0] == 0 && dIptr[1] == 0) + continue; + int diff = static_cast(CV_DESCALE( Jptr[x] * iw00 + + Jptr[x + cn] * iw01 + + Jptr[x + step] * iw10 + + Jptr[x + step + cn] * iw11, W_BITS1 - 5) + - Iptr[x] + Iptr[x] * gainVec.x + gainVec.y); + residualMat.at(buffIdx++) = static_cast(diff); + } + } + /*! Estimation for the residual */ + cv::Mat residualMatRoi(residualMat, cv::Rect(0, 0, 1, buffIdx)); + MEstimatorScale = (buffIdx == 0) ? 1.f : estimateScale(residualMatRoi); + } + + float eta = 1.f / winArea; + float fParam0 = param[0] * 32.f; + float fParam1 = param[1] * 32.f; + fParam0 = param[0] * MEstimatorScale; + fParam1 = param[1] * MEstimatorScale; + +#ifdef CV_SSE4_1 + + qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps(), qb2 = _mm_setzero_ps(); + __m128 qb3 = _mm_setzero_ps(); + __m128 mmSumW1 = _mm_setzero_ps(), mmSumW2 = _mm_setzero_ps(); + __m128 mmSumI = _mm_setzero_ps(), mmSumW = _mm_setzero_ps(), mmSumDI = _mm_setzero_ps(); + __m128 mmSumIy = _mm_setzero_ps(), mmSumIx = _mm_setzero_ps(); + __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps(); + __m128i mmParam0 = _mm_set1_epi16(MIN(std::numeric_limits::max() - 1, static_cast(fParam0))); + __m128i mmParam1 = _mm_set1_epi16(MIN(std::numeric_limits::max() - 1, static_cast(fParam1))); + + + float s2Val = param[2] > 0 ? param[2] : -param[2]; + int s2bitShift = param[2] == 0 ? 
1 : static_cast(ceil(log(200.f / s2Val) / log(2.f))); // zwei stellen nach den komma + __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast(param[2] * (float)(1 << s2bitShift))); + __m128i mmOness_epi16 = _mm_set1_epi16(1 << s2bitShift); + __m128 mmParam2s = _mm_set1_ps(0.01f * param[2]); + __m128 mmParam2s2 = _mm_set1_ps(param[2] * param[2]); + + float gainVal = gainVec.x > 0 ? gainVec.x : -gainVec.x; + int bitShift = gainVec.x == 0 ? 1 : static_cast(ceil(log(200.f / gainVal) / log(2.f))); // zwei stellen nach den komma + + __m128i mmGainValue_epi16 = _mm_set1_epi16(static_cast(gainVec.x * (float)(1 << bitShift))); + __m128i mmConstValue_epi16 = _mm_set1_epi16(static_cast(gainVec.y)); + __m128i mmEta = _mm_setzero_si128(); + __m128i mmScale = _mm_set1_epi16(static_cast(MEstimatorScale)); #endif - buffIdx = 0; - - for (y = 0; y < _winSize.height; y++) - { - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x)*cn; - const short* Iptr = (const short*)(IWinBuf.data + y * IWinBuf.step); - const short* dIptr = (const short*)(derivIWinBuf.data + y * derivIWinBuf.step); - - x = 0; -#ifdef CV_SSE2 - - - for (; x <= _winSize.width*cn; x += 8, dIptr += 8 * 2) - { - __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16); - __m128i diff0, diff1; - __m128i I_0_7_epi16 = _mm_loadu_si128((const __m128i*)(Iptr + x)); // von element 0 bis 7 - __m128i v00 = _mm_unpacklo_epi8( - _mm_loadl_epi64((const __m128i*)(Jptr + x)) , z); //J0 , 0, J1, 0, J2, 0 ... 
J7,0 - __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); - __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); - __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); - - __m128i t0 = _mm_add_epi32 - (_mm_madd_epi16( - _mm_unpacklo_epi16(v00, v01), //J0, , J1, J1, J2, J2, ... J3 , J4 - qw0), - _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); - __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); - - t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1 - 5); - t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1 - 5); - - - __m128i lo = _mm_mullo_epi16(mmGainValue_epi16, I_0_7_epi16); - __m128i hi = _mm_mulhi_epi16(mmGainValue_epi16, I_0_7_epi16); - - __m128i Igain_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), bitShift); - __m128i Igain_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), bitShift); - __m128i Igain_epi16 = _mm_packs_epi32(Igain_0_3_epi32, Igain_4_7_epi32); - - // diff = J - I + I*gain.x + gain.y - __m128i mmDiff_epi16 = _mm_add_epi16( - _mm_subs_epi16(_mm_packs_epi32(t0, t1), I_0_7_epi16), // J - I - _mm_add_epi16(Igain_epi16, mmConstValue_epi16)); - - - mmDiff_epi16 = _mm_and_si128(mmDiff_epi16, mask_0_7_epi16); - - - __m128i scalediffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, mmScale); - mmEta = _mm_add_epi16(mmEta, _mm_add_epi16(_mm_and_si128(scalediffIsPos_epi16, _mm_set1_epi16(2)), _mm_set1_epi16(-1))); - - - __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... - __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); - - - __m128i abs_epi16 = _mm_abs_epi16(mmDiff_epi16); - __m128i bSet2_epi16, bSet1_epi16; - // |It| < sigma1 ? - bSet2_epi16 = _mm_cmplt_epi16(abs_epi16, mmParam1); - // It > 0 ? - __m128i diffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, _mm_setzero_si128()); - // sigma0 < |It| < sigma1 ? 
- bSet1_epi16 = _mm_and_si128(bSet2_epi16, _mm_cmpgt_epi16(abs_epi16, mmParam0)); - // val = |It| -/+ sigma1 - __m128i tmpParam1_epi16 = _mm_add_epi16(_mm_and_si128(diffIsPos_epi16, _mm_sub_epi16(mmDiff_epi16, mmParam1)), - _mm_andnot_si128(diffIsPos_epi16, _mm_add_epi16(mmDiff_epi16, mmParam1))); - // It == 0 ? |It| > sigma13 - mmDiff_epi16 = _mm_and_si128(bSet2_epi16, mmDiff_epi16); - // It == val ? sigma0 < |It| < sigma1 - mmDiff_epi16 = _mm_blendv_epi8(mmDiff_epi16, tmpParam1_epi16, bSet1_epi16); - - - __m128i tale_epi16_ = _mm_blendv_epi8(mmOness_epi16, mmParam2_epi16, bSet1_epi16); // mask for 0 - 3 - // diff = diff * sigma2 - lo = _mm_mullo_epi16(tale_epi16_, mmDiff_epi16); - hi = _mm_mulhi_epi16(tale_epi16_, mmDiff_epi16); - __m128i diff_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), s2bitShift); //diff 0_3_epi32 - __m128i diff_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), s2bitShift); // diff 4_7_epi32 - - mmDiff_epi16 = _mm_packs_epi32(diff_0_3_epi32, diff_4_7_epi32); // ,da das ergebniss kleiner als 16 bit sein sollte - - diff1 = _mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13... - diff0 = _mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9... - - // Ix * diff / Iy * diff - v10 = _mm_mullo_epi16(Ixy_0, diff0); - v11 = _mm_mulhi_epi16(Ixy_0, diff0); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); - qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); - - // It * Ix It * Iy [4 ... 7] - // for set 1 hi sigma 1 - - v10 = _mm_mullo_epi16(Ixy_1, diff1); - v11 = _mm_mulhi_epi16(Ixy_1, diff1); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); - qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); - // diff * J [0 ... 
7] - // for set 1 sigma 1 - // b3 += diff * Iptr[x] - - v10 = _mm_mullo_epi16(mmDiff_epi16, I_0_7_epi16); - v11 = _mm_mulhi_epi16(mmDiff_epi16, I_0_7_epi16); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qb2 = _mm_add_ps(qb2, _mm_cvtepi32_ps(v00)); - qb2 = _mm_add_ps(qb2, _mm_cvtepi32_ps(v10)); - - qb3 = _mm_add_ps(qb3, _mm_cvtepi32_ps(diff_0_3_epi32)); - qb3 = _mm_add_ps(qb3, _mm_cvtepi32_ps(diff_4_7_epi32)); - - if (j == 0) - { - __m128 bSet1_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet1_epi16)); - __m128 bSet1_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet1_epi16, bSet1_epi16), 16)); - __m128 mask_0_4_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mask_0_7_epi16)); - __m128 mask_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mask_0_7_epi16, mask_0_7_epi16), 16))); - - __m128 bSet2_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet2_epi16)); - __m128 bSet2_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet2_epi16, bSet2_epi16), 16)); - - __m128 tale_0_3_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_0_3_ps); - __m128 tale_4_7_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_4_7_ps); - tale_0_3_ps = _mm_blendv_ps(mmParam2s, tale_0_3_ps, bSet2_0_3_ps); - tale_4_7_ps = _mm_blendv_ps(mmParam2s, tale_4_7_ps, bSet2_4_7_ps); - - tale_0_3_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_0_3_ps, mask_0_4_ps); - tale_4_7_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_4_7_ps, mask_4_7_ps); - - t0 = _mm_srai_epi32(Ixy_0, 16); // Iy0 Iy1 Iy2 Iy3 - t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16); // Ix0 Ix1 Ix2 Ix3 - - __m128 fy = _mm_cvtepi32_ps(t0); - __m128 fx = _mm_cvtepi32_ps(t1); - - // 0 ... 
3 - __m128 I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(I_0_7_epi16, I_0_7_epi16), 16)); - - // A11 - A22 - __m128 fxtale = _mm_mul_ps(fx, tale_0_3_ps); - __m128 fytale = _mm_mul_ps(fy, tale_0_3_ps); - - mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); - mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); - mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); - - // sumIx und sumIy - mmSumIx = _mm_add_ps(mmSumIx, fxtale); - mmSumIy = _mm_add_ps(mmSumIy, fytale); - - mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fxtale)); - mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fytale)); - - // sumI - __m128 I_tale_ps = _mm_mul_ps(I_ps, tale_0_3_ps); - mmSumI = _mm_add_ps(mmSumI, I_tale_ps); - - // sumW - mmSumW = _mm_add_ps(mmSumW, tale_0_3_ps); - - // sumDI - mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps(I_ps, I_tale_ps)); - - t0 = _mm_srai_epi32(Ixy_1, 16); // Iy8 Iy9 Iy10 Iy11 - t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_1, 16), 16); // Ix0 Ix1 Ix2 Ix3 - - fy = _mm_cvtepi32_ps(t0); - fx = _mm_cvtepi32_ps(t1); - - // 4 ... 
7 - I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(I_0_7_epi16, I_0_7_epi16), 16)); - - // A11 - A22 - fxtale = _mm_mul_ps(fx, tale_4_7_ps); - fytale = _mm_mul_ps(fy, tale_4_7_ps); - - mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); - mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); - mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); - - // sumIx und sumIy - mmSumIx = _mm_add_ps(mmSumIx, fxtale); - mmSumIy = _mm_add_ps(mmSumIy, fytale); - - mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fxtale)); - mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fytale)); - - // sumI - I_tale_ps = _mm_mul_ps(I_ps, tale_4_7_ps); - mmSumI = _mm_add_ps(mmSumI, I_tale_ps); - - // sumW - mmSumW = _mm_add_ps(mmSumW, tale_4_7_ps); - - // sumDI - mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps(I_ps, I_tale_ps)); - } + buffIdx = 0; + + for (y = 0; y < _winSize.height; y++) + { + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x)*cn; + const short* Iptr = (const short*)(IWinBuf.data + y * IWinBuf.step); + const short* dIptr = (const short*)(derivIWinBuf.data + y * derivIWinBuf.step); + + x = 0; +#ifdef CV_SSE4_1 + + + for (; x <= _winSize.width*cn; x += 8, dIptr += 8 * 2) + { + __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16); + __m128i diff0, diff1; + __m128i I_0_7_epi16 = _mm_loadu_si128((const __m128i*)(Iptr + x)); // von element 0 bis 7 + __m128i v00 = _mm_unpacklo_epi8( + _mm_loadl_epi64((const __m128i*)(Jptr + x)) , z); //J0 , 0, J1, 0, J2, 0 ... 
J7,0 + __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); + __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); + __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); + + __m128i t0 = _mm_add_epi32 + (_mm_madd_epi16( + _mm_unpacklo_epi16(v00, v01), //J0, , J1, J1, J2, J2, ... J3 , J4 + qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1 - 5); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1 - 5); + + + __m128i lo = _mm_mullo_epi16(mmGainValue_epi16, I_0_7_epi16); + __m128i hi = _mm_mulhi_epi16(mmGainValue_epi16, I_0_7_epi16); + + __m128i Igain_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), bitShift); + __m128i Igain_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), bitShift); + __m128i Igain_epi16 = _mm_packs_epi32(Igain_0_3_epi32, Igain_4_7_epi32); + + // diff = J - I + I*gain.x + gain.y + __m128i mmDiff_epi16 = _mm_add_epi16( + _mm_subs_epi16(_mm_packs_epi32(t0, t1), I_0_7_epi16), // J - I + _mm_add_epi16(Igain_epi16, mmConstValue_epi16)); + + + mmDiff_epi16 = _mm_and_si128(mmDiff_epi16, mask_0_7_epi16); + + + __m128i scalediffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, mmScale); + mmEta = _mm_add_epi16(mmEta, _mm_add_epi16(_mm_and_si128(scalediffIsPos_epi16, _mm_set1_epi16(2)), _mm_set1_epi16(-1))); + + + __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... + __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); + + + __m128i abs_epi16 = _mm_abs_epi16(mmDiff_epi16); + __m128i bSet2_epi16, bSet1_epi16; + // |It| < sigma1 ? + bSet2_epi16 = _mm_cmplt_epi16(abs_epi16, mmParam1); + // It > 0 ? + __m128i diffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, _mm_setzero_si128()); + // sigma0 < |It| < sigma1 ? 
+ bSet1_epi16 = _mm_and_si128(bSet2_epi16, _mm_cmpgt_epi16(abs_epi16, mmParam0)); + // val = |It| -/+ sigma1 + __m128i tmpParam1_epi16 = _mm_add_epi16(_mm_and_si128(diffIsPos_epi16, _mm_sub_epi16(mmDiff_epi16, mmParam1)), + _mm_andnot_si128(diffIsPos_epi16, _mm_add_epi16(mmDiff_epi16, mmParam1))); + // It == 0 ? |It| > sigma13 + mmDiff_epi16 = _mm_and_si128(bSet2_epi16, mmDiff_epi16); + // It == val ? sigma0 < |It| < sigma1 + mmDiff_epi16 = _mm_blendv_epi8(mmDiff_epi16, tmpParam1_epi16, bSet1_epi16); + + + __m128i tale_epi16_ = _mm_blendv_epi8(mmOness_epi16, mmParam2_epi16, bSet1_epi16); // mask for 0 - 3 + // diff = diff * sigma2 + lo = _mm_mullo_epi16(tale_epi16_, mmDiff_epi16); + hi = _mm_mulhi_epi16(tale_epi16_, mmDiff_epi16); + __m128i diff_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), s2bitShift); //diff 0_3_epi32 + __m128i diff_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), s2bitShift); // diff 4_7_epi32 + + mmDiff_epi16 = _mm_packs_epi32(diff_0_3_epi32, diff_4_7_epi32); // ,da das ergebniss kleiner als 16 bit sein sollte + + diff1 = _mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13... + diff0 = _mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9... + + // Ix * diff / Iy * diff + v10 = _mm_mullo_epi16(Ixy_0, diff0); + v11 = _mm_mulhi_epi16(Ixy_0, diff0); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); + qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); + + // It * Ix It * Iy [4 ... 7] + // for set 1 hi sigma 1 + + v10 = _mm_mullo_epi16(Ixy_1, diff1); + v11 = _mm_mulhi_epi16(Ixy_1, diff1); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); + qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); + // diff * J [0 ... 
7] + // for set 1 sigma 1 + // b3 += diff * Iptr[x] + + v10 = _mm_mullo_epi16(mmDiff_epi16, I_0_7_epi16); + v11 = _mm_mulhi_epi16(mmDiff_epi16, I_0_7_epi16); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb2 = _mm_add_ps(qb2, _mm_cvtepi32_ps(v00)); + qb2 = _mm_add_ps(qb2, _mm_cvtepi32_ps(v10)); + + qb3 = _mm_add_ps(qb3, _mm_cvtepi32_ps(diff_0_3_epi32)); + qb3 = _mm_add_ps(qb3, _mm_cvtepi32_ps(diff_4_7_epi32)); + + if (j == 0) + { + __m128 bSet1_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet1_epi16)); + __m128 bSet1_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet1_epi16, bSet1_epi16), 16)); + __m128 mask_0_4_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mask_0_7_epi16)); + __m128 mask_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mask_0_7_epi16, mask_0_7_epi16), 16))); + + __m128 bSet2_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet2_epi16)); + __m128 bSet2_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet2_epi16, bSet2_epi16), 16)); + + __m128 tale_0_3_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_0_3_ps); + __m128 tale_4_7_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_4_7_ps); + tale_0_3_ps = _mm_blendv_ps(mmParam2s, tale_0_3_ps, bSet2_0_3_ps); + tale_4_7_ps = _mm_blendv_ps(mmParam2s, tale_4_7_ps, bSet2_4_7_ps); + + tale_0_3_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_0_3_ps, mask_0_4_ps); + tale_4_7_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_4_7_ps, mask_4_7_ps); + + t0 = _mm_srai_epi32(Ixy_0, 16); // Iy0 Iy1 Iy2 Iy3 + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16); // Ix0 Ix1 Ix2 Ix3 + + __m128 fy = _mm_cvtepi32_ps(t0); + __m128 fx = _mm_cvtepi32_ps(t1); + + // 0 ... 
3 + __m128 I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(I_0_7_epi16, I_0_7_epi16), 16)); + + // A11 - A22 + __m128 fxtale = _mm_mul_ps(fx, tale_0_3_ps); + __m128 fytale = _mm_mul_ps(fy, tale_0_3_ps); + + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); + mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); + mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); + + // sumIx und sumIy + mmSumIx = _mm_add_ps(mmSumIx, fxtale); + mmSumIy = _mm_add_ps(mmSumIy, fytale); + + mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fxtale)); + mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fytale)); + + // sumI + __m128 I_tale_ps = _mm_mul_ps(I_ps, tale_0_3_ps); + mmSumI = _mm_add_ps(mmSumI, I_tale_ps); + + // sumW + mmSumW = _mm_add_ps(mmSumW, tale_0_3_ps); + + // sumDI + mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps(I_ps, I_tale_ps)); + + t0 = _mm_srai_epi32(Ixy_1, 16); // Iy8 Iy9 Iy10 Iy11 + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_1, 16), 16); // Ix0 Ix1 Ix2 Ix3 + + fy = _mm_cvtepi32_ps(t0); + fx = _mm_cvtepi32_ps(t1); + + // 4 ... 
7 + I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(I_0_7_epi16, I_0_7_epi16), 16)); + + // A11 - A22 + fxtale = _mm_mul_ps(fx, tale_4_7_ps); + fytale = _mm_mul_ps(fy, tale_4_7_ps); + + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); + mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); + mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); + + // sumIx und sumIy + mmSumIx = _mm_add_ps(mmSumIx, fxtale); + mmSumIy = _mm_add_ps(mmSumIy, fytale); + + mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fxtale)); + mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fytale)); + + // sumI + I_tale_ps = _mm_mul_ps(I_ps, tale_4_7_ps); + mmSumI = _mm_add_ps(mmSumI, I_tale_ps); + + // sumW + mmSumW = _mm_add_ps(mmSumW, tale_4_7_ps); + + // sumDI + mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps(I_ps, I_tale_ps)); + } - } + } #else - for (; x < _winSize.width*cn; x++, dIptr += 2) - { - if (maskPtr[x] == 0) - continue; - - float J = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + - Jptr[x + step] * iw10 + Jptr[x + step + cn] * iw11, - W_BITS1 - 5); - short ixval = static_cast(dIptr[0]); - short iyval = static_cast(dIptr[1]); - int diff = J - static_cast(Iptr[x]) + Iptr[x] * gainVec.x + gainVec.y; - int abss = (diff < 0) ? 
-diff : diff; - if (diff > MEstimatorScale) - MEstimatorScale += eta; - if (diff < MEstimatorScale) - MEstimatorScale -= eta; - // compute the missmatch vector - - if (abss > fParam1) - { - diff = 0.f; - } - else if (abss > fParam0 && diff >= 0) - { - diff = param[2] * (diff - fParam1); - } - else if (abss > fParam0 && diff < 0) - { - diff = param[2] * (diff + fParam1); - } - - - b1 += (float)(diff*ixval); - b2 += (float)(diff*iyval); ; - b3 += (float)(diff)* Iptr[x]; - b4 += (float)(diff); - - - // compute the Gradient Matrice - if (j == 0) - { - float tale = param[2] * FLT_RESCALE; - if (abss < fParam0 || j < 0) - { - tale = FLT_RESCALE; - } - else if (abss > fParam1) - { - tale *= 0.01f; - } - else - { - tale *= param[2]; - } - if (j == 0) - { - A11 += (float)(ixval*ixval)*tale; - A12 += (float)(ixval*iyval)*tale; - A22 += (float)(iyval*iyval)*tale; - } - - dI += Iptr[x] * Iptr[x] * tale; - float dx = static_cast(dIptr[0]) * tale; - float dy = static_cast(dIptr[1]) * tale; - sumIx += dx; - sumIy += dy; - w1 += dx * Iptr[x]; - w2 += dy * Iptr[x]; - sumI += Iptr[x] * tale; - sumW += tale; - } - - } + for (; x < _winSize.width*cn; x++, dIptr += 2) + { + if (maskPtr[x] == 0) + continue; + + float J = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + + Jptr[x + step] * iw10 + Jptr[x + step + cn] * iw11, + W_BITS1 - 5); + short ixval = static_cast(dIptr[0]); + short iyval = static_cast(dIptr[1]); + int diff = J - static_cast(Iptr[x]) + Iptr[x] * gainVec.x + gainVec.y; + int abss = (diff < 0) ? 
-diff : diff; + if (diff > MEstimatorScale) + MEstimatorScale += eta; + if (diff < MEstimatorScale) + MEstimatorScale -= eta; + // compute the missmatch vector + + if (abss > fParam1) + { + diff = 0.f; + } + else if (abss > fParam0 && diff >= 0) + { + diff = param[2] * (diff - fParam1); + } + else if (abss > fParam0 && diff < 0) + { + diff = param[2] * (diff + fParam1); + } + + + b1 += (float)(diff*ixval); + b2 += (float)(diff*iyval); ; + b3 += (float)(diff)* Iptr[x]; + b4 += (float)(diff); + + + // compute the Gradient Matrice + if (j == 0) + { + float tale = param[2] * FLT_RESCALE; + if (abss < fParam0 || j < 0) + { + tale = FLT_RESCALE; + } + else if (abss > fParam1) + { + tale *= 0.01f; + } + else + { + tale *= param[2]; + } + if (j == 0) + { + A11 += (float)(ixval*ixval)*tale; + A12 += (float)(ixval*iyval)*tale; + A22 += (float)(iyval*iyval)*tale; + } + + dI += Iptr[x] * Iptr[x] * tale; + float dx = static_cast(dIptr[0]) * tale; + float dy = static_cast(dIptr[1]) * tale; + sumIx += dx; + sumIy += dy; + w1 += dx * Iptr[x]; + w2 += dy * Iptr[x]; + sumI += Iptr[x] * tale; + sumW += tale; + } + + } #endif - } - - -#ifdef CV_SSE2 - short etaValues[8]; - _mm_storeu_si128((__m128i*)(etaValues), mmEta); - MEstimatorScale += eta * (etaValues[0] + etaValues[1] + etaValues[2] + etaValues[3] - + etaValues[4] + etaValues[5] + etaValues[6] + etaValues[7]); - -#endif - float CV_DECL_ALIGNED(32) wbuf[4];// - if (j == 0) - { -#ifdef CV_SSE2 - _mm_store_ps(wbuf, mmSumW1); - w1 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumW2); - w2 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumDI); - dI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumI); - sumI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumIx); - sumIx = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumIy); - sumIy = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumW); - sumW = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; 
+ } + + +#ifdef CV_SSE4_1 + short etaValues[8]; + _mm_storeu_si128((__m128i*)(etaValues), mmEta); + MEstimatorScale += eta * (etaValues[0] + etaValues[1] + etaValues[2] + etaValues[3] + + etaValues[4] + etaValues[5] + etaValues[6] + etaValues[7]); + +#endif + float CV_DECL_ALIGNED(32) wbuf[4];// + if (j == 0) + { +#ifdef CV_SSE4_1 + _mm_store_ps(wbuf, mmSumW1); + w1 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumW2); + w2 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumDI); + dI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumI); + sumI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumIx); + sumIx = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumIy); + sumIy = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumW); + sumW = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; #endif - sumIx *= -FLT_SCALE; - sumIy *= -FLT_SCALE; - sumI *= FLT_SCALE; - sumW *= FLT_SCALE; - w1 *= -FLT_SCALE; - w2 *= -FLT_SCALE; - dI *= FLT_SCALE; + sumIx *= -FLT_SCALE; + sumIy *= -FLT_SCALE; + sumI *= FLT_SCALE; + sumW *= FLT_SCALE; + w1 *= -FLT_SCALE; + w2 *= -FLT_SCALE; + dI *= FLT_SCALE; -#ifdef CV_SSE2 - float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];// +#ifdef CV_SSE4_1 - _mm_store_ps(A11buf, mmAxx); - _mm_store_ps(A12buf, mmAxy); - _mm_store_ps(A22buf, mmAyy); + float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];// + _mm_store_ps(A11buf, mmAxx); + _mm_store_ps(A12buf, mmAxy); + _mm_store_ps(A22buf, mmAyy); - A11 = A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; - A12 = A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; - A22 = A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; + + A11 = A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; + A12 = A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; + A22 = A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; #endif - A11 *= FLT_SCALE; - A12 *= FLT_SCALE; - A22 *= FLT_SCALE; - - - } - - -#ifdef CV_SSE2 - float CV_DECL_ALIGNED(16) bbuf[4]; - 
_mm_store_ps(bbuf, _mm_add_ps(qb0, qb1)); - b1 = bbuf[0] + bbuf[2]; - b2 = bbuf[1] + bbuf[3]; - _mm_store_ps(bbuf, qb2); - b3 = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; - _mm_store_ps(bbuf, qb3); - b4 = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; - + A11 *= FLT_SCALE; + A12 *= FLT_SCALE; + A22 *= FLT_SCALE; + + + } + + +#ifdef CV_SSE4_1 + float CV_DECL_ALIGNED(16) bbuf[4]; + _mm_store_ps(bbuf, _mm_add_ps(qb0, qb1)); + b1 = bbuf[0] + bbuf[2]; + b2 = bbuf[1] + bbuf[3]; + _mm_store_ps(bbuf, qb2); + b3 = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; + _mm_store_ps(bbuf, qb3); + b4 = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; + #endif - mismatchMat.at(0, 0) = b1 * FLT_SCALE; - mismatchMat.at(1, 0) = b2 * FLT_SCALE; - mismatchMat.at(2, 0) = -b3 * FLT_SCALE; - mismatchMat.at(3, 0) = -b4 * FLT_SCALE; - - D = -A12 * A12*sumI*sumI + dI * sumW*A12*A12 + 2 * A12*sumI*sumIx*w2 + 2 * A12*sumI*sumIy*w1 - - 2 * dI*A12*sumIx*sumIy - 2 * sumW*A12*w1*w2 + A11 * A22*sumI*sumI - 2 * A22*sumI*sumIx*w1 - - 2 * A11*sumI*sumIy*w2 - sumIx * sumIx*w2*w2 + A22 * dI*sumIx*sumIx + 2 * sumIx*sumIy*w1*w2 - - sumIy * sumIy*w1*w1 + A11 * dI*sumIy*sumIy + A22 * sumW*w1*w1 + A11 * sumW*w2*w2 - A11 * A22*dI*sumW; - - float sqrtVal = std::sqrt((A11 - A22)*(A11 - A22) + 4.f*A12*A12); - minEigValue = (A22 + A11 - sqrtVal) / (2.0f*winArea);; - maxEigValue = (A22 + A11 + sqrtVal) / (2.0f*winArea); - if (minEigValue < minEigThreshold || std::abs(D) < FLT_EPSILON) - { - if (level == 0 && status) - status[ptidx] = 0; - if (level > 0) - { - nextPts[ptidx] = backUpNextPt; - gainVecs[ptidx] = backUpGain; - } - break; - } - - D = (1.f / D); - - invTensorMat.at(0, 0) = (A22*sumI*sumI - 2 * sumI*sumIy*w2 + dI * sumIy*sumIy + sumW * w2*w2 - A22 * dI*sumW)* D; invTensorMat.at(0, 1) = (A12*dI*sumW - A12 * sumI * sumI - dI * sumIx*sumIy + sumI * sumIx*w2 + sumI * sumIy*w1 - sumW * w1*w2)* D; - invTensorMat.at(0, 2) = (A12*sumI*sumIy - sumIy * sumIy*w1 - A22 * sumI*sumIx - A12 * sumW*w2 + A22 * sumW*w1 + sumIx * sumIy*w2)* D; - 
invTensorMat.at(0, 3) = (A22*dI*sumIx - A12 * dI*sumIy - sumIx * w2*w2 + A12 * sumI*w2 - A22 * sumI*w1 + sumIy * w1*w2) * D; - invTensorMat.at(1, 0) = invTensorMat.at(0, 1); - invTensorMat.at(1, 1) = (A11*sumI * sumI - 2 * sumI*sumIx*w1 + dI * sumIx * sumIx + sumW * w1*w1 - A11 * dI*sumW) * D; - invTensorMat.at(1, 2) = (A12*sumI*sumIx - A11 * sumI*sumIy - sumIx * sumIx*w2 + A11 * sumW*w2 - A12 * sumW*w1 + sumIx * sumIy*w1) * D; - invTensorMat.at(1, 3) = (A11*dI*sumIy - sumIy * w1*w1 - A12 * dI*sumIx - A11 * sumI*w2 + A12 * sumI*w1 + sumIx * w1*w2)* D; - invTensorMat.at(2, 0) = invTensorMat.at(0, 2); - invTensorMat.at(2, 1) = invTensorMat.at(1, 2); - invTensorMat.at(2, 2) = (sumW*A12*A12 - 2 * A12*sumIx*sumIy + A22 * sumIx*sumIx + A11 * sumIy*sumIy - A11 * A22*sumW)* D; - invTensorMat.at(2, 3) = (A11*A22*sumI - A12 * A12*sumI - A11 * sumIy*w2 + A12 * sumIx*w2 + A12 * sumIy*w1 - A22 * sumIx*w1)* D; - invTensorMat.at(3, 0) = invTensorMat.at(0, 3); - invTensorMat.at(3, 1) = invTensorMat.at(1, 3); - invTensorMat.at(3, 2) = invTensorMat.at(2, 3); - invTensorMat.at(3, 3) = (dI*A12*A12 - 2 * A12*w1*w2 + A22 * w1*w1 + A11 * w2*w2 - A11 * A22*dI)* D; - - resultMat = invTensorMat * mismatchMat; - - - Point2f delta(-resultMat.at(0), -resultMat.at(1)); - Point2f deltaGain(resultMat.at(2), resultMat.at(3)); - - - - - if (j == 0) - prevGain = deltaGain; - gainVec += deltaGain * 0.8; - nextPt += delta * 0.8; - nextPts[ptidx] = nextPt + halfWin; - gainVecs[ptidx] = gainVec; - - - - cv::Point2f totalDistance = backUpNextPt - nextPts[ptidx]; - - if (// j > 1 && - (std::abs(delta.x - prevDelta.x) < 0.01 && std::abs(delta.y - prevDelta.y) < 0.01) - || ((delta.ddot(delta) <= 0.001) && std::abs(prevGain.x - deltaGain.x) < 0.01) - ) - { - nextPts[ptidx] -= delta * 0.5f; - gainVecs[ptidx] -= deltaGain * 0.5f; - break; - } - - prevDelta = delta; - prevGain = deltaGain; - } - - } - - } - - const Mat* prevImg; - const Mat* nextImg; - const Mat* prevDeriv; - const Mat* rgbPrevImg; - const Mat* 
rgbNextImg; - const Point2f* prevPts; - Point2f* nextPts; - uchar* status; - cv::Point2f* gainVecs; - float* err; - int maxWinSize; - int minWinSize; - TermCriteria criteria; - int level; - int maxLevel; - int windowType; - float minEigThreshold; - bool useInitialFlow; - std::vector paramReset; - int crossSegmentationThreshold; - }; + mismatchMat.at(0, 0) = b1 * FLT_SCALE; + mismatchMat.at(1, 0) = b2 * FLT_SCALE; + mismatchMat.at(2, 0) = -b3 * FLT_SCALE; + mismatchMat.at(3, 0) = -b4 * FLT_SCALE; + + D = -A12 * A12*sumI*sumI + dI * sumW*A12*A12 + 2 * A12*sumI*sumIx*w2 + 2 * A12*sumI*sumIy*w1 + - 2 * dI*A12*sumIx*sumIy - 2 * sumW*A12*w1*w2 + A11 * A22*sumI*sumI - 2 * A22*sumI*sumIx*w1 + - 2 * A11*sumI*sumIy*w2 - sumIx * sumIx*w2*w2 + A22 * dI*sumIx*sumIx + 2 * sumIx*sumIy*w1*w2 + - sumIy * sumIy*w1*w1 + A11 * dI*sumIy*sumIy + A22 * sumW*w1*w1 + A11 * sumW*w2*w2 - A11 * A22*dI*sumW; + + float sqrtVal = std::sqrt((A11 - A22)*(A11 - A22) + 4.f*A12*A12); + minEigValue = (A22 + A11 - sqrtVal) / (2.0f*winArea); + if (minEigValue < minEigThreshold || std::abs(D) < FLT_EPSILON) + { + if (level == 0 && status) + status[ptidx] = 0; + if (level > 0) + { + nextPts[ptidx] = backUpNextPt; + gainVecs[ptidx] = backUpGain; + } + break; + } + + D = (1.f / D); + + invTensorMat.at(0, 0) = (A22*sumI*sumI - 2 * sumI*sumIy*w2 + dI * sumIy*sumIy + sumW * w2*w2 - A22 * dI*sumW)* D; invTensorMat.at(0, 1) = (A12*dI*sumW - A12 * sumI * sumI - dI * sumIx*sumIy + sumI * sumIx*w2 + sumI * sumIy*w1 - sumW * w1*w2)* D; + invTensorMat.at(0, 2) = (A12*sumI*sumIy - sumIy * sumIy*w1 - A22 * sumI*sumIx - A12 * sumW*w2 + A22 * sumW*w1 + sumIx * sumIy*w2)* D; + invTensorMat.at(0, 3) = (A22*dI*sumIx - A12 * dI*sumIy - sumIx * w2*w2 + A12 * sumI*w2 - A22 * sumI*w1 + sumIy * w1*w2) * D; + invTensorMat.at(1, 0) = invTensorMat.at(0, 1); + invTensorMat.at(1, 1) = (A11*sumI * sumI - 2 * sumI*sumIx*w1 + dI * sumIx * sumIx + sumW * w1*w1 - A11 * dI*sumW) * D; + invTensorMat.at(1, 2) = (A12*sumI*sumIx - A11 * 
sumI*sumIy - sumIx * sumIx*w2 + A11 * sumW*w2 - A12 * sumW*w1 + sumIx * sumIy*w1) * D; + invTensorMat.at(1, 3) = (A11*dI*sumIy - sumIy * w1*w1 - A12 * dI*sumIx - A11 * sumI*w2 + A12 * sumI*w1 + sumIx * w1*w2)* D; + invTensorMat.at(2, 0) = invTensorMat.at(0, 2); + invTensorMat.at(2, 1) = invTensorMat.at(1, 2); + invTensorMat.at(2, 2) = (sumW*A12*A12 - 2 * A12*sumIx*sumIy + A22 * sumIx*sumIx + A11 * sumIy*sumIy - A11 * A22*sumW)* D; + invTensorMat.at(2, 3) = (A11*A22*sumI - A12 * A12*sumI - A11 * sumIy*w2 + A12 * sumIx*w2 + A12 * sumIy*w1 - A22 * sumIx*w1)* D; + invTensorMat.at(3, 0) = invTensorMat.at(0, 3); + invTensorMat.at(3, 1) = invTensorMat.at(1, 3); + invTensorMat.at(3, 2) = invTensorMat.at(2, 3); + invTensorMat.at(3, 3) = (dI*A12*A12 - 2 * A12*w1*w2 + A22 * w1*w1 + A11 * w2*w2 - A11 * A22*dI)* D; + + resultMat = invTensorMat * mismatchMat; + + + Point2f delta(-resultMat.at(0), -resultMat.at(1)); + Point2f deltaGain(resultMat.at(2), resultMat.at(3)); + + + + + if (j == 0) + prevGain = deltaGain; + gainVec += deltaGain * 0.8; + nextPt += delta * 0.8; + nextPts[ptidx] = nextPt + halfWin; + gainVecs[ptidx] = gainVec; + + + if (// j > 1 && + (std::abs(delta.x - prevDelta.x) < 0.01 && std::abs(delta.y - prevDelta.y) < 0.01) + || ((delta.ddot(delta) <= 0.001) && std::abs(prevGain.x - deltaGain.x) < 0.01) + ) + { + nextPts[ptidx] -= delta * 0.5f; + gainVecs[ptidx] -= deltaGain * 0.5f; + break; + } + + prevDelta = delta; + prevGain = deltaGain; + } + + } + + } + + const Mat* prevImg; + const Mat* nextImg; + const Mat* prevDeriv; + const Mat* rgbPrevImg; + const Mat* rgbNextImg; + const Point2f* prevPts; + Point2f* nextPts; + uchar* status; + cv::Point2f* gainVecs; + float* err; + int maxWinSize; + int minWinSize; + TermCriteria criteria; + int level; + int maxLevel; + int windowType; + float minEigThreshold; + bool useInitialFlow; + std::vector paramReset; + int crossSegmentationThreshold; + }; } } } diff --git a/modules/optflow/src/rlof/rlof_invokerbase.hpp 
b/modules/optflow/src/rlof/rlof_invokerbase.hpp index cb6243bf2aa..cb03f61384b 100644 --- a/modules/optflow/src/rlof/rlof_invokerbase.hpp +++ b/modules/optflow/src/rlof/rlof_invokerbase.hpp @@ -1,44 +1,6 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// - // - // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. - // - // By downloading, copying, installing or using the software you agree to this license. - // If you do not agree to this license, do not download, install, - // copy or use the software. - // - // - // License Agreement - // For Open Source Computer Vision Library - // - // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. - // Copyright (C) 2009, Willow Garage Inc., all rights reserved. - // Third party copyrights are property of their respective owners. - // - // Redistribution and use in source and binary forms, with or without modification, - // are permitted provided that the following conditions are met: - // - // * Redistribution's of source code must retain the above copyright notice, - // this list of conditions and the following disclaimer. - // - // * Redistribution's in binary form must reproduce the above copyright notice, - // this list of conditions and the following disclaimer in the documentation - // and/or other materials provided with the distribution. - // - // * The name of the copyright holders may not be used to endorse or promote products - // derived from this software without specific prior written permission. - // - // This software is provided by the copyright holders and contributors "as is" and - // any express or implied warranties, including, but not limited to, the implied - // warranties of merchantability and fitness for a particular purpose are disclaimed. 
- // In no event shall the Intel Corporation or contributors be liable for any direct, - // indirect, incidental, special, exemplary, or consequential damages - // (including, but not limited to, procurement of substitute goods or services; - // loss of use, data, or profits; or business interruption) however caused - // and on any theory of liability, whether in contract, strict liability, - // or tort (including negligence or otherwise) arising in any way out of - // the use of this software, even if advised of the possibility of such damage. - // - //M*/ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. #ifndef _RLOF_INVOKERBASE_HPP_ #define _RLOF_INVOKERBASE_HPP_ @@ -60,250 +22,250 @@ using namespace cv; namespace cv { - namespace optflow - { - typedef short deriv_type; - inline void get4BitMask(const int & width, __m128i & mask) - { - int noBits = width - static_cast(floor(width / 4.f) * 4.f); - unsigned int val[4]; - for (int n = 0; n < 4; n++) - { - val[n] = (noBits > n) ? (std::numeric_limits::max()) : 0; - } - mask = _mm_set_epi32(val[3], val[2], val[1], val[0]); +namespace optflow +{ + typedef short deriv_type; + inline void get4BitMask(const int & width, __m128i & mask) + { + int noBits = width - static_cast(floor(width / 4.f) * 4.f); + unsigned int val[4]; + for (int n = 0; n < 4; n++) + { + val[n] = (noBits > n) ? (std::numeric_limits::max()) : 0; + } + mask = _mm_set_epi32(val[3], val[2], val[1], val[0]); - } - inline void getWBitMask(const int & width, __m128i & t0, __m128i & t1, __m128i & t2) - { - int noBits = width - static_cast(floor(width / 8.f) * 8.f); - unsigned short val[8]; - for (int n = 0; n < 8; n++) - { - val[n] = (noBits > n) ? 
(0xffff) : 0; - } - t1 = _mm_set_epi16(val[7], val[7], val[6], val[6], val[5], val[5], val[4], val[4]); - t0 = _mm_set_epi16(val[3], val[3], val[2], val[2], val[1], val[1], val[0], val[0]); - t2 = _mm_set_epi16(val[7], val[6], val[5], val[4], val[3], val[2], val[1], val[0]); - } + } + inline void getWBitMask(const int & width, __m128i & t0, __m128i & t1, __m128i & t2) + { + int noBits = width - static_cast(floor(width / 8.f) * 8.f); + unsigned short val[8]; + for (int n = 0; n < 8; n++) + { + val[n] = (noBits > n) ? (0xffff) : 0; + } + t1 = _mm_set_epi16(val[7], val[7], val[6], val[6], val[5], val[5], val[4], val[4]); + t0 = _mm_set_epi16(val[3], val[3], val[2], val[2], val[1], val[1], val[0], val[0]); + t2 = _mm_set_epi16(val[7], val[6], val[5], val[4], val[3], val[2], val[1], val[0]); + } - typedef uchar tMaskType; + typedef uchar tMaskType; #define tCVMaskType CV_8UC1 #define MaskSet 0xffffffff - inline bool notSameColor(const cv::Point3_ & ref, int r, int c, const cv::Mat & img, int winSize, int threshold) - { - if (r >= img.rows + winSize || c >= img.cols + winSize || r < -winSize || c < -winSize) return true; - int step = static_cast(img.step1()); - const cv::Point3_ * tval = img.ptr>(); - tval += c; - tval += r * step / 3; - int R = std::abs((int)ref.x - (int)tval->x); - int G = std::abs((int)ref.y - (int)tval->y); - int B = std::abs((int)ref.z - (int)tval->z); - int diff = MAX(R, MAX(G, B)); - return diff > threshold; - } + inline bool notSameColor(const cv::Point3_ & ref, int r, int c, const cv::Mat & img, int winSize, int threshold) + { + if (r >= img.rows + winSize || c >= img.cols + winSize || r < -winSize || c < -winSize) return true; + int step = static_cast(img.step1()); + const cv::Point3_ * tval = img.ptr>(); + tval += c; + tval += r * step / 3; + int R = std::abs((int)ref.x - (int)tval->x); + int G = std::abs((int)ref.y - (int)tval->y); + int B = std::abs((int)ref.z - (int)tval->z); + int diff = MAX(R, MAX(G, B)); + return diff > threshold; + } - 
/*! Estimate the mask with points of the same color as the middle point - *\param prevGray input CV_8UC1 - *\param prevImg input CV_8UC3 - *\param nextImg input CV_8UC3 - *\param prevPoint global position of the middle point of the mask window - *\param nextPoint global position of the middle point of the mask window - *\param winPointMask mask matrice with 1 labeling points contained and 0 point is not contained by the segment - *\param noPoints number of points contained by the segment - *\param winRoi rectangle of the region of interesset in global coordinates - *\param minWinSize, - *\param threshold, - *\param useBothImages - */ - static void getLocalPatch( - const cv::Mat & prevImg, - const cv::Point2i & prevPoint, // feature points - cv::Mat & winPointMask, - int & noPoints, - cv::Rect & winRoi, - int minWinSize, - int threshold) - { - int maxWinSizeH = (winPointMask.cols - 1) / 2; - winRoi.x = prevPoint.x; - winRoi.y = prevPoint.y; - winRoi.width = winPointMask.cols; - winRoi.height = winPointMask.rows; + /*! 
Estimate the mask with points of the same color as the middle point + *\param prevGray input CV_8UC1 + *\param prevImg input CV_8UC3 + *\param nextImg input CV_8UC3 + *\param prevPoint global position of the middle point of the mask window + *\param nextPoint global position of the middle point of the mask window + *\param winPointMask mask matrice with 1 labeling points contained and 0 point is not contained by the segment + *\param noPoints number of points contained by the segment + *\param winRoi rectangle of the region of interesset in global coordinates + *\param minWinSize, + *\param threshold, + *\param useBothImages + */ + static void getLocalPatch( + const cv::Mat & prevImg, + const cv::Point2i & prevPoint, // feature points + cv::Mat & winPointMask, + int & noPoints, + cv::Rect & winRoi, + int minWinSize, + int threshold) + { + int maxWinSizeH = (winPointMask.cols - 1) / 2; + winRoi.x = prevPoint.x; + winRoi.y = prevPoint.y; + winRoi.width = winPointMask.cols; + winRoi.height = winPointMask.rows; - if (minWinSize == winPointMask.cols) - { - winRoi.x = prevPoint.x - maxWinSizeH; - winRoi.y = prevPoint.y - maxWinSizeH; - winPointMask.setTo(1); - noPoints = winPointMask.size().area(); - return; - } - noPoints = 0; - int c = prevPoint.x; - int r = prevPoint.y; - int c_left = c - 1; - int c_right = c + 1; - int r_top = r - 1; - int r_bottom = r; - int border_left = c - maxWinSizeH; - int border_right = c + maxWinSizeH; - int border_top = r - maxWinSizeH; - int border_bottom = r + maxWinSizeH; - int c_local_diff = prevPoint.x - maxWinSizeH; - int r_local_diff = prevPoint.y - maxWinSizeH; - int _c = c - c_local_diff; - int _r = r - r_local_diff; - int min_r = _r; - int max_r = _r; - int min_c = _c; - int max_c = _c; - // horizontal line - if (r < 0 || r >= prevImg.rows || c < 0 || c >= prevImg.cols) - { - noPoints = 0; - return; - } - cv::Point3_ val1 = prevImg.at>(r, c); // middle grayvalue - cv::Point3_ tval; + if (minWinSize == winPointMask.cols) + { + 
winRoi.x = prevPoint.x - maxWinSizeH; + winRoi.y = prevPoint.y - maxWinSizeH; + winPointMask.setTo(1); + noPoints = winPointMask.size().area(); + return; + } + noPoints = 0; + int c = prevPoint.x; + int r = prevPoint.y; + int c_left = c - 1; + int c_right = c + 1; + int r_top = r - 1; + int r_bottom = r; + int border_left = c - maxWinSizeH; + int border_right = c + maxWinSizeH; + int border_top = r - maxWinSizeH; + int border_bottom = r + maxWinSizeH; + int c_local_diff = prevPoint.x - maxWinSizeH; + int r_local_diff = prevPoint.y - maxWinSizeH; + int _c = c - c_local_diff; + int _r = r - r_local_diff; + int min_r = _r; + int max_r = _r; + int min_c = _c; + int max_c = _c; + // horizontal line + if (r < 0 || r >= prevImg.rows || c < 0 || c >= prevImg.cols) + { + noPoints = 0; + return; + } + cv::Point3_ val1 = prevImg.at>(r, c); // middle grayvalue + cv::Point3_ tval; - //vertical line - for (int dr = r_top; dr >= border_top; dr--) - { - if (notSameColor(val1, dr, c, prevImg, maxWinSizeH, threshold)) - break; + //vertical line + for (int dr = r_top; dr >= border_top; dr--) + { + if (notSameColor(val1, dr, c, prevImg, maxWinSizeH, threshold)) + break; - int _dr = dr - r_local_diff; - min_r = MIN(min_r, _dr); - max_r = MAX(max_r, _dr); - winPointMask.at(_dr, _c) = 1; - noPoints++; - } - for (int dr = r_bottom; dr < border_bottom; dr++) - { - if (notSameColor(val1, dr, c, prevImg, maxWinSizeH, threshold) - ) - break; + int _dr = dr - r_local_diff; + min_r = MIN(min_r, _dr); + max_r = MAX(max_r, _dr); + winPointMask.at(_dr, _c) = 1; + noPoints++; + } + for (int dr = r_bottom; dr < border_bottom; dr++) + { + if (notSameColor(val1, dr, c, prevImg, maxWinSizeH, threshold) + ) + break; - int _dr = dr - r_local_diff; - min_r = MIN(min_r, _dr); - max_r = MAX(max_r, _dr); - winPointMask.at(_dr, _c) = 1; - noPoints++; - } - // accumulate along the vertical line and the search line that was still labled - for (int dr = min_r + r_local_diff; dr <= max_r + r_local_diff; dr++) - { 
- int _dr = dr - r_local_diff; - if (winPointMask.at(_dr, _c) == 0) - { - winPointMask.row(_dr).setTo(0); - continue; - } - bool skip = false; - int _dc = c_right - c_local_diff; - for (int dc = c_right; dc < border_right; dc++, _dc++) - { - if (skip == false) - { - if (notSameColor(val1, dr, dc, prevImg, maxWinSizeH, threshold)) - skip = true; - } - if (skip == false) - { - min_c = MIN(min_c, _dc); - max_c = MAX(max_c, _dc); - winPointMask.at(_dr, _dc) = 1; - noPoints++; - } - else - winPointMask.at(_dr, _dc) = 0; - } + int _dr = dr - r_local_diff; + min_r = MIN(min_r, _dr); + max_r = MAX(max_r, _dr); + winPointMask.at(_dr, _c) = 1; + noPoints++; + } + // accumulate along the vertical line and the search line that was still labled + for (int dr = min_r + r_local_diff; dr <= max_r + r_local_diff; dr++) + { + int _dr = dr - r_local_diff; + if (winPointMask.at(_dr, _c) == 0) + { + winPointMask.row(_dr).setTo(0); + continue; + } + bool skip = false; + int _dc = c_right - c_local_diff; + for (int dc = c_right; dc < border_right; dc++, _dc++) + { + if (skip == false) + { + if (notSameColor(val1, dr, dc, prevImg, maxWinSizeH, threshold)) + skip = true; + } + if (skip == false) + { + min_c = MIN(min_c, _dc); + max_c = MAX(max_c, _dc); + winPointMask.at(_dr, _dc) = 1; + noPoints++; + } + else + winPointMask.at(_dr, _dc) = 0; + } - skip = false; - _dc = c_left - c_local_diff; - for (int dc = c_left; dc >= border_left; dc--, _dc--) - { - if (skip == false) - { - if (notSameColor(val1, dr, dc, prevImg, maxWinSizeH, threshold)) - skip = true; - } - if (skip == false) - { - min_c = MIN(min_c, _dc); - max_c = MAX(max_c, _dc); - winPointMask.at(_dr, _dc) = 1; - noPoints++; - } - else - winPointMask.at(_dr, _dc) = 0; - } - } + skip = false; + _dc = c_left - c_local_diff; + for (int dc = c_left; dc >= border_left; dc--, _dc--) + { + if (skip == false) + { + if (notSameColor(val1, dr, dc, prevImg, maxWinSizeH, threshold)) + skip = true; + } + if (skip == false) + { + min_c = 
MIN(min_c, _dc); + max_c = MAX(max_c, _dc); + winPointMask.at(_dr, _dc) = 1; + noPoints++; + } + else + winPointMask.at(_dr, _dc) = 0; + } + } - // get the initial small window - if (noPoints < minWinSize * minWinSize) - { - cv::Rect roi(winPointMask.cols / 2 - (minWinSize - 1) / 2, - winPointMask.rows / 2 - (minWinSize - 1) / 2, - minWinSize, minWinSize); - cv::Mat(winPointMask, roi).setTo(1); - min_c = MIN(MIN(min_c, roi.tl().x), roi.br().x - 1); - max_c = MAX(MAX(max_c, roi.tl().x), roi.br().x - 1); - min_r = MIN(MIN(min_r, roi.tl().y), roi.br().y - 1); - max_r = MAX(MAX(max_r, roi.tl().y), roi.br().y - 1); - noPoints += minWinSize * minWinSize; - } - winRoi.x = c_local_diff + min_c; - winRoi.y = r_local_diff + min_r; - winRoi.width = max_c - min_c + 1; - winRoi.height = max_r - min_r + 1; - winPointMask = winPointMask(cv::Rect(min_c, min_r, winRoi.width, winRoi.height)); - } + // get the initial small window + if (noPoints < minWinSize * minWinSize) + { + cv::Rect roi(winPointMask.cols / 2 - (minWinSize - 1) / 2, + winPointMask.rows / 2 - (minWinSize - 1) / 2, + minWinSize, minWinSize); + cv::Mat(winPointMask, roi).setTo(1); + min_c = MIN(MIN(min_c, roi.tl().x), roi.br().x - 1); + max_c = MAX(MAX(max_c, roi.tl().x), roi.br().x - 1); + min_r = MIN(MIN(min_r, roi.tl().y), roi.br().y - 1); + max_r = MAX(MAX(max_r, roi.tl().y), roi.br().y - 1); + noPoints += minWinSize * minWinSize; + } + winRoi.x = c_local_diff + min_c; + winRoi.y = r_local_diff + min_r; + winRoi.width = max_c - min_c + 1; + winRoi.height = max_r - min_r + 1; + winPointMask = winPointMask(cv::Rect(min_c, min_r, winRoi.width, winRoi.height)); + } - inline bool calcWinMaskMat( const cv::Mat & BI, - const int windowType, - const cv::Point2i & iprevPt, - cv::Mat & winMaskMat, - cv::Size & winSize, - cv::Point2f & halfWin, - int & winArea, - const int minWinSize, - const int maxWinSize, - const int crossSegmentationThreshold) - { + inline bool calcWinMaskMat( const cv::Mat & BI, + const int windowType, 
+ const cv::Point2i & iprevPt, + cv::Mat & winMaskMat, + cv::Size & winSize, + cv::Point2f & halfWin, + int & winArea, + const int minWinSize, + const int maxWinSize, + const int crossSegmentationThreshold) + { - if (windowType == RLOFOpticalFlowParameter::SR_CROSS && maxWinSize != minWinSize) - { - // patch generation - cv::Rect winRoi; - getLocalPatch(BI, iprevPt, winMaskMat, winArea, winRoi, minWinSize, crossSegmentationThreshold); - if (winArea == 0) - return false; - winSize = winRoi.size(); - halfWin = Point2f(static_cast(iprevPt.x - winRoi.tl().x), - static_cast(iprevPt.y - winRoi.tl().y)); - } - else - { - winSize = cv::Size(maxWinSize, maxWinSize); - halfWin = Point2f((winSize.width - 1) / 2.f, (winSize.height - 1) / 2.f); - winMaskMat.setTo(1); - } - return true; - } + if (windowType == SR_CROSS && maxWinSize != minWinSize) + { + // patch generation + cv::Rect winRoi; + getLocalPatch(BI, iprevPt, winMaskMat, winArea, winRoi, minWinSize, crossSegmentationThreshold); + if (winArea == 0) + return false; + winSize = winRoi.size(); + halfWin = Point2f(static_cast(iprevPt.x - winRoi.tl().x), + static_cast(iprevPt.y - winRoi.tl().y)); + } + else + { + winSize = cv::Size(maxWinSize, maxWinSize); + halfWin = Point2f((winSize.width - 1) / 2.f, (winSize.height - 1) / 2.f); + winMaskMat.setTo(1); + } + return true; + } - inline short estimateScale(cv::Mat & residuals) - { - cv::Mat absMat = cv::abs(residuals); - return quickselect(absMat, absMat.rows / 2); - } - } + inline short estimateScale(cv::Mat & residuals) + { + cv::Mat absMat = cv::abs(residuals); + return quickselect(absMat, absMat.rows / 2); + } +} - + } #endif \ No newline at end of file diff --git a/modules/optflow/src/rlof/rlof_localflow.cpp b/modules/optflow/src/rlof/rlof_localflow.cpp index 3f995fd5477..b34a9c26e30 100644 --- a/modules/optflow/src/rlof/rlof_localflow.cpp +++ b/modules/optflow/src/rlof/rlof_localflow.cpp @@ -1,44 +1,6 @@ 
-/*M/////////////////////////////////////////////////////////////////////////////////////// - // - // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. - // - // By downloading, copying, installing or using the software you agree to this license. - // If you do not agree to this license, do not download, install, - // copy or use the software. - // - // - // License Agreement - // For Open Source Computer Vision Library - // - // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. - // Copyright (C) 2009, Willow Garage Inc., all rights reserved. - // Third party copyrights are property of their respective owners. - // - // Redistribution and use in source and binary forms, with or without modification, - // are permitted provided that the following conditions are met: - // - // * Redistribution's of source code must retain the above copyright notice, - // this list of conditions and the following disclaimer. - // - // * Redistribution's in binary form must reproduce the above copyright notice, - // this list of conditions and the following disclaimer in the documentation - // and/or other materials provided with the distribution. - // - // * The name of the copyright holders may not be used to endorse or promote products - // derived from this software without specific prior written permission. - // - // This software is provided by the copyright holders and contributors "as is" and - // any express or implied warranties, including, but not limited to, the implied - // warranties of merchantability and fitness for a particular purpose are disclaimed. 
- // In no event shall the Intel Corporation or contributors be liable for any direct, - // indirect, incidental, special, exemplary, or consequential damages - // (including, but not limited to, procurement of substitute goods or services; - // loss of use, data, or profits; or business interruption) however caused - // and on any theory of liability, whether in contract, strict liability, - // or tort (including negligence or otherwise) arising in any way out of - // the use of this software, even if advised of the possibility of such damage. - // - //M*/ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. #include "precomp.hpp" #include "rlof_localflow.h" #ifdef HAVE_OPENCV_CALIB3D @@ -52,535 +14,541 @@ using namespace cv; namespace cv { - namespace detail - { + namespace detail + { - typedef short deriv_type; - } + typedef short deriv_type; + } namespace { - static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst) - { - using namespace cv; - using cv::detail::deriv_type; - int rows = src.rows, cols = src.cols, cn = src.channels(), colsn = cols * cn, depth = src.depth(); - CV_Assert(depth == CV_8U); - dst.create(rows, cols, CV_MAKETYPE(DataType::depth, cn * 2)); - - int x, y, delta = (int)alignSize((cols + 2)*cn, 16); - AutoBuffer _tempBuf(delta * 2 + 64); - deriv_type *trow0 = alignPtr(_tempBuf.data() + cn, 16), *trow1 = alignPtr(trow0 + delta, 16); + static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst) + { + using namespace cv; + using cv::detail::deriv_type; + int rows = src.rows, cols = src.cols, cn = src.channels(), colsn = cols * cn, depth = src.depth(); + CV_Assert(depth == CV_8U); + dst.create(rows, cols, CV_MAKETYPE(DataType::depth, cn * 2)); + + int x, y, delta = (int)alignSize((cols + 2)*cn, 16); + AutoBuffer _tempBuf(delta * 2 + 64); + deriv_type *trow0 = alignPtr(_tempBuf.data() + cn, 16), 
*trow1 = alignPtr(trow0 + delta, 16); #if CV_SIMD128 - v_int16x8 c3 = v_setall_s16(3), c10 = v_setall_s16(10); - bool haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); + v_int16x8 c3 = v_setall_s16(3), c10 = v_setall_s16(10); + bool haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); #endif - for (y = 0; y < rows; y++) - { - const uchar* srow0 = src.ptr(y > 0 ? y - 1 : rows > 1 ? 1 : 0); - const uchar* srow1 = src.ptr(y); - const uchar* srow2 = src.ptr(y < rows - 1 ? y + 1 : rows > 1 ? rows - 2 : 0); - deriv_type* drow = dst.ptr(y); + for (y = 0; y < rows; y++) + { + const uchar* srow0 = src.ptr(y > 0 ? y - 1 : rows > 1 ? 1 : 0); + const uchar* srow1 = src.ptr(y); + const uchar* srow2 = src.ptr(y < rows - 1 ? y + 1 : rows > 1 ? rows - 2 : 0); + deriv_type* drow = dst.ptr(y); - // do vertical convolution - x = 0; + // do vertical convolution + x = 0; #if CV_SIMD128 - if (haveSIMD) - { - for (; x <= colsn - 8; x += 8) - { - v_int16x8 s0 = v_reinterpret_as_s16(v_load_expand(srow0 + x)); - v_int16x8 s1 = v_reinterpret_as_s16(v_load_expand(srow1 + x)); - v_int16x8 s2 = v_reinterpret_as_s16(v_load_expand(srow2 + x)); - - v_int16x8 t1 = s2 - s0; - v_int16x8 t0 = v_mul_wrap(s0 + s2, c3) + v_mul_wrap(s1, c10); - - v_store(trow0 + x, t0); - v_store(trow1 + x, t1); - } - } + if (haveSIMD) + { + for (; x <= colsn - 8; x += 8) + { + v_int16x8 s0 = v_reinterpret_as_s16(v_load_expand(srow0 + x)); + v_int16x8 s1 = v_reinterpret_as_s16(v_load_expand(srow1 + x)); + v_int16x8 s2 = v_reinterpret_as_s16(v_load_expand(srow2 + x)); + + v_int16x8 t1 = s2 - s0; + v_int16x8 t0 = v_mul_wrap(s0 + s2, c3) + v_mul_wrap(s1, c10); + + v_store(trow0 + x, t0); + v_store(trow1 + x, t1); + } + } #endif - for (; x < colsn; x++) - { - int t0 = (srow0[x] + srow2[x]) * 3 + srow1[x] * 10; - int t1 = srow2[x] - srow0[x]; - trow0[x] = (deriv_type)t0; - trow1[x] = (deriv_type)t1; - } - - // make border - int x0 = (cols > 1 ? 
1 : 0)*cn, x1 = (cols > 1 ? cols - 2 : 0)*cn; - for (int k = 0; k < cn; k++) - { - trow0[-cn + k] = trow0[x0 + k]; trow0[colsn + k] = trow0[x1 + k]; - trow1[-cn + k] = trow1[x0 + k]; trow1[colsn + k] = trow1[x1 + k]; - } - - // do horizontal convolution, interleave the results and store them to dst - x = 0; + for (; x < colsn; x++) + { + int t0 = (srow0[x] + srow2[x]) * 3 + srow1[x] * 10; + int t1 = srow2[x] - srow0[x]; + trow0[x] = (deriv_type)t0; + trow1[x] = (deriv_type)t1; + } + + // make border + int x0 = (cols > 1 ? 1 : 0)*cn, x1 = (cols > 1 ? cols - 2 : 0)*cn; + for (int k = 0; k < cn; k++) + { + trow0[-cn + k] = trow0[x0 + k]; trow0[colsn + k] = trow0[x1 + k]; + trow1[-cn + k] = trow1[x0 + k]; trow1[colsn + k] = trow1[x1 + k]; + } + + // do horizontal convolution, interleave the results and store them to dst + x = 0; #if CV_SIMD128 - if (haveSIMD) - { - for (; x <= colsn - 8; x += 8) - { - v_int16x8 s0 = v_load(trow0 + x - cn); - v_int16x8 s1 = v_load(trow0 + x + cn); - v_int16x8 s2 = v_load(trow1 + x - cn); - v_int16x8 s3 = v_load(trow1 + x); - v_int16x8 s4 = v_load(trow1 + x + cn); - - v_int16x8 t0 = s1 - s0; - v_int16x8 t1 = v_mul_wrap(s2 + s4, c3) + v_mul_wrap(s3, c10); - - v_store_interleave((drow + x * 2), t0, t1); - } - } + if (haveSIMD) + { + for (; x <= colsn - 8; x += 8) + { + v_int16x8 s0 = v_load(trow0 + x - cn); + v_int16x8 s1 = v_load(trow0 + x + cn); + v_int16x8 s2 = v_load(trow1 + x - cn); + v_int16x8 s3 = v_load(trow1 + x); + v_int16x8 s4 = v_load(trow1 + x + cn); + + v_int16x8 t0 = s1 - s0; + v_int16x8 t1 = v_mul_wrap(s2 + s4, c3) + v_mul_wrap(s3, c10); + + v_store_interleave((drow + x * 2), t0, t1); + } + } #endif - for (; x < colsn; x++) - { - deriv_type t0 = (deriv_type)(trow0[x + cn] - trow0[x - cn]); - deriv_type t1 = (deriv_type)((trow1[x + cn] + trow1[x - cn]) * 3 + trow1[x] * 10); - drow[x * 2] = t0; drow[x * 2 + 1] = t1; - } - } - } + for (; x < colsn; x++) + { + deriv_type t0 = (deriv_type)(trow0[x + cn] - trow0[x - cn]); + 
deriv_type t1 = (deriv_type)((trow1[x + cn] + trow1[x - cn]) * 3 + trow1[x] * 10); + drow[x * 2] = t0; drow[x * 2 + 1] = t1; + } + } + } }//namespace - namespace optflow - { - bool isrobust(const RLOFOpticalFlowParameter & param) - { - return (param.normSigma0 != std::numeric_limits::max()) - && (param.normSigma1 != std::numeric_limits::max()); - } - std::vector get_norm(float sigma0, float sigma1) - { - std::vector result = { sigma0, sigma1, sigma0 / (sigma0 - sigma1), sigma0 * sigma1 }; - return result; - } - - - int buildOpticalFlowPyramidScale(InputArray _img, OutputArrayOfArrays pyramid, Size winSize, int maxLevel, bool withDerivatives, - int pyrBorder, int derivBorder, bool tryReuseInputImage, float levelScale[2]) - { - Mat img = _img.getMat(); - CV_Assert(img.depth() == CV_8U && winSize.width > 2 && winSize.height > 2); - int pyrstep = withDerivatives ? 2 : 1; - - pyramid.create(1, (maxLevel + 1) * pyrstep, 0 /*type*/, -1, true); - - int derivType = CV_MAKETYPE(DataType::depth, img.channels() * 2); - - //level 0 - bool lvl0IsSet = false; - if (tryReuseInputImage && img.isSubmatrix() && (pyrBorder & BORDER_ISOLATED) == 0) - { - Size wholeSize; - Point ofs; - img.locateROI(wholeSize, ofs); - if (ofs.x >= winSize.width && ofs.y >= winSize.height - && ofs.x + img.cols + winSize.width <= wholeSize.width - && ofs.y + img.rows + winSize.height <= wholeSize.height) - { - pyramid.getMatRef(0) = img; - lvl0IsSet = true; - } - } - - if (!lvl0IsSet) - { - Mat& temp = pyramid.getMatRef(0); - - if (!temp.empty()) - temp.adjustROI(winSize.height, winSize.height, winSize.width, winSize.width); - if (temp.type() != img.type() || temp.cols != winSize.width * 2 + img.cols || temp.rows != winSize.height * 2 + img.rows) - temp.create(img.rows + winSize.height * 2, img.cols + winSize.width * 2, img.type()); - - if (pyrBorder == BORDER_TRANSPARENT) - img.copyTo(temp(Rect(winSize.width, winSize.height, img.cols, img.rows))); - else - copyMakeBorder(img, temp, winSize.height, 
winSize.height, winSize.width, winSize.width, pyrBorder); - temp.adjustROI(-winSize.height, -winSize.height, -winSize.width, -winSize.width); - } - - Size sz = img.size(); - Mat prevLevel = pyramid.getMatRef(0); - Mat thisLevel = prevLevel; - - for (int level = 0; level <= maxLevel; ++level) - { - if (level != 0) - { - Mat& temp = pyramid.getMatRef(level * pyrstep); - - if (!temp.empty()) - temp.adjustROI(winSize.height, winSize.height, winSize.width, winSize.width); - if (temp.type() != img.type() || temp.cols != winSize.width * 2 + sz.width || temp.rows != winSize.height * 2 + sz.height) - temp.create(sz.height + winSize.height * 2, sz.width + winSize.width * 2, img.type()); - - thisLevel = temp(Rect(winSize.width, winSize.height, sz.width, sz.height)); - pyrDown(prevLevel, thisLevel, sz); - - if (pyrBorder != BORDER_TRANSPARENT) - copyMakeBorder(thisLevel, temp, winSize.height, winSize.height, winSize.width, winSize.width, pyrBorder | BORDER_ISOLATED); - temp.adjustROI(-winSize.height, -winSize.height, -winSize.width, -winSize.width); - } - - if (withDerivatives) - { - Mat& deriv = pyramid.getMatRef(level * pyrstep + 1); - - if (!deriv.empty()) - deriv.adjustROI(winSize.height, winSize.height, winSize.width, winSize.width); - if (deriv.type() != derivType || deriv.cols != winSize.width * 2 + sz.width || deriv.rows != winSize.height * 2 + sz.height) - deriv.create(sz.height + winSize.height * 2, sz.width + winSize.width * 2, derivType); - - Mat derivI = deriv(Rect(winSize.width, winSize.height, sz.width, sz.height)); - calcSharrDeriv(thisLevel, derivI); - - if (derivBorder != BORDER_TRANSPARENT) - copyMakeBorder(derivI, deriv, winSize.height, winSize.height, winSize.width, winSize.width, derivBorder | BORDER_ISOLATED); - deriv.adjustROI(-winSize.height, -winSize.height, -winSize.width, -winSize.width); - } - - sz = Size(static_cast((sz.width + 1) / levelScale[0]), - static_cast((sz.height + 1) / levelScale[1])); - if (sz.width <= winSize.width || sz.height <= 
winSize.height) - { - pyramid.create(1, (level + 1) * pyrstep, 0 /*type*/, -1, true);//check this - return level; - } - - prevLevel = thisLevel; - } - - return maxLevel; - } - int CImageBuffer::buildPyramid(cv::Size winSize, int maxLevel, float levelScale[2]) - { - if (m_Overwrite == false) - return m_maxLevel; - m_maxLevel = buildOpticalFlowPyramidScale(m_Image, m_ImagePyramid, winSize, maxLevel, false, 4, 0, true, levelScale); - return m_maxLevel; - } - - void calcLocalOpticalFlowCore( - Ptr prevPyramids[2], - Ptr currPyramids[2], - InputArray _prevPts, - InputOutputArray _nextPts, - const RLOFOpticalFlowParameter & param) - { - - bool useAdditionalRGB = param.supportRegionType == RLOFOpticalFlowParameter::SR_CROSS; - int iWinSize = param.largeWinSize; - int winSizes[2] = { iWinSize, iWinSize }; - if (param.supportRegionType != RLOFOpticalFlowParameter::SR_FIXED) - { - winSizes[0] = param.smallWinSize; - } - //cv::Size winSize(iWinSize, iWinSize); - cv::TermCriteria criteria(cv::TermCriteria::COUNT | cv::TermCriteria::EPS, param.maxIteration, 0.01); - std::vector rlofNorm = get_norm(param.normSigma0, param.normSigma1); - CV_Assert(winSizes[0] <= winSizes[1]); - - Mat prevPtsMat = _prevPts.getMat(); - const int derivDepth = DataType::depth; - - CV_Assert(param.maxLevel >= 0 && iWinSize > 2); - - int level = 0, npoints; - CV_Assert((npoints = prevPtsMat.checkVector(2, CV_32F, true)) >= 0); - - if (!(param.useInitialFlow)) - _nextPts.create(prevPtsMat.size(), prevPtsMat.type(), -1, true); - - Mat nextPtsMat = _nextPts.getMat(); - CV_Assert(nextPtsMat.checkVector(2, CV_32F, true) == npoints); - - const Point2f* prevPts = (const Point2f*)prevPtsMat.data; - Point2f* nextPts = (Point2f*)nextPtsMat.data; - std::vector status(npoints); - std::vector err(npoints); - std::vector gainPts(npoints); - - float levelScale[2] = { 2.f,2.f }; - - int maxLevel = prevPyramids[0]->buildPyramid(cv::Size(iWinSize, iWinSize), param.maxLevel, levelScale); - - maxLevel = 
currPyramids[0]->buildPyramid(cv::Size(iWinSize, iWinSize), maxLevel, levelScale); - if (useAdditionalRGB) - { - prevPyramids[1]->buildPyramid(cv::Size(iWinSize, iWinSize), maxLevel, levelScale); - currPyramids[1]->buildPyramid(cv::Size(iWinSize, iWinSize), maxLevel, levelScale); - } - - if ((criteria.type & TermCriteria::COUNT) == 0) - criteria.maxCount = 30; - else - criteria.maxCount = std::min(std::max(criteria.maxCount, 0), 100); - if ((criteria.type & TermCriteria::EPS) == 0) - criteria.epsilon = 0.001; - else - criteria.epsilon = std::min(std::max(criteria.epsilon, 0.), 10.); - criteria.epsilon *= criteria.epsilon; - - // dI/dx ~ Ix, dI/dy ~ Iy - Mat derivIBuf; - derivIBuf.create(prevPyramids[0]->m_ImagePyramid[0].rows + iWinSize * 2, prevPyramids[0]->m_ImagePyramid[0].cols + iWinSize * 2, CV_MAKETYPE(derivDepth, prevPyramids[0]->m_ImagePyramid[0].channels() * 2)); - - for (level = maxLevel; level >= 0; level--) - { - Mat derivI; - Size imgSize = prevPyramids[0]->getImage(level).size(); - Mat _derivI(imgSize.height + iWinSize * 2, imgSize.width + iWinSize * 2, derivIBuf.type(), derivIBuf.data); - derivI = _derivI(Rect(iWinSize, iWinSize, imgSize.width, imgSize.height)); - calcSharrDeriv(prevPyramids[0]->getImage(level), derivI); - copyMakeBorder(derivI, _derivI, iWinSize, iWinSize, iWinSize, iWinSize, BORDER_CONSTANT | BORDER_ISOLATED); - - cv::Mat tRGBPrevPyr; - cv::Mat tRGBNextPyr; - if (useAdditionalRGB) - { - tRGBPrevPyr = prevPyramids[1]->getImage(level); - tRGBNextPyr = prevPyramids[1]->getImage(level); - - prevPyramids[1]->m_Overwrite = false; - currPyramids[1]->m_Overwrite = false; - } - - cv::Mat prevImage = prevPyramids[0]->getImage(level); - cv::Mat currImage = currPyramids[0]->getImage(level); - - // apply plk like tracker - if (isrobust(param) == false) - { - if (param.useIlluminationModel) - { - cv::parallel_for_(cv::Range(0, npoints), - plk::radial::TrackerInvoker( - prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, - prevPts, nextPts, 
&status[0], &err[0], &gainPts[0], - level, maxLevel, winSizes, - param.maxIteration, - param.useInitialFlow, - param.supportRegionType, - param.minEigenValue, - param.crossSegmentationThreshold)); - } - else - { - if (param.solverType == RLOFOpticalFlowParameter::SolverType::ST_STANDART) - { - cv::parallel_for_(cv::Range(0, npoints), - plk::ica::TrackerInvoker( - prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, - prevPts, nextPts, &status[0], &err[0], - level, maxLevel, winSizes, - param.maxIteration, - param.useInitialFlow, - param.supportRegionType, - param.crossSegmentationThreshold, param.minEigenValue)); - } - else - { - cv::parallel_for_(cv::Range(0, npoints), - beplk::ica::TrackerInvoker(prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, - prevPts, nextPts, &status[0], &err[0], - level, maxLevel, winSizes, - param.maxIteration, - param.useInitialFlow, - param.supportRegionType, - param.crossSegmentationThreshold, - param.minEigenValue)); - } - } - } - // for robust models - else - { - if (param.useIlluminationModel) - { - if (param.solverType == RLOFOpticalFlowParameter::SolverType::ST_STANDART) - { - cv::parallel_for_(cv::Range(0, npoints), - rlof::radial::TrackerInvoker( - prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, - prevPts, nextPts, &status[0], &err[0], &gainPts[0], - level, maxLevel, winSizes, - param.maxIteration, - param.useInitialFlow, - param.supportRegionType, - rlofNorm, - param.minEigenValue, - param.crossSegmentationThreshold)); - } - else - { - cv::parallel_for_(cv::Range(0, npoints), - berlof::radial::TrackerInvoker(prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, - prevPts, nextPts, &status[0], &err[0], &gainPts[0], - level, maxLevel, winSizes, - param.maxIteration, - param.useInitialFlow, - param.supportRegionType, - param.crossSegmentationThreshold, - rlofNorm, - param.minEigenValue)); - } - } - else - { - - if (param.solverType == RLOFOpticalFlowParameter::SolverType::ST_STANDART) - { - 
cv::parallel_for_(cv::Range(0, npoints), - rlof::ica::TrackerInvoker( - prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, - prevPts, nextPts, &status[0], &err[0], - level, maxLevel, winSizes, - param.maxIteration, - param.useInitialFlow, - param.supportRegionType, - rlofNorm, - param.minEigenValue, - param.crossSegmentationThreshold)); - } - else - { - cv::parallel_for_(cv::Range(0, npoints), - berlof::ica::TrackerInvoker(prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, - prevPts, nextPts, &status[0], &err[0], - level, maxLevel, winSizes, - param.maxIteration, - param.useInitialFlow, - param.supportRegionType, - param.crossSegmentationThreshold, - rlofNorm, - param.minEigenValue)); - } - - } - } - - prevPyramids[0]->m_Overwrite = true; - currPyramids[0]->m_Overwrite = true; - } - } - - void preprocess(Ptr prevPyramids[2], - Ptr currPyramids[2], - const std::vector & prevPoints, - std::vector & currPoints, - const RLOFOpticalFlowParameter & param) - { - cv::Mat mask, homography; - if (param.useGlobalMotionPrior == false) - return; - - currPoints.resize(prevPoints.size()); - - RLOFOpticalFlowParameter gmeTrackerParam = param; - gmeTrackerParam.useGlobalMotionPrior = false; - gmeTrackerParam.largeWinSize = 17; - // use none robust tracker for global motion estimation since it is faster - gmeTrackerParam.normSigma0 = std::numeric_limits::max(); - gmeTrackerParam.maxIteration = MAX(15, param.maxIteration); - gmeTrackerParam.minEigenValue = 0.000001f; - - std::vector gmPrevPoints, gmCurrPoints, gmCheckPoints; - - // Initialize point grid - int stepr = prevPyramids[0]->m_Image.rows / 30; - int stepc = prevPyramids[0]->m_Image.rows / 40; - for (int r = stepr / 2; r < prevPyramids[0]->m_Image.rows; r += stepr) - { - for (int c = stepc / 2; c < prevPyramids[0]->m_Image.cols; c += stepc) - { - gmPrevPoints.push_back(cv::Point2f(static_cast(c), static_cast(r))); - } - } - - // perform motion estimation - calcLocalOpticalFlowCore(prevPyramids, currPyramids, 
gmPrevPoints, gmCurrPoints, gmeTrackerParam); - calcLocalOpticalFlowCore(currPyramids, prevPyramids, gmCurrPoints, gmCheckPoints, gmeTrackerParam); - - cv::Mat prevPointsMat(static_cast(gmPrevPoints.size()), 1, CV_32FC2); - cv::Mat currPointsMat(static_cast(gmPrevPoints.size()), 1, CV_32FC2); - cv::Mat distMat(static_cast(gmPrevPoints.size()), 1, CV_32FC1); - - // Forward backward confidence to estimate optimal ransac reprojection error - int noPoints = 0; - for (unsigned int n = 0; n < gmPrevPoints.size(); n++) - { - cv::Point2f dist = gmCheckPoints[n] - gmPrevPoints[n]; - - cv::Point2f flow = gmCurrPoints[n] - gmPrevPoints[n]; - prevPointsMat.at(noPoints) = gmPrevPoints[n]; - currPointsMat.at(noPoints) = gmCurrPoints[n]; - distMat.at(noPoints) = flow.x * flow.x + flow.y* flow.y; - if (isnan(distMat.at(noPoints)) == false) - noPoints++; - } - - float medianDist = (param.globalMotionRansacThreshold == 0) ? 1.f : - quickselect(distMat, static_cast(noPoints * static_cast(param.globalMotionRansacThreshold) / 100.f)); - medianDist = sqrt(medianDist); - - if (noPoints < 8) - return; - - cv::findHomography(prevPointsMat(cv::Rect(0, 0, 1, noPoints)), currPointsMat(cv::Rect(0, 0, 1, noPoints)), cv::RANSAC, medianDist, mask).convertTo(homography, CV_32FC1); - - if (homography.empty()) - return; - - cv::perspectiveTransform(prevPoints, currPoints, homography); - } - - void calcLocalOpticalFlow( - const Mat prevImage, - const Mat currImage, - Ptr prevPyramids[2], - Ptr currPyramids[2], - const std::vector & prevPoints, - std::vector & currPoints, - const RLOFOpticalFlowParameter & param) - { - if (prevImage.empty() == false && currImage.empty()== false) - { - prevPyramids[0]->m_Overwrite = true; - currPyramids[0]->m_Overwrite = true; - prevPyramids[1]->m_Overwrite = true; - currPyramids[1]->m_Overwrite = true; - if (prevImage.type() == CV_8UC3) - { - prevPyramids[0]->setGrayFromRGB(prevImage); - currPyramids[0]->setGrayFromRGB(currImage); - 
prevPyramids[1]->setImage(prevImage); - currPyramids[1]->setImage(currImage); - - if (param.supportRegionType == RLOFOpticalFlowParameter::SR_CROSS) - { - prevPyramids[1]->setBlurFromRGB(prevImage); - currPyramids[1]->setBlurFromRGB(currImage); - } - } - else - { - prevPyramids[0]->setImage(prevImage); - currPyramids[0]->setImage(currImage); - } - } - preprocess(prevPyramids, currPyramids, prevPoints, currPoints, param); - RLOFOpticalFlowParameter internParam = param; - if (param.useGlobalMotionPrior == true) - internParam.useInitialFlow = true; - calcLocalOpticalFlowCore(prevPyramids, currPyramids, prevPoints, currPoints, internParam); - } - } +namespace optflow +{ + int buildOpticalFlowPyramidScale(InputArray _img, OutputArrayOfArrays pyramid, Size winSize, int maxLevel, bool withDerivatives, + int pyrBorder, int derivBorder, bool tryReuseInputImage, float levelScale[2]); + void calcLocalOpticalFlowCore(Ptr prevPyramids[2], Ptr currPyramids[2], InputArray _prevPts, + InputOutputArray _nextPts, const RLOFOpticalFlowParameter & param); + + void preprocess(Ptr prevPyramids[2], Ptr currPyramids[2], const std::vector & prevPoints, + std::vector & currPoints, const RLOFOpticalFlowParameter & param); + + inline bool isrobust(const RLOFOpticalFlowParameter & param) + { + return (param.normSigma0 != std::numeric_limits::max()) + && (param.normSigma1 != std::numeric_limits::max()); + } + inline std::vector get_norm(float sigma0, float sigma1) + { + std::vector result = { sigma0, sigma1, sigma0 / (sigma0 - sigma1), sigma0 * sigma1 }; + return result; + } + + + int buildOpticalFlowPyramidScale(InputArray _img, OutputArrayOfArrays pyramid, Size winSize, int maxLevel, bool withDerivatives, + int pyrBorder, int derivBorder, bool tryReuseInputImage, float levelScale[2]) + { + Mat img = _img.getMat(); + CV_Assert(img.depth() == CV_8U && winSize.width > 2 && winSize.height > 2); + int pyrstep = withDerivatives ? 
2 : 1; + + pyramid.create(1, (maxLevel + 1) * pyrstep, 0 /*type*/, -1, true); + + int derivType = CV_MAKETYPE(DataType::depth, img.channels() * 2); + + //level 0 + bool lvl0IsSet = false; + if (tryReuseInputImage && img.isSubmatrix() && (pyrBorder & BORDER_ISOLATED) == 0) + { + Size wholeSize; + Point ofs; + img.locateROI(wholeSize, ofs); + if (ofs.x >= winSize.width && ofs.y >= winSize.height + && ofs.x + img.cols + winSize.width <= wholeSize.width + && ofs.y + img.rows + winSize.height <= wholeSize.height) + { + pyramid.getMatRef(0) = img; + lvl0IsSet = true; + } + } + + if (!lvl0IsSet) + { + Mat& temp = pyramid.getMatRef(0); + + if (!temp.empty()) + temp.adjustROI(winSize.height, winSize.height, winSize.width, winSize.width); + if (temp.type() != img.type() || temp.cols != winSize.width * 2 + img.cols || temp.rows != winSize.height * 2 + img.rows) + temp.create(img.rows + winSize.height * 2, img.cols + winSize.width * 2, img.type()); + + if (pyrBorder == BORDER_TRANSPARENT) + img.copyTo(temp(Rect(winSize.width, winSize.height, img.cols, img.rows))); + else + copyMakeBorder(img, temp, winSize.height, winSize.height, winSize.width, winSize.width, pyrBorder); + temp.adjustROI(-winSize.height, -winSize.height, -winSize.width, -winSize.width); + } + + Size sz = img.size(); + Mat prevLevel = pyramid.getMatRef(0); + Mat thisLevel = prevLevel; + + for (int level = 0; level <= maxLevel; ++level) + { + if (level != 0) + { + Mat& temp = pyramid.getMatRef(level * pyrstep); + + if (!temp.empty()) + temp.adjustROI(winSize.height, winSize.height, winSize.width, winSize.width); + if (temp.type() != img.type() || temp.cols != winSize.width * 2 + sz.width || temp.rows != winSize.height * 2 + sz.height) + temp.create(sz.height + winSize.height * 2, sz.width + winSize.width * 2, img.type()); + + thisLevel = temp(Rect(winSize.width, winSize.height, sz.width, sz.height)); + pyrDown(prevLevel, thisLevel, sz); + + if (pyrBorder != BORDER_TRANSPARENT) + copyMakeBorder(thisLevel, temp, 
winSize.height, winSize.height, winSize.width, winSize.width, pyrBorder | BORDER_ISOLATED); + temp.adjustROI(-winSize.height, -winSize.height, -winSize.width, -winSize.width); + } + + if (withDerivatives) + { + Mat& deriv = pyramid.getMatRef(level * pyrstep + 1); + + if (!deriv.empty()) + deriv.adjustROI(winSize.height, winSize.height, winSize.width, winSize.width); + if (deriv.type() != derivType || deriv.cols != winSize.width * 2 + sz.width || deriv.rows != winSize.height * 2 + sz.height) + deriv.create(sz.height + winSize.height * 2, sz.width + winSize.width * 2, derivType); + + Mat derivI = deriv(Rect(winSize.width, winSize.height, sz.width, sz.height)); + calcSharrDeriv(thisLevel, derivI); + + if (derivBorder != BORDER_TRANSPARENT) + copyMakeBorder(derivI, deriv, winSize.height, winSize.height, winSize.width, winSize.width, derivBorder | BORDER_ISOLATED); + deriv.adjustROI(-winSize.height, -winSize.height, -winSize.width, -winSize.width); + } + + sz = Size(static_cast((sz.width + 1) / levelScale[0]), + static_cast((sz.height + 1) / levelScale[1])); + if (sz.width <= winSize.width || sz.height <= winSize.height) + { + pyramid.create(1, (level + 1) * pyrstep, 0 /*type*/, -1, true);//check this + return level; + } + + prevLevel = thisLevel; + } + + return maxLevel; + } + int CImageBuffer::buildPyramid(cv::Size winSize, int maxLevel, float levelScale[2]) + { + if (m_Overwrite == false) + return m_maxLevel; + m_maxLevel = buildOpticalFlowPyramidScale(m_Image, m_ImagePyramid, winSize, maxLevel, false, 4, 0, true, levelScale); + return m_maxLevel; + } + + void calcLocalOpticalFlowCore( + Ptr prevPyramids[2], + Ptr currPyramids[2], + InputArray _prevPts, + InputOutputArray _nextPts, + const RLOFOpticalFlowParameter & param) + { + + bool useAdditionalRGB = param.supportRegionType == SR_CROSS; + int iWinSize = param.largeWinSize; + int winSizes[2] = { iWinSize, iWinSize }; + if (param.supportRegionType != SR_FIXED) + { + winSizes[0] = param.smallWinSize; + } + 
//cv::Size winSize(iWinSize, iWinSize); + cv::TermCriteria criteria(cv::TermCriteria::COUNT | cv::TermCriteria::EPS, param.maxIteration, 0.01); + std::vector rlofNorm = get_norm(param.normSigma0, param.normSigma1); + CV_Assert(winSizes[0] <= winSizes[1]); + + Mat prevPtsMat = _prevPts.getMat(); + const int derivDepth = DataType::depth; + + CV_Assert(param.maxLevel >= 0 && iWinSize > 2); + + int level = 0, npoints; + CV_Assert((npoints = prevPtsMat.checkVector(2, CV_32F, true)) >= 0); + + if (!(param.useInitialFlow)) + _nextPts.create(prevPtsMat.size(), prevPtsMat.type(), -1, true); + + Mat nextPtsMat = _nextPts.getMat(); + CV_Assert(nextPtsMat.checkVector(2, CV_32F, true) == npoints); + + const Point2f* prevPts = (const Point2f*)prevPtsMat.data; + Point2f* nextPts = (Point2f*)nextPtsMat.data; + std::vector status(npoints); + std::vector err(npoints); + std::vector gainPts(npoints); + + float levelScale[2] = { 2.f,2.f }; + + int maxLevel = prevPyramids[0]->buildPyramid(cv::Size(iWinSize, iWinSize), param.maxLevel, levelScale); + + maxLevel = currPyramids[0]->buildPyramid(cv::Size(iWinSize, iWinSize), maxLevel, levelScale); + if (useAdditionalRGB) + { + prevPyramids[1]->buildPyramid(cv::Size(iWinSize, iWinSize), maxLevel, levelScale); + currPyramids[1]->buildPyramid(cv::Size(iWinSize, iWinSize), maxLevel, levelScale); + } + + if ((criteria.type & TermCriteria::COUNT) == 0) + criteria.maxCount = 30; + else + criteria.maxCount = std::min(std::max(criteria.maxCount, 0), 100); + if ((criteria.type & TermCriteria::EPS) == 0) + criteria.epsilon = 0.001; + else + criteria.epsilon = std::min(std::max(criteria.epsilon, 0.), 10.); + criteria.epsilon *= criteria.epsilon; + + // dI/dx ~ Ix, dI/dy ~ Iy + Mat derivIBuf; + derivIBuf.create(prevPyramids[0]->m_ImagePyramid[0].rows + iWinSize * 2, prevPyramids[0]->m_ImagePyramid[0].cols + iWinSize * 2, CV_MAKETYPE(derivDepth, prevPyramids[0]->m_ImagePyramid[0].channels() * 2)); + + for (level = maxLevel; level >= 0; level--) + { + Mat 
derivI; + Size imgSize = prevPyramids[0]->getImage(level).size(); + Mat _derivI(imgSize.height + iWinSize * 2, imgSize.width + iWinSize * 2, derivIBuf.type(), derivIBuf.data); + derivI = _derivI(Rect(iWinSize, iWinSize, imgSize.width, imgSize.height)); + calcSharrDeriv(prevPyramids[0]->getImage(level), derivI); + copyMakeBorder(derivI, _derivI, iWinSize, iWinSize, iWinSize, iWinSize, BORDER_CONSTANT | BORDER_ISOLATED); + + cv::Mat tRGBPrevPyr; + cv::Mat tRGBNextPyr; + if (useAdditionalRGB) + { + tRGBPrevPyr = prevPyramids[1]->getImage(level); + tRGBNextPyr = prevPyramids[1]->getImage(level); + + prevPyramids[1]->m_Overwrite = false; + currPyramids[1]->m_Overwrite = false; + } + + cv::Mat prevImage = prevPyramids[0]->getImage(level); + cv::Mat currImage = currPyramids[0]->getImage(level); + + // apply plk like tracker + if (isrobust(param) == false) + { + if (param.useIlluminationModel) + { + cv::parallel_for_(cv::Range(0, npoints), + plk::radial::TrackerInvoker( + prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, + prevPts, nextPts, &status[0], &err[0], &gainPts[0], + level, maxLevel, winSizes, + param.maxIteration, + param.useInitialFlow, + param.supportRegionType, + param.minEigenValue, + param.crossSegmentationThreshold)); + } + else + { + if (param.solverType == SolverType::ST_STANDART) + { + cv::parallel_for_(cv::Range(0, npoints), + plk::ica::TrackerInvoker( + prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, + prevPts, nextPts, &status[0], &err[0], + level, maxLevel, winSizes, + param.maxIteration, + param.useInitialFlow, + param.supportRegionType, + param.crossSegmentationThreshold, + param.minEigenValue)); + } + else + { + cv::parallel_for_(cv::Range(0, npoints), + beplk::ica::TrackerInvoker(prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, + prevPts, nextPts, &status[0], &err[0], + level, maxLevel, winSizes, + param.maxIteration, + param.useInitialFlow, + param.supportRegionType, + param.crossSegmentationThreshold, + 
param.minEigenValue)); + } + } + } + // for robust models + else + { + if (param.useIlluminationModel) + { + if (param.solverType == SolverType::ST_STANDART) + { + cv::parallel_for_(cv::Range(0, npoints), + rlof::radial::TrackerInvoker( + prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, + prevPts, nextPts, &status[0], &err[0], &gainPts[0], + level, maxLevel, winSizes, + param.maxIteration, + param.useInitialFlow, + param.supportRegionType, + rlofNorm, + param.minEigenValue, + param.crossSegmentationThreshold)); + } + else + { + cv::parallel_for_(cv::Range(0, npoints), + berlof::radial::TrackerInvoker(prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, + prevPts, nextPts, &status[0], &err[0], &gainPts[0], + level, maxLevel, winSizes, + param.maxIteration, + param.useInitialFlow, + param.supportRegionType, + param.crossSegmentationThreshold, + rlofNorm, + param.minEigenValue)); + } + } + else + { + + if (param.solverType == SolverType::ST_STANDART) + { + cv::parallel_for_(cv::Range(0, npoints), + rlof::ica::TrackerInvoker( + prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, + prevPts, nextPts, &status[0], &err[0], + level, maxLevel, winSizes, + param.maxIteration, + param.useInitialFlow, + param.supportRegionType, + rlofNorm, + param.minEigenValue, + param.crossSegmentationThreshold)); + } + else + { + cv::parallel_for_(cv::Range(0, npoints), + berlof::ica::TrackerInvoker(prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, + prevPts, nextPts, &status[0], &err[0], + level, maxLevel, winSizes, + param.maxIteration, + param.useInitialFlow, + param.supportRegionType, + param.crossSegmentationThreshold, + rlofNorm, + param.minEigenValue)); + } + + } + } + + prevPyramids[0]->m_Overwrite = true; + currPyramids[0]->m_Overwrite = true; + } + } + + void preprocess(Ptr prevPyramids[2], + Ptr currPyramids[2], + const std::vector & prevPoints, + std::vector & currPoints, + const RLOFOpticalFlowParameter & param) + { + cv::Mat mask, homography; + if 
(param.useGlobalMotionPrior == false) + return; + + currPoints.resize(prevPoints.size()); + + RLOFOpticalFlowParameter gmeTrackerParam = param; + gmeTrackerParam.useGlobalMotionPrior = false; + gmeTrackerParam.largeWinSize = 17; + // use none robust tracker for global motion estimation since it is faster + gmeTrackerParam.normSigma0 = std::numeric_limits::max(); + gmeTrackerParam.maxIteration = MAX(15, param.maxIteration); + gmeTrackerParam.minEigenValue = 0.000001f; + + std::vector gmPrevPoints, gmCurrPoints; + + // Initialize point grid + int stepr = prevPyramids[0]->m_Image.rows / 30; + int stepc = prevPyramids[0]->m_Image.rows / 40; + for (int r = stepr / 2; r < prevPyramids[0]->m_Image.rows; r += stepr) + { + for (int c = stepc / 2; c < prevPyramids[0]->m_Image.cols; c += stepc) + { + gmPrevPoints.push_back(cv::Point2f(static_cast(c), static_cast(r))); + } + } + + // perform motion estimation + calcLocalOpticalFlowCore(prevPyramids, currPyramids, gmPrevPoints, gmCurrPoints, gmeTrackerParam); + + cv::Mat prevPointsMat(static_cast(gmPrevPoints.size()), 1, CV_32FC2); + cv::Mat currPointsMat(static_cast(gmPrevPoints.size()), 1, CV_32FC2); + cv::Mat distMat(static_cast(gmPrevPoints.size()), 1, CV_32FC1); + + // Forward backward confidence to estimate optimal ransac reprojection error + int noPoints = 0; + for (unsigned int n = 0; n < gmPrevPoints.size(); n++) + { + cv::Point2f flow = gmCurrPoints[n] - gmPrevPoints[n]; + prevPointsMat.at(noPoints) = gmPrevPoints[n]; + currPointsMat.at(noPoints) = gmCurrPoints[n]; + distMat.at(noPoints) = flow.x * flow.x + flow.y* flow.y; + if (isnan(distMat.at(noPoints)) == false) + noPoints++; + } + + float medianDist = (param.globalMotionRansacThreshold == 0) ? 
1.f : + quickselect(distMat, static_cast(noPoints * static_cast(param.globalMotionRansacThreshold) / 100.f)); + medianDist = sqrt(medianDist); + + if (noPoints < 8) + return; + + cv::findHomography(prevPointsMat(cv::Rect(0, 0, 1, noPoints)), currPointsMat(cv::Rect(0, 0, 1, noPoints)), cv::RANSAC, medianDist, mask).convertTo(homography, CV_32FC1); + + if (homography.empty()) + return; + + cv::perspectiveTransform(prevPoints, currPoints, homography); + } + + void calcLocalOpticalFlow( + const Mat prevImage, + const Mat currImage, + Ptr prevPyramids[2], + Ptr currPyramids[2], + const std::vector & prevPoints, + std::vector & currPoints, + const RLOFOpticalFlowParameter & param) + { + if (prevImage.empty() == false && currImage.empty()== false) + { + prevPyramids[0]->m_Overwrite = true; + currPyramids[0]->m_Overwrite = true; + prevPyramids[1]->m_Overwrite = true; + currPyramids[1]->m_Overwrite = true; + if (prevImage.type() == CV_8UC3) + { + prevPyramids[0]->setGrayFromRGB(prevImage); + currPyramids[0]->setGrayFromRGB(currImage); + prevPyramids[1]->setImage(prevImage); + currPyramids[1]->setImage(currImage); + + if (param.supportRegionType == SR_CROSS) + { + prevPyramids[1]->setBlurFromRGB(prevImage); + currPyramids[1]->setBlurFromRGB(currImage); + } + } + else + { + prevPyramids[0]->setImage(prevImage); + currPyramids[0]->setImage(currImage); + } + } + preprocess(prevPyramids, currPyramids, prevPoints, currPoints, param); + RLOFOpticalFlowParameter internParam = param; + if (param.useGlobalMotionPrior == true) + internParam.useInitialFlow = true; + calcLocalOpticalFlowCore(prevPyramids, currPyramids, prevPoints, currPoints, internParam); + } +} } - \ No newline at end of file + \ No newline at end of file diff --git a/modules/optflow/src/rlof/rlof_localflow.h b/modules/optflow/src/rlof/rlof_localflow.h index 6f6a0fc5e2a..bd59721f277 100644 --- a/modules/optflow/src/rlof/rlof_localflow.h +++ b/modules/optflow/src/rlof/rlof_localflow.h @@ -1,44 +1,6 @@ 
-/*M/////////////////////////////////////////////////////////////////////////////////////// - // - // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. - // - // By downloading, copying, installing or using the software you agree to this license. - // If you do not agree to this license, do not download, install, - // copy or use the software. - // - // - // License Agreement - // For Open Source Computer Vision Library - // - // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. - // Copyright (C) 2009, Willow Garage Inc., all rights reserved. - // Third party copyrights are property of their respective owners. - // - // Redistribution and use in source and binary forms, with or without modification, - // are permitted provided that the following conditions are met: - // - // * Redistribution's of source code must retain the above copyright notice, - // this list of conditions and the following disclaimer. - // - // * Redistribution's in binary form must reproduce the above copyright notice, - // this list of conditions and the following disclaimer in the documentation - // and/or other materials provided with the distribution. - // - // * The name of the copyright holders may not be used to endorse or promote products - // derived from this software without specific prior written permission. - // - // This software is provided by the copyright holders and contributors "as is" and - // any express or implied warranties, including, but not limited to, the implied - // warranties of merchantability and fitness for a particular purpose are disclaimed. 
- // In no event shall the Intel Corporation or contributors be liable for any direct, - // indirect, incidental, special, exemplary, or consequential damages - // (including, but not limited to, procurement of substitute goods or services; - // loss of use, data, or profits; or business interruption) however caused - // and on any theory of liability, whether in contract, strict liability, - // or tort (including negligence or otherwise) arising in any way out of - // the use of this software, even if advised of the possibility of such damage. - // - //M*/ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. #ifndef _RLOF_LOCALFLOW_H_ #define _RLOF_LOCALFLOW_H_ #include @@ -53,106 +15,106 @@ using namespace cv; template T quickselect(const Mat & inp, int k) { - unsigned long i; - unsigned long ir; - unsigned long j; - unsigned long l; - unsigned long mid; - Mat values = inp.clone(); - T a; + unsigned long i; + unsigned long ir; + unsigned long j; + unsigned long l; + unsigned long mid; + Mat values = inp.clone(); + T a; - l = 0; - ir = MAX(values.rows, values.cols) - 1; - while (true) - { - if (ir <= l + 1) - { - if (ir == l + 1 && values.at(ir) < values.at(l)) - std::swap(values.at(l), values.at(ir)); - return values.at(k); - } - else - { - mid = (l + ir) >> 1; - std::swap(values.at(mid), values.at(l+1)); - if (values.at(l) > values.at(ir)) - std::swap(values.at(l), values.at(ir)); - if (values.at(l+1) > values.at(ir)) - std::swap(values.at(l+1), values.at(ir)); - if (values.at(l) > values.at(l+1)) - std::swap(values.at(l), values.at(l+1)); - i = l + 1; - j = ir; - a = values.at(l+1); - while (true) - { - do - { - i++; - } - while (values.at(i) < a); - do - { - j--; - } - while (values.at(j) > a); - if (j < i) break; - std::swap(values.at(i), values.at(j)); - } - values.at(l+1) = values.at(j); - values.at(j) = a; - if (j 
>= static_cast(k)) ir = j - 1; - if (j <= static_cast(k)) l = i; - } - } + l = 0; + ir = MAX(values.rows, values.cols) - 1; + while (true) + { + if (ir <= l + 1) + { + if (ir == l + 1 && values.at(ir) < values.at(l)) + std::swap(values.at(l), values.at(ir)); + return values.at(k); + } + else + { + mid = (l + ir) >> 1; + std::swap(values.at(mid), values.at(l+1)); + if (values.at(l) > values.at(ir)) + std::swap(values.at(l), values.at(ir)); + if (values.at(l+1) > values.at(ir)) + std::swap(values.at(l+1), values.at(ir)); + if (values.at(l) > values.at(l+1)) + std::swap(values.at(l), values.at(l+1)); + i = l + 1; + j = ir; + a = values.at(l+1); + while (true) + { + do + { + i++; + } + while (values.at(i) < a); + do + { + j--; + } + while (values.at(j) > a); + if (j < i) break; + std::swap(values.at(i), values.at(j)); + } + values.at(l+1) = values.at(j); + values.at(j) = a; + if (j >= static_cast(k)) ir = j - 1; + if (j <= static_cast(k)) l = i; + } + } } namespace cv { - namespace optflow - { - class CImageBuffer - { - public: - CImageBuffer() - : m_Overwrite(true) - {}; - void setGrayFromRGB(const cv::Mat & inp) - { - if(m_Overwrite) - cv::cvtColor(inp, m_Image, cv::COLOR_BGR2GRAY); - } - void setImage(const cv::Mat & inp) - { - if(m_Overwrite) - inp.copyTo(m_Image); - } - void setBlurFromRGB(const cv::Mat & inp) - { - //cv::medianBlur(constNextImage, blurNextImg, 7); - if(m_Overwrite) - cv::GaussianBlur(inp, m_BlurredImage, cv::Size(7,7), -1); - } - - int buildPyramid(cv::Size winSize, int maxLevel, float levelScale[2]); - cv::Mat & getImage(int level) {return m_ImagePyramid[level];} +namespace optflow +{ +class CImageBuffer +{ +public: + CImageBuffer() + : m_Overwrite(true) + {}; + void setGrayFromRGB(const cv::Mat & inp) + { + if(m_Overwrite) + cv::cvtColor(inp, m_Image, cv::COLOR_BGR2GRAY); + } + void setImage(const cv::Mat & inp) + { + if(m_Overwrite) + inp.copyTo(m_Image); + } + void setBlurFromRGB(const cv::Mat & inp) + { + //cv::medianBlur(constNextImage, 
blurNextImg, 7); + if(m_Overwrite) + cv::GaussianBlur(inp, m_BlurredImage, cv::Size(7,7), -1); + } + + int buildPyramid(cv::Size winSize, int maxLevel, float levelScale[2]); + cv::Mat & getImage(int level) {return m_ImagePyramid[level];} + + std::vector m_ImagePyramid; + cv::Mat m_BlurredImage; + cv::Mat m_Image; + std::vector m_CrossPyramid; + int m_maxLevel; + bool m_Overwrite; +}; - std::vector m_ImagePyramid; - cv::Mat m_BlurredImage; - cv::Mat m_Image; - std::vector m_CrossPyramid; - int m_maxLevel; - bool m_Overwrite; - }; - - void calcLocalOpticalFlow( - const Mat prevImage, - const Mat currImage, - Ptr prevPyramids[2], - Ptr currPyramids[2], - const std::vector & prevPoints, - std::vector & currPoints, - const RLOFOpticalFlowParameter & param); +void calcLocalOpticalFlow( + const Mat prevImage, + const Mat currImage, + Ptr prevPyramids[2], + Ptr currPyramids[2], + const std::vector & prevPoints, + std::vector & currPoints, + const RLOFOpticalFlowParameter & param); } } #endif \ No newline at end of file diff --git a/modules/optflow/src/rlofflow.cpp b/modules/optflow/src/rlofflow.cpp index 1bdc45a0221..d7309c15210 100644 --- a/modules/optflow/src/rlofflow.cpp +++ b/modules/optflow/src/rlofflow.cpp @@ -1,385 +1,345 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// - // - // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. - // - // By downloading, copying, installing or using the software you agree to this license. - // If you do not agree to this license, do not download, install, - // copy or use the software. - // - // - // License Agreement - // For Open Source Computer Vision Library - // - // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. - // Copyright (C) 2009, Willow Garage Inc., all rights reserved. - // Third party copyrights are property of their respective owners. 
- // - // Redistribution and use in source and binary forms, with or without modification, - // are permitted provided that the following conditions are met: - // - // * Redistribution's of source code must retain the above copyright notice, - // this list of conditions and the following disclaimer. - // - // * Redistribution's in binary form must reproduce the above copyright notice, - // this list of conditions and the following disclaimer in the documentation - // and/or other materials provided with the distribution. - // - // * The name of the copyright holders may not be used to endorse or promote products - // derived from this software without specific prior written permission. - // - // This software is provided by the copyright holders and contributors "as is" and - // any express or implied warranties, including, but not limited to, the implied - // warranties of merchantability and fitness for a particular purpose are disclaimed. - // In no event shall the Intel Corporation or contributors be liable for any direct, - // indirect, incidental, special, exemplary, or consequential damages - // (including, but not limited to, procurement of substitute goods or services; - // loss of use, data, or profits; or business interruption) however caused - // and on any theory of liability, whether in contract, strict liability, - // or tort (including negligence or otherwise) arising in any way out of - // the use of this software, even if advised of the possibility of such damage. - // - //M*/ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
#include "precomp.hpp" #include "rlof/rlof_localflow.h" #include "rlof/geo_interpolation.hpp" #include "opencv2/ximgproc.hpp" namespace cv { - namespace optflow - { - - class DenseOpticalFlowRLOFImpl : public DenseRLOFOpticalFlow - { - public: - DenseOpticalFlowRLOFImpl() - : forwardBackwardThreshold(1.f) - , gridStep(6, 6) - , interp_type(InterpolationType::INTERP_GEO) - , k(128) - , sigma(0.05f) - , lambda(999.f) - , fgs_lambda(500.0f) - , fgs_sigma(1.5f) - , use_post_proc(true) - - { - prevPyramid[0] = cv::Ptr< CImageBuffer>(new CImageBuffer); - prevPyramid[1] = cv::Ptr< CImageBuffer>(new CImageBuffer); - currPyramid[0] = cv::Ptr< CImageBuffer>(new CImageBuffer); - currPyramid[1] = cv::Ptr< CImageBuffer>(new CImageBuffer); - } - virtual void setRLOFOpticalFlowParameter(RLOFOpticalFlowParameter val) CV_OVERRIDE { param = val; } - virtual RLOFOpticalFlowParameter getRLOFOpticalFlowParameter() const CV_OVERRIDE { return param; } - - virtual float getForwardBackward() const CV_OVERRIDE { return forwardBackwardThreshold; } - virtual void setForwardBackward(float val) CV_OVERRIDE { forwardBackwardThreshold = val; } - - virtual void setInterpolation(InterpolationType val) CV_OVERRIDE { interp_type = val; } - virtual InterpolationType getInterpolation() const CV_OVERRIDE { return interp_type; } - - virtual Size getGridStep() const CV_OVERRIDE { return gridStep; } - virtual void setGridStep(Size val) CV_OVERRIDE { gridStep = val; } - - virtual int getEPICK() const CV_OVERRIDE { return k; } - virtual void setEPICK(int val) CV_OVERRIDE { k = val; } - - virtual float getEPICSigma() const CV_OVERRIDE { return sigma; } - virtual void setEPICSigma(float val) CV_OVERRIDE { sigma = val; } - - virtual float getEPICLambda() const CV_OVERRIDE { return lambda; } - virtual void setEPICLambda(float val) CV_OVERRIDE { lambda = val; } - - virtual float getFgsLambda() const CV_OVERRIDE { return fgs_lambda; } - virtual void setFgsLambda(float val) CV_OVERRIDE { fgs_lambda = val; } - - 
virtual float getFgsSigma() const CV_OVERRIDE { return fgs_sigma; } - virtual void setFgsSigma(float val) CV_OVERRIDE { fgs_sigma = val; } - - virtual bool getUsePostProc() const CV_OVERRIDE { return use_post_proc; } - virtual void setUsePostProc(bool val) CV_OVERRIDE { use_post_proc = val; } - - virtual void calc(InputArray I0, InputArray I1, InputOutputArray flow) CV_OVERRIDE - { - CV_Assert(!I0.empty() && I0.depth() == CV_8U && (I0.channels() == 3 || I0.channels() == 1)); - CV_Assert(!I1.empty() && I1.depth() == CV_8U && (I1.channels() == 3 || I1.channels() == 1)); - CV_Assert(I0.sameSize(I1)); - if (param.supportRegionType == RLOFOpticalFlowParameter::SR_CROSS) - CV_Assert( I0.channels() == 3 && I1.channels() == 3); - CV_Assert(interp_type == InterpolationType::INTERP_EPIC || interp_type == InterpolationType::INTERP_GEO); - - Mat prevImage = I0.getMat(); - Mat currImage = I1.getMat(); - int noPoints = prevImage.cols * prevImage.rows; - std::vector prevPoints(noPoints); - std::vector currPoints, refPoints; - noPoints = 0; - cv::Size grid_h = gridStep / 2; - for (int r = grid_h.height; r < prevImage.rows; r += gridStep.height) - { - for (int c = grid_h.width; c < prevImage.cols; c += gridStep.width) - { - prevPoints[noPoints++] = cv::Point2f(static_cast(c), static_cast(r)); - } - } - prevPoints.erase(prevPoints.begin() + noPoints, prevPoints.end()); - currPoints.resize(prevPoints.size()); - - calcLocalOpticalFlow(prevImage, currImage, prevPyramid, currPyramid, prevPoints, currPoints, param); - - flow.create(prevImage.size(), CV_32FC2); - Mat dense_flow = flow.getMat(); - - std::vector filtered_prevPoints; - std::vector filtered_currPoints; - if (gridStep == cv::Size(1, 1) && forwardBackwardThreshold <= 0) - { - for (unsigned int n = 0; n < prevPoints.size(); n++) - { - dense_flow.at(prevPoints[n]) = currPoints[n] - prevPoints[n]; - } - return; - } - if (forwardBackwardThreshold > 0) - { - // reuse image pyramids - calcLocalOpticalFlow(currImage, prevImage, 
currPyramid, prevPyramid, currPoints, refPoints, param); - - filtered_prevPoints.resize(prevPoints.size()); - filtered_currPoints.resize(prevPoints.size()); - float sqrForwardBackwardThreshold = forwardBackwardThreshold * forwardBackwardThreshold; - noPoints = 0; - for (unsigned int r = 0; r < refPoints.size(); r++) - { - Point2f diff = refPoints[r] - prevPoints[r]; - if (diff.x * diff.x + diff.y * diff.y < sqrForwardBackwardThreshold) - { - filtered_prevPoints[noPoints] = prevPoints[r]; - filtered_currPoints[noPoints++] = currPoints[r]; - } - } - - filtered_prevPoints.erase(filtered_prevPoints.begin() + noPoints, filtered_prevPoints.end()); - filtered_currPoints.erase(filtered_currPoints.begin() + noPoints, filtered_currPoints.end()); - - } - else - { - filtered_prevPoints = prevPoints; - filtered_currPoints = currPoints; - } - - if (interp_type == InterpolationType::INTERP_EPIC) - { - Ptr gd = ximgproc::createEdgeAwareInterpolator(); - gd->setK(k); - gd->setSigma(sigma); - gd->setLambda(lambda); - gd->setUsePostProcessing(false); - gd->interpolate(prevImage, filtered_prevPoints, currImage, filtered_currPoints, dense_flow); - } - else - { - Mat blurredPrevImage, blurredCurrImage; - GaussianBlur(prevImage, blurredPrevImage, cv::Size(5, 5), -1); - std::vector status(filtered_currPoints.size(), 1); - interpolate_irregular_nn_raster(filtered_prevPoints, filtered_currPoints, status, blurredPrevImage).copyTo(dense_flow); - std::vector vecMats; - std::vector vecMats2(2); - cv::split(dense_flow, vecMats); - cv::bilateralFilter(vecMats[0], vecMats2[0], 5, 2, 20); - cv::bilateralFilter(vecMats[1], vecMats2[1], 5, 2, 20); - cv::merge(vecMats2, dense_flow); - } - if (use_post_proc) - { - ximgproc::fastGlobalSmootherFilter(prevImage, flow, flow, fgs_lambda, fgs_sigma); - } - } - - virtual void collectGarbage() CV_OVERRIDE - { - prevPyramid[0].release(); - prevPyramid[1].release(); - currPyramid[0].release(); - currPyramid[1].release(); - } - - protected: - 
RLOFOpticalFlowParameter param; - float forwardBackwardThreshold; - Ptr prevPyramid[2]; - Ptr currPyramid[2]; - cv::Size gridStep; - InterpolationType interp_type; - int k; - float sigma; - float lambda; - float fgs_lambda; - float fgs_sigma; - bool use_post_proc; - }; - - Ptr DenseRLOFOpticalFlow::create( - const RLOFOpticalFlowParameter & rlofParam, - float forwardBackwardThreshold, - cv::Size gridStep, - InterpolationType interp_type, - int epicK, - float epicSigma, - float epicLambda, - float fgs_lambda, - float fgs_sigma) - { - Ptr algo = makePtr(); - algo->setRLOFOpticalFlowParameter(rlofParam); - algo->setForwardBackward(forwardBackwardThreshold); - algo->setGridStep(gridStep); - algo->setInterpolation(interp_type); - algo->setEPICK(epicK); - algo->setEPICSigma(epicSigma); - algo->setEPICLambda(epicLambda); - algo->setFgsLambda(fgs_lambda); - algo->setFgsSigma(fgs_sigma); - return algo; - } - - class SparseRLOFOpticalFlowImpl : public SparseRLOFOpticalFlow - { - public: - SparseRLOFOpticalFlowImpl() - : forwardBackwardThreshold(1.f) - { - prevPyramid[0] = cv::Ptr< CImageBuffer>(new CImageBuffer); - prevPyramid[1] = cv::Ptr< CImageBuffer>(new CImageBuffer); - currPyramid[0] = cv::Ptr< CImageBuffer>(new CImageBuffer); - currPyramid[1] = cv::Ptr< CImageBuffer>(new CImageBuffer); - } - virtual void setRLOFOpticalFlowParameter(RLOFOpticalFlowParameter val) CV_OVERRIDE { param = val; } - virtual RLOFOpticalFlowParameter getRLOFOpticalFlowParameter() const CV_OVERRIDE { return param; } - - virtual float getForwardBackward() const CV_OVERRIDE { return forwardBackwardThreshold; } - virtual void setForwardBackward(float val) CV_OVERRIDE { forwardBackwardThreshold = val; } - - virtual void calc(InputArray prevImg, InputArray nextImg, - InputArray prevPts, InputOutputArray nextPts, - OutputArray status, - OutputArray err) CV_OVERRIDE - { - CV_Assert(!prevImg.empty() && prevImg.depth() == CV_8U && (prevImg.channels() == 3 || prevImg.channels() == 1)); - 
CV_Assert(!nextImg.empty() && nextImg.depth() == CV_8U && (nextImg.channels() == 3 || nextImg.channels() == 1)); - CV_Assert(prevImg.sameSize(nextImg)); - - //CV_Assert(param.supportRegionType == RLOFOpticalFlowParameter::SR_CROSS && I0.channels() == 3 && I1.channels() == 3); - - Mat prevImage = prevImg.getMat(); - Mat nextImage = nextImg.getMat(); - Mat prevPtsMat = prevPts.getMat(); - - if (param.useInitialFlow == false) - nextPts.create(prevPtsMat.size(), prevPtsMat.type(), -1, true); - - int npoints = 0; - CV_Assert((npoints = prevPtsMat.checkVector(2, CV_32F, true)) >= 0); - - if (npoints == 0) - { - nextPts.release(); - status.release(); - err.release(); - return; - } - - Mat nextPtsMat = nextPts.getMat(); - CV_Assert(nextPtsMat.checkVector(2, CV_32F, true) == npoints); - std::vector prevPoints(npoints), nextPoints(npoints), refPoints; - prevPtsMat.copyTo(cv::Mat(1, npoints, CV_32FC2, &prevPoints[0])); - if (param.useInitialFlow ) - nextPtsMat.copyTo(cv::Mat(1, nextPtsMat.cols, CV_32FC2, &nextPoints[0])); - - cv::Mat statusMat; - cv::Mat errorMat; - if (status.needed() || forwardBackwardThreshold > 0) - { - status.create((int)npoints, 1, CV_8U, -1, true); - statusMat = status.getMat(); - statusMat.setTo(1); - } - - if (err.needed() || forwardBackwardThreshold > 0) - { - err.create((int)npoints, 1, CV_32F, -1, true); - errorMat = err.getMat(); - errorMat.setTo(0); - } - - calcLocalOpticalFlow(prevImage, nextImage, prevPyramid, currPyramid, prevPoints, nextPoints, param); - cv::Mat(1,npoints , CV_32FC2, &nextPoints[0]).copyTo(nextPtsMat); - if (forwardBackwardThreshold > 0) - { - // reuse image pyramids - calcLocalOpticalFlow(nextImage, prevImage, currPyramid, prevPyramid, nextPoints, refPoints, param); - } - for (unsigned int r = 0; r < refPoints.size(); r++) - { - Point2f diff = refPoints[r] - prevPoints[r]; - errorMat.at(r) = sqrt(diff.x * diff.x + diff.y * diff.y); - if (errorMat.at(r) <= forwardBackwardThreshold) - statusMat.at(r) = 0; - } - - } - - 
protected: - RLOFOpticalFlowParameter param; - float forwardBackwardThreshold; - Ptr prevPyramid[2]; - Ptr currPyramid[2]; - }; - - Ptr SparseRLOFOpticalFlow::create( - const RLOFOpticalFlowParameter & rlofParam, - float forwardBackwardThreshold) - { - Ptr algo = makePtr(); - algo->setRLOFOpticalFlowParameter(rlofParam); - algo->setForwardBackward(forwardBackwardThreshold); - return algo; - } - - void calcOpticalFlowDenseRLOF(InputArray I0, InputArray I1, InputOutputArray flow, - RLOFOpticalFlowParameter rlofParam , - float forewardBackwardThreshold, Size gridStep, - DenseRLOFOpticalFlow::InterpolationType interp_type, - int epicK, float epicSigma, - bool use_post_proc, float fgsLambda, float fgsSigma) - { - Ptr algo = DenseRLOFOpticalFlow::create( - rlofParam, forewardBackwardThreshold, gridStep, interp_type, - epicK, epicSigma, use_post_proc, fgsLambda, fgsSigma); - algo->calc(I0, I1, flow); - algo->collectGarbage(); - } - - void calcOpticalFlowSparseRLOF(InputArray prevImg, InputArray nextImg, - InputArray prevPts, InputOutputArray nextPts, - OutputArray status, OutputArray err, - RLOFOpticalFlowParameter rlofParam, - float forewardBackwardThreshold) - { - Ptr algo = SparseRLOFOpticalFlow::create( - rlofParam, forewardBackwardThreshold); - algo->calc(prevImg, nextImg, prevPts, nextPts, status, err); - } - Ptr createOptFlow_DenseRLOF() - { - return DenseRLOFOpticalFlow::create(); - } - - Ptr createOptFlow_SparseRLOF() - { - return SparseRLOFOpticalFlow::create(); - } - } +namespace optflow +{ + + class DenseOpticalFlowRLOFImpl : public DenseRLOFOpticalFlow + { + public: + DenseOpticalFlowRLOFImpl() + : forwardBackwardThreshold(1.f) + , gridStep(6, 6) + , interp_type(InterpolationType::INTERP_GEO) + , k(128) + , sigma(0.05f) + , lambda(999.f) + , fgs_lambda(500.0f) + , fgs_sigma(1.5f) + , use_post_proc(true) + + { + prevPyramid[0] = cv::Ptr< CImageBuffer>(new CImageBuffer); + prevPyramid[1] = cv::Ptr< CImageBuffer>(new CImageBuffer); + currPyramid[0] = cv::Ptr< 
CImageBuffer>(new CImageBuffer); + currPyramid[1] = cv::Ptr< CImageBuffer>(new CImageBuffer); + } + virtual void setRLOFOpticalFlowParameter(Ptr val) CV_OVERRIDE { }//param = *val; } + virtual Ptr getRLOFOpticalFlowParameter() const CV_OVERRIDE { return Ptr(); } + + virtual float getForwardBackward() const CV_OVERRIDE { return forwardBackwardThreshold; } + virtual void setForwardBackward(float val) CV_OVERRIDE { forwardBackwardThreshold = val; } + + virtual void setInterpolation(InterpolationType val) CV_OVERRIDE { interp_type = val; } + virtual InterpolationType getInterpolation() const CV_OVERRIDE { return interp_type; } + + virtual Size getGridStep() const CV_OVERRIDE { return gridStep; } + virtual void setGridStep(Size val) CV_OVERRIDE { gridStep = val; } + + virtual int getEPICK() const CV_OVERRIDE { return k; } + virtual void setEPICK(int val) CV_OVERRIDE { k = val; } + + virtual float getEPICSigma() const CV_OVERRIDE { return sigma; } + virtual void setEPICSigma(float val) CV_OVERRIDE { sigma = val; } + + virtual float getEPICLambda() const CV_OVERRIDE { return lambda; } + virtual void setEPICLambda(float val) CV_OVERRIDE { lambda = val; } + + virtual float getFgsLambda() const CV_OVERRIDE { return fgs_lambda; } + virtual void setFgsLambda(float val) CV_OVERRIDE { fgs_lambda = val; } + + virtual float getFgsSigma() const CV_OVERRIDE { return fgs_sigma; } + virtual void setFgsSigma(float val) CV_OVERRIDE { fgs_sigma = val; } + + virtual bool getUsePostProc() const CV_OVERRIDE { return use_post_proc; } + virtual void setUsePostProc(bool val) CV_OVERRIDE { use_post_proc = val; } + + virtual void calc(InputArray I0, InputArray I1, InputOutputArray flow) CV_OVERRIDE + { + CV_Assert(!I0.empty() && I0.depth() == CV_8U && (I0.channels() == 3 || I0.channels() == 1)); + CV_Assert(!I1.empty() && I1.depth() == CV_8U && (I1.channels() == 3 || I1.channels() == 1)); + CV_Assert(I0.sameSize(I1)); + if (param.supportRegionType == SR_CROSS) + CV_Assert( I0.channels() == 3 && 
I1.channels() == 3); + CV_Assert(interp_type == InterpolationType::INTERP_EPIC || interp_type == InterpolationType::INTERP_GEO); + + Mat prevImage = I0.getMat(); + Mat currImage = I1.getMat(); + int noPoints = prevImage.cols * prevImage.rows; + std::vector prevPoints(noPoints); + std::vector currPoints, refPoints; + noPoints = 0; + cv::Size grid_h = gridStep / 2; + for (int r = grid_h.height; r < prevImage.rows; r += gridStep.height) + { + for (int c = grid_h.width; c < prevImage.cols; c += gridStep.width) + { + prevPoints[noPoints++] = cv::Point2f(static_cast(c), static_cast(r)); + } + } + prevPoints.erase(prevPoints.begin() + noPoints, prevPoints.end()); + currPoints.resize(prevPoints.size()); + + calcLocalOpticalFlow(prevImage, currImage, prevPyramid, currPyramid, prevPoints, currPoints, param); + + flow.create(prevImage.size(), CV_32FC2); + Mat dense_flow = flow.getMat(); + + std::vector filtered_prevPoints; + std::vector filtered_currPoints; + if (gridStep == cv::Size(1, 1) && forwardBackwardThreshold <= 0) + { + for (unsigned int n = 0; n < prevPoints.size(); n++) + { + dense_flow.at(prevPoints[n]) = currPoints[n] - prevPoints[n]; + } + return; + } + if (forwardBackwardThreshold > 0) + { + // reuse image pyramids + calcLocalOpticalFlow(currImage, prevImage, currPyramid, prevPyramid, currPoints, refPoints, param); + + filtered_prevPoints.resize(prevPoints.size()); + filtered_currPoints.resize(prevPoints.size()); + float sqrForwardBackwardThreshold = forwardBackwardThreshold * forwardBackwardThreshold; + noPoints = 0; + for (unsigned int r = 0; r < refPoints.size(); r++) + { + Point2f diff = refPoints[r] - prevPoints[r]; + if (diff.x * diff.x + diff.y * diff.y < sqrForwardBackwardThreshold) + { + filtered_prevPoints[noPoints] = prevPoints[r]; + filtered_currPoints[noPoints++] = currPoints[r]; + } + } + + filtered_prevPoints.erase(filtered_prevPoints.begin() + noPoints, filtered_prevPoints.end()); + filtered_currPoints.erase(filtered_currPoints.begin() + 
noPoints, filtered_currPoints.end()); + + } + else + { + filtered_prevPoints = prevPoints; + filtered_currPoints = currPoints; + } + + if (interp_type == InterpolationType::INTERP_EPIC) + { + Ptr gd = ximgproc::createEdgeAwareInterpolator(); + gd->setK(k); + gd->setSigma(sigma); + gd->setLambda(lambda); + gd->setUsePostProcessing(false); + gd->interpolate(prevImage, filtered_prevPoints, currImage, filtered_currPoints, dense_flow); + } + else + { + Mat blurredPrevImage, blurredCurrImage; + GaussianBlur(prevImage, blurredPrevImage, cv::Size(5, 5), -1); + std::vector status(filtered_currPoints.size(), 1); + interpolate_irregular_nn_raster(filtered_prevPoints, filtered_currPoints, status, blurredPrevImage).copyTo(dense_flow); + std::vector vecMats; + std::vector vecMats2(2); + cv::split(dense_flow, vecMats); + cv::bilateralFilter(vecMats[0], vecMats2[0], 5, 2, 20); + cv::bilateralFilter(vecMats[1], vecMats2[1], 5, 2, 20); + cv::merge(vecMats2, dense_flow); + } + if (use_post_proc) + { + ximgproc::fastGlobalSmootherFilter(prevImage, flow, flow, fgs_lambda, fgs_sigma); + } + } + + virtual void collectGarbage() CV_OVERRIDE + { + prevPyramid[0].release(); + prevPyramid[1].release(); + currPyramid[0].release(); + currPyramid[1].release(); + } + + protected: + RLOFOpticalFlowParameter param; + float forwardBackwardThreshold; + Ptr prevPyramid[2]; + Ptr currPyramid[2]; + cv::Size gridStep; + InterpolationType interp_type; + int k; + float sigma; + float lambda; + float fgs_lambda; + float fgs_sigma; + bool use_post_proc; + }; + + Ptr DenseRLOFOpticalFlow::create( + Ptr rlofParam, + float forwardBackwardThreshold, + cv::Size gridStep, + InterpolationType interp_type, + int epicK, + float epicSigma, + float epicLambda, + float fgs_lambda, + float fgs_sigma) + { + Ptr algo = makePtr(); + algo->setRLOFOpticalFlowParameter(rlofParam); + algo->setForwardBackward(forwardBackwardThreshold); + algo->setGridStep(gridStep); + algo->setInterpolation(interp_type); + algo->setEPICK(epicK); 
+ algo->setEPICSigma(epicSigma); + algo->setEPICLambda(epicLambda); + algo->setFgsLambda(fgs_lambda); + algo->setFgsSigma(fgs_sigma); + return algo; + } + + class SparseRLOFOpticalFlowImpl : public SparseRLOFOpticalFlow + { + public: + SparseRLOFOpticalFlowImpl() + : forwardBackwardThreshold(1.f) + { + prevPyramid[0] = cv::Ptr< CImageBuffer>(new CImageBuffer); + prevPyramid[1] = cv::Ptr< CImageBuffer>(new CImageBuffer); + currPyramid[0] = cv::Ptr< CImageBuffer>(new CImageBuffer); + currPyramid[1] = cv::Ptr< CImageBuffer>(new CImageBuffer); + } + virtual void setRLOFOpticalFlowParameter(Ptr val) CV_OVERRIDE { } // param = *val; } + virtual Ptr getRLOFOpticalFlowParameter() const CV_OVERRIDE { return Ptr(); } + + virtual float getForwardBackward() const CV_OVERRIDE { return forwardBackwardThreshold; } + virtual void setForwardBackward(float val) CV_OVERRIDE { forwardBackwardThreshold = val; } + + virtual void calc(InputArray prevImg, InputArray nextImg, + InputArray prevPts, InputOutputArray nextPts, + OutputArray status, + OutputArray err) CV_OVERRIDE + { + CV_Assert(!prevImg.empty() && prevImg.depth() == CV_8U && (prevImg.channels() == 3 || prevImg.channels() == 1)); + CV_Assert(!nextImg.empty() && nextImg.depth() == CV_8U && (nextImg.channels() == 3 || nextImg.channels() == 1)); + CV_Assert(prevImg.sameSize(nextImg)); + + Mat prevImage = prevImg.getMat(); + Mat nextImage = nextImg.getMat(); + Mat prevPtsMat = prevPts.getMat(); + + if (param.useInitialFlow == false) + nextPts.create(prevPtsMat.size(), prevPtsMat.type(), -1, true); + + int npoints = 0; + CV_Assert((npoints = prevPtsMat.checkVector(2, CV_32F, true)) >= 0); + + if (npoints == 0) + { + nextPts.release(); + status.release(); + err.release(); + return; + } + + Mat nextPtsMat = nextPts.getMat(); + CV_Assert(nextPtsMat.checkVector(2, CV_32F, true) == npoints); + std::vector prevPoints(npoints), nextPoints(npoints), refPoints; + prevPtsMat.copyTo(cv::Mat(1, npoints, CV_32FC2, &prevPoints[0])); + if 
(param.useInitialFlow ) + nextPtsMat.copyTo(cv::Mat(1, nextPtsMat.cols, CV_32FC2, &nextPoints[0])); + + cv::Mat statusMat; + cv::Mat errorMat; + if (status.needed() || forwardBackwardThreshold > 0) + { + status.create((int)npoints, 1, CV_8U, -1, true); + statusMat = status.getMat(); + statusMat.setTo(1); + } + + if (err.needed() || forwardBackwardThreshold > 0) + { + err.create((int)npoints, 1, CV_32F, -1, true); + errorMat = err.getMat(); + errorMat.setTo(0); + } + + calcLocalOpticalFlow(prevImage, nextImage, prevPyramid, currPyramid, prevPoints, nextPoints, param); + cv::Mat(1,npoints , CV_32FC2, &nextPoints[0]).copyTo(nextPtsMat); + if (forwardBackwardThreshold > 0) + { + // reuse image pyramids + calcLocalOpticalFlow(nextImage, prevImage, currPyramid, prevPyramid, nextPoints, refPoints, param); + } + for (unsigned int r = 0; r < refPoints.size(); r++) + { + Point2f diff = refPoints[r] - prevPoints[r]; + errorMat.at(r) = sqrt(diff.x * diff.x + diff.y * diff.y); + if (errorMat.at(r) <= forwardBackwardThreshold) + statusMat.at(r) = 0; + } + + } + + protected: + RLOFOpticalFlowParameter param; + float forwardBackwardThreshold; + Ptr prevPyramid[2]; + Ptr currPyramid[2]; + }; + + Ptr SparseRLOFOpticalFlow::create( + Ptr rlofParam, + float forwardBackwardThreshold) + { + Ptr algo = makePtr(); + algo->setRLOFOpticalFlowParameter(rlofParam); + algo->setForwardBackward(forwardBackwardThreshold); + return algo; + } + + void calcOpticalFlowDenseRLOF(InputArray I0, InputArray I1, InputOutputArray flow, + Ptr rlofParam , + float forewardBackwardThreshold, Size gridStep, + InterpolationType interp_type, + int epicK, float epicSigma, + bool use_post_proc, float fgsLambda, float fgsSigma) + { + Ptr algo = DenseRLOFOpticalFlow::create( + rlofParam, forewardBackwardThreshold, gridStep, interp_type, + epicK, epicSigma, use_post_proc, fgsLambda, fgsSigma); + algo->calc(I0, I1, flow); + algo->collectGarbage(); + } + + void calcOpticalFlowSparseRLOF(InputArray prevImg, InputArray 
nextImg, + InputArray prevPts, InputOutputArray nextPts, + OutputArray status, OutputArray err, + Ptr rlofParam, + float forewardBackwardThreshold) + { + Ptr algo = SparseRLOFOpticalFlow::create( + rlofParam, forewardBackwardThreshold); + algo->calc(prevImg, nextImg, prevPts, nextPts, status, err); + } + Ptr createOptFlow_DenseRLOF() + { + return DenseRLOFOpticalFlow::create(); + } + + Ptr createOptFlow_SparseRLOF() + { + return SparseRLOFOpticalFlow::create(); + } +} } \ No newline at end of file diff --git a/modules/optflow/test/test_OF_accuracy.cpp b/modules/optflow/test/test_OF_accuracy.cpp index d2aedf2573a..b517896e045 100644 --- a/modules/optflow/test/test_OF_accuracy.cpp +++ b/modules/optflow/test/test_OF_accuracy.cpp @@ -85,17 +85,17 @@ static float calcRMSE(Mat flow1, Mat flow2) } static float calcRMSE(vector prevPts, vector currPts, Mat flow) { - vector ee; - for (unsigned int n = 0; n < prevPts.size(); n++) - { - Point2f gtFlow = flow.at(prevPts[n]); - if (isFlowCorrect(gtFlow.x) && isFlowCorrect(gtFlow.y)) - { - Point2f diffFlow = (currPts[n] - prevPts[n]) - gtFlow; - ee.push_back(sqrt(diffFlow.x * diffFlow.x + diffFlow.y * diffFlow.y)); - } - } - return mean(ee).val[0]; + vector ee; + for (unsigned int n = 0; n < prevPts.size(); n++) + { + Point2f gtFlow = flow.at(prevPts[n]); + if (isFlowCorrect(gtFlow.x) && isFlowCorrect(gtFlow.y)) + { + Point2f diffFlow = (currPts[n] - prevPts[n]) - gtFlow; + ee.push_back(sqrt(diffFlow.x * diffFlow.x + diffFlow.y * diffFlow.y)); + } + } + return mean(ee).val[0]; } static float calcAvgEPE(vector< pair > corr, Mat flow) { @@ -127,10 +127,10 @@ bool readRubberWhale(Mat &dst_frame_1, Mat &dst_frame_2, Mat &dst_GT) string frame1_path = getRubberWhaleFrame1(); string frame2_path = getRubberWhaleFrame2(); string gt_flow_path = getRubberWhaleGroundTruth(); - // removing space may be an issue on windows machines - frame1_path.erase(std::remove_if(frame1_path.begin(), frame1_path.end(), isspace), frame1_path.end()); - 
frame2_path.erase(std::remove_if(frame2_path.begin(), frame2_path.end(), isspace), frame2_path.end()); - gt_flow_path.erase(std::remove_if(gt_flow_path.begin(), gt_flow_path.end(), isspace), gt_flow_path.end()); + // removing space may be an issue on windows machines + frame1_path.erase(std::remove_if(frame1_path.begin(), frame1_path.end(), isspace), frame1_path.end()); + frame2_path.erase(std::remove_if(frame2_path.begin(), frame2_path.end(), isspace), frame2_path.end()); + gt_flow_path.erase(std::remove_if(gt_flow_path.begin(), gt_flow_path.end(), isspace), gt_flow_path.end()); dst_frame_1 = imread(frame1_path); dst_frame_2 = imread(frame2_path); @@ -177,103 +177,103 @@ TEST(DenseOpticalFlow_DeepFlow, ReferenceAccuracy) TEST(SparseOpticalFlow, ReferenceAccuracy) { - // with the following test each invoker class should be tested once - Mat frame1, frame2, GT; - ASSERT_TRUE(readRubberWhale(frame1, frame2, GT)); - vector prevPts, currPts; - - for (int r = 0; r < frame1.rows; r+=10) - { - for (int c = 0; c < frame1.cols; c+=10) - { - prevPts.push_back(Point2f(static_cast(c), static_cast(r))); - } - } - vector status(prevPts.size()); - vector err(prevPts.size()); - Ptr algo = SparseRLOFOpticalFlow::create(); - algo->setForwardBackward(0.0f); - - RLOFOpticalFlowParameter param; - param.supportRegionType = RLOFOpticalFlowParameter::SR_CROSS; - param.useIlluminationModel = true; - - param.solverType = RLOFOpticalFlowParameter::ST_BILINEAR; - algo->setRLOFOpticalFlowParameter(param); - algo->calc(frame1, frame2, prevPts, currPts, status, err); - EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.22f); - - param.solverType = RLOFOpticalFlowParameter::ST_STANDART; - algo->setRLOFOpticalFlowParameter(param); - algo->calc(frame1, frame2, prevPts, currPts, status, err); - EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.34f); - - param.useIlluminationModel = false; - - param.solverType = RLOFOpticalFlowParameter::ST_BILINEAR; - algo->setRLOFOpticalFlowParameter(param); - 
algo->calc(frame1, frame2, prevPts, currPts, status, err); - EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.24f); - - param.solverType = RLOFOpticalFlowParameter::ST_STANDART; - algo->setRLOFOpticalFlowParameter(param); - algo->calc(frame1, frame2, prevPts, currPts, status, err); - EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.23f); - - param.normSigma0 = numeric_limits::max(); - param.normSigma1 = numeric_limits::max(); - param.useIlluminationModel = true; - - param.solverType = RLOFOpticalFlowParameter::ST_BILINEAR; - algo->setRLOFOpticalFlowParameter(param); - algo->calc(frame1, frame2, prevPts, currPts, status, err); - EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.26f); - - param.solverType = RLOFOpticalFlowParameter::ST_STANDART; - algo->setRLOFOpticalFlowParameter(param); - algo->calc(frame1, frame2, prevPts, currPts, status, err); - EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.26f); - - param.useIlluminationModel = false; - - param.solverType = RLOFOpticalFlowParameter::ST_BILINEAR; - algo->setRLOFOpticalFlowParameter(param); - algo->calc(frame1, frame2, prevPts, currPts, status, err); - EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.25f); - - param.solverType = RLOFOpticalFlowParameter::ST_STANDART; - algo->setRLOFOpticalFlowParameter(param); - algo->calc(frame1, frame2, prevPts, currPts, status, err); - EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.26f); + // with the following test each invoker class should be tested once + Mat frame1, frame2, GT; + ASSERT_TRUE(readRubberWhale(frame1, frame2, GT)); + vector prevPts, currPts; + + for (int r = 0; r < frame1.rows; r+=10) + { + for (int c = 0; c < frame1.cols; c+=10) + { + prevPts.push_back(Point2f(static_cast(c), static_cast(r))); + } + } + vector status(prevPts.size()); + vector err(prevPts.size()); + Ptr algo = SparseRLOFOpticalFlow::create(); + algo->setForwardBackward(0.0f); + + Ptr param; + param->supportRegionType = SR_CROSS; + param->useIlluminationModel = true; + + param->solverType = ST_BILINEAR; + 
algo->setRLOFOpticalFlowParameter(param); + algo->calc(frame1, frame2, prevPts, currPts, status, err); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.22f); + + param->solverType = ST_STANDART; + algo->setRLOFOpticalFlowParameter(param); + algo->calc(frame1, frame2, prevPts, currPts, status, err); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.34f); + + param->useIlluminationModel = false; + + param->solverType = ST_BILINEAR; + algo->setRLOFOpticalFlowParameter(param); + algo->calc(frame1, frame2, prevPts, currPts, status, err); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.24f); + + param->solverType = ST_STANDART; + algo->setRLOFOpticalFlowParameter(param); + algo->calc(frame1, frame2, prevPts, currPts, status, err); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.23f); + + param->normSigma0 = numeric_limits::max(); + param->normSigma1 = numeric_limits::max(); + param->useIlluminationModel = true; + + param->solverType = ST_BILINEAR; + algo->setRLOFOpticalFlowParameter(param); + algo->calc(frame1, frame2, prevPts, currPts, status, err); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.26f); + + param->solverType = ST_STANDART; + algo->setRLOFOpticalFlowParameter(param); + algo->calc(frame1, frame2, prevPts, currPts, status, err); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.26f); + + param->useIlluminationModel = false; + + param->solverType = ST_BILINEAR; + algo->setRLOFOpticalFlowParameter(param); + algo->calc(frame1, frame2, prevPts, currPts, status, err); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.25f); + + param->solverType = ST_STANDART; + algo->setRLOFOpticalFlowParameter(param); + algo->calc(frame1, frame2, prevPts, currPts, status, err); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.26f); } TEST(DenseOpticalFlow_RLOF, ReferenceAccuracy) { - Mat frame1, frame2, GT; - ASSERT_TRUE(readRubberWhale(frame1, frame2, GT)); - - Mat flow; - Ptr algo = DenseRLOFOpticalFlow::create(); - RLOFOpticalFlowParameter param; - param.supportRegionType = 
RLOFOpticalFlowParameter::SR_CROSS; - param.solverType = RLOFOpticalFlowParameter::ST_BILINEAR; - algo->setRLOFOpticalFlowParameter(param); - algo->setForwardBackward(1.0f); - algo->setGridStep(cv::Size(4, 4)); - - algo->setInterpolation(DenseRLOFOpticalFlow::INTERP_EPIC); - algo->calc(frame1, frame2, flow); - - ASSERT_EQ(GT.rows, flow.rows); - ASSERT_EQ(GT.cols, flow.cols); - EXPECT_LE(calcRMSE(GT, flow), 0.36f); - - algo->setInterpolation(DenseRLOFOpticalFlow::INTERP_GEO); - algo->calc(frame1, frame2, flow); - - ASSERT_EQ(GT.rows, flow.rows); - ASSERT_EQ(GT.cols, flow.cols); - EXPECT_LE(calcRMSE(GT, flow), 0.49f); + Mat frame1, frame2, GT; + ASSERT_TRUE(readRubberWhale(frame1, frame2, GT)); + + Mat flow; + Ptr algo = DenseRLOFOpticalFlow::create(); + Ptr param; + param->supportRegionType = SR_CROSS; + param->solverType = ST_BILINEAR; + algo->setRLOFOpticalFlowParameter(param); + algo->setForwardBackward(1.0f); + algo->setGridStep(cv::Size(4, 4)); + + algo->setInterpolation(INTERP_EPIC); + algo->calc(frame1, frame2, flow); + + ASSERT_EQ(GT.rows, flow.rows); + ASSERT_EQ(GT.cols, flow.cols); + EXPECT_LE(calcRMSE(GT, flow), 0.36f); + + algo->setInterpolation(INTERP_GEO); + algo->calc(frame1, frame2, flow); + + ASSERT_EQ(GT.rows, flow.rows); + ASSERT_EQ(GT.cols, flow.cols); + EXPECT_LE(calcRMSE(GT, flow), 0.49f); } From 9b11b49ac5de723dd7812dbd7a7550f07e21a516 Mon Sep 17 00:00:00 2001 From: senst Date: Tue, 11 Dec 2018 01:55:37 +0100 Subject: [PATCH 03/25] introducing precompiler flag RLOD_SSE --- modules/optflow/doc/optflow.bib | 16 +- .../include/opencv2/optflow/rlofflow.hpp | 147 +++++++++--------- modules/optflow/src/rlof/berlof_invoker.hpp | 28 ++-- modules/optflow/src/rlof/plk_invoker.hpp | 29 ++-- modules/optflow/src/rlof/rlof_invoker.hpp | 57 ++++--- modules/optflow/src/rlof/rlof_invokerbase.hpp | 7 + modules/optflow/src/rlof/rlof_localflow.cpp | 4 +- modules/optflow/src/rlof/rlof_localflow.h | 5 +- modules/optflow/src/rlofflow.cpp | 32 ++-- 9 files changed, 
168 insertions(+), 157 deletions(-) diff --git a/modules/optflow/doc/optflow.bib b/modules/optflow/doc/optflow.bib index 746cecb6732..5d4a47c30a9 100644 --- a/modules/optflow/doc/optflow.bib +++ b/modules/optflow/doc/optflow.bib @@ -94,15 +94,15 @@ @inproceedings{Senst2013 pages = {2499--2503}, } -@article{Senst2012, - author = {Tobias Senst and Volker Eiselein and Thomas Sikora}, - title = {Robust Local Optical Flow for Feature Tracking}, +@article{Senst2012, + author = {Tobias Senst and Volker Eiselein and Thomas Sikora}, + title = {Robust Local Optical Flow for Feature Tracking}, journal = {IEEE Transactions on Circuits and Systems for Video Technology}, - year = {2012}, - pages = {1377--1387}, - volume = {22}, - number = {9}, -} + year = {2012}, + pages = {1377--1387}, + volume = {22}, + number = {9}, +} @article{Tibshirani2008, title={Fast computation of the median by successive binning}, diff --git a/modules/optflow/include/opencv2/optflow/rlofflow.hpp b/modules/optflow/include/opencv2/optflow/rlofflow.hpp index d1a3d6f53ad..f088562215e 100644 --- a/modules/optflow/include/opencv2/optflow/rlofflow.hpp +++ b/modules/optflow/include/opencv2/optflow/rlofflow.hpp @@ -14,29 +14,29 @@ namespace optflow //! @addtogroup optflow //! @{ - /** @brief This is used store and set up the parameters of the robust local optical flow (RLOF) algoritm. + /** @brief This is used store and set up the parameters of the robust local optical flow (RLOF) algoritm. * - * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as - * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). - * This RLOF implementation can be seen as an improved pyramidal iterative Lucas-Kanade and includes + * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as + * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). 
+ * This RLOF implementation can be seen as an improved pyramidal iterative Lucas-Kanade and includes * a set of improving modules. The main improvements in respect to the pyramidal iterative Lucas-Kanade * are: - * - A more robust redecending M-estimator framework (see @cite Senst2012) to improve the accuracy at + * - A more robust redecending M-estimator framework (see @cite Senst2012) to improve the accuracy at * motion boundaries and appearing and disappearing pixels. - * - an adaptive support region strategies to improve the accuracy at motion boundaries to reduce the - * corona effect, i.e oversmoothing of the PLK at motion/object boundaries. The cross-based segementation - * strategy (SR_CROSS) proposed in @cite Senst2014 uses a simple segmenation approach to obtain the optimal + * - an adaptive support region strategies to improve the accuracy at motion boundaries to reduce the + * corona effect, i.e oversmoothing of the PLK at motion/object boundaries. The cross-based segementation + * strategy (SR_CROSS) proposed in @cite Senst2014 uses a simple segmenation approach to obtain the optimal * shape of the support region. - * - To deal with illumination changes (outdoor sequences and shadow) the intensity constancy assumption - * based optical flow equation has been adopt with the Gennert and Negahdaripour illumination model + * - To deal with illumination changes (outdoor sequences and shadow) the intensity constancy assumption + * based optical flow equation has been adopt with the Gennert and Negahdaripour illumination model * (see @cite Senst2016). This model can be switched on/off with the useIlluminationModel variable. * - By using a global motion prior initialization (see @cite Senst2016) of the iterative refinement * the accuracy could be significantly improved for large displacements. This initialization can be * switched on and of with useGlobalMotionPrior variable. 
* - * The RLOF can be computed with the SparseOpticalFlow class or function interface to track a set of features - * or with the DenseOpticalFlow class or function interface to compute dense optical flow. - * + * The RLOF can be computed with the SparseOpticalFlow class or function interface to track a set of features + * or with the DenseOpticalFlow class or function interface to compute dense optical flow. + * * @see optflow::DenseRLOFOpticalFlow, optflow::calcOpticalFlowDenseRLOF(), optflow::SparseRLOFOpticalFlow, optflow::calcOpticalFlowSparseRLOF() */ enum SupportRegionType { @@ -54,7 +54,7 @@ namespace optflow enum InterpolationType { - INTERP_GEO = 0, /**< Fast geodesic interpolation, see @cite Geister2016 */ + INTERP_GEO = 0, /**< Fast geodesic interpolation, see @cite Geistert2016 */ INTERP_EPIC = 1, /**< Edge-preserving interpolation, see @cite Revaud2015,Geistert2016. */ }; @@ -86,14 +86,14 @@ namespace optflow */ float normSigma0; - /**< \$ \sigma_0 \$ paramter of the shrinked Hampel norm introduced in @cite Senst2012. If - * \$ \sigma_0 \$ = std::numeric_limist::max() the least-square estimator will be used + /**< &sigma paramter of the shrinked Hampel norm introduced in @cite Senst2012. If + * &sigma = std::numeric_limist::max() the least-square estimator will be used * instead of the M-estimator. Althoug M-estimator is more robust against outlier in the support * region the least-square can be fast in computation. */ float normSigma1; - /**< \$ \sigma_1 \$ paramter of the shrinked Hampel norm introduced in @cite Senst2012. If - * \$ \sigma_1 \$ = std::numeric_limist::max() the least-square estimator will be used + /**< &sigma paramter of the shrinked Hampel norm introduced in @cite Senst2012. If + * &sigma = std::numeric_limist::max() the least-square estimator will be used * instead of the M-estimator. Althoug M-estimator is more robust against outlier in the support * region the least-square can be fast in computation. 
*/ @@ -122,7 +122,7 @@ namespace optflow bool useIlluminationModel; /**< Use the Gennert and Negahdaripour illumination model instead of the intensity brigthness * constraint. (Probosed in @cite Senst2016 ) This model is defined as follow: - * \f[ I(\mathbf{x},t) + m \cdot I(\mathbf{x},t) + c = I(\mathbf{x},t+1) \f[ + * \f[ I(\mathbf{x},t) + m \cdot I(\mathbf{x},t) + c = I(\mathbf{x},t+1) \f] * and contains with m and c a multiplicative and additive term which makes the estimate * more robust against illumination changes. The computational complexity is increased by * enabling the illumination model. @@ -148,15 +148,15 @@ namespace optflow */ }; - /** @brief Fast dense optical flow computation based on robust local optical flow (RLOF) algorithms and sparse-to-dense interpolation + /** @brief Fast dense optical flow computation based on robust local optical flow (RLOF) algorithms and sparse-to-dense interpolation * scheme. * - * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as - * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). + * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as + * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). * * The sparse-to-dense interpolation scheme allows for fast computation of dense optical flow using RLOF (see @cite Geistert2016). * For this scheme the following steps are applied: - * -# motion vector seeded at a regular sampled grid are computed. The sparsity of this grid can be configured with setGridStep + * -# motion vector seeded at a regular sampled grid are computed. The sparsity of this grid can be configured with setGridStep * -# (optinally) errornous motion vectors are filter based on the forward backward confidence. The threshold can be configured * with setForwardBackward. 
The filter is only applied if the threshold >0 but than the runtime is doubled due to the estimation * of the backward flow. @@ -165,7 +165,7 @@ namespace optflow * For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details. * Parameters have been described in @cite Senst2012 @cite Senst2013 @cite Senst2014 and @cite Senst2016. * - * @note If the grid size is set to (1,1) and the forward backward threshold <= 0 that the dense optical flow field is purely. + * @note SIMD parallelization is available compiled with SSE4.1. If the grid size is set to (1,1) and the forward backward threshold <= 0 that the dense optical flow field is purely. * computed with the RLOF. * * @see optflow::calcOpticalFlowDenseRLOF(), optflow::RLOFOpticalFlowParameter @@ -174,43 +174,43 @@ namespace optflow { public: //! @brief Configuration of the RLOF alogrithm. - /** - @see optflow::RLOFOpticalFlowParameter, getRLOFOpticalFlowParameter + /** + @see optflow::RLOFOpticalFlowParameter, getRLOFOpticalFlowParameter */ CV_WRAP virtual void setRLOFOpticalFlowParameter(Ptr val) = 0; - /** @copybrief setRLOFOpticalFlowParameter - @see optflow::RLOFOpticalFlowParameter, setRLOFOpticalFlowParameter + /** @copybrief setRLOFOpticalFlowParameter + @see optflow::RLOFOpticalFlowParameter, setRLOFOpticalFlowParameter */ CV_WRAP virtual Ptr getRLOFOpticalFlowParameter() const = 0; //! @brief Threshold for the forward backward confidence check - /**For each grid point \f$ \mathbf{x} \f$ a motion vector \f$ d_{I0,I1}(\mathbf{x}) \f$ is computed. + /**For each grid point \f$ \mathbf{x} \f$ a motion vector \f$ d_{I0,I1}(\mathbf{x}) \f$ is computed. * If the forward backward error \f[ EP_{FB} = || d_{I0,I1} + d_{I1,I0} || \f] - * is larger than threshold given by this function then the motion vector will not be used by the following + * is larger than threshold given by this function then the motion vector will not be used by the following * vector field interpolation. 
\f$ d_{I1,I0} \f$ denotes the backward flow. Note, the forward backward test * will only be applied if the threshold > 0. This may results into a doubled runtime for the motion estimation. * @see getForwardBackward, setGridStep */ CV_WRAP virtual void setForwardBackward(float val) = 0; /** @copybrief setForwardBackward - @see setForwardBackward + @see setForwardBackward */ CV_WRAP virtual float getForwardBackward() const = 0; //! @brief Size of the grid to spawn the motion vectors. - /** For each grid point a motion vector is computed. Some motion vectors will be removed due to the forwatd backward + /** For each grid point a motion vector is computed. Some motion vectors will be removed due to the forwatd backward * threshold (if set >0). The rest will be the base of the vector field interpolation. * @see getForwardBackward, setGridStep */ CV_WRAP virtual Size getGridStep() const = 0; /** @copybrief getGridStep - * @see getGridStep + * @see getGridStep */ CV_WRAP virtual void setGridStep(Size val) = 0; //! @brief Interpolation used to compute the dense optical flow. /** Two interpolation algorithms have been supported - * - **INTERP_GEO** applies the fast geodesic interpolation, see @cite Geister2016. + * - **INTERP_GEO** applies the fast geodesic interpolation, see @cite Geistert2016. * - **INTERP_EPIC_RESIDUAL** applies the edge-preserving interpolation, see @cite Revaud2015,Geistert2016. - * @see ximgproc::EdgeAwareInterpolator, getInterpolation + * @see ximgproc::EdgeAwareInterpolator, getInterpolation */ CV_WRAP virtual void setInterpolation(InterpolationType val) = 0; /** @copybrief setInterpolation @@ -224,7 +224,7 @@ namespace optflow */ CV_WRAP virtual int getEPICK() const = 0; /** @copybrief getEPICK - * @see ximgproc::EdgeAwareInterpolator, getEPICK + * @see ximgproc::EdgeAwareInterpolator, getEPICK */ CV_WRAP virtual void setEPICK(int val) = 0; //! @brief see ximgproc::EdgeAwareInterpolator() sigma value. 
@@ -235,7 +235,7 @@ namespace optflow */ CV_WRAP virtual float getEPICSigma() const = 0; /** @copybrief getEPICSigma - * @see ximgproc::EdgeAwareInterpolator, getEPICSigma + * @see ximgproc::EdgeAwareInterpolator, getEPICSigma */ CV_WRAP virtual void setEPICSigma(float val) = 0; //! @brief see ximgproc::EdgeAwareInterpolator() lambda value. @@ -245,7 +245,7 @@ namespace optflow */ CV_WRAP virtual float getEPICLambda() const = 0; /** @copybrief getEPICLambda - * @see ximgproc::EdgeAwareInterpolator, getEPICLambda + * @see ximgproc::EdgeAwareInterpolator, getEPICLambda */ CV_WRAP virtual void setEPICLambda(float val) = 0; //! @brief see ximgproc::EdgeAwareInterpolator(). @@ -254,16 +254,16 @@ namespace optflow */ CV_WRAP virtual float getFgsLambda() const = 0; /** @copybrief getFgsLambda - * @see ximgproc::EdgeAwareInterpolator, ximgproc::fastGlobalSmootherFilter, getFgsLambda + * @see ximgproc::EdgeAwareInterpolator, ximgproc::fastGlobalSmootherFilter, getFgsLambda */ CV_WRAP virtual void setFgsLambda(float val) = 0; //! @brief see ximgproc::EdgeAwareInterpolator(). /** Sets the respective fastGlobalSmootherFilter() parameter. - * @see ximgproc::EdgeAwareInterpolator, ximgproc::fastGlobalSmootherFilter, setFgsSigma + * @see ximgproc::EdgeAwareInterpolator, ximgproc::fastGlobalSmootherFilter, setFgsSigma */ CV_WRAP virtual float getFgsSigma() const = 0; /** @copybrief getFgsSigma - * @see ximgproc::EdgeAwareInterpolator, ximgproc::fastGlobalSmootherFilter, getFgsSigma + * @see ximgproc::EdgeAwareInterpolator, ximgproc::fastGlobalSmootherFilter, getFgsSigma */ CV_WRAP virtual void setFgsSigma(float val) = 0; //! @brief enables ximgproc::fastGlobalSmootherFilter @@ -275,7 +275,7 @@ namespace optflow * @see ximgproc::fastGlobalSmootherFilter, setUsePostProc */ CV_WRAP virtual bool getUsePostProc() const = 0; - //! @brief Creates instance of optflow::DenseRLOFOpticalFlow + //! 
@brief Creates instance of optflow::DenseRLOFOpticalFlow /** * @param rlofParam see optflow::RLOFOpticalFlowParameter * @param forwardBackwardThreshold see setForwardBackward @@ -283,6 +283,7 @@ namespace optflow * @param epicK see setEPICK * @param epicSigma see setEPICSigma * @param epicLambda see setEPICLambda + * @param use_post_proc see setUsePostProc * @param fgsLambda see setFgsLambda * @param fgsSigma see setFgsSigma */ @@ -294,19 +295,21 @@ namespace optflow int epicK = 128, float epicSigma = 0.05f, float epicLambda = 999.0f, + bool use_post_proc = true, float fgsLambda = 500.0f, float fgsSigma = 1.5f); }; /** @brief Class used for calculation sparse optical flow and feature tracking with robust local optical flow (RLOF) algorithms. * - * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as - * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). + * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as + * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). * - * For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details. + * For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details. * Parameters have been described in @cite Senst2012, @cite Senst2013, @cite Senst2014 and @cite Senst2016. - @sa optflow::calcOpticalFlowSparseRLOF(), optflow::RLOFOpticalFlowParameter - + * + * @note SIMD parallelization is available compiled with SSE4.1. + *@see optflow::calcOpticalFlowSparseRLOF(), optflow::RLOFOpticalFlowParameter */ class CV_EXPORTS_W SparseRLOFOpticalFlow : public SparseOpticalFlow { @@ -331,8 +334,8 @@ namespace optflow * @see setForwardBackward */ CV_WRAP virtual float getForwardBackward() const = 0; - - //! @brief Creates instance of SparseRLOFOpticalFlow + + //! 
@brief Creates instance of SparseRLOFOpticalFlow /** * @param rlofParam see setRLOFOpticalFlowParameter * @param forwardBackwardThreshold see setForwardBackward @@ -345,8 +348,8 @@ namespace optflow /** @brief Fast dense optical flow computation based on robust local optical flow (RLOF) algorithms and sparse-to-dense interpolation scheme. * - * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as - * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). + * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as + * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). * * The sparse-to-dense interpolation scheme allows for fast computation of dense optical flow using RLOF (see @cite Geistert2016). * For this scheme the following steps are applied: @@ -356,20 +359,20 @@ namespace optflow * of the backward flow. * -# Vector field interpolation is applied to the motion vector set to obtain a dense vector field. * - * @param I0 first 8-bit input image. If The cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType - * = SupportRegionType::SR_CROSS) image has to be a 8-bit 3 channel image. - * @param I1 second 8-bit input image. If The cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType + * @param I0 first 8-bit input image. If The cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType + * = SupportRegionType::SR_CROSS) image has to be a 8-bit 3 channel image. + * @param I1 second 8-bit input image. If The cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType * = SupportRegionType::SR_CROSS) image has to be a 8-bit 3 channel image. * @param flow computed flow image that has the same size as I0 and type CV_32FC2. 
* @param rlofParam see optflow::RLOFOpticalFlowParameter - * @param forwardBackwardThreshold Threshold for the forward backward confidence check. - * For each grid point \f$ \mathbf{x} \f$ a motion vector \f$ d_{I0,I1}(\mathbf{x}) \f$ is computed. + * @param forewardBackwardThreshold Threshold for the forward backward confidence check. + * For each grid point \f$ \mathbf{x} \f$ a motion vector \f$ d_{I0,I1}(\mathbf{x}) \f$ is computed. * If the forward backward error \f[ EP_{FB} = || d_{I0,I1} + d_{I1,I0} || \f] - * is larger than threshold given by this function then the motion vector will not be used by the following + * is larger than threshold given by this function then the motion vector will not be used by the following * vector field interpolation. \f$ d_{I1,I0} \f$ denotes the backward flow. Note, the forward backward test * will only be applied if the threshold > 0. This may results into a doubled runtime for the motion estimation. - * @param gridStep Size of the grid to spawn the motion vectors. For each grid point a motion vector is computed. - * Some motion vectors will be removed due to the forwatd backward threshold (if set >0). The rest will be the + * @param gridStep Size of the grid to spawn the motion vectors. For each grid point a motion vector is computed. + * Some motion vectors will be removed due to the forwatd backward threshold (if set >0). The rest will be the * base of the vector field interpolation. * @param epicK see ximgproc::EdgeAwareInterpolator() sets the respective parameter. * @param epicSigma see ximgproc::EdgeAwareInterpolator() sets the respective parameter. @@ -383,35 +386,39 @@ namespace optflow * @note If the grid size is set to (1,1) and the forward backward threshold <= 0 that the dense optical flow field is purely * computed with the RLOF. * + * @note SIMD parallelization is available compiled with SSE4.1. 
+ * * @sa optflow::DenseRLOFOpticalFlow, optflow::RLOFOpticalFlowParameter */ CV_EXPORTS_W void calcOpticalFlowDenseRLOF(InputArray I0, InputArray I1, InputOutputArray flow, Ptr rlofParam = Ptr(), float forewardBackwardThreshold = 0, Size gridStep = Size(6, 6), InterpolationType interp_type = InterpolationType::INTERP_EPIC, - int epicK = 128, float epicSigma = 0.05f, + int epicK = 128, float epicSigma = 0.05f, float epicLambda = 100.f, bool use_post_proc = true, float fgsLambda = 500.0f, float fgsSigma = 1.5f); - /** @brief Calculates fast optical flow for a sparse feature set using the robust local optical flow (RLOF) similar + /** @brief Calculates fast optical flow for a sparse feature set using the robust local optical flow (RLOF) similar * to optflow::calcOpticalFlowPyrLK(). * * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). * - * @param I0 first 8-bit input image. If The cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType + * @param prevImg first 8-bit input image. If The cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType * = SupportRegionType::SR_CROSS) image has to be a 8-bit 3 channel image. - * @param I1 second 8-bit input image. If The cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType + * @param nextImg second 8-bit input image. If The cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType * = SupportRegionType::SR_CROSS) image has to be a 8-bit 3 channel image. - * @param prevPts vector of 2D points for which the flow needs to be found; point coordinates must be single-precision + * @param prevPts vector of 2D points for which the flow needs to be found; point coordinates must be single-precision * floating-point numbers. 
- * @param nextPts output vector of 2D points (with single-precision floating-point coordinates) containing the calculated - * new positions of input features in the second image; when optflow::RLOFOpticalFlowParameter::useInitialFlow variable is true the vector must + * @param nextPts output vector of 2D points (with single-precision floating-point coordinates) containing the calculated + * new positions of input features in the second image; when optflow::RLOFOpticalFlowParameter::useInitialFlow variable is true the vector must * have the same size as in the input and contain the initialization point correspondences. - * @param status output status vector (of unsigned chars); each element of the vector is set to 1 if the flow for the - * corresponding features has passed the forward backward check. - * @param err output vector of errors; each element of the vector is set to the forward backward error for the corresponding feature. + * @param status output status vector (of unsigned chars); each element of the vector is set to 1 if the flow for the + * corresponding features has passed the forward backward check. + * @param err output vector of errors; each element of the vector is set to the forward backward error for the corresponding feature. * @param rlofParam see optflow::RLOFOpticalFlowParameter - * @param forwardBackwardThreshold Threshold for the forward backward confidence check. If forewardBackwardThreshold <=0 the forward + * @param forewardBackwardThreshold Threshold for the forward backward confidence check. If forewardBackwardThreshold <=0 the forward + * + * @note SIMD parallelization is available compiled with SSE4.1. * * Parameters have been described in @cite Senst2012, @cite Senst2013, @cite Senst2014 and @cite Senst2016. * For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details. 
diff --git a/modules/optflow/src/rlof/berlof_invoker.hpp b/modules/optflow/src/rlof/berlof_invoker.hpp index 437e807d810..93ccb94fd89 100644 --- a/modules/optflow/src/rlof/berlof_invoker.hpp +++ b/modules/optflow/src/rlof/berlof_invoker.hpp @@ -708,7 +708,7 @@ namespace berlof float dI = 0; // I^2 float D = 0; - #ifdef CV_SSE4_1 + #ifdef RLOF_SSE __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); @@ -734,7 +734,7 @@ namespace berlof - #ifdef CV_SSE4_1 + #ifdef RLOF_SSE for( ; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 ) { __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16); @@ -906,7 +906,7 @@ namespace berlof fParam0 = param[0] * MEstimatorScale; fParam1 = param[1] * MEstimatorScale; -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); @@ -952,7 +952,7 @@ namespace berlof const short* dIptr = (const short*)(derivIWinBuf.data + y*derivIWinBuf.step); x = 0; -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE for( ; x <= _winSize.width*cn; x += 8, dIptr += 8*2 ) { __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16); @@ -1269,7 +1269,7 @@ namespace berlof #endif } -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE short etaValues[8]; _mm_storeu_si128((__m128i*)(etaValues), mmEta); MEstimatorScale += eta * (etaValues[0] + etaValues[1] + etaValues[2] + etaValues[3] @@ -1279,7 +1279,7 @@ namespace berlof float CV_DECL_ALIGNED(32) wbuf[4];// if( j == 0 ) { -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE _mm_store_ps(wbuf, mmSumW1); w1 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; _mm_store_ps(wbuf, mmSumW2); @@ -1304,7 +1304,7 @@ namespace berlof dI *= FLT_SCALE; -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];// _mm_store_ps(A11buf, mmAxx); @@ -1362,7 +1362,7 @@ namespace berlof } - #ifdef CV_SSE4_1 
+ #ifdef RLOF_SSE float CV_DECL_ALIGNED(16) bbuf[4]; for(int mmi = 0; mmi < 4; mmi++) { @@ -1658,7 +1658,7 @@ namespace beplk CV_Assert( step == (int)(J.step/J.elemSize1()) ); float A11 = 0, A12 = 0, A22 = 0; - #ifdef CV_SSE4_1 + #ifdef RLOF_SSE __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); __m128i z = _mm_setzero_si128(); @@ -1682,7 +1682,7 @@ namespace beplk x = 0; - #ifdef CV_SSE4_1 + #ifdef RLOF_SSE for( ; x < winSize.width*cn; x += 4, dsrc += 4*2, dIptr += 4*2 ) { __m128i wMask = _mm_set_epi32(MaskSet * maskPtr[x+3], @@ -1763,7 +1763,7 @@ namespace beplk } - #ifdef CV_SSE4_1 + #ifdef RLOF_SSE float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4]; _mm_store_ps(A11buf, qA11); _mm_store_ps(A12buf, qA12); @@ -1834,7 +1834,7 @@ namespace beplk float _b1[4] = {0,0,0,0}; float _b2[4] = {0,0,0,0}; - #ifdef CV_SSE4_1 + #ifdef RLOF_SSE __m128 qbc0[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; __m128 qbc1[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); @@ -1846,7 +1846,7 @@ namespace beplk const short* Iptr = (const short*)(IWinBuf.data + ( y)*IWinBuf.step); const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); x = 0; - #ifdef CV_SSE4_1 + #ifdef RLOF_SSE const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; for( ; x <= winSize.width*cn; x += 8, dIptr += 8*2 ) { @@ -1982,7 +1982,7 @@ namespace beplk } - #ifdef CV_SSE4_1 + #ifdef RLOF_SSE float CV_DECL_ALIGNED(16) bbuf[4]; _mm_store_ps(bbuf, _mm_add_ps(qbc0[0], qbc1[0])); _b1[0] += bbuf[0] + bbuf[2]; diff --git a/modules/optflow/src/rlof/plk_invoker.hpp b/modules/optflow/src/rlof/plk_invoker.hpp index 2637fa7fbe7..072cd3390b0 100644 --- a/modules/optflow/src/rlof/plk_invoker.hpp +++ b/modules/optflow/src/rlof/plk_invoker.hpp @@ -161,7 +161,8 @@ class TrackerInvoker : public cv::ParallelLoopBody float w1 = 0, w2 = 0; // 
-IyI float dI = 0; // I^2 float D = 0; - #ifdef CV_SSE4_1 + #ifdef RLOF_SSE + __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); __m128i z = _mm_setzero_si128(); @@ -183,7 +184,7 @@ class TrackerInvoker : public cv::ParallelLoopBody short* dIptr = (short*)(derivIWinBuf.data + y*derivIWinBuf.step); x = 0; -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE for( ; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 ) { __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16); @@ -296,7 +297,7 @@ class TrackerInvoker : public cv::ParallelLoopBody A12 = 0; A22 = 0; } -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); __m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps(), qb2 = _mm_setzero_ps(); @@ -306,7 +307,7 @@ class TrackerInvoker : public cv::ParallelLoopBody __m128 mmSumIy = _mm_setzero_ps(), mmSumIx = _mm_setzero_ps(); __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps(); float gainVal = gainVec.x > 0 ? gainVec.x : -gainVec.x; - int bitShift = gainVec.x == 0 ? 1 : static_cast(ceil(log(200.f / gainVal) / log(2.f))); // zwei stellen nach den komma + int bitShift = gainVec.x == 0 ? 
1 : static_cast(ceil(log(200.f / gainVal) / log(2.f))); __m128i mmGainValue_epi16 = _mm_set1_epi16(static_cast(gainVec.x * (float)(1 << bitShift))); __m128i mmConstValue_epi16 = _mm_set1_epi16(static_cast(gainVec.y)); #endif @@ -318,7 +319,7 @@ class TrackerInvoker : public cv::ParallelLoopBody const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); x = 0; -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE for( ; x <= winSize.width*cn; x += 8, dIptr += 8*2 ) { @@ -506,7 +507,7 @@ class TrackerInvoker : public cv::ParallelLoopBody float CV_DECL_ALIGNED(16) wbuf[4];// if( j == 0 ) { -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE _mm_store_ps(wbuf, mmSumW1); w1 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; _mm_store_ps(wbuf, mmSumW2); @@ -529,7 +530,7 @@ class TrackerInvoker : public cv::ParallelLoopBody w1 *= -FLT_SCALE; w2 *= -FLT_SCALE; dI *= FLT_SCALE; -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE float CV_DECL_ALIGNED(32) A11buf[4], A12buf[4], A22buf[4];// _mm_store_ps(A11buf, mmAxx); @@ -545,7 +546,7 @@ class TrackerInvoker : public cv::ParallelLoopBody A22 *= FLT_SCALE; } -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE float CV_DECL_ALIGNED(16) bbuf[4]; _mm_store_ps(bbuf, _mm_add_ps(qb0, qb1)); b1 = bbuf[0] + bbuf[2]; @@ -794,7 +795,7 @@ class TrackerInvoker : public cv::ParallelLoopBody CV_Assert( step == (int)(J.step/J.elemSize1()) ); float A11 = 0, A12 = 0, A22 = 0; -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); __m128i z = _mm_setzero_si128(); @@ -818,7 +819,7 @@ class TrackerInvoker : public cv::ParallelLoopBody x = 0; -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE for( ; x <= winSize.width*cn; x += 4, dsrc += 4*2, dIptr += 4*2 ) { __m128i wMask = _mm_set_epi32(MaskSet * maskPtr[x+3], @@ -900,7 +901,7 @@ class TrackerInvoker : public cv::ParallelLoopBody #endif } -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4]; _mm_store_ps(A11buf, qA11); _mm_store_ps(A12buf, qA12); 
@@ -956,7 +957,7 @@ class TrackerInvoker : public cv::ParallelLoopBody iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; float b1 = 0, b2 = 0; -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); __m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps(); @@ -970,7 +971,7 @@ class TrackerInvoker : public cv::ParallelLoopBody const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); x = 0; -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; for( ; x <= winSize.width*cn; x += 8, dIptr += 8*2 ) { @@ -1038,7 +1039,7 @@ class TrackerInvoker : public cv::ParallelLoopBody #endif } -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE float CV_DECL_ALIGNED(16) bbuf[4]; _mm_store_ps(bbuf, _mm_add_ps(qb0, qb1)); b1 += bbuf[0] + bbuf[2]; diff --git a/modules/optflow/src/rlof/rlof_invoker.hpp b/modules/optflow/src/rlof/rlof_invoker.hpp index fb1ad16f353..c810debbdb9 100644 --- a/modules/optflow/src/rlof/rlof_invoker.hpp +++ b/modules/optflow/src/rlof/rlof_invoker.hpp @@ -165,7 +165,7 @@ namespace ica float D = 0; float minEig; -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); __m128i z = _mm_setzero_si128(); @@ -189,7 +189,7 @@ namespace ica x = 0; -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE for (; x <= winSize.width*cn; x += 4, dsrc += 4 * 2, dIptr += 4 * 2) { @@ -326,7 +326,7 @@ namespace ica float fParam1 = param[1] * 32.f; fParam0 = param[0] * MEstimatorScale; fParam1 = param[1] * MEstimatorScale; -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); __m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps(); @@ -334,7 +334,7 @@ namespace ica __m128i mmParam0 = _mm_set1_epi16(MIN(std::numeric_limits::max() - 1, static_cast(fParam0))); __m128i mmParam1 = 
_mm_set1_epi16(MIN(std::numeric_limits::max() - 1, static_cast(fParam1))); float s2Val = param[2] > 0 ? param[2] : -param[2]; - int s2bitShift = param[2] == 0 ? 1 : static_cast(ceil(log(200.f / s2Val) / log(2.f))); // zwei stellen nach den komma + int s2bitShift = param[2] == 0 ? 1 : static_cast(ceil(log(200.f / s2Val) / log(2.f))); __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast(param[2] * (float)(1 << s2bitShift))); __m128i mmOness_epi16 = _mm_set1_epi16(1 << s2bitShift); __m128 mmParam2s = _mm_set1_ps(0.01f * param[2]); @@ -354,7 +354,7 @@ namespace ica const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); x = 0; -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE for (; x <= winSize.width*cn; x += 8, dIptr += 8 * 2) { __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16); @@ -532,7 +532,7 @@ namespace ica #endif } -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE short etaValues[8]; _mm_storeu_si128((__m128i*)(etaValues), mmEta); MEstimatorScale += eta * (etaValues[0] + etaValues[1] + etaValues[2] + etaValues[3] @@ -542,7 +542,7 @@ namespace ica #endif if (j == 0) { -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];// _mm_store_ps(A11buf, mmAxx); @@ -576,7 +576,7 @@ namespace ica } } -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE float CV_DECL_ALIGNED(16) bbuf[4]; _mm_store_ps(bbuf, _mm_add_ps(qb0, qb1)); b1 += bbuf[0] + bbuf[2]; @@ -786,7 +786,7 @@ namespace radial float dI = 0; // I^2 float D = 0; -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); __m128i z = _mm_setzero_si128(); @@ -810,7 +810,7 @@ namespace radial x = 0; -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE for (; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4 * 2, dIptr += 4 * 2) { __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16); @@ -971,7 +971,7 @@ 
namespace radial fParam0 = param[0] * MEstimatorScale; fParam1 = param[1] * MEstimatorScale; -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); @@ -986,14 +986,14 @@ namespace radial float s2Val = param[2] > 0 ? param[2] : -param[2]; - int s2bitShift = param[2] == 0 ? 1 : static_cast(ceil(log(200.f / s2Val) / log(2.f))); // zwei stellen nach den komma + int s2bitShift = param[2] == 0 ? 1 : static_cast(ceil(log(200.f / s2Val) / log(2.f))); __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast(param[2] * (float)(1 << s2bitShift))); __m128i mmOness_epi16 = _mm_set1_epi16(1 << s2bitShift); __m128 mmParam2s = _mm_set1_ps(0.01f * param[2]); __m128 mmParam2s2 = _mm_set1_ps(param[2] * param[2]); float gainVal = gainVec.x > 0 ? gainVec.x : -gainVec.x; - int bitShift = gainVec.x == 0 ? 1 : static_cast(ceil(log(200.f / gainVal) / log(2.f))); // zwei stellen nach den komma + int bitShift = gainVec.x == 0 ? 1 : static_cast(ceil(log(200.f / gainVal) / log(2.f))); __m128i mmGainValue_epi16 = _mm_set1_epi16(static_cast(gainVec.x * (float)(1 << bitShift))); __m128i mmConstValue_epi16 = _mm_set1_epi16(static_cast(gainVec.y)); @@ -1011,7 +1011,7 @@ namespace radial const short* dIptr = (const short*)(derivIWinBuf.data + y * derivIWinBuf.step); x = 0; -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE for (; x <= _winSize.width*cn; x += 8, dIptr += 8 * 2) @@ -1057,7 +1057,7 @@ namespace radial mmEta = _mm_add_epi16(mmEta, _mm_add_epi16(_mm_and_si128(scalediffIsPos_epi16, _mm_set1_epi16(2)), _mm_set1_epi16(-1))); - __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... 
+ __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); @@ -1072,7 +1072,7 @@ namespace radial // val = |It| -/+ sigma1 __m128i tmpParam1_epi16 = _mm_add_epi16(_mm_and_si128(diffIsPos_epi16, _mm_sub_epi16(mmDiff_epi16, mmParam1)), _mm_andnot_si128(diffIsPos_epi16, _mm_add_epi16(mmDiff_epi16, mmParam1))); - // It == 0 ? |It| > sigma13 + // It == 0 ? |It| > sigma13 mmDiff_epi16 = _mm_and_si128(bSet2_epi16, mmDiff_epi16); // It == val ? sigma0 < |It| < sigma1 mmDiff_epi16 = _mm_blendv_epi8(mmDiff_epi16, tmpParam1_epi16, bSet1_epi16); @@ -1167,7 +1167,7 @@ namespace radial __m128 I_tale_ps = _mm_mul_ps(I_ps, tale_0_3_ps); mmSumI = _mm_add_ps(mmSumI, I_tale_ps); - // sumW + // sumW mmSumW = _mm_add_ps(mmSumW, tale_0_3_ps); // sumDI @@ -1201,10 +1201,8 @@ namespace radial I_tale_ps = _mm_mul_ps(I_ps, tale_4_7_ps); mmSumI = _mm_add_ps(mmSumI, I_tale_ps); - // sumW mmSumW = _mm_add_ps(mmSumW, tale_4_7_ps); - // sumDI mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps(I_ps, I_tale_ps)); } @@ -1287,17 +1285,17 @@ namespace radial } -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE short etaValues[8]; _mm_storeu_si128((__m128i*)(etaValues), mmEta); MEstimatorScale += eta * (etaValues[0] + etaValues[1] + etaValues[2] + etaValues[3] + etaValues[4] + etaValues[5] + etaValues[6] + etaValues[7]); -#endif +#endif float CV_DECL_ALIGNED(32) wbuf[4];// if (j == 0) { -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE _mm_store_ps(wbuf, mmSumW1); w1 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; _mm_store_ps(wbuf, mmSumW2); @@ -1322,7 +1320,7 @@ namespace radial dI *= FLT_SCALE; -#ifdef CV_SSE4_1 +#ifdef RLOF_SSE float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];// @@ -1335,15 +1333,15 @@ namespace radial A12 = A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; A22 = A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; #endif - A11 *= FLT_SCALE; - A12 *= FLT_SCALE; - A22 *= FLT_SCALE; + A11 *= FLT_SCALE; + A12 *= FLT_SCALE; + A22 *= FLT_SCALE; } 
-#ifdef CV_SSE4_1 +#ifdef RLOF_SSE float CV_DECL_ALIGNED(16) bbuf[4]; _mm_store_ps(bbuf, _mm_add_ps(qb0, qb1)); b1 = bbuf[0] + bbuf[2]; @@ -1413,8 +1411,7 @@ namespace radial nextPts[ptidx] = nextPt + halfWin; gainVecs[ptidx] = gainVec; - - if (// j > 1 && + if ( (std::abs(delta.x - prevDelta.x) < 0.01 && std::abs(delta.y - prevDelta.y) < 0.01) || ((delta.ddot(delta) <= 0.001) && std::abs(prevGain.x - deltaGain.x) < 0.01) ) @@ -1440,7 +1437,7 @@ namespace radial const Point2f* prevPts; Point2f* nextPts; uchar* status; - cv::Point2f* gainVecs; + cv::Point2f* gainVecs; float* err; int maxWinSize; int minWinSize; diff --git a/modules/optflow/src/rlof/rlof_invokerbase.hpp b/modules/optflow/src/rlof/rlof_invokerbase.hpp index cb03f61384b..34b112fa570 100644 --- a/modules/optflow/src/rlof/rlof_invokerbase.hpp +++ b/modules/optflow/src/rlof/rlof_invokerbase.hpp @@ -5,6 +5,13 @@ #define _RLOF_INVOKERBASE_HPP_ +#if CV_CPU_COMPILE_SSE4_1 +#define RLOF_SSE +#elif CV_CPU_COMPILE_SSE4_2 +#define RLOF_SSE +#endif + + #define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n)) #define FLT_RESCALE 1 #ifndef _MSC_VER diff --git a/modules/optflow/src/rlof/rlof_localflow.cpp b/modules/optflow/src/rlof/rlof_localflow.cpp index b34a9c26e30..4d18380ea77 100644 --- a/modules/optflow/src/rlof/rlof_localflow.cpp +++ b/modules/optflow/src/rlof/rlof_localflow.cpp @@ -1,7 +1,6 @@ // This file is part of OpenCV project. // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. 
-#include "precomp.hpp" #include "rlof_localflow.h" #ifdef HAVE_OPENCV_CALIB3D #include "opencv2/calib3d.hpp" @@ -550,5 +549,4 @@ namespace optflow calcLocalOpticalFlowCore(prevPyramids, currPyramids, prevPoints, currPoints, internParam); } } -} - \ No newline at end of file +} \ No newline at end of file diff --git a/modules/optflow/src/rlof/rlof_localflow.h b/modules/optflow/src/rlof/rlof_localflow.h index bd59721f277..98347e9dcb1 100644 --- a/modules/optflow/src/rlof/rlof_localflow.h +++ b/modules/optflow/src/rlof/rlof_localflow.h @@ -33,7 +33,7 @@ T quickselect(const Mat & inp, int k) std::swap(values.at(l), values.at(ir)); return values.at(k); } - else + else { mid = (l + ir) >> 1; std::swap(values.at(mid), values.at(l+1)); @@ -46,7 +46,7 @@ T quickselect(const Mat & inp, int k) i = l + 1; j = ir; a = values.at(l+1); - while (true) + while (true) { do { @@ -91,7 +91,6 @@ class CImageBuffer } void setBlurFromRGB(const cv::Mat & inp) { - //cv::medianBlur(constNextImage, blurNextImg, 7); if(m_Overwrite) cv::GaussianBlur(inp, m_BlurredImage, cv::Size(7,7), -1); } diff --git a/modules/optflow/src/rlofflow.cpp b/modules/optflow/src/rlofflow.cpp index d7309c15210..08e16d9a31d 100644 --- a/modules/optflow/src/rlofflow.cpp +++ b/modules/optflow/src/rlofflow.cpp @@ -6,7 +6,7 @@ #include "rlof/geo_interpolation.hpp" #include "opencv2/ximgproc.hpp" namespace cv -{ +{ namespace optflow { @@ -30,8 +30,8 @@ namespace optflow currPyramid[0] = cv::Ptr< CImageBuffer>(new CImageBuffer); currPyramid[1] = cv::Ptr< CImageBuffer>(new CImageBuffer); } - virtual void setRLOFOpticalFlowParameter(Ptr val) CV_OVERRIDE { }//param = *val; } - virtual Ptr getRLOFOpticalFlowParameter() const CV_OVERRIDE { return Ptr(); } + virtual void setRLOFOpticalFlowParameter(Ptr val) CV_OVERRIDE { param = val; } + virtual Ptr getRLOFOpticalFlowParameter() const CV_OVERRIDE { return param; } virtual float getForwardBackward() const CV_OVERRIDE { return forwardBackwardThreshold; } virtual void 
setForwardBackward(float val) CV_OVERRIDE { forwardBackwardThreshold = val; } @@ -65,7 +65,7 @@ namespace optflow CV_Assert(!I0.empty() && I0.depth() == CV_8U && (I0.channels() == 3 || I0.channels() == 1)); CV_Assert(!I1.empty() && I1.depth() == CV_8U && (I1.channels() == 3 || I1.channels() == 1)); CV_Assert(I0.sameSize(I1)); - if (param.supportRegionType == SR_CROSS) + if (param->supportRegionType == SR_CROSS) CV_Assert( I0.channels() == 3 && I1.channels() == 3); CV_Assert(interp_type == InterpolationType::INTERP_EPIC || interp_type == InterpolationType::INTERP_GEO); @@ -86,7 +86,7 @@ namespace optflow prevPoints.erase(prevPoints.begin() + noPoints, prevPoints.end()); currPoints.resize(prevPoints.size()); - calcLocalOpticalFlow(prevImage, currImage, prevPyramid, currPyramid, prevPoints, currPoints, param); + calcLocalOpticalFlow(prevImage, currImage, prevPyramid, currPyramid, prevPoints, currPoints, *(param.get())); flow.create(prevImage.size(), CV_32FC2); Mat dense_flow = flow.getMat(); @@ -104,7 +104,7 @@ namespace optflow if (forwardBackwardThreshold > 0) { // reuse image pyramids - calcLocalOpticalFlow(currImage, prevImage, currPyramid, prevPyramid, currPoints, refPoints, param); + calcLocalOpticalFlow(currImage, prevImage, currPyramid, prevPyramid, currPoints, refPoints, *(param.get())); filtered_prevPoints.resize(prevPoints.size()); filtered_currPoints.resize(prevPoints.size()); @@ -167,7 +167,7 @@ namespace optflow } protected: - RLOFOpticalFlowParameter param; + Ptr param; float forwardBackwardThreshold; Ptr prevPyramid[2]; Ptr currPyramid[2]; @@ -189,6 +189,7 @@ namespace optflow int epicK, float epicSigma, float epicLambda, + bool use_post_proc, float fgs_lambda, float fgs_sigma) { @@ -200,6 +201,7 @@ namespace optflow algo->setEPICK(epicK); algo->setEPICSigma(epicSigma); algo->setEPICLambda(epicLambda); + algo->setUsePostProc(use_post_proc); algo->setFgsLambda(fgs_lambda); algo->setFgsSigma(fgs_sigma); return algo; @@ -235,7 +237,7 @@ namespace optflow 
Mat nextImage = nextImg.getMat(); Mat prevPtsMat = prevPts.getMat(); - if (param.useInitialFlow == false) + if (param->useInitialFlow == false) nextPts.create(prevPtsMat.size(), prevPtsMat.type(), -1, true); int npoints = 0; @@ -253,7 +255,7 @@ namespace optflow CV_Assert(nextPtsMat.checkVector(2, CV_32F, true) == npoints); std::vector prevPoints(npoints), nextPoints(npoints), refPoints; prevPtsMat.copyTo(cv::Mat(1, npoints, CV_32FC2, &prevPoints[0])); - if (param.useInitialFlow ) + if (param->useInitialFlow ) nextPtsMat.copyTo(cv::Mat(1, nextPtsMat.cols, CV_32FC2, &nextPoints[0])); cv::Mat statusMat; @@ -272,12 +274,12 @@ namespace optflow errorMat.setTo(0); } - calcLocalOpticalFlow(prevImage, nextImage, prevPyramid, currPyramid, prevPoints, nextPoints, param); + calcLocalOpticalFlow(prevImage, nextImage, prevPyramid, currPyramid, prevPoints, nextPoints, *(param.get())); cv::Mat(1,npoints , CV_32FC2, &nextPoints[0]).copyTo(nextPtsMat); if (forwardBackwardThreshold > 0) { // reuse image pyramids - calcLocalOpticalFlow(nextImage, prevImage, currPyramid, prevPyramid, nextPoints, refPoints, param); + calcLocalOpticalFlow(nextImage, prevImage, currPyramid, prevPyramid, nextPoints, refPoints, *(param.get())); } for (unsigned int r = 0; r < refPoints.size(); r++) { @@ -290,7 +292,7 @@ namespace optflow } protected: - RLOFOpticalFlowParameter param; + Ptr param; float forwardBackwardThreshold; Ptr prevPyramid[2]; Ptr currPyramid[2]; @@ -310,12 +312,12 @@ namespace optflow Ptr rlofParam , float forewardBackwardThreshold, Size gridStep, InterpolationType interp_type, - int epicK, float epicSigma, + int epicK, float epicSigma, float epicLambda, bool use_post_proc, float fgsLambda, float fgsSigma) { Ptr algo = DenseRLOFOpticalFlow::create( rlofParam, forewardBackwardThreshold, gridStep, interp_type, - epicK, epicSigma, use_post_proc, fgsLambda, fgsSigma); + epicK, epicSigma, epicLambda, use_post_proc, fgsLambda, fgsSigma); algo->calc(I0, I1, flow); algo->collectGarbage(); } 
@@ -342,4 +344,4 @@ namespace optflow } -} \ No newline at end of file +} From 2b41b05bb6fb63bc72dda494a6b41f694880ac7c Mon Sep 17 00:00:00 2001 From: senst Date: Tue, 11 Dec 2018 10:28:19 +0100 Subject: [PATCH 04/25] remove header that could not be found --- modules/optflow/doc/optflow.bib | 2 +- modules/optflow/perf/perf_rlof.cpp | 5 - modules/optflow/src/rlof/berlof_invoker.hpp | 4 +- .../optflow/src/rlof/geo_interpolation.cpp | 7 +- modules/optflow/src/rlof/plk_invoker.hpp | 290 +++++++++--------- modules/optflow/src/rlof/rlof_invoker.hpp | 110 +++---- modules/optflow/src/rlof/rlof_localflow.h | 6 +- modules/optflow/test/test_OF_accuracy.cpp | 5 - 8 files changed, 197 insertions(+), 232 deletions(-) diff --git a/modules/optflow/doc/optflow.bib b/modules/optflow/doc/optflow.bib index 5d4a47c30a9..5b4869564bd 100644 --- a/modules/optflow/doc/optflow.bib +++ b/modules/optflow/doc/optflow.bib @@ -109,4 +109,4 @@ @article{Tibshirani2008 author={Tibshirani, Ryan J}, journal={arXiv preprint arXiv:0806.3301}, year={2008} -} +} diff --git a/modules/optflow/perf/perf_rlof.cpp b/modules/optflow/perf/perf_rlof.cpp index 4c87f51ef78..910fa58375c 100644 --- a/modules/optflow/perf/perf_rlof.cpp +++ b/modules/optflow/perf/perf_rlof.cpp @@ -1,7 +1,6 @@ #include "perf_precomp.hpp" namespace opencv_test { namespace { - typedef tuple ST_SR_IM_Sparse_t; typedef TestBaseWithParam ST_SR_IM_Sparse; PERF_TEST_P(ST_SR_IM_Sparse, OpticalFlow_SparseRLOF, @@ -67,7 +66,6 @@ namespace opencv_test { Mat frame2 = imread(frame2_path); ASSERT_FALSE(frame1.empty()); ASSERT_FALSE(frame2.empty()); - Ptr param; Ptr< DenseRLOFOpticalFlow> algo = DenseRLOFOpticalFlow::create(); InterpolationType interp_type; @@ -75,13 +73,10 @@ namespace opencv_test { interp_type = INTERP_EPIC; if (get<0>(GetParam()) == "INTERP_GEO") interp_type = INTERP_GEO; - - TEST_CYCLE_N(5) { calcOpticalFlowDenseRLOF(frame1, frame2,flow, param, 1.0f, Size(get<1>(GetParam()), get<1>(GetParam())), interp_type); } - 
SANITY_CHECK_NOTHING(); } } diff --git a/modules/optflow/src/rlof/berlof_invoker.hpp b/modules/optflow/src/rlof/berlof_invoker.hpp index 93ccb94fd89..9c229bf542b 100644 --- a/modules/optflow/src/rlof/berlof_invoker.hpp +++ b/modules/optflow/src/rlof/berlof_invoker.hpp @@ -1275,7 +1275,7 @@ namespace berlof MEstimatorScale += eta * (etaValues[0] + etaValues[1] + etaValues[2] + etaValues[3] + etaValues[4] + etaValues[5] + etaValues[6] + etaValues[7]); -#endif +#endif float CV_DECL_ALIGNED(32) wbuf[4];// if( j == 0 ) { @@ -2128,4 +2128,4 @@ namespace beplk } } } -#endif \ No newline at end of file +#endif diff --git a/modules/optflow/src/rlof/geo_interpolation.cpp b/modules/optflow/src/rlof/geo_interpolation.cpp index 823070bf5e9..f93c0b6093e 100644 --- a/modules/optflow/src/rlof/geo_interpolation.cpp +++ b/modules/optflow/src/rlof/geo_interpolation.cpp @@ -2,7 +2,6 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // This functions have been contributed by Jonas Geisters -#include "precomp.hpp" #include "geo_interpolation.hpp" #include #include @@ -11,7 +10,7 @@ namespace cv namespace optflow { - double K_h1(float x, double h2) + inline double K_h1(float x, double h2) { return exp(-(0.5 / (h2)) * x); } @@ -135,7 +134,7 @@ namespace optflow /* * assign quellknoten ids. 
*/ - for (int i = 0; i < prevPoints.size(); i++) + for (int i = 0; i < static_cast(prevPoints.size()); i++) { int x = (int)prevPoints[i].x; int y = (int)prevPoints[i].y; @@ -484,4 +483,4 @@ namespace optflow } } -} \ No newline at end of file +} diff --git a/modules/optflow/src/rlof/plk_invoker.hpp b/modules/optflow/src/rlof/plk_invoker.hpp index 072cd3390b0..0d7b5ddb481 100644 --- a/modules/optflow/src/rlof/plk_invoker.hpp +++ b/modules/optflow/src/rlof/plk_invoker.hpp @@ -59,7 +59,7 @@ class TrackerInvoker : public cv::ParallelLoopBody useInitialFlow = _useInitialFlow; crossSegmentationThreshold = _crossSegmentationThreshold; } - + TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; void operator()(const cv::Range& range) const { @@ -75,7 +75,6 @@ class TrackerInvoker : public cv::ParallelLoopBody int winMaskwidth = static_cast(ceil(winSize.width / 16.f)) * 16; cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); winMaskMatBuf.setTo(1); - const float FLT_SCALE = (1.f/(1 << 16));//(1.f/(1 << 20)); // 20 int cn = I.channels(), cn2 = cn*2; @@ -120,7 +119,6 @@ class TrackerInvoker : public cv::ParallelLoopBody minWinSize,maxWinSize, crossSegmentationThreshold) == false) continue; - prevPt -= halfWin; iprevPt.x = cvFloor(prevPt.x); iprevPt.y = cvFloor(prevPt.y); @@ -142,12 +140,10 @@ class TrackerInvoker : public cv::ParallelLoopBody float a = prevPt.x - iprevPt.x; float b = prevPt.y - iprevPt.y; const int W_BITS = 14, W_BITS1 = 14; - int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - int dstep = (int)(derivI.step/derivI.elemSize1()); int step = (int)(I.step/I.elemSize1()); int mStep = (int)(winMaskMat.step/winMaskMat.elemSize1()); @@ -179,10 +175,10 @@ class TrackerInvoker : public cv::ParallelLoopBody const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; const uchar* src = 
(const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; - + short* Iptr = (short*)(IWinBuf.data + y*IWinBuf.step); short* dIptr = (short*)(derivIWinBuf.data + y*derivIWinBuf.step); - + x = 0; #ifdef RLOF_SSE for( ; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 ) @@ -194,7 +190,7 @@ class TrackerInvoker : public cv::ParallelLoopBody v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); - + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); @@ -204,13 +200,13 @@ class TrackerInvoker : public cv::ParallelLoopBody } t0 = _mm_and_si128(t0, mask_0_3_epi16); _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0,t0)); - - + + v00 = _mm_loadu_si128((const __m128i*)(dsrc)); v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep)); v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); - + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), @@ -240,7 +236,7 @@ class TrackerInvoker : public cv::ParallelLoopBody dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1); int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 + dsrc[dstep+cn2+1]*iw11, W_BITS1); - + Iptr[x] = (short)ival; dIptr[0] = (short)ixval; dIptr[1] = (short)iyval; @@ -248,7 +244,7 @@ class TrackerInvoker : public cv::ParallelLoopBody #endif } - + cv::Point2f backUpNextPt = nextPt; nextPt -= halfWin; Point2f prevDelta(0,0); //relates to h(t-1) @@ -261,8 +257,8 @@ class TrackerInvoker : 
public cv::ParallelLoopBody status[ptidx] = static_cast(j); inextPt.x = cvFloor(nextPt.x); inextPt.y = cvFloor(nextPt.y); - - + + if( inextPt.x + halfWin.x < 0 || inextPt.x + halfWin.x>= derivI.cols || inextPt.y + halfWin.y < 0 || inextPt.y + halfWin.y >= derivI.rows ) { @@ -282,7 +278,7 @@ class TrackerInvoker : public cv::ParallelLoopBody b2 = 0; b3 = 0; b4 = 0; - + if( j == 0 ) { // tensor @@ -302,7 +298,7 @@ class TrackerInvoker : public cv::ParallelLoopBody qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); __m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps(), qb2 = _mm_setzero_ps(); __m128 qb3 = _mm_setzero_ps(); - __m128 mmSumW1 = _mm_setzero_ps(), mmSumW2 = _mm_setzero_ps(); + __m128 mmSumW1 = _mm_setzero_ps(), mmSumW2 = _mm_setzero_ps(); __m128 mmSumI = _mm_setzero_ps(), mmSumW = _mm_setzero_ps(), mmSumDI = _mm_setzero_ps(); __m128 mmSumIy = _mm_setzero_ps(), mmSumIx = _mm_setzero_ps(); __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps(); @@ -317,60 +313,60 @@ class TrackerInvoker : public cv::ParallelLoopBody const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x )*cn; const short* Iptr = (const short*)(IWinBuf.data + ( y)*IWinBuf.step); const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); - + x = 0; -#ifdef RLOF_SSE - +#ifdef RLOF_SSE + for( ; x <= winSize.width*cn; x += 8, dIptr += 8*2 ) { // iw = iw << 14 (16384 short / 2) // iq0 = iw01 |iw00, iw01 |iw00, iw01 |iw00, iw01 |iw00 16 bit __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16); - + // I [0...8] __m128i diff0, diff1; __m128i I_0_7_epi16 = _mm_loadu_si128((const __m128i*)(Iptr + x)); // element 0 to 7 - + __m128i v00 = _mm_unpacklo_epi8( _mm_loadl_epi64((const __m128i*)(Jptr + x)) // J0, J1, J2, ..., J7, 64 bit nullen - , z); //J0 , 0, J1, 0, J2, 0 ... 
J7,0 - __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); + , z); //J0 , 0, J1, 0, J2, 0 ... J7,0 + __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); - - + + __m128i t0 = _mm_add_epi32 (_mm_madd_epi16( _mm_unpacklo_epi16(v00, v01), //J0, , J1, J1, J2, J2, ... J3 , J4 qw0), _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); - + __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1-5); - - + + __m128i Ilo = _mm_mullo_epi16(mmGainValue_epi16, I_0_7_epi16); - __m128i Ihi = _mm_mulhi_epi16(mmGainValue_epi16, I_0_7_epi16); + __m128i Ihi = _mm_mulhi_epi16(mmGainValue_epi16, I_0_7_epi16); __m128i Igain_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(Ilo, Ihi), bitShift); __m128i Igain_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(Ilo, Ihi), bitShift); __m128i Igain_epi16 = _mm_packs_epi32(Igain_0_3_epi32, Igain_4_7_epi32); - + // diff = J - I + I*gain.x + gain.y - __m128i mmDiff_epi16 = _mm_add_epi16( + __m128i mmDiff_epi16 = _mm_add_epi16( _mm_subs_epi16(_mm_packs_epi32(t0, t1), I_0_7_epi16), // J - I _mm_add_epi16(Igain_epi16, mmConstValue_epi16)); - + mmDiff_epi16 = _mm_and_si128(mmDiff_epi16, mask_0_7_epi16); - - __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... + + __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); - + diff1 = _mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13... 
diff0 = _mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9... @@ -395,7 +391,7 @@ class TrackerInvoker : public cv::ParallelLoopBody v10 = _mm_unpackhi_epi16(Ilo, Ihi); qb2 = _mm_add_ps(qb2, _mm_cvtepi32_ps(v00)); qb2 = _mm_add_ps(qb2, _mm_cvtepi32_ps(v10)); - + // b4+= diff __m128 mmDiff_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mmDiff_epi16)); // (_mm_srai_epi32(_mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16),16)); __m128 mmDiff_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16),16))); @@ -405,7 +401,7 @@ class TrackerInvoker : public cv::ParallelLoopBody { __m128 mask_0_4_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mask_0_7_epi16)); __m128 mask_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mask_0_7_epi16, mask_0_7_epi16), 16))); - + t0 = _mm_srai_epi32(Ixy_0, 16); // Iy0 Iy1 Iy2 Iy3 t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16); // Ix0 Ix1 Ix2 Ix3 @@ -414,81 +410,81 @@ class TrackerInvoker : public cv::ParallelLoopBody // 0 ... 3 __m128 I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(I_0_7_epi16, I_0_7_epi16), 16)); - + // A11 - A22 mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fy)); mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fy)); mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fx)); - + // sumIx und sumIy mmSumIx = _mm_add_ps(mmSumIx, fx); mmSumIy = _mm_add_ps(mmSumIy, fy); - + mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fx)); mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fy)); - + // sumI I_ps = _mm_blendv_ps(_mm_set1_ps(0), I_ps, mask_0_4_ps); mmSumI = _mm_add_ps(mmSumI,I_ps); - + // sumDI mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps( I_ps, I_ps)); - + t0 = _mm_srai_epi32(Ixy_1, 16); // Iy8 Iy9 Iy10 Iy11 t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_1, 16), 16); // Ix0 Ix1 Ix2 Ix3 - + fy = _mm_blendv_ps(_mm_set1_ps(0), _mm_cvtepi32_ps(t0), mask_4_7_ps); fx = _mm_blendv_ps(_mm_set1_ps(0), _mm_cvtepi32_ps(t1), mask_4_7_ps); // 4 ... 
7 I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(I_0_7_epi16, I_0_7_epi16), 16)); - - + + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fy)); mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fy)); mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fx)); - + // sumIx und sumIy mmSumIx = _mm_add_ps(mmSumIx, fx); mmSumIy = _mm_add_ps(mmSumIy, fy); - + mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fx)); mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fy)); - + // sumI I_ps = _mm_blendv_ps(_mm_set1_ps(0), I_ps, mask_4_7_ps); mmSumI = _mm_add_ps(mmSumI, I_ps); - - // sumW + + // sumW // sumDI mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps( I_ps, I_ps)); } - + } #else for( ; x < winSize.width*cn; x++, dIptr += 2 ) { - if( maskPtr[x] == 0) + if( maskPtr[x] == 0) continue; int J = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, W_BITS1-5); - int diff = J - Iptr[x] + static_cast(Iptr[x]) * gainVec.x + gainVec.y; - + int diff = J - Iptr[x] + static_cast(Iptr[x]) * gainVec.x + gainVec.y; + b1 += (float)(diff*dIptr[0]) * FLT_RESCALE; b2 += (float)(diff*dIptr[1]) * FLT_RESCALE; b3 += (float)(diff) * Iptr[x] * FLT_RESCALE; b4 += (float)(diff); - + // compute the Gradient Matrice if( j == 0 ) { A11 += (float)(dIptr[0]*dIptr[0]); A12 += (float)(dIptr[0]*dIptr[1]); A22 += (float)(dIptr[1]*dIptr[1]); - - + + dI += Iptr[x] * Iptr[x] * FLT_RESCALE; float dx = static_cast(dIptr[0]) * FLT_RESCALE; float dy = static_cast(dIptr[1]) * FLT_RESCALE; @@ -498,12 +494,12 @@ class TrackerInvoker : public cv::ParallelLoopBody w2 += dy * Iptr[x]; sumI += Iptr[x] * FLT_RESCALE; //sumW += FLT_RESCALE; - + } } #endif } - + float CV_DECL_ALIGNED(16) wbuf[4];// if( j == 0 ) { @@ -531,19 +527,19 @@ class TrackerInvoker : public cv::ParallelLoopBody w2 *= -FLT_SCALE; dI *= FLT_SCALE; #ifdef RLOF_SSE - float CV_DECL_ALIGNED(32) A11buf[4], A12buf[4], A22buf[4];// - + float CV_DECL_ALIGNED(32) A11buf[4], A12buf[4], A22buf[4];// + _mm_store_ps(A11buf, mmAxx); _mm_store_ps(A12buf, mmAxy); 
_mm_store_ps(A22buf, mmAyy); - + A11 = A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; A12 = A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; A22 = A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; #endif - A11 *= FLT_SCALE; - A12 *= FLT_SCALE; - A22 *= FLT_SCALE; + A11 *= FLT_SCALE; + A12 *= FLT_SCALE; + A22 *= FLT_SCALE; } #ifdef RLOF_SSE @@ -561,12 +557,12 @@ class TrackerInvoker : public cv::ParallelLoopBody mismatchMat.at(1,0) = b2 * FLT_SCALE; mismatchMat.at(2,0) = -b3 * FLT_SCALE; mismatchMat.at(3,0) = -b4 * FLT_SCALE; - - D = - A12*A12*sumI*sumI + dI*sumW*A12*A12 + 2*A12*sumI*sumIx*w2 + 2*A12*sumI*sumIy*w1 - - 2*dI*A12*sumIx*sumIy - 2*sumW*A12*w1*w2 + A11*A22*sumI*sumI - 2*A22*sumI*sumIx*w1 - - 2*A11*sumI*sumIy*w2 - sumIx*sumIx*w2*w2 + A22*dI*sumIx*sumIx + 2*sumIx*sumIy*w1*w2 + + D = - A12*A12*sumI*sumI + dI*sumW*A12*A12 + 2*A12*sumI*sumIx*w2 + 2*A12*sumI*sumIy*w1 + - 2*dI*A12*sumIx*sumIy - 2*sumW*A12*w1*w2 + A11*A22*sumI*sumI - 2*A22*sumI*sumIx*w1 + - 2*A11*sumI*sumIy*w2 - sumIx*sumIx*w2*w2 + A22*dI*sumIx*sumIx + 2*sumIx*sumIy*w1*w2 - sumIy*sumIy*w1*w1 + A11*dI*sumIy*sumIy + A22*sumW*w1*w1 + A11*sumW*w2*w2 - A11*A22*dI*sumW; - + float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + 4.f*A12*A12))/(2*winArea); @@ -581,10 +577,10 @@ class TrackerInvoker : public cv::ParallelLoopBody } break; } - - - D = (1.f / D); + + + D = (1.f / D); invTensorMat.at(0,0) = (A22*sumI*sumI - 2*sumI*sumIy*w2 + dI*sumIy*sumIy + sumW*w2*w2 - A22*dI*sumW)* D; invTensorMat.at(0,1) = (A12*dI*sumW - A12*sumI * sumI - dI*sumIx*sumIy + sumI*sumIx*w2 + sumI*sumIy*w1 - sumW*w1*w2)* D; invTensorMat.at(0,2) = (A12*sumI*sumIy - sumIy*sumIy*w1 - A22*sumI*sumIx - A12*sumW*w2 + A22*sumW*w1 + sumIx*sumIy*w2)* D; invTensorMat.at(0,3) = (A22*dI*sumIx - A12*dI*sumIy - sumIx*w2*w2 + A12*sumI*w2 - A22*sumI*w1 + sumIy*w1*w2) * D; @@ -606,18 +602,18 @@ class TrackerInvoker : public cv::ParallelLoopBody // 0.08 -12.10 Point2f delta(-resultMat.at(0), -resultMat.at(1)); Point2f 
deltaGain(-resultMat.at(2), -resultMat.at(3)); - - - if( j == 0) + + + if( j == 0) prevGain = deltaGain; nextPt += delta; nextPts[ptidx] = nextPt + halfWin; - gainVecs[ptidx]= gainVec; + gainVecs[ptidx]= gainVec; if( delta.ddot(delta) <= criteria.epsilon) break; - if( ( std::abs(delta.x - prevDelta.x) < 0.01 - && std::abs(delta.y - prevDelta.y) < 0.01) - || ((delta.ddot(delta) <= 0.001) + if( ( std::abs(delta.x - prevDelta.x) < 0.01 + && std::abs(delta.y - prevDelta.y) < 0.01) + || ((delta.ddot(delta) <= 0.001) && std::abs(prevGain.x - deltaGain.x) < 0.01) ) { @@ -626,23 +622,23 @@ class TrackerInvoker : public cv::ParallelLoopBody break; } - if(j > 0 && std::abs(delta.x - prevDelta.x) < 0.01 && + if(j > 0 && std::abs(delta.x - prevDelta.x) < 0.01 && std::abs(delta.y - prevDelta.y) < 0.01) { nextPts[ptidx] -= delta*0.5f; break; } - + prevDelta = delta; prevGain = deltaGain; - + } - - + + } } - + const Mat* prevImg; const Mat* nextImg; const Mat* prevDeriv; @@ -706,7 +702,7 @@ class TrackerInvoker : public cv::ParallelLoopBody useInitialFlow = _useInitialFlow; crossSegmentationThreshold = _crossSegmentationThreshold; } - + TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; void operator()(const cv::Range& range) const { @@ -722,8 +718,8 @@ class TrackerInvoker : public cv::ParallelLoopBody int winMaskwidth = static_cast(ceil(winSize.width / 8.f)) * 16; cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); winMaskMatBuf.setTo(1); - - + + const float FLT_SCALE = (1.f/(1 << 20)); // 20 @@ -761,7 +757,7 @@ class TrackerInvoker : public cv::ParallelLoopBody minWinSize,maxWinSize, crossSegmentationThreshold) == false) continue; - + prevPt -= halfWin; iprevPt.x = cvFloor(prevPt.x); iprevPt.y = cvFloor(prevPt.y); @@ -783,18 +779,18 @@ class TrackerInvoker : public cv::ParallelLoopBody float a = prevPt.x - iprevPt.x; float b = prevPt.y - iprevPt.y; const int W_BITS = 14, W_BITS1 = 14; - + int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); int 
iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - + int dstep = (int)(derivI.step/derivI.elemSize1()); int step = (int)(I.step/I.elemSize1()); int mStep = (int)(winMaskMat.step/winMaskMat.elemSize1()); CV_Assert( step == (int)(J.step/J.elemSize1()) ); float A11 = 0, A12 = 0, A22 = 0; - + #ifdef RLOF_SSE __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); @@ -805,7 +801,7 @@ class TrackerInvoker : public cv::ParallelLoopBody __m128i mmMask4_epi32; get4BitMask(winSize.width, mmMask4_epi32); #endif - + // extract the patch from the first image, compute covariation matrix of derivatives int x, y; for( y = 0; y < winSize.height; y++ ) @@ -813,41 +809,41 @@ class TrackerInvoker : public cv::ParallelLoopBody const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; - + short* Iptr = (short*)(IWinBuf.data + y*IWinBuf.step); short* dIptr = (short*)(derivIWinBuf.data + y*derivIWinBuf.step); - + x = 0; - + #ifdef RLOF_SSE for( ; x <= winSize.width*cn; x += 4, dsrc += 4*2, dIptr += 4*2 ) { - __m128i wMask = _mm_set_epi32(MaskSet * maskPtr[x+3], - MaskSet * maskPtr[x+2], - MaskSet * maskPtr[x+1], + __m128i wMask = _mm_set_epi32(MaskSet * maskPtr[x+3], + MaskSet * maskPtr[x+2], + MaskSet * maskPtr[x+1], MaskSet * maskPtr[x]); __m128i v00, v01, v10, v11, t0, t1; v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); - + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), 
_mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); - - + + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0,t0)); - - + + v00 = _mm_loadu_si128((const __m128i*)(dsrc)); v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep)); v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); - + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), @@ -855,7 +851,7 @@ class TrackerInvoker : public cv::ParallelLoopBody t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... - + if( x + 4 > winSize.width) { v00 = _mm_and_si128(v00, mmMask4_epi32); @@ -865,17 +861,17 @@ class TrackerInvoker : public cv::ParallelLoopBody _mm_storeu_si128((__m128i*)dIptr, v00); t0 = _mm_srai_epi32(v00, 16); // Iy0 Iy1 Iy2 Iy3 t1 = _mm_srai_epi32(_mm_slli_epi32(v00, 16), 16); // Ix0 Ix1 Ix2 Ix3 - + __m128 fy = _mm_cvtepi32_ps(t0); __m128 fx = _mm_cvtepi32_ps(t1); - - + + qA22 = _mm_add_ps(qA22, _mm_mul_ps(fy, fy)); qA12 = _mm_add_ps(qA12, _mm_mul_ps(fx, fy)); qA11 = _mm_add_ps(qA11, _mm_mul_ps(fx, fx)); } #else - + for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2 ) { if( maskPtr[x] == 0) @@ -890,7 +886,7 @@ class TrackerInvoker : public cv::ParallelLoopBody dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1); int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 + dsrc[dstep+cn2+1]*iw11, W_BITS1); - + Iptr[x] = (short)ival; dIptr[0] = (short)ixval; dIptr[1] = (short)iyval; @@ -900,7 +896,7 @@ class TrackerInvoker : public cv::ParallelLoopBody } #endif } - + #ifdef RLOF_SSE float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4]; _mm_store_ps(A11buf, qA11); @@ -910,25 +906,25 @@ class TrackerInvoker 
: public cv::ParallelLoopBody A12 += A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; A22 += A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; #endif - + A11 *= FLT_SCALE; A12 *= FLT_SCALE; A22 *= FLT_SCALE; - + float D = A11*A22 - A12*A12; float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + 4.f*A12*A12))/(2 * winArea); - + if( err ) err[ptidx] = (float)minEig; - + if( minEig < minEigThreshold || D < FLT_EPSILON ) { if( level == 0 && status ) status[ptidx] = 0; continue; } - + D = 1.f/D; nextPt -= halfWin; @@ -940,8 +936,8 @@ class TrackerInvoker : public cv::ParallelLoopBody status[ptidx] = static_cast(j); inextPt.x = cvFloor(nextPt.x); inextPt.y = cvFloor(nextPt.y); - - + + if( inextPt.x + halfWin.x < 0 || inextPt.x + halfWin.x>= derivI.cols || inextPt.y + halfWin.y < 0 || inextPt.y + halfWin.y >= derivI.rows ) { @@ -969,9 +965,9 @@ class TrackerInvoker : public cv::ParallelLoopBody const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x )*cn; const short* Iptr = (const short*)(IWinBuf.data + ( y)*IWinBuf.step); const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); - + x = 0; -#ifdef RLOF_SSE +#ifdef RLOF_SSE const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; for( ; x <= winSize.width*cn; x += 8, dIptr += 8*2 ) { @@ -983,29 +979,29 @@ class TrackerInvoker : public cv::ParallelLoopBody __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); - + __m128i t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); t1 = 
_mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1-5); - + __m128i mmDiff_epi16 = _mm_subs_epi16(_mm_packs_epi32(t0, t1), diff0); - - __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... + + __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); - + if( x > winSize.width*cn - 8) { Ixy_0 = _mm_and_si128(Ixy_0, mmMask0); Ixy_1 = _mm_and_si128(Ixy_1, mmMask1); } - - + + diff1 = _mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13... diff0 = _mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9... - + v10 = _mm_mullo_epi16(Ixy_0, diff0); v11 = _mm_mulhi_epi16(Ixy_0, diff0); @@ -1013,10 +1009,10 @@ class TrackerInvoker : public cv::ParallelLoopBody v10 = _mm_unpackhi_epi16(v10, v11); qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); - + // It * Ix It * Iy [4 ... 
7] // for set 1 hi sigma 1 - + v10 = _mm_mullo_epi16(Ixy_1, diff1); v11 = _mm_mulhi_epi16(Ixy_1, diff1); v00 = _mm_unpacklo_epi16(v10, v11); @@ -1027,7 +1023,7 @@ class TrackerInvoker : public cv::ParallelLoopBody #else for( ; x < winSize.width*cn; x++, dIptr += 2 ) { - if( dIptr[0] == 0 && dIptr[1] == 0) + if( dIptr[0] == 0 && dIptr[1] == 0) continue; int diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, @@ -1038,46 +1034,46 @@ class TrackerInvoker : public cv::ParallelLoopBody } #endif } - + #ifdef RLOF_SSE float CV_DECL_ALIGNED(16) bbuf[4]; _mm_store_ps(bbuf, _mm_add_ps(qb0, qb1)); b1 += bbuf[0] + bbuf[2]; b2 += bbuf[1] + bbuf[3]; #endif - + b1 *= FLT_SCALE; b2 *= FLT_SCALE; Point2f delta( (float)((A12*b2 - A22*b1) * D), (float)((A12*b1 - A11*b2) * D)); - + nextPt += delta; nextPts[ptidx] = nextPt + halfWin; if( delta.ddot(delta) <= criteria.epsilon) break; - if(j > 0 && std::abs(delta.x - prevDelta.x) < 0.01 && + if(j > 0 && std::abs(delta.x - prevDelta.x) < 0.01 && std::abs(delta.y - prevDelta.y) < 0.01) { nextPts[ptidx] -= delta*0.5f; break; } - + prevDelta = delta; - + } - - - - + + + + } } - + const Mat* prevImg; const Mat* nextImg; const Mat* prevDeriv; @@ -1100,4 +1096,4 @@ class TrackerInvoker : public cv::ParallelLoopBody } } }} -#endif \ No newline at end of file +#endif diff --git a/modules/optflow/src/rlof/rlof_invoker.hpp b/modules/optflow/src/rlof/rlof_invoker.hpp index c810debbdb9..2a4d39b1046 100644 --- a/modules/optflow/src/rlof/rlof_invoker.hpp +++ b/modules/optflow/src/rlof/rlof_invoker.hpp @@ -190,7 +190,7 @@ namespace ica x = 0; #ifdef RLOF_SSE - + for (; x <= winSize.width*cn; x += 4, dsrc += 4 * 2, dIptr += 4 * 2) { __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16); @@ -281,7 +281,7 @@ namespace ica break; } - + a = nextPt.x - inextPt.x; b = nextPt.y - inextPt.y; iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); @@ -344,7 
+344,7 @@ namespace ica __m128i mmScale = _mm_set1_epi16(static_cast(MEstimatorScale)); #endif - + buffIdx = 0; for (y = 0; y < winSize.height; y++) { @@ -354,7 +354,7 @@ namespace ica const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); x = 0; -#ifdef RLOF_SSE +#ifdef RLOF_SSE for (; x <= winSize.width*cn; x += 8, dIptr += 8 * 2) { __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16); @@ -377,12 +377,12 @@ namespace ica mmDiff_epi16 = _mm_and_si128(mmDiff_epi16, mask_0_7_epi16); - + __m128i scalediffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, mmScale); mmEta = _mm_add_epi16(mmEta, _mm_add_epi16(_mm_and_si128(scalediffIsPos_epi16, _mm_set1_epi16(2)), _mm_set1_epi16(-1))); - - __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... + + __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); __m128i abs_epi16 = _mm_abs_epi16(mmDiff_epi16); @@ -396,20 +396,20 @@ namespace ica // val = |It| -/+ sigma1 __m128i tmpParam1_epi16 = _mm_add_epi16(_mm_and_si128(diffIsPos_epi16, _mm_sub_epi16(mmDiff_epi16, mmParam1)), _mm_andnot_si128(diffIsPos_epi16, _mm_add_epi16(mmDiff_epi16, mmParam1))); - // It == 0 ? |It| > sigma13 + // It == 0 ? |It| > sigma13 mmDiff_epi16 = _mm_and_si128(bSet2_epi16, mmDiff_epi16); // It == val ? 
sigma0 < |It| < sigma1 mmDiff_epi16 = _mm_blendv_epi8(mmDiff_epi16, tmpParam1_epi16, bSet1_epi16); __m128i tale_epi16_ = _mm_blendv_epi8(mmOness_epi16, mmParam2_epi16, bSet1_epi16); // mask for 0 - 3 - // diff = diff * sigma2 + // diff = diff * sigma2 __m128i lo = _mm_mullo_epi16(tale_epi16_, mmDiff_epi16); __m128i hi = _mm_mulhi_epi16(tale_epi16_, mmDiff_epi16); - __m128i diff_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), s2bitShift); //diff 0_3_epi32 + __m128i diff_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), s2bitShift); //diff 0_3_epi32 __m128i diff_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), s2bitShift); // diff 4_7_epi32 - mmDiff_epi16 = _mm_packs_epi32(diff_0_3_epi32, diff_4_7_epi32); + mmDiff_epi16 = _mm_packs_epi32(diff_0_3_epi32, diff_4_7_epi32); diff1 = _mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13... diff0 = _mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9... 
@@ -449,7 +449,7 @@ namespace ica tale_0_3_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_0_3_ps, mask_0_4_ps); tale_4_7_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_4_7_ps, mask_4_7_ps); - + t0 = _mm_srai_epi32(Ixy_0, 16); // Iy0 Iy1 Iy2 Iy3 t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16); // Ix0 Ix1 Ix2 Ix3 @@ -488,18 +488,18 @@ namespace ica Jptr[x + step] * iw10 + Jptr[x + step + cn] * iw11, W_BITS1 - 5) - Iptr[x]; - - + + if (diff > MEstimatorScale) MEstimatorScale += eta; if (diff < MEstimatorScale) MEstimatorScale -= eta; - + calc_diff(diff, fParam0, fParam1, param[2]); - - + + float ixval = (float)(dIptr[0]); float iyval = (float)(dIptr[1]); b1 += (float)(diff*ixval); @@ -531,19 +531,19 @@ namespace ica } #endif } - + #ifdef RLOF_SSE short etaValues[8]; _mm_storeu_si128((__m128i*)(etaValues), mmEta); MEstimatorScale += eta * (etaValues[0] + etaValues[1] + etaValues[2] + etaValues[3] + etaValues[4] + etaValues[5] + etaValues[6] + etaValues[7]); - + #endif if (j == 0) { #ifdef RLOF_SSE - float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];// + float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];// _mm_store_ps(A11buf, mmAxx); _mm_store_ps(A12buf, mmAxy); @@ -554,8 +554,8 @@ namespace ica A12 = A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; A22 = A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; #endif - A11 *= FLT_SCALE; - A12 *= FLT_SCALE; + A11 *= FLT_SCALE; + A12 *= FLT_SCALE; A22 *= FLT_SCALE; @@ -683,7 +683,7 @@ namespace radial TrackerInvoker & operator=(const TrackerInvoker &) { return *this; }; void operator()(const cv::Range& range) const { - Point2f halfWin; + Point2f halfWin; cv::Size winSize; const Mat& I = *prevImg; const Mat& J = *nextImg; @@ -729,7 +729,7 @@ namespace radial else { nextPt = nextPts[ptidx] * 2.f; - + } nextPts[ptidx] = nextPt; @@ -895,7 +895,7 @@ namespace radial for (j = 0; j < criteria.maxCount; j++) { - + inextPt.x = cvFloor(nextPt.x); inextPt.y = cvFloor(nextPt.y); @@ -952,9 +952,9 @@ namespace radial { if (dIptr[0] == 0 && 
dIptr[1] == 0) continue; - int diff = static_cast(CV_DESCALE( Jptr[x] * iw00 + - Jptr[x + cn] * iw01 + - Jptr[x + step] * iw10 + + int diff = static_cast(CV_DESCALE( Jptr[x] * iw00 + + Jptr[x + cn] * iw01 + + Jptr[x + step] * iw10 + Jptr[x + step + cn] * iw11, W_BITS1 - 5) - Iptr[x] + Iptr[x] * gainVec.x + gainVec.y); residualMat.at(buffIdx++) = static_cast(diff); @@ -970,7 +970,7 @@ namespace radial float fParam1 = param[1] * 32.f; fParam0 = param[0] * MEstimatorScale; fParam1 = param[1] * MEstimatorScale; - + #ifdef RLOF_SSE qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); @@ -1011,7 +1011,7 @@ namespace radial const short* dIptr = (const short*)(derivIWinBuf.data + y * derivIWinBuf.step); x = 0; -#ifdef RLOF_SSE +#ifdef RLOF_SSE for (; x <= _winSize.width*cn; x += 8, dIptr += 8 * 2) @@ -1020,7 +1020,7 @@ namespace radial __m128i diff0, diff1; __m128i I_0_7_epi16 = _mm_loadu_si128((const __m128i*)(Iptr + x)); // von element 0 bis 7 __m128i v00 = _mm_unpacklo_epi8( - _mm_loadl_epi64((const __m128i*)(Jptr + x)) , z); //J0 , 0, J1, 0, J2, 0 ... J7,0 + _mm_loadl_epi64((const __m128i*)(Jptr + x)) , z); //J0 , 0, J1, 0, J2, 0 ... 
J7,0 __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); @@ -1036,7 +1036,7 @@ namespace radial t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1 - 5); t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1 - 5); - + __m128i lo = _mm_mullo_epi16(mmGainValue_epi16, I_0_7_epi16); __m128i hi = _mm_mulhi_epi16(mmGainValue_epi16, I_0_7_epi16); @@ -1049,14 +1049,14 @@ namespace radial _mm_subs_epi16(_mm_packs_epi32(t0, t1), I_0_7_epi16), // J - I _mm_add_epi16(Igain_epi16, mmConstValue_epi16)); - + mmDiff_epi16 = _mm_and_si128(mmDiff_epi16, mask_0_7_epi16); - + __m128i scalediffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, mmScale); mmEta = _mm_add_epi16(mmEta, _mm_add_epi16(_mm_and_si128(scalediffIsPos_epi16, _mm_set1_epi16(2)), _mm_set1_epi16(-1))); - - + + __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); @@ -1079,10 +1079,10 @@ namespace radial __m128i tale_epi16_ = _mm_blendv_epi8(mmOness_epi16, mmParam2_epi16, bSet1_epi16); // mask for 0 - 3 - // diff = diff * sigma2 + // diff = diff * sigma2 lo = _mm_mullo_epi16(tale_epi16_, mmDiff_epi16); hi = _mm_mulhi_epi16(tale_epi16_, mmDiff_epi16); - __m128i diff_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), s2bitShift); //diff 0_3_epi32 + __m128i diff_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), s2bitShift); //diff 0_3_epi32 __m128i diff_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), s2bitShift); // diff 4_7_epi32 mmDiff_epi16 = _mm_packs_epi32(diff_0_3_epi32, diff_4_7_epi32); // ,da das ergebniss kleiner als 16 bit sein sollte @@ -1120,7 +1120,7 @@ namespace radial qb3 = _mm_add_ps(qb3, _mm_cvtepi32_ps(diff_0_3_epi32)); qb3 = _mm_add_ps(qb3, _mm_cvtepi32_ps(diff_4_7_epi32)); - + if (j == 0) { 
__m128 bSet1_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet1_epi16)); @@ -1138,9 +1138,8 @@ namespace radial tale_0_3_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_0_3_ps, mask_0_4_ps); tale_4_7_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_4_7_ps, mask_4_7_ps); - - t0 = _mm_srai_epi32(Ixy_0, 16); // Iy0 Iy1 Iy2 Iy3 - t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16); // Ix0 Ix1 Ix2 Ix3 + t0 = _mm_srai_epi32(Ixy_0, 16); + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16); __m128 fy = _mm_cvtepi32_ps(t0); __m128 fx = _mm_cvtepi32_ps(t1); @@ -1212,9 +1211,7 @@ namespace radial { if (maskPtr[x] == 0) continue; - - float J = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + - Jptr[x + step] * iw10 + Jptr[x + step + cn] * iw11, + float J = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + Jptr[x + step] * iw10 + Jptr[x + step + cn] * iw11, W_BITS1 - 5); short ixval = static_cast(dIptr[0]); short iyval = static_cast(dIptr[1]); @@ -1224,8 +1221,6 @@ namespace radial MEstimatorScale += eta; if (diff < MEstimatorScale) MEstimatorScale -= eta; - // compute the missmatch vector - if (abss > fParam1) { diff = 0.f; @@ -1238,8 +1233,6 @@ namespace radial { diff = param[2] * (diff + fParam1); } - - b1 += (float)(diff*ixval); b2 += (float)(diff*iyval); ; b3 += (float)(diff)* Iptr[x]; @@ -1283,8 +1276,6 @@ namespace radial } #endif } - - #ifdef RLOF_SSE short etaValues[8]; _mm_storeu_si128((__m128i*)(etaValues), mmEta); @@ -1318,12 +1309,8 @@ namespace radial w1 *= -FLT_SCALE; w2 *= -FLT_SCALE; dI *= FLT_SCALE; - - #ifdef RLOF_SSE - - float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];// - + float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4]; _mm_store_ps(A11buf, mmAxx); _mm_store_ps(A12buf, mmAxy); _mm_store_ps(A22buf, mmAyy); @@ -1336,11 +1323,7 @@ namespace radial A11 *= FLT_SCALE; A12 *= FLT_SCALE; A22 *= FLT_SCALE; - - } - - #ifdef RLOF_SSE float CV_DECL_ALIGNED(16) bbuf[4]; _mm_store_ps(bbuf, _mm_add_ps(qb0, qb1)); @@ -1350,9 +1333,7 @@ namespace radial b3 = bbuf[0] + 
bbuf[1] + bbuf[2] + bbuf[3]; _mm_store_ps(bbuf, qb3); b4 = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; - #endif - mismatchMat.at(0, 0) = b1 * FLT_SCALE; mismatchMat.at(1, 0) = b2 * FLT_SCALE; mismatchMat.at(2, 0) = -b3 * FLT_SCALE; @@ -1397,13 +1378,12 @@ namespace radial resultMat = invTensorMat * mismatchMat; - Point2f delta(-resultMat.at(0), -resultMat.at(1)); Point2f deltaGain(resultMat.at(2), resultMat.at(3)); - + if (j == 0) prevGain = deltaGain; gainVec += deltaGain * 0.8; @@ -1454,4 +1434,4 @@ namespace radial } } } -#endif \ No newline at end of file +#endif diff --git a/modules/optflow/src/rlof/rlof_localflow.h b/modules/optflow/src/rlof/rlof_localflow.h index 98347e9dcb1..89ac2f8db21 100644 --- a/modules/optflow/src/rlof/rlof_localflow.h +++ b/modules/optflow/src/rlof/rlof_localflow.h @@ -25,9 +25,9 @@ T quickselect(const Mat & inp, int k) l = 0; ir = MAX(values.rows, values.cols) - 1; - while (true) + while(true) { - if (ir <= l + 1) + if (ir <= l + 1) { if (ir == l + 1 && values.at(ir) < values.at(l)) std::swap(values.at(l), values.at(ir)); @@ -116,4 +116,4 @@ void calcLocalOpticalFlow( const RLOFOpticalFlowParameter & param); } } -#endif \ No newline at end of file +#endif diff --git a/modules/optflow/test/test_OF_accuracy.cpp b/modules/optflow/test/test_OF_accuracy.cpp index b517896e045..e99a44c02c9 100644 --- a/modules/optflow/test/test_OF_accuracy.cpp +++ b/modules/optflow/test/test_OF_accuracy.cpp @@ -181,7 +181,6 @@ TEST(SparseOpticalFlow, ReferenceAccuracy) Mat frame1, frame2, GT; ASSERT_TRUE(readRubberWhale(frame1, frame2, GT)); vector prevPts, currPts; - for (int r = 0; r < frame1.rows; r+=10) { for (int c = 0; c < frame1.cols; c+=10) @@ -197,7 +196,6 @@ TEST(SparseOpticalFlow, ReferenceAccuracy) Ptr param; param->supportRegionType = SR_CROSS; param->useIlluminationModel = true; - param->solverType = ST_BILINEAR; algo->setRLOFOpticalFlowParameter(param); algo->calc(frame1, frame2, prevPts, currPts, status, err); @@ -209,7 +207,6 @@ 
TEST(SparseOpticalFlow, ReferenceAccuracy) EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.34f); param->useIlluminationModel = false; - param->solverType = ST_BILINEAR; algo->setRLOFOpticalFlowParameter(param); algo->calc(frame1, frame2, prevPts, currPts, status, err); @@ -251,7 +248,6 @@ TEST(DenseOpticalFlow_RLOF, ReferenceAccuracy) { Mat frame1, frame2, GT; ASSERT_TRUE(readRubberWhale(frame1, frame2, GT)); - Mat flow; Ptr algo = DenseRLOFOpticalFlow::create(); Ptr param; @@ -260,7 +256,6 @@ TEST(DenseOpticalFlow_RLOF, ReferenceAccuracy) algo->setRLOFOpticalFlowParameter(param); algo->setForwardBackward(1.0f); algo->setGridStep(cv::Size(4, 4)); - algo->setInterpolation(INTERP_EPIC); algo->calc(frame1, frame2, flow); From 4956de9010c0df00cea94e3007ae9b59ae41e7fa Mon Sep 17 00:00:00 2001 From: senst Date: Tue, 11 Dec 2018 12:26:52 +0100 Subject: [PATCH 05/25] remove whitespaces fix perf and accuracy tests --- .../include/opencv2/optflow/rlofflow.hpp | 32 ++++++---- modules/optflow/perf/perf_rlof.cpp | 24 +++----- modules/optflow/src/rlof/berlof_invoker.hpp | 28 +++++---- modules/optflow/src/rlof/plk_invoker.hpp | 19 ++++-- modules/optflow/src/rlof/rlof_invoker.hpp | 59 +++++++++---------- modules/optflow/src/rlof/rlof_invokerbase.hpp | 9 ++- modules/optflow/src/rlofflow.cpp | 12 ++-- modules/optflow/test/test_OF_accuracy.cpp | 19 +++--- 8 files changed, 103 insertions(+), 99 deletions(-) diff --git a/modules/optflow/include/opencv2/optflow/rlofflow.hpp b/modules/optflow/include/opencv2/optflow/rlofflow.hpp index f088562215e..4911c53b947 100644 --- a/modules/optflow/include/opencv2/optflow/rlofflow.hpp +++ b/modules/optflow/include/opencv2/optflow/rlofflow.hpp @@ -21,7 +21,7 @@ namespace optflow * This RLOF implementation can be seen as an improved pyramidal iterative Lucas-Kanade and includes * a set of improving modules. 
The main improvements in respect to the pyramidal iterative Lucas-Kanade * are: - * - A more robust redecending M-estimator framework (see @cite Senst2012) to improve the accuracy at + * - A more robust redecending M-estimator framework (see @cite Senst2012) to improve the accuracy at * motion boundaries and appearing and disappearing pixels. * - an adaptive support region strategies to improve the accuracy at motion boundaries to reduce the * corona effect, i.e oversmoothing of the PLK at motion/object boundaries. The cross-based segementation @@ -143,8 +143,8 @@ namespace optflow float globalMotionRansacThreshold; /**< To apply the global motion prior motion vectors will be computed on a regulary sampled which * are the basis for Homography estimation using RANSAC. The reprojection threshold is based on - * n-th percentil (given by this value [0 ... 100]) of the forward backward errors relating to - * the motion vectors. See @cite Senst2016 for more details. + * n-th percentil (given by this value [0 ... 100]) of the motion vectors magnitude. + * See @cite Senst2016 for more details. */ }; @@ -165,7 +165,8 @@ namespace optflow * For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details. * Parameters have been described in @cite Senst2012 @cite Senst2013 @cite Senst2014 and @cite Senst2016. * - * @note SIMD parallelization is available compiled with SSE4.1. If the grid size is set to (1,1) and the forward backward threshold <= 0 that the dense optical flow field is purely. + * @note SIMD parallelization is only available when compiling with SSE4.1. If the grid size is set to (1,1) and the + * forward backward threshold <= 0 that the dense optical flow field is purely. * computed with the RLOF. * * @see optflow::calcOpticalFlowDenseRLOF(), optflow::RLOFOpticalFlowParameter @@ -207,7 +208,7 @@ namespace optflow CV_WRAP virtual void setGridStep(Size val) = 0; //! @brief Interpolation used to compute the dense optical flow. 
- /** Two interpolation algorithms have been supported + /** Two interpolation algorithms are supported * - **INTERP_GEO** applies the fast geodesic interpolation, see @cite Geistert2016. * - **INTERP_EPIC_RESIDUAL** applies the edge-preserving interpolation, see @cite Revaud2015,Geistert2016. * @see ximgproc::EdgeAwareInterpolator, getInterpolation @@ -280,6 +281,7 @@ namespace optflow * @param rlofParam see optflow::RLOFOpticalFlowParameter * @param forwardBackwardThreshold see setForwardBackward * @param gridStep see setGridStep + * @param interp_type see setInterpolation * @param epicK see setEPICK * @param epicSigma see setEPICSigma * @param epicLambda see setEPICLambda @@ -308,8 +310,8 @@ namespace optflow * For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details. * Parameters have been described in @cite Senst2012, @cite Senst2013, @cite Senst2014 and @cite Senst2016. * - * @note SIMD parallelization is available compiled with SSE4.1. - *@see optflow::calcOpticalFlowSparseRLOF(), optflow::RLOFOpticalFlowParameter + * @note SIMD parallelization is only available when compiling with SSE4.1. + * @see optflow::calcOpticalFlowSparseRLOF(), optflow::RLOFOpticalFlowParameter */ class CV_EXPORTS_W SparseRLOFOpticalFlow : public SparseOpticalFlow { @@ -365,7 +367,7 @@ namespace optflow * = SupportRegionType::SR_CROSS) image has to be a 8-bit 3 channel image. * @param flow computed flow image that has the same size as I0 and type CV_32FC2. * @param rlofParam see optflow::RLOFOpticalFlowParameter - * @param forewardBackwardThreshold Threshold for the forward backward confidence check. + * @param forwardBackwardThreshold Threshold for the forward backward confidence check. * For each grid point \f$ \mathbf{x} \f$ a motion vector \f$ d_{I0,I1}(\mathbf{x}) \f$ is computed. 
* If the forward backward error \f[ EP_{FB} = || d_{I0,I1} + d_{I1,I0} || \f] * is larger than threshold given by this function then the motion vector will not be used by the following @@ -374,6 +376,10 @@ namespace optflow * @param gridStep Size of the grid to spawn the motion vectors. For each grid point a motion vector is computed. * Some motion vectors will be removed due to the forwatd backward threshold (if set >0). The rest will be the * base of the vector field interpolation. + * @param interp_type interpolation method used to compute the dense optical flow. Two interpolation algorithms are + * supported: + * - **INTERP_GEO** applies the fast geodesic interpolation, see @cite Geistert2016. + * - **INTERP_EPIC_RESIDUAL** applies the edge-preserving interpolation, see @cite Revaud2015,Geistert2016. * @param epicK see ximgproc::EdgeAwareInterpolator() sets the respective parameter. * @param epicSigma see ximgproc::EdgeAwareInterpolator() sets the respective parameter. * @param epicLambda see ximgproc::EdgeAwareInterpolator() sets the respective parameter. @@ -386,13 +392,13 @@ namespace optflow * @note If the grid size is set to (1,1) and the forward backward threshold <= 0 that the dense optical flow field is purely * computed with the RLOF. * - * @note SIMD parallelization is available compiled with SSE4.1. + * @note SIMD parallelization is only available when compiling with SSE4.1. 
* * @sa optflow::DenseRLOFOpticalFlow, optflow::RLOFOpticalFlowParameter */ CV_EXPORTS_W void calcOpticalFlowDenseRLOF(InputArray I0, InputArray I1, InputOutputArray flow, Ptr rlofParam = Ptr(), - float forewardBackwardThreshold = 0, Size gridStep = Size(6, 6), + float forwardBackwardThreshold = 0, Size gridStep = Size(6, 6), InterpolationType interp_type = InterpolationType::INTERP_EPIC, int epicK = 128, float epicSigma = 0.05f, float epicLambda = 100.f, bool use_post_proc = true, float fgsLambda = 500.0f, float fgsSigma = 1.5f); @@ -416,9 +422,9 @@ namespace optflow * corresponding features has passed the forward backward check. * @param err output vector of errors; each element of the vector is set to the forward backward error for the corresponding feature. * @param rlofParam see optflow::RLOFOpticalFlowParameter - * @param forewardBackwardThreshold Threshold for the forward backward confidence check. If forewardBackwardThreshold <=0 the forward + * @param forwardBackwardThreshold Threshold for the forward backward confidence check. If forewardBackwardThreshold <=0 the forward * - * @note SIMD parallelization is available compiled with SSE4.1. + * @note SIMD parallelization is only available when compiling with SSE4.1. * * Parameters have been described in @cite Senst2012, @cite Senst2013, @cite Senst2014 and @cite Senst2016. * For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details. @@ -427,7 +433,7 @@ namespace optflow InputArray prevPts, InputOutputArray nextPts, OutputArray status, OutputArray err, Ptr rlofParam = Ptr(), - float forewardBackwardThreshold = 0); + float forwardBackwardThreshold = 0); //! 
Additional interface to the Dense RLOF algorithm - optflow::calcOpticalFlowDenseRLOF() CV_EXPORTS_W Ptr createOptFlow_DenseRLOF(); diff --git a/modules/optflow/perf/perf_rlof.cpp b/modules/optflow/perf/perf_rlof.cpp index 910fa58375c..dda15a4b94e 100644 --- a/modules/optflow/perf/perf_rlof.cpp +++ b/modules/optflow/perf/perf_rlof.cpp @@ -1,6 +1,5 @@ #include "perf_precomp.hpp" -namespace opencv_test { - namespace { +namespace opencv_test { namespace { typedef tuple ST_SR_IM_Sparse_t; typedef TestBaseWithParam ST_SR_IM_Sparse; PERF_TEST_P(ST_SR_IM_Sparse, OpticalFlow_SparseRLOF, @@ -10,12 +9,8 @@ namespace opencv_test { testing::Values(true, false)) ) { - string frame1_path = TS::ptr()->get_data_path() + "/cv/optflow/RubberWhale1.png"; - string frame2_path = TS::ptr()->get_data_path() + "/cv/optflow/RubberWhale2.png"; - frame1_path.erase(std::remove_if(frame1_path.begin(), frame1_path.end(), isspace), frame1_path.end()); - frame2_path.erase(std::remove_if(frame2_path.begin(), frame2_path.end(), isspace), frame2_path.end()); - Mat frame1 = imread(frame1_path); - Mat frame2 = imread(frame2_path); + Mat frame1 = imread(getDataPath("cv/optflow/RubberWhale1.png")); + Mat frame2 = imread(getDataPath("cv/optflow/RubberWhale2.png")); ASSERT_FALSE(frame1.empty()); ASSERT_FALSE(frame2.empty()); vector prevPts, currPts; @@ -29,7 +24,7 @@ namespace opencv_test { vector status(prevPts.size()); vector err(prevPts.size()); - Ptr param; + Ptr param = Ptr(new RLOFOpticalFlowParameter); if (get<0>(GetParam()) == "ST_BILINEAR") param->solverType = ST_BILINEAR; if (get<0>(GetParam()) == "ST_STANDART") @@ -57,16 +52,11 @@ namespace opencv_test { ) { Mat flow; - string frame1_path = TS::ptr()->get_data_path() + "/cv/optflow/RubberWhale1.png"; - string frame2_path = TS::ptr()->get_data_path() + "/cv/optflow/RubberWhale2.png"; - // removing space may be an issue on windows machines - frame1_path.erase(std::remove_if(frame1_path.begin(), frame1_path.end(), isspace), frame1_path.end()); - 
frame2_path.erase(std::remove_if(frame2_path.begin(), frame2_path.end(), isspace), frame2_path.end()); - Mat frame1 = imread(frame1_path); - Mat frame2 = imread(frame2_path); + Mat frame1 = imread(getDataPath("cv/optflow/RubberWhale1.png")); + Mat frame2 = imread(getDataPath("cv/optflow/RubberWhale1.png")); ASSERT_FALSE(frame1.empty()); ASSERT_FALSE(frame2.empty()); - Ptr param; + Ptr param = Ptr(new RLOFOpticalFlowParameter);; Ptr< DenseRLOFOpticalFlow> algo = DenseRLOFOpticalFlow::create(); InterpolationType interp_type; if (get<0>(GetParam()) == "INTERP_EPIC") diff --git a/modules/optflow/src/rlof/berlof_invoker.hpp b/modules/optflow/src/rlof/berlof_invoker.hpp index 9c229bf542b..3dfed1ce6bd 100644 --- a/modules/optflow/src/rlof/berlof_invoker.hpp +++ b/modules/optflow/src/rlof/berlof_invoker.hpp @@ -95,9 +95,11 @@ namespace berlof } TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; - void operator()(const cv::Range& range) const + void operator()(const cv::Range& range) const CV_OVERRIDE { - +#ifdef DEBUG_INVOKER + printf("berlof::ica");fflush(stdout); +#endif Point2f halfWin; cv::Size winSize; const Mat& I = *prevImg; @@ -610,8 +612,11 @@ namespace berlof } TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; - void operator()(const cv::Range& range) const + void operator()(const cv::Range& range) const CV_OVERRIDE { +#ifdef DEBUG_INVOKER + printf("berlof::radial");fflush(stdout); +#endif Point2f halfWin; cv::Size winSize; const Mat& I = *prevImg; @@ -725,16 +730,14 @@ namespace berlof for( y = 0; y < winSize.height; y++ ) { x = 0; - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; short* Iptr = (short*)(IWinBuf.data + y*IWinBuf.step); short* dIptr = (short*)(derivIWinBuf.data + y*derivIWinBuf.step); - - - #ifdef RLOF_SSE + const 
tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; for( ; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 ) { __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16); @@ -812,9 +815,11 @@ namespace berlof cv::Point2f backUpGain = gainVec; cv::Size _winSize = winSize; int j; + #ifdef RLOF_SSE __m128i mmMask0, mmMask1, mmMask; getWBitMask(_winSize.width, mmMask0, mmMask1, mmMask); __m128 mmOnes = _mm_set1_ps(1.f ); + #endif float MEstimatorScale = 1; int buffIdx = 0; float c[8]; @@ -1172,11 +1177,11 @@ namespace berlof (Jptr[x] << 5) + illValue}; - float J = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + + float J_val = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, W_BITS1-5); - int diff = J + illValue; + int diff = J_val + illValue; @@ -1274,9 +1279,8 @@ namespace berlof _mm_storeu_si128((__m128i*)(etaValues), mmEta); MEstimatorScale += eta * (etaValues[0] + etaValues[1] + etaValues[2] + etaValues[3] + etaValues[4] + etaValues[5] + etaValues[6] + etaValues[7]); - + float CV_DECL_ALIGNED(32) wbuf[4]; #endif - float CV_DECL_ALIGNED(32) wbuf[4];// if( j == 0 ) { #ifdef RLOF_SSE @@ -1575,7 +1579,7 @@ namespace beplk TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; - void operator()(const cv::Range& range) const + void operator()(const cv::Range& range) const CV_OVERRIDE { cv::Size winSize; cv::Point2f halfWin; diff --git a/modules/optflow/src/rlof/plk_invoker.hpp b/modules/optflow/src/rlof/plk_invoker.hpp index 0d7b5ddb481..14b6eb78d22 100644 --- a/modules/optflow/src/rlof/plk_invoker.hpp +++ b/modules/optflow/src/rlof/plk_invoker.hpp @@ -61,8 +61,11 @@ class TrackerInvoker : public cv::ParallelLoopBody } TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; - void operator()(const cv::Range& range) const + void operator()(const cv::Range& range) const CV_OVERRIDE { +#ifdef DEBUG_INVOKER + 
printf("plk::radial");fflush(stdout); +#endif cv::Size winSize; cv::Point2f halfWin; @@ -467,10 +470,10 @@ class TrackerInvoker : public cv::ParallelLoopBody { if( maskPtr[x] == 0) continue; - int J = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + + int J_val = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, W_BITS1-5); - int diff = J - Iptr[x] + static_cast(Iptr[x]) * gainVec.x + gainVec.y; + int diff = J_val - Iptr[x] + static_cast(Iptr[x]) * gainVec.x + gainVec.y; b1 += (float)(diff*dIptr[0]) * FLT_RESCALE; b2 += (float)(diff*dIptr[1]) * FLT_RESCALE; @@ -499,8 +502,9 @@ class TrackerInvoker : public cv::ParallelLoopBody } #endif } - - float CV_DECL_ALIGNED(16) wbuf[4];// +#ifdef RLOF_SSE + float CV_DECL_ALIGNED(16) wbuf[4]; +#endif if( j == 0 ) { #ifdef RLOF_SSE @@ -704,8 +708,11 @@ class TrackerInvoker : public cv::ParallelLoopBody } TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; - void operator()(const cv::Range& range) const + void operator()(const cv::Range& range) const CV_OVERRIDE { +#ifdef DEBUG_INVOKER + printf("plk::ica");fflush(stdout); +#endif cv::Size winSize; cv::Point2f halfWin; diff --git a/modules/optflow/src/rlof/rlof_invoker.hpp b/modules/optflow/src/rlof/rlof_invoker.hpp index 2a4d39b1046..7da20aa7d77 100644 --- a/modules/optflow/src/rlof/rlof_invoker.hpp +++ b/modules/optflow/src/rlof/rlof_invoker.hpp @@ -12,26 +12,6 @@ namespace rlof { namespace ica { - inline int calc_diff(int diff,float fParam0,float fParam1,float param_2) - { - int abss = (diff < 0) ? 
-diff : diff; - if (abss > fParam1) - { - diff = 0; - } - else if (abss > fParam0 && diff >= 0) - { - //diff = fParam1 * (diff - fParam1); - diff = static_cast(param_2 * (diff - fParam1)); - } - else if (abss > fParam0 && diff < 0) - { - //diff = fParam1 * (diff + fParam1); - diff = static_cast(param_2 * (diff + fParam1)); - - } - return diff; - } class TrackerInvoker : public cv::ParallelLoopBody { public: @@ -77,8 +57,11 @@ namespace ica crossSegmentationThreshold = _crossSegmentationThreshold; } TrackerInvoker & operator=(const TrackerInvoker &) { return *this; }; - void operator()(const cv::Range& range) const + void operator()(const cv::Range& range) const CV_OVERRIDE { +#ifdef DEBUG_INVOKER + printf("rlof::ica");fflush(stdout); +#endif cv::Size winSize; cv::Point2f halfWin; @@ -497,8 +480,22 @@ namespace ica if (diff < MEstimatorScale) MEstimatorScale -= eta; - calc_diff(diff, fParam0, fParam1, param[2]); + int abss = (diff < 0) ? -diff : diff; + if (abss > fParam1) + { + diff = 0; + } + else if (abss > fParam0 && diff >= 0) + { + //diff = fParam1 * (diff - fParam1); + diff = static_cast(param[2] * (diff - fParam1)); + } + else if (abss > fParam0 && diff < 0) + { + //diff = fParam1 * (diff + fParam1); + diff = static_cast(param[2] * (diff + fParam1)); + } float ixval = (float)(dIptr[0]); float iyval = (float)(dIptr[1]); @@ -507,9 +504,7 @@ namespace ica if ( j == 0) { - int abss = (diff < 0) ? 
-diff : diff; float tale = param[2] * FLT_RESCALE; - //float tale = fParam1 * FLT_RESCALE; if (abss < fParam0) { tale = FLT_RESCALE; @@ -681,8 +676,11 @@ namespace radial } TrackerInvoker & operator=(const TrackerInvoker &) { return *this; }; - void operator()(const cv::Range& range) const + void operator()(const cv::Range& range) const CV_OVERRIDE { +#ifdef DEBUG_INVOKER + printf("rlof::radial");fflush(stdout); +#endif Point2f halfWin; cv::Size winSize; const Mat& I = *prevImg; @@ -801,7 +799,6 @@ namespace radial int x, y; for (y = 0; y < winSize.height; y++) { - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; @@ -811,6 +808,7 @@ namespace radial x = 0; #ifdef RLOF_SSE + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; for (; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4 * 2, dIptr += 4 * 2) { __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16); @@ -886,9 +884,11 @@ namespace radial cv::Point2f backUpGain = gainVec; cv::Size _winSize = winSize; int j; +#ifdef RLOF_SSE __m128i mmMask0, mmMask1, mmMask; getWBitMask(_winSize.width, mmMask0, mmMask1, mmMask); __m128 mmOnes = _mm_set1_ps(1.f); +#endif float MEstimatorScale = 1; int buffIdx = 0; float minEigValue; @@ -1211,11 +1211,11 @@ namespace radial { if (maskPtr[x] == 0) continue; - float J = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + Jptr[x + step] * iw10 + Jptr[x + step + cn] * iw11, + float J_val = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + Jptr[x + step] * iw10 + Jptr[x + step + cn] * iw11, W_BITS1 - 5); short ixval = static_cast(dIptr[0]); short iyval = static_cast(dIptr[1]); - int diff = J - static_cast(Iptr[x]) + Iptr[x] * gainVec.x + gainVec.y; + int diff = J_val - static_cast(Iptr[x]) + Iptr[x] * 
gainVec.x + gainVec.y; int abss = (diff < 0) ? -diff : diff; if (diff > MEstimatorScale) MEstimatorScale += eta; @@ -1281,9 +1281,8 @@ namespace radial _mm_storeu_si128((__m128i*)(etaValues), mmEta); MEstimatorScale += eta * (etaValues[0] + etaValues[1] + etaValues[2] + etaValues[3] + etaValues[4] + etaValues[5] + etaValues[6] + etaValues[7]); - + float CV_DECL_ALIGNED(32) wbuf[4]; #endif - float CV_DECL_ALIGNED(32) wbuf[4];// if (j == 0) { #ifdef RLOF_SSE diff --git a/modules/optflow/src/rlof/rlof_invokerbase.hpp b/modules/optflow/src/rlof/rlof_invokerbase.hpp index 34b112fa570..42275cfb2a7 100644 --- a/modules/optflow/src/rlof/rlof_invokerbase.hpp +++ b/modules/optflow/src/rlof/rlof_invokerbase.hpp @@ -7,10 +7,12 @@ #if CV_CPU_COMPILE_SSE4_1 #define RLOF_SSE -#elif CV_CPU_COMPILE_SSE4_2 +#endif +#if CV_CPU_COMPILE_SSE4_2 #define RLOF_SSE #endif +//#define DEBUG_INVOKER #define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n)) #define FLT_RESCALE 1 @@ -271,8 +273,5 @@ namespace optflow return quickselect(absMat, absMat.rows / 2); } } - - } - -#endif \ No newline at end of file +#endif diff --git a/modules/optflow/src/rlofflow.cpp b/modules/optflow/src/rlofflow.cpp index 08e16d9a31d..3ba5a782f0b 100644 --- a/modules/optflow/src/rlofflow.cpp +++ b/modules/optflow/src/rlofflow.cpp @@ -14,7 +14,8 @@ namespace optflow { public: DenseOpticalFlowRLOFImpl() - : forwardBackwardThreshold(1.f) + : param(Ptr(new RLOFOpticalFlowParameter)) + , forwardBackwardThreshold(1.f) , gridStep(6, 6) , interp_type(InterpolationType::INTERP_GEO) , k(128) @@ -68,7 +69,6 @@ namespace optflow if (param->supportRegionType == SR_CROSS) CV_Assert( I0.channels() == 3 && I1.channels() == 3); CV_Assert(interp_type == InterpolationType::INTERP_EPIC || interp_type == InterpolationType::INTERP_GEO); - Mat prevImage = I0.getMat(); Mat currImage = I1.getMat(); int noPoints = prevImage.cols * prevImage.rows; @@ -85,7 +85,6 @@ namespace optflow } prevPoints.erase(prevPoints.begin() + noPoints, 
prevPoints.end()); currPoints.resize(prevPoints.size()); - calcLocalOpticalFlow(prevImage, currImage, prevPyramid, currPyramid, prevPoints, currPoints, *(param.get())); flow.create(prevImage.size(), CV_32FC2); @@ -211,15 +210,16 @@ namespace optflow { public: SparseRLOFOpticalFlowImpl() - : forwardBackwardThreshold(1.f) + : param(Ptr(new RLOFOpticalFlowParameter)) + , forwardBackwardThreshold(1.f) { prevPyramid[0] = cv::Ptr< CImageBuffer>(new CImageBuffer); prevPyramid[1] = cv::Ptr< CImageBuffer>(new CImageBuffer); currPyramid[0] = cv::Ptr< CImageBuffer>(new CImageBuffer); currPyramid[1] = cv::Ptr< CImageBuffer>(new CImageBuffer); } - virtual void setRLOFOpticalFlowParameter(Ptr val) CV_OVERRIDE { } // param = *val; } - virtual Ptr getRLOFOpticalFlowParameter() const CV_OVERRIDE { return Ptr(); } + virtual void setRLOFOpticalFlowParameter(Ptr val) CV_OVERRIDE { param = val; } + virtual Ptr getRLOFOpticalFlowParameter() const CV_OVERRIDE { return param; } virtual float getForwardBackward() const CV_OVERRIDE { return forwardBackwardThreshold; } virtual void setForwardBackward(float val) CV_OVERRIDE { forwardBackwardThreshold = val; } diff --git a/modules/optflow/test/test_OF_accuracy.cpp b/modules/optflow/test/test_OF_accuracy.cpp index e99a44c02c9..67d58f243b0 100644 --- a/modules/optflow/test/test_OF_accuracy.cpp +++ b/modules/optflow/test/test_OF_accuracy.cpp @@ -192,14 +192,13 @@ TEST(SparseOpticalFlow, ReferenceAccuracy) vector err(prevPts.size()); Ptr algo = SparseRLOFOpticalFlow::create(); algo->setForwardBackward(0.0f); - - Ptr param; + Ptr param = Ptr(new RLOFOpticalFlowParameter); param->supportRegionType = SR_CROSS; param->useIlluminationModel = true; param->solverType = ST_BILINEAR; algo->setRLOFOpticalFlowParameter(param); algo->calc(frame1, frame2, prevPts, currPts, status, err); - EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.22f); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.25f); param->solverType = ST_STANDART; 
algo->setRLOFOpticalFlowParameter(param); @@ -210,12 +209,12 @@ TEST(SparseOpticalFlow, ReferenceAccuracy) param->solverType = ST_BILINEAR; algo->setRLOFOpticalFlowParameter(param); algo->calc(frame1, frame2, prevPts, currPts, status, err); - EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.24f); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.25f); param->solverType = ST_STANDART; algo->setRLOFOpticalFlowParameter(param); algo->calc(frame1, frame2, prevPts, currPts, status, err); - EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.23f); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.25f); param->normSigma0 = numeric_limits::max(); param->normSigma1 = numeric_limits::max(); @@ -224,12 +223,12 @@ TEST(SparseOpticalFlow, ReferenceAccuracy) param->solverType = ST_BILINEAR; algo->setRLOFOpticalFlowParameter(param); algo->calc(frame1, frame2, prevPts, currPts, status, err); - EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.26f); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.28f); param->solverType = ST_STANDART; algo->setRLOFOpticalFlowParameter(param); algo->calc(frame1, frame2, prevPts, currPts, status, err); - EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.26f); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.28f); param->useIlluminationModel = false; @@ -241,7 +240,7 @@ TEST(SparseOpticalFlow, ReferenceAccuracy) param->solverType = ST_STANDART; algo->setRLOFOpticalFlowParameter(param); algo->calc(frame1, frame2, prevPts, currPts, status, err); - EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.26f); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.28f); } TEST(DenseOpticalFlow_RLOF, ReferenceAccuracy) @@ -250,7 +249,7 @@ TEST(DenseOpticalFlow_RLOF, ReferenceAccuracy) ASSERT_TRUE(readRubberWhale(frame1, frame2, GT)); Mat flow; Ptr algo = DenseRLOFOpticalFlow::create(); - Ptr param; + Ptr param = Ptr(new RLOFOpticalFlowParameter); param->supportRegionType = SR_CROSS; param->solverType = ST_BILINEAR; algo->setRLOFOpticalFlowParameter(param); @@ -261,7 +260,7 @@ TEST(DenseOpticalFlow_RLOF, 
ReferenceAccuracy) ASSERT_EQ(GT.rows, flow.rows); ASSERT_EQ(GT.cols, flow.cols); - EXPECT_LE(calcRMSE(GT, flow), 0.36f); + EXPECT_LE(calcRMSE(GT, flow), 0.37f); algo->setInterpolation(INTERP_GEO); algo->calc(frame1, frame2, flow); From a6cc9bb0bc5d2a4362ad2a35a120519789e4a8b5 Mon Sep 17 00:00:00 2001 From: senst Date: Tue, 11 Dec 2018 12:57:24 +0100 Subject: [PATCH 06/25] remove x86intrin.h header --- modules/optflow/src/rlof/rlof_invokerbase.hpp | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/modules/optflow/src/rlof/rlof_invokerbase.hpp b/modules/optflow/src/rlof/rlof_invokerbase.hpp index 42275cfb2a7..adb45f039b4 100644 --- a/modules/optflow/src/rlof/rlof_invokerbase.hpp +++ b/modules/optflow/src/rlof/rlof_invokerbase.hpp @@ -5,10 +5,10 @@ #define _RLOF_INVOKERBASE_HPP_ -#if CV_CPU_COMPILE_SSE4_1 +#ifdef CV_CPU_COMPILE_SSE4_1 #define RLOF_SSE #endif -#if CV_CPU_COMPILE_SSE4_2 +#ifdef CV_CPU_COMPILE_SSE4_2 #define RLOF_SSE #endif @@ -16,9 +16,6 @@ #define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n)) #define FLT_RESCALE 1 -#ifndef _MSC_VER -#include -#endif #include "opencv2/core.hpp" #include "opencv2/video.hpp" From c367b8c55b43452d9b784f23e36f7955a7378b0e Mon Sep 17 00:00:00 2001 From: senst Date: Tue, 11 Dec 2018 14:16:08 +0100 Subject: [PATCH 07/25] fix ios and arm by removing last sse commands --- modules/optflow/src/rlof/berlof_invoker.hpp | 4 ++++ modules/optflow/src/rlof/plk_invoker.hpp | 2 ++ modules/optflow/src/rlof/rlof_invoker.hpp | 2 ++ modules/optflow/src/rlof/rlof_invokerbase.hpp | 3 ++- 4 files changed, 10 insertions(+), 1 deletion(-) diff --git a/modules/optflow/src/rlof/berlof_invoker.hpp b/modules/optflow/src/rlof/berlof_invoker.hpp index 3dfed1ce6bd..d550e67a7b6 100644 --- a/modules/optflow/src/rlof/berlof_invoker.hpp +++ b/modules/optflow/src/rlof/berlof_invoker.hpp @@ -222,8 +222,10 @@ namespace berlof nextPt -= halfWin; Point2f prevDelta(0,0); //denotes h(t-1) cv::Size _winSize = winSize; +#ifdef RLOF_SSE __m128i 
mmMask0, mmMask1, mmMask; getWBitMask(_winSize.width, mmMask0, mmMask1, mmMask); +#endif float MEstimatorScale = 1; int buffIdx = 0; float c[8]; @@ -1803,8 +1805,10 @@ namespace beplk Point2f prevDelta(0,0); float c[8]; +#ifdef RLOF_SSE __m128i mmMask0, mmMask1, mmMask; getWBitMask(winSize.width, mmMask0, mmMask1, mmMask); +#endif for( j = 0; j < criteria.maxCount; j++ ) { cv::Point2f delta; diff --git a/modules/optflow/src/rlof/plk_invoker.hpp b/modules/optflow/src/rlof/plk_invoker.hpp index 14b6eb78d22..39f81fe6a18 100644 --- a/modules/optflow/src/rlof/plk_invoker.hpp +++ b/modules/optflow/src/rlof/plk_invoker.hpp @@ -936,8 +936,10 @@ class TrackerInvoker : public cv::ParallelLoopBody nextPt -= halfWin; Point2f prevDelta(0,0); //relates to h(t-1) +#ifdef RLOF_SSE __m128i mmMask0, mmMask1, mmMask; getWBitMask(winSize.width, mmMask0, mmMask1, mmMask); +#endif for( j = 0; j < criteria.maxCount; j++ ) { status[ptidx] = static_cast(j); diff --git a/modules/optflow/src/rlof/rlof_invoker.hpp b/modules/optflow/src/rlof/rlof_invoker.hpp index 7da20aa7d77..6faf5a6247f 100644 --- a/modules/optflow/src/rlof/rlof_invoker.hpp +++ b/modules/optflow/src/rlof/rlof_invoker.hpp @@ -243,8 +243,10 @@ namespace ica cv::Point2f backUpNextPt = nextPt; nextPt -= halfWin; int j; +#ifdef RLOF_SSE __m128i mmMask0, mmMask1, mmMask; getWBitMask(winSize.width, mmMask0, mmMask1, mmMask); +#endif float MEstimatorScale = 1; int buffIdx = 0; cv::Point2f prevDelta(0, 0); diff --git a/modules/optflow/src/rlof/rlof_invokerbase.hpp b/modules/optflow/src/rlof/rlof_invokerbase.hpp index adb45f039b4..68e2ca28bbc 100644 --- a/modules/optflow/src/rlof/rlof_invokerbase.hpp +++ b/modules/optflow/src/rlof/rlof_invokerbase.hpp @@ -31,6 +31,7 @@ namespace cv namespace optflow { typedef short deriv_type; +#ifdef RLOF_SSE inline void get4BitMask(const int & width, __m128i & mask) { int noBits = width - static_cast(floor(width / 4.f) * 4.f); @@ -54,7 +55,7 @@ namespace optflow t0 = _mm_set_epi16(val[3], val[3], 
val[2], val[2], val[1], val[1], val[0], val[0]); t2 = _mm_set_epi16(val[7], val[6], val[5], val[4], val[3], val[2], val[1], val[0]); } - +#endif typedef uchar tMaskType; #define tCVMaskType CV_8UC1 #define MaskSet 0xffffffff From 085e3c0b82496398403b35761c7d34bb7da60e9a Mon Sep 17 00:00:00 2001 From: senst Date: Tue, 11 Dec 2018 14:28:51 +0100 Subject: [PATCH 08/25] fix warnings for windows compilation --- modules/optflow/perf/perf_rlof.cpp | 2 +- modules/optflow/src/rlof/berlof_invoker.hpp | 28 +++++++++---------- .../optflow/src/rlof/geo_interpolation.cpp | 4 +-- modules/optflow/src/rlof/plk_invoker.hpp | 2 +- modules/optflow/src/rlof/rlof_invoker.hpp | 12 ++++---- modules/optflow/test/test_OF_accuracy.cpp | 2 +- 6 files changed, 25 insertions(+), 25 deletions(-) diff --git a/modules/optflow/perf/perf_rlof.cpp b/modules/optflow/perf/perf_rlof.cpp index dda15a4b94e..e67feec2777 100644 --- a/modules/optflow/perf/perf_rlof.cpp +++ b/modules/optflow/perf/perf_rlof.cpp @@ -58,7 +58,7 @@ namespace opencv_test { namespace { ASSERT_FALSE(frame2.empty()); Ptr param = Ptr(new RLOFOpticalFlowParameter);; Ptr< DenseRLOFOpticalFlow> algo = DenseRLOFOpticalFlow::create(); - InterpolationType interp_type; + InterpolationType interp_type = INTERP_EPIC; if (get<0>(GetParam()) == "INTERP_EPIC") interp_type = INTERP_EPIC; if (get<0>(GetParam()) == "INTERP_GEO") diff --git a/modules/optflow/src/rlof/berlof_invoker.hpp b/modules/optflow/src/rlof/berlof_invoker.hpp index d550e67a7b6..5f917dc12d4 100644 --- a/modules/optflow/src/rlof/berlof_invoker.hpp +++ b/modules/optflow/src/rlof/berlof_invoker.hpp @@ -1169,9 +1169,9 @@ namespace berlof if( maskPtr[x] == 0) continue; - short ixval = static_cast(dIptr[0]); - short iyval = static_cast(dIptr[1]); - int illValue = Iptr[x] * gainVec.x + gainVec.y - Iptr[x]; + short ixval = dIptr[0]; + short iyval = dIptr[1]; + int illValue = static_cast(Iptr[x] * gainVec.x + gainVec.y - Iptr[x]); int It[4] = {(Jptr[x+step+cn]<< 5) + illValue, 
(Jptr[x+cn]<< 5) + illValue, @@ -1179,7 +1179,7 @@ namespace berlof (Jptr[x] << 5) + illValue}; - float J_val = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + + int J_val = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, W_BITS1-5); @@ -1202,19 +1202,19 @@ namespace berlof It[2] = 0; It[3] = 0; } - else if( abss > fParam0 && diff >= 0 ) + else if( abss > static_cast(fParam0) && diff >= 0 ) { - It[0] = param[2] * (It[0] - fParam1); - It[1] = param[2] * (It[1] - fParam1); - It[2] = param[2] * (It[2] - fParam1); - It[3] = param[2] * (It[3] - fParam1); + It[0] = static_cast(param[2] * (It[0] - fParam1)); + It[1] = static_cast(param[2] * (It[1] - fParam1)); + It[2] = static_cast(param[2] * (It[2] - fParam1)); + It[3] = static_cast(param[2] * (It[3] - fParam1)); } - else if( abss > fParam0 && diff < 0 ) + else if( abss > static_cast(fParam0) && diff < 0 ) { - It[0] = param[2] * (It[0] + fParam1); - It[1] = param[2] * (It[1] + fParam1); - It[2] = param[2] * (It[2] + fParam1); - It[3] = param[2] * (It[3] + fParam1); + It[0] = static_cast(param[2] * (It[0] + fParam1)); + It[1] = static_cast(param[2] * (It[1] + fParam1)); + It[2] = static_cast(param[2] * (It[2] + fParam1)); + It[3] = static_cast(param[2] * (It[3] + fParam1)); } } _b0[0] += (float)(It[0]*dIptr[0]) ; diff --git a/modules/optflow/src/rlof/geo_interpolation.cpp b/modules/optflow/src/rlof/geo_interpolation.cpp index f93c0b6093e..f96e3914cff 100644 --- a/modules/optflow/src/rlof/geo_interpolation.cpp +++ b/modules/optflow/src/rlof/geo_interpolation.cpp @@ -363,7 +363,7 @@ namespace optflow } } return ret; - +/* for (int y = 0; y < in.rows; y++) { for (int x = 0; x < in.cols; x++) { int v_id = y * in.cols + x; @@ -389,7 +389,7 @@ namespace optflow } free(graph_helper.mem); return nnFlow; - +*/ } Mat getGraph(const Mat &image, float edge_length) diff --git a/modules/optflow/src/rlof/plk_invoker.hpp b/modules/optflow/src/rlof/plk_invoker.hpp index 39f81fe6a18..dd8ab5e7d1b 100644 --- 
a/modules/optflow/src/rlof/plk_invoker.hpp +++ b/modules/optflow/src/rlof/plk_invoker.hpp @@ -473,7 +473,7 @@ class TrackerInvoker : public cv::ParallelLoopBody int J_val = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, W_BITS1-5); - int diff = J_val - Iptr[x] + static_cast(Iptr[x]) * gainVec.x + gainVec.y; + int diff = static_cast(J_val - Iptr[x] + Iptr[x] * gainVec.x + gainVec.y); b1 += (float)(diff*dIptr[0]) * FLT_RESCALE; b2 += (float)(diff*dIptr[1]) * FLT_RESCALE; diff --git a/modules/optflow/src/rlof/rlof_invoker.hpp b/modules/optflow/src/rlof/rlof_invoker.hpp index 6faf5a6247f..660585cd1a3 100644 --- a/modules/optflow/src/rlof/rlof_invoker.hpp +++ b/modules/optflow/src/rlof/rlof_invoker.hpp @@ -1213,7 +1213,7 @@ namespace radial { if (maskPtr[x] == 0) continue; - float J_val = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + Jptr[x + step] * iw10 + Jptr[x + step + cn] * iw11, + int J_val = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + Jptr[x + step] * iw10 + Jptr[x + step + cn] * iw11, W_BITS1 - 5); short ixval = static_cast(dIptr[0]); short iyval = static_cast(dIptr[1]); @@ -1223,17 +1223,17 @@ namespace radial MEstimatorScale += eta; if (diff < MEstimatorScale) MEstimatorScale -= eta; - if (abss > fParam1) + if (abss > static_cast(fParam1)) { diff = 0.f; } - else if (abss > fParam0 && diff >= 0) + else if (abss > static_cast(fParam0) && diff >= 0) { - diff = param[2] * (diff - fParam1); + diff = static_cast(param[2] * (diff - fParam1)); } - else if (abss > fParam0 && diff < 0) + else if (abss > static_cast(fParam0) && diff < 0) { - diff = param[2] * (diff + fParam1); + diff = static_cast(param[2] * (diff + fParam1)); } b1 += (float)(diff*ixval); b2 += (float)(diff*iyval); ; diff --git a/modules/optflow/test/test_OF_accuracy.cpp b/modules/optflow/test/test_OF_accuracy.cpp index 67d58f243b0..26f6b3668c4 100644 --- a/modules/optflow/test/test_OF_accuracy.cpp +++ b/modules/optflow/test/test_OF_accuracy.cpp @@ -95,7 
+95,7 @@ static float calcRMSE(vector prevPts, vector currPts, Mat flow ee.push_back(sqrt(diffFlow.x * diffFlow.x + diffFlow.y * diffFlow.y)); } } - return mean(ee).val[0]; + return static_cast(mean(ee).val[0]); } static float calcAvgEPE(vector< pair > corr, Mat flow) { From 3f759afa08044a1adba1743a8b1ef79cb6238b9d Mon Sep 17 00:00:00 2001 From: senst Date: Tue, 11 Dec 2018 14:38:33 +0100 Subject: [PATCH 09/25] fix documentation RLOFOpticalFlowParameter --- .../include/opencv2/optflow/rlofflow.hpp | 38 +++++++++---------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/modules/optflow/include/opencv2/optflow/rlofflow.hpp b/modules/optflow/include/opencv2/optflow/rlofflow.hpp index 4911c53b947..cbd352ec406 100644 --- a/modules/optflow/include/opencv2/optflow/rlofflow.hpp +++ b/modules/optflow/include/opencv2/optflow/rlofflow.hpp @@ -14,6 +14,25 @@ namespace optflow //! @addtogroup optflow //! @{ + enum SupportRegionType { + SR_FIXED = 0, /**< Apply a constant support region */ + SR_CROSS = 1 /**< Apply a adaptive support region obtained by cross-based segmentation + * as described in @cite Senst2014 + */ + }; + enum SolverType { + ST_STANDART = 0, /**< Apply standard iterative refinement */ + ST_BILINEAR = 1 /**< Apply optimized iterative refinement based bilinear equation solutions + * as described in @cite Senst2013 + */ + }; + + enum InterpolationType + { + INTERP_GEO = 0, /**< Fast geodesic interpolation, see @cite Geistert2016 */ + INTERP_EPIC = 1, /**< Edge-preserving interpolation, see @cite Revaud2015,Geistert2016. */ + }; + /** @brief This is used store and set up the parameters of the robust local optical flow (RLOF) algoritm. 
* * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as @@ -39,25 +58,6 @@ namespace optflow * * @see optflow::DenseRLOFOpticalFlow, optflow::calcOpticalFlowDenseRLOF(), optflow::SparseRLOFOpticalFlow, optflow::calcOpticalFlowSparseRLOF() */ - enum SupportRegionType { - SR_FIXED = 0, /**< Apply a constant support region */ - SR_CROSS = 1 /**< Apply a adaptive support region obtained by cross-based segmentation - * as described in @cite Senst2014 - */ - }; - enum SolverType { - ST_STANDART = 0, /**< Apply standard iterative refinement */ - ST_BILINEAR = 1 /**< Apply optimized iterative refinement based bilinear equation solutions - * as described in @cite Senst2013 - */ - }; - - enum InterpolationType - { - INTERP_GEO = 0, /**< Fast geodesic interpolation, see @cite Geistert2016 */ - INTERP_EPIC = 1, /**< Edge-preserving interpolation, see @cite Revaud2015,Geistert2016. */ - }; - class CV_EXPORTS_W RLOFOpticalFlowParameter{ public: RLOFOpticalFlowParameter() From fb3133743f7289db86a438f6dd10e7da97c75f5c Mon Sep 17 00:00:00 2001 From: senst Date: Tue, 11 Dec 2018 15:56:17 +0100 Subject: [PATCH 10/25] integrate cast to remove last warnings --- modules/optflow/include/opencv2/optflow/rlofflow.hpp | 2 +- modules/optflow/src/rlof/rlof_invoker.hpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/optflow/include/opencv2/optflow/rlofflow.hpp b/modules/optflow/include/opencv2/optflow/rlofflow.hpp index cbd352ec406..87b1d30266d 100644 --- a/modules/optflow/include/opencv2/optflow/rlofflow.hpp +++ b/modules/optflow/include/opencv2/optflow/rlofflow.hpp @@ -121,7 +121,7 @@ namespace optflow */ bool useIlluminationModel; /**< Use the Gennert and Negahdaripour illumination model instead of the intensity brigthness - * constraint. (Probosed in @cite Senst2016 ) This model is defined as follow: + * constraint. 
(proposed in @cite Senst2016 ) This model is defined as follow: * \f[ I(\mathbf{x},t) + m \cdot I(\mathbf{x},t) + c = I(\mathbf{x},t+1) \f] * and contains with m and c a multiplicative and additive term which makes the estimate * more robust against illumination changes. The computational complexity is increased by diff --git a/modules/optflow/src/rlof/rlof_invoker.hpp b/modules/optflow/src/rlof/rlof_invoker.hpp index 660585cd1a3..444d9114bd3 100644 --- a/modules/optflow/src/rlof/rlof_invoker.hpp +++ b/modules/optflow/src/rlof/rlof_invoker.hpp @@ -1217,7 +1217,7 @@ namespace radial W_BITS1 - 5); short ixval = static_cast(dIptr[0]); short iyval = static_cast(dIptr[1]); - int diff = J_val - static_cast(Iptr[x]) + Iptr[x] * gainVec.x + gainVec.y; + int diff = static_cast(J_val - Iptr[x] + Iptr[x] * gainVec.x + gainVec.y); int abss = (diff < 0) ? -diff : diff; if (diff > MEstimatorScale) MEstimatorScale += eta; @@ -1225,7 +1225,7 @@ namespace radial MEstimatorScale -= eta; if (abss > static_cast(fParam1)) { - diff = 0.f; + diff = 0; } else if (abss > static_cast(fParam0) && diff >= 0) { From d1440c04bfc0215b331edfebdf377288382e53da Mon Sep 17 00:00:00 2001 From: senst Date: Wed, 12 Dec 2018 14:57:15 +0100 Subject: [PATCH 11/25] * add create method and function inferfaces to RLOFOpticalFlowParamter to enable python wrapper interfaces --- .../include/opencv2/optflow/rlofflow.hpp | 46 +++++++++++++++ modules/optflow/src/rlofflow.cpp | 57 ++++++++++++++++++- 2 files changed, 102 insertions(+), 1 deletion(-) diff --git a/modules/optflow/include/opencv2/optflow/rlofflow.hpp b/modules/optflow/include/opencv2/optflow/rlofflow.hpp index 87b1d30266d..f0932abc557 100644 --- a/modules/optflow/include/opencv2/optflow/rlofflow.hpp +++ b/modules/optflow/include/opencv2/optflow/rlofflow.hpp @@ -76,6 +76,7 @@ namespace optflow ,minEigenValue(0.0001f) ,globalMotionRansacThreshold(10) {} + SolverType solverType; /**< Variable specifies the iterative refinement strategy. 
Please consider citing @cite Senst2013 when * using ST_BILINEAR. @@ -146,6 +147,51 @@ namespace optflow * n-th percentil (given by this value [0 ... 100]) of the motion vectors magnitude. * See @cite Senst2016 for more details. */ + + CV_WRAP void setSolverType(SolverType val); + CV_WRAP SolverType getSolverType() const; + + CV_WRAP void setSupportRegionType(SupportRegionType val); + CV_WRAP SupportRegionType getSupportRegionType() const; + + CV_WRAP void setNormSigma0(float val); + CV_WRAP float getNormSigma0() const; + + CV_WRAP void setNormSigma1(float val); + CV_WRAP float getNormSigma1() const; + + CV_WRAP void setSmallWinSize(int val); + CV_WRAP int getSmallWinSize() const; + + CV_WRAP void setLargeWinSize(int val); + CV_WRAP int getLargeWinSize() const; + + CV_WRAP void setCrossSegmentationThreshold(int val); + CV_WRAP int getCrossSegmentationThreshold() const; + + CV_WRAP void setMaxLevel(int val); + CV_WRAP int getMaxLevel() const; + + CV_WRAP void setUseInitialFlow(bool val); + CV_WRAP bool getUseInitialFlow() const; + + CV_WRAP void setUseIlluminationModel(bool val); + CV_WRAP bool getUseIlluminationModel() const; + + CV_WRAP void setUseGlobalMotionPrior(bool val); + CV_WRAP bool getUseGlobalMotionPrior() const; + + CV_WRAP void setMaxIteration(int val); + CV_WRAP int getMaxIteration() const; + + CV_WRAP void setMinEigenValue(float val); + CV_WRAP float getMinEigenValue() const; + + CV_WRAP void setGlobalMotionRansacThreshold(float val); + CV_WRAP float getGlobalMotionRansacThreshold() const; + + //! 
@brief Creates instance of optflow::RLOFOpticalFlowParameter + CV_WRAP static Ptr create(); }; /** @brief Fast dense optical flow computation based on robust local optical flow (RLOF) algorithms and sparse-to-dense interpolation diff --git a/modules/optflow/src/rlofflow.cpp b/modules/optflow/src/rlofflow.cpp index 3ba5a782f0b..ff43fc39820 100644 --- a/modules/optflow/src/rlofflow.cpp +++ b/modules/optflow/src/rlofflow.cpp @@ -10,6 +10,53 @@ namespace cv namespace optflow { + Ptr RLOFOpticalFlowParameter::create() + { + return Ptr(new RLOFOpticalFlowParameter); + } + + void RLOFOpticalFlowParameter::setSolverType(SolverType val){ solverType = val;} + SolverType RLOFOpticalFlowParameter::getSolverType() const { return solverType;} + + void RLOFOpticalFlowParameter::setSupportRegionType(SupportRegionType val){ supportRegionType = val;} + SupportRegionType RLOFOpticalFlowParameter::getSupportRegionType() const { return supportRegionType;} + + void RLOFOpticalFlowParameter::setNormSigma0(float val){ normSigma0 = val;} + float RLOFOpticalFlowParameter::getNormSigma0() const { return normSigma0;} + + void RLOFOpticalFlowParameter::setNormSigma1(float val){ normSigma1 = val;} + float RLOFOpticalFlowParameter::getNormSigma1() const { return normSigma1;} + + void RLOFOpticalFlowParameter::setSmallWinSize(int val){ smallWinSize = val;} + int RLOFOpticalFlowParameter::getSmallWinSize() const { return smallWinSize;} + + void RLOFOpticalFlowParameter::setLargeWinSize(int val){ largeWinSize = val;} + int RLOFOpticalFlowParameter::getLargeWinSize() const { return largeWinSize;} + + void RLOFOpticalFlowParameter::setCrossSegmentationThreshold(int val){ crossSegmentationThreshold = val;} + int RLOFOpticalFlowParameter::getCrossSegmentationThreshold() const { return crossSegmentationThreshold;} + + void RLOFOpticalFlowParameter::setMaxLevel(int val){ maxLevel = val;} + int RLOFOpticalFlowParameter::getMaxLevel() const { return maxLevel;} + + void 
RLOFOpticalFlowParameter::setUseInitialFlow(bool val){ useInitialFlow = val;} + bool RLOFOpticalFlowParameter::getUseInitialFlow() const { return useInitialFlow;} + + void RLOFOpticalFlowParameter::setUseIlluminationModel(bool val){ useIlluminationModel = val;} + bool RLOFOpticalFlowParameter::getUseIlluminationModel() const { return useIlluminationModel;} + + void RLOFOpticalFlowParameter::setUseGlobalMotionPrior(bool val){ useGlobalMotionPrior = val;} + bool RLOFOpticalFlowParameter::getUseGlobalMotionPrior() const { return useGlobalMotionPrior;} + + void RLOFOpticalFlowParameter::setMaxIteration(int val){ maxIteration = val;} + int RLOFOpticalFlowParameter::getMaxIteration() const { return maxIteration;} + + void RLOFOpticalFlowParameter::setMinEigenValue(float val){ minEigenValue = val;} + float RLOFOpticalFlowParameter::getMinEigenValue() const { return minEigenValue;} + + void RLOFOpticalFlowParameter::setGlobalMotionRansacThreshold(float val){ globalMotionRansacThreshold = val;} + float RLOFOpticalFlowParameter::getGlobalMotionRansacThreshold() const { return globalMotionRansacThreshold;} + class DenseOpticalFlowRLOFImpl : public DenseRLOFOpticalFlow { public: @@ -66,9 +113,13 @@ namespace optflow CV_Assert(!I0.empty() && I0.depth() == CV_8U && (I0.channels() == 3 || I0.channels() == 1)); CV_Assert(!I1.empty() && I1.depth() == CV_8U && (I1.channels() == 3 || I1.channels() == 1)); CV_Assert(I0.sameSize(I1)); + if (param.empty()) + param = Ptr(new RLOFOpticalFlowParameter()); if (param->supportRegionType == SR_CROSS) CV_Assert( I0.channels() == 3 && I1.channels() == 3); CV_Assert(interp_type == InterpolationType::INTERP_EPIC || interp_type == InterpolationType::INTERP_GEO); + // if no parameter is used use the default parameter + Mat prevImage = I0.getMat(); Mat currImage = I1.getMat(); int noPoints = prevImage.cols * prevImage.rows; @@ -86,7 +137,6 @@ namespace optflow prevPoints.erase(prevPoints.begin() + noPoints, prevPoints.end()); 
currPoints.resize(prevPoints.size()); calcLocalOpticalFlow(prevImage, currImage, prevPyramid, currPyramid, prevPoints, currPoints, *(param.get())); - flow.create(prevImage.size(), CV_32FC2); Mat dense_flow = flow.getMat(); @@ -237,6 +287,11 @@ namespace optflow Mat nextImage = nextImg.getMat(); Mat prevPtsMat = prevPts.getMat(); + if (param.empty()) + { + param = Ptr(new RLOFOpticalFlowParameter); + } + if (param->useInitialFlow == false) nextPts.create(prevPtsMat.size(), prevPtsMat.type(), -1, true); From 912c1dd29f0574552b3f299a3525a33d4879acd8 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Fri, 14 Dec 2018 13:55:26 +0300 Subject: [PATCH 12/25] white space fixes / coding style --- modules/optflow/include/opencv2/optflow.hpp | 1 - .../include/opencv2/optflow/rlofflow.hpp | 945 ++--- modules/optflow/perf/perf_rlof.cpp | 131 +- modules/optflow/src/rlof/berlof_invoker.hpp | 3599 ++++++++--------- .../optflow/src/rlof/geo_interpolation.cpp | 730 ++-- .../optflow/src/rlof/geo_interpolation.hpp | 55 +- modules/optflow/src/rlof/plk_invoker.hpp | 171 +- modules/optflow/src/rlof/rlof_invoker.hpp | 2362 ++++++----- modules/optflow/src/rlof/rlof_invokerbase.hpp | 421 +- modules/optflow/src/rlof/rlof_localflow.cpp | 833 ++-- modules/optflow/src/rlof/rlof_localflow.h | 25 +- modules/optflow/src/rlofflow.cpp | 651 ++- 12 files changed, 4952 insertions(+), 4972 deletions(-) diff --git a/modules/optflow/include/opencv2/optflow.hpp b/modules/optflow/include/opencv2/optflow.hpp index a4bd3a737e6..093b5fe2c7f 100644 --- a/modules/optflow/include/opencv2/optflow.hpp +++ b/modules/optflow/include/opencv2/optflow.hpp @@ -173,7 +173,6 @@ CV_EXPORTS_W Ptr createOptFlow_Farneback(); //! Additional interface to the SparseToDenseFlow algorithm - calcOpticalFlowSparseToDense() CV_EXPORTS_W Ptr createOptFlow_SparseToDense(); - /** @brief "Dual TV L1" Optical Flow Algorithm. 
The class implements the "Dual TV L1" optical flow algorithm described in @cite Zach2007 and diff --git a/modules/optflow/include/opencv2/optflow/rlofflow.hpp b/modules/optflow/include/opencv2/optflow/rlofflow.hpp index f0932abc557..4d602d2b28f 100644 --- a/modules/optflow/include/opencv2/optflow/rlofflow.hpp +++ b/modules/optflow/include/opencv2/optflow/rlofflow.hpp @@ -11,482 +11,483 @@ namespace cv { namespace optflow { - //! @addtogroup optflow - //! @{ - - enum SupportRegionType { - SR_FIXED = 0, /**< Apply a constant support region */ - SR_CROSS = 1 /**< Apply a adaptive support region obtained by cross-based segmentation - * as described in @cite Senst2014 - */ - }; - enum SolverType { - ST_STANDART = 0, /**< Apply standard iterative refinement */ - ST_BILINEAR = 1 /**< Apply optimized iterative refinement based bilinear equation solutions - * as described in @cite Senst2013 - */ - }; - - enum InterpolationType - { - INTERP_GEO = 0, /**< Fast geodesic interpolation, see @cite Geistert2016 */ - INTERP_EPIC = 1, /**< Edge-preserving interpolation, see @cite Revaud2015,Geistert2016. */ - }; - - /** @brief This is used store and set up the parameters of the robust local optical flow (RLOF) algoritm. - * - * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as - * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). - * This RLOF implementation can be seen as an improved pyramidal iterative Lucas-Kanade and includes - * a set of improving modules. The main improvements in respect to the pyramidal iterative Lucas-Kanade - * are: - * - A more robust redecending M-estimator framework (see @cite Senst2012) to improve the accuracy at - * motion boundaries and appearing and disappearing pixels. - * - an adaptive support region strategies to improve the accuracy at motion boundaries to reduce the - * corona effect, i.e oversmoothing of the PLK at motion/object boundaries. 
The cross-based segementation - * strategy (SR_CROSS) proposed in @cite Senst2014 uses a simple segmenation approach to obtain the optimal - * shape of the support region. - * - To deal with illumination changes (outdoor sequences and shadow) the intensity constancy assumption - * based optical flow equation has been adopt with the Gennert and Negahdaripour illumination model - * (see @cite Senst2016). This model can be switched on/off with the useIlluminationModel variable. - * - By using a global motion prior initialization (see @cite Senst2016) of the iterative refinement - * the accuracy could be significantly improved for large displacements. This initialization can be - * switched on and of with useGlobalMotionPrior variable. - * - * The RLOF can be computed with the SparseOpticalFlow class or function interface to track a set of features - * or with the DenseOpticalFlow class or function interface to compute dense optical flow. - * - * @see optflow::DenseRLOFOpticalFlow, optflow::calcOpticalFlowDenseRLOF(), optflow::SparseRLOFOpticalFlow, optflow::calcOpticalFlowSparseRLOF() - */ - class CV_EXPORTS_W RLOFOpticalFlowParameter{ - public: - RLOFOpticalFlowParameter() - :solverType(ST_BILINEAR) - ,supportRegionType(SR_CROSS) - ,normSigma0(3.2f) - ,normSigma1(7.f) - ,smallWinSize(9) - ,largeWinSize(21) - ,crossSegmentationThreshold(25) - ,maxLevel(3) - ,useInitialFlow(false) - ,useIlluminationModel(true) - ,useGlobalMotionPrior(true) - ,maxIteration(30) - ,minEigenValue(0.0001f) - ,globalMotionRansacThreshold(10) - {} - - SolverType solverType; - /**< Variable specifies the iterative refinement strategy. Please consider citing @cite Senst2013 when - * using ST_BILINEAR. - */ - - SupportRegionType supportRegionType; - /**< Variable specifies the support region shape extraction or shrinking strategy. - */ - - float normSigma0; - /**< &sigma paramter of the shrinked Hampel norm introduced in @cite Senst2012. 
If - * &sigma = std::numeric_limist::max() the least-square estimator will be used - * instead of the M-estimator. Althoug M-estimator is more robust against outlier in the support - * region the least-square can be fast in computation. - */ - float normSigma1; - /**< &sigma paramter of the shrinked Hampel norm introduced in @cite Senst2012. If - * &sigma = std::numeric_limist::max() the least-square estimator will be used - * instead of the M-estimator. Althoug M-estimator is more robust against outlier in the support - * region the least-square can be fast in computation. - */ - int smallWinSize; - /**< Minimal window size of the support region. This parameter is only used if supportRegionType is SR_CROSS. - */ - int largeWinSize; - /**< Maximal window size of the support region. If supportRegionType is SR_FIXED this gives the exact support - * region size. The speed of the RLOF is related to the applied win sizes. The smaller the window size the lower is the runtime, - * but the more sensitive to noise is the method. - */ - int crossSegmentationThreshold; - /**< Color similarity threshold used by cross-based segmentation following @cite Senst2014 . - * (Only used if supportRegionType is SR_CROSS). With the cross-bassed segmentation - * motion boundaries can be computed more accurately. - */ - int maxLevel; - /**< Maximal number of pyramid level used. The large this value is the more likely it is - * to obtain accurate solutions for long-range motions. The runtime is linear related to - * this parameter. - */ - bool useInitialFlow; - /**< Use next point list as initial values. A good intialization can imporve the algortihm - * accuracy and reduce the runtime by a faster convergence of the iteration refinement. - */ - bool useIlluminationModel; - /**< Use the Gennert and Negahdaripour illumination model instead of the intensity brigthness - * constraint. 
(proposed in @cite Senst2016 ) This model is defined as follow: - * \f[ I(\mathbf{x},t) + m \cdot I(\mathbf{x},t) + c = I(\mathbf{x},t+1) \f] - * and contains with m and c a multiplicative and additive term which makes the estimate - * more robust against illumination changes. The computational complexity is increased by - * enabling the illumination model. - */ - bool useGlobalMotionPrior; - /**< Use global motion prior initialisation has been introduced in @cite Senst2016 . It - * allows to be more accurate for long-range motion. The computational complexity is - * slightly increased by enabling the global motion prior initialisation. - */ - int maxIteration; - /**< Number of maximal iterations used for the iterative refinement. Lower values can - * reduce the runtime but also the accuracy. - */ - float minEigenValue; - /**< Threshold for the minimal eigenvalue of the gradient matrix defines when to abort the - * iterative refinement. - */ - float globalMotionRansacThreshold; - /**< To apply the global motion prior motion vectors will be computed on a regulary sampled which - * are the basis for Homography estimation using RANSAC. The reprojection threshold is based on - * n-th percentil (given by this value [0 ... 100]) of the motion vectors magnitude. - * See @cite Senst2016 for more details. 
- */ - - CV_WRAP void setSolverType(SolverType val); - CV_WRAP SolverType getSolverType() const; - - CV_WRAP void setSupportRegionType(SupportRegionType val); - CV_WRAP SupportRegionType getSupportRegionType() const; - - CV_WRAP void setNormSigma0(float val); - CV_WRAP float getNormSigma0() const; - - CV_WRAP void setNormSigma1(float val); - CV_WRAP float getNormSigma1() const; - - CV_WRAP void setSmallWinSize(int val); - CV_WRAP int getSmallWinSize() const; - - CV_WRAP void setLargeWinSize(int val); - CV_WRAP int getLargeWinSize() const; - - CV_WRAP void setCrossSegmentationThreshold(int val); - CV_WRAP int getCrossSegmentationThreshold() const; - - CV_WRAP void setMaxLevel(int val); - CV_WRAP int getMaxLevel() const; - - CV_WRAP void setUseInitialFlow(bool val); - CV_WRAP bool getUseInitialFlow() const; - - CV_WRAP void setUseIlluminationModel(bool val); - CV_WRAP bool getUseIlluminationModel() const; - - CV_WRAP void setUseGlobalMotionPrior(bool val); - CV_WRAP bool getUseGlobalMotionPrior() const; - - CV_WRAP void setMaxIteration(int val); - CV_WRAP int getMaxIteration() const; - - CV_WRAP void setMinEigenValue(float val); - CV_WRAP float getMinEigenValue() const; - - CV_WRAP void setGlobalMotionRansacThreshold(float val); - CV_WRAP float getGlobalMotionRansacThreshold() const; - - //! @brief Creates instance of optflow::RLOFOpticalFlowParameter - CV_WRAP static Ptr create(); - }; - - /** @brief Fast dense optical flow computation based on robust local optical flow (RLOF) algorithms and sparse-to-dense interpolation - * scheme. - * - * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as - * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). - * - * The sparse-to-dense interpolation scheme allows for fast computation of dense optical flow using RLOF (see @cite Geistert2016). 
- * For this scheme the following steps are applied: - * -# motion vector seeded at a regular sampled grid are computed. The sparsity of this grid can be configured with setGridStep - * -# (optinally) errornous motion vectors are filter based on the forward backward confidence. The threshold can be configured - * with setForwardBackward. The filter is only applied if the threshold >0 but than the runtime is doubled due to the estimation - * of the backward flow. - * -# Vector field interpolation is applied to the motion vector set to obtain a dense vector field. - * - * For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details. - * Parameters have been described in @cite Senst2012 @cite Senst2013 @cite Senst2014 and @cite Senst2016. - * - * @note SIMD parallelization is only available when compiling with SSE4.1. If the grid size is set to (1,1) and the - * forward backward threshold <= 0 that the dense optical flow field is purely. - * computed with the RLOF. - * - * @see optflow::calcOpticalFlowDenseRLOF(), optflow::RLOFOpticalFlowParameter - */ - class CV_EXPORTS_W DenseRLOFOpticalFlow : public DenseOpticalFlow - { - public: - //! @brief Configuration of the RLOF alogrithm. - /** - @see optflow::RLOFOpticalFlowParameter, getRLOFOpticalFlowParameter - */ - CV_WRAP virtual void setRLOFOpticalFlowParameter(Ptr val) = 0; - /** @copybrief setRLOFOpticalFlowParameter - @see optflow::RLOFOpticalFlowParameter, setRLOFOpticalFlowParameter - */ - CV_WRAP virtual Ptr getRLOFOpticalFlowParameter() const = 0; - //! @brief Threshold for the forward backward confidence check - /**For each grid point \f$ \mathbf{x} \f$ a motion vector \f$ d_{I0,I1}(\mathbf{x}) \f$ is computed. - * If the forward backward error \f[ EP_{FB} = || d_{I0,I1} + d_{I1,I0} || \f] - * is larger than threshold given by this function then the motion vector will not be used by the following - * vector field interpolation. \f$ d_{I1,I0} \f$ denotes the backward flow. 
Note, the forward backward test - * will only be applied if the threshold > 0. This may results into a doubled runtime for the motion estimation. - * @see getForwardBackward, setGridStep - */ - CV_WRAP virtual void setForwardBackward(float val) = 0; - /** @copybrief setForwardBackward - @see setForwardBackward - */ - CV_WRAP virtual float getForwardBackward() const = 0; - //! @brief Size of the grid to spawn the motion vectors. - /** For each grid point a motion vector is computed. Some motion vectors will be removed due to the forwatd backward - * threshold (if set >0). The rest will be the base of the vector field interpolation. - * @see getForwardBackward, setGridStep - */ - CV_WRAP virtual Size getGridStep() const = 0; - /** @copybrief getGridStep - * @see getGridStep - */ - CV_WRAP virtual void setGridStep(Size val) = 0; - - //! @brief Interpolation used to compute the dense optical flow. - /** Two interpolation algorithms are supported - * - **INTERP_GEO** applies the fast geodesic interpolation, see @cite Geistert2016. - * - **INTERP_EPIC_RESIDUAL** applies the edge-preserving interpolation, see @cite Revaud2015,Geistert2016. - * @see ximgproc::EdgeAwareInterpolator, getInterpolation - */ - CV_WRAP virtual void setInterpolation(InterpolationType val) = 0; - /** @copybrief setInterpolation - * @see ximgproc::EdgeAwareInterpolator, setInterpolation - */ - CV_WRAP virtual InterpolationType getInterpolation() const = 0; - //! @brief see ximgproc::EdgeAwareInterpolator() K value. - /** K is a number of nearest-neighbor matches considered, when fitting a locally affine - * model. Usually it should be around 128. However, lower values would make the interpolation noticeably faster. - * @see ximgproc::EdgeAwareInterpolator, setEPICK - */ - CV_WRAP virtual int getEPICK() const = 0; - /** @copybrief getEPICK - * @see ximgproc::EdgeAwareInterpolator, getEPICK - */ - CV_WRAP virtual void setEPICK(int val) = 0; - //! 
@brief see ximgproc::EdgeAwareInterpolator() sigma value. - /** Sigma is a parameter defining how fast the weights decrease in the locally-weighted affine - * fitting. Higher values can help preserve fine details, lower values can help to get rid of noise in the - * output flow. - * @see ximgproc::EdgeAwareInterpolator, setEPICSigma - */ - CV_WRAP virtual float getEPICSigma() const = 0; - /** @copybrief getEPICSigma - * @see ximgproc::EdgeAwareInterpolator, getEPICSigma - */ - CV_WRAP virtual void setEPICSigma(float val) = 0; - //! @brief see ximgproc::EdgeAwareInterpolator() lambda value. - /** Lambda is a parameter defining the weight of the edge-aware term in geodesic distance, - * should be in the range of 0 to 1000. - * @see ximgproc::EdgeAwareInterpolator, setEPICSigma - */ - CV_WRAP virtual float getEPICLambda() const = 0; - /** @copybrief getEPICLambda - * @see ximgproc::EdgeAwareInterpolator, getEPICLambda - */ - CV_WRAP virtual void setEPICLambda(float val) = 0; - //! @brief see ximgproc::EdgeAwareInterpolator(). - /** Sets the respective fastGlobalSmootherFilter() parameter. - * @see ximgproc::EdgeAwareInterpolator, setFgsLambda - */ - CV_WRAP virtual float getFgsLambda() const = 0; - /** @copybrief getFgsLambda - * @see ximgproc::EdgeAwareInterpolator, ximgproc::fastGlobalSmootherFilter, getFgsLambda - */ - CV_WRAP virtual void setFgsLambda(float val) = 0; - //! @brief see ximgproc::EdgeAwareInterpolator(). - /** Sets the respective fastGlobalSmootherFilter() parameter. - * @see ximgproc::EdgeAwareInterpolator, ximgproc::fastGlobalSmootherFilter, setFgsSigma - */ - CV_WRAP virtual float getFgsSigma() const = 0; - /** @copybrief getFgsSigma - * @see ximgproc::EdgeAwareInterpolator, ximgproc::fastGlobalSmootherFilter, getFgsSigma - */ - CV_WRAP virtual void setFgsSigma(float val) = 0; - //! 
@brief enables ximgproc::fastGlobalSmootherFilter - /** - * @see getUsePostProc - */ - CV_WRAP virtual void setUsePostProc(bool val) = 0; - /** @copybrief setUsePostProc - * @see ximgproc::fastGlobalSmootherFilter, setUsePostProc - */ - CV_WRAP virtual bool getUsePostProc() const = 0; - //! @brief Creates instance of optflow::DenseRLOFOpticalFlow - /** - * @param rlofParam see optflow::RLOFOpticalFlowParameter - * @param forwardBackwardThreshold see setForwardBackward - * @param gridStep see setGridStep - * @param interp_type see setInterpolation - * @param epicK see setEPICK - * @param epicSigma see setEPICSigma - * @param epicLambda see setEPICLambda - * @param use_post_proc see setUsePostProc - * @param fgsLambda see setFgsLambda - * @param fgsSigma see setFgsSigma - */ - CV_WRAP static Ptr create( - Ptr rlofParam = Ptr(), - float forwardBackwardThreshold = 1.f, - Size gridStep = Size(6, 6), - InterpolationType interp_type = InterpolationType::INTERP_EPIC, - int epicK = 128, - float epicSigma = 0.05f, - float epicLambda = 999.0f, - bool use_post_proc = true, - float fgsLambda = 500.0f, - float fgsSigma = 1.5f); - }; - - /** @brief Class used for calculation sparse optical flow and feature tracking with robust local optical flow (RLOF) algorithms. - * - * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as - * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). - * - * For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details. - * Parameters have been described in @cite Senst2012, @cite Senst2013, @cite Senst2014 and @cite Senst2016. - * - * @note SIMD parallelization is only available when compiling with SSE4.1. 
- * @see optflow::calcOpticalFlowSparseRLOF(), optflow::RLOFOpticalFlowParameter - */ - class CV_EXPORTS_W SparseRLOFOpticalFlow : public SparseOpticalFlow - { - public: - /** @copydoc DenseRLOFOpticalFlow::setRLOFOpticalFlowParameter - */ - CV_WRAP virtual void setRLOFOpticalFlowParameter(Ptr val) = 0; - /** @copybrief setRLOFOpticalFlowParameter - * @see setRLOFOpticalFlowParameter - */ - CV_WRAP virtual Ptr getRLOFOpticalFlowParameter() const = 0; - //! @brief Threshold for the forward backward confidence check - /** For each feature point a motion vector \f$ d_{I0,I1}(\mathbf{x}) \f$ is computed. - * If the forward backward error \f[ EP_{FB} = || d_{I0,I1} + d_{I1,I0} || \f] - * is larger than threshold given by this function then the status will not be used by the following - * vector field interpolation. \f$ d_{I1,I0} \f$ denotes the backward flow. Note, the forward backward test - * will only be applied if the threshold > 0. This may results into a doubled runtime for the motion estimation. - * @see setForwardBackward - */ - CV_WRAP virtual void setForwardBackward(float val) = 0; - /** @copybrief setForwardBackward - * @see setForwardBackward - */ - CV_WRAP virtual float getForwardBackward() const = 0; - - //! @brief Creates instance of SparseRLOFOpticalFlow - /** - * @param rlofParam see setRLOFOpticalFlowParameter - * @param forwardBackwardThreshold see setForwardBackward - */ - CV_WRAP static Ptr create( - Ptr rlofParam = Ptr(), - float forwardBackwardThreshold = 1.f); - - }; - - /** @brief Fast dense optical flow computation based on robust local optical flow (RLOF) algorithms and sparse-to-dense interpolation scheme. - * - * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as - * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). 
- * - * The sparse-to-dense interpolation scheme allows for fast computation of dense optical flow using RLOF (see @cite Geistert2016). - * For this scheme the following steps are applied: - * -# motion vector seeded at a regular sampled grid are computed. The sparsity of this grid can be configured with setGridStep - * -# (optinally) errornous motion vectors are filter based on the forward backward confidence. The threshold can be configured - * with setForwardBackward. The filter is only applied if the threshold >0 but than the runtime is doubled due to the estimation - * of the backward flow. - * -# Vector field interpolation is applied to the motion vector set to obtain a dense vector field. - * - * @param I0 first 8-bit input image. If The cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType - * = SupportRegionType::SR_CROSS) image has to be a 8-bit 3 channel image. - * @param I1 second 8-bit input image. If The cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType - * = SupportRegionType::SR_CROSS) image has to be a 8-bit 3 channel image. - * @param flow computed flow image that has the same size as I0 and type CV_32FC2. - * @param rlofParam see optflow::RLOFOpticalFlowParameter - * @param forwardBackwardThreshold Threshold for the forward backward confidence check. - * For each grid point \f$ \mathbf{x} \f$ a motion vector \f$ d_{I0,I1}(\mathbf{x}) \f$ is computed. - * If the forward backward error \f[ EP_{FB} = || d_{I0,I1} + d_{I1,I0} || \f] - * is larger than threshold given by this function then the motion vector will not be used by the following - * vector field interpolation. \f$ d_{I1,I0} \f$ denotes the backward flow. Note, the forward backward test +//! @addtogroup optflow +//! 
@{ + +enum SupportRegionType { + SR_FIXED = 0, /**< Apply a constant support region */ + SR_CROSS = 1 /**< Apply a adaptive support region obtained by cross-based segmentation + * as described in @cite Senst2014 + */ +}; +enum SolverType { + ST_STANDART = 0, /**< Apply standard iterative refinement */ + ST_BILINEAR = 1 /**< Apply optimized iterative refinement based bilinear equation solutions + * as described in @cite Senst2013 + */ +}; + +enum InterpolationType +{ + INTERP_GEO = 0, /**< Fast geodesic interpolation, see @cite Geistert2016 */ + INTERP_EPIC = 1, /**< Edge-preserving interpolation, see @cite Revaud2015,Geistert2016. */ +}; + +/** @brief This is used store and set up the parameters of the robust local optical flow (RLOF) algoritm. + * + * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as + * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). + * This RLOF implementation can be seen as an improved pyramidal iterative Lucas-Kanade and includes + * a set of improving modules. The main improvements in respect to the pyramidal iterative Lucas-Kanade + * are: + * - A more robust redecending M-estimator framework (see @cite Senst2012) to improve the accuracy at + * motion boundaries and appearing and disappearing pixels. + * - an adaptive support region strategies to improve the accuracy at motion boundaries to reduce the + * corona effect, i.e oversmoothing of the PLK at motion/object boundaries. The cross-based segementation + * strategy (SR_CROSS) proposed in @cite Senst2014 uses a simple segmenation approach to obtain the optimal + * shape of the support region. + * - To deal with illumination changes (outdoor sequences and shadow) the intensity constancy assumption + * based optical flow equation has been adopt with the Gennert and Negahdaripour illumination model + * (see @cite Senst2016). 
This model can be switched on/off with the useIlluminationModel variable. + * - By using a global motion prior initialization (see @cite Senst2016) of the iterative refinement + * the accuracy could be significantly improved for large displacements. This initialization can be + * switched on and of with useGlobalMotionPrior variable. + * + * The RLOF can be computed with the SparseOpticalFlow class or function interface to track a set of features + * or with the DenseOpticalFlow class or function interface to compute dense optical flow. + * + * @see optflow::DenseRLOFOpticalFlow, optflow::calcOpticalFlowDenseRLOF(), optflow::SparseRLOFOpticalFlow, optflow::calcOpticalFlowSparseRLOF() + */ +class CV_EXPORTS_W RLOFOpticalFlowParameter{ +public: + RLOFOpticalFlowParameter() + :solverType(ST_BILINEAR) + ,supportRegionType(SR_CROSS) + ,normSigma0(3.2f) + ,normSigma1(7.f) + ,smallWinSize(9) + ,largeWinSize(21) + ,crossSegmentationThreshold(25) + ,maxLevel(3) + ,useInitialFlow(false) + ,useIlluminationModel(true) + ,useGlobalMotionPrior(true) + ,maxIteration(30) + ,minEigenValue(0.0001f) + ,globalMotionRansacThreshold(10) + {} + + SolverType solverType; + /**< Variable specifies the iterative refinement strategy. Please consider citing @cite Senst2013 when + * using ST_BILINEAR. + */ + + SupportRegionType supportRegionType; + /**< Variable specifies the support region shape extraction or shrinking strategy. + */ + + float normSigma0; + /**< &sigma paramter of the shrinked Hampel norm introduced in @cite Senst2012. If + * &sigma = std::numeric_limist::max() the least-square estimator will be used + * instead of the M-estimator. Althoug M-estimator is more robust against outlier in the support + * region the least-square can be fast in computation. + */ + float normSigma1; + /**< &sigma paramter of the shrinked Hampel norm introduced in @cite Senst2012. If + * &sigma = std::numeric_limist::max() the least-square estimator will be used + * instead of the M-estimator. 
Althoug M-estimator is more robust against outlier in the support + * region the least-square can be fast in computation. + */ + int smallWinSize; + /**< Minimal window size of the support region. This parameter is only used if supportRegionType is SR_CROSS. + */ + int largeWinSize; + /**< Maximal window size of the support region. If supportRegionType is SR_FIXED this gives the exact support + * region size. The speed of the RLOF is related to the applied win sizes. The smaller the window size the lower is the runtime, + * but the more sensitive to noise is the method. + */ + int crossSegmentationThreshold; + /**< Color similarity threshold used by cross-based segmentation following @cite Senst2014 . + * (Only used if supportRegionType is SR_CROSS). With the cross-bassed segmentation + * motion boundaries can be computed more accurately. + */ + int maxLevel; + /**< Maximal number of pyramid level used. The large this value is the more likely it is + * to obtain accurate solutions for long-range motions. The runtime is linear related to + * this parameter. + */ + bool useInitialFlow; + /**< Use next point list as initial values. A good intialization can imporve the algortihm + * accuracy and reduce the runtime by a faster convergence of the iteration refinement. + */ + bool useIlluminationModel; + /**< Use the Gennert and Negahdaripour illumination model instead of the intensity brigthness + * constraint. (proposed in @cite Senst2016 ) This model is defined as follow: + * \f[ I(\mathbf{x},t) + m \cdot I(\mathbf{x},t) + c = I(\mathbf{x},t+1) \f] + * and contains with m and c a multiplicative and additive term which makes the estimate + * more robust against illumination changes. The computational complexity is increased by + * enabling the illumination model. + */ + bool useGlobalMotionPrior; + /**< Use global motion prior initialisation has been introduced in @cite Senst2016 . It + * allows to be more accurate for long-range motion. 
The computational complexity is + * slightly increased by enabling the global motion prior initialisation. + */ + int maxIteration; + /**< Number of maximal iterations used for the iterative refinement. Lower values can + * reduce the runtime but also the accuracy. + */ + float minEigenValue; + /**< Threshold for the minimal eigenvalue of the gradient matrix defines when to abort the + * iterative refinement. + */ + float globalMotionRansacThreshold; + /**< To apply the global motion prior motion vectors will be computed on a regulary sampled which + * are the basis for Homography estimation using RANSAC. The reprojection threshold is based on + * n-th percentil (given by this value [0 ... 100]) of the motion vectors magnitude. + * See @cite Senst2016 for more details. + */ + + CV_WRAP void setSolverType(SolverType val); + CV_WRAP SolverType getSolverType() const; + + CV_WRAP void setSupportRegionType(SupportRegionType val); + CV_WRAP SupportRegionType getSupportRegionType() const; + + CV_WRAP void setNormSigma0(float val); + CV_WRAP float getNormSigma0() const; + + CV_WRAP void setNormSigma1(float val); + CV_WRAP float getNormSigma1() const; + + CV_WRAP void setSmallWinSize(int val); + CV_WRAP int getSmallWinSize() const; + + CV_WRAP void setLargeWinSize(int val); + CV_WRAP int getLargeWinSize() const; + + CV_WRAP void setCrossSegmentationThreshold(int val); + CV_WRAP int getCrossSegmentationThreshold() const; + + CV_WRAP void setMaxLevel(int val); + CV_WRAP int getMaxLevel() const; + + CV_WRAP void setUseInitialFlow(bool val); + CV_WRAP bool getUseInitialFlow() const; + + CV_WRAP void setUseIlluminationModel(bool val); + CV_WRAP bool getUseIlluminationModel() const; + + CV_WRAP void setUseGlobalMotionPrior(bool val); + CV_WRAP bool getUseGlobalMotionPrior() const; + + CV_WRAP void setMaxIteration(int val); + CV_WRAP int getMaxIteration() const; + + CV_WRAP void setMinEigenValue(float val); + CV_WRAP float getMinEigenValue() const; + + CV_WRAP void 
setGlobalMotionRansacThreshold(float val); + CV_WRAP float getGlobalMotionRansacThreshold() const; + + //! @brief Creates instance of optflow::RLOFOpticalFlowParameter + CV_WRAP static Ptr create(); +}; + +/** @brief Fast dense optical flow computation based on robust local optical flow (RLOF) algorithms and sparse-to-dense interpolation + * scheme. + * + * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as + * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). + * + * The sparse-to-dense interpolation scheme allows for fast computation of dense optical flow using RLOF (see @cite Geistert2016). + * For this scheme the following steps are applied: + * -# motion vector seeded at a regular sampled grid are computed. The sparsity of this grid can be configured with setGridStep + * -# (optinally) errornous motion vectors are filter based on the forward backward confidence. The threshold can be configured + * with setForwardBackward. The filter is only applied if the threshold >0 but than the runtime is doubled due to the estimation + * of the backward flow. + * -# Vector field interpolation is applied to the motion vector set to obtain a dense vector field. + * + * For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details. + * Parameters have been described in @cite Senst2012 @cite Senst2013 @cite Senst2014 and @cite Senst2016. + * + * @note SIMD parallelization is only available when compiling with SSE4.1. If the grid size is set to (1,1) and the + * forward backward threshold <= 0 that the dense optical flow field is purely. + * computed with the RLOF. + * + * @see optflow::calcOpticalFlowDenseRLOF(), optflow::RLOFOpticalFlowParameter +*/ +class CV_EXPORTS_W DenseRLOFOpticalFlow : public DenseOpticalFlow +{ +public: + //! @brief Configuration of the RLOF alogrithm. 
+ /** + @see optflow::RLOFOpticalFlowParameter, getRLOFOpticalFlowParameter + */ + CV_WRAP virtual void setRLOFOpticalFlowParameter(Ptr val) = 0; + /** @copybrief setRLOFOpticalFlowParameter + @see optflow::RLOFOpticalFlowParameter, setRLOFOpticalFlowParameter + */ + CV_WRAP virtual Ptr getRLOFOpticalFlowParameter() const = 0; + //! @brief Threshold for the forward backward confidence check + /**For each grid point \f$ \mathbf{x} \f$ a motion vector \f$ d_{I0,I1}(\mathbf{x}) \f$ is computed. + * If the forward backward error \f[ EP_{FB} = || d_{I0,I1} + d_{I1,I0} || \f] + * is larger than threshold given by this function then the motion vector will not be used by the following + * vector field interpolation. \f$ d_{I1,I0} \f$ denotes the backward flow. Note, the forward backward test * will only be applied if the threshold > 0. This may results into a doubled runtime for the motion estimation. - * @param gridStep Size of the grid to spawn the motion vectors. For each grid point a motion vector is computed. - * Some motion vectors will be removed due to the forwatd backward threshold (if set >0). The rest will be the - * base of the vector field interpolation. - * @param interp_type interpolation method used to compute the dense optical flow. Two interpolation algorithms are - * supported: + * @see getForwardBackward, setGridStep + */ + CV_WRAP virtual void setForwardBackward(float val) = 0; + /** @copybrief setForwardBackward + @see setForwardBackward + */ + CV_WRAP virtual float getForwardBackward() const = 0; + //! @brief Size of the grid to spawn the motion vectors. + /** For each grid point a motion vector is computed. Some motion vectors will be removed due to the forwatd backward + * threshold (if set >0). The rest will be the base of the vector field interpolation. 
+ * @see getForwardBackward, setGridStep + */ + CV_WRAP virtual Size getGridStep() const = 0; + /** @copybrief getGridStep + * @see getGridStep + */ + CV_WRAP virtual void setGridStep(Size val) = 0; + + //! @brief Interpolation used to compute the dense optical flow. + /** Two interpolation algorithms are supported * - **INTERP_GEO** applies the fast geodesic interpolation, see @cite Geistert2016. * - **INTERP_EPIC_RESIDUAL** applies the edge-preserving interpolation, see @cite Revaud2015,Geistert2016. - * @param epicK see ximgproc::EdgeAwareInterpolator() sets the respective parameter. - * @param epicSigma see ximgproc::EdgeAwareInterpolator() sets the respective parameter. - * @param epicLambda see ximgproc::EdgeAwareInterpolator() sets the respective parameter. - * @param use_post_proc enables ximgproc::fastGlobalSmootherFilter() parameter. - * @param fgsLambda sets the respective ximgproc::fastGlobalSmootherFilter() parameter. - * @param fgsSigma sets the respective ximgproc::fastGlobalSmootherFilter() parameter. - * - * Parameters have been described in @cite Senst2012, @cite Senst2013, @cite Senst2014, @cite Senst2016. - * For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details. - * @note If the grid size is set to (1,1) and the forward backward threshold <= 0 that the dense optical flow field is purely - * computed with the RLOF. - * - * @note SIMD parallelization is only available when compiling with SSE4.1. - * - * @sa optflow::DenseRLOFOpticalFlow, optflow::RLOFOpticalFlowParameter - */ - CV_EXPORTS_W void calcOpticalFlowDenseRLOF(InputArray I0, InputArray I1, InputOutputArray flow, + * @see ximgproc::EdgeAwareInterpolator, getInterpolation + */ + CV_WRAP virtual void setInterpolation(InterpolationType val) = 0; + /** @copybrief setInterpolation + * @see ximgproc::EdgeAwareInterpolator, setInterpolation + */ + CV_WRAP virtual InterpolationType getInterpolation() const = 0; + //! 
@brief see ximgproc::EdgeAwareInterpolator() K value. + /** K is a number of nearest-neighbor matches considered, when fitting a locally affine + * model. Usually it should be around 128. However, lower values would make the interpolation noticeably faster. + * @see ximgproc::EdgeAwareInterpolator, setEPICK + */ + CV_WRAP virtual int getEPICK() const = 0; + /** @copybrief getEPICK + * @see ximgproc::EdgeAwareInterpolator, getEPICK + */ + CV_WRAP virtual void setEPICK(int val) = 0; + //! @brief see ximgproc::EdgeAwareInterpolator() sigma value. + /** Sigma is a parameter defining how fast the weights decrease in the locally-weighted affine + * fitting. Higher values can help preserve fine details, lower values can help to get rid of noise in the + * output flow. + * @see ximgproc::EdgeAwareInterpolator, setEPICSigma + */ + CV_WRAP virtual float getEPICSigma() const = 0; + /** @copybrief getEPICSigma + * @see ximgproc::EdgeAwareInterpolator, getEPICSigma + */ + CV_WRAP virtual void setEPICSigma(float val) = 0; + //! @brief see ximgproc::EdgeAwareInterpolator() lambda value. + /** Lambda is a parameter defining the weight of the edge-aware term in geodesic distance, + * should be in the range of 0 to 1000. + * @see ximgproc::EdgeAwareInterpolator, setEPICSigma + */ + CV_WRAP virtual float getEPICLambda() const = 0; + /** @copybrief getEPICLambda + * @see ximgproc::EdgeAwareInterpolator, getEPICLambda + */ + CV_WRAP virtual void setEPICLambda(float val) = 0; + //! @brief see ximgproc::EdgeAwareInterpolator(). + /** Sets the respective fastGlobalSmootherFilter() parameter. + * @see ximgproc::EdgeAwareInterpolator, setFgsLambda + */ + CV_WRAP virtual float getFgsLambda() const = 0; + /** @copybrief getFgsLambda + * @see ximgproc::EdgeAwareInterpolator, ximgproc::fastGlobalSmootherFilter, getFgsLambda + */ + CV_WRAP virtual void setFgsLambda(float val) = 0; + //! @brief see ximgproc::EdgeAwareInterpolator(). + /** Sets the respective fastGlobalSmootherFilter() parameter. 
+ * @see ximgproc::EdgeAwareInterpolator, ximgproc::fastGlobalSmootherFilter, setFgsSigma + */ + CV_WRAP virtual float getFgsSigma() const = 0; + /** @copybrief getFgsSigma + * @see ximgproc::EdgeAwareInterpolator, ximgproc::fastGlobalSmootherFilter, getFgsSigma + */ + CV_WRAP virtual void setFgsSigma(float val) = 0; + //! @brief enables ximgproc::fastGlobalSmootherFilter + /** + * @see getUsePostProc + */ + CV_WRAP virtual void setUsePostProc(bool val) = 0; + /** @copybrief setUsePostProc + * @see ximgproc::fastGlobalSmootherFilter, setUsePostProc + */ + CV_WRAP virtual bool getUsePostProc() const = 0; + //! @brief Creates instance of optflow::DenseRLOFOpticalFlow + /** + * @param rlofParam see optflow::RLOFOpticalFlowParameter + * @param forwardBackwardThreshold see setForwardBackward + * @param gridStep see setGridStep + * @param interp_type see setInterpolation + * @param epicK see setEPICK + * @param epicSigma see setEPICSigma + * @param epicLambda see setEPICLambda + * @param use_post_proc see setUsePostProc + * @param fgsLambda see setFgsLambda + * @param fgsSigma see setFgsSigma + */ + CV_WRAP static Ptr create( Ptr rlofParam = Ptr(), - float forwardBackwardThreshold = 0, Size gridStep = Size(6, 6), + float forwardBackwardThreshold = 1.f, + Size gridStep = Size(6, 6), InterpolationType interp_type = InterpolationType::INTERP_EPIC, - int epicK = 128, float epicSigma = 0.05f, float epicLambda = 100.f, - bool use_post_proc = true, float fgsLambda = 500.0f, float fgsSigma = 1.5f); - - /** @brief Calculates fast optical flow for a sparse feature set using the robust local optical flow (RLOF) similar - * to optflow::calcOpticalFlowPyrLK(). - * - * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as - * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). - * - * @param prevImg first 8-bit input image. 
If The cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType - * = SupportRegionType::SR_CROSS) image has to be a 8-bit 3 channel image. - * @param nextImg second 8-bit input image. If The cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType - * = SupportRegionType::SR_CROSS) image has to be a 8-bit 3 channel image. - * @param prevPts vector of 2D points for which the flow needs to be found; point coordinates must be single-precision - * floating-point numbers. - * @param nextPts output vector of 2D points (with single-precision floating-point coordinates) containing the calculated - * new positions of input features in the second image; when optflow::RLOFOpticalFlowParameter::useInitialFlow variable is true the vector must - * have the same size as in the input and contain the initialization point correspondences. - * @param status output status vector (of unsigned chars); each element of the vector is set to 1 if the flow for the - * corresponding features has passed the forward backward check. - * @param err output vector of errors; each element of the vector is set to the forward backward error for the corresponding feature. - * @param rlofParam see optflow::RLOFOpticalFlowParameter - * @param forwardBackwardThreshold Threshold for the forward backward confidence check. If forewardBackwardThreshold <=0 the forward - * - * @note SIMD parallelization is only available when compiling with SSE4.1. - * - * Parameters have been described in @cite Senst2012, @cite Senst2013, @cite Senst2014 and @cite Senst2016. - * For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details. - */ - CV_EXPORTS_W void calcOpticalFlowSparseRLOF(InputArray prevImg, InputArray nextImg, - InputArray prevPts, InputOutputArray nextPts, - OutputArray status, OutputArray err, - Ptr rlofParam = Ptr(), - float forwardBackwardThreshold = 0); - - //! 
Additional interface to the Dense RLOF algorithm - optflow::calcOpticalFlowDenseRLOF() - CV_EXPORTS_W Ptr createOptFlow_DenseRLOF(); + int epicK = 128, + float epicSigma = 0.05f, + float epicLambda = 999.0f, + bool use_post_proc = true, + float fgsLambda = 500.0f, + float fgsSigma = 1.5f); +}; + +/** @brief Class used for calculation sparse optical flow and feature tracking with robust local optical flow (RLOF) algorithms. +* +* The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as +* proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). +* +* For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details. +* Parameters have been described in @cite Senst2012, @cite Senst2013, @cite Senst2014 and @cite Senst2016. +* +* @note SIMD parallelization is only available when compiling with SSE4.1. +* @see optflow::calcOpticalFlowSparseRLOF(), optflow::RLOFOpticalFlowParameter +*/ +class CV_EXPORTS_W SparseRLOFOpticalFlow : public SparseOpticalFlow +{ +public: + /** @copydoc DenseRLOFOpticalFlow::setRLOFOpticalFlowParameter + */ + CV_WRAP virtual void setRLOFOpticalFlowParameter(Ptr val) = 0; + /** @copybrief setRLOFOpticalFlowParameter + * @see setRLOFOpticalFlowParameter + */ + CV_WRAP virtual Ptr getRLOFOpticalFlowParameter() const = 0; + //! @brief Threshold for the forward backward confidence check + /** For each feature point a motion vector \f$ d_{I0,I1}(\mathbf{x}) \f$ is computed. + * If the forward backward error \f[ EP_{FB} = || d_{I0,I1} + d_{I1,I0} || \f] + * is larger than threshold given by this function then the status will not be used by the following + * vector field interpolation. \f$ d_{I1,I0} \f$ denotes the backward flow. Note, the forward backward test + * will only be applied if the threshold > 0. This may results into a doubled runtime for the motion estimation. 
+ * @see setForwardBackward + */ + CV_WRAP virtual void setForwardBackward(float val) = 0; + /** @copybrief setForwardBackward + * @see setForwardBackward + */ + CV_WRAP virtual float getForwardBackward() const = 0; - //! Additional interface to the Sparse RLOF algorithm - optflow::calcOpticalFlowSparseRLOF() - CV_EXPORTS_W Ptr createOptFlow_SparseRLOF(); - //! @} -} -} + //! @brief Creates instance of SparseRLOFOpticalFlow + /** + * @param rlofParam see setRLOFOpticalFlowParameter + * @param forwardBackwardThreshold see setForwardBackward + */ + CV_WRAP static Ptr create( + Ptr rlofParam = Ptr(), + float forwardBackwardThreshold = 1.f); + +}; + +/** @brief Fast dense optical flow computation based on robust local optical flow (RLOF) algorithms and sparse-to-dense interpolation scheme. + * + * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as + * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). + * + * The sparse-to-dense interpolation scheme allows for fast computation of dense optical flow using RLOF (see @cite Geistert2016). + * For this scheme the following steps are applied: + * -# motion vector seeded at a regular sampled grid are computed. The sparsity of this grid can be configured with setGridStep + * -# (optinally) errornous motion vectors are filter based on the forward backward confidence. The threshold can be configured + * with setForwardBackward. The filter is only applied if the threshold >0 but than the runtime is doubled due to the estimation + * of the backward flow. + * -# Vector field interpolation is applied to the motion vector set to obtain a dense vector field. + * + * @param I0 first 8-bit input image. If The cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType + * = SupportRegionType::SR_CROSS) image has to be a 8-bit 3 channel image. + * @param I1 second 8-bit input image. 
If The cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType + * = SupportRegionType::SR_CROSS) image has to be a 8-bit 3 channel image. + * @param flow computed flow image that has the same size as I0 and type CV_32FC2. + * @param rlofParam see optflow::RLOFOpticalFlowParameter + * @param forwardBackwardThreshold Threshold for the forward backward confidence check. + * For each grid point \f$ \mathbf{x} \f$ a motion vector \f$ d_{I0,I1}(\mathbf{x}) \f$ is computed. + * If the forward backward error \f[ EP_{FB} = || d_{I0,I1} + d_{I1,I0} || \f] + * is larger than threshold given by this function then the motion vector will not be used by the following + * vector field interpolation. \f$ d_{I1,I0} \f$ denotes the backward flow. Note, the forward backward test + * will only be applied if the threshold > 0. This may results into a doubled runtime for the motion estimation. + * @param gridStep Size of the grid to spawn the motion vectors. For each grid point a motion vector is computed. + * Some motion vectors will be removed due to the forwatd backward threshold (if set >0). The rest will be the + * base of the vector field interpolation. + * @param interp_type interpolation method used to compute the dense optical flow. Two interpolation algorithms are + * supported: + * - **INTERP_GEO** applies the fast geodesic interpolation, see @cite Geistert2016. + * - **INTERP_EPIC_RESIDUAL** applies the edge-preserving interpolation, see @cite Revaud2015,Geistert2016. + * @param epicK see ximgproc::EdgeAwareInterpolator() sets the respective parameter. + * @param epicSigma see ximgproc::EdgeAwareInterpolator() sets the respective parameter. + * @param epicLambda see ximgproc::EdgeAwareInterpolator() sets the respective parameter. + * @param use_post_proc enables ximgproc::fastGlobalSmootherFilter() parameter. + * @param fgsLambda sets the respective ximgproc::fastGlobalSmootherFilter() parameter. 
+ * @param fgsSigma sets the respective ximgproc::fastGlobalSmootherFilter() parameter. + * + * Parameters have been described in @cite Senst2012, @cite Senst2013, @cite Senst2014, @cite Senst2016. + * For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details. + * @note If the grid size is set to (1,1) and the forward backward threshold <= 0 that the dense optical flow field is purely + * computed with the RLOF. + * + * @note SIMD parallelization is only available when compiling with SSE4.1. + * + * @sa optflow::DenseRLOFOpticalFlow, optflow::RLOFOpticalFlowParameter +*/ +CV_EXPORTS_W void calcOpticalFlowDenseRLOF(InputArray I0, InputArray I1, InputOutputArray flow, + Ptr rlofParam = Ptr(), + float forwardBackwardThreshold = 0, Size gridStep = Size(6, 6), + InterpolationType interp_type = InterpolationType::INTERP_EPIC, + int epicK = 128, float epicSigma = 0.05f, float epicLambda = 100.f, + bool use_post_proc = true, float fgsLambda = 500.0f, float fgsSigma = 1.5f); + +/** @brief Calculates fast optical flow for a sparse feature set using the robust local optical flow (RLOF) similar +* to optflow::calcOpticalFlowPyrLK(). +* +* The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as +* proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK(). +* +* @param prevImg first 8-bit input image. If The cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType +* = SupportRegionType::SR_CROSS) image has to be a 8-bit 3 channel image. +* @param nextImg second 8-bit input image. If The cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType +* = SupportRegionType::SR_CROSS) image has to be a 8-bit 3 channel image. +* @param prevPts vector of 2D points for which the flow needs to be found; point coordinates must be single-precision +* floating-point numbers. 
+* @param nextPts output vector of 2D points (with single-precision floating-point coordinates) containing the calculated +* new positions of input features in the second image; when optflow::RLOFOpticalFlowParameter::useInitialFlow variable is true the vector must +* have the same size as in the input and contain the initialization point correspondences. +* @param status output status vector (of unsigned chars); each element of the vector is set to 1 if the flow for the +* corresponding features has passed the forward backward check. +* @param err output vector of errors; each element of the vector is set to the forward backward error for the corresponding feature. +* @param rlofParam see optflow::RLOFOpticalFlowParameter +* @param forwardBackwardThreshold Threshold for the forward backward confidence check. If forewardBackwardThreshold <=0 the forward +* +* @note SIMD parallelization is only available when compiling with SSE4.1. +* +* Parameters have been described in @cite Senst2012, @cite Senst2013, @cite Senst2014 and @cite Senst2016. +* For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details. +*/ +CV_EXPORTS_W void calcOpticalFlowSparseRLOF(InputArray prevImg, InputArray nextImg, + InputArray prevPts, InputOutputArray nextPts, + OutputArray status, OutputArray err, + Ptr rlofParam = Ptr(), + float forwardBackwardThreshold = 0); + +//! Additional interface to the Dense RLOF algorithm - optflow::calcOpticalFlowDenseRLOF() +CV_EXPORTS_W Ptr createOptFlow_DenseRLOF(); + +//! Additional interface to the Sparse RLOF algorithm - optflow::calcOpticalFlowSparseRLOF() +CV_EXPORTS_W Ptr createOptFlow_SparseRLOF(); +//! 
@} + +} // namespace +} // namespace #endif diff --git a/modules/optflow/perf/perf_rlof.cpp b/modules/optflow/perf/perf_rlof.cpp index e67feec2777..d8fb64eb40a 100644 --- a/modules/optflow/perf/perf_rlof.cpp +++ b/modules/optflow/perf/perf_rlof.cpp @@ -1,73 +1,74 @@ #include "perf_precomp.hpp" namespace opencv_test { namespace { - typedef tuple ST_SR_IM_Sparse_t; - typedef TestBaseWithParam ST_SR_IM_Sparse; - PERF_TEST_P(ST_SR_IM_Sparse, OpticalFlow_SparseRLOF, - testing::Combine( - testing::Values("ST_BILINEAR", "ST_STANDART"), - testing::Values("SR_CROSS", "SR_FIXED"), - testing::Values(true, false)) - ) + +typedef tuple ST_SR_IM_Sparse_t; +typedef TestBaseWithParam ST_SR_IM_Sparse; +PERF_TEST_P(ST_SR_IM_Sparse, OpticalFlow_SparseRLOF, + testing::Combine( + testing::Values("ST_BILINEAR", "ST_STANDART"), + testing::Values("SR_CROSS", "SR_FIXED"), + testing::Values(true, false)) +) +{ + Mat frame1 = imread(getDataPath("cv/optflow/RubberWhale1.png")); + Mat frame2 = imread(getDataPath("cv/optflow/RubberWhale2.png")); + ASSERT_FALSE(frame1.empty()); + ASSERT_FALSE(frame2.empty()); + vector prevPts, currPts; + for (int r = 0; r < frame1.rows; r += 10) + { + for (int c = 0; c < frame1.cols; c += 10) { - Mat frame1 = imread(getDataPath("cv/optflow/RubberWhale1.png")); - Mat frame2 = imread(getDataPath("cv/optflow/RubberWhale2.png")); - ASSERT_FALSE(frame1.empty()); - ASSERT_FALSE(frame2.empty()); - vector prevPts, currPts; - for (int r = 0; r < frame1.rows; r += 10) - { - for (int c = 0; c < frame1.cols; c += 10) - { - prevPts.push_back(Point2f(static_cast(c), static_cast(r))); - } - } - vector status(prevPts.size()); - vector err(prevPts.size()); + prevPts.push_back(Point2f(static_cast(c), static_cast(r))); + } + } + vector status(prevPts.size()); + vector err(prevPts.size()); - Ptr param = Ptr(new RLOFOpticalFlowParameter); - if (get<0>(GetParam()) == "ST_BILINEAR") - param->solverType = ST_BILINEAR; - if (get<0>(GetParam()) == "ST_STANDART") - param->solverType = 
ST_STANDART; - if (get<1>(GetParam()) == "SR_CROSS") - param->supportRegionType = SR_CROSS; - if (get<1>(GetParam()) == "SR_FIXED") - param->supportRegionType = SR_FIXED; - param->useIlluminationModel = get<2>(GetParam()); + Ptr param = Ptr(new RLOFOpticalFlowParameter); + if (get<0>(GetParam()) == "ST_BILINEAR") + param->solverType = ST_BILINEAR; + if (get<0>(GetParam()) == "ST_STANDART") + param->solverType = ST_STANDART; + if (get<1>(GetParam()) == "SR_CROSS") + param->supportRegionType = SR_CROSS; + if (get<1>(GetParam()) == "SR_FIXED") + param->supportRegionType = SR_FIXED; + param->useIlluminationModel = get<2>(GetParam()); - TEST_CYCLE_N(1) - { - calcOpticalFlowSparseRLOF(frame1, frame2, prevPts, currPts, status, err, param, 1.f); - } + TEST_CYCLE_N(1) + { + calcOpticalFlowSparseRLOF(frame1, frame2, prevPts, currPts, status, err, param, 1.f); + } - SANITY_CHECK_NOTHING(); - } + SANITY_CHECK_NOTHING(); +} - typedef tuple INTERP_GRID_Dense_t; - typedef TestBaseWithParam INTERP_GRID_Dense; - PERF_TEST_P(INTERP_GRID_Dense, OpticalFlow_DenseRLOF, - testing::Combine( - testing::Values("INTERP_EPIC", "INTERP_GEO"), - testing::Values(4,10,50)) - ) - { - Mat flow; - Mat frame1 = imread(getDataPath("cv/optflow/RubberWhale1.png")); - Mat frame2 = imread(getDataPath("cv/optflow/RubberWhale1.png")); - ASSERT_FALSE(frame1.empty()); - ASSERT_FALSE(frame2.empty()); - Ptr param = Ptr(new RLOFOpticalFlowParameter);; - Ptr< DenseRLOFOpticalFlow> algo = DenseRLOFOpticalFlow::create(); - InterpolationType interp_type = INTERP_EPIC; - if (get<0>(GetParam()) == "INTERP_EPIC") - interp_type = INTERP_EPIC; - if (get<0>(GetParam()) == "INTERP_GEO") - interp_type = INTERP_GEO; - TEST_CYCLE_N(5) - { - calcOpticalFlowDenseRLOF(frame1, frame2,flow, param, 1.0f, Size(get<1>(GetParam()), get<1>(GetParam())), interp_type); - } - SANITY_CHECK_NOTHING(); - } +typedef tuple INTERP_GRID_Dense_t; +typedef TestBaseWithParam INTERP_GRID_Dense; +PERF_TEST_P(INTERP_GRID_Dense, OpticalFlow_DenseRLOF, 
+ testing::Combine( + testing::Values("INTERP_EPIC", "INTERP_GEO"), + testing::Values(4,10,50)) +) +{ + Mat flow; + Mat frame1 = imread(getDataPath("cv/optflow/RubberWhale1.png")); + Mat frame2 = imread(getDataPath("cv/optflow/RubberWhale1.png")); + ASSERT_FALSE(frame1.empty()); + ASSERT_FALSE(frame2.empty()); + Ptr param = Ptr(new RLOFOpticalFlowParameter);; + Ptr< DenseRLOFOpticalFlow> algo = DenseRLOFOpticalFlow::create(); + InterpolationType interp_type = INTERP_EPIC; + if (get<0>(GetParam()) == "INTERP_EPIC") + interp_type = INTERP_EPIC; + if (get<0>(GetParam()) == "INTERP_GEO") + interp_type = INTERP_GEO; + TEST_CYCLE_N(5) + { + calcOpticalFlowDenseRLOF(frame1, frame2,flow, param, 1.0f, Size(get<1>(GetParam()), get<1>(GetParam())), interp_type); } -} // namespace + SANITY_CHECK_NOTHING(); +} + +}} // namespace diff --git a/modules/optflow/src/rlof/berlof_invoker.hpp b/modules/optflow/src/rlof/berlof_invoker.hpp index 5f917dc12d4..e5cac1fa87c 100644 --- a/modules/optflow/src/rlof/berlof_invoker.hpp +++ b/modules/optflow/src/rlof/berlof_invoker.hpp @@ -43,2097 +43,2092 @@ inline void est_Result( const cv::Mat & src, const cv::Vec4f & val, cv::Point2f src.at(3,0) * val[0] + src.at(3,1) * val[1] + src.at(3,2) * val[2] + src.at(3,3) * val[3]); } -namespace berlof -{ +namespace berlof { - namespace ica - { +namespace ica { - class TrackerInvoker : public cv::ParallelLoopBody - { - public: - TrackerInvoker( - const Mat& _prevImg, - const Mat& _prevDeriv, - const Mat& _nextImg, - const Mat& _rgbPrevImg, - const Mat& _rgbNextImg, - const Point2f* _prevPts, - Point2f* _nextPts, - uchar* _status, - float* _err, - int _level, - int _maxLevel, - int _winSize[2], - int _maxIteration, - bool _useInitialFlow, - int _supportRegionType, - int _crossSegmentationThreshold, - std::vector _normSigmas, - float _minEigenValue) - { - prevImg = &_prevImg; - prevDeriv = &_prevDeriv; - nextImg = &_nextImg; - rgbPrevImg = &_rgbPrevImg; - rgbNextImg = &_rgbNextImg; - prevPts = _prevPts; - 
nextPts = _nextPts; - status = _status; - err = _err; - minWinSize = _winSize[0]; - maxWinSize = _winSize[1]; - criteria.maxCount = _maxIteration; - criteria.epsilon = 0.01; - level = _level; - maxLevel = _maxLevel; - windowType = _supportRegionType; - minEigThreshold = _minEigenValue; - paramReset = _normSigmas; - useInitialFlow = _useInitialFlow; - crossSegmentationThreshold = _crossSegmentationThreshold; - } +class TrackerInvoker : public cv::ParallelLoopBody +{ +public: + TrackerInvoker( + const Mat& _prevImg, + const Mat& _prevDeriv, + const Mat& _nextImg, + const Mat& _rgbPrevImg, + const Mat& _rgbNextImg, + const Point2f* _prevPts, + Point2f* _nextPts, + uchar* _status, + float* _err, + int _level, + int _maxLevel, + int _winSize[2], + int _maxIteration, + bool _useInitialFlow, + int _supportRegionType, + int _crossSegmentationThreshold, + std::vector _normSigmas, + float _minEigenValue) + { + prevImg = &_prevImg; + prevDeriv = &_prevDeriv; + nextImg = &_nextImg; + rgbPrevImg = &_rgbPrevImg; + rgbNextImg = &_rgbNextImg; + prevPts = _prevPts; + nextPts = _nextPts; + status = _status; + err = _err; + minWinSize = _winSize[0]; + maxWinSize = _winSize[1]; + criteria.maxCount = _maxIteration; + criteria.epsilon = 0.01; + level = _level; + maxLevel = _maxLevel; + windowType = _supportRegionType; + minEigThreshold = _minEigenValue; + paramReset = _normSigmas; + useInitialFlow = _useInitialFlow; + crossSegmentationThreshold = _crossSegmentationThreshold; + } - TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; - void operator()(const cv::Range& range) const CV_OVERRIDE - { + TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; + void operator()(const cv::Range& range) const CV_OVERRIDE + { #ifdef DEBUG_INVOKER - printf("berlof::ica");fflush(stdout); + printf("berlof::ica");fflush(stdout); #endif - Point2f halfWin; - cv::Size winSize; - const Mat& I = *prevImg; - const Mat& J = *nextImg; - const Mat& derivI = *prevDeriv; - const Mat& 
BI = *rgbPrevImg; + Point2f halfWin; + cv::Size winSize; + const Mat& I = *prevImg; + const Mat& J = *nextImg; + const Mat& derivI = *prevDeriv; + const Mat& BI = *rgbPrevImg; - const float FLT_SCALE = (1.f/(1 << 16)); + const float FLT_SCALE = (1.f/(1 << 16)); - winSize = cv::Size(maxWinSize,maxWinSize); - int winMaskwidth = static_cast(ceil(winSize.width / 16.f)) * 16; - cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); - winMaskMatBuf.setTo(1); + winSize = cv::Size(maxWinSize,maxWinSize); + int winMaskwidth = static_cast(ceil(winSize.width / 16.f)) * 16; + cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); + winMaskMatBuf.setTo(1); - std::vector param = paramReset; - int j, cn = I.channels(), cn2 = cn*2; - int winbufwidth = static_cast(ceil(winSize.width / 16.f)) * 16; - cv::Size winBufSize(winbufwidth,winbufwidth); + std::vector param = paramReset; + int j, cn = I.channels(), cn2 = cn*2; + int winbufwidth = static_cast(ceil(winSize.width / 16.f)) * 16; + cv::Size winBufSize(winbufwidth,winbufwidth); - cv::AutoBuffer _buf(winBufSize.area()*(cn + cn2)); - int derivDepth = DataType::depth; + cv::AutoBuffer _buf(winBufSize.area()*(cn + cn2)); + int derivDepth = DataType::depth; - Mat IWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn), (deriv_type*)_buf.data()); - Mat derivIWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn2), (deriv_type*)_buf.data() + winBufSize.area()*cn); + Mat IWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn), (deriv_type*)_buf.data()); + Mat derivIWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn2), (deriv_type*)_buf.data() + winBufSize.area()*cn); - for( int ptidx = range.start; ptidx < range.end; ptidx++ ) + for( int ptidx = range.start; ptidx < range.end; ptidx++ ) + { + Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level)); + Point2f nextPt; + if( level == maxLevel ) + { + if( useInitialFlow ) + nextPt = nextPts[ptidx]*(float)(1./(1 << level)); + else + nextPt = prevPt; + } + else + nextPt = nextPts[ptidx]*2.f; + 
nextPts[ptidx] = nextPt; + + Point2i iprevPt, inextPt; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + int winArea = maxWinSize * maxWinSize; + cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0,0, maxWinSize,maxWinSize)); + winMaskMatBuf.setTo(0); + if( calcWinMaskMat(BI, windowType, iprevPt, + winMaskMat,winSize,halfWin,winArea, + minWinSize,maxWinSize, crossSegmentationThreshold) == false) + continue; + + prevPt -= halfWin; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + + if( iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols || + iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows ) + { + if( level == 0 ) { - Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level)); - Point2f nextPt; - if( level == maxLevel ) - { - if( useInitialFlow ) - nextPt = nextPts[ptidx]*(float)(1./(1 << level)); - else - nextPt = prevPt; - } - else - nextPt = nextPts[ptidx]*2.f; - nextPts[ptidx] = nextPt; - - Point2i iprevPt, inextPt; - iprevPt.x = cvFloor(prevPt.x); - iprevPt.y = cvFloor(prevPt.y); - int winArea = maxWinSize * maxWinSize; - cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0,0, maxWinSize,maxWinSize)); - winMaskMatBuf.setTo(0); - if( calcWinMaskMat(BI, windowType, iprevPt, - winMaskMat,winSize,halfWin,winArea, - minWinSize,maxWinSize, crossSegmentationThreshold) == false) - continue; - - prevPt -= halfWin; - iprevPt.x = cvFloor(prevPt.x); - iprevPt.y = cvFloor(prevPt.y); + if( status ) + status[ptidx] = 3; + if( err ) + err[ptidx] = 0; + } + continue; + } - if( iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols || - iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows ) - { - if( level == 0 ) - { - if( status ) - status[ptidx] = 3; - if( err ) - err[ptidx] = 0; - } - continue; - } + float a = prevPt.x - iprevPt.x; + float b = prevPt.y - iprevPt.y; + const int W_BITS = 14, W_BITS1 = 14; - float a = prevPt.x - iprevPt.x; - float b = prevPt.y - iprevPt.y; - const int W_BITS = 14, W_BITS1 = 14; + int iw00 = cvRound((1.f - a)*(1.f 
- b)*(1 << W_BITS)); + int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); - int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); - int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); - int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + int dstep = (int)(derivI.step/derivI.elemSize1()); + int step = (int)(I.step/I.elemSize1()); + int mStep = (int)(winMaskMat.step/winMaskMat.elemSize1()); + CV_Assert( step == (int)(J.step/J.elemSize1()) ); - int dstep = (int)(derivI.step/derivI.elemSize1()); - int step = (int)(I.step/I.elemSize1()); - int mStep = (int)(winMaskMat.step/winMaskMat.elemSize1()); - CV_Assert( step == (int)(J.step/J.elemSize1()) ); + float A11 = 0, A12 = 0, A22 = 0; + float D = 0; - float A11 = 0, A12 = 0, A22 = 0; - float D = 0; + // extract the patch from the first image, compute covariation matrix of derivatives + int x, y; + for( y = 0; y < winSize.height; y++ ) + { + const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; + const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; - // extract the patch from the first image, compute covariation matrix of derivatives - int x, y; - for( y = 0; y < winSize.height; y++ ) + short* Iptr = (short*)(IWinBuf.data + y*IWinBuf.step); + short* dIptr = (short*)(derivIWinBuf.data + y*derivIWinBuf.step); + x = 0; + for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2 ) + { + if( winMaskMat.at(y,x) == 0) { - const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; - const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; - - short* Iptr = (short*)(IWinBuf.data + y*IWinBuf.step); - short* dIptr = (short*)(derivIWinBuf.data + y*derivIWinBuf.step); - x = 0; - for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2 ) - { - if( winMaskMat.at(y,x) == 0) - { - dIptr[0] = 0; - 
dIptr[1] = 0; - continue; - } - int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + - src[x+step]*iw10 + src[x+step+cn]*iw11, W_BITS1-5); - int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + - dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1); - int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 + - dsrc[dstep+cn2+1]*iw11, W_BITS1); - Iptr[x] = (short)ival; - dIptr[0] = (short)ixval; - dIptr[1] = (short)iyval; - } + dIptr[0] = 0; + dIptr[1] = 0; + continue; } + int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + + src[x+step]*iw10 + src[x+step+cn]*iw11, W_BITS1-5); + int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + + dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 + + dsrc[dstep+cn2+1]*iw11, W_BITS1); + Iptr[x] = (short)ival; + dIptr[0] = (short)ixval; + dIptr[1] = (short)iyval; + } + } - cv::Mat residualMat = cv::Mat::zeros(winSize.height * (winSize.width + 8) * cn, 1, CV_16SC1); - cv::Point2f backUpNextPt = nextPt; - nextPt -= halfWin; - Point2f prevDelta(0,0); //denotes h(t-1) - cv::Size _winSize = winSize; + cv::Mat residualMat = cv::Mat::zeros(winSize.height * (winSize.width + 8) * cn, 1, CV_16SC1); + cv::Point2f backUpNextPt = nextPt; + nextPt -= halfWin; + Point2f prevDelta(0,0); //denotes h(t-1) + cv::Size _winSize = winSize; #ifdef RLOF_SSE - __m128i mmMask0, mmMask1, mmMask; - getWBitMask(_winSize.width, mmMask0, mmMask1, mmMask); + __m128i mmMask0, mmMask1, mmMask; + getWBitMask(_winSize.width, mmMask0, mmMask1, mmMask); #endif - float MEstimatorScale = 1; - int buffIdx = 0; - float c[8]; - cv::Mat GMc0, GMc1, GMc2, GMc3; - cv::Vec2f Mc0, Mc1, Mc2, Mc3; - int noIteration = 0; - int noReusedIteration = 0; - int noSolvedIteration = 0; - for( j = 0; j < criteria.maxCount; j++ ) - { - cv::Point2f delta(0,0); - cv::Point2f deltaGain(0,0); - bool hasSolved = false; - a = nextPt.x - inextPt.x; - b = nextPt.y - inextPt.y; - float ab = a * b; - 
if( j == 0 - || ( inextPt.x != cvFloor(nextPt.x) || inextPt.y != cvFloor(nextPt.y) || j % 2 != 0 )) - { - inextPt.x = cvFloor(nextPt.x); - inextPt.y = cvFloor(nextPt.y); + float MEstimatorScale = 1; + int buffIdx = 0; + float c[8]; + cv::Mat GMc0, GMc1, GMc2, GMc3; + cv::Vec2f Mc0, Mc1, Mc2, Mc3; + int noIteration = 0; + int noReusedIteration = 0; + int noSolvedIteration = 0; + for( j = 0; j < criteria.maxCount; j++ ) + { + cv::Point2f delta(0,0); + cv::Point2f deltaGain(0,0); + bool hasSolved = false; + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + float ab = a * b; + if( j == 0 + || ( inextPt.x != cvFloor(nextPt.x) || inextPt.y != cvFloor(nextPt.y) || j % 2 != 0 )) + { + inextPt.x = cvFloor(nextPt.x); + inextPt.y = cvFloor(nextPt.y); - if( inextPt.x < -winSize.width || inextPt.x >= J.cols || - inextPt.y < -winSize.height || inextPt.y >= J.rows ) - { - if( level == 0 && status ) - status[ptidx] = 3; - break; - } + if( inextPt.x < -winSize.width || inextPt.x >= J.cols || + inextPt.y < -winSize.height || inextPt.y >= J.rows ) + { + if( level == 0 && status ) + status[ptidx] = 3; + break; + } - a = nextPt.x - inextPt.x; - b = nextPt.y - inextPt.y; - ab = a * b; - iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); - iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); - iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); - iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - // mismatch - if( j == 0 ) - { - A11 = 0; - A12 = 0; - A22 = 0; - } + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + ab = a * b; + iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + // mismatch + if( j == 0 ) + { + A11 = 0; + A12 = 0; + A22 = 0; + } - if ( j == 0 ) + if ( j == 0 ) + { + buffIdx = 0; + for( y = 0; y < winSize.height; y++ ) { - buffIdx = 0; - for( y = 0; y < winSize.height; y++ ) + const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y )*step + 
(inextPt.x )*cn; + const short* Iptr = (const short*)(IWinBuf.data + (y)*IWinBuf.step); + const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + x = 0; + for( ; x < winSize.width*cn; x++, dIptr += 2) { - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y )*step + (inextPt.x )*cn; - const short* Iptr = (const short*)(IWinBuf.data + (y)*IWinBuf.step); - const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - x = 0; - for( ; x < winSize.width*cn; x++, dIptr += 2) - { - if( maskPtr[x] == 0) - continue; - int diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, W_BITS1-5) - - Iptr[x]; - residualMat.at(buffIdx++) = static_cast(diff); - } + if( maskPtr[x] == 0) + continue; + int diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, W_BITS1-5) + - Iptr[x]; + residualMat.at(buffIdx++) = static_cast(diff); } - /*! Estimation for the residual */ - cv::Mat residualMatRoi(residualMat, cv::Rect(0,0,1, buffIdx)); - MEstimatorScale = (buffIdx == 0) ? 1.f : estimateScale(residualMatRoi); } + /*! Estimation for the residual */ + cv::Mat residualMatRoi(residualMat, cv::Rect(0,0,1, buffIdx)); + MEstimatorScale = (buffIdx == 0) ? 
1.f : estimateScale(residualMatRoi); + } - float eta = 1.f / winArea; - float fParam0 = param[0] * 32.f; - float fParam1 = param[1] * 32.f; - fParam0 = param[0] * MEstimatorScale; - fParam1 = param[1] * MEstimatorScale; + float eta = 1.f / winArea; + float fParam0 = param[0] * 32.f; + float fParam1 = param[1] * 32.f; + fParam0 = param[0] * MEstimatorScale; + fParam1 = param[1] * MEstimatorScale; - buffIdx = 0; - float _b0[4] = {0,0,0,0}; - float _b1[4] = {0,0,0,0}; + buffIdx = 0; + float _b0[4] = {0,0,0,0}; + float _b1[4] = {0,0,0,0}; - /* - */ - for( y = 0; y < _winSize.height; y++ ) - { - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x )*cn; - const short* Iptr = (const short*)(IWinBuf.data + y*IWinBuf.step); - const short* dIptr = (const short*)(derivIWinBuf.data + y*derivIWinBuf.step); + /* + */ + for( y = 0; y < _winSize.height; y++ ) + { + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x )*cn; + const short* Iptr = (const short*)(IWinBuf.data + y*IWinBuf.step); + const short* dIptr = (const short*)(derivIWinBuf.data + y*derivIWinBuf.step); - x = 0; - for( ; x < _winSize.width*cn; x++, dIptr += 2 ) - { - if( maskPtr[x] == 0) - continue; - int illValue = - Iptr[x]; + x = 0; + for( ; x < _winSize.width*cn; x++, dIptr += 2 ) + { + if( maskPtr[x] == 0) + continue; + int illValue = - Iptr[x]; - float It[4] = {static_cast((Jptr[x+step+cn]<< 5) + illValue), - static_cast((Jptr[x+cn]<< 5) + illValue), - static_cast((Jptr[x+step]<< 5) + illValue), - static_cast((Jptr[x] << 5) + illValue)}; + float It[4] = {static_cast((Jptr[x+step+cn]<< 5) + illValue), + static_cast((Jptr[x+cn]<< 5) + illValue), + static_cast((Jptr[x+step]<< 5) + illValue), + static_cast((Jptr[x] << 5) + illValue)}; - int J_val = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + - Jptr[x+step]*iw10 + 
Jptr[x+step+cn]*iw11, - W_BITS1-5); + int J_val = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + + Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, + W_BITS1-5); - int diff = J_val + illValue; + int diff = J_val + illValue; - MEstimatorScale += (diff < MEstimatorScale) ? -eta : eta; + MEstimatorScale += (diff < MEstimatorScale) ? -eta : eta; - int abss = (diff < 0) ? -diff : diff; + int abss = (diff < 0) ? -diff : diff; - // compute the missmatch vector - if( j >= 0) + // compute the missmatch vector + if( j >= 0) + { + if( abss > fParam1) + { + It[0] = 0; + It[1] = 0; + It[2] = 0; + It[3] = 0; + } + else if( abss > fParam0 && diff >= 0 ) + { + It[0] = param[2] * (It[0] - fParam1); + It[1] = param[2] * (It[1] - fParam1); + It[2] = param[2] * (It[2] - fParam1); + It[3] = param[2] * (It[3] - fParam1); + } + else if( abss > fParam0 && diff < 0 ) { - if( abss > fParam1) - { - It[0] = 0; - It[1] = 0; - It[2] = 0; - It[3] = 0; - } - else if( abss > fParam0 && diff >= 0 ) - { - It[0] = param[2] * (It[0] - fParam1); - It[1] = param[2] * (It[1] - fParam1); - It[2] = param[2] * (It[2] - fParam1); - It[3] = param[2] * (It[3] - fParam1); - } - else if( abss > fParam0 && diff < 0 ) - { - It[0] = param[2] * (It[0] + fParam1); - It[1] = param[2] * (It[1] + fParam1); - It[2] = param[2] * (It[2] + fParam1); - It[3] = param[2] * (It[3] + fParam1); - } + It[0] = param[2] * (It[0] + fParam1); + It[1] = param[2] * (It[1] + fParam1); + It[2] = param[2] * (It[2] + fParam1); + It[3] = param[2] * (It[3] + fParam1); } + } - float It0 = It[0]; - float It1 = It[1]; - float It2 = It[2]; - float It3 = It[3]; + float It0 = It[0]; + float It1 = It[1]; + float It2 = It[2]; + float It3 = It[3]; - float ixval = static_cast(dIptr[0]); - float iyval = static_cast(dIptr[1]); - _b0[0] += It0 * ixval; - _b0[1] += It1 * ixval; - _b0[2] += It2 * ixval; - _b0[3] += It3 * ixval; + float ixval = static_cast(dIptr[0]); + float iyval = static_cast(dIptr[1]); + _b0[0] += It0 * ixval; + _b0[1] += It1 * ixval; + _b0[2] += 
It2 * ixval; + _b0[3] += It3 * ixval; - _b1[0] += It0*iyval; - _b1[1] += It1*iyval; - _b1[2] += It2*iyval; - _b1[3] += It3*iyval; + _b1[0] += It0*iyval; + _b1[1] += It1*iyval; + _b1[2] += It2*iyval; + _b1[3] += It3*iyval; - // compute the Gradient Matrice - if( j == 0) + // compute the Gradient Matrice + if( j == 0) + { + float tale = param[2] * FLT_RESCALE; + if( abss < fParam0 || j < 0) { - float tale = param[2] * FLT_RESCALE; - if( abss < fParam0 || j < 0) - { - tale = FLT_RESCALE; - } - else if( abss > fParam1) - { - tale *= 0.01f; - } - else - { - tale *= param[2]; - } - - A11 += (float)(ixval*ixval)*tale; - A12 += (float)(ixval*iyval)*tale; - A22 += (float)(iyval*iyval)*tale; - + tale = FLT_RESCALE; + } + else if( abss > fParam1) + { + tale *= 0.01f; + } + else + { + tale *= param[2]; } + + A11 += (float)(ixval*ixval)*tale; + A12 += (float)(ixval*iyval)*tale; + A22 += (float)(iyval*iyval)*tale; + } } + } - if( j == 0 ) - { + if( j == 0 ) + { - A11 *= FLT_SCALE; // 54866744. - A12 *= FLT_SCALE; // -628764.00 - A22 *= FLT_SCALE; // 19730.000 + A11 *= FLT_SCALE; // 54866744. 
+ A12 *= FLT_SCALE; // -628764.00 + A22 *= FLT_SCALE; // 19730.000 - D = A11 * A22 - A12 * A12; - float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + - 4.f*A12*A12))/(2*winArea); + D = A11 * A22 - A12 * A12; + float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + + 4.f*A12*A12))/(2*winArea); - if( minEig < minEigThreshold || std::abs(D) < FLT_EPSILON) + if( minEig < minEigThreshold || std::abs(D) < FLT_EPSILON) + { + if( level == 0 && status ) + status[ptidx] = 0; + if( level > 0) { - if( level == 0 && status ) - status[ptidx] = 0; - if( level > 0) - { - nextPts[ptidx] = backUpNextPt; - } - noIteration++; - break; + nextPts[ptidx] = backUpNextPt; } - - D = (1.f / D); - + noIteration++; + break; } - _b0[0] *= FLT_SCALE;_b0[1] *= FLT_SCALE;_b0[2] *= FLT_SCALE;_b0[3] *= FLT_SCALE; - _b1[0] *= FLT_SCALE;_b1[1] *= FLT_SCALE;_b1[2] *= FLT_SCALE;_b1[3] *= FLT_SCALE; + D = (1.f / D); + } - Mc0[0] = _b0[0] - _b0[1] - _b0[2] + _b0[3]; - Mc0[1] = _b1[0] - _b1[1] - _b1[2] + _b1[3]; + _b0[0] *= FLT_SCALE;_b0[1] *= FLT_SCALE;_b0[2] *= FLT_SCALE;_b0[3] *= FLT_SCALE; + _b1[0] *= FLT_SCALE;_b1[1] *= FLT_SCALE;_b1[2] *= FLT_SCALE;_b1[3] *= FLT_SCALE; - Mc1[0] = _b0[1] - _b0[3]; - Mc1[1] = _b1[1] - _b1[3]; + Mc0[0] = _b0[0] - _b0[1] - _b0[2] + _b0[3]; + Mc0[1] = _b1[0] - _b1[1] - _b1[2] + _b1[3]; - Mc2[0] = _b0[2] - _b0[3]; - Mc2[1] = _b1[2] - _b1[3]; + Mc1[0] = _b0[1] - _b0[3]; + Mc1[1] = _b1[1] - _b1[3]; - Mc3[0] = _b0[3]; - Mc3[1] = _b1[3]; + Mc2[0] = _b0[2] - _b0[3]; + Mc2[1] = _b1[2] - _b1[3]; - c[0] = -Mc0[0]; - c[1] = -Mc1[0]; - c[2] = -Mc2[0]; - c[3] = -Mc3[0]; - c[4] = -Mc0[1]; - c[5] = -Mc1[1]; - c[6] = -Mc2[1]; - c[7] = -Mc3[1]; - float e0 = 1.f / (c[6] * c[0] - c[4] * c[2]); - float e1 = e0 * 0.5f * (c[6] * c[1] + c[7] * c[0] - c[5] * c[2] - c[4] * c[3]); - float e2 = e0 * (c[1] * c[7] -c[3] * c[5]); - e0 = e1 * e1 - e2; - hasSolved = false; - if ( e0 > 0) - { - e0 = sqrt(e0); - float _y[2] = {-e1 - e0, e0 - e1}; - float c0yc1[2] = {c[0] * _y[0] + c[1], - 
c[0] * _y[1] + c[1]}; - float _x[2] = {-(c[2] * _y[0] + c[3]) / c0yc1[0], - -(c[2] * _y[1] + c[3]) / c0yc1[1]}; - bool isIn1 = (_x[0] >=0 && _x[0] <=1 && _y[0] >= 0 && _y[0] <= 1); - bool isIn2 = (_x[1] >=0 && _x[1] <=1 && _y[1] >= 0 && _y[1] <= 1); - - bool isSolution1 = checkSolution(_x[0], _y[0], c ); - bool isSolution2 = checkSolution(_x[1], _y[1], c ); - bool isSink1 = isIn1 && isSolution1; - bool isSink2 = isIn2 && isSolution2; - - if ( isSink1 != isSink2) - { - a = isSink1 ? _x[0] : _x[1]; - b = isSink1 ? _y[0] : _y[1]; - ab = a * b; - hasSolved = true; - delta.x = inextPt.x + a - nextPt.x; - delta.y = inextPt.y + b - nextPt.y; - } // isIn1 != isIn2 - } - if( hasSolved == false) - noIteration++; - } - else - { - hasSolved = false; - noReusedIteration++; - } - if( hasSolved == false ) - { + Mc3[0] = _b0[3]; + Mc3[1] = _b1[3]; - cv::Vec2f mismatchVec = ab * Mc0 + Mc1 *a + Mc2 * b + Mc3; - delta.x = (A12 * mismatchVec.val[1] - A22 * mismatchVec.val[0]) * D; - delta.y = (A12 * mismatchVec.val[0] - A11 * mismatchVec.val[1]) * D; - delta.x = MAX(-1.f, MIN(1.f, delta.x)); - delta.y = MAX(-1.f, MIN(1.f, delta.y)); - nextPt += delta; - nextPts[ptidx] = nextPt + halfWin; - } - else - { - nextPt += delta; - nextPts[ptidx] = nextPt + halfWin; - noSolvedIteration++; - break; - } + c[0] = -Mc0[0]; + c[1] = -Mc1[0]; + c[2] = -Mc2[0]; + c[3] = -Mc3[0]; + c[4] = -Mc0[1]; + c[5] = -Mc1[1]; + c[6] = -Mc2[1]; + c[7] = -Mc3[1]; - delta.x = ( delta.x != delta.x) ? 0 : delta.x; - delta.y = ( delta.y != delta.y) ? 
0 : delta.y; + float e0 = 1.f / (c[6] * c[0] - c[4] * c[2]); + float e1 = e0 * 0.5f * (c[6] * c[1] + c[7] * c[0] - c[5] * c[2] - c[4] * c[3]); + float e2 = e0 * (c[1] * c[7] -c[3] * c[5]); + e0 = e1 * e1 - e2; + hasSolved = false; + if ( e0 > 0) + { + e0 = sqrt(e0); + float _y[2] = {-e1 - e0, e0 - e1}; + float c0yc1[2] = {c[0] * _y[0] + c[1], + c[0] * _y[1] + c[1]}; + float _x[2] = {-(c[2] * _y[0] + c[3]) / c0yc1[0], + -(c[2] * _y[1] + c[3]) / c0yc1[1]}; + bool isIn1 = (_x[0] >=0 && _x[0] <=1 && _y[0] >= 0 && _y[0] <= 1); + bool isIn2 = (_x[1] >=0 && _x[1] <=1 && _y[1] >= 0 && _y[1] <= 1); + + bool isSolution1 = checkSolution(_x[0], _y[0], c ); + bool isSolution2 = checkSolution(_x[1], _y[1], c ); + bool isSink1 = isIn1 && isSolution1; + bool isSink2 = isIn2 && isSolution2; - if(j > 0 && ( - (std::abs(delta.x - prevDelta.x) < 0.01 && std::abs(delta.y - prevDelta.y) < 0.01))) + if ( isSink1 != isSink2) { - nextPts[ptidx] -= delta*0.5f; - break; - } + a = isSink1 ? _x[0] : _x[1]; + b = isSink1 ? _y[0] : _y[1]; + ab = a * b; + hasSolved = true; + delta.x = inextPt.x + a - nextPt.x; + delta.y = inextPt.y + b - nextPt.y; + } // isIn1 != isIn2 + } + if( hasSolved == false) + noIteration++; + } + else + { + hasSolved = false; + noReusedIteration++; + } + if( hasSolved == false ) + { + cv::Vec2f mismatchVec = ab * Mc0 + Mc1 *a + Mc2 * b + Mc3; + delta.x = (A12 * mismatchVec.val[1] - A22 * mismatchVec.val[0]) * D; + delta.y = (A12 * mismatchVec.val[0] - A11 * mismatchVec.val[1]) * D; + delta.x = MAX(-1.f, MIN(1.f, delta.x)); + delta.y = MAX(-1.f, MIN(1.f, delta.y)); + nextPt += delta; + nextPts[ptidx] = nextPt + halfWin; + } + else + { + nextPt += delta; + nextPts[ptidx] = nextPt + halfWin; + noSolvedIteration++; + break; + } - prevDelta = delta; - } + delta.x = ( delta.x != delta.x) ? 0 : delta.x; + delta.y = ( delta.y != delta.y) ? 
0 : delta.y; + if(j > 0 && ( + (std::abs(delta.x - prevDelta.x) < 0.01 && std::abs(delta.y - prevDelta.y) < 0.01))) + { + nextPts[ptidx] -= delta*0.5f; + break; } + prevDelta = delta; } - const Mat* prevImg; - const Mat* nextImg; - const Mat* prevDeriv; - const Mat* rgbPrevImg; - const Mat* rgbNextImg; - const Point2f* prevPts; - Point2f* nextPts; - uchar* status; - float* err; - int maxWinSize; - int minWinSize; - TermCriteria criteria; - int level; - int maxLevel; - int windowType; - float minEigThreshold; - bool useInitialFlow; - std::vector paramReset; - int crossSegmentationThreshold; - - }; + } + + } + + const Mat* prevImg; + const Mat* nextImg; + const Mat* prevDeriv; + const Mat* rgbPrevImg; + const Mat* rgbNextImg; + const Point2f* prevPts; + Point2f* nextPts; + uchar* status; + float* err; + int maxWinSize; + int minWinSize; + TermCriteria criteria; + int level; + int maxLevel; + int windowType; + float minEigThreshold; + bool useInitialFlow; + std::vector paramReset; + int crossSegmentationThreshold; + +}; + +} // namespace +namespace radial { +class TrackerInvoker : public cv::ParallelLoopBody +{ +public: + TrackerInvoker( + const Mat& _prevImg, + const Mat& _prevDeriv, + const Mat& _nextImg, + const Mat& _rgbPrevImg, + const Mat& _rgbNextImg, + const Point2f* _prevPts, + Point2f* _nextPts, + uchar* _status, + float* _err, + Point2f* _gainVecs, + int _level, + int _maxLevel, + int _winSize[2], + int _maxIteration, + bool _useInitialFlow, + int _supportRegionType, + int _crossSegmentationThreshold, + std::vector _normSigmas, + float _minEigenValue) + { + prevImg = &_prevImg; + prevDeriv = &_prevDeriv; + nextImg = &_nextImg; + rgbPrevImg = &_rgbPrevImg; + rgbNextImg = &_rgbNextImg; + prevPts = _prevPts; + nextPts = _nextPts; + status = _status; + err = _err; + gainVecs = _gainVecs; + minWinSize = _winSize[0]; + maxWinSize = _winSize[1]; + criteria.maxCount = _maxIteration; + criteria.epsilon = 0.01; + level = _level; + maxLevel = _maxLevel; + windowType = 
_supportRegionType; + minEigThreshold = _minEigenValue; + paramReset = _normSigmas; + useInitialFlow = _useInitialFlow; + crossSegmentationThreshold = _crossSegmentationThreshold; } - namespace radial + + TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; + void operator()(const cv::Range& range) const CV_OVERRIDE { - class TrackerInvoker : public cv::ParallelLoopBody +#ifdef DEBUG_INVOKER + printf("berlof::radial");fflush(stdout); +#endif + Point2f halfWin; + cv::Size winSize; + const Mat& I = *prevImg; + const Mat& J = *nextImg; + const Mat& derivI = *prevDeriv; + const Mat& BI = *rgbPrevImg; + const float FLT_SCALE = (1.f/(1 << 16));//(1.f/(1 << 20)); // 20 + winSize = cv::Size(maxWinSize,maxWinSize); + int winMaskwidth = static_cast(ceil(winSize.width / 16.f)) * 16; + cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); + winMaskMatBuf.setTo(1); + + std::vector param = paramReset; + int cn = I.channels(), cn2 = cn*2; + int winbufwidth = static_cast(ceil(winSize.width / 16.f)) * 16; + cv::Size winBufSize(winbufwidth,winbufwidth); + + + cv::Mat invTensorMat(4,4, CV_32FC1); + cv::Mat mismatchMat(4, 1, CV_32FC1); + cv::Mat resultMat(4, 1, CV_32FC1); + + cv::AutoBuffer _buf(winBufSize.area()*(cn + cn2)); + int derivDepth = DataType::depth; + + Mat IWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn), (deriv_type*)_buf.data()); + Mat derivIWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn2), (deriv_type*)_buf.data() + winBufSize.area()*cn); + + for( int ptidx = range.start; ptidx < range.end; ptidx++ ) { - public: - TrackerInvoker( - const Mat& _prevImg, - const Mat& _prevDeriv, - const Mat& _nextImg, - const Mat& _rgbPrevImg, - const Mat& _rgbNextImg, - const Point2f* _prevPts, - Point2f* _nextPts, - uchar* _status, - float* _err, - Point2f* _gainVecs, - int _level, - int _maxLevel, - int _winSize[2], - int _maxIteration, - bool _useInitialFlow, - int _supportRegionType, - int _crossSegmentationThreshold, - std::vector _normSigmas, - float 
_minEigenValue) + Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level)); + Point2f nextPt; + if( level == maxLevel ) { - prevImg = &_prevImg; - prevDeriv = &_prevDeriv; - nextImg = &_nextImg; - rgbPrevImg = &_rgbPrevImg; - rgbNextImg = &_rgbNextImg; - prevPts = _prevPts; - nextPts = _nextPts; - status = _status; - err = _err; - gainVecs = _gainVecs; - minWinSize = _winSize[0]; - maxWinSize = _winSize[1]; - criteria.maxCount = _maxIteration; - criteria.epsilon = 0.01; - level = _level; - maxLevel = _maxLevel; - windowType = _supportRegionType; - minEigThreshold = _minEigenValue; - paramReset = _normSigmas; - useInitialFlow = _useInitialFlow; - crossSegmentationThreshold = _crossSegmentationThreshold; + if( useInitialFlow ) + nextPt = nextPts[ptidx]*(float)(1./(1 << level)); + else + nextPt = prevPt; } - - TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; - void operator()(const cv::Range& range) const CV_OVERRIDE + else + nextPt = nextPts[ptidx]*2.f; + nextPts[ptidx] = nextPt; + + Point2i iprevPt, inextPt; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + int winArea = maxWinSize * maxWinSize; + cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0,0, maxWinSize,maxWinSize)); + winMaskMatBuf.setTo(0); + if( calcWinMaskMat(BI, windowType, iprevPt, + winMaskMat,winSize,halfWin,winArea, + minWinSize,maxWinSize, crossSegmentationThreshold) == false) + continue; + + prevPt -= halfWin; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + + if( iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols || + iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows ) { -#ifdef DEBUG_INVOKER - printf("berlof::radial");fflush(stdout); + if( level == 0 ) + { + if( status ) + status[ptidx] = 3; + if( err ) + err[ptidx] = 0; + } + continue; + } + + float a = prevPt.x - iprevPt.x; + float b = prevPt.y - iprevPt.y; + const int W_BITS = 14, W_BITS1 = 14; + + int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + int iw01 = cvRound(a*(1.f - 
b)*(1 << W_BITS)); + int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + + int dstep = (int)(derivI.step/derivI.elemSize1()); + int step = (int)(I.step/I.elemSize1()); + int mStep = (int)(winMaskMat.step/winMaskMat.elemSize1()); + CV_Assert( step == (int)(J.step/J.elemSize1()) ); + + float A11 = 0, A12 = 0, A22 = 0; + + // tensor + float sumIx = 0; + float sumIy = 0; + float sumI = 0; + float sumW = 0; + float w1 = 0, w2 = 0; // -IyI + float dI = 0; // I^2 + float D = 0; + +#ifdef RLOF_SSE + + __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128i z = _mm_setzero_si128(); + __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1-1)); + __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1-5-1)); + __m128i mmMask4_epi32; + __m128i mmMaskSet_epi16 = _mm_set1_epi16(std::numeric_limits::max()); + get4BitMask(winSize.width, mmMask4_epi32); #endif - Point2f halfWin; - cv::Size winSize; - const Mat& I = *prevImg; - const Mat& J = *nextImg; - const Mat& derivI = *prevDeriv; - const Mat& BI = *rgbPrevImg; - const float FLT_SCALE = (1.f/(1 << 16));//(1.f/(1 << 20)); // 20 - winSize = cv::Size(maxWinSize,maxWinSize); - int winMaskwidth = static_cast(ceil(winSize.width / 16.f)) * 16; - cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); - winMaskMatBuf.setTo(1); - - std::vector param = paramReset; - int cn = I.channels(), cn2 = cn*2; - int winbufwidth = static_cast(ceil(winSize.width / 16.f)) * 16; - cv::Size winBufSize(winbufwidth,winbufwidth); - - - cv::Mat invTensorMat(4,4, CV_32FC1); - cv::Mat mismatchMat(4, 1, CV_32FC1); - cv::Mat resultMat(4, 1, CV_32FC1); - - cv::AutoBuffer _buf(winBufSize.area()*(cn + cn2)); - int derivDepth = DataType::depth; - - Mat IWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn), (deriv_type*)_buf.data()); - Mat derivIWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn2), (deriv_type*)_buf.data() + winBufSize.area()*cn); - - for( int ptidx = range.start; 
ptidx < range.end; ptidx++ ) + + // extract the patch from the first image, compute covariation matrix of derivatives + int x, y; + for( y = 0; y < winSize.height; y++ ) + { + x = 0; + + const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; + const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; + + short* Iptr = (short*)(IWinBuf.data + y*IWinBuf.step); + short* dIptr = (short*)(derivIWinBuf.data + y*derivIWinBuf.step); +#ifdef RLOF_SSE + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + for( ; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 ) { - Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level)); - Point2f nextPt; - if( level == maxLevel ) + __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16); + __m128i mask_0_3_epi16 = _mm_unpacklo_epi16(mask_0_7_epi16, mask_0_7_epi16); + + + __m128i v00, v01, v10, v11, t0, t1; + v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); + v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); + v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); + v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); + if( x + 4 > winSize.width) { - if( useInitialFlow ) - nextPt = nextPts[ptidx]*(float)(1./(1 << level)); - else - nextPt = prevPt; + t0 = _mm_and_si128(t0, mmMask4_epi32); } - else - nextPt = nextPts[ptidx]*2.f; - nextPts[ptidx] = nextPt; - - Point2i iprevPt, inextPt; - iprevPt.x = cvFloor(prevPt.x); - iprevPt.y = cvFloor(prevPt.y); - int winArea = maxWinSize * maxWinSize; - cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0,0, maxWinSize,maxWinSize)); - winMaskMatBuf.setTo(0); - if( 
calcWinMaskMat(BI, windowType, iprevPt, - winMaskMat,winSize,halfWin,winArea, - minWinSize,maxWinSize, crossSegmentationThreshold) == false) - continue; - - prevPt -= halfWin; - iprevPt.x = cvFloor(prevPt.x); - iprevPt.y = cvFloor(prevPt.y); + t0 = _mm_and_si128(t0, mask_0_3_epi16); + _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0,t0)); + + + v00 = _mm_loadu_si128((const __m128i*)(dsrc)); + v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); + v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep)); + v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); + v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... 
+ if( x + 4 > winSize.width) + { + v00 = _mm_and_si128(v00, mmMask4_epi32); + } + v00 = _mm_and_si128(v00, mask_0_3_epi16); + _mm_storeu_si128((__m128i*)dIptr, v00); + } +#else - if( iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols || - iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows ) + for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2 ) + { + if( winMaskMat.at(y,x) == 0) { - if( level == 0 ) - { - if( status ) - status[ptidx] = 3; - if( err ) - err[ptidx] = 0; - } + dIptr[0] = 0; + dIptr[1] = 0; continue; } + int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + + src[x+step]*iw10 + src[x+step+cn]*iw11, W_BITS1-5); + int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + + dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 + + dsrc[dstep+cn2+1]*iw11, W_BITS1); + + Iptr[x] = (short)ival; + dIptr[0] = (short)ixval; + dIptr[1] = (short)iyval; + + } +#endif + } - float a = prevPt.x - iprevPt.x; - float b = prevPt.y - iprevPt.y; - const int W_BITS = 14, W_BITS1 = 14; + cv::Mat residualMat = cv::Mat::zeros(winSize.height * (winSize.width + 8) * cn, 1, CV_16SC1); - int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); - int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); - int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); - int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + cv::Point2f backUpNextPt = nextPt; + nextPt -= halfWin; + Point2f prevDelta(0,0); //relates to h(t-1) + Point2f prevGain(1,0); + cv::Point2f gainVec = gainVecs[ptidx]; + cv::Point2f backUpGain = gainVec; + cv::Size _winSize = winSize; + int j; +#ifdef RLOF_SSE + __m128i mmMask0, mmMask1, mmMask; + getWBitMask(_winSize.width, mmMask0, mmMask1, mmMask); + __m128 mmOnes = _mm_set1_ps(1.f ); +#endif + float MEstimatorScale = 1; + int buffIdx = 0; + float c[8]; + cv::Mat GMc0, GMc1, GMc2, GMc3; + cv::Vec4f Mc0, Mc1, Mc2, Mc3; + int noIteration = 0; + int noReusedIteration = 0; + int noSolvedIteration = 0; + for( j = 0; 
j < criteria.maxCount; j++ ) + { + cv::Point2f delta(0,0); + cv::Point2f deltaGain(0,0); + bool hasSolved = false; + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + float ab = a * b; + if( j == 0 + || (inextPt.x != cvFloor(nextPt.x) || inextPt.y != cvFloor(nextPt.y) || j % 2 != 0) ) + { + inextPt.x = cvFloor(nextPt.x); + inextPt.y = cvFloor(nextPt.y); + + if( inextPt.x < -winSize.width || inextPt.x >= J.cols || + inextPt.y < -winSize.height || inextPt.y >= J.rows ) + { + if( level == 0 && status ) + status[ptidx] = 3; + if (level > 0) + { + nextPts[ptidx] = backUpNextPt; + gainVecs[ptidx] = backUpGain; + } + noIteration++; + break; + } - int dstep = (int)(derivI.step/derivI.elemSize1()); - int step = (int)(I.step/I.elemSize1()); - int mStep = (int)(winMaskMat.step/winMaskMat.elemSize1()); - CV_Assert( step == (int)(J.step/J.elemSize1()) ); - float A11 = 0, A12 = 0, A22 = 0; + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + ab = a * b; + iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + // mismatch + if( j == 0) + { // tensor - float sumIx = 0; - float sumIy = 0; - float sumI = 0; - float sumW = 0; - float w1 = 0, w2 = 0; // -IyI - float dI = 0; // I^2 - float D = 0; - - #ifdef RLOF_SSE - - __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); - __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); - __m128i z = _mm_setzero_si128(); - __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1-1)); - __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1-5-1)); - __m128i mmMask4_epi32; - __m128i mmMaskSet_epi16 = _mm_set1_epi16(std::numeric_limits::max()); - get4BitMask(winSize.width, mmMask4_epi32); - #endif - - // extract the patch from the first image, compute covariation matrix of derivatives - int x, y; + w1 = 0; // -IxI + w2 = 0; // -IyI + dI = 0; // I^2 + sumIx = 0; + sumIy = 0; + sumI = 0; + sumW = 0; + A11 = 0; + A12 = 0; + A22 = 0; 
+ } + + if ( j == 0 ) + { + buffIdx = 0; for( y = 0; y < winSize.height; y++ ) { + const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y )*step + (inextPt.x )*cn; + const short* Iptr = (const short*)(IWinBuf.data + (y)*IWinBuf.step); + const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; x = 0; + for( ; x < winSize.width*cn; x++, dIptr += 2) + { + if( maskPtr[x] == 0) + continue; + int diff = static_cast(CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, W_BITS1-5) + - Iptr[x] + Iptr[x] * gainVec.x +gainVec.y); + residualMat.at(buffIdx++) = static_cast(diff); + } + } + /*! Estimation for the residual */ + cv::Mat residualMatRoi(residualMat, cv::Rect(0,0,1, buffIdx)); + MEstimatorScale = (buffIdx == 0) ? 1.f : estimateScale(residualMatRoi); + } - const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; - const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; + float eta = 1.f / winArea; + float fParam0 = param[0] * 32.f; + float fParam1 = param[1] * 32.f; + fParam0 = param[0] * MEstimatorScale; + fParam1 = param[1] * MEstimatorScale; - short* Iptr = (short*)(IWinBuf.data + y*IWinBuf.step); - short* dIptr = (short*)(derivIWinBuf.data + y*derivIWinBuf.step); - #ifdef RLOF_SSE - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - for( ; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 ) - { - __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16); - __m128i mask_0_3_epi16 = _mm_unpacklo_epi16(mask_0_7_epi16, mask_0_7_epi16); +#ifdef RLOF_SSE + qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128 qb0[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; + __m128 qb1[4] = 
{_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; + __m128 qb2[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; + __m128 qb3[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; + __m128 mmSumW1 = _mm_setzero_ps(), mmSumW2 = _mm_setzero_ps(); + __m128 mmSumI = _mm_setzero_ps(), mmSumW = _mm_setzero_ps(), mmSumDI = _mm_setzero_ps(); + __m128 mmSumIy = _mm_setzero_ps(), mmSumIx = _mm_setzero_ps(); + __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps(); + __m128i mmParam0 = _mm_set1_epi16(MIN(std::numeric_limits::max() -1, static_cast(fParam0))); + __m128i mmParam1 = _mm_set1_epi16(MIN(std::numeric_limits::max()- 1, static_cast(fParam1))); + + + float s2Val = param[2] > 0 ? param[2] : -param[2]; + int s2bitShift = param[2] == 0 ? 1 : static_cast(ceil(log(200.f / s2Val) / log(2.f))); + __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast(param[2] * (float)(1 << s2bitShift))); + __m128i mmOness_epi16 = _mm_set1_epi16(1 << s2bitShift); + __m128 mmParam2s = _mm_set1_ps(0.01f * param[2]); + __m128 mmParam2s2 = _mm_set1_ps(param[2] * param[2]); + float gainVal = gainVec.x > 0 ? gainVec.x : -gainVec.x; + int bitShift = gainVec.x == 0 ? 
1 : static_cast(ceil(log(200.f / gainVal) / log(2.f))); + __m128i mmGainValue_epi16 = _mm_set1_epi16(static_cast(gainVec.x * (float)(1 << bitShift))); + __m128i mmConstValue_epi16 = _mm_set1_epi16(static_cast(gainVec.y)); + __m128i mmEta = _mm_setzero_si128(); + __m128i mmScale = _mm_set1_epi16(static_cast(MEstimatorScale)); - __m128i v00, v01, v10, v11, t0, t1; - v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); - v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); - v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); - v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); +#endif - t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); - t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); - if( x + 4 > winSize.width) - { - t0 = _mm_and_si128(t0, mmMask4_epi32); - } - t0 = _mm_and_si128(t0, mask_0_3_epi16); - _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0,t0)); - - - v00 = _mm_loadu_si128((const __m128i*)(dsrc)); - v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); - v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep)); - v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); - - t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); - t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); - t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); - t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); - v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... 
- if( x + 4 > winSize.width) - { - v00 = _mm_and_si128(v00, mmMask4_epi32); - } - v00 = _mm_and_si128(v00, mask_0_3_epi16); - _mm_storeu_si128((__m128i*)dIptr, v00); - } - #else + buffIdx = 0; + float _b0[4] = {0,0,0,0}; + float _b1[4] = {0,0,0,0}; + float _b2[4] = {0,0,0,0}; + float _b3[4] = {0,0,0,0}; + /* + */ + for( y = 0; y < _winSize.height; y++ ) + { + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x )*cn; + const short* Iptr = (const short*)(IWinBuf.data + y*IWinBuf.step); + const short* dIptr = (const short*)(derivIWinBuf.data + y*derivIWinBuf.step); - for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2 ) - { - if( winMaskMat.at(y,x) == 0) - { - dIptr[0] = 0; - dIptr[1] = 0; - continue; - } - int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + - src[x+step]*iw10 + src[x+step+cn]*iw11, W_BITS1-5); - int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + - dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1); - int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 + - dsrc[dstep+cn2+1]*iw11, W_BITS1); + x = 0; +#ifdef RLOF_SSE + for( ; x <= _winSize.width*cn; x += 8, dIptr += 8*2 ) + { + __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16); + __m128i I_0_7_epi16 = _mm_loadu_si128((const __m128i*)(Iptr + x)); + + __m128i v00 = _mm_unpacklo_epi8( + _mm_loadl_epi64((const __m128i*)(Jptr + x)), z); + __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); + __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); + __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); + + __m128i t0 = _mm_add_epi32 + (_mm_madd_epi16( + _mm_unpacklo_epi16(v00, v01), + qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + __m128i t1 = 
_mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1-5); - Iptr[x] = (short)ival; - dIptr[0] = (short)ixval; - dIptr[1] = (short)iyval; + __m128i lo = _mm_mullo_epi16(mmGainValue_epi16, I_0_7_epi16); + __m128i hi = _mm_mulhi_epi16(mmGainValue_epi16, I_0_7_epi16); - } - #endif - } + __m128i Igain_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), bitShift); + __m128i Igain_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), bitShift); + __m128i Igain_epi16 = _mm_packs_epi32(Igain_0_3_epi32, Igain_4_7_epi32); - cv::Mat residualMat = cv::Mat::zeros(winSize.height * (winSize.width + 8) * cn, 1, CV_16SC1); - - cv::Point2f backUpNextPt = nextPt; - nextPt -= halfWin; - Point2f prevDelta(0,0); //relates to h(t-1) - Point2f prevGain(1,0); - cv::Point2f gainVec = gainVecs[ptidx]; - cv::Point2f backUpGain = gainVec; - cv::Size _winSize = winSize; - int j; - #ifdef RLOF_SSE - __m128i mmMask0, mmMask1, mmMask; - getWBitMask(_winSize.width, mmMask0, mmMask1, mmMask); - __m128 mmOnes = _mm_set1_ps(1.f ); - #endif - float MEstimatorScale = 1; - int buffIdx = 0; - float c[8]; - cv::Mat GMc0, GMc1, GMc2, GMc3; - cv::Vec4f Mc0, Mc1, Mc2, Mc3; - int noIteration = 0; - int noReusedIteration = 0; - int noSolvedIteration = 0; - for( j = 0; j < criteria.maxCount; j++ ) - { - cv::Point2f delta(0,0); - cv::Point2f deltaGain(0,0); - bool hasSolved = false; - a = nextPt.x - inextPt.x; - b = nextPt.y - inextPt.y; - float ab = a * b; - if( j == 0 - || (inextPt.x != cvFloor(nextPt.x) || inextPt.y != cvFloor(nextPt.y) || j % 2 != 0) ) - { - inextPt.x = cvFloor(nextPt.x); - inextPt.y = cvFloor(nextPt.y); - if( inextPt.x < -winSize.width || inextPt.x >= J.cols || - inextPt.y < -winSize.height || inextPt.y >= J.rows ) + __m128i diffValue = _mm_subs_epi16(_mm_add_epi16(Igain_epi16, mmConstValue_epi16), 
I_0_7_epi16); + __m128i mmDiffc_epi16[4] = { - if( level == 0 && status ) - status[ptidx] = 3; - if (level > 0) - { - nextPts[ptidx] = backUpNextPt; - gainVecs[ptidx] = backUpGain; - } - noIteration++; - break; - } + _mm_add_epi16(_mm_slli_epi16(v11, 5), diffValue), + _mm_add_epi16(_mm_slli_epi16(v01, 5), diffValue), + _mm_add_epi16(_mm_slli_epi16(v10, 5), diffValue), + _mm_add_epi16(_mm_slli_epi16(v00, 5), diffValue) + }; + __m128i mmDiff_epi16 = _mm_add_epi16( _mm_packs_epi32(t0, t1), diffValue); - a = nextPt.x - inextPt.x; - b = nextPt.y - inextPt.y; - ab = a * b; - iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); - iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); - iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); - iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - // mismatch - if( j == 0) - { - // tensor - w1 = 0; // -IxI - w2 = 0; // -IyI - dI = 0; // I^2 - sumIx = 0; - sumIy = 0; - sumI = 0; - sumW = 0; - A11 = 0; - A12 = 0; - A22 = 0; - } + mmDiff_epi16 = _mm_and_si128(mmDiff_epi16, mask_0_7_epi16); - if ( j == 0 ) - { - buffIdx = 0; - for( y = 0; y < winSize.height; y++ ) - { - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y )*step + (inextPt.x )*cn; - const short* Iptr = (const short*)(IWinBuf.data + (y)*IWinBuf.step); - const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - x = 0; - for( ; x < winSize.width*cn; x++, dIptr += 2) - { - if( maskPtr[x] == 0) - continue; - int diff = static_cast(CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, W_BITS1-5) - - Iptr[x] + Iptr[x] * gainVec.x +gainVec.y); - residualMat.at(buffIdx++) = static_cast(diff); - } - } - /*! Estimation for the residual */ - cv::Mat residualMatRoi(residualMat, cv::Rect(0,0,1, buffIdx)); - MEstimatorScale = (buffIdx == 0) ? 
1.f : estimateScale(residualMatRoi); - } + __m128i scalediffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, mmScale); + mmEta = _mm_add_epi16(mmEta, _mm_add_epi16(_mm_and_si128(scalediffIsPos_epi16, _mm_set1_epi16(2)), _mm_set1_epi16(-1))); - float eta = 1.f / winArea; - float fParam0 = param[0] * 32.f; - float fParam1 = param[1] * 32.f; - fParam0 = param[0] * MEstimatorScale; - fParam1 = param[1] * MEstimatorScale; -#ifdef RLOF_SSE + __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); + __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); - qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); - qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); - __m128 qb0[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; - __m128 qb1[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; - __m128 qb2[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; - __m128 qb3[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; - __m128 mmSumW1 = _mm_setzero_ps(), mmSumW2 = _mm_setzero_ps(); - __m128 mmSumI = _mm_setzero_ps(), mmSumW = _mm_setzero_ps(), mmSumDI = _mm_setzero_ps(); - __m128 mmSumIy = _mm_setzero_ps(), mmSumIx = _mm_setzero_ps(); - __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps(); - __m128i mmParam0 = _mm_set1_epi16(MIN(std::numeric_limits::max() -1, static_cast(fParam0))); - __m128i mmParam1 = _mm_set1_epi16(MIN(std::numeric_limits::max()- 1, static_cast(fParam1))); - - - float s2Val = param[2] > 0 ? param[2] : -param[2]; - int s2bitShift = param[2] == 0 ? 1 : static_cast(ceil(log(200.f / s2Val) / log(2.f))); - __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast(param[2] * (float)(1 << s2bitShift))); - __m128i mmOness_epi16 = _mm_set1_epi16(1 << s2bitShift); - __m128 mmParam2s = _mm_set1_ps(0.01f * param[2]); - __m128 mmParam2s2 = _mm_set1_ps(param[2] * param[2]); - float gainVal = gainVec.x > 0 ? gainVec.x : -gainVec.x; - int bitShift = gainVec.x == 0 ? 
1 : static_cast(ceil(log(200.f / gainVal) / log(2.f))); - __m128i mmGainValue_epi16 = _mm_set1_epi16(static_cast(gainVec.x * (float)(1 << bitShift))); - __m128i mmConstValue_epi16 = _mm_set1_epi16(static_cast(gainVec.y)); - __m128i mmEta = _mm_setzero_si128(); - __m128i mmScale = _mm_set1_epi16(static_cast(MEstimatorScale)); -#endif - - buffIdx = 0; - float _b0[4] = {0,0,0,0}; - float _b1[4] = {0,0,0,0}; - float _b2[4] = {0,0,0,0}; - float _b3[4] = {0,0,0,0}; - /* - */ - for( y = 0; y < _winSize.height; y++ ) - { - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x )*cn; - const short* Iptr = (const short*)(IWinBuf.data + y*IWinBuf.step); - const short* dIptr = (const short*)(derivIWinBuf.data + y*derivIWinBuf.step); + __m128i abs_epi16 = _mm_abs_epi16(mmDiff_epi16); + __m128i bSet2_epi16, bSet1_epi16; + // |It| < sigma1 ? + bSet2_epi16 = _mm_cmplt_epi16(abs_epi16, mmParam1); + // It > 0 ? + __m128i diffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, _mm_setzero_si128()); + // sigma0 < |It| < sigma1 ? + bSet1_epi16 = _mm_and_si128(bSet2_epi16, _mm_cmpgt_epi16(abs_epi16, mmParam0)); + // val = |It| -/+ sigma1 + __m128i tmpParam1_epi16 = _mm_add_epi16(_mm_and_si128(diffIsPos_epi16, _mm_sub_epi16(mmDiff_epi16, mmParam1)), + _mm_andnot_si128(diffIsPos_epi16, _mm_add_epi16(mmDiff_epi16, mmParam1))); + // It == 0 ? 
|It| > sigma13 + mmDiff_epi16 = _mm_and_si128(bSet2_epi16, mmDiff_epi16); - x = 0; -#ifdef RLOF_SSE - for( ; x <= _winSize.width*cn; x += 8, dIptr += 8*2 ) + for( unsigned int mmi = 0; mmi < 4; mmi++) { - __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16); - __m128i I_0_7_epi16 = _mm_loadu_si128((const __m128i*)(Iptr + x)); - - __m128i v00 = _mm_unpacklo_epi8( - _mm_loadl_epi64((const __m128i*)(Jptr + x)), z); - __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); - __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); - __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); - - __m128i t0 = _mm_add_epi32 - (_mm_madd_epi16( - _mm_unpacklo_epi16(v00, v01), - qw0), - _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); - __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + __m128i mmDiffc_epi16_t = _mm_and_si128(mmDiffc_epi16[mmi], mask_0_7_epi16); + mmDiffc_epi16_t = _mm_and_si128(bSet2_epi16, mmDiffc_epi16_t); + + // It == val ? sigma0 < |It| < sigma1 + mmDiffc_epi16_t = _mm_blendv_epi8(mmDiffc_epi16_t, tmpParam1_epi16, bSet1_epi16); + __m128i tale_epi16_ = _mm_blendv_epi8(mmOness_epi16, mmParam2_epi16, bSet1_epi16); // mask for 0 - 3 + // diff = diff * sigma2 + lo = _mm_mullo_epi16(tale_epi16_, mmDiffc_epi16_t); + hi = _mm_mulhi_epi16(tale_epi16_, mmDiffc_epi16_t); + __m128i diff_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), s2bitShift); + __m128i diff_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), s2bitShift); + + mmDiffc_epi16_t = _mm_packs_epi32(diff_0_3_epi32, diff_4_7_epi32); + __m128i diff1 = _mm_unpackhi_epi16(mmDiffc_epi16_t, mmDiffc_epi16_t); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13... 
+ __m128i diff0 = _mm_unpacklo_epi16(mmDiffc_epi16_t, mmDiffc_epi16_t); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9... + + // Ix * diff / Iy * diff + v10 = _mm_mullo_epi16(Ixy_0, diff0); + v11 = _mm_mulhi_epi16(Ixy_0, diff0); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); - t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); - t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1-5); + qb0[mmi] = _mm_add_ps(qb0[mmi], _mm_cvtepi32_ps(v00)); + qb1[mmi] = _mm_add_ps(qb1[mmi], _mm_cvtepi32_ps(v10)); + // It * Ix It * Iy [4 ... 7] + // for set 1 hi sigma 1 + v10 = _mm_mullo_epi16(Ixy_1, diff1); + v11 = _mm_mulhi_epi16(Ixy_1, diff1); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb0[mmi] = _mm_add_ps(qb0[mmi], _mm_cvtepi32_ps(v00)); + qb1[mmi] = _mm_add_ps(qb1[mmi], _mm_cvtepi32_ps(v10)); + // diff * J [0 ... 7] + // for set 1 sigma 1 + // b3 += diff * Iptr[x] + v10 = _mm_mullo_epi16(mmDiffc_epi16_t, I_0_7_epi16); + v11 = _mm_mulhi_epi16(mmDiffc_epi16_t, I_0_7_epi16); + v00 = _mm_unpacklo_epi16(v10, v11); - __m128i lo = _mm_mullo_epi16(mmGainValue_epi16, I_0_7_epi16); - __m128i hi = _mm_mulhi_epi16(mmGainValue_epi16, I_0_7_epi16); + v10 = _mm_unpackhi_epi16(v10, v11); + qb2[mmi] = _mm_add_ps(qb2[mmi], _mm_cvtepi32_ps(v00)); + qb2[mmi] = _mm_add_ps(qb2[mmi], _mm_cvtepi32_ps(v10)); + qb3[mmi] = _mm_add_ps(qb3[mmi], _mm_cvtepi32_ps(diff_0_3_epi32)); + qb3[mmi] = _mm_add_ps(qb3[mmi], _mm_cvtepi32_ps(diff_4_7_epi32)); + } - __m128i Igain_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), bitShift); - __m128i Igain_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), bitShift); - __m128i Igain_epi16 = _mm_packs_epi32(Igain_0_3_epi32, Igain_4_7_epi32); + if( j == 0 ) + { + __m128 bSet1_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet1_epi16)); + __m128 bSet1_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet1_epi16,bSet1_epi16), 16)); + __m128 mask_0_4_ps = 
_mm_cvtepi32_ps(_mm_cvtepi16_epi32(mask_0_7_epi16)); + __m128 mask_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mask_0_7_epi16, mask_0_7_epi16),16))); + __m128 bSet2_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet2_epi16)); + __m128 bSet2_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet2_epi16, bSet2_epi16),16)); - __m128i diffValue = _mm_subs_epi16(_mm_add_epi16(Igain_epi16, mmConstValue_epi16), I_0_7_epi16); - __m128i mmDiffc_epi16[4] = - { - _mm_add_epi16(_mm_slli_epi16(v11, 5), diffValue), - _mm_add_epi16(_mm_slli_epi16(v01, 5), diffValue), - _mm_add_epi16(_mm_slli_epi16(v10, 5), diffValue), - _mm_add_epi16(_mm_slli_epi16(v00, 5), diffValue) - }; + __m128 tale_0_3_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_0_3_ps); + __m128 tale_4_7_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_4_7_ps); + tale_0_3_ps = _mm_blendv_ps(mmParam2s, tale_0_3_ps, bSet2_0_3_ps); + tale_4_7_ps = _mm_blendv_ps(mmParam2s, tale_4_7_ps, bSet2_4_7_ps); - __m128i mmDiff_epi16 = _mm_add_epi16( _mm_packs_epi32(t0, t1), diffValue); + tale_0_3_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_0_3_ps, mask_0_4_ps); + tale_4_7_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_4_7_ps, mask_4_7_ps); + t0 = _mm_srai_epi32(Ixy_0, 16); // Iy0 Iy1 Iy2 Iy3 + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16); // Ix0 Ix1 Ix2 Ix3 - mmDiff_epi16 = _mm_and_si128(mmDiff_epi16, mask_0_7_epi16); + __m128 fy = _mm_cvtepi32_ps(t0); + __m128 fx = _mm_cvtepi32_ps(t1); - __m128i scalediffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, mmScale); - mmEta = _mm_add_epi16(mmEta, _mm_add_epi16(_mm_and_si128(scalediffIsPos_epi16, _mm_set1_epi16(2)), _mm_set1_epi16(-1))); + // 0 ... 
3 + __m128 I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(I_0_7_epi16, I_0_7_epi16), 16)); + // A11 - A22 + __m128 fxtale = _mm_mul_ps(fx, tale_0_3_ps); + __m128 fytale = _mm_mul_ps(fy, tale_0_3_ps); - __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); - __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); + mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); + mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); + // sumIx und sumIy + mmSumIx = _mm_add_ps(mmSumIx, fxtale); + mmSumIy = _mm_add_ps(mmSumIy, fytale); - __m128i abs_epi16 = _mm_abs_epi16(mmDiff_epi16); - __m128i bSet2_epi16, bSet1_epi16; - // |It| < sigma1 ? - bSet2_epi16 = _mm_cmplt_epi16(abs_epi16, mmParam1); - // It > 0 ? - __m128i diffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, _mm_setzero_si128()); - // sigma0 < |It| < sigma1 ? - bSet1_epi16 = _mm_and_si128(bSet2_epi16, _mm_cmpgt_epi16(abs_epi16, mmParam0)); - // val = |It| -/+ sigma1 - __m128i tmpParam1_epi16 = _mm_add_epi16(_mm_and_si128(diffIsPos_epi16, _mm_sub_epi16(mmDiff_epi16, mmParam1)), - _mm_andnot_si128(diffIsPos_epi16, _mm_add_epi16(mmDiff_epi16, mmParam1))); - // It == 0 ? |It| > sigma13 - mmDiff_epi16 = _mm_and_si128(bSet2_epi16, mmDiff_epi16); + mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fxtale)); + mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fytale)); - for( unsigned int mmi = 0; mmi < 4; mmi++) - { - __m128i mmDiffc_epi16_t = _mm_and_si128(mmDiffc_epi16[mmi], mask_0_7_epi16); - mmDiffc_epi16_t = _mm_and_si128(bSet2_epi16, mmDiffc_epi16_t); - - // It == val ? 
sigma0 < |It| < sigma1 - mmDiffc_epi16_t = _mm_blendv_epi8(mmDiffc_epi16_t, tmpParam1_epi16, bSet1_epi16); - __m128i tale_epi16_ = _mm_blendv_epi8(mmOness_epi16, mmParam2_epi16, bSet1_epi16); // mask for 0 - 3 - // diff = diff * sigma2 - lo = _mm_mullo_epi16(tale_epi16_, mmDiffc_epi16_t); - hi = _mm_mulhi_epi16(tale_epi16_, mmDiffc_epi16_t); - __m128i diff_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), s2bitShift); - __m128i diff_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), s2bitShift); - - mmDiffc_epi16_t = _mm_packs_epi32(diff_0_3_epi32, diff_4_7_epi32); - __m128i diff1 = _mm_unpackhi_epi16(mmDiffc_epi16_t, mmDiffc_epi16_t); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13... - __m128i diff0 = _mm_unpacklo_epi16(mmDiffc_epi16_t, mmDiffc_epi16_t); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9... - - // Ix * diff / Iy * diff - v10 = _mm_mullo_epi16(Ixy_0, diff0); - v11 = _mm_mulhi_epi16(Ixy_0, diff0); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - - qb0[mmi] = _mm_add_ps(qb0[mmi], _mm_cvtepi32_ps(v00)); - qb1[mmi] = _mm_add_ps(qb1[mmi], _mm_cvtepi32_ps(v10)); - // It * Ix It * Iy [4 ... 7] - // for set 1 hi sigma 1 - v10 = _mm_mullo_epi16(Ixy_1, diff1); - v11 = _mm_mulhi_epi16(Ixy_1, diff1); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qb0[mmi] = _mm_add_ps(qb0[mmi], _mm_cvtepi32_ps(v00)); - qb1[mmi] = _mm_add_ps(qb1[mmi], _mm_cvtepi32_ps(v10)); - // diff * J [0 ... 
7] - // for set 1 sigma 1 - // b3 += diff * Iptr[x] - v10 = _mm_mullo_epi16(mmDiffc_epi16_t, I_0_7_epi16); - v11 = _mm_mulhi_epi16(mmDiffc_epi16_t, I_0_7_epi16); - v00 = _mm_unpacklo_epi16(v10, v11); - - v10 = _mm_unpackhi_epi16(v10, v11); - qb2[mmi] = _mm_add_ps(qb2[mmi], _mm_cvtepi32_ps(v00)); - qb2[mmi] = _mm_add_ps(qb2[mmi], _mm_cvtepi32_ps(v10)); - qb3[mmi] = _mm_add_ps(qb3[mmi], _mm_cvtepi32_ps(diff_0_3_epi32)); - qb3[mmi] = _mm_add_ps(qb3[mmi], _mm_cvtepi32_ps(diff_4_7_epi32)); - } + // sumI + __m128 I_tale_ps = _mm_mul_ps(I_ps, tale_0_3_ps); + mmSumI = _mm_add_ps(mmSumI,I_tale_ps); - if( j == 0 ) - { - __m128 bSet1_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet1_epi16)); - __m128 bSet1_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet1_epi16,bSet1_epi16), 16)); - __m128 mask_0_4_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mask_0_7_epi16)); - __m128 mask_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mask_0_7_epi16, mask_0_7_epi16),16))); + // sumW + mmSumW = _mm_add_ps(mmSumW, tale_0_3_ps); - __m128 bSet2_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet2_epi16)); - __m128 bSet2_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet2_epi16, bSet2_epi16),16)); + // sumDI + mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps( I_ps, I_tale_ps)); - __m128 tale_0_3_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_0_3_ps); - __m128 tale_4_7_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_4_7_ps); - tale_0_3_ps = _mm_blendv_ps(mmParam2s, tale_0_3_ps, bSet2_0_3_ps); - tale_4_7_ps = _mm_blendv_ps(mmParam2s, tale_4_7_ps, bSet2_4_7_ps); - tale_0_3_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_0_3_ps, mask_0_4_ps); - tale_4_7_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_4_7_ps, mask_4_7_ps); + t0 = _mm_srai_epi32(Ixy_1, 16); // Iy8 Iy9 Iy10 Iy11 + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_1, 16), 16); // Ix0 Ix1 Ix2 Ix3 - t0 = _mm_srai_epi32(Ixy_0, 16); // Iy0 Iy1 Iy2 Iy3 - t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16); // Ix0 Ix1 Ix2 Ix3 + fy = 
_mm_cvtepi32_ps(t0); + fx = _mm_cvtepi32_ps(t1); - __m128 fy = _mm_cvtepi32_ps(t0); - __m128 fx = _mm_cvtepi32_ps(t1); + // 4 ... 7 + I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(I_0_7_epi16, I_0_7_epi16), 16)); - // 0 ... 3 - __m128 I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(I_0_7_epi16, I_0_7_epi16), 16)); + // A11 - A22 + fxtale = _mm_mul_ps(fx, tale_4_7_ps); + fytale = _mm_mul_ps(fy, tale_4_7_ps); - // A11 - A22 - __m128 fxtale = _mm_mul_ps(fx, tale_0_3_ps); - __m128 fytale = _mm_mul_ps(fy, tale_0_3_ps); + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); + mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); + mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); - mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); - mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); - mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); + // sumIx und sumIy + mmSumIx = _mm_add_ps(mmSumIx, fxtale); + mmSumIy = _mm_add_ps(mmSumIy, fytale); - // sumIx und sumIy - mmSumIx = _mm_add_ps(mmSumIx, fxtale); - mmSumIy = _mm_add_ps(mmSumIy, fytale); + mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fxtale)); + mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fytale)); - mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fxtale)); - mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fytale)); + // sumI + I_tale_ps = _mm_mul_ps(I_ps, tale_4_7_ps); + mmSumI = _mm_add_ps(mmSumI, I_tale_ps); - // sumI - __m128 I_tale_ps = _mm_mul_ps(I_ps, tale_0_3_ps); - mmSumI = _mm_add_ps(mmSumI,I_tale_ps); + // sumW + mmSumW = _mm_add_ps(mmSumW, tale_4_7_ps); - // sumW - mmSumW = _mm_add_ps(mmSumW, tale_0_3_ps); + // sumDI + mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps( I_ps, I_tale_ps)); + } - // sumDI - mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps( I_ps, I_tale_ps)); + } +#else + for( ; x < _winSize.width*cn; x++, dIptr += 2 ) + { + if( maskPtr[x] == 0) + continue; + short ixval = dIptr[0]; + short iyval = dIptr[1]; + int illValue = static_cast(Iptr[x] * gainVec.x + gainVec.y - Iptr[x]); - t0 = 
_mm_srai_epi32(Ixy_1, 16); // Iy8 Iy9 Iy10 Iy11 - t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_1, 16), 16); // Ix0 Ix1 Ix2 Ix3 + int It[4] = {(Jptr[x+step+cn]<< 5) + illValue, + (Jptr[x+cn]<< 5) + illValue, + (Jptr[x+step]<< 5) + illValue, + (Jptr[x] << 5) + illValue}; - fy = _mm_cvtepi32_ps(t0); - fx = _mm_cvtepi32_ps(t1); - // 4 ... 7 - I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(I_0_7_epi16, I_0_7_epi16), 16)); + int J_val = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + + Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, + W_BITS1-5); - // A11 - A22 - fxtale = _mm_mul_ps(fx, tale_4_7_ps); - fytale = _mm_mul_ps(fy, tale_4_7_ps); + int diff = J_val + illValue; - mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); - mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); - mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); - // sumIx und sumIy - mmSumIx = _mm_add_ps(mmSumIx, fxtale); - mmSumIy = _mm_add_ps(mmSumIy, fytale); - mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fxtale)); - mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fytale)); + MEstimatorScale += (diff < MEstimatorScale) ? -eta : eta; - // sumI - I_tale_ps = _mm_mul_ps(I_ps, tale_4_7_ps); - mmSumI = _mm_add_ps(mmSumI, I_tale_ps); + int abss = (diff < 0) ? 
-diff : diff; - // sumW - mmSumW = _mm_add_ps(mmSumW, tale_4_7_ps); - // sumDI - mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps( I_ps, I_tale_ps)); + // compute the missmatch vector + if( j >= 0) + { + if( abss > fParam1) + { + It[0] = 0; + It[1] = 0; + It[2] = 0; + It[3] = 0; } - - } -#else - for( ; x < _winSize.width*cn; x++, dIptr += 2 ) + else if( abss > static_cast(fParam0) && diff >= 0 ) { - if( maskPtr[x] == 0) - continue; - - short ixval = dIptr[0]; - short iyval = dIptr[1]; - int illValue = static_cast(Iptr[x] * gainVec.x + gainVec.y - Iptr[x]); - - int It[4] = {(Jptr[x+step+cn]<< 5) + illValue, - (Jptr[x+cn]<< 5) + illValue, - (Jptr[x+step]<< 5) + illValue, - (Jptr[x] << 5) + illValue}; - - - int J_val = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + - Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, - W_BITS1-5); - - int diff = J_val + illValue; - - - - MEstimatorScale += (diff < MEstimatorScale) ? -eta : eta; + It[0] = static_cast(param[2] * (It[0] - fParam1)); + It[1] = static_cast(param[2] * (It[1] - fParam1)); + It[2] = static_cast(param[2] * (It[2] - fParam1)); + It[3] = static_cast(param[2] * (It[3] - fParam1)); + } + else if( abss > static_cast(fParam0) && diff < 0 ) + { + It[0] = static_cast(param[2] * (It[0] + fParam1)); + It[1] = static_cast(param[2] * (It[1] + fParam1)); + It[2] = static_cast(param[2] * (It[2] + fParam1)); + It[3] = static_cast(param[2] * (It[3] + fParam1)); + } + } + _b0[0] += (float)(It[0]*dIptr[0]) ; + _b0[1] += (float)(It[1]*dIptr[0]) ; + _b0[2] += (float)(It[2]*dIptr[0]) ; + _b0[3] += (float)(It[3]*dIptr[0]) ; - int abss = (diff < 0) ? 
-diff : diff; + _b1[0] += (float)(It[0]*dIptr[1]) ; + _b1[1] += (float)(It[1]*dIptr[1]) ; + _b1[2] += (float)(It[2]*dIptr[1]) ; + _b1[3] += (float)(It[3]*dIptr[1]) ; - // compute the missmatch vector - if( j >= 0) - { - if( abss > fParam1) - { - It[0] = 0; - It[1] = 0; - It[2] = 0; - It[3] = 0; - } - else if( abss > static_cast(fParam0) && diff >= 0 ) - { - It[0] = static_cast(param[2] * (It[0] - fParam1)); - It[1] = static_cast(param[2] * (It[1] - fParam1)); - It[2] = static_cast(param[2] * (It[2] - fParam1)); - It[3] = static_cast(param[2] * (It[3] - fParam1)); - } - else if( abss > static_cast(fParam0) && diff < 0 ) - { - It[0] = static_cast(param[2] * (It[0] + fParam1)); - It[1] = static_cast(param[2] * (It[1] + fParam1)); - It[2] = static_cast(param[2] * (It[2] + fParam1)); - It[3] = static_cast(param[2] * (It[3] + fParam1)); - } - } - _b0[0] += (float)(It[0]*dIptr[0]) ; - _b0[1] += (float)(It[1]*dIptr[0]) ; - _b0[2] += (float)(It[2]*dIptr[0]) ; - _b0[3] += (float)(It[3]*dIptr[0]) ; + _b2[0] += (float)(It[0])*Iptr[x] ; + _b2[1] += (float)(It[1])*Iptr[x] ; + _b2[2] += (float)(It[2])*Iptr[x] ; + _b2[3] += (float)(It[3])*Iptr[x] ; + _b3[0] += (float)(It[0]); + _b3[1] += (float)(It[1]); + _b3[2] += (float)(It[2]); + _b3[3] += (float)(It[3]); - _b1[0] += (float)(It[0]*dIptr[1]) ; - _b1[1] += (float)(It[1]*dIptr[1]) ; - _b1[2] += (float)(It[2]*dIptr[1]) ; - _b1[3] += (float)(It[3]*dIptr[1]) ; - - _b2[0] += (float)(It[0])*Iptr[x] ; - _b2[1] += (float)(It[1])*Iptr[x] ; - _b2[2] += (float)(It[2])*Iptr[x] ; - _b2[3] += (float)(It[3])*Iptr[x] ; - - _b3[0] += (float)(It[0]); - _b3[1] += (float)(It[1]); - _b3[2] += (float)(It[2]); - _b3[3] += (float)(It[3]); + // compute the Gradient Matrice + if( j == 0) + { + float tale = param[2] * FLT_RESCALE; + if( abss < fParam0 || j < 0) + { + tale = FLT_RESCALE; + } + else if( abss > fParam1) + { + tale *= 0.01f; + } + else + { + tale *= param[2]; + } + if( j == 0 ) + { + A11 += (float)(ixval*ixval)*tale; + A12 += 
(float)(ixval*iyval)*tale; + A22 += (float)(iyval*iyval)*tale; + } - // compute the Gradient Matrice - if( j == 0) - { - float tale = param[2] * FLT_RESCALE; - if( abss < fParam0 || j < 0) - { - tale = FLT_RESCALE; - } - else if( abss > fParam1) - { - tale *= 0.01f; - } - else - { - tale *= param[2]; - } - if( j == 0 ) - { - A11 += (float)(ixval*ixval)*tale; - A12 += (float)(ixval*iyval)*tale; - A22 += (float)(iyval*iyval)*tale; - } - - dI += Iptr[x] * Iptr[x] * tale; - float dx = static_cast(dIptr[0]) * tale; - float dy = static_cast(dIptr[1]) * tale; - sumIx += dx; - sumIy += dy; - w1 += dx * Iptr[x]; - w2 += dy * Iptr[x]; - sumI += Iptr[x] * tale; - sumW += tale; - } + dI += Iptr[x] * Iptr[x] * tale; + float dx = static_cast(dIptr[0]) * tale; + float dy = static_cast(dIptr[1]) * tale; + sumIx += dx; + sumIy += dy; + w1 += dx * Iptr[x]; + w2 += dy * Iptr[x]; + sumI += Iptr[x] * tale; + sumW += tale; + } - } + } #endif - } + } #ifdef RLOF_SSE - short etaValues[8]; - _mm_storeu_si128((__m128i*)(etaValues), mmEta); - MEstimatorScale += eta * (etaValues[0] + etaValues[1] + etaValues[2] + etaValues[3] - + etaValues[4] + etaValues[5] + etaValues[6] + etaValues[7]); - float CV_DECL_ALIGNED(32) wbuf[4]; + short etaValues[8]; + _mm_storeu_si128((__m128i*)(etaValues), mmEta); + MEstimatorScale += eta * (etaValues[0] + etaValues[1] + etaValues[2] + etaValues[3] + + etaValues[4] + etaValues[5] + etaValues[6] + etaValues[7]); + float CV_DECL_ALIGNED(32) wbuf[4]; #endif - if( j == 0 ) - { + if( j == 0 ) + { #ifdef RLOF_SSE - _mm_store_ps(wbuf, mmSumW1); - w1 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumW2); - w2 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumDI); - dI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumI); - sumI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumIx); - sumIx = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumIy); - sumIy = wbuf[0] + wbuf[1] + wbuf[2] + 
wbuf[3]; - _mm_store_ps(wbuf, mmSumW); - sumW = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumW1); + w1 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumW2); + w2 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumDI); + dI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumI); + sumI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumIx); + sumIx = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumIy); + sumIy = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumW); + sumW = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; #endif - sumIx *= -FLT_SCALE; - sumIy *= -FLT_SCALE; - sumI *=FLT_SCALE; - sumW *= FLT_SCALE; - w1 *= -FLT_SCALE; - w2 *= -FLT_SCALE; - dI *= FLT_SCALE; + sumIx *= -FLT_SCALE; + sumIy *= -FLT_SCALE; + sumI *=FLT_SCALE; + sumW *= FLT_SCALE; + w1 *= -FLT_SCALE; + w2 *= -FLT_SCALE; + dI *= FLT_SCALE; #ifdef RLOF_SSE - float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];// + float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];// - _mm_store_ps(A11buf, mmAxx); - _mm_store_ps(A12buf, mmAxy); - _mm_store_ps(A22buf, mmAyy); + _mm_store_ps(A11buf, mmAxx); + _mm_store_ps(A12buf, mmAxy); + _mm_store_ps(A22buf, mmAyy); - A11 = A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; - A12 = A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; - A22 = A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; + A11 = A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; + A12 = A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; + A22 = A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; #endif - A11 *= FLT_SCALE; // 54866744. 
- A12 *= FLT_SCALE; // -628764.00 - A22 *= FLT_SCALE; // 19730.000 - - D = - A12*A12*sumI*sumI + dI*sumW*A12*A12 + 2*A12*sumI*sumIx*w2 + 2*A12*sumI*sumIy*w1 - - 2*dI*A12*sumIx*sumIy - 2*sumW*A12*w1*w2 + A11*A22*sumI*sumI - 2*A22*sumI*sumIx*w1 - - 2*A11*sumI*sumIy*w2 - sumIx*sumIx*w2*w2 + A22*dI*sumIx*sumIx + 2*sumIx*sumIy*w1*w2 - - sumIy*sumIy*w1*w1 + A11*dI*sumIy*sumIy + A22*sumW*w1*w1 + A11*sumW*w2*w2 - A11*A22*dI*sumW; - - float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + - 4.f*A12*A12))/(2*winArea); - if( minEig < minEigThreshold || std::abs(D) < FLT_EPSILON ) - { - if( level == 0 && status ) - status[ptidx] = 0; - if( level > 0) - { - nextPts[ptidx] = backUpNextPt; - gainVecs[ptidx] = backUpGain; - } - noIteration++; - break; - } - - - D = (1.f / D); - - invTensorMat.at(0,0) = (A22*sumI*sumI - 2*sumI*sumIy*w2 + dI*sumIy*sumIy + sumW*w2*w2 - A22*dI*sumW)* D; - invTensorMat.at(0,1) = (A12*dI*sumW - A12*sumI * sumI - dI*sumIx*sumIy + sumI*sumIx*w2 + sumI*sumIy*w1 - sumW*w1*w2)* D; - invTensorMat.at(0,2) = (A12*sumI*sumIy - sumIy*sumIy*w1 - A22*sumI*sumIx - A12*sumW*w2 + A22*sumW*w1 + sumIx*sumIy*w2)* D; - invTensorMat.at(0,3) = (A22*dI*sumIx - A12*dI*sumIy - sumIx*w2*w2 + A12*sumI*w2 - A22*sumI*w1 + sumIy*w1*w2) * D; - invTensorMat.at(1,0) = invTensorMat.at(0,1); - invTensorMat.at(1,1) = (A11*sumI * sumI - 2*sumI*sumIx*w1 + dI*sumIx * sumIx + sumW*w1*w1 - A11*dI*sumW) * D; - invTensorMat.at(1,2) = (A12*sumI*sumIx - A11*sumI*sumIy - sumIx * sumIx*w2 + A11*sumW*w2 - A12*sumW*w1 + sumIx*sumIy*w1) * D; - invTensorMat.at(1,3) = (A11*dI*sumIy - sumIy*w1*w1 - A12*dI*sumIx - A11*sumI*w2 + A12*sumI*w1 + sumIx*w1*w2)* D; - invTensorMat.at(2,0) = invTensorMat.at(0,2); - invTensorMat.at(2,1) = invTensorMat.at(1,2); - invTensorMat.at(2,2) = (sumW*A12*A12 - 2*A12*sumIx*sumIy + A22*sumIx*sumIx + A11*sumIy*sumIy - A11*A22*sumW)* D; - invTensorMat.at(2,3) = (A11*A22*sumI - A12*A12*sumI - A11*sumIy*w2 + A12*sumIx*w2 + A12*sumIy*w1 - A22*sumIx*w1)* D; - 
invTensorMat.at(3,0) = invTensorMat.at(0,3); - invTensorMat.at(3,1) = invTensorMat.at(1,3); - invTensorMat.at(3,2) = invTensorMat.at(2,3); - invTensorMat.at(3,3) = (dI*A12*A12 - 2*A12*w1*w2 + A22*w1*w1 + A11*w2*w2 - A11*A22*dI)* D; + A11 *= FLT_SCALE; // 54866744. + A12 *= FLT_SCALE; // -628764.00 + A22 *= FLT_SCALE; // 19730.000 + + D = - A12*A12*sumI*sumI + dI*sumW*A12*A12 + 2*A12*sumI*sumIx*w2 + 2*A12*sumI*sumIy*w1 + - 2*dI*A12*sumIx*sumIy - 2*sumW*A12*w1*w2 + A11*A22*sumI*sumI - 2*A22*sumI*sumIx*w1 + - 2*A11*sumI*sumIy*w2 - sumIx*sumIx*w2*w2 + A22*dI*sumIx*sumIx + 2*sumIx*sumIy*w1*w2 + - sumIy*sumIy*w1*w1 + A11*dI*sumIy*sumIy + A22*sumW*w1*w1 + A11*sumW*w2*w2 - A11*A22*dI*sumW; + + float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + + 4.f*A12*A12))/(2*winArea); + if( minEig < minEigThreshold || std::abs(D) < FLT_EPSILON ) + { + if( level == 0 && status ) + status[ptidx] = 0; + if( level > 0) + { + nextPts[ptidx] = backUpNextPt; + gainVecs[ptidx] = backUpGain; } + noIteration++; + break; + } - #ifdef RLOF_SSE - float CV_DECL_ALIGNED(16) bbuf[4]; - for(int mmi = 0; mmi < 4; mmi++) - { + D = (1.f / D); + + invTensorMat.at(0,0) = (A22*sumI*sumI - 2*sumI*sumIy*w2 + dI*sumIy*sumIy + sumW*w2*w2 - A22*dI*sumW)* D; + invTensorMat.at(0,1) = (A12*dI*sumW - A12*sumI * sumI - dI*sumIx*sumIy + sumI*sumIx*w2 + sumI*sumIy*w1 - sumW*w1*w2)* D; + invTensorMat.at(0,2) = (A12*sumI*sumIy - sumIy*sumIy*w1 - A22*sumI*sumIx - A12*sumW*w2 + A22*sumW*w1 + sumIx*sumIy*w2)* D; + invTensorMat.at(0,3) = (A22*dI*sumIx - A12*dI*sumIy - sumIx*w2*w2 + A12*sumI*w2 - A22*sumI*w1 + sumIy*w1*w2) * D; + invTensorMat.at(1,0) = invTensorMat.at(0,1); + invTensorMat.at(1,1) = (A11*sumI * sumI - 2*sumI*sumIx*w1 + dI*sumIx * sumIx + sumW*w1*w1 - A11*dI*sumW) * D; + invTensorMat.at(1,2) = (A12*sumI*sumIx - A11*sumI*sumIy - sumIx * sumIx*w2 + A11*sumW*w2 - A12*sumW*w1 + sumIx*sumIy*w1) * D; + invTensorMat.at(1,3) = (A11*dI*sumIy - sumIy*w1*w1 - A12*dI*sumIx - A11*sumI*w2 + A12*sumI*w1 + 
sumIx*w1*w2)* D; + invTensorMat.at(2,0) = invTensorMat.at(0,2); + invTensorMat.at(2,1) = invTensorMat.at(1,2); + invTensorMat.at(2,2) = (sumW*A12*A12 - 2*A12*sumIx*sumIy + A22*sumIx*sumIx + A11*sumIy*sumIy - A11*A22*sumW)* D; + invTensorMat.at(2,3) = (A11*A22*sumI - A12*A12*sumI - A11*sumIy*w2 + A12*sumIx*w2 + A12*sumIy*w1 - A22*sumIx*w1)* D; + invTensorMat.at(3,0) = invTensorMat.at(0,3); + invTensorMat.at(3,1) = invTensorMat.at(1,3); + invTensorMat.at(3,2) = invTensorMat.at(2,3); + invTensorMat.at(3,3) = (dI*A12*A12 - 2*A12*w1*w2 + A22*w1*w1 + A11*w2*w2 - A11*A22*dI)* D; + } - _mm_store_ps(bbuf, _mm_add_ps(qb0[mmi], qb1[mmi])); - _b0[mmi] = bbuf[0] + bbuf[2]; - _b1[mmi] = bbuf[1] + bbuf[3]; - _mm_store_ps(bbuf, qb2[mmi]); - _b2[mmi] = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; - _mm_store_ps(bbuf, qb3[mmi]); - _b3[mmi] = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; - } - #endif - - _b0[0] *= FLT_SCALE;_b0[1] *= FLT_SCALE;_b0[2] *= FLT_SCALE;_b0[3] *= FLT_SCALE; - _b1[0] *= FLT_SCALE;_b1[1] *= FLT_SCALE;_b1[2] *= FLT_SCALE;_b1[3] *= FLT_SCALE; - _b2[0] *= FLT_SCALE;_b2[1] *= FLT_SCALE;_b2[2] *= FLT_SCALE;_b2[3] *= FLT_SCALE; - _b3[0] *= FLT_SCALE;_b3[1] *= FLT_SCALE;_b3[2] *= FLT_SCALE;_b3[3] *= FLT_SCALE; - - - Mc0[0] = _b0[0] - _b0[1] - _b0[2] + _b0[3]; - Mc0[1] = _b1[0] - _b1[1] - _b1[2] + _b1[3]; - Mc0[2] = -(_b2[0] - _b2[1] - _b2[2] + _b2[3]); - Mc0[3] = -(_b3[0] - _b3[1] - _b3[2] + _b3[3]); - - Mc1[0] = _b0[1] - _b0[3]; - Mc1[1] = _b1[1] - _b1[3]; - Mc1[2] = -(_b2[1] - _b2[3]); - Mc1[3] = -(_b3[1] - _b3[3]); - - - Mc2[0] = _b0[2] - _b0[3]; - Mc2[1] = _b1[2] - _b1[3]; - Mc2[2] = -(_b2[2] - _b2[3]); - Mc2[3] = -(_b3[2] - _b3[3]); - - - Mc3[0] = _b0[3]; - Mc3[1] = _b1[3]; - Mc3[2] = -_b2[3]; - Mc3[3] = -_b3[3]; - - // - c[0] = -Mc0[0]; - c[1] = -Mc1[0]; - c[2] = -Mc2[0]; - c[3] = -Mc3[0]; - c[4] = -Mc0[1]; - c[5] = -Mc1[1]; - c[6] = -Mc2[1]; - c[7] = -Mc3[1]; - - float e0 = 1.f / (c[6] * c[0] - c[4] * c[2]); - float e1 = e0 * 0.5f * (c[6] * c[1] + c[7] * c[0] - c[5] * 
c[2] - c[4] * c[3]); - float e2 = e0 * (c[1] * c[7] -c[3] * c[5]); - e0 = e1 * e1 - e2; - hasSolved = false; - if ( e0 > 0) - { - e0 = sqrt(e0); - float _y[2] = {-e1 - e0, e0 - e1}; - float c0yc1[2] = {c[0] * _y[0] + c[1], - c[0] * _y[1] + c[1]}; - float _x[2] = {-(c[2] * _y[0] + c[3]) / c0yc1[0], - -(c[2] * _y[1] + c[3]) / c0yc1[1]}; - bool isIn1 = (_x[0] >=0 && _x[0] <=1 && _y[0] >= 0 && _y[0] <= 1); - bool isIn2 = (_x[1] >=0 && _x[1] <=1 && _y[1] >= 0 && _y[1] <= 1); - - bool isSolution1 = checkSolution(_x[0], _y[0], c ); - bool isSolution2 = checkSolution(_x[1], _y[1], c ); - bool isSink1 = isIn1 && isSolution1; - bool isSink2 = isIn2 && isSolution2; - - if ( isSink1 != isSink2) - { - a = isSink1 ? _x[0] : _x[1]; - b = isSink1 ? _y[0] : _y[1]; - ab = a * b; - hasSolved = true; - delta.x = inextPt.x + a - nextPt.x; - delta.y = inextPt.y + b - nextPt.y; - - cv::Vec4f mismatchVec = ab * Mc0 + Mc1 *a + Mc2 * b + Mc3; - deltaGain = est_DeltaGain(invTensorMat, mismatchVec); +#ifdef RLOF_SSE + float CV_DECL_ALIGNED(16) bbuf[4]; + for(int mmi = 0; mmi < 4; mmi++) + { - } // isIn1 != isIn2 - } - if( hasSolved == false) - noIteration++; - } - else - { - hasSolved = false; - noReusedIteration++; - } - if( hasSolved == false ) - { + _mm_store_ps(bbuf, _mm_add_ps(qb0[mmi], qb1[mmi])); + _b0[mmi] = bbuf[0] + bbuf[2]; + _b1[mmi] = bbuf[1] + bbuf[3]; + _mm_store_ps(bbuf, qb2[mmi]); + _b2[mmi] = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; + _mm_store_ps(bbuf, qb3[mmi]); + _b3[mmi] = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; - cv::Vec4f mismatchVec = ab * Mc0 + Mc1 *a + Mc2 * b + Mc3; - est_Result(invTensorMat, mismatchVec, delta, deltaGain); + } +#endif - delta.x = MAX(-1.f, MIN( 1.f , delta.x)); - delta.y = MAX(-1.f, MIN( 1.f , delta.y)); + _b0[0] *= FLT_SCALE;_b0[1] *= FLT_SCALE;_b0[2] *= FLT_SCALE;_b0[3] *= FLT_SCALE; + _b1[0] *= FLT_SCALE;_b1[1] *= FLT_SCALE;_b1[2] *= FLT_SCALE;_b1[3] *= FLT_SCALE; + _b2[0] *= FLT_SCALE;_b2[1] *= FLT_SCALE;_b2[2] *= FLT_SCALE;_b2[3] *= FLT_SCALE; + 
_b3[0] *= FLT_SCALE;_b3[1] *= FLT_SCALE;_b3[2] *= FLT_SCALE;_b3[3] *= FLT_SCALE; + + + Mc0[0] = _b0[0] - _b0[1] - _b0[2] + _b0[3]; + Mc0[1] = _b1[0] - _b1[1] - _b1[2] + _b1[3]; + Mc0[2] = -(_b2[0] - _b2[1] - _b2[2] + _b2[3]); + Mc0[3] = -(_b3[0] - _b3[1] - _b3[2] + _b3[3]); + + Mc1[0] = _b0[1] - _b0[3]; + Mc1[1] = _b1[1] - _b1[3]; + Mc1[2] = -(_b2[1] - _b2[3]); + Mc1[3] = -(_b3[1] - _b3[3]); + + + Mc2[0] = _b0[2] - _b0[3]; + Mc2[1] = _b1[2] - _b1[3]; + Mc2[2] = -(_b2[2] - _b2[3]); + Mc2[3] = -(_b3[2] - _b3[3]); + + + Mc3[0] = _b0[3]; + Mc3[1] = _b1[3]; + Mc3[2] = -_b2[3]; + Mc3[3] = -_b3[3]; + + // + c[0] = -Mc0[0]; + c[1] = -Mc1[0]; + c[2] = -Mc2[0]; + c[3] = -Mc3[0]; + c[4] = -Mc0[1]; + c[5] = -Mc1[1]; + c[6] = -Mc2[1]; + c[7] = -Mc3[1]; + + float e0 = 1.f / (c[6] * c[0] - c[4] * c[2]); + float e1 = e0 * 0.5f * (c[6] * c[1] + c[7] * c[0] - c[5] * c[2] - c[4] * c[3]); + float e2 = e0 * (c[1] * c[7] -c[3] * c[5]); + e0 = e1 * e1 - e2; + hasSolved = false; + if ( e0 > 0) + { + e0 = sqrt(e0); + float _y[2] = {-e1 - e0, e0 - e1}; + float c0yc1[2] = {c[0] * _y[0] + c[1], + c[0] * _y[1] + c[1]}; + float _x[2] = {-(c[2] * _y[0] + c[3]) / c0yc1[0], + -(c[2] * _y[1] + c[3]) / c0yc1[1]}; + bool isIn1 = (_x[0] >=0 && _x[0] <=1 && _y[0] >= 0 && _y[0] <= 1); + bool isIn2 = (_x[1] >=0 && _x[1] <=1 && _y[1] >= 0 && _y[1] <= 1); + + bool isSolution1 = checkSolution(_x[0], _y[0], c ); + bool isSolution2 = checkSolution(_x[1], _y[1], c ); + bool isSink1 = isIn1 && isSolution1; + bool isSink2 = isIn2 && isSolution2; + + if ( isSink1 != isSink2) + { + a = isSink1 ? _x[0] : _x[1]; + b = isSink1 ? 
_y[0] : _y[1]; + ab = a * b; + hasSolved = true; + delta.x = inextPt.x + a - nextPt.x; + delta.y = inextPt.y + b - nextPt.y; + cv::Vec4f mismatchVec = ab * Mc0 + Mc1 *a + Mc2 * b + Mc3; + deltaGain = est_DeltaGain(invTensorMat, mismatchVec); - if( j == 0) - prevGain = deltaGain; - gainVec += deltaGain; - nextPt += delta ; - nextPts[ptidx] = nextPt + halfWin; - gainVecs[ptidx]= gainVec; + } // isIn1 != isIn2 + } + if( hasSolved == false) + noIteration++; + } + else + { + hasSolved = false; + noReusedIteration++; + } + if( hasSolved == false ) + { - } - else - { - nextPt += delta; - nextPts[ptidx] = nextPt + halfWin; - gainVecs[ptidx]= gainVec + deltaGain; - noSolvedIteration++; - break; - } + cv::Vec4f mismatchVec = ab * Mc0 + Mc1 *a + Mc2 * b + Mc3; + est_Result(invTensorMat, mismatchVec, delta, deltaGain); - if(j > 0 && ( - (std::abs(delta.x - prevDelta.x) < 0.01 && std::abs(delta.y - prevDelta.y) < 0.01) - || ((delta.ddot(delta) <= 0.001) && std::abs(prevGain.x - deltaGain.x) < 0.01))) - { - nextPts[ptidx] -= delta*0.5f; - gainVecs[ptidx] -= deltaGain* 0.5f; - break; - } + delta.x = MAX(-1.f, MIN( 1.f , delta.x)); + delta.y = MAX(-1.f, MIN( 1.f , delta.y)); - prevDelta = delta; + if( j == 0) prevGain = deltaGain; - } + gainVec += deltaGain; + nextPt += delta ; + nextPts[ptidx] = nextPt + halfWin; + gainVecs[ptidx]= gainVec; } + else + { + nextPt += delta; + nextPts[ptidx] = nextPt + halfWin; + gainVecs[ptidx]= gainVec + deltaGain; + noSolvedIteration++; + break; + } + if(j > 0 && ( + (std::abs(delta.x - prevDelta.x) < 0.01 && std::abs(delta.y - prevDelta.y) < 0.01) + || ((delta.ddot(delta) <= 0.001) && std::abs(prevGain.x - deltaGain.x) < 0.01))) + { + nextPts[ptidx] -= delta*0.5f; + gainVecs[ptidx] -= deltaGain* 0.5f; + break; + } + + + prevDelta = delta; + prevGain = deltaGain; } - const Mat* prevImg; - const Mat* nextImg; - const Mat* prevDeriv; - const Mat* rgbPrevImg; - const Mat* rgbNextImg; - const Point2f* prevPts; - Point2f* nextPts; - uchar* status; - 
cv::Point2f* gainVecs; // gain vector x -> multiplier y -> offset - float* err; - int maxWinSize; - int minWinSize; - TermCriteria criteria; - int level; - int maxLevel; - int windowType; - float minEigThreshold; - bool useInitialFlow; - std::vector paramReset; - int crossSegmentationThreshold; - - }; + } + } -} -namespace beplk + + const Mat* prevImg; + const Mat* nextImg; + const Mat* prevDeriv; + const Mat* rgbPrevImg; + const Mat* rgbNextImg; + const Point2f* prevPts; + Point2f* nextPts; + uchar* status; + cv::Point2f* gainVecs; // gain vector x -> multiplier y -> offset + float* err; + int maxWinSize; + int minWinSize; + TermCriteria criteria; + int level; + int maxLevel; + int windowType; + float minEigThreshold; + bool useInitialFlow; + std::vector paramReset; + int crossSegmentationThreshold; +}; + +}} // namespace +namespace beplk { +namespace ica { + +class TrackerInvoker : public cv::ParallelLoopBody { - namespace ica{ - class TrackerInvoker : public cv::ParallelLoopBody +public: + TrackerInvoker( + const Mat& _prevImg, + const Mat& _prevDeriv, + const Mat& _nextImg, + const Mat& _rgbPrevImg, + const Mat& _rgbNextImg, + const Point2f* _prevPts, + Point2f* _nextPts, + uchar* _status, + float* _err, + int _level, + int _maxLevel, + int _winSize[2], + int _maxIteration, + bool _useInitialFlow, + int _supportRegionType, + int _crossSegmentationThreshold, + float _minEigenValue) { - public: - TrackerInvoker(const Mat& _prevImg, - const Mat& _prevDeriv, - const Mat& _nextImg, - const Mat& _rgbPrevImg, - const Mat& _rgbNextImg, - const Point2f* _prevPts, - Point2f* _nextPts, - uchar* _status, - float* _err, - int _level, - int _maxLevel, - int _winSize[2], - int _maxIteration, - bool _useInitialFlow, - int _supportRegionType, - int _crossSegmentationThreshold, - float _minEigenValue) - { - prevImg = &_prevImg; - prevDeriv = &_prevDeriv; - nextImg = &_nextImg; - rgbPrevImg = &_rgbPrevImg; - rgbNextImg = &_rgbNextImg; - prevPts = _prevPts; - nextPts = _nextPts; - 
status = _status; - err = _err; - minWinSize = _winSize[0]; - maxWinSize = _winSize[1]; - criteria.maxCount = _maxIteration; - criteria.epsilon = 0.01; - level = _level; - maxLevel = _maxLevel; - windowType = _supportRegionType; - minEigThreshold = _minEigenValue; - useInitialFlow = _useInitialFlow; - crossSegmentationThreshold = _crossSegmentationThreshold; - } + prevImg = &_prevImg; + prevDeriv = &_prevDeriv; + nextImg = &_nextImg; + rgbPrevImg = &_rgbPrevImg; + rgbNextImg = &_rgbNextImg; + prevPts = _prevPts; + nextPts = _nextPts; + status = _status; + err = _err; + minWinSize = _winSize[0]; + maxWinSize = _winSize[1]; + criteria.maxCount = _maxIteration; + criteria.epsilon = 0.01; + level = _level; + maxLevel = _maxLevel; + windowType = _supportRegionType; + minEigThreshold = _minEigenValue; + useInitialFlow = _useInitialFlow; + crossSegmentationThreshold = _crossSegmentationThreshold; + } - TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; + TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; - void operator()(const cv::Range& range) const CV_OVERRIDE - { - cv::Size winSize; - cv::Point2f halfWin; + void operator()(const cv::Range& range) const CV_OVERRIDE + { + cv::Size winSize; + cv::Point2f halfWin; - const Mat& I = *prevImg; - const Mat& J = *nextImg; - const Mat& derivI = *prevDeriv; - const Mat& BI = *rgbPrevImg; + const Mat& I = *prevImg; + const Mat& J = *nextImg; + const Mat& derivI = *prevDeriv; + const Mat& BI = *rgbPrevImg; - winSize = cv::Size(maxWinSize,maxWinSize); - int winMaskwidth = static_cast(ceil(winSize.width / 8.f)) * 16; - cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); - winMaskMatBuf.setTo(1); + winSize = cv::Size(maxWinSize,maxWinSize); + int winMaskwidth = static_cast(ceil(winSize.width / 8.f)) * 16; + cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); + winMaskMatBuf.setTo(1); - const float FLT_SCALE = (1.f/(1 << 20)); // 20 + const float FLT_SCALE = (1.f/(1 << 20)); // 
20 - int j, cn = I.channels(), cn2 = cn*2; - int winbufwidth = static_cast(ceil(winSize.width / 8.f)) * 8; - cv::Size winBufSize(winbufwidth,winbufwidth); + int j, cn = I.channels(), cn2 = cn*2; + int winbufwidth = static_cast(ceil(winSize.width / 8.f)) * 8; + cv::Size winBufSize(winbufwidth,winbufwidth); - std::vector _buf(winBufSize.area()*(cn + cn2)); - Mat IWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn), &_buf[0]); - Mat derivIWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn2), &_buf[winBufSize.area()*cn]); + std::vector _buf(winBufSize.area()*(cn + cn2)); + Mat IWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn), &_buf[0]); + Mat derivIWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn2), &_buf[winBufSize.area()*cn]); - for( int ptidx = range.start; ptidx < range.end; ptidx++ ) + for( int ptidx = range.start; ptidx < range.end; ptidx++ ) + { + Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level)); + Point2f nextPt; + if( level == maxLevel ) { - Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level)); - Point2f nextPt; - if( level == maxLevel ) - { - if( useInitialFlow ) - nextPt = nextPts[ptidx]*(float)(1./(1 << level)); - else - nextPt = prevPt; - } + if( useInitialFlow ) + nextPt = nextPts[ptidx]*(float)(1./(1 << level)); else - nextPt = nextPts[ptidx]*2.f; - nextPts[ptidx] = nextPt; - - Point2i iprevPt, inextPt; - iprevPt.x = cvFloor(prevPt.x); - iprevPt.y = cvFloor(prevPt.y); - int winArea = maxWinSize * maxWinSize; - cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0,0, maxWinSize,maxWinSize)); - - if( calcWinMaskMat(BI, windowType, iprevPt, - winMaskMat,winSize,halfWin,winArea, - minWinSize,maxWinSize, crossSegmentationThreshold) == false) - continue; - - prevPt -= halfWin; - iprevPt.x = cvFloor(prevPt.x); - iprevPt.y = cvFloor(prevPt.y); - - if( iprevPt.x + halfWin.x < 0 || iprevPt.x + halfWin.x>= derivI.cols || - iprevPt.y + halfWin.y < 0 || iprevPt.y + halfWin.y >= derivI.rows ) + nextPt = prevPt; + } + else + nextPt = nextPts[ptidx]*2.f; + nextPts[ptidx] = nextPt; + + Point2i 
iprevPt, inextPt; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + int winArea = maxWinSize * maxWinSize; + cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0,0, maxWinSize,maxWinSize)); + + if( calcWinMaskMat(BI, windowType, iprevPt, + winMaskMat,winSize,halfWin,winArea, + minWinSize,maxWinSize, crossSegmentationThreshold) == false) + continue; + + prevPt -= halfWin; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + + if( iprevPt.x + halfWin.x < 0 || iprevPt.x + halfWin.x>= derivI.cols || + iprevPt.y + halfWin.y < 0 || iprevPt.y + halfWin.y >= derivI.rows ) + { + if( level == 0 ) { - if( level == 0 ) - { - if( status ) - status[ptidx] = 3; - if( err ) - err[ptidx] = 0; - } - continue; + if( status ) + status[ptidx] = 3; + if( err ) + err[ptidx] = 0; } + continue; + } - float a = prevPt.x - iprevPt.x; - float b = prevPt.y - iprevPt.y; - const int W_BITS = 14, W_BITS1 = 14; - - int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); - int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); - int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); - int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - - int dstep = (int)(derivI.step/derivI.elemSize1()); - int step = (int)(I.step/I.elemSize1()); - int mStep = (int)(winMaskMat.step/winMaskMat.elemSize1()); - CV_Assert( step == (int)(J.step/J.elemSize1()) ); - float A11 = 0, A12 = 0, A22 = 0; - - #ifdef RLOF_SSE - __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); - __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); - __m128i z = _mm_setzero_si128(); - __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1-1)); - __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1-5-1)); - __m128 qA11 = _mm_setzero_ps(), qA12 = _mm_setzero_ps(), qA22 = _mm_setzero_ps(); - __m128i mmMask4_epi32; - get4BitMask(winSize.width, mmMask4_epi32); - #endif - - - int x, y; - for( y = 0; y < winSize.height; y++ ) - { - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - const uchar* src = (const uchar*)I.data + (y + 
iprevPt.y)*step + iprevPt.x*cn; - const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; + float a = prevPt.x - iprevPt.x; + float b = prevPt.y - iprevPt.y; + const int W_BITS = 14, W_BITS1 = 14; - short* Iptr = (short*)(IWinBuf.data + y*IWinBuf.step); - short* dIptr = (short*)(derivIWinBuf.data + y*derivIWinBuf.step); + int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - x = 0; + int dstep = (int)(derivI.step/derivI.elemSize1()); + int step = (int)(I.step/I.elemSize1()); + int mStep = (int)(winMaskMat.step/winMaskMat.elemSize1()); + CV_Assert( step == (int)(J.step/J.elemSize1()) ); + float A11 = 0, A12 = 0, A22 = 0; - #ifdef RLOF_SSE - for( ; x < winSize.width*cn; x += 4, dsrc += 4*2, dIptr += 4*2 ) - { - __m128i wMask = _mm_set_epi32(MaskSet * maskPtr[x+3], - MaskSet * maskPtr[x+2], - MaskSet * maskPtr[x+1], - MaskSet * maskPtr[x]); - __m128i v00, v01, v10, v11, t0, t1; - v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); - v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); - v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); - v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); - - t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); +#ifdef RLOF_SSE + __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128i z = _mm_setzero_si128(); + __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1-1)); + __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1-5-1)); + __m128 qA11 = _mm_setzero_ps(), qA12 = _mm_setzero_ps(), qA22 = _mm_setzero_ps(); + __m128i mmMask4_epi32; + get4BitMask(winSize.width, mmMask4_epi32); +#endif - t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), 
W_BITS1-5); - _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0,t0)); - - v00 = _mm_loadu_si128((const __m128i*)(dsrc)); - v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); - v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep)); - v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); - - t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); - t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); - t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); - t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); - v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... - - if( x + 4 > winSize.width) - { - v00 = _mm_and_si128(v00, mmMask4_epi32); - } - v00 = _mm_and_si128(v00, wMask); - _mm_storeu_si128((__m128i*)dIptr, v00); - t0 = _mm_srai_epi32(v00, 16); // Iy0 Iy1 Iy2 Iy3 - t1 = _mm_srai_epi32(_mm_slli_epi32(v00, 16), 16); // Ix0 Ix1 Ix2 Ix3 + int x, y; + for( y = 0; y < winSize.height; y++ ) + { + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; + const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; - __m128 fy = _mm_cvtepi32_ps(t0); - __m128 fx = _mm_cvtepi32_ps(t1); + short* Iptr = (short*)(IWinBuf.data + y*IWinBuf.step); + short* dIptr = (short*)(derivIWinBuf.data + y*derivIWinBuf.step); - qA22 = _mm_add_ps(qA22, _mm_mul_ps(fy, fy)); - qA12 = _mm_add_ps(qA12, _mm_mul_ps(fx, fy)); - qA11 = _mm_add_ps(qA11, _mm_mul_ps(fx, fx)); - } - #else + x = 0; - for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2) +#ifdef RLOF_SSE + for( ; x < winSize.width*cn; x += 4, dsrc += 4*2, dIptr += 4*2 ) + { + __m128i wMask = _mm_set_epi32(MaskSet * maskPtr[x+3], + MaskSet * maskPtr[x+2], + MaskSet * maskPtr[x+1], + MaskSet * maskPtr[x]); + __m128i v00, v01, v10, v11, t0, t1; + v00 
= _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); + v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); + v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); + v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); + _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0,t0)); + + v00 = _mm_loadu_si128((const __m128i*)(dsrc)); + v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); + v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep)); + v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); + v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... 
+ + if( x + 4 > winSize.width) { - if( maskPtr[x] == 0) - { - dIptr[0] = (short)0; - dIptr[1] = (short)0; - - continue; - } - int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + - src[x+step]*iw10 + src[x+step+cn]*iw11, W_BITS1-5); - int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + - dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1); - int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 + - dsrc[dstep+cn2+1]*iw11, W_BITS1); - - Iptr[x] = (short)ival; - dIptr[0] = (short)ixval; - dIptr[1] = (short)iyval; - A11 += (float)(ixval*ixval); - A12 += (float)(ixval*iyval); - A22 += (float)(iyval*iyval); - + v00 = _mm_and_si128(v00, mmMask4_epi32); } + v00 = _mm_and_si128(v00, wMask); + _mm_storeu_si128((__m128i*)dIptr, v00); + t0 = _mm_srai_epi32(v00, 16); // Iy0 Iy1 Iy2 Iy3 + t1 = _mm_srai_epi32(_mm_slli_epi32(v00, 16), 16); // Ix0 Ix1 Ix2 Ix3 - #endif + __m128 fy = _mm_cvtepi32_ps(t0); + __m128 fx = _mm_cvtepi32_ps(t1); + qA22 = _mm_add_ps(qA22, _mm_mul_ps(fy, fy)); + qA12 = _mm_add_ps(qA12, _mm_mul_ps(fx, fy)); + qA11 = _mm_add_ps(qA11, _mm_mul_ps(fx, fx)); } +#else - #ifdef RLOF_SSE - float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4]; - _mm_store_ps(A11buf, qA11); - _mm_store_ps(A12buf, qA12); - _mm_store_ps(A22buf, qA22); - A11 += A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; - A12 += A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; - A22 += A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; - #endif - - A11 *= FLT_SCALE; - A12 *= FLT_SCALE; - A22 *= FLT_SCALE; - - - float D = A11*A22 - A12*A12; - float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + - 4.f*A12*A12))/(2 * winArea); + for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2) + { + if( maskPtr[x] == 0) + { + dIptr[0] = (short)0; + dIptr[1] = (short)0; - if( err ) - err[ptidx] = (float)minEig; + continue; + } + int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + + src[x+step]*iw10 + src[x+step+cn]*iw11, W_BITS1-5); + int ixval = CV_DESCALE(dsrc[0]*iw00 + 
dsrc[cn2]*iw01 + + dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 + + dsrc[dstep+cn2+1]*iw11, W_BITS1); + + Iptr[x] = (short)ival; + dIptr[0] = (short)ixval; + dIptr[1] = (short)iyval; + A11 += (float)(ixval*ixval); + A12 += (float)(ixval*iyval); + A22 += (float)(iyval*iyval); - if( D < FLT_EPSILON ) - { - if( level == 0 && status ) - status[ptidx] = 0; - continue; } - D = 1.f/D; - - cv::Point2f backUpNextPt = nextPt; - nextPt -= halfWin; - Point2f prevDelta(0,0); - float c[8]; -#ifdef RLOF_SSE - __m128i mmMask0, mmMask1, mmMask; - getWBitMask(winSize.width, mmMask0, mmMask1, mmMask); #endif - for( j = 0; j < criteria.maxCount; j++ ) - { - cv::Point2f delta; - bool hasSolved = false; - a = nextPt.x - cvFloor(nextPt.x); - b = nextPt.y - cvFloor(nextPt.y); - float ab = a * b; - - - if( (inextPt.x != cvFloor(nextPt.x) || inextPt.y != cvFloor(nextPt.y) || j == 0)) - { - inextPt.x = cvFloor(nextPt.x); - inextPt.y = cvFloor(nextPt.y); - - - if( inextPt.x < -winSize.width || inextPt.x >= J.cols || - inextPt.y < -winSize.height || inextPt.y >= J.rows ) - { - if( level == 0 && status ) - status[ptidx] = 3; - if (level > 0) - nextPts[ptidx] = backUpNextPt; - break; - } + } - iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); - iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); - iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); - iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; +#ifdef RLOF_SSE + float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4]; + _mm_store_ps(A11buf, qA11); + _mm_store_ps(A12buf, qA12); + _mm_store_ps(A22buf, qA22); + A11 += A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; + A12 += A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; + A22 += A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; +#endif - float _b1[4] = {0,0,0,0}; - float _b2[4] = {0,0,0,0}; - #ifdef RLOF_SSE - __m128 qbc0[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; - __m128 qbc1[4] = 
{_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; - qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); - qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); - #endif - for( y = 0; y < winSize.height; y++ ) - { - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x )*cn; - const short* Iptr = (const short*)(IWinBuf.data + ( y)*IWinBuf.step); - const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); - x = 0; - #ifdef RLOF_SSE - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - for( ; x <= winSize.width*cn; x += 8, dIptr += 8*2 ) - { - if( maskPtr[x ] == 0 && maskPtr[x+1] == 0 && maskPtr[x+2] == 0 && maskPtr[x+3] == 0 - && maskPtr[x+4] == 0 && maskPtr[x+5] == 0 && maskPtr[x+6] == 0 && maskPtr[x+7] == 0) - continue; - __m128i diff0 = _mm_loadu_si128((const __m128i*)(Iptr + x)); - __m128i v00 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x)), z); - __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); - __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); - __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); - - __m128i mmDiffc_epi16[4] = - { _mm_subs_epi16(_mm_slli_epi16(v00, 5), diff0), - _mm_subs_epi16(_mm_slli_epi16(v01, 5), diff0), - _mm_subs_epi16(_mm_slli_epi16(v10, 5), diff0), - _mm_subs_epi16(_mm_slli_epi16(v11, 5), diff0) - }; - - __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... 
- __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); - - if( x > winSize.width*cn - 8) - { - Ixy_0 = _mm_and_si128(Ixy_0, mmMask0); - Ixy_1 = _mm_and_si128(Ixy_1, mmMask1); - } + A11 *= FLT_SCALE; + A12 *= FLT_SCALE; + A22 *= FLT_SCALE; - __m128i diffc1[4] = - {_mm_unpackhi_epi16(mmDiffc_epi16[0],mmDiffc_epi16[0]), - _mm_unpackhi_epi16(mmDiffc_epi16[1],mmDiffc_epi16[1]), - _mm_unpackhi_epi16(mmDiffc_epi16[2],mmDiffc_epi16[2]), - _mm_unpackhi_epi16(mmDiffc_epi16[3],mmDiffc_epi16[3]) - }; - __m128i diffc0[4] = - {_mm_unpacklo_epi16(mmDiffc_epi16[0],mmDiffc_epi16[0]), - _mm_unpacklo_epi16(mmDiffc_epi16[1],mmDiffc_epi16[1]), - _mm_unpacklo_epi16(mmDiffc_epi16[2],mmDiffc_epi16[2]), - _mm_unpacklo_epi16(mmDiffc_epi16[3],mmDiffc_epi16[3]) - }; + float D = A11*A22 - A12*A12; + float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + + 4.f*A12*A12))/(2 * winArea); + if( err ) + err[ptidx] = (float)minEig; - // It * Ix It * Iy [0 ... 3] - //mask split for multiplication - // for set 1 lo sigma 1 + if( D < FLT_EPSILON ) + { + if( level == 0 && status ) + status[ptidx] = 0; + continue; + } + D = 1.f/D; - v10 = _mm_mullo_epi16(Ixy_0, diffc0[0]); - v11 = _mm_mulhi_epi16(Ixy_0, diffc0[0]); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qbc0[0] = _mm_add_ps(qbc0[0], _mm_cvtepi32_ps(v00)); - qbc1[0] = _mm_add_ps(qbc1[0], _mm_cvtepi32_ps(v10)); + cv::Point2f backUpNextPt = nextPt; + nextPt -= halfWin; + Point2f prevDelta(0,0); - v10 = _mm_mullo_epi16(Ixy_0, diffc0[1]); - v11 = _mm_mulhi_epi16(Ixy_0, diffc0[1]); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qbc0[1] = _mm_add_ps(qbc0[1], _mm_cvtepi32_ps(v00)); - qbc1[1] = _mm_add_ps(qbc1[1], _mm_cvtepi32_ps(v10)); + float c[8]; +#ifdef RLOF_SSE + __m128i mmMask0, mmMask1, mmMask; + getWBitMask(winSize.width, mmMask0, mmMask1, mmMask); +#endif + for( j = 0; j < criteria.maxCount; j++ ) + { + cv::Point2f delta; + bool hasSolved = false; + a = nextPt.x - 
cvFloor(nextPt.x); + b = nextPt.y - cvFloor(nextPt.y); + float ab = a * b; - v10 = _mm_mullo_epi16(Ixy_0, diffc0[2]); - v11 = _mm_mulhi_epi16(Ixy_0, diffc0[2]); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qbc0[2] = _mm_add_ps(qbc0[2], _mm_cvtepi32_ps(v00)); - qbc1[2] = _mm_add_ps(qbc1[2], _mm_cvtepi32_ps(v10)); - v10 = _mm_mullo_epi16(Ixy_0, diffc0[3]); - v11 = _mm_mulhi_epi16(Ixy_0, diffc0[3]); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qbc0[3] = _mm_add_ps(qbc0[3], _mm_cvtepi32_ps(v00)); - qbc1[3] = _mm_add_ps(qbc1[3], _mm_cvtepi32_ps(v10)); - // It * Ix It * Iy [4 ... 7] - // for set 1 hi sigma 1 + if( (inextPt.x != cvFloor(nextPt.x) || inextPt.y != cvFloor(nextPt.y) || j == 0)) + { + inextPt.x = cvFloor(nextPt.x); + inextPt.y = cvFloor(nextPt.y); - v10 = _mm_mullo_epi16(Ixy_1, diffc1[0]); - v11 = _mm_mulhi_epi16(Ixy_1, diffc1[0]); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qbc0[0] = _mm_add_ps(qbc0[0], _mm_cvtepi32_ps(v00)); - qbc1[0] = _mm_add_ps(qbc1[0], _mm_cvtepi32_ps(v10)); - v10 = _mm_mullo_epi16(Ixy_1, diffc1[1]); - v11 = _mm_mulhi_epi16(Ixy_1, diffc1[1]); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qbc0[1] = _mm_add_ps(qbc0[1], _mm_cvtepi32_ps(v00)); - qbc1[1] = _mm_add_ps(qbc1[1], _mm_cvtepi32_ps(v10)); + if( inextPt.x < -winSize.width || inextPt.x >= J.cols || + inextPt.y < -winSize.height || inextPt.y >= J.rows ) + { + if( level == 0 && status ) + status[ptidx] = 3; + if (level > 0) + nextPts[ptidx] = backUpNextPt; + break; + } - v10 = _mm_mullo_epi16(Ixy_1, diffc1[2]); - v11 = _mm_mulhi_epi16(Ixy_1, diffc1[2]); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qbc0[2] = _mm_add_ps(qbc0[2], _mm_cvtepi32_ps(v00)); - qbc1[2] = _mm_add_ps(qbc1[2], _mm_cvtepi32_ps(v10)); - v10 = _mm_mullo_epi16(Ixy_1, diffc1[3]); - v11 = _mm_mulhi_epi16(Ixy_1, diffc1[3]); - v00 = _mm_unpacklo_epi16(v10, 
v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qbc0[3] = _mm_add_ps(qbc0[3], _mm_cvtepi32_ps(v00)); - qbc1[3] = _mm_add_ps(qbc1[3], _mm_cvtepi32_ps(v10)); + iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - } - #else - for( ; x < winSize.width*cn; x++, dIptr += 2 ) + float _b1[4] = {0,0,0,0}; + float _b2[4] = {0,0,0,0}; +#ifdef RLOF_SSE + __m128 qbc0[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; + __m128 qbc1[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; + qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); +#endif + for( y = 0; y < winSize.height; y++ ) + { + const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x )*cn; + const short* Iptr = (const short*)(IWinBuf.data + ( y)*IWinBuf.step); + const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); + x = 0; +#ifdef RLOF_SSE + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + for( ; x <= winSize.width*cn; x += 8, dIptr += 8*2 ) + { + if( maskPtr[x ] == 0 && maskPtr[x+1] == 0 && maskPtr[x+2] == 0 && maskPtr[x+3] == 0 + && maskPtr[x+4] == 0 && maskPtr[x+5] == 0 && maskPtr[x+6] == 0 && maskPtr[x+7] == 0) + continue; + __m128i diff0 = _mm_loadu_si128((const __m128i*)(Iptr + x)); + __m128i v00 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x)), z); + __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); + __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); + __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); + + __m128i mmDiffc_epi16[4] = + { _mm_subs_epi16(_mm_slli_epi16(v00, 5), diff0), + _mm_subs_epi16(_mm_slli_epi16(v01, 5), diff0), + _mm_subs_epi16(_mm_slli_epi16(v10, 5), diff0), + 
_mm_subs_epi16(_mm_slli_epi16(v11, 5), diff0) + }; + + __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... + __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); + + if( x > winSize.width*cn - 8) { - if( dIptr[0] == 0 && dIptr[1] == 0) - continue; - /*short It[4] = {CV_DESCALE((Jptr[x]) *(1 << W_BITS),W_BITS1-5) - Iptr[x], - CV_DESCALE((Jptr[x+cn]) *(1 << W_BITS),W_BITS1-5) - Iptr[x], - CV_DESCALE((Jptr[x+step]) *(1 << W_BITS),W_BITS1-5) - Iptr[x], - CV_DESCALE((Jptr[x+step+cn])*(1 << W_BITS),W_BITS1-5) - Iptr[x]};*/ - short It[4] = {(Jptr[x] << 5) - Iptr[x], - (Jptr[x+cn]<< 5) - Iptr[x], - (Jptr[x+step]<< 5) - Iptr[x], - (Jptr[x+step+cn]<< 5) - Iptr[x]}; - - _b1[0] += (float)(It[0]*dIptr[0]); - _b1[1] += (float)(It[1]*dIptr[0]); - _b1[2] += (float)(It[2]*dIptr[0]); - _b1[3] += (float)(It[3]*dIptr[0]); - - _b2[0] += (float)(It[0]*dIptr[1]); - _b2[1] += (float)(It[1]*dIptr[1]); - _b2[2] += (float)(It[2]*dIptr[1]); - _b2[3] += (float)(It[3]*dIptr[1]); + Ixy_0 = _mm_and_si128(Ixy_0, mmMask0); + Ixy_1 = _mm_and_si128(Ixy_1, mmMask1); } - #endif + __m128i diffc1[4] = + {_mm_unpackhi_epi16(mmDiffc_epi16[0],mmDiffc_epi16[0]), + _mm_unpackhi_epi16(mmDiffc_epi16[1],mmDiffc_epi16[1]), + _mm_unpackhi_epi16(mmDiffc_epi16[2],mmDiffc_epi16[2]), + _mm_unpackhi_epi16(mmDiffc_epi16[3],mmDiffc_epi16[3]) + }; + + __m128i diffc0[4] = + {_mm_unpacklo_epi16(mmDiffc_epi16[0],mmDiffc_epi16[0]), + _mm_unpacklo_epi16(mmDiffc_epi16[1],mmDiffc_epi16[1]), + _mm_unpacklo_epi16(mmDiffc_epi16[2],mmDiffc_epi16[2]), + _mm_unpacklo_epi16(mmDiffc_epi16[3],mmDiffc_epi16[3]) + }; + + + // It * Ix It * Iy [0 ... 
3] + //mask split for multiplication + // for set 1 lo sigma 1 + + + v10 = _mm_mullo_epi16(Ixy_0, diffc0[0]); + v11 = _mm_mulhi_epi16(Ixy_0, diffc0[0]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[0] = _mm_add_ps(qbc0[0], _mm_cvtepi32_ps(v00)); + qbc1[0] = _mm_add_ps(qbc1[0], _mm_cvtepi32_ps(v10)); + + v10 = _mm_mullo_epi16(Ixy_0, diffc0[1]); + v11 = _mm_mulhi_epi16(Ixy_0, diffc0[1]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[1] = _mm_add_ps(qbc0[1], _mm_cvtepi32_ps(v00)); + qbc1[1] = _mm_add_ps(qbc1[1], _mm_cvtepi32_ps(v10)); + + v10 = _mm_mullo_epi16(Ixy_0, diffc0[2]); + v11 = _mm_mulhi_epi16(Ixy_0, diffc0[2]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[2] = _mm_add_ps(qbc0[2], _mm_cvtepi32_ps(v00)); + qbc1[2] = _mm_add_ps(qbc1[2], _mm_cvtepi32_ps(v10)); + + v10 = _mm_mullo_epi16(Ixy_0, diffc0[3]); + v11 = _mm_mulhi_epi16(Ixy_0, diffc0[3]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[3] = _mm_add_ps(qbc0[3], _mm_cvtepi32_ps(v00)); + qbc1[3] = _mm_add_ps(qbc1[3], _mm_cvtepi32_ps(v10)); + // It * Ix It * Iy [4 ... 
7] + // for set 1 hi sigma 1 + + v10 = _mm_mullo_epi16(Ixy_1, diffc1[0]); + v11 = _mm_mulhi_epi16(Ixy_1, diffc1[0]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[0] = _mm_add_ps(qbc0[0], _mm_cvtepi32_ps(v00)); + qbc1[0] = _mm_add_ps(qbc1[0], _mm_cvtepi32_ps(v10)); + + v10 = _mm_mullo_epi16(Ixy_1, diffc1[1]); + v11 = _mm_mulhi_epi16(Ixy_1, diffc1[1]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[1] = _mm_add_ps(qbc0[1], _mm_cvtepi32_ps(v00)); + qbc1[1] = _mm_add_ps(qbc1[1], _mm_cvtepi32_ps(v10)); + + v10 = _mm_mullo_epi16(Ixy_1, diffc1[2]); + v11 = _mm_mulhi_epi16(Ixy_1, diffc1[2]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[2] = _mm_add_ps(qbc0[2], _mm_cvtepi32_ps(v00)); + qbc1[2] = _mm_add_ps(qbc1[2], _mm_cvtepi32_ps(v10)); + + v10 = _mm_mullo_epi16(Ixy_1, diffc1[3]); + v11 = _mm_mulhi_epi16(Ixy_1, diffc1[3]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[3] = _mm_add_ps(qbc0[3], _mm_cvtepi32_ps(v00)); + qbc1[3] = _mm_add_ps(qbc1[3], _mm_cvtepi32_ps(v10)); + + } +#else + for( ; x < winSize.width*cn; x++, dIptr += 2 ) + { + if( dIptr[0] == 0 && dIptr[1] == 0) + continue; + /*short It[4] = {CV_DESCALE((Jptr[x]) *(1 << W_BITS),W_BITS1-5) - Iptr[x], + CV_DESCALE((Jptr[x+cn]) *(1 << W_BITS),W_BITS1-5) - Iptr[x], + CV_DESCALE((Jptr[x+step]) *(1 << W_BITS),W_BITS1-5) - Iptr[x], + CV_DESCALE((Jptr[x+step+cn])*(1 << W_BITS),W_BITS1-5) - Iptr[x]};*/ + short It[4] = {(Jptr[x] << 5) - Iptr[x], + (Jptr[x+cn]<< 5) - Iptr[x], + (Jptr[x+step]<< 5) - Iptr[x], + (Jptr[x+step+cn]<< 5) - Iptr[x]}; + + _b1[0] += (float)(It[0]*dIptr[0]); + _b1[1] += (float)(It[1]*dIptr[0]); + _b1[2] += (float)(It[2]*dIptr[0]); + _b1[3] += (float)(It[3]*dIptr[0]); + + _b2[0] += (float)(It[0]*dIptr[1]); + _b2[1] += (float)(It[1]*dIptr[1]); + _b2[2] += (float)(It[2]*dIptr[1]); + _b2[3] += (float)(It[3]*dIptr[1]); } +#endif - #ifdef RLOF_SSE - float 
CV_DECL_ALIGNED(16) bbuf[4]; - _mm_store_ps(bbuf, _mm_add_ps(qbc0[0], qbc1[0])); - _b1[0] += bbuf[0] + bbuf[2]; - _b2[0] += bbuf[1] + bbuf[3]; - - _mm_store_ps(bbuf, _mm_add_ps(qbc0[1], qbc1[1])); - _b1[1] += bbuf[0] + bbuf[2]; - _b2[1] += bbuf[1] + bbuf[3]; - - _mm_store_ps(bbuf, _mm_add_ps(qbc0[2], qbc1[2])); - _b1[2] += bbuf[0] + bbuf[2]; - _b2[2] += bbuf[1] + bbuf[3]; - - _mm_store_ps(bbuf, _mm_add_ps(qbc0[3], qbc1[3])); - _b1[3] += bbuf[0] + bbuf[2]; - _b2[3] += bbuf[1] + bbuf[3]; - - #endif - _b1[0] *= FLT_SCALE; - _b1[1] *= FLT_SCALE; - _b1[2] *= FLT_SCALE; - _b1[3] *= FLT_SCALE; - _b2[0] *= FLT_SCALE; - _b2[1] *= FLT_SCALE; - _b2[2] *= FLT_SCALE; - _b2[3] *= FLT_SCALE; - - float c0[4] = { _b1[3] + _b1[0] - _b1[2] - _b1[1], - _b1[1] - _b1[0], - _b1[2] - _b1[0], - _b1[0]}; - float c1[4] = { _b2[3] + _b2[0] - _b2[2] - _b2[1], - _b2[1] - _b2[0], - _b2[2] - _b2[0], - _b2[0]}; - - float DA12 = A12 * D ; - float DA22 = A22 * D ; - float DA11 = A11 * D ; - c[0] = DA12 * c1[0] - DA22 * c0[0]; - c[1] = DA12 * c1[1] - DA22 * c0[1]; - c[2] = DA12 * c1[2] - DA22 * c0[2]; - c[3] = DA12 * c1[3] - DA22 * c0[3]; - c[4] = DA12 * c0[0] - DA11 * c1[0]; - c[5] = DA12 * c0[1] - DA11 * c1[1]; - c[6] = DA12 * c0[2] - DA11 * c1[2]; - c[7] = DA12 * c0[3] - DA11 * c1[3]; + } - float e0 = 1.f / (c[6] * c[0] - c[4] * c[2]); - float e1 = e0 * 0.5f * (c[6] * c[1] + c[7] * c[0] - c[5] * c[2] - c[4] * c[3]); - float e2 = e0 * (c[1] * c[7] -c[3] * c[5]); - e0 = e1 * e1 - e2; - if ( e0 >= 0) - { - e0 = sqrt(e0); - float _y[2] = {-e1 - e0, e0 - e1}; - float c0yc1[2] = {c[0] * _y[0] + c[1], - c[0] * _y[1] + c[1]}; +#ifdef RLOF_SSE + float CV_DECL_ALIGNED(16) bbuf[4]; + _mm_store_ps(bbuf, _mm_add_ps(qbc0[0], qbc1[0])); + _b1[0] += bbuf[0] + bbuf[2]; + _b2[0] += bbuf[1] + bbuf[3]; - float _x[2] = {-(c[2] * _y[0] + c[3]) / c0yc1[0], - -(c[2] * _y[1] + c[3]) / c0yc1[1]}; + _mm_store_ps(bbuf, _mm_add_ps(qbc0[1], qbc1[1])); + _b1[1] += bbuf[0] + bbuf[2]; + _b2[1] += bbuf[1] + bbuf[3]; - bool isIn1 = 
(_x[0] >=0 && _x[0] <=1 && _y[0] >= 0 && _y[0] <= 1); - bool isIn2 = (_x[1] >=0 && _x[1] <=1 && _y[1] >= 0 && _y[1] <= 1); + _mm_store_ps(bbuf, _mm_add_ps(qbc0[2], qbc1[2])); + _b1[2] += bbuf[0] + bbuf[2]; + _b2[2] += bbuf[1] + bbuf[3]; + _mm_store_ps(bbuf, _mm_add_ps(qbc0[3], qbc1[3])); + _b1[3] += bbuf[0] + bbuf[2]; + _b2[3] += bbuf[1] + bbuf[3]; - bool isSink1 = isIn1 && checkSolution(_x[0], _y[0], c ); - bool isSink2 = isIn2 && checkSolution(_x[1], _y[1], c ); +#endif + _b1[0] *= FLT_SCALE; + _b1[1] *= FLT_SCALE; + _b1[2] *= FLT_SCALE; + _b1[3] *= FLT_SCALE; + _b2[0] *= FLT_SCALE; + _b2[1] *= FLT_SCALE; + _b2[2] *= FLT_SCALE; + _b2[3] *= FLT_SCALE; + + float c0[4] = { _b1[3] + _b1[0] - _b1[2] - _b1[1], + _b1[1] - _b1[0], + _b1[2] - _b1[0], + _b1[0]}; + float c1[4] = { _b2[3] + _b2[0] - _b2[2] - _b2[1], + _b2[1] - _b2[0], + _b2[2] - _b2[0], + _b2[0]}; + + float DA12 = A12 * D ; + float DA22 = A22 * D ; + float DA11 = A11 * D ; + c[0] = DA12 * c1[0] - DA22 * c0[0]; + c[1] = DA12 * c1[1] - DA22 * c0[1]; + c[2] = DA12 * c1[2] - DA22 * c0[2]; + c[3] = DA12 * c1[3] - DA22 * c0[3]; + c[4] = DA12 * c0[0] - DA11 * c1[0]; + c[5] = DA12 * c0[1] - DA11 * c1[1]; + c[6] = DA12 * c0[2] - DA11 * c1[2]; + c[7] = DA12 * c0[3] - DA11 * c1[3]; + + float e0 = 1.f / (c[6] * c[0] - c[4] * c[2]); + float e1 = e0 * 0.5f * (c[6] * c[1] + c[7] * c[0] - c[5] * c[2] - c[4] * c[3]); + float e2 = e0 * (c[1] * c[7] -c[3] * c[5]); + e0 = e1 * e1 - e2; + if ( e0 >= 0) + { + e0 = sqrt(e0); + float _y[2] = {-e1 - e0, e0 - e1}; + float c0yc1[2] = {c[0] * _y[0] + c[1], + c[0] * _y[1] + c[1]}; + float _x[2] = {-(c[2] * _y[0] + c[3]) / c0yc1[0], + -(c[2] * _y[1] + c[3]) / c0yc1[1]}; - //if( isSink1 != isSink2 ) - { - if( isSink1 ) - { - hasSolved = true; - delta.x = inextPt.x + _x[0] - nextPt.x; - delta.y = inextPt.y + _y[0] - nextPt.y; - } - if (isSink2 ) - { - hasSolved = true; - delta.x = inextPt.x + _x[1] - nextPt.x; - delta.y = inextPt.y + _y[1] - nextPt.y; - } - } // isIn1 != isIn2 - } // e0 >= 
0 - } - else - hasSolved = false; - if(hasSolved == false) - { - delta = Point2f( c[0] * ab + c[1] * a + c[2] * b + c[3], - c[4] * ab + c[5] * a + c[6] * b + c[7]); - nextPt += 0.7 * delta; - nextPts[ptidx] = nextPt + halfWin; - } - else - { - nextPt += delta; - nextPts[ptidx] = nextPt + halfWin; - break; - } + bool isIn1 = (_x[0] >=0 && _x[0] <=1 && _y[0] >= 0 && _y[0] <= 1); + bool isIn2 = (_x[1] >=0 && _x[1] <=1 && _y[1] >= 0 && _y[1] <= 1); - if( delta.ddot(delta) <= criteria.epsilon) - break; + bool isSink1 = isIn1 && checkSolution(_x[0], _y[0], c ); + bool isSink2 = isIn2 && checkSolution(_x[1], _y[1], c ); - if(j > 0 && std::abs(delta.x - prevDelta.x) < 0.01 && - std::abs(delta.y - prevDelta.y) < 0.01) + + //if( isSink1 != isSink2 ) { - nextPts[ptidx] -= delta*0.35f; - break; - } + if( isSink1 ) + { + hasSolved = true; + delta.x = inextPt.x + _x[0] - nextPt.x; + delta.y = inextPt.y + _y[0] - nextPt.y; + } + if (isSink2 ) + { + hasSolved = true; + delta.x = inextPt.x + _x[1] - nextPt.x; + delta.y = inextPt.y + _y[1] - nextPt.y; + } + } // isIn1 != isIn2 + } // e0 >= 0 + } + else + hasSolved = false; + if(hasSolved == false) + { + delta = Point2f( c[0] * ab + c[1] * a + c[2] * b + c[3], + c[4] * ab + c[5] * a + c[6] * b + c[7]); + nextPt += 0.7 * delta; + nextPts[ptidx] = nextPt + halfWin; + } + else + { + nextPt += delta; + nextPts[ptidx] = nextPt + halfWin; + break; + } - prevDelta = delta; + if( delta.ddot(delta) <= criteria.epsilon) + break; + + if(j > 0 && std::abs(delta.x - prevDelta.x) < 0.01 && + std::abs(delta.y - prevDelta.y) < 0.01) + { + nextPts[ptidx] -= delta*0.35f; + break; } + prevDelta = delta; + } + } + } - const Mat* prevImg; - const Mat* nextImg; - const Mat* prevDeriv; - const Mat* rgbPrevImg; - const Mat* rgbNextImg; - const Point2f* prevPts; - Point2f* nextPts; - uchar* status; - float* err; - int maxWinSize; - int minWinSize; - TermCriteria criteria; - int level; - int maxLevel; - int windowType; - float minEigThreshold; - bool 
useInitialFlow; - int crossSegmentationThreshold; - }; -} -} -} -} + const Mat* prevImg; + const Mat* nextImg; + const Mat* prevDeriv; + const Mat* rgbPrevImg; + const Mat* rgbNextImg; + const Point2f* prevPts; + Point2f* nextPts; + uchar* status; + float* err; + int maxWinSize; + int minWinSize; + TermCriteria criteria; + int level; + int maxLevel; + int windowType; + float minEigThreshold; + bool useInitialFlow; + int crossSegmentationThreshold; +}; + +}}}} // namespace #endif diff --git a/modules/optflow/src/rlof/geo_interpolation.cpp b/modules/optflow/src/rlof/geo_interpolation.cpp index f96e3914cff..048f65812d0 100644 --- a/modules/optflow/src/rlof/geo_interpolation.cpp +++ b/modules/optflow/src/rlof/geo_interpolation.cpp @@ -5,482 +5,480 @@ #include "geo_interpolation.hpp" #include #include -namespace cv -{ -namespace optflow +namespace cv { +namespace optflow { + +inline double K_h1(float x, double h2) { + return exp(-(0.5 / (h2)) * x); +} - inline double K_h1(float x, double h2) - { - return exp(-(0.5 / (h2)) * x); +struct Graph_helper { + int * mem; + int e_size; + Graph_helper(int k, int num_nodes) { + e_size = (2 * k + 1); + mem = (int*)malloc(sizeof(int) * e_size * num_nodes); + memset(mem, 0, sizeof(int) * e_size * num_nodes); } - - struct Graph_helper { - int * mem; - int e_size; - Graph_helper(int k, int num_nodes) { - e_size = (2 * k + 1); - mem = (int*)malloc(sizeof(int) * e_size * num_nodes); - memset(mem, 0, sizeof(int) * e_size * num_nodes); - } - inline int size(int id) { - int r_addr = id * (e_size); - return ((int*)mem)[r_addr]; - } - inline int * data(int id) { - int r_addr = id * (e_size)+1; - return mem + r_addr; - } - inline void add(int id, std::pair data) { - int r_addr = id * (e_size); - int size = ++((int*)mem)[r_addr]; - r_addr += 2 * size - 1;//== 1 + 2*(size-1); - ((float*)mem)[r_addr] = data.first; - ((int*)mem)[r_addr + 1] = data.second; - } - inline bool color_in_target(int id, int color) { - int r_addr = id * (e_size); - int 
size = (((int*)mem)[r_addr]); - r_addr += 2; - for (int i = 0; i < size; i++) { - if (((int*)mem)[r_addr] == color) { - return true; - } - r_addr += 2; + inline int size(int id) { + int r_addr = id * (e_size); + return ((int*)mem)[r_addr]; + } + inline int * data(int id) { + int r_addr = id * (e_size)+1; + return mem + r_addr; + } + inline void add(int id, std::pair data) { + int r_addr = id * (e_size); + int size = ++((int*)mem)[r_addr]; + r_addr += 2 * size - 1;//== 1 + 2*(size-1); + ((float*)mem)[r_addr] = data.first; + ((int*)mem)[r_addr + 1] = data.second; + } + inline bool color_in_target(int id, int color) { + int r_addr = id * (e_size); + int size = (((int*)mem)[r_addr]); + r_addr += 2; + for (int i = 0; i < size; i++) { + if (((int*)mem)[r_addr] == color) { + return true; } - return false; + r_addr += 2; } + return false; + } + +}; + +Mat sgeo_dist(const Mat& gra, int y, int x, float max, Mat &prev) +{ + std::vector points; + points.push_back(Point2f(static_cast(x), static_cast(y))); + return sgeo_dist(gra, points, max, prev); +} +Mat sgeo_dist(const Mat& gra, const std::vector & points, float max, Mat &prev) +{ + int Dx[] = { -1,0,1,-1,1,-1,0,1 }; + int Dy[] = { -1,-1,-1,0,0,1,1,1 }; + Mat dm(gra.rows, gra.cols, CV_32F, Scalar(max)); + prev = Mat(gra.rows, gra.cols, CV_8U, Scalar(255)); - }; + std::multimap not_visited_with_value; - Mat sgeo_dist(const Mat& gra, int y, int x, float max, Mat &prev) + for (auto i = points.begin(); i != points.end(); i++) { - std::vector points; - points.push_back(Point2f(static_cast(x), static_cast(y))); - return sgeo_dist(gra, points, max, prev); + int y = static_cast(i->y); + int x = static_cast(i->x); + not_visited_with_value.insert(std::pair(0.f, Vec2i(y, x))); + dm.at(y, x) = 0; } - Mat sgeo_dist(const Mat& gra, const std::vector & points, float max, Mat &prev) + + bool done = false; + while (!done) { - int Dx[] = { -1,0,1,-1,1,-1,0,1 }; - int Dy[] = { -1,-1,-1,0,0,1,1,1 }; - Mat dm(gra.rows, gra.cols, CV_32F, 
Scalar(max)); - prev = Mat(gra.rows, gra.cols, CV_8U, Scalar(255)); + if (not_visited_with_value.begin() == not_visited_with_value.end()) { + done = true; + break; + } + std::multimap::iterator current_it = not_visited_with_value.begin(); + std::pair current_p = *current_it; + not_visited_with_value.erase(current_it); - std::multimap not_visited_with_value; + int y = current_p.second[0]; + int x = current_p.second[1]; + float cur_d = current_p.first; - for (auto i = points.begin(); i != points.end(); i++) - { - int y = static_cast(i->y); - int x = static_cast(i->x); - not_visited_with_value.insert(std::pair(0.f, Vec2i(y, x))); - dm.at(y, x) = 0; + if (dm.at(y, x) != cur_d) { + continue; } - bool done = false; - while (!done) - { - if (not_visited_with_value.begin() == not_visited_with_value.end()) { - done = true; - break; - } - std::multimap::iterator current_it = not_visited_with_value.begin(); - std::pair current_p = *current_it; - not_visited_with_value.erase(current_it); - - int y = current_p.second[0]; - int x = current_p.second[1]; - float cur_d = current_p.first; + Vec8f gra_e = gra.at(y, x); - if (dm.at(y, x) != cur_d) { + for (int i = 0; i < 8; i++) { + if (gra_e[i] < 0) { continue; } + int dx = Dx[i]; + int dy = Dy[i]; - Vec8f gra_e = gra.at(y, x); - - for (int i = 0; i < 8; i++) { - if (gra_e[i] < 0) { - continue; - } - int dx = Dx[i]; - int dy = Dy[i]; - - if (dm.at(y + dy, x + dx) > cur_d + gra_e[i]) { - dm.at(y + dy, x + dx) = cur_d + gra_e[i]; - prev.at(y + dy, x + dx) = static_cast(7 - i); - not_visited_with_value.insert(std::pair(cur_d + gra_e[i], Vec2i(y + dy, x + dx))); - } - + if (dm.at(y + dy, x + dx) > cur_d + gra_e[i]) { + dm.at(y + dy, x + dx) = cur_d + gra_e[i]; + prev.at(y + dy, x + dx) = static_cast(7 - i); + not_visited_with_value.insert(std::pair(cur_d + gra_e[i], Vec2i(y + dy, x + dx))); } + } + } - return dm; - } + return dm; +} - Mat interpolate_irregular_nn_raster(const std::vector & prevPoints, - const std::vector & nextPoints, - 
const std::vector & status, - const Mat & i1) +Mat interpolate_irregular_nn_raster(const std::vector & prevPoints, + const std::vector & nextPoints, + const std::vector & status, + const Mat & i1) +{ + Mat gra = getGraph(i1, 0.1f); + int Dx[] = { -1,0,1,-1,1,-1,0,1 }; + int Dy[] = { -1,-1,-1,0,0,1,1,1 }; + int max_rounds = 10; + Mat dirt = Mat(gra.rows, gra.cols, CV_8U, Scalar(0)); + Mat quellknoten = Mat(gra.rows, gra.cols, CV_32S, Scalar(-1)); + Mat dist = Mat(gra.rows, gra.cols, CV_32F, Scalar(std::numeric_limits::max())); + /* + * assign quellknoten ids. + */ + for (int i = 0; i < static_cast(prevPoints.size()); i++) { - Mat gra = getGraph(i1, 0.1f); - int Dx[] = { -1,0,1,-1,1,-1,0,1 }; - int Dy[] = { -1,-1,-1,0,0,1,1,1 }; - int max_rounds = 10; - Mat dirt = Mat(gra.rows, gra.cols, CV_8U, Scalar(0)); - Mat quellknoten = Mat(gra.rows, gra.cols, CV_32S, Scalar(-1)); - Mat dist = Mat(gra.rows, gra.cols, CV_32F, Scalar(std::numeric_limits::max())); + int x = (int)prevPoints[i].x; + int y = (int)prevPoints[i].y; + if (status[i] == 0) + continue; + dirt.at(y, x) = 1; + dist.at(y, x) = 0; + quellknoten.at(y, x) = i; + } + + bool clean = true; + bool done = false; + int x = 0; + int y = 0; + int rounds = 0; + while (!done) { /* - * assign quellknoten ids. 
+ * Update x and y + * on even rounds go rasterscanorder , on odd round inverse rasterscanorder */ - for (int i = 0; i < static_cast(prevPoints.size()); i++) - { - int x = (int)prevPoints[i].x; - int y = (int)prevPoints[i].y; - if (status[i] == 0) - continue; - dirt.at(y, x) = 1; - dist.at(y, x) = 0; - quellknoten.at(y, x) = i; - } - - bool clean = true; - bool done = false; - int x = 0; - int y = 0; - int rounds = 0; - while (!done) { - /* - * Update x and y - * on even rounds go rasterscanorder , on odd round inverse rasterscanorder - */ - if (rounds % 2 == 0) { - x++; - if (x >= gra.cols) { - x = 0; - y++; - if (y >= gra.rows) { - y = 0; - rounds++; - y = gra.rows - 1; - x = gra.cols - 1; - if (rounds >= max_rounds || clean) { - done = true; - break; - } + if (rounds % 2 == 0) { + x++; + if (x >= gra.cols) { + x = 0; + y++; + if (y >= gra.rows) { + y = 0; + rounds++; + y = gra.rows - 1; + x = gra.cols - 1; + if (rounds >= max_rounds || clean) { + done = true; + break; } } } - else { - x--; - if (x < 0) { - x = gra.cols - 1; - y--; - if (y < 0) { - y = gra.rows - 1; - rounds++; - y = 0; - x = 0; - if (rounds >= max_rounds || clean) { - done = true; - break; - } + } + else { + x--; + if (x < 0) { + x = gra.cols - 1; + y--; + if (y < 0) { + y = gra.rows - 1; + rounds++; + y = 0; + x = 0; + if (rounds >= max_rounds || clean) { + done = true; + break; } } } - if (dirt.at(y, x) == 0) { - continue; - } - dirt.at(y, x) = 0; + } + if (dirt.at(y, x) == 0) { + continue; + } + dirt.at(y, x) = 0; - float c_dist = dist.at(y, x); - Vec8f gra_e = gra.at(y, x); + float c_dist = dist.at(y, x); + Vec8f gra_e = gra.at(y, x); - for (int i = 0; i < 8; i++) { - int tx = Dx[i]; - int ty = Dy[i]; - if (ty == 0 && tx == 0) { - continue; - } - if (x + tx < 0 || x + tx >= gra.cols) { - continue; - } - if (y + ty < 0 || y + ty >= gra.rows) { - continue; - } - if (c_dist > dist.at(y + ty, x + tx)) { - if (c_dist > dist.at(y + ty, x + tx) + gra_e[i]) { - quellknoten.at(y, x) = 
quellknoten.at(y + ty, x + tx); - dist.at(y, x) = dist.at(y + ty, x + tx) + gra_e[i]; - dirt.at(y, x) = 1; - clean = false; - } + for (int i = 0; i < 8; i++) { + int tx = Dx[i]; + int ty = Dy[i]; + if (ty == 0 && tx == 0) { + continue; + } + if (x + tx < 0 || x + tx >= gra.cols) { + continue; + } + if (y + ty < 0 || y + ty >= gra.rows) { + continue; + } + if (c_dist > dist.at(y + ty, x + tx)) { + if (c_dist > dist.at(y + ty, x + tx) + gra_e[i]) { + quellknoten.at(y, x) = quellknoten.at(y + ty, x + tx); + dist.at(y, x) = dist.at(y + ty, x + tx) + gra_e[i]; + dirt.at(y, x) = 1; + clean = false; } - else { - if (c_dist + gra_e[i] < dist.at(y + ty, x + tx)) { - quellknoten.at(y + ty, x + tx) = quellknoten.at(y, x); - dist.at(y + ty, x + tx) = dist.at(y, x) + gra_e[i]; - dirt.at(y + ty, x + tx) = 1; - clean = false; - } + } + else { + if (c_dist + gra_e[i] < dist.at(y + ty, x + tx)) { + quellknoten.at(y + ty, x + tx) = quellknoten.at(y, x); + dist.at(y + ty, x + tx) = dist.at(y, x) + gra_e[i]; + dirt.at(y + ty, x + tx) = 1; + clean = false; } } + } - } + } - Mat nnFlow(i1.rows, i1.cols, CV_32FC2, Scalar(0)); - for (y = 0; y < i1.rows; y++) { - for (x = 0; x < i1.cols; x++) { + Mat nnFlow(i1.rows, i1.cols, CV_32FC2, Scalar(0)); + for (y = 0; y < i1.rows; y++) { + for (x = 0; x < i1.cols; x++) { - int id = quellknoten.at(y, x); - if (id != -1) - { - nnFlow.at(y, x) = nextPoints[id] - prevPoints[id]; - } + int id = quellknoten.at(y, x); + if (id != -1) + { + nnFlow.at(y, x) = nextPoints[id] - prevPoints[id]; } } - return nnFlow; } + return nnFlow; +} - Mat interpolate_irregular_knn( - const std::vector & _prevPoints, - const std::vector & _nextPoints, - const std::vector & status, - const Mat &color_img, - int k, - float pixeldistance) - { - Mat in(color_img.rows, color_img.cols, CV_32FC2); - Mat mask = Mat::zeros(color_img.rows, color_img.cols, CV_8UC1); +Mat interpolate_irregular_knn( + const std::vector & _prevPoints, + const std::vector & _nextPoints, + const 
std::vector & status, + const Mat &color_img, + int k, + float pixeldistance) +{ + Mat in(color_img.rows, color_img.cols, CV_32FC2); + Mat mask = Mat::zeros(color_img.rows, color_img.cols, CV_8UC1); - for (unsigned n = 0; n < _prevPoints.size(); n++) + for (unsigned n = 0; n < _prevPoints.size(); n++) + { + if (_prevPoints[n].x >= 0 && _prevPoints[n].y >= 0 && _prevPoints[n].x < color_img.cols && _prevPoints[n].y < color_img.rows) { - if (_prevPoints[n].x >= 0 && _prevPoints[n].y >= 0 && _prevPoints[n].x < color_img.cols && _prevPoints[n].y < color_img.rows) - { - in.at(_prevPoints[n]) = _nextPoints[n] - _prevPoints[n]; - mask.at(_prevPoints[n]) = status[n]; - } - + in.at(_prevPoints[n]) = _nextPoints[n] - _prevPoints[n]; + mask.at(_prevPoints[n]) = status[n]; } - int Dx[] = { -1,0,1,-1,1,-1,0,1 }; - int Dy[] = { -1,-1,-1,0,0,1,1,1 }; - Mat gra = getGraph(color_img, pixeldistance); - Mat nnFlow(in.rows, in.cols, CV_32FC2, Scalar(0)); - std::multimap my_agents; // > - Graph_helper graph_helper(k, in.rows*in.cols); //< arrivaltime, color> + } + int Dx[] = { -1,0,1,-1,1,-1,0,1 }; + int Dy[] = { -1,-1,-1,0,0,1,1,1 }; + Mat gra = getGraph(color_img, pixeldistance); + Mat nnFlow(in.rows, in.cols, CV_32FC2, Scalar(0)); + std::multimap my_agents; // > + Graph_helper graph_helper(k, in.rows*in.cols); //< arrivaltime, color> - int color = 0; - std::vector flow_point_list; - for (int y = 0; y < in.rows; y++) { - for (int x = 0; x < in.cols; x++) { - if (mask.at(y, x) > 0) { - flow_point_list.push_back(Vec2i(y, x)); - nnFlow.at(y, x) = in.at(y, x); - int v_id = (y * in.cols + x); - graph_helper.add(v_id, std::pair(0.f, color)); + int color = 0; + std::vector flow_point_list; + for (int y = 0; y < in.rows; y++) { + for (int x = 0; x < in.cols; x++) { + if (mask.at(y, x) > 0) { + flow_point_list.push_back(Vec2i(y, x)); + nnFlow.at(y, x) = in.at(y, x); + int v_id = (y * in.cols + x); + graph_helper.add(v_id, std::pair(0.f, color)); - Vec8f gra_e = gra.at(y, x); - for (int i = 0; 
i < 8; i++) { - if (gra_e[i] < 0) - continue; - int dx = Dx[i]; - int dy = Dy[i]; - int target = (y + dy) * in.cols + (x + dx); - Vec2i agent(target, color); - my_agents.insert(std::pair(gra_e[i], agent)); - } - color++; + Vec8f gra_e = gra.at(y, x); + for (int i = 0; i < 8; i++) { + if (gra_e[i] < 0) + continue; + int dx = Dx[i]; + int dy = Dy[i]; + int target = (y + dy) * in.cols + (x + dx); + Vec2i agent(target, color); + my_agents.insert(std::pair(gra_e[i], agent)); + } + color++; } } + } - int global_time = 0; + int global_time = 0; - bool done = false; - while (!done) { - if (my_agents.size() == 0) { - done = true; - break; - } - global_time++; + bool done = false; + while (!done) { + if (my_agents.size() == 0) { + done = true; + break; + } + global_time++; - std::multimap::iterator current_it = my_agents.begin(); - std::pair current_p = *current_it; - my_agents.erase(current_it); + std::multimap::iterator current_it = my_agents.begin(); + std::pair current_p = *current_it; + my_agents.erase(current_it); - int target = current_p.second[0]; - color = current_p.second[1]; - float arriv_time = current_p.first; + int target = current_p.second[0]; + color = current_p.second[1]; + float arriv_time = current_p.first; + + Vec8f gra_e = gra.at(target);// (y*cols+x) + if (graph_helper.size(target) >= k) { + continue; + } - Vec8f gra_e = gra.at(target);// (y*cols+x) - if (graph_helper.size(target) >= k) { + bool color_found_in_target = graph_helper.color_in_target(target, color); + if (color_found_in_target) { + continue; + } + graph_helper.add(target, std::pair(arriv_time, color)); + + + for (int i = 0; i < 8; i++) { + if (gra_e[i] < 0) continue; - } - bool color_found_in_target = graph_helper.color_in_target(target, color); + int dx = Dx[i]; + int dy = Dy[i]; + int new_target = target + dx + (dy*in.cols); + if (graph_helper.size(new_target) >= k) { + continue; + } + color_found_in_target = graph_helper.color_in_target(new_target, color); if (color_found_in_target) { 
continue; } - graph_helper.add(target, std::pair(arriv_time, color)); - - - for (int i = 0; i < 8; i++) { - if (gra_e[i] < 0) - continue; + Vec2i new_agent(new_target, color); + my_agents.insert(std::pair(arriv_time + gra_e[i], new_agent)); - int dx = Dx[i]; - int dy = Dy[i]; - int new_target = target + dx + (dy*in.cols); - if (graph_helper.size(new_target) >= k) { - continue; - } - color_found_in_target = graph_helper.color_in_target(new_target, color); - if (color_found_in_target) { - continue; - } - Vec2i new_agent(new_target, color); - my_agents.insert(std::pair(arriv_time + gra_e[i], new_agent)); - - } } + } - Mat ret(in.rows, in.cols*k, CV_32FC2); - for (int y = 0; y < in.rows; y++) { - for (int x = 0; x < in.cols; x++) { - for (int i = 0; i < k; i++) { - float dist = *((float*)(graph_helper.data(y*in.cols + x) + 2 * i)); - float id = *((float*)(graph_helper.data(y*in.cols + x) + 2 * i + 1)); - ret.at(y, k*x + i) = Vec2f(dist, id); - } + Mat ret(in.rows, in.cols*k, CV_32FC2); + for (int y = 0; y < in.rows; y++) { + for (int x = 0; x < in.cols; x++) { + for (int i = 0; i < k; i++) { + float dist = *((float*)(graph_helper.data(y*in.cols + x) + 2 * i)); + float id = *((float*)(graph_helper.data(y*in.cols + x) + 2 * i + 1)); + ret.at(y, k*x + i) = Vec2f(dist, id); } } - return ret; + } + return ret; /* - for (int y = 0; y < in.rows; y++) { - for (int x = 0; x < in.cols; x++) { - int v_id = y * in.cols + x; - std::vector > flow_s(k); + for (int y = 0; y < in.rows; y++) { + for (int x = 0; x < in.cols; x++) { + int v_id = y * in.cols + x; + std::vector > flow_s(k); - for (int i = 0; i < k; i++) { - color = *((int*)(graph_helper.data(v_id) + 1 + 2 * i)); - int cy = flow_point_list[color][0]; - int cx = flow_point_list[color][1]; - Vec2f flow = in.at(cy, cx); - flow_s[i] = std::make_pair(static_cast(norm(flow)), flow); + for (int i = 0; i < k; i++) { + color = *((int*)(graph_helper.data(v_id) + 1 + 2 * i)); + int cy = flow_point_list[color][0]; + int cx = 
flow_point_list[color][1]; + Vec2f flow = in.at(cy, cx); + flow_s[i] = std::make_pair(static_cast(norm(flow)), flow); - } + } - nnFlow.at(y, x) = std::min_element(flow_s.begin(), flow_s.end(), [](const std::pair &left, const std::pair &right) { - return left.first < right.first; - })->second; + nnFlow.at(y, x) = std::min_element(flow_s.begin(), flow_s.end(), [](const std::pair &left, const std::pair &right) { + return left.first < right.first; + })->second; - } } - free(graph_helper.mem); - return nnFlow; + } + free(graph_helper.mem); + return nnFlow; */ - } - Mat getGraph(const Mat &image, float edge_length) - { +} - int Dx[] = { -1,0,1,-1,1,-1,0,1 }; - int Dy[] = { -1,-1,-1,0,0,1,1,1 }; - Mat gra(image.rows, image.cols, CV_32FC(8)); +Mat getGraph(const Mat &image, float edge_length) +{ - for (int y = 0; y < gra.rows; y++) { - for (int x = 0; x < gra.cols; x++) { + int Dx[] = { -1,0,1,-1,1,-1,0,1 }; + int Dy[] = { -1,-1,-1,0,0,1,1,1 }; + Mat gra(image.rows, image.cols, CV_32FC(8)); - for (int i = 0; i < 8; i++) { - int dx = Dx[i]; - int dy = Dy[i]; - gra.at(y, x)[i] = -1; + for (int y = 0; y < gra.rows; y++) { + for (int x = 0; x < gra.cols; x++) { - if (x + dx < 0 || y + dy < 0 || x + dx >= gra.cols || y + dy >= gra.rows) { - continue; - } + for (int i = 0; i < 8; i++) { + int dx = Dx[i]; + int dy = Dy[i]; + gra.at(y, x)[i] = -1; - if (i < 4) { - int si = 7 - i; - gra.at(y, x)[i] = gra.at(y + dy, x + dx)[si]; - } - else { - float p1 = dx * dx*edge_length*edge_length + dy * dy*edge_length*edge_length; - float p2 = static_cast(image.at(y, x)[0] - image.at(y + dy, x + dx)[0]); - float p3 = static_cast(image.at(y, x)[1] - image.at(y + dy, x + dx)[1]); - float p4 = static_cast(image.at(y, x)[2] - image.at(y + dy, x + dx)[2]); - gra.at(y, x)[i] = sqrt(p1 + p2 * p2 + p3 * p3 + p4 * p4); - } + if (x + dx < 0 || y + dy < 0 || x + dx >= gra.cols || y + dy >= gra.rows) { + continue; + } + if (i < 4) { + int si = 7 - i; + gra.at(y, x)[i] = gra.at(y + dy, x + dx)[si]; + } + 
else { + float p1 = dx * dx*edge_length*edge_length + dy * dy*edge_length*edge_length; + float p2 = static_cast(image.at(y, x)[0] - image.at(y + dy, x + dx)[0]); + float p3 = static_cast(image.at(y, x)[1] - image.at(y + dy, x + dx)[1]); + float p4 = static_cast(image.at(y, x)[2] - image.at(y + dy, x + dx)[2]); + gra.at(y, x)[i] = sqrt(p1 + p2 * p2 + p3 * p3 + p4 * p4); } } - } - return gra; + } } + return gra; +} + - Mat interpolate_irregular_nn( - const std::vector & _prevPoints, - const std::vector & _nextPoints, - const std::vector & status, - const Mat &color_img, - float pixeldistance) +Mat interpolate_irregular_nn( + const std::vector & _prevPoints, + const std::vector & _nextPoints, + const std::vector & status, + const Mat &color_img, + float pixeldistance) +{ + int Dx[] = { -1,0,1,-1,1,-1,0,1 }; + int Dy[] = { -1,-1,-1,0,0,1,1,1 }; + std::vector prevPoints, nextPoints; + std::map, std::pair> flowMap; + for (unsigned n = 0; n < _prevPoints.size(); n++) { - int Dx[] = { -1,0,1,-1,1,-1,0,1 }; - int Dy[] = { -1,-1,-1,0,0,1,1,1 }; - std::vector prevPoints, nextPoints; - std::map, std::pair> flowMap; - for (unsigned n = 0; n < _prevPoints.size(); n++) + if (status[n] != 0) { - if (status[n] != 0) - { - flowMap.insert(std::make_pair( - std::make_pair(_prevPoints[n].x, _prevPoints[n].y), - std::make_pair(_nextPoints[n].x, _nextPoints[n].y))); - prevPoints.push_back(_prevPoints[n]); - nextPoints.push_back(_nextPoints[n]); - } - + flowMap.insert(std::make_pair( + std::make_pair(_prevPoints[n].x, _prevPoints[n].y), + std::make_pair(_nextPoints[n].x, _nextPoints[n].y))); + prevPoints.push_back(_prevPoints[n]); + nextPoints.push_back(_nextPoints[n]); } - Mat gra = getGraph(color_img, pixeldistance); + } - Mat prev; - Mat geo_dist = sgeo_dist(gra, prevPoints, std::numeric_limits::max(), prev); + Mat gra = getGraph(color_img, pixeldistance); + Mat prev; + Mat geo_dist = sgeo_dist(gra, prevPoints, std::numeric_limits::max(), prev); - Mat nnFlow = 
Mat::zeros(color_img.size(), CV_32FC2); - for (int y = 0; y < nnFlow.rows; y++) + + Mat nnFlow = Mat::zeros(color_img.size(), CV_32FC2); + for (int y = 0; y < nnFlow.rows; y++) + { + for (int x = 0; x < nnFlow.cols; x++) { - for (int x = 0; x < nnFlow.cols; x++) + int cx = x; + int cy = y; + while (prev.at(cy, cx) != 255) { - int cx = x; - int cy = y; - while (prev.at(cy, cx) != 255) - { - int i = prev.at(cy, cx); - cx += Dx[i]; - cy += Dy[i]; - } - auto val = flowMap[std::make_pair(static_cast(cx), static_cast(cy))]; - nnFlow.at(y, x) = Vec2f(val.first - cx, val.second - cy); + int i = prev.at(cy, cx); + cx += Dx[i]; + cy += Dy[i]; } + auto val = flowMap[std::make_pair(static_cast(cx), static_cast(cy))]; + nnFlow.at(y, x) = Vec2f(val.first - cx, val.second - cy); } - return nnFlow; } - -} + return nnFlow; } + +}} // namespace diff --git a/modules/optflow/src/rlof/geo_interpolation.hpp b/modules/optflow/src/rlof/geo_interpolation.hpp index 60aef39bedc..0e13b44c493 100644 --- a/modules/optflow/src/rlof/geo_interpolation.hpp +++ b/modules/optflow/src/rlof/geo_interpolation.hpp @@ -4,34 +4,33 @@ #ifndef _GEO_INTERPOLATION_HPP_ #define _GEO_INTERPOLATION_HPP_ #include -namespace cv -{ -namespace optflow -{ - typedef Vec Vec8f; - Mat getGraph(const Mat & image, float edge_length); - Mat sgeo_dist(const Mat& gra, int y, int x, float max, Mat &prev); - Mat sgeo_dist(const Mat& gra, const std::vector & points, float max, Mat &prev); - Mat interpolate_irregular_nw(const Mat &in, const Mat &mask, const Mat &color_img, float max_d, float bandwidth, float pixeldistance); - Mat interpolate_irregular_nn( - const std::vector & prevPoints, - const std::vector & nextPoints, - const std::vector & status, - const Mat &color_img, - float pixeldistance); - Mat interpolate_irregular_knn( - const std::vector & _prevPoints, - const std::vector & _nextPoints, - const std::vector & status, - const Mat &color_img, - int k, - float pixeldistance); - Mat interpolate_irregular_nn_raster(const 
std::vector & prevPoints, - const std::vector & nextPoints, - const std::vector & status, - const Mat & i1); +namespace cv { +namespace optflow { -} -} +typedef Vec Vec8f; +Mat getGraph(const Mat & image, float edge_length); +Mat sgeo_dist(const Mat& gra, int y, int x, float max, Mat &prev); +Mat sgeo_dist(const Mat& gra, const std::vector & points, float max, Mat &prev); +Mat interpolate_irregular_nw(const Mat &in, const Mat &mask, const Mat &color_img, float max_d, float bandwidth, float pixeldistance); +Mat interpolate_irregular_nn( + const std::vector & prevPoints, + const std::vector & nextPoints, + const std::vector & status, + const Mat &color_img, + float pixeldistance); +Mat interpolate_irregular_knn( + const std::vector & _prevPoints, + const std::vector & _nextPoints, + const std::vector & status, + const Mat &color_img, + int k, + float pixeldistance); + +Mat interpolate_irregular_nn_raster(const std::vector & prevPoints, + const std::vector & nextPoints, + const std::vector & status, + const Mat & i1); + +}} // namespace #endif diff --git a/modules/optflow/src/rlof/plk_invoker.hpp b/modules/optflow/src/rlof/plk_invoker.hpp index dd8ab5e7d1b..b7311edb86c 100644 --- a/modules/optflow/src/rlof/plk_invoker.hpp +++ b/modules/optflow/src/rlof/plk_invoker.hpp @@ -6,37 +6,36 @@ #include "rlof_invokerbase.hpp" -namespace cv{ -namespace optflow{ -namespace plk -{ +namespace cv { +namespace optflow { +namespace plk { // implementierung ohne SSE -namespace radial -{ +namespace radial { + class TrackerInvoker : public cv::ParallelLoopBody { public: TrackerInvoker( - const Mat& _prevImg, - const Mat& _prevDeriv, - const Mat& _nextImg, - const Mat& _rgbPrevImg, - const Mat& _rgbNextImg, - const Point2f* _prevPts, + const Mat& _prevImg, + const Mat& _prevDeriv, + const Mat& _nextImg, + const Mat& _rgbPrevImg, + const Mat& _rgbNextImg, + const Point2f* _prevPts, Point2f* _nextPts, - uchar* _status, - float* _err, + uchar* _status, + float* _err, Point2f* _gainVecs, - 
int _level, - int _maxLevel, - int _winSize[2], - int _maxIteration, + int _level, + int _maxLevel, + int _winSize[2], + int _maxIteration, bool _useInitialFlow, - int _supportRegionType, + int _supportRegionType, float _minEigenValue, - int _crossSegmentationThreshold) + int _crossSegmentationThreshold) { prevImg = &_prevImg; prevDeriv = &_prevDeriv; @@ -615,11 +614,13 @@ class TrackerInvoker : public cv::ParallelLoopBody gainVecs[ptidx]= gainVec; if( delta.ddot(delta) <= criteria.epsilon) break; - if( ( std::abs(delta.x - prevDelta.x) < 0.01 - && std::abs(delta.y - prevDelta.y) < 0.01) - || ((delta.ddot(delta) <= 0.001) - && std::abs(prevGain.x - deltaGain.x) < 0.01) - ) + if (( + std::abs(delta.x - prevDelta.x) < 0.01 && + std::abs(delta.y - prevDelta.y) < 0.01 + ) || ( + delta.ddot(delta) <= 0.001 && + std::abs(prevGain.x - deltaGain.x) < 0.01 + )) { nextPts[ptidx] -= delta*0.5f; gainVecs[ptidx] -= deltaGain* 0.5f; @@ -643,47 +644,50 @@ class TrackerInvoker : public cv::ParallelLoopBody } - const Mat* prevImg; - const Mat* nextImg; - const Mat* prevDeriv; - const Mat* rgbPrevImg; - const Mat* rgbNextImg; - const Point2f* prevPts; - Point2f* nextPts; - uchar* status; - cv::Point2f* gainVecs; // gain vector x -> multiplier y -> offset - float* err; - int maxWinSize; - int minWinSize; - TermCriteria criteria; - int level; - int maxLevel; - int windowType; - float minEigThreshold; - bool useInitialFlow; - int crossSegmentationThreshold; + const Mat* prevImg; + const Mat* nextImg; + const Mat* prevDeriv; + const Mat* rgbPrevImg; + const Mat* rgbNextImg; + const Point2f* prevPts; + Point2f* nextPts; + uchar* status; + cv::Point2f* gainVecs; // gain vector x -> multiplier y -> offset + float* err; + int maxWinSize; + int minWinSize; + TermCriteria criteria; + int level; + int maxLevel; + int windowType; + float minEigThreshold; + bool useInitialFlow; + int crossSegmentationThreshold; }; -} -namespace ica{ + +} // namespace +namespace ica { + class TrackerInvoker : 
public cv::ParallelLoopBody { public: - TrackerInvoker(const Mat& _prevImg, - const Mat& _prevDeriv, - const Mat& _nextImg, - const Mat& _rgbPrevImg, - const Mat& _rgbNextImg, - const Point2f* _prevPts, + TrackerInvoker( + const Mat& _prevImg, + const Mat& _prevDeriv, + const Mat& _nextImg, + const Mat& _rgbPrevImg, + const Mat& _rgbNextImg, + const Point2f* _prevPts, Point2f* _nextPts, - uchar* _status, - float* _err, - int _level, - int _maxLevel, - int _winSize[2], - int _maxIteration, + uchar* _status, + float* _err, + int _level, + int _maxLevel, + int _winSize[2], + int _maxIteration, bool _useInitialFlow, - int _supportRegionType, - int _crossSegmentationThreshold, + int _supportRegionType, + int _crossSegmentationThreshold, float _minEigenValue) { prevImg = &_prevImg; @@ -1062,7 +1066,7 @@ class TrackerInvoker : public cv::ParallelLoopBody nextPt += delta; nextPts[ptidx] = nextPt + halfWin; - if( delta.ddot(delta) <= criteria.epsilon) + if( delta.ddot(delta) <= criteria.epsilon) break; if(j > 0 && std::abs(delta.x - prevDelta.x) < 0.01 && std::abs(delta.y - prevDelta.y) < 0.01) @@ -1075,34 +1079,29 @@ class TrackerInvoker : public cv::ParallelLoopBody } - - - - } } - const Mat* prevImg; - const Mat* nextImg; - const Mat* prevDeriv; - const Mat* rgbPrevImg; - const Mat* rgbNextImg; - const Point2f* prevPts; - Point2f* nextPts; - uchar* status; - float* err; - int maxWinSize; - int minWinSize; - TermCriteria criteria; - int level; - int maxLevel; - int windowType; - float minEigThreshold; - bool useInitialFlow; - int crossSegmentationThreshold; + const Mat* prevImg; + const Mat* nextImg; + const Mat* prevDeriv; + const Mat* rgbPrevImg; + const Mat* rgbNextImg; + const Point2f* prevPts; + Point2f* nextPts; + uchar* status; + float* err; + int maxWinSize; + int minWinSize; + TermCriteria criteria; + int level; + int maxLevel; + int windowType; + float minEigThreshold; + bool useInitialFlow; + int crossSegmentationThreshold; }; -} -} -}} + +}}}} // namespace 
#endif diff --git a/modules/optflow/src/rlof/rlof_invoker.hpp b/modules/optflow/src/rlof/rlof_invoker.hpp index 444d9114bd3..14eabe4b948 100644 --- a/modules/optflow/src/rlof/rlof_invoker.hpp +++ b/modules/optflow/src/rlof/rlof_invoker.hpp @@ -4,1435 +4,1431 @@ #ifndef _RLOF_INVOKER_HPP_ #define _RLOF_INVOKER_HPP_ #include "rlof_invokerbase.hpp" -namespace cv -{ -namespace optflow -{ -namespace rlof -{ -namespace ica +namespace cv { +namespace optflow { +namespace rlof { +namespace ica { + +class TrackerInvoker : public cv::ParallelLoopBody { - class TrackerInvoker : public cv::ParallelLoopBody +public: + TrackerInvoker( + const Mat& _prevImg, + const Mat& _prevDeriv, + const Mat& _nextImg, + const Mat& _rgbPrevImg, + const Mat& _rgbNextImg, + const Point2f* _prevPts, + Point2f* _nextPts, + uchar* _status, + float* _err, + int _level, + int _maxLevel, + int _winSize[2], + int _maxIteration, + bool _useInitialFlow, + int _supportRegionType, + std::vector _normSigmas, + float _minEigenValue, + int _crossSegmentationThreshold) + { + prevImg = &_prevImg; + prevDeriv = &_prevDeriv; + nextImg = &_nextImg; + rgbPrevImg = &_rgbPrevImg; + rgbNextImg = &_rgbNextImg; + prevPts = _prevPts; + nextPts = _nextPts; + status = _status; + err = _err; + minWinSize = _winSize[0]; + maxWinSize = _winSize[1]; + criteria.maxCount = _maxIteration; + criteria.epsilon = 0.01; + level = _level; + maxLevel = _maxLevel; + windowType = _supportRegionType; + minEigThreshold = _minEigenValue; + paramReset = _normSigmas; + useInitialFlow = _useInitialFlow; + crossSegmentationThreshold = _crossSegmentationThreshold; + } + TrackerInvoker & operator=(const TrackerInvoker &) { return *this; }; + void operator()(const cv::Range& range) const CV_OVERRIDE { - public: - TrackerInvoker( - const Mat& _prevImg, - const Mat& _prevDeriv, - const Mat& _nextImg, - const Mat& _rgbPrevImg, - const Mat& _rgbNextImg, - const Point2f* _prevPts, - Point2f* _nextPts, - uchar* _status, - float* _err, - int _level, - int 
_maxLevel, - int _winSize[2], - int _maxIteration, - bool _useInitialFlow, - int _supportRegionType, - std::vector _normSigmas, - float _minEigenValue, - int _crossSegmentationThreshold) - { - prevImg = &_prevImg; - prevDeriv = &_prevDeriv; - nextImg = &_nextImg; - rgbPrevImg = &_rgbPrevImg; - rgbNextImg = &_rgbNextImg; - prevPts = _prevPts; - nextPts = _nextPts; - status = _status; - err = _err; - minWinSize = _winSize[0]; - maxWinSize = _winSize[1]; - criteria.maxCount = _maxIteration; - criteria.epsilon = 0.01; - level = _level; - maxLevel = _maxLevel; - windowType = _supportRegionType; - minEigThreshold = _minEigenValue; - paramReset = _normSigmas; - useInitialFlow = _useInitialFlow; - crossSegmentationThreshold = _crossSegmentationThreshold; - } - TrackerInvoker & operator=(const TrackerInvoker &) { return *this; }; - void operator()(const cv::Range& range) const CV_OVERRIDE - { #ifdef DEBUG_INVOKER - printf("rlof::ica");fflush(stdout); + printf("rlof::ica");fflush(stdout); #endif - cv::Size winSize; - cv::Point2f halfWin; + cv::Size winSize; + cv::Point2f halfWin; - const Mat& I = *prevImg; - const Mat& J = *nextImg; - const Mat& derivI = *prevDeriv; - const Mat& BI = *rgbPrevImg; + const Mat& I = *prevImg; + const Mat& J = *nextImg; + const Mat& derivI = *prevDeriv; + const Mat& BI = *rgbPrevImg; - winSize = cv::Size(maxWinSize, maxWinSize); - int winMaskwidth = static_cast(ceil(winSize.width / 8.f)) * 16; - cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); - winMaskMatBuf.setTo(1); + winSize = cv::Size(maxWinSize, maxWinSize); + int winMaskwidth = static_cast(ceil(winSize.width / 8.f)) * 16; + cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); + winMaskMatBuf.setTo(1); - const float FLT_SCALE = (1.f / (1 << 20)); // 20 + const float FLT_SCALE = (1.f / (1 << 20)); // 20 - std::vector param = paramReset; - int cn = I.channels(), cn2 = cn * 2; - int winbufwidth = static_cast(ceil(winSize.width / 8.f)) * 8; - cv::Size 
winBufSize(winbufwidth, winbufwidth); + std::vector param = paramReset; + int cn = I.channels(), cn2 = cn * 2; + int winbufwidth = static_cast(ceil(winSize.width / 8.f)) * 8; + cv::Size winBufSize(winbufwidth, winbufwidth); - std::vector _buf(winBufSize.area()*(cn + cn2)); - Mat IWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn), &_buf[0]); - Mat derivIWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn2), &_buf[winBufSize.area()*cn]); + std::vector _buf(winBufSize.area()*(cn + cn2)); + Mat IWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn), &_buf[0]); + Mat derivIWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn2), &_buf[winBufSize.area()*cn]); - for (int ptidx = range.start; ptidx < range.end; ptidx++) + for (int ptidx = range.start; ptidx < range.end; ptidx++) + { + Point2f prevPt = prevPts[ptidx] * (float)(1. / (1 << level)); + Point2f nextPt; + if (level == maxLevel) { - Point2f prevPt = prevPts[ptidx] * (float)(1. / (1 << level)); - Point2f nextPt; - if (level == maxLevel) - { - if ( useInitialFlow) - nextPt = nextPts[ptidx] * (float)(1. / (1 << level)); - else - nextPt = prevPt; - } + if ( useInitialFlow) + nextPt = nextPts[ptidx] * (float)(1. 
/ (1 << level)); else - nextPt = nextPts[ptidx] * 2.f; - nextPts[ptidx] = nextPt; + nextPt = prevPt; + } + else + nextPt = nextPts[ptidx] * 2.f; + nextPts[ptidx] = nextPt; - Point2i iprevPt, inextPt; - iprevPt.x = cvFloor(prevPt.x); - iprevPt.y = cvFloor(prevPt.y); - int winArea = maxWinSize * maxWinSize; - cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0, 0, maxWinSize, maxWinSize)); + Point2i iprevPt, inextPt; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + int winArea = maxWinSize * maxWinSize; + cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0, 0, maxWinSize, maxWinSize)); - if (calcWinMaskMat(BI, windowType, iprevPt, - winMaskMat, winSize, halfWin, winArea, - minWinSize, maxWinSize, crossSegmentationThreshold) == false) - continue; + if (calcWinMaskMat(BI, windowType, iprevPt, + winMaskMat, winSize, halfWin, winArea, + minWinSize, maxWinSize, crossSegmentationThreshold) == false) + continue; - prevPt -= halfWin; - iprevPt.x = cvFloor(prevPt.x); - iprevPt.y = cvFloor(prevPt.y); + prevPt -= halfWin; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); - if (iprevPt.x + halfWin.x < 0 || iprevPt.x + halfWin.x >= derivI.cols || - iprevPt.y + halfWin.y < 0 || iprevPt.y + halfWin.y >= derivI.rows) + if (iprevPt.x + halfWin.x < 0 || iprevPt.x + halfWin.x >= derivI.cols || + iprevPt.y + halfWin.y < 0 || iprevPt.y + halfWin.y >= derivI.rows) + { + if (level == 0) { - if (level == 0) - { - if (status) - status[ptidx] = 3; - if (err) - err[ptidx] = 0; - } - continue; + if (status) + status[ptidx] = 3; + if (err) + err[ptidx] = 0; } + continue; + } - float a = prevPt.x - iprevPt.x; - float b = prevPt.y - iprevPt.y; - const int W_BITS = 14, W_BITS1 = 14; + float a = prevPt.x - iprevPt.x; + float b = prevPt.y - iprevPt.y; + const int W_BITS = 14, W_BITS1 = 14; - int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); - int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); - int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); - int iw11 = (1 << W_BITS) - iw00 - 
iw01 - iw10; + int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - int dstep = (int)(derivI.step / derivI.elemSize1()); - int step = (int)(I.step / I.elemSize1()); - int mStep = (int)(winMaskMat.step / winMaskMat.elemSize1()); - CV_Assert(step == (int)(J.step / J.elemSize1())); - float A11 = 0, A12 = 0, A22 = 0; - float b1 = 0, b2 = 0; - float D = 0; - float minEig; + int dstep = (int)(derivI.step / derivI.elemSize1()); + int step = (int)(I.step / I.elemSize1()); + int mStep = (int)(winMaskMat.step / winMaskMat.elemSize1()); + CV_Assert(step == (int)(J.step / J.elemSize1())); + float A11 = 0, A12 = 0, A22 = 0; + float b1 = 0, b2 = 0; + float D = 0; + float minEig; #ifdef RLOF_SSE - __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); - __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); - __m128i z = _mm_setzero_si128(); - __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1 - 1)); - __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1 - 5 - 1)); - __m128i mmMask4_epi32; - __m128i mmMaskSet_epi16 = _mm_set1_epi16(std::numeric_limits::max()); - get4BitMask(winSize.width, mmMask4_epi32); + __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128i z = _mm_setzero_si128(); + __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1 - 1)); + __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1 - 5 - 1)); + __m128i mmMask4_epi32; + __m128i mmMaskSet_epi16 = _mm_set1_epi16(std::numeric_limits::max()); + get4BitMask(winSize.width, mmMask4_epi32); #endif - // extract the patch from the first image, compute covariation matrix of derivatives - int x, y; - for (y = 0; y < winSize.height; y++) - { - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; - const short* dsrc = (const short*)derivI.data + (y + 
iprevPt.y)*dstep + iprevPt.x*cn2; + // extract the patch from the first image, compute covariation matrix of derivatives + int x, y; + for (y = 0; y < winSize.height; y++) + { + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; + const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; - short* Iptr = (short*)(IWinBuf.data + y * IWinBuf.step); - short* dIptr = (short*)(derivIWinBuf.data + y * derivIWinBuf.step); + short* Iptr = (short*)(IWinBuf.data + y * IWinBuf.step); + short* dIptr = (short*)(derivIWinBuf.data + y * derivIWinBuf.step); - x = 0; + x = 0; #ifdef RLOF_SSE - for (; x <= winSize.width*cn; x += 4, dsrc += 4 * 2, dIptr += 4 * 2) + for (; x <= winSize.width*cn; x += 4, dsrc += 4 * 2, dIptr += 4 * 2) + { + __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16); + __m128i mask_0_3_epi16 = _mm_unpacklo_epi16(mask_0_7_epi16, mask_0_7_epi16); + + __m128i v00, v01, v10, v11, t0, t1; + v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); + v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); + v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); + v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1 - 5); + if (x + 4 > winSize.width) { - __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16); - __m128i mask_0_3_epi16 = _mm_unpacklo_epi16(mask_0_7_epi16, mask_0_7_epi16); - - __m128i v00, v01, v10, v11, t0, t1; - v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); - v01 = 
_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); - v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); - v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); - - t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); - t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1 - 5); - if (x + 4 > winSize.width) - { - t0 = _mm_and_si128(t0, mmMask4_epi32); - } - t0 = _mm_and_si128(t0, mask_0_3_epi16); - _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0, t0)); - - v00 = _mm_loadu_si128((const __m128i*)(dsrc)); - v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); - v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep)); - v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); - - t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); - t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); - t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); - t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); - v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... 
- if (x + 4 > winSize.width) - { - v00 = _mm_and_si128(v00, mmMask4_epi32); - } - v00 = _mm_and_si128(v00, mask_0_3_epi16); - _mm_storeu_si128((__m128i*)dIptr, v00); + t0 = _mm_and_si128(t0, mmMask4_epi32); } + t0 = _mm_and_si128(t0, mask_0_3_epi16); + _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0, t0)); + + v00 = _mm_loadu_si128((const __m128i*)(dsrc)); + v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); + v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep)); + v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); + v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... + if (x + 4 > winSize.width) + { + v00 = _mm_and_si128(v00, mmMask4_epi32); + } + v00 = _mm_and_si128(v00, mask_0_3_epi16); + _mm_storeu_si128((__m128i*)dIptr, v00); + } #else - for (; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2) + for (; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2) + { + if (maskPtr[x] == 0) { - if (maskPtr[x] == 0) - { - dIptr[0] = 0; - dIptr[1] = 0; - continue; - } - int ival = CV_DESCALE(src[x] * iw00 + src[x + cn] * iw01 + - src[x + step] * iw10 + src[x + step + cn] * iw11, W_BITS1 - 5); - int ixval = CV_DESCALE(dsrc[0] * iw00 + dsrc[cn2] * iw01 + - dsrc[dstep] * iw10 + dsrc[dstep + cn2] * iw11, W_BITS1); - int iyval = CV_DESCALE(dsrc[1] * iw00 + dsrc[cn2 + 1] * iw01 + dsrc[dstep + 1] * iw10 + - dsrc[dstep + cn2 + 1] * iw11, W_BITS1); - - Iptr[x] = (short)ival; - dIptr[0] = (short)ixval; - dIptr[1] = (short)iyval; + dIptr[0] = 0; + dIptr[1] = 0; + continue; } -#endif + int ival = CV_DESCALE(src[x] * iw00 + src[x + cn] * iw01 + + src[x + step] * iw10 + src[x + step + cn] * 
iw11, W_BITS1 - 5); + int ixval = CV_DESCALE(dsrc[0] * iw00 + dsrc[cn2] * iw01 + + dsrc[dstep] * iw10 + dsrc[dstep + cn2] * iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1] * iw00 + dsrc[cn2 + 1] * iw01 + dsrc[dstep + 1] * iw10 + + dsrc[dstep + cn2 + 1] * iw11, W_BITS1); + + Iptr[x] = (short)ival; + dIptr[0] = (short)ixval; + dIptr[1] = (short)iyval; } +#endif + } - cv::Mat residualMat = cv::Mat::zeros(winSize.height * (winSize.width + 8) * cn, 1, CV_16SC1); + cv::Mat residualMat = cv::Mat::zeros(winSize.height * (winSize.width + 8) * cn, 1, CV_16SC1); - cv::Point2f backUpNextPt = nextPt; - nextPt -= halfWin; - int j; + cv::Point2f backUpNextPt = nextPt; + nextPt -= halfWin; + int j; #ifdef RLOF_SSE - __m128i mmMask0, mmMask1, mmMask; - getWBitMask(winSize.width, mmMask0, mmMask1, mmMask); + __m128i mmMask0, mmMask1, mmMask; + getWBitMask(winSize.width, mmMask0, mmMask1, mmMask); #endif - float MEstimatorScale = 1; - int buffIdx = 0; - cv::Point2f prevDelta(0, 0); + float MEstimatorScale = 1; + int buffIdx = 0; + cv::Point2f prevDelta(0, 0); - for (j = 0; j < criteria.maxCount; j++) - { - inextPt.x = cvFloor(nextPt.x); - inextPt.y = cvFloor(nextPt.y); + for (j = 0; j < criteria.maxCount; j++) + { + inextPt.x = cvFloor(nextPt.x); + inextPt.y = cvFloor(nextPt.y); - if (inextPt.x < -winSize.width || inextPt.x >= J.cols || - inextPt.y < -winSize.height || inextPt.y >= J.rows) - { - if (level == 0 && status) - status[ptidx] = 3; - if (level > 0) - nextPts[ptidx] = backUpNextPt; - break; - } + if (inextPt.x < -winSize.width || inextPt.x >= J.cols || + inextPt.y < -winSize.height || inextPt.y >= J.rows) + { + if (level == 0 && status) + status[ptidx] = 3; + if (level > 0) + nextPts[ptidx] = backUpNextPt; + break; + } - a = nextPt.x - inextPt.x; - b = nextPt.y - inextPt.y; - iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); - iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); - iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); - iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - b1 = 0; - b2 
= 0; + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + b1 = 0; + b2 = 0; - if (j == 0 ) - { - A11 = 0; - A12 = 0; - A22 = 0; - } + if (j == 0 ) + { + A11 = 0; + A12 = 0; + A22 = 0; + } - if (j == 0 ) + if (j == 0 ) + { + buffIdx = 0; + for (y = 0; y < winSize.height; y++) { - buffIdx = 0; - for (y = 0; y < winSize.height; y++) + const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x)*cn; + const short* Iptr = (const short*)(IWinBuf.data + (y)*IWinBuf.step); + const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + x = 0; + for (; x < winSize.width*cn; x++, dIptr += 2) { - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x)*cn; - const short* Iptr = (const short*)(IWinBuf.data + (y)*IWinBuf.step); - const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - x = 0; - for (; x < winSize.width*cn; x++, dIptr += 2) - { - if (maskPtr[x] == 0) - continue; - int diff = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + Jptr[x + step] * iw10 + Jptr[x + step + cn] * iw11, W_BITS1 - 5) - Iptr[x]; - residualMat.at(buffIdx++) = static_cast(diff); - } + if (maskPtr[x] == 0) + continue; + int diff = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + Jptr[x + step] * iw10 + Jptr[x + step + cn] * iw11, W_BITS1 - 5) - Iptr[x]; + residualMat.at(buffIdx++) = static_cast(diff); } - /*! Estimation for the residual */ - cv::Mat residualMatRoi(residualMat, cv::Rect(0, 0, 1, buffIdx)); - MEstimatorScale = (buffIdx == 0) ? 1.f : estimateScale(residualMatRoi); } + /*! 
Estimation for the residual */ + cv::Mat residualMatRoi(residualMat, cv::Rect(0, 0, 1, buffIdx)); + MEstimatorScale = (buffIdx == 0) ? 1.f : estimateScale(residualMatRoi); + } - float eta = 1.f / winArea; - float fParam0 = param[0] * 32.f; - float fParam1 = param[1] * 32.f; - fParam0 = param[0] * MEstimatorScale; - fParam1 = param[1] * MEstimatorScale; + float eta = 1.f / winArea; + float fParam0 = param[0] * 32.f; + float fParam1 = param[1] * 32.f; + fParam0 = param[0] * MEstimatorScale; + fParam1 = param[1] * MEstimatorScale; #ifdef RLOF_SSE - qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); - qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); - __m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps(); - __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps(); - __m128i mmParam0 = _mm_set1_epi16(MIN(std::numeric_limits::max() - 1, static_cast(fParam0))); - __m128i mmParam1 = _mm_set1_epi16(MIN(std::numeric_limits::max() - 1, static_cast(fParam1))); - float s2Val = param[2] > 0 ? param[2] : -param[2]; - int s2bitShift = param[2] == 0 ? 1 : static_cast(ceil(log(200.f / s2Val) / log(2.f))); - __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast(param[2] * (float)(1 << s2bitShift))); - __m128i mmOness_epi16 = _mm_set1_epi16(1 << s2bitShift); - __m128 mmParam2s = _mm_set1_ps(0.01f * param[2]); - __m128 mmParam2s2 = _mm_set1_ps(param[2] * param[2]); - __m128 mmOnes = _mm_set1_ps(1.f); - __m128i mmEta = _mm_setzero_si128(); - __m128i mmScale = _mm_set1_epi16(static_cast(MEstimatorScale)); + qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps(); + __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps(); + __m128i mmParam0 = _mm_set1_epi16(MIN(std::numeric_limits::max() - 1, static_cast(fParam0))); + __m128i mmParam1 = _mm_set1_epi16(MIN(std::numeric_limits::max() - 1, static_cast(fParam1))); + float s2Val = param[2] > 0 ? 
param[2] : -param[2]; + int s2bitShift = param[2] == 0 ? 1 : static_cast(ceil(log(200.f / s2Val) / log(2.f))); + __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast(param[2] * (float)(1 << s2bitShift))); + __m128i mmOness_epi16 = _mm_set1_epi16(1 << s2bitShift); + __m128 mmParam2s = _mm_set1_ps(0.01f * param[2]); + __m128 mmParam2s2 = _mm_set1_ps(param[2] * param[2]); + __m128 mmOnes = _mm_set1_ps(1.f); + __m128i mmEta = _mm_setzero_si128(); + __m128i mmScale = _mm_set1_epi16(static_cast(MEstimatorScale)); #endif - buffIdx = 0; - for (y = 0; y < winSize.height; y++) - { - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x)*cn; - const short* Iptr = (const short*)(IWinBuf.data + (y)*IWinBuf.step); - const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); + buffIdx = 0; + for (y = 0; y < winSize.height; y++) + { + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x)*cn; + const short* Iptr = (const short*)(IWinBuf.data + (y)*IWinBuf.step); + const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); - x = 0; + x = 0; #ifdef RLOF_SSE - for (; x <= winSize.width*cn; x += 8, dIptr += 8 * 2) - { - __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16); - __m128i diff0, diff1; - __m128i I_0_7_epi16 = _mm_loadu_si128((const __m128i*)(Iptr + x)); // von element 0 bis 7 - - __m128i v00 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x)), z); - __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); - __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); - __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); - - __m128i t0 = 
_mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); - __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); - t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1 - 5); - t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1 - 5); - - __m128i mmDiff_epi16 = _mm_subs_epi16(_mm_packs_epi32(t0, t1), I_0_7_epi16); - - mmDiff_epi16 = _mm_and_si128(mmDiff_epi16, mask_0_7_epi16); - - - __m128i scalediffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, mmScale); - mmEta = _mm_add_epi16(mmEta, _mm_add_epi16(_mm_and_si128(scalediffIsPos_epi16, _mm_set1_epi16(2)), _mm_set1_epi16(-1))); - - - __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... - __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); - - __m128i abs_epi16 = _mm_abs_epi16(mmDiff_epi16); - __m128i bSet2_epi16, bSet1_epi16; - // |It| < sigma1 ? - bSet2_epi16 = _mm_cmplt_epi16(abs_epi16, mmParam1); - // It > 0 ? - __m128i diffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, _mm_setzero_si128()); - // sigma0 < |It| < sigma1 ? - bSet1_epi16 = _mm_and_si128(bSet2_epi16, _mm_cmpgt_epi16(abs_epi16, mmParam0)); - // val = |It| -/+ sigma1 - __m128i tmpParam1_epi16 = _mm_add_epi16(_mm_and_si128(diffIsPos_epi16, _mm_sub_epi16(mmDiff_epi16, mmParam1)), - _mm_andnot_si128(diffIsPos_epi16, _mm_add_epi16(mmDiff_epi16, mmParam1))); - // It == 0 ? |It| > sigma13 - mmDiff_epi16 = _mm_and_si128(bSet2_epi16, mmDiff_epi16); - // It == val ? 
sigma0 < |It| < sigma1 - mmDiff_epi16 = _mm_blendv_epi8(mmDiff_epi16, tmpParam1_epi16, bSet1_epi16); - - - __m128i tale_epi16_ = _mm_blendv_epi8(mmOness_epi16, mmParam2_epi16, bSet1_epi16); // mask for 0 - 3 - // diff = diff * sigma2 - __m128i lo = _mm_mullo_epi16(tale_epi16_, mmDiff_epi16); - __m128i hi = _mm_mulhi_epi16(tale_epi16_, mmDiff_epi16); - __m128i diff_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), s2bitShift); //diff 0_3_epi32 - __m128i diff_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), s2bitShift); // diff 4_7_epi32 - - mmDiff_epi16 = _mm_packs_epi32(diff_0_3_epi32, diff_4_7_epi32); - diff1 = _mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13... - diff0 = _mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9... - - // Ix * diff / Iy * diff - v10 = _mm_mullo_epi16(Ixy_0, diff0); - v11 = _mm_mulhi_epi16(Ixy_0, diff0); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); - qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); - - // It * Ix It * Iy [4 ... 
7] - // for set 1 hi sigma 1 - - v10 = _mm_mullo_epi16(Ixy_1, diff1); - v11 = _mm_mulhi_epi16(Ixy_1, diff1); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); - qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); + for (; x <= winSize.width*cn; x += 8, dIptr += 8 * 2) + { + __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16); + __m128i diff0, diff1; + __m128i I_0_7_epi16 = _mm_loadu_si128((const __m128i*)(Iptr + x)); // von element 0 bis 7 + __m128i v00 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x)), z); + __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); + __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); + __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); - if (j == 0) - { - __m128 bSet1_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet1_epi16)); - __m128 bSet1_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet1_epi16, bSet1_epi16), 16)); - __m128 mask_0_4_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mask_0_7_epi16)); - __m128 mask_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mask_0_7_epi16, mask_0_7_epi16), 16))); + __m128i t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1 - 5); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1 - 5); - __m128 bSet2_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet2_epi16)); - __m128 bSet2_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet2_epi16, bSet2_epi16), 16)); + __m128i mmDiff_epi16 = _mm_subs_epi16(_mm_packs_epi32(t0, t1), I_0_7_epi16); - __m128 
tale_0_3_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_0_3_ps); - __m128 tale_4_7_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_4_7_ps); - tale_0_3_ps = _mm_blendv_ps(mmParam2s, tale_0_3_ps, bSet2_0_3_ps); - tale_4_7_ps = _mm_blendv_ps(mmParam2s, tale_4_7_ps, bSet2_4_7_ps); + mmDiff_epi16 = _mm_and_si128(mmDiff_epi16, mask_0_7_epi16); - tale_0_3_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_0_3_ps, mask_0_4_ps); - tale_4_7_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_4_7_ps, mask_4_7_ps); - t0 = _mm_srai_epi32(Ixy_0, 16); // Iy0 Iy1 Iy2 Iy3 - t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16); // Ix0 Ix1 Ix2 Ix3 + __m128i scalediffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, mmScale); + mmEta = _mm_add_epi16(mmEta, _mm_add_epi16(_mm_and_si128(scalediffIsPos_epi16, _mm_set1_epi16(2)), _mm_set1_epi16(-1))); - __m128 fy = _mm_cvtepi32_ps(t0); - __m128 fx = _mm_cvtepi32_ps(t1); - // A11 - A22 - __m128 fxtale = _mm_mul_ps(fx, tale_0_3_ps); - __m128 fytale = _mm_mul_ps(fy, tale_0_3_ps); + __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... + __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); - mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); - mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); - mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); + __m128i abs_epi16 = _mm_abs_epi16(mmDiff_epi16); + __m128i bSet2_epi16, bSet1_epi16; + // |It| < sigma1 ? + bSet2_epi16 = _mm_cmplt_epi16(abs_epi16, mmParam1); + // It > 0 ? + __m128i diffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, _mm_setzero_si128()); + // sigma0 < |It| < sigma1 ? + bSet1_epi16 = _mm_and_si128(bSet2_epi16, _mm_cmpgt_epi16(abs_epi16, mmParam0)); + // val = |It| -/+ sigma1 + __m128i tmpParam1_epi16 = _mm_add_epi16(_mm_and_si128(diffIsPos_epi16, _mm_sub_epi16(mmDiff_epi16, mmParam1)), + _mm_andnot_si128(diffIsPos_epi16, _mm_add_epi16(mmDiff_epi16, mmParam1))); + // It == 0 ? |It| > sigma13 + mmDiff_epi16 = _mm_and_si128(bSet2_epi16, mmDiff_epi16); + // It == val ? 
sigma0 < |It| < sigma1 + mmDiff_epi16 = _mm_blendv_epi8(mmDiff_epi16, tmpParam1_epi16, bSet1_epi16); - t0 = _mm_srai_epi32(Ixy_1, 16); // Iy8 Iy9 Iy10 Iy11 - t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_1, 16), 16); // Ix0 Ix1 Ix2 Ix3 - fy = _mm_cvtepi32_ps(t0); - fx = _mm_cvtepi32_ps(t1); + __m128i tale_epi16_ = _mm_blendv_epi8(mmOness_epi16, mmParam2_epi16, bSet1_epi16); // mask for 0 - 3 + // diff = diff * sigma2 + __m128i lo = _mm_mullo_epi16(tale_epi16_, mmDiff_epi16); + __m128i hi = _mm_mulhi_epi16(tale_epi16_, mmDiff_epi16); + __m128i diff_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), s2bitShift); //diff 0_3_epi32 + __m128i diff_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), s2bitShift); // diff 4_7_epi32 - // A11 - A22 - fxtale = _mm_mul_ps(fx, tale_4_7_ps); - fytale = _mm_mul_ps(fy, tale_4_7_ps); + mmDiff_epi16 = _mm_packs_epi32(diff_0_3_epi32, diff_4_7_epi32); + diff1 = _mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13... + diff0 = _mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9... - mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); - mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); - mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); - } - } -#else - for (; x < winSize.width*cn; x++, dIptr += 2) + // Ix * diff / Iy * diff + v10 = _mm_mullo_epi16(Ixy_0, diff0); + v11 = _mm_mulhi_epi16(Ixy_0, diff0); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); + qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); + + // It * Ix It * Iy [4 ... 
7] + // for set 1 hi sigma 1 + + v10 = _mm_mullo_epi16(Ixy_1, diff1); + v11 = _mm_mulhi_epi16(Ixy_1, diff1); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); + qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); + + + if (j == 0) { - if (maskPtr[x] == 0) - continue; - int diff = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + - Jptr[x + step] * iw10 + Jptr[x + step + cn] * iw11, - W_BITS1 - 5) - Iptr[x]; + __m128 bSet1_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet1_epi16)); + __m128 bSet1_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet1_epi16, bSet1_epi16), 16)); + __m128 mask_0_4_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mask_0_7_epi16)); + __m128 mask_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mask_0_7_epi16, mask_0_7_epi16), 16))); + __m128 bSet2_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet2_epi16)); + __m128 bSet2_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet2_epi16, bSet2_epi16), 16)); + __m128 tale_0_3_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_0_3_ps); + __m128 tale_4_7_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_4_7_ps); + tale_0_3_ps = _mm_blendv_ps(mmParam2s, tale_0_3_ps, bSet2_0_3_ps); + tale_4_7_ps = _mm_blendv_ps(mmParam2s, tale_4_7_ps, bSet2_4_7_ps); + tale_0_3_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_0_3_ps, mask_0_4_ps); + tale_4_7_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_4_7_ps, mask_4_7_ps); + t0 = _mm_srai_epi32(Ixy_0, 16); // Iy0 Iy1 Iy2 Iy3 + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16); // Ix0 Ix1 Ix2 Ix3 - if (diff > MEstimatorScale) - MEstimatorScale += eta; - if (diff < MEstimatorScale) - MEstimatorScale -= eta; + __m128 fy = _mm_cvtepi32_ps(t0); + __m128 fx = _mm_cvtepi32_ps(t1); - int abss = (diff < 0) ? 
-diff : diff; - if (abss > fParam1) - { - diff = 0; - } - else if (abss > fParam0 && diff >= 0) - { - //diff = fParam1 * (diff - fParam1); - diff = static_cast(param[2] * (diff - fParam1)); - } - else if (abss > fParam0 && diff < 0) - { - //diff = fParam1 * (diff + fParam1); - diff = static_cast(param[2] * (diff + fParam1)); + // A11 - A22 + __m128 fxtale = _mm_mul_ps(fx, tale_0_3_ps); + __m128 fytale = _mm_mul_ps(fy, tale_0_3_ps); - } + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); + mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); + mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); - float ixval = (float)(dIptr[0]); - float iyval = (float)(dIptr[1]); - b1 += (float)(diff*ixval); - b2 += (float)(diff*iyval); + t0 = _mm_srai_epi32(Ixy_1, 16); // Iy8 Iy9 Iy10 Iy11 + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_1, 16), 16); // Ix0 Ix1 Ix2 Ix3 - if ( j == 0) - { - float tale = param[2] * FLT_RESCALE; - if (abss < fParam0) - { - tale = FLT_RESCALE; - } - else if (abss > fParam1) - { - tale *= 0.01f; - } - else - { - tale *= param[2]; - } - A11 += (float)(ixval*ixval*tale); - A12 += (float)(ixval*iyval*tale); - A22 += (float)(iyval*iyval*tale); - } + fy = _mm_cvtepi32_ps(t0); + fx = _mm_cvtepi32_ps(t1); + // A11 - A22 + fxtale = _mm_mul_ps(fx, tale_4_7_ps); + fytale = _mm_mul_ps(fy, tale_4_7_ps); + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); + mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); + mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); } -#endif } +#else + for (; x < winSize.width*cn; x++, dIptr += 2) + { + if (maskPtr[x] == 0) + continue; + int diff = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + + Jptr[x + step] * iw10 + Jptr[x + step + cn] * iw11, + W_BITS1 - 5) - Iptr[x]; -#ifdef RLOF_SSE - short etaValues[8]; - _mm_storeu_si128((__m128i*)(etaValues), mmEta); - MEstimatorScale += eta * (etaValues[0] + etaValues[1] + etaValues[2] + etaValues[3] - + etaValues[4] + etaValues[5] + etaValues[6] + etaValues[7]); -#endif - if (j == 0) - { -#ifdef 
RLOF_SSE - float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];// - _mm_store_ps(A11buf, mmAxx); - _mm_store_ps(A12buf, mmAxy); - _mm_store_ps(A22buf, mmAyy); + if (diff > MEstimatorScale) + MEstimatorScale += eta; + if (diff < MEstimatorScale) + MEstimatorScale -= eta; - A11 = A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; - A12 = A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; - A22 = A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; -#endif - A11 *= FLT_SCALE; - A12 *= FLT_SCALE; - A22 *= FLT_SCALE; + int abss = (diff < 0) ? -diff : diff; + if (abss > fParam1) + { + diff = 0; + } + else if (abss > fParam0 && diff >= 0) + { + //diff = fParam1 * (diff - fParam1); + diff = static_cast(param[2] * (diff - fParam1)); + } + else if (abss > fParam0 && diff < 0) + { + //diff = fParam1 * (diff + fParam1); + diff = static_cast(param[2] * (diff + fParam1)); + } - D = A11 * A22 - A12 * A12; - minEig = (A22 + A11 - std::sqrt((A11 - A22)*(A11 - A22) + - 4.f*A12*A12)) / (2 * winArea); - D = 1.f / D; + float ixval = (float)(dIptr[0]); + float iyval = (float)(dIptr[1]); + b1 += (float)(diff*ixval); + b2 += (float)(diff*iyval); - if (minEig < minEigThreshold || std::abs(D) < FLT_EPSILON) + if ( j == 0) { - if (level == 0 && status) - status[ptidx] = 0; - if (level > 0) + float tale = param[2] * FLT_RESCALE; + if (abss < fParam0) + { + tale = FLT_RESCALE; + } + else if (abss > fParam1) + { + tale *= 0.01f; + } + else { - nextPts[ptidx] = backUpNextPt; + tale *= param[2]; } - break; + A11 += (float)(ixval*ixval*tale); + A12 += (float)(ixval*iyval*tale); + A22 += (float)(iyval*iyval*tale); } + + } +#endif + } #ifdef RLOF_SSE - float CV_DECL_ALIGNED(16) bbuf[4]; - _mm_store_ps(bbuf, _mm_add_ps(qb0, qb1)); - b1 += bbuf[0] + bbuf[2]; - b2 += bbuf[1] + bbuf[3]; + short etaValues[8]; + _mm_storeu_si128((__m128i*)(etaValues), mmEta); + MEstimatorScale += eta * (etaValues[0] + etaValues[1] + etaValues[2] + etaValues[3] + + etaValues[4] + etaValues[5] + etaValues[6] + etaValues[7]); + + 
#endif + if (j == 0) + { +#ifdef RLOF_SSE + float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];// - b1 *= FLT_SCALE; - b2 *= FLT_SCALE; + _mm_store_ps(A11buf, mmAxx); + _mm_store_ps(A12buf, mmAxy); + _mm_store_ps(A22buf, mmAyy); - Point2f delta((float)((A12*b2 - A22 * b1) * D), (float)((A12*b1 - A11 * b2) * D)); + A11 = A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; + A12 = A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; + A22 = A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; +#endif + A11 *= FLT_SCALE; + A12 *= FLT_SCALE; + A22 *= FLT_SCALE; - delta.x = (delta.x != delta.x) ? 0 : delta.x; - delta.y = (delta.y != delta.y) ? 0 : delta.y; - nextPt += delta * 0.7; - nextPts[ptidx] = nextPt + halfWin; + D = A11 * A22 - A12 * A12; + minEig = (A22 + A11 - std::sqrt((A11 - A22)*(A11 - A22) + + 4.f*A12*A12)) / (2 * winArea); + D = 1.f / D; - if (j > 0 && std::abs(delta.x - prevDelta.x) < 0.01 && - std::abs(delta.y - prevDelta.y) < 0.01) + if (minEig < minEigThreshold || std::abs(D) < FLT_EPSILON) { - nextPts[ptidx] -= delta * 0.5f; + if (level == 0 && status) + status[ptidx] = 0; + if (level > 0) + { + nextPts[ptidx] = backUpNextPt; + } break; } + } + +#ifdef RLOF_SSE + float CV_DECL_ALIGNED(16) bbuf[4]; + _mm_store_ps(bbuf, _mm_add_ps(qb0, qb1)); + b1 += bbuf[0] + bbuf[2]; + b2 += bbuf[1] + bbuf[3]; +#endif + + b1 *= FLT_SCALE; + b2 *= FLT_SCALE; - prevDelta = delta; + Point2f delta((float)((A12*b2 - A22 * b1) * D), (float)((A12*b1 - A11 * b2) * D)); + + delta.x = (delta.x != delta.x) ? 0 : delta.x; + delta.y = (delta.y != delta.y) ? 
0 : delta.y; + + nextPt += delta * 0.7; + nextPts[ptidx] = nextPt + halfWin; + + if (j > 0 && std::abs(delta.x - prevDelta.x) < 0.01 && + std::abs(delta.y - prevDelta.y) < 0.01) + { + nextPts[ptidx] -= delta * 0.5f; + break; } + prevDelta = delta; + } } - const Mat* prevImg; - const Mat* nextImg; - const Mat* prevDeriv; - const Mat* rgbPrevImg; - const Mat* rgbNextImg; - const Point2f* prevPts; - Point2f* nextPts; - uchar* status; - float* err; - int maxWinSize; - int minWinSize; - TermCriteria criteria; - int level; - int maxLevel; - int windowType; - float minEigThreshold; - bool useInitialFlow; - std::vector paramReset; - int crossSegmentationThreshold; - }; -} -namespace radial + } + + const Mat* prevImg; + const Mat* nextImg; + const Mat* prevDeriv; + const Mat* rgbPrevImg; + const Mat* rgbNextImg; + const Point2f* prevPts; + Point2f* nextPts; + uchar* status; + float* err; + int maxWinSize; + int minWinSize; + TermCriteria criteria; + int level; + int maxLevel; + int windowType; + float minEigThreshold; + bool useInitialFlow; + std::vector paramReset; + int crossSegmentationThreshold; +}; + +} // namespace +namespace radial { + +class TrackerInvoker : public cv::ParallelLoopBody { - class TrackerInvoker : public cv::ParallelLoopBody +public: + TrackerInvoker( + const Mat& _prevImg, + const Mat& _prevDeriv, + const Mat& _nextImg, + const Mat& _rgbPrevImg, + const Mat& _rgbNextImg, + const Point2f* _prevPts, + Point2f* _nextPts, + uchar* _status, + float* _err, + Point2f* _gainVecs, + int _level, + int _maxLevel, + int _winSize[2], + int _maxIteration, + bool _useInitialFlow, + int _supportRegionType, + std::vector _normSigmas, + float _minEigenValue, + int _crossSegmentationThreshold) + { + prevImg = &_prevImg; + prevDeriv = &_prevDeriv; + nextImg = &_nextImg; + rgbPrevImg = &_rgbPrevImg; + rgbNextImg = &_rgbNextImg; + prevPts = _prevPts; + nextPts = _nextPts; + status = _status; + err = _err; + gainVecs = _gainVecs; + minWinSize = _winSize[0]; + maxWinSize = 
_winSize[1]; + criteria.maxCount = _maxIteration; + criteria.epsilon = 0.01; + level = _level; + maxLevel = _maxLevel; + windowType = _supportRegionType; + minEigThreshold = _minEigenValue; + paramReset = _normSigmas; + useInitialFlow = _useInitialFlow; + crossSegmentationThreshold = _crossSegmentationThreshold; + } + + TrackerInvoker & operator=(const TrackerInvoker &) { return *this; }; + void operator()(const cv::Range& range) const CV_OVERRIDE { - public: - TrackerInvoker( - const Mat& _prevImg, - const Mat& _prevDeriv, - const Mat& _nextImg, - const Mat& _rgbPrevImg, - const Mat& _rgbNextImg, - const Point2f* _prevPts, - Point2f* _nextPts, - uchar* _status, - float* _err, - Point2f* _gainVecs, - int _level, - int _maxLevel, - int _winSize[2], - int _maxIteration, - bool _useInitialFlow, - int _supportRegionType, - std::vector _normSigmas, - float _minEigenValue, - int _crossSegmentationThreshold) - { - prevImg = &_prevImg; - prevDeriv = &_prevDeriv; - nextImg = &_nextImg; - rgbPrevImg = &_rgbPrevImg; - rgbNextImg = &_rgbNextImg; - prevPts = _prevPts; - nextPts = _nextPts; - status = _status; - err = _err; - gainVecs = _gainVecs; - minWinSize = _winSize[0]; - maxWinSize = _winSize[1]; - criteria.maxCount = _maxIteration; - criteria.epsilon = 0.01; - level = _level; - maxLevel = _maxLevel; - windowType = _supportRegionType; - minEigThreshold = _minEigenValue; - paramReset = _normSigmas; - useInitialFlow = _useInitialFlow; - crossSegmentationThreshold = _crossSegmentationThreshold; - } - - TrackerInvoker & operator=(const TrackerInvoker &) { return *this; }; - void operator()(const cv::Range& range) const CV_OVERRIDE - { #ifdef DEBUG_INVOKER - printf("rlof::radial");fflush(stdout); + printf("rlof::radial");fflush(stdout); #endif - Point2f halfWin; - cv::Size winSize; - const Mat& I = *prevImg; - const Mat& J = *nextImg; - const Mat& derivI = *prevDeriv; - const Mat& BI = *rgbPrevImg; + Point2f halfWin; + cv::Size winSize; + const Mat& I = *prevImg; + const Mat& J 
= *nextImg; + const Mat& derivI = *prevDeriv; + const Mat& BI = *rgbPrevImg; - const float FLT_SCALE = (1.f / (1 << 16)); + const float FLT_SCALE = (1.f / (1 << 16)); - winSize = cv::Size(maxWinSize, maxWinSize); - int winMaskwidth = static_cast(ceil(winSize.width / 16.f)) * 16; - cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); - winMaskMatBuf.setTo(1); + winSize = cv::Size(maxWinSize, maxWinSize); + int winMaskwidth = static_cast(ceil(winSize.width / 16.f)) * 16; + cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); + winMaskMatBuf.setTo(1); - std::vector param = paramReset; - int cn = I.channels(), cn2 = cn * 2; - int winbufwidth = static_cast(ceil(winSize.width / 16.f)) * 16; - cv::Size winBufSize(winbufwidth, winbufwidth); + std::vector param = paramReset; + int cn = I.channels(), cn2 = cn * 2; + int winbufwidth = static_cast(ceil(winSize.width / 16.f)) * 16; + cv::Size winBufSize(winbufwidth, winbufwidth); - cv::Mat invTensorMat(4, 4, CV_32FC1); - cv::Mat mismatchMat(4, 1, CV_32FC1); - cv::Mat resultMat(4, 1, CV_32FC1); + cv::Mat invTensorMat(4, 4, CV_32FC1); + cv::Mat mismatchMat(4, 1, CV_32FC1); + cv::Mat resultMat(4, 1, CV_32FC1); - std::vector _buf(winBufSize.area()*(cn + cn2)); - Mat IWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn), &_buf[0]); - Mat derivIWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn2), &_buf[winBufSize.area()*cn]); + std::vector _buf(winBufSize.area()*(cn + cn2)); + Mat IWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn), &_buf[0]); + Mat derivIWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn2), &_buf[winBufSize.area()*cn]); - for (int ptidx = range.start; ptidx < range.end; ptidx++) - { + for (int ptidx = range.start; ptidx < range.end; ptidx++) + { - Point2f prevPt = prevPts[ptidx] * (float)(1. / (1 << level)); - Point2f nextPt; - if (level == maxLevel) + Point2f prevPt = prevPts[ptidx] * (float)(1. 
/ (1 << level)); + Point2f nextPt; + if (level == maxLevel) + { + if (useInitialFlow) { - if (useInitialFlow) - { - nextPt = nextPts[ptidx] * (float)(1. / (1 << level)); - } - else - nextPt = prevPt; + nextPt = nextPts[ptidx] * (float)(1. / (1 << level)); } else - { - nextPt = nextPts[ptidx] * 2.f; + nextPt = prevPt; + } + else + { + nextPt = nextPts[ptidx] * 2.f; - } - nextPts[ptidx] = nextPt; - - Point2i iprevPt, inextPt; - iprevPt.x = cvFloor(prevPt.x); - iprevPt.y = cvFloor(prevPt.y); - int winArea = maxWinSize * maxWinSize; - cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0, 0, maxWinSize, maxWinSize)); - winMaskMatBuf.setTo(0); - if (calcWinMaskMat(BI, windowType, iprevPt, - winMaskMat, winSize, halfWin, winArea, - minWinSize, maxWinSize, crossSegmentationThreshold) == false) - continue; - - prevPt -= halfWin; - iprevPt.x = cvFloor(prevPt.x); - iprevPt.y = cvFloor(prevPt.y); - - if (iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols || - iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows) + } + nextPts[ptidx] = nextPt; + + Point2i iprevPt, inextPt; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + int winArea = maxWinSize * maxWinSize; + cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0, 0, maxWinSize, maxWinSize)); + winMaskMatBuf.setTo(0); + if (calcWinMaskMat(BI, windowType, iprevPt, + winMaskMat, winSize, halfWin, winArea, + minWinSize, maxWinSize, crossSegmentationThreshold) == false) + continue; + + prevPt -= halfWin; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + + if (iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols || + iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows) + { + if (level == 0) { - if (level == 0) - { - if (status) - status[ptidx] = 3; - if (err) - err[ptidx] = 0; - } - continue; + if (status) + status[ptidx] = 3; + if (err) + err[ptidx] = 0; } + continue; + } - float a = prevPt.x - iprevPt.x; - float b = prevPt.y - iprevPt.y; - const int W_BITS = 14, W_BITS1 = 14; - - int iw00 = 
cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); - int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); - int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); - int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - - int dstep = (int)(derivI.step / derivI.elemSize1()); - int step = (int)(I.step / I.elemSize1()); - int mStep = (int)(winMaskMat.step / winMaskMat.elemSize1()); - CV_Assert(step == (int)(J.step / J.elemSize1())); - - float A11 = 0, A12 = 0, A22 = 0; - float b1 = 0, b2 = 0, b3 = 0, b4 = 0; - // tensor - float sumIx = 0; - float sumIy = 0; - float sumI = 0; - float sumW = 0; - float w1 = 0, w2 = 0; // -IyI - float dI = 0; // I^2 - float D = 0; + float a = prevPt.x - iprevPt.x; + float b = prevPt.y - iprevPt.y; + const int W_BITS = 14, W_BITS1 = 14; + + int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + + int dstep = (int)(derivI.step / derivI.elemSize1()); + int step = (int)(I.step / I.elemSize1()); + int mStep = (int)(winMaskMat.step / winMaskMat.elemSize1()); + CV_Assert(step == (int)(J.step / J.elemSize1())); + + float A11 = 0, A12 = 0, A22 = 0; + float b1 = 0, b2 = 0, b3 = 0, b4 = 0; + // tensor + float sumIx = 0; + float sumIy = 0; + float sumI = 0; + float sumW = 0; + float w1 = 0, w2 = 0; // -IyI + float dI = 0; // I^2 + float D = 0; #ifdef RLOF_SSE - __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); - __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); - __m128i z = _mm_setzero_si128(); - __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1 - 1)); - __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1 - 5 - 1)); - __m128i mmMask4_epi32; - __m128i mmMaskSet_epi16 = _mm_set1_epi16(std::numeric_limits::max()); - get4BitMask(winSize.width, mmMask4_epi32); + __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128i z = _mm_setzero_si128(); + __m128i qdelta_d = _mm_set1_epi32(1 << 
(W_BITS1 - 1)); + __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1 - 5 - 1)); + __m128i mmMask4_epi32; + __m128i mmMaskSet_epi16 = _mm_set1_epi16(std::numeric_limits::max()); + get4BitMask(winSize.width, mmMask4_epi32); #endif - // extract the patch from the first image, compute covariation matrix of derivatives - int x, y; - for (y = 0; y < winSize.height; y++) - { - const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; - const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; + // extract the patch from the first image, compute covariation matrix of derivatives + int x, y; + for (y = 0; y < winSize.height; y++) + { + const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; + const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; - short* Iptr = (short*)(IWinBuf.data + y * IWinBuf.step); - short* dIptr = (short*)(derivIWinBuf.data + y * derivIWinBuf.step); + short* Iptr = (short*)(IWinBuf.data + y * IWinBuf.step); + short* dIptr = (short*)(derivIWinBuf.data + y * derivIWinBuf.step); - x = 0; + x = 0; #ifdef RLOF_SSE - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - for (; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4 * 2, dIptr += 4 * 2) + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + for (; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4 * 2, dIptr += 4 * 2) + { + __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16); + __m128i mask_0_3_epi16 = _mm_unpacklo_epi16(mask_0_7_epi16, mask_0_7_epi16); + __m128i v00, v01, v10, v11, t0, t1; + v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); + v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); + v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); + v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + 
step + cn)), z); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1 - 5); + if (x + 4 > winSize.width) { - __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16); - __m128i mask_0_3_epi16 = _mm_unpacklo_epi16(mask_0_7_epi16, mask_0_7_epi16); - __m128i v00, v01, v10, v11, t0, t1; - v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); - v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); - v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); - v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); - - t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); - t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1 - 5); - if (x + 4 > winSize.width) - { - t0 = _mm_and_si128(t0, mmMask4_epi32); - } - t0 = _mm_and_si128(t0, mask_0_3_epi16); - _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0, t0)); - - - v00 = _mm_loadu_si128((const __m128i*)(dsrc)); - v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); - v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep)); - v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); - - t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); - t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); - t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); - t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); - v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... 
- if (x + 4 > winSize.width) - { - v00 = _mm_and_si128(v00, mmMask4_epi32); - } - v00 = _mm_and_si128(v00, mask_0_3_epi16); - _mm_storeu_si128((__m128i*)dIptr, v00); + t0 = _mm_and_si128(t0, mmMask4_epi32); + } + t0 = _mm_and_si128(t0, mask_0_3_epi16); + _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0, t0)); + + + v00 = _mm_loadu_si128((const __m128i*)(dsrc)); + v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); + v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep)); + v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); + v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... + if (x + 4 > winSize.width) + { + v00 = _mm_and_si128(v00, mmMask4_epi32); } + v00 = _mm_and_si128(v00, mask_0_3_epi16); + _mm_storeu_si128((__m128i*)dIptr, v00); + } #else - for (; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2) + for (; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2) + { + if (winMaskMat.at(y, x) == 0) { - if (winMaskMat.at(y, x) == 0) - { - dIptr[0] = 0; - dIptr[1] = 0; - continue; - } - int ival = CV_DESCALE(src[x] * iw00 + src[x + cn] * iw01 + - src[x + step] * iw10 + src[x + step + cn] * iw11, W_BITS1 - 5); - int ixval = CV_DESCALE(dsrc[0] * iw00 + dsrc[cn2] * iw01 + - dsrc[dstep] * iw10 + dsrc[dstep + cn2] * iw11, W_BITS1); - int iyval = CV_DESCALE(dsrc[1] * iw00 + dsrc[cn2 + 1] * iw01 + dsrc[dstep + 1] * iw10 + - dsrc[dstep + cn2 + 1] * iw11, W_BITS1); + dIptr[0] = 0; + dIptr[1] = 0; + continue; + } + int ival = CV_DESCALE(src[x] * iw00 + src[x + cn] * iw01 + + src[x + step] * iw10 + src[x + step + cn] * iw11, W_BITS1 - 5); + int ixval = CV_DESCALE(dsrc[0] * iw00 + dsrc[cn2] 
* iw01 + + dsrc[dstep] * iw10 + dsrc[dstep + cn2] * iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1] * iw00 + dsrc[cn2 + 1] * iw01 + dsrc[dstep + 1] * iw10 + + dsrc[dstep + cn2 + 1] * iw11, W_BITS1); - Iptr[x] = (short)ival; - dIptr[0] = (short)ixval; - dIptr[1] = (short)iyval; + Iptr[x] = (short)ival; + dIptr[0] = (short)ixval; + dIptr[1] = (short)iyval; - } -#endif } +#endif + } - cv::Mat residualMat = cv::Mat::zeros(winSize.height * (winSize.width + 8) * cn, 1, CV_16SC1); + cv::Mat residualMat = cv::Mat::zeros(winSize.height * (winSize.width + 8) * cn, 1, CV_16SC1); - cv::Point2f backUpNextPt = nextPt; - nextPt -= halfWin; - Point2f prevDelta(0, 0); //related to h(t-1) - Point2f prevGain(0, 0); - cv::Point2f gainVec = gainVecs[ptidx]; - cv::Point2f backUpGain = gainVec; - cv::Size _winSize = winSize; - int j; + cv::Point2f backUpNextPt = nextPt; + nextPt -= halfWin; + Point2f prevDelta(0, 0); //related to h(t-1) + Point2f prevGain(0, 0); + cv::Point2f gainVec = gainVecs[ptidx]; + cv::Point2f backUpGain = gainVec; + cv::Size _winSize = winSize; + int j; #ifdef RLOF_SSE - __m128i mmMask0, mmMask1, mmMask; - getWBitMask(_winSize.width, mmMask0, mmMask1, mmMask); - __m128 mmOnes = _mm_set1_ps(1.f); + __m128i mmMask0, mmMask1, mmMask; + getWBitMask(_winSize.width, mmMask0, mmMask1, mmMask); + __m128 mmOnes = _mm_set1_ps(1.f); #endif - float MEstimatorScale = 1; - int buffIdx = 0; - float minEigValue; + float MEstimatorScale = 1; + int buffIdx = 0; + float minEigValue; - for (j = 0; j < criteria.maxCount; j++) - { + for (j = 0; j < criteria.maxCount; j++) + { - inextPt.x = cvFloor(nextPt.x); - inextPt.y = cvFloor(nextPt.y); + inextPt.x = cvFloor(nextPt.x); + inextPt.y = cvFloor(nextPt.y); - if (inextPt.x < -winSize.width || inextPt.x >= J.cols || - inextPt.y < -winSize.height || inextPt.y >= J.rows) + if (inextPt.x < -winSize.width || inextPt.x >= J.cols || + inextPt.y < -winSize.height || inextPt.y >= J.rows) + { + if (level == 0 && status) + status[ptidx] = 3; + if 
(level > 0) { - if (level == 0 && status) - status[ptidx] = 3; - if (level > 0) - { - nextPts[ptidx] = backUpNextPt; - gainVecs[ptidx] = backUpGain; - } - break; + nextPts[ptidx] = backUpNextPt; + gainVecs[ptidx] = backUpGain; } + break; + } - a = nextPt.x - inextPt.x; - b = nextPt.y - inextPt.y; - iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); - iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); - iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); - iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - - // mismatch - b1 = 0; - b2 = 0; - b3 = 0; - b4 = 0; - if (j == 0) - { - // tensor - w1 = 0; // -IxI - w2 = 0; // -IyI - dI = 0; // I^2 - sumIx = 0; - sumIy = 0; - sumI = 0; - sumW = 0; - A11 = 0; - A12 = 0; - A22 = 0; - } + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + + // mismatch + b1 = 0; + b2 = 0; + b3 = 0; + b4 = 0; + if (j == 0) + { + // tensor + w1 = 0; // -IxI + w2 = 0; // -IyI + dI = 0; // I^2 + sumIx = 0; + sumIy = 0; + sumI = 0; + sumW = 0; + A11 = 0; + A12 = 0; + A22 = 0; + } - if (j == 0 ) + if (j == 0 ) + { + buffIdx = 0; + for (y = 0; y < winSize.height; y++) { - buffIdx = 0; - for (y = 0; y < winSize.height; y++) + const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x)*cn; + const short* Iptr = (const short*)(IWinBuf.data + (y)*IWinBuf.step); + const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); + x = 0; + for (; x < winSize.width*cn; x++, dIptr += 2) { - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x)*cn; - const short* Iptr = (const short*)(IWinBuf.data + (y)*IWinBuf.step); - const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); - x = 0; - for (; x < winSize.width*cn; x++, dIptr += 2) - { - if (dIptr[0] == 0 && dIptr[1] == 0) - continue; - int diff = 
static_cast(CV_DESCALE( Jptr[x] * iw00 + - Jptr[x + cn] * iw01 + - Jptr[x + step] * iw10 + - Jptr[x + step + cn] * iw11, W_BITS1 - 5) - - Iptr[x] + Iptr[x] * gainVec.x + gainVec.y); - residualMat.at(buffIdx++) = static_cast(diff); - } + if (dIptr[0] == 0 && dIptr[1] == 0) + continue; + int diff = static_cast(CV_DESCALE( Jptr[x] * iw00 + + Jptr[x + cn] * iw01 + + Jptr[x + step] * iw10 + + Jptr[x + step + cn] * iw11, W_BITS1 - 5) + - Iptr[x] + Iptr[x] * gainVec.x + gainVec.y); + residualMat.at(buffIdx++) = static_cast(diff); } - /*! Estimation for the residual */ - cv::Mat residualMatRoi(residualMat, cv::Rect(0, 0, 1, buffIdx)); - MEstimatorScale = (buffIdx == 0) ? 1.f : estimateScale(residualMatRoi); } + /*! Estimation for the residual */ + cv::Mat residualMatRoi(residualMat, cv::Rect(0, 0, 1, buffIdx)); + MEstimatorScale = (buffIdx == 0) ? 1.f : estimateScale(residualMatRoi); + } - float eta = 1.f / winArea; - float fParam0 = param[0] * 32.f; - float fParam1 = param[1] * 32.f; - fParam0 = param[0] * MEstimatorScale; - fParam1 = param[1] * MEstimatorScale; + float eta = 1.f / winArea; + float fParam0 = param[0] * 32.f; + float fParam1 = param[1] * 32.f; + fParam0 = param[0] * MEstimatorScale; + fParam1 = param[1] * MEstimatorScale; #ifdef RLOF_SSE - qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); - qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); - __m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps(), qb2 = _mm_setzero_ps(); - __m128 qb3 = _mm_setzero_ps(); - __m128 mmSumW1 = _mm_setzero_ps(), mmSumW2 = _mm_setzero_ps(); - __m128 mmSumI = _mm_setzero_ps(), mmSumW = _mm_setzero_ps(), mmSumDI = _mm_setzero_ps(); - __m128 mmSumIy = _mm_setzero_ps(), mmSumIx = _mm_setzero_ps(); - __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps(); - __m128i mmParam0 = _mm_set1_epi16(MIN(std::numeric_limits::max() - 1, static_cast(fParam0))); - __m128i mmParam1 = _mm_set1_epi16(MIN(std::numeric_limits::max() - 1, static_cast(fParam1))); - - - float s2Val = param[2] 
> 0 ? param[2] : -param[2]; - int s2bitShift = param[2] == 0 ? 1 : static_cast(ceil(log(200.f / s2Val) / log(2.f))); - __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast(param[2] * (float)(1 << s2bitShift))); - __m128i mmOness_epi16 = _mm_set1_epi16(1 << s2bitShift); - __m128 mmParam2s = _mm_set1_ps(0.01f * param[2]); - __m128 mmParam2s2 = _mm_set1_ps(param[2] * param[2]); - - float gainVal = gainVec.x > 0 ? gainVec.x : -gainVec.x; - int bitShift = gainVec.x == 0 ? 1 : static_cast(ceil(log(200.f / gainVal) / log(2.f))); - - __m128i mmGainValue_epi16 = _mm_set1_epi16(static_cast(gainVec.x * (float)(1 << bitShift))); - __m128i mmConstValue_epi16 = _mm_set1_epi16(static_cast(gainVec.y)); - __m128i mmEta = _mm_setzero_si128(); - __m128i mmScale = _mm_set1_epi16(static_cast(MEstimatorScale)); + qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps(), qb2 = _mm_setzero_ps(); + __m128 qb3 = _mm_setzero_ps(); + __m128 mmSumW1 = _mm_setzero_ps(), mmSumW2 = _mm_setzero_ps(); + __m128 mmSumI = _mm_setzero_ps(), mmSumW = _mm_setzero_ps(), mmSumDI = _mm_setzero_ps(); + __m128 mmSumIy = _mm_setzero_ps(), mmSumIx = _mm_setzero_ps(); + __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps(); + __m128i mmParam0 = _mm_set1_epi16(MIN(std::numeric_limits::max() - 1, static_cast(fParam0))); + __m128i mmParam1 = _mm_set1_epi16(MIN(std::numeric_limits::max() - 1, static_cast(fParam1))); + + + float s2Val = param[2] > 0 ? param[2] : -param[2]; + int s2bitShift = param[2] == 0 ? 1 : static_cast(ceil(log(200.f / s2Val) / log(2.f))); + __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast(param[2] * (float)(1 << s2bitShift))); + __m128i mmOness_epi16 = _mm_set1_epi16(1 << s2bitShift); + __m128 mmParam2s = _mm_set1_ps(0.01f * param[2]); + __m128 mmParam2s2 = _mm_set1_ps(param[2] * param[2]); + + float gainVal = gainVec.x > 0 ? 
gainVec.x : -gainVec.x; + int bitShift = gainVec.x == 0 ? 1 : static_cast(ceil(log(200.f / gainVal) / log(2.f))); + + __m128i mmGainValue_epi16 = _mm_set1_epi16(static_cast(gainVec.x * (float)(1 << bitShift))); + __m128i mmConstValue_epi16 = _mm_set1_epi16(static_cast(gainVec.y)); + __m128i mmEta = _mm_setzero_si128(); + __m128i mmScale = _mm_set1_epi16(static_cast(MEstimatorScale)); #endif - buffIdx = 0; + buffIdx = 0; - for (y = 0; y < _winSize.height; y++) - { - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x)*cn; - const short* Iptr = (const short*)(IWinBuf.data + y * IWinBuf.step); - const short* dIptr = (const short*)(derivIWinBuf.data + y * derivIWinBuf.step); + for (y = 0; y < _winSize.height; y++) + { + const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x)*cn; + const short* Iptr = (const short*)(IWinBuf.data + y * IWinBuf.step); + const short* dIptr = (const short*)(derivIWinBuf.data + y * derivIWinBuf.step); - x = 0; + x = 0; #ifdef RLOF_SSE - for (; x <= _winSize.width*cn; x += 8, dIptr += 8 * 2) - { - __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16); - __m128i diff0, diff1; - __m128i I_0_7_epi16 = _mm_loadu_si128((const __m128i*)(Iptr + x)); // von element 0 bis 7 - __m128i v00 = _mm_unpacklo_epi8( - _mm_loadl_epi64((const __m128i*)(Jptr + x)) , z); //J0 , 0, J1, 0, J2, 0 ... J7,0 - __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); - __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); - __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); - - __m128i t0 = _mm_add_epi32 - (_mm_madd_epi16( - _mm_unpacklo_epi16(v00, v01), //J0, , J1, J1, J2, J2, ... 
J3 , J4 - qw0), - _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); - __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); - - t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1 - 5); - t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1 - 5); - - - __m128i lo = _mm_mullo_epi16(mmGainValue_epi16, I_0_7_epi16); - __m128i hi = _mm_mulhi_epi16(mmGainValue_epi16, I_0_7_epi16); - - __m128i Igain_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), bitShift); - __m128i Igain_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), bitShift); - __m128i Igain_epi16 = _mm_packs_epi32(Igain_0_3_epi32, Igain_4_7_epi32); - - // diff = J - I + I*gain.x + gain.y - __m128i mmDiff_epi16 = _mm_add_epi16( - _mm_subs_epi16(_mm_packs_epi32(t0, t1), I_0_7_epi16), // J - I - _mm_add_epi16(Igain_epi16, mmConstValue_epi16)); - - - mmDiff_epi16 = _mm_and_si128(mmDiff_epi16, mask_0_7_epi16); - - - __m128i scalediffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, mmScale); - mmEta = _mm_add_epi16(mmEta, _mm_add_epi16(_mm_and_si128(scalediffIsPos_epi16, _mm_set1_epi16(2)), _mm_set1_epi16(-1))); - - - __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 - __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); - - - __m128i abs_epi16 = _mm_abs_epi16(mmDiff_epi16); - __m128i bSet2_epi16, bSet1_epi16; - // |It| < sigma1 ? - bSet2_epi16 = _mm_cmplt_epi16(abs_epi16, mmParam1); - // It > 0 ? - __m128i diffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, _mm_setzero_si128()); - // sigma0 < |It| < sigma1 ? - bSet1_epi16 = _mm_and_si128(bSet2_epi16, _mm_cmpgt_epi16(abs_epi16, mmParam0)); - // val = |It| -/+ sigma1 - __m128i tmpParam1_epi16 = _mm_add_epi16(_mm_and_si128(diffIsPos_epi16, _mm_sub_epi16(mmDiff_epi16, mmParam1)), - _mm_andnot_si128(diffIsPos_epi16, _mm_add_epi16(mmDiff_epi16, mmParam1))); - // It == 0 ? 
|It| > sigma13 - mmDiff_epi16 = _mm_and_si128(bSet2_epi16, mmDiff_epi16); - // It == val ? sigma0 < |It| < sigma1 - mmDiff_epi16 = _mm_blendv_epi8(mmDiff_epi16, tmpParam1_epi16, bSet1_epi16); - - - __m128i tale_epi16_ = _mm_blendv_epi8(mmOness_epi16, mmParam2_epi16, bSet1_epi16); // mask for 0 - 3 - // diff = diff * sigma2 - lo = _mm_mullo_epi16(tale_epi16_, mmDiff_epi16); - hi = _mm_mulhi_epi16(tale_epi16_, mmDiff_epi16); - __m128i diff_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), s2bitShift); //diff 0_3_epi32 - __m128i diff_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), s2bitShift); // diff 4_7_epi32 - - mmDiff_epi16 = _mm_packs_epi32(diff_0_3_epi32, diff_4_7_epi32); // ,da das ergebniss kleiner als 16 bit sein sollte - - diff1 = _mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13... - diff0 = _mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9... - - // Ix * diff / Iy * diff - v10 = _mm_mullo_epi16(Ixy_0, diff0); - v11 = _mm_mulhi_epi16(Ixy_0, diff0); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); - qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); - - // It * Ix It * Iy [4 ... 7] - // for set 1 hi sigma 1 - - v10 = _mm_mullo_epi16(Ixy_1, diff1); - v11 = _mm_mulhi_epi16(Ixy_1, diff1); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); - qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); - // diff * J [0 ... 
7] - // for set 1 sigma 1 - // b3 += diff * Iptr[x] - - v10 = _mm_mullo_epi16(mmDiff_epi16, I_0_7_epi16); - v11 = _mm_mulhi_epi16(mmDiff_epi16, I_0_7_epi16); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qb2 = _mm_add_ps(qb2, _mm_cvtepi32_ps(v00)); - qb2 = _mm_add_ps(qb2, _mm_cvtepi32_ps(v10)); - - qb3 = _mm_add_ps(qb3, _mm_cvtepi32_ps(diff_0_3_epi32)); - qb3 = _mm_add_ps(qb3, _mm_cvtepi32_ps(diff_4_7_epi32)); + for (; x <= _winSize.width*cn; x += 8, dIptr += 8 * 2) + { + __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16); + __m128i diff0, diff1; + __m128i I_0_7_epi16 = _mm_loadu_si128((const __m128i*)(Iptr + x)); // von element 0 bis 7 + __m128i v00 = _mm_unpacklo_epi8( + _mm_loadl_epi64((const __m128i*)(Jptr + x)) , z); //J0 , 0, J1, 0, J2, 0 ... J7,0 + __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); + __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); + __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); + + __m128i t0 = _mm_add_epi32 + (_mm_madd_epi16( + _mm_unpacklo_epi16(v00, v01), //J0, , J1, J1, J2, J2, ... 
J3 , J4 + qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); - if (j == 0) - { - __m128 bSet1_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet1_epi16)); - __m128 bSet1_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet1_epi16, bSet1_epi16), 16)); - __m128 mask_0_4_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mask_0_7_epi16)); - __m128 mask_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mask_0_7_epi16, mask_0_7_epi16), 16))); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1 - 5); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1 - 5); - __m128 bSet2_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet2_epi16)); - __m128 bSet2_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet2_epi16, bSet2_epi16), 16)); - __m128 tale_0_3_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_0_3_ps); - __m128 tale_4_7_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_4_7_ps); - tale_0_3_ps = _mm_blendv_ps(mmParam2s, tale_0_3_ps, bSet2_0_3_ps); - tale_4_7_ps = _mm_blendv_ps(mmParam2s, tale_4_7_ps, bSet2_4_7_ps); + __m128i lo = _mm_mullo_epi16(mmGainValue_epi16, I_0_7_epi16); + __m128i hi = _mm_mulhi_epi16(mmGainValue_epi16, I_0_7_epi16); - tale_0_3_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_0_3_ps, mask_0_4_ps); - tale_4_7_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_4_7_ps, mask_4_7_ps); - t0 = _mm_srai_epi32(Ixy_0, 16); - t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16); + __m128i Igain_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), bitShift); + __m128i Igain_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), bitShift); + __m128i Igain_epi16 = _mm_packs_epi32(Igain_0_3_epi32, Igain_4_7_epi32); - __m128 fy = _mm_cvtepi32_ps(t0); - __m128 fx = _mm_cvtepi32_ps(t1); + // diff = J - I + I*gain.x + gain.y + __m128i mmDiff_epi16 = _mm_add_epi16( + _mm_subs_epi16(_mm_packs_epi32(t0, t1), I_0_7_epi16), // J - I + 
_mm_add_epi16(Igain_epi16, mmConstValue_epi16)); - // 0 ... 3 - __m128 I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(I_0_7_epi16, I_0_7_epi16), 16)); - // A11 - A22 - __m128 fxtale = _mm_mul_ps(fx, tale_0_3_ps); - __m128 fytale = _mm_mul_ps(fy, tale_0_3_ps); + mmDiff_epi16 = _mm_and_si128(mmDiff_epi16, mask_0_7_epi16); - mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); - mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); - mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); - // sumIx und sumIy - mmSumIx = _mm_add_ps(mmSumIx, fxtale); - mmSumIy = _mm_add_ps(mmSumIy, fytale); + __m128i scalediffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, mmScale); + mmEta = _mm_add_epi16(mmEta, _mm_add_epi16(_mm_and_si128(scalediffIsPos_epi16, _mm_set1_epi16(2)), _mm_set1_epi16(-1))); - mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fxtale)); - mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fytale)); - // sumI - __m128 I_tale_ps = _mm_mul_ps(I_ps, tale_0_3_ps); - mmSumI = _mm_add_ps(mmSumI, I_tale_ps); + __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 + __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); - // sumW - mmSumW = _mm_add_ps(mmSumW, tale_0_3_ps); - // sumDI - mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps(I_ps, I_tale_ps)); + __m128i abs_epi16 = _mm_abs_epi16(mmDiff_epi16); + __m128i bSet2_epi16, bSet1_epi16; + // |It| < sigma1 ? + bSet2_epi16 = _mm_cmplt_epi16(abs_epi16, mmParam1); + // It > 0 ? + __m128i diffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, _mm_setzero_si128()); + // sigma0 < |It| < sigma1 ? + bSet1_epi16 = _mm_and_si128(bSet2_epi16, _mm_cmpgt_epi16(abs_epi16, mmParam0)); + // val = |It| -/+ sigma1 + __m128i tmpParam1_epi16 = _mm_add_epi16(_mm_and_si128(diffIsPos_epi16, _mm_sub_epi16(mmDiff_epi16, mmParam1)), + _mm_andnot_si128(diffIsPos_epi16, _mm_add_epi16(mmDiff_epi16, mmParam1))); + // It == 0 ? |It| > sigma13 + mmDiff_epi16 = _mm_and_si128(bSet2_epi16, mmDiff_epi16); + // It == val ? 
sigma0 < |It| < sigma1 + mmDiff_epi16 = _mm_blendv_epi8(mmDiff_epi16, tmpParam1_epi16, bSet1_epi16); - t0 = _mm_srai_epi32(Ixy_1, 16); // Iy8 Iy9 Iy10 Iy11 - t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_1, 16), 16); // Ix0 Ix1 Ix2 Ix3 - fy = _mm_cvtepi32_ps(t0); - fx = _mm_cvtepi32_ps(t1); + __m128i tale_epi16_ = _mm_blendv_epi8(mmOness_epi16, mmParam2_epi16, bSet1_epi16); // mask for 0 - 3 + // diff = diff * sigma2 + lo = _mm_mullo_epi16(tale_epi16_, mmDiff_epi16); + hi = _mm_mulhi_epi16(tale_epi16_, mmDiff_epi16); + __m128i diff_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), s2bitShift); //diff 0_3_epi32 + __m128i diff_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), s2bitShift); // diff 4_7_epi32 - // 4 ... 7 - I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(I_0_7_epi16, I_0_7_epi16), 16)); + mmDiff_epi16 = _mm_packs_epi32(diff_0_3_epi32, diff_4_7_epi32); // ,da das ergebniss kleiner als 16 bit sein sollte - // A11 - A22 - fxtale = _mm_mul_ps(fx, tale_4_7_ps); - fytale = _mm_mul_ps(fy, tale_4_7_ps); + diff1 = _mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13... + diff0 = _mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9... - mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); - mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); - mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); + // Ix * diff / Iy * diff + v10 = _mm_mullo_epi16(Ixy_0, diff0); + v11 = _mm_mulhi_epi16(Ixy_0, diff0); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); + qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); - // sumIx und sumIy - mmSumIx = _mm_add_ps(mmSumIx, fxtale); - mmSumIy = _mm_add_ps(mmSumIy, fytale); + // It * Ix It * Iy [4 ... 
7] + // for set 1 hi sigma 1 - mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fxtale)); - mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fytale)); + v10 = _mm_mullo_epi16(Ixy_1, diff1); + v11 = _mm_mulhi_epi16(Ixy_1, diff1); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); + qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); + // diff * J [0 ... 7] + // for set 1 sigma 1 + // b3 += diff * Iptr[x] - // sumI - I_tale_ps = _mm_mul_ps(I_ps, tale_4_7_ps); - mmSumI = _mm_add_ps(mmSumI, I_tale_ps); + v10 = _mm_mullo_epi16(mmDiff_epi16, I_0_7_epi16); + v11 = _mm_mulhi_epi16(mmDiff_epi16, I_0_7_epi16); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb2 = _mm_add_ps(qb2, _mm_cvtepi32_ps(v00)); + qb2 = _mm_add_ps(qb2, _mm_cvtepi32_ps(v10)); - mmSumW = _mm_add_ps(mmSumW, tale_4_7_ps); + qb3 = _mm_add_ps(qb3, _mm_cvtepi32_ps(diff_0_3_epi32)); + qb3 = _mm_add_ps(qb3, _mm_cvtepi32_ps(diff_4_7_epi32)); - mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps(I_ps, I_tale_ps)); - } + if (j == 0) + { + __m128 bSet1_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet1_epi16)); + __m128 bSet1_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet1_epi16, bSet1_epi16), 16)); + __m128 mask_0_4_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mask_0_7_epi16)); + __m128 mask_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mask_0_7_epi16, mask_0_7_epi16), 16))); + + __m128 bSet2_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet2_epi16)); + __m128 bSet2_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet2_epi16, bSet2_epi16), 16)); + + __m128 tale_0_3_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_0_3_ps); + __m128 tale_4_7_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_4_7_ps); + tale_0_3_ps = _mm_blendv_ps(mmParam2s, tale_0_3_ps, bSet2_0_3_ps); + tale_4_7_ps = _mm_blendv_ps(mmParam2s, tale_4_7_ps, bSet2_4_7_ps); + + tale_0_3_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_0_3_ps, mask_0_4_ps); + 
tale_4_7_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_4_7_ps, mask_4_7_ps); + t0 = _mm_srai_epi32(Ixy_0, 16); + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16); + __m128 fy = _mm_cvtepi32_ps(t0); + __m128 fx = _mm_cvtepi32_ps(t1); + + // 0 ... 3 + __m128 I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(I_0_7_epi16, I_0_7_epi16), 16)); + + // A11 - A22 + __m128 fxtale = _mm_mul_ps(fx, tale_0_3_ps); + __m128 fytale = _mm_mul_ps(fy, tale_0_3_ps); + + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); + mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); + mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); + + // sumIx und sumIy + mmSumIx = _mm_add_ps(mmSumIx, fxtale); + mmSumIy = _mm_add_ps(mmSumIy, fytale); + + mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fxtale)); + mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fytale)); + + // sumI + __m128 I_tale_ps = _mm_mul_ps(I_ps, tale_0_3_ps); + mmSumI = _mm_add_ps(mmSumI, I_tale_ps); + + // sumW + mmSumW = _mm_add_ps(mmSumW, tale_0_3_ps); + + // sumDI + mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps(I_ps, I_tale_ps)); + + t0 = _mm_srai_epi32(Ixy_1, 16); // Iy8 Iy9 Iy10 Iy11 + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_1, 16), 16); // Ix0 Ix1 Ix2 Ix3 + + fy = _mm_cvtepi32_ps(t0); + fx = _mm_cvtepi32_ps(t1); + + // 4 ... 
7 + I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(I_0_7_epi16, I_0_7_epi16), 16)); + + // A11 - A22 + fxtale = _mm_mul_ps(fx, tale_4_7_ps); + fytale = _mm_mul_ps(fy, tale_4_7_ps); + + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); + mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); + mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); + + // sumIx und sumIy + mmSumIx = _mm_add_ps(mmSumIx, fxtale); + mmSumIy = _mm_add_ps(mmSumIy, fytale); + + mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fxtale)); + mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fytale)); + + // sumI + I_tale_ps = _mm_mul_ps(I_ps, tale_4_7_ps); + mmSumI = _mm_add_ps(mmSumI, I_tale_ps); + + mmSumW = _mm_add_ps(mmSumW, tale_4_7_ps); + + mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps(I_ps, I_tale_ps)); } + + } #else - for (; x < _winSize.width*cn; x++, dIptr += 2) + for (; x < _winSize.width*cn; x++, dIptr += 2) + { + if (maskPtr[x] == 0) + continue; + int J_val = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + Jptr[x + step] * iw10 + Jptr[x + step + cn] * iw11, + W_BITS1 - 5); + short ixval = static_cast(dIptr[0]); + short iyval = static_cast(dIptr[1]); + int diff = static_cast(J_val - Iptr[x] + Iptr[x] * gainVec.x + gainVec.y); + int abss = (diff < 0) ? -diff : diff; + if (diff > MEstimatorScale) + MEstimatorScale += eta; + if (diff < MEstimatorScale) + MEstimatorScale -= eta; + if (abss > static_cast(fParam1)) { - if (maskPtr[x] == 0) - continue; - int J_val = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + Jptr[x + step] * iw10 + Jptr[x + step + cn] * iw11, - W_BITS1 - 5); - short ixval = static_cast(dIptr[0]); - short iyval = static_cast(dIptr[1]); - int diff = static_cast(J_val - Iptr[x] + Iptr[x] * gainVec.x + gainVec.y); - int abss = (diff < 0) ? 
-diff : diff; - if (diff > MEstimatorScale) - MEstimatorScale += eta; - if (diff < MEstimatorScale) - MEstimatorScale -= eta; - if (abss > static_cast(fParam1)) + diff = 0; + } + else if (abss > static_cast(fParam0) && diff >= 0) + { + diff = static_cast(param[2] * (diff - fParam1)); + } + else if (abss > static_cast(fParam0) && diff < 0) + { + diff = static_cast(param[2] * (diff + fParam1)); + } + b1 += (float)(diff*ixval); + b2 += (float)(diff*iyval); ; + b3 += (float)(diff)* Iptr[x]; + b4 += (float)(diff); + + + // compute the Gradient Matrice + if (j == 0) + { + float tale = param[2] * FLT_RESCALE; + if (abss < fParam0 || j < 0) { - diff = 0; + tale = FLT_RESCALE; } - else if (abss > static_cast(fParam0) && diff >= 0) + else if (abss > fParam1) { - diff = static_cast(param[2] * (diff - fParam1)); + tale *= 0.01f; } - else if (abss > static_cast(fParam0) && diff < 0) + else { - diff = static_cast(param[2] * (diff + fParam1)); + tale *= param[2]; } - b1 += (float)(diff*ixval); - b2 += (float)(diff*iyval); ; - b3 += (float)(diff)* Iptr[x]; - b4 += (float)(diff); - - - // compute the Gradient Matrice if (j == 0) { - float tale = param[2] * FLT_RESCALE; - if (abss < fParam0 || j < 0) - { - tale = FLT_RESCALE; - } - else if (abss > fParam1) - { - tale *= 0.01f; - } - else - { - tale *= param[2]; - } - if (j == 0) - { - A11 += (float)(ixval*ixval)*tale; - A12 += (float)(ixval*iyval)*tale; - A22 += (float)(iyval*iyval)*tale; - } - - dI += Iptr[x] * Iptr[x] * tale; - float dx = static_cast(dIptr[0]) * tale; - float dy = static_cast(dIptr[1]) * tale; - sumIx += dx; - sumIy += dy; - w1 += dx * Iptr[x]; - w2 += dy * Iptr[x]; - sumI += Iptr[x] * tale; - sumW += tale; + A11 += (float)(ixval*ixval)*tale; + A12 += (float)(ixval*iyval)*tale; + A22 += (float)(iyval*iyval)*tale; } + dI += Iptr[x] * Iptr[x] * tale; + float dx = static_cast(dIptr[0]) * tale; + float dy = static_cast(dIptr[1]) * tale; + sumIx += dx; + sumIy += dy; + w1 += dx * Iptr[x]; + w2 += dy * Iptr[x]; + sumI 
+= Iptr[x] * tale; + sumW += tale; } -#endif + } +#endif + } #ifdef RLOF_SSE - short etaValues[8]; - _mm_storeu_si128((__m128i*)(etaValues), mmEta); - MEstimatorScale += eta * (etaValues[0] + etaValues[1] + etaValues[2] + etaValues[3] - + etaValues[4] + etaValues[5] + etaValues[6] + etaValues[7]); - float CV_DECL_ALIGNED(32) wbuf[4]; + short etaValues[8]; + _mm_storeu_si128((__m128i*)(etaValues), mmEta); + MEstimatorScale += eta * (etaValues[0] + etaValues[1] + etaValues[2] + etaValues[3] + + etaValues[4] + etaValues[5] + etaValues[6] + etaValues[7]); + float CV_DECL_ALIGNED(32) wbuf[4]; #endif - if (j == 0) - { + if (j == 0) + { #ifdef RLOF_SSE - _mm_store_ps(wbuf, mmSumW1); - w1 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumW2); - w2 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumDI); - dI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumI); - sumI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumIx); - sumIx = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumIy); - sumIy = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumW); - sumW = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumW1); + w1 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumW2); + w2 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumDI); + dI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumI); + sumI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumIx); + sumIx = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumIy); + sumIy = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumW); + sumW = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; #endif - sumIx *= -FLT_SCALE; - sumIy *= -FLT_SCALE; - sumI *= FLT_SCALE; - sumW *= FLT_SCALE; - w1 *= -FLT_SCALE; - w2 *= -FLT_SCALE; - dI *= FLT_SCALE; + sumIx *= -FLT_SCALE; + sumIy *= -FLT_SCALE; + sumI *= FLT_SCALE; + sumW *= FLT_SCALE; + w1 *= 
-FLT_SCALE; + w2 *= -FLT_SCALE; + dI *= FLT_SCALE; #ifdef RLOF_SSE - float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4]; - _mm_store_ps(A11buf, mmAxx); - _mm_store_ps(A12buf, mmAxy); - _mm_store_ps(A22buf, mmAyy); + float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4]; + _mm_store_ps(A11buf, mmAxx); + _mm_store_ps(A12buf, mmAxy); + _mm_store_ps(A22buf, mmAyy); - A11 = A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; - A12 = A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; - A22 = A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; + A11 = A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; + A12 = A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; + A22 = A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; #endif - A11 *= FLT_SCALE; - A12 *= FLT_SCALE; - A22 *= FLT_SCALE; - } + A11 *= FLT_SCALE; + A12 *= FLT_SCALE; + A22 *= FLT_SCALE; + } #ifdef RLOF_SSE - float CV_DECL_ALIGNED(16) bbuf[4]; - _mm_store_ps(bbuf, _mm_add_ps(qb0, qb1)); - b1 = bbuf[0] + bbuf[2]; - b2 = bbuf[1] + bbuf[3]; - _mm_store_ps(bbuf, qb2); - b3 = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; - _mm_store_ps(bbuf, qb3); - b4 = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; + float CV_DECL_ALIGNED(16) bbuf[4]; + _mm_store_ps(bbuf, _mm_add_ps(qb0, qb1)); + b1 = bbuf[0] + bbuf[2]; + b2 = bbuf[1] + bbuf[3]; + _mm_store_ps(bbuf, qb2); + b3 = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; + _mm_store_ps(bbuf, qb3); + b4 = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; #endif - mismatchMat.at(0, 0) = b1 * FLT_SCALE; - mismatchMat.at(1, 0) = b2 * FLT_SCALE; - mismatchMat.at(2, 0) = -b3 * FLT_SCALE; - mismatchMat.at(3, 0) = -b4 * FLT_SCALE; - - D = -A12 * A12*sumI*sumI + dI * sumW*A12*A12 + 2 * A12*sumI*sumIx*w2 + 2 * A12*sumI*sumIy*w1 - - 2 * dI*A12*sumIx*sumIy - 2 * sumW*A12*w1*w2 + A11 * A22*sumI*sumI - 2 * A22*sumI*sumIx*w1 - - 2 * A11*sumI*sumIy*w2 - sumIx * sumIx*w2*w2 + A22 * dI*sumIx*sumIx + 2 * sumIx*sumIy*w1*w2 - - sumIy * sumIy*w1*w1 + A11 * dI*sumIy*sumIy + A22 * sumW*w1*w1 + A11 * sumW*w2*w2 - A11 * A22*dI*sumW; - - float sqrtVal = 
std::sqrt((A11 - A22)*(A11 - A22) + 4.f*A12*A12); - minEigValue = (A22 + A11 - sqrtVal) / (2.0f*winArea); - if (minEigValue < minEigThreshold || std::abs(D) < FLT_EPSILON) + mismatchMat.at(0, 0) = b1 * FLT_SCALE; + mismatchMat.at(1, 0) = b2 * FLT_SCALE; + mismatchMat.at(2, 0) = -b3 * FLT_SCALE; + mismatchMat.at(3, 0) = -b4 * FLT_SCALE; + + D = -A12 * A12*sumI*sumI + dI * sumW*A12*A12 + 2 * A12*sumI*sumIx*w2 + 2 * A12*sumI*sumIy*w1 + - 2 * dI*A12*sumIx*sumIy - 2 * sumW*A12*w1*w2 + A11 * A22*sumI*sumI - 2 * A22*sumI*sumIx*w1 + - 2 * A11*sumI*sumIy*w2 - sumIx * sumIx*w2*w2 + A22 * dI*sumIx*sumIx + 2 * sumIx*sumIy*w1*w2 + - sumIy * sumIy*w1*w1 + A11 * dI*sumIy*sumIy + A22 * sumW*w1*w1 + A11 * sumW*w2*w2 - A11 * A22*dI*sumW; + + float sqrtVal = std::sqrt((A11 - A22)*(A11 - A22) + 4.f*A12*A12); + minEigValue = (A22 + A11 - sqrtVal) / (2.0f*winArea); + if (minEigValue < minEigThreshold || std::abs(D) < FLT_EPSILON) + { + if (level == 0 && status) + status[ptidx] = 0; + if (level > 0) { - if (level == 0 && status) - status[ptidx] = 0; - if (level > 0) - { - nextPts[ptidx] = backUpNextPt; - gainVecs[ptidx] = backUpGain; - } - break; + nextPts[ptidx] = backUpNextPt; + gainVecs[ptidx] = backUpGain; } + break; + } - D = (1.f / D); - - invTensorMat.at(0, 0) = (A22*sumI*sumI - 2 * sumI*sumIy*w2 + dI * sumIy*sumIy + sumW * w2*w2 - A22 * dI*sumW)* D; invTensorMat.at(0, 1) = (A12*dI*sumW - A12 * sumI * sumI - dI * sumIx*sumIy + sumI * sumIx*w2 + sumI * sumIy*w1 - sumW * w1*w2)* D; - invTensorMat.at(0, 2) = (A12*sumI*sumIy - sumIy * sumIy*w1 - A22 * sumI*sumIx - A12 * sumW*w2 + A22 * sumW*w1 + sumIx * sumIy*w2)* D; - invTensorMat.at(0, 3) = (A22*dI*sumIx - A12 * dI*sumIy - sumIx * w2*w2 + A12 * sumI*w2 - A22 * sumI*w1 + sumIy * w1*w2) * D; - invTensorMat.at(1, 0) = invTensorMat.at(0, 1); - invTensorMat.at(1, 1) = (A11*sumI * sumI - 2 * sumI*sumIx*w1 + dI * sumIx * sumIx + sumW * w1*w1 - A11 * dI*sumW) * D; - invTensorMat.at(1, 2) = (A12*sumI*sumIx - A11 * sumI*sumIy - sumIx * 
sumIx*w2 + A11 * sumW*w2 - A12 * sumW*w1 + sumIx * sumIy*w1) * D; - invTensorMat.at(1, 3) = (A11*dI*sumIy - sumIy * w1*w1 - A12 * dI*sumIx - A11 * sumI*w2 + A12 * sumI*w1 + sumIx * w1*w2)* D; - invTensorMat.at(2, 0) = invTensorMat.at(0, 2); - invTensorMat.at(2, 1) = invTensorMat.at(1, 2); - invTensorMat.at(2, 2) = (sumW*A12*A12 - 2 * A12*sumIx*sumIy + A22 * sumIx*sumIx + A11 * sumIy*sumIy - A11 * A22*sumW)* D; - invTensorMat.at(2, 3) = (A11*A22*sumI - A12 * A12*sumI - A11 * sumIy*w2 + A12 * sumIx*w2 + A12 * sumIy*w1 - A22 * sumIx*w1)* D; - invTensorMat.at(3, 0) = invTensorMat.at(0, 3); - invTensorMat.at(3, 1) = invTensorMat.at(1, 3); - invTensorMat.at(3, 2) = invTensorMat.at(2, 3); - invTensorMat.at(3, 3) = (dI*A12*A12 - 2 * A12*w1*w2 + A22 * w1*w1 + A11 * w2*w2 - A11 * A22*dI)* D; + D = (1.f / D); - resultMat = invTensorMat * mismatchMat; + invTensorMat.at(0, 0) = (A22*sumI*sumI - 2 * sumI*sumIy*w2 + dI * sumIy*sumIy + sumW * w2*w2 - A22 * dI*sumW)* D; invTensorMat.at(0, 1) = (A12*dI*sumW - A12 * sumI * sumI - dI * sumIx*sumIy + sumI * sumIx*w2 + sumI * sumIy*w1 - sumW * w1*w2)* D; + invTensorMat.at(0, 2) = (A12*sumI*sumIy - sumIy * sumIy*w1 - A22 * sumI*sumIx - A12 * sumW*w2 + A22 * sumW*w1 + sumIx * sumIy*w2)* D; + invTensorMat.at(0, 3) = (A22*dI*sumIx - A12 * dI*sumIy - sumIx * w2*w2 + A12 * sumI*w2 - A22 * sumI*w1 + sumIy * w1*w2) * D; + invTensorMat.at(1, 0) = invTensorMat.at(0, 1); + invTensorMat.at(1, 1) = (A11*sumI * sumI - 2 * sumI*sumIx*w1 + dI * sumIx * sumIx + sumW * w1*w1 - A11 * dI*sumW) * D; + invTensorMat.at(1, 2) = (A12*sumI*sumIx - A11 * sumI*sumIy - sumIx * sumIx*w2 + A11 * sumW*w2 - A12 * sumW*w1 + sumIx * sumIy*w1) * D; + invTensorMat.at(1, 3) = (A11*dI*sumIy - sumIy * w1*w1 - A12 * dI*sumIx - A11 * sumI*w2 + A12 * sumI*w1 + sumIx * w1*w2)* D; + invTensorMat.at(2, 0) = invTensorMat.at(0, 2); + invTensorMat.at(2, 1) = invTensorMat.at(1, 2); + invTensorMat.at(2, 2) = (sumW*A12*A12 - 2 * A12*sumIx*sumIy + A22 * sumIx*sumIx + A11 * sumIy*sumIy - 
A11 * A22*sumW)* D; + invTensorMat.at(2, 3) = (A11*A22*sumI - A12 * A12*sumI - A11 * sumIy*w2 + A12 * sumIx*w2 + A12 * sumIy*w1 - A22 * sumIx*w1)* D; + invTensorMat.at(3, 0) = invTensorMat.at(0, 3); + invTensorMat.at(3, 1) = invTensorMat.at(1, 3); + invTensorMat.at(3, 2) = invTensorMat.at(2, 3); + invTensorMat.at(3, 3) = (dI*A12*A12 - 2 * A12*w1*w2 + A22 * w1*w1 + A11 * w2*w2 - A11 * A22*dI)* D; - Point2f delta(-resultMat.at(0), -resultMat.at(1)); - Point2f deltaGain(resultMat.at(2), resultMat.at(3)); + resultMat = invTensorMat * mismatchMat; + Point2f delta(-resultMat.at(0), -resultMat.at(1)); + Point2f deltaGain(resultMat.at(2), resultMat.at(3)); - if (j == 0) - prevGain = deltaGain; - gainVec += deltaGain * 0.8; - nextPt += delta * 0.8; - nextPts[ptidx] = nextPt + halfWin; - gainVecs[ptidx] = gainVec; - if ( - (std::abs(delta.x - prevDelta.x) < 0.01 && std::abs(delta.y - prevDelta.y) < 0.01) - || ((delta.ddot(delta) <= 0.001) && std::abs(prevGain.x - deltaGain.x) < 0.01) - ) - { - nextPts[ptidx] -= delta * 0.5f; - gainVecs[ptidx] -= deltaGain * 0.5f; - break; - } - - prevDelta = delta; + if (j == 0) prevGain = deltaGain; + gainVec += deltaGain * 0.8; + nextPt += delta * 0.8; + nextPts[ptidx] = nextPt + halfWin; + gainVecs[ptidx] = gainVec; + + if ( + (std::abs(delta.x - prevDelta.x) < 0.01 && std::abs(delta.y - prevDelta.y) < 0.01) + || ((delta.ddot(delta) <= 0.001) && std::abs(prevGain.x - deltaGain.x) < 0.01) + ) + { + nextPts[ptidx] -= delta * 0.5f; + gainVecs[ptidx] -= deltaGain * 0.5f; + break; } + prevDelta = delta; + prevGain = deltaGain; } } - const Mat* prevImg; - const Mat* nextImg; - const Mat* prevDeriv; - const Mat* rgbPrevImg; - const Mat* rgbNextImg; - const Point2f* prevPts; - Point2f* nextPts; - uchar* status; - cv::Point2f* gainVecs; - float* err; - int maxWinSize; - int minWinSize; - TermCriteria criteria; - int level; - int maxLevel; - int windowType; - float minEigThreshold; - bool useInitialFlow; - std::vector paramReset; - int 
crossSegmentationThreshold; - }; -} -} -} -} + } + + const Mat* prevImg; + const Mat* nextImg; + const Mat* prevDeriv; + const Mat* rgbPrevImg; + const Mat* rgbNextImg; + const Point2f* prevPts; + Point2f* nextPts; + uchar* status; + cv::Point2f* gainVecs; + float* err; + int maxWinSize; + int minWinSize; + TermCriteria criteria; + int level; + int maxLevel; + int windowType; + float minEigThreshold; + bool useInitialFlow; + std::vector paramReset; + int crossSegmentationThreshold; +}; + +}}}} // namespace #endif diff --git a/modules/optflow/src/rlof/rlof_invokerbase.hpp b/modules/optflow/src/rlof/rlof_invokerbase.hpp index 68e2ca28bbc..991531deef0 100644 --- a/modules/optflow/src/rlof/rlof_invokerbase.hpp +++ b/modules/optflow/src/rlof/rlof_invokerbase.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. #ifndef _RLOF_INVOKERBASE_HPP_ -#define _RLOF_INVOKERBASE_HPP_ +#define _RLOF_INVOKERBASE_HPP_ #ifdef CV_CPU_COMPILE_SSE4_1 @@ -26,250 +26,249 @@ using namespace std; using namespace cv; -namespace cv -{ -namespace optflow -{ - typedef short deriv_type; +namespace cv { +namespace optflow { + +typedef short deriv_type; #ifdef RLOF_SSE - inline void get4BitMask(const int & width, __m128i & mask) +inline void get4BitMask(const int & width, __m128i & mask) +{ + int noBits = width - static_cast(floor(width / 4.f) * 4.f); + unsigned int val[4]; + for (int n = 0; n < 4; n++) { - int noBits = width - static_cast(floor(width / 4.f) * 4.f); - unsigned int val[4]; - for (int n = 0; n < 4; n++) - { - val[n] = (noBits > n) ? (std::numeric_limits::max()) : 0; - } - mask = _mm_set_epi32(val[3], val[2], val[1], val[0]); - + val[n] = (noBits > n) ? 
(std::numeric_limits::max()) : 0; } - inline void getWBitMask(const int & width, __m128i & t0, __m128i & t1, __m128i & t2) + mask = _mm_set_epi32(val[3], val[2], val[1], val[0]); + +} +inline void getWBitMask(const int & width, __m128i & t0, __m128i & t1, __m128i & t2) +{ + int noBits = width - static_cast(floor(width / 8.f) * 8.f); + unsigned short val[8]; + for (int n = 0; n < 8; n++) { - int noBits = width - static_cast(floor(width / 8.f) * 8.f); - unsigned short val[8]; - for (int n = 0; n < 8; n++) - { - val[n] = (noBits > n) ? (0xffff) : 0; - } - t1 = _mm_set_epi16(val[7], val[7], val[6], val[6], val[5], val[5], val[4], val[4]); - t0 = _mm_set_epi16(val[3], val[3], val[2], val[2], val[1], val[1], val[0], val[0]); - t2 = _mm_set_epi16(val[7], val[6], val[5], val[4], val[3], val[2], val[1], val[0]); + val[n] = (noBits > n) ? (0xffff) : 0; } + t1 = _mm_set_epi16(val[7], val[7], val[6], val[6], val[5], val[5], val[4], val[4]); + t0 = _mm_set_epi16(val[3], val[3], val[2], val[2], val[1], val[1], val[0], val[0]); + t2 = _mm_set_epi16(val[7], val[6], val[5], val[4], val[3], val[2], val[1], val[0]); +} #endif - typedef uchar tMaskType; +typedef uchar tMaskType; #define tCVMaskType CV_8UC1 #define MaskSet 0xffffffff - inline bool notSameColor(const cv::Point3_ & ref, int r, int c, const cv::Mat & img, int winSize, int threshold) +inline bool notSameColor(const cv::Point3_ & ref, int r, int c, const cv::Mat & img, int winSize, int threshold) +{ + if (r >= img.rows + winSize || c >= img.cols + winSize || r < -winSize || c < -winSize) return true; + int step = static_cast(img.step1()); + const cv::Point3_ * tval = img.ptr>(); + tval += c; + tval += r * step / 3; + int R = std::abs((int)ref.x - (int)tval->x); + int G = std::abs((int)ref.y - (int)tval->y); + int B = std::abs((int)ref.z - (int)tval->z); + int diff = MAX(R, MAX(G, B)); + return diff > threshold; +} + +/*! 
Estimate the mask with points of the same color as the middle point +*\param prevGray input CV_8UC1 +*\param prevImg input CV_8UC3 +*\param nextImg input CV_8UC3 +*\param prevPoint global position of the middle point of the mask window +*\param nextPoint global position of the middle point of the mask window +*\param winPointMask mask matrice with 1 labeling points contained and 0 point is not contained by the segment +*\param noPoints number of points contained by the segment +*\param winRoi rectangle of the region of interesset in global coordinates +*\param minWinSize, +*\param threshold, +*\param useBothImages +*/ +static void getLocalPatch( + const cv::Mat & prevImg, + const cv::Point2i & prevPoint, // feature points + cv::Mat & winPointMask, + int & noPoints, + cv::Rect & winRoi, + int minWinSize, + int threshold) +{ + int maxWinSizeH = (winPointMask.cols - 1) / 2; + winRoi.x = prevPoint.x; + winRoi.y = prevPoint.y; + winRoi.width = winPointMask.cols; + winRoi.height = winPointMask.rows; + + if (minWinSize == winPointMask.cols) { - if (r >= img.rows + winSize || c >= img.cols + winSize || r < -winSize || c < -winSize) return true; - int step = static_cast(img.step1()); - const cv::Point3_ * tval = img.ptr>(); - tval += c; - tval += r * step / 3; - int R = std::abs((int)ref.x - (int)tval->x); - int G = std::abs((int)ref.y - (int)tval->y); - int B = std::abs((int)ref.z - (int)tval->z); - int diff = MAX(R, MAX(G, B)); - return diff > threshold; + winRoi.x = prevPoint.x - maxWinSizeH; + winRoi.y = prevPoint.y - maxWinSizeH; + winPointMask.setTo(1); + noPoints = winPointMask.size().area(); + return; } - - /*! 
Estimate the mask with points of the same color as the middle point - *\param prevGray input CV_8UC1 - *\param prevImg input CV_8UC3 - *\param nextImg input CV_8UC3 - *\param prevPoint global position of the middle point of the mask window - *\param nextPoint global position of the middle point of the mask window - *\param winPointMask mask matrice with 1 labeling points contained and 0 point is not contained by the segment - *\param noPoints number of points contained by the segment - *\param winRoi rectangle of the region of interesset in global coordinates - *\param minWinSize, - *\param threshold, - *\param useBothImages - */ - static void getLocalPatch( - const cv::Mat & prevImg, - const cv::Point2i & prevPoint, // feature points - cv::Mat & winPointMask, - int & noPoints, - cv::Rect & winRoi, - int minWinSize, - int threshold) + noPoints = 0; + int c = prevPoint.x; + int r = prevPoint.y; + int c_left = c - 1; + int c_right = c + 1; + int r_top = r - 1; + int r_bottom = r; + int border_left = c - maxWinSizeH; + int border_right = c + maxWinSizeH; + int border_top = r - maxWinSizeH; + int border_bottom = r + maxWinSizeH; + int c_local_diff = prevPoint.x - maxWinSizeH; + int r_local_diff = prevPoint.y - maxWinSizeH; + int _c = c - c_local_diff; + int _r = r - r_local_diff; + int min_r = _r; + int max_r = _r; + int min_c = _c; + int max_c = _c; + // horizontal line + if (r < 0 || r >= prevImg.rows || c < 0 || c >= prevImg.cols) { - int maxWinSizeH = (winPointMask.cols - 1) / 2; - winRoi.x = prevPoint.x; - winRoi.y = prevPoint.y; - winRoi.width = winPointMask.cols; - winRoi.height = winPointMask.rows; - - if (minWinSize == winPointMask.cols) - { - winRoi.x = prevPoint.x - maxWinSizeH; - winRoi.y = prevPoint.y - maxWinSizeH; - winPointMask.setTo(1); - noPoints = winPointMask.size().area(); - return; - } noPoints = 0; - int c = prevPoint.x; - int r = prevPoint.y; - int c_left = c - 1; - int c_right = c + 1; - int r_top = r - 1; - int r_bottom = r; - int border_left 
= c - maxWinSizeH; - int border_right = c + maxWinSizeH; - int border_top = r - maxWinSizeH; - int border_bottom = r + maxWinSizeH; - int c_local_diff = prevPoint.x - maxWinSizeH; - int r_local_diff = prevPoint.y - maxWinSizeH; - int _c = c - c_local_diff; - int _r = r - r_local_diff; - int min_r = _r; - int max_r = _r; - int min_c = _c; - int max_c = _c; - // horizontal line - if (r < 0 || r >= prevImg.rows || c < 0 || c >= prevImg.cols) - { - noPoints = 0; - return; - } - cv::Point3_ val1 = prevImg.at>(r, c); // middle grayvalue - cv::Point3_ tval; + return; + } + cv::Point3_ val1 = prevImg.at>(r, c); // middle grayvalue + cv::Point3_ tval; - //vertical line - for (int dr = r_top; dr >= border_top; dr--) - { - if (notSameColor(val1, dr, c, prevImg, maxWinSizeH, threshold)) - break; + //vertical line + for (int dr = r_top; dr >= border_top; dr--) + { + if (notSameColor(val1, dr, c, prevImg, maxWinSizeH, threshold)) + break; - int _dr = dr - r_local_diff; - min_r = MIN(min_r, _dr); - max_r = MAX(max_r, _dr); - winPointMask.at(_dr, _c) = 1; - noPoints++; - } - for (int dr = r_bottom; dr < border_bottom; dr++) - { - if (notSameColor(val1, dr, c, prevImg, maxWinSizeH, threshold) - ) - break; + int _dr = dr - r_local_diff; + min_r = MIN(min_r, _dr); + max_r = MAX(max_r, _dr); + winPointMask.at(_dr, _c) = 1; + noPoints++; + } + for (int dr = r_bottom; dr < border_bottom; dr++) + { + if (notSameColor(val1, dr, c, prevImg, maxWinSizeH, threshold) + ) + break; - int _dr = dr - r_local_diff; - min_r = MIN(min_r, _dr); - max_r = MAX(max_r, _dr); - winPointMask.at(_dr, _c) = 1; - noPoints++; + int _dr = dr - r_local_diff; + min_r = MIN(min_r, _dr); + max_r = MAX(max_r, _dr); + winPointMask.at(_dr, _c) = 1; + noPoints++; + } + // accumulate along the vertical line and the search line that was still labled + for (int dr = min_r + r_local_diff; dr <= max_r + r_local_diff; dr++) + { + int _dr = dr - r_local_diff; + if (winPointMask.at(_dr, _c) == 0) + { + 
winPointMask.row(_dr).setTo(0); + continue; } - // accumulate along the vertical line and the search line that was still labled - for (int dr = min_r + r_local_diff; dr <= max_r + r_local_diff; dr++) + bool skip = false; + int _dc = c_right - c_local_diff; + for (int dc = c_right; dc < border_right; dc++, _dc++) { - int _dr = dr - r_local_diff; - if (winPointMask.at(_dr, _c) == 0) + if (skip == false) { - winPointMask.row(_dr).setTo(0); - continue; + if (notSameColor(val1, dr, dc, prevImg, maxWinSizeH, threshold)) + skip = true; } - bool skip = false; - int _dc = c_right - c_local_diff; - for (int dc = c_right; dc < border_right; dc++, _dc++) + if (skip == false) { - if (skip == false) - { - if (notSameColor(val1, dr, dc, prevImg, maxWinSizeH, threshold)) - skip = true; - } - if (skip == false) - { - min_c = MIN(min_c, _dc); - max_c = MAX(max_c, _dc); - winPointMask.at(_dr, _dc) = 1; - noPoints++; - } - else - winPointMask.at(_dr, _dc) = 0; + min_c = MIN(min_c, _dc); + max_c = MAX(max_c, _dc); + winPointMask.at(_dr, _dc) = 1; + noPoints++; } + else + winPointMask.at(_dr, _dc) = 0; + } - skip = false; - _dc = c_left - c_local_diff; - for (int dc = c_left; dc >= border_left; dc--, _dc--) + skip = false; + _dc = c_left - c_local_diff; + for (int dc = c_left; dc >= border_left; dc--, _dc--) + { + if (skip == false) + { + if (notSameColor(val1, dr, dc, prevImg, maxWinSizeH, threshold)) + skip = true; + } + if (skip == false) { - if (skip == false) - { - if (notSameColor(val1, dr, dc, prevImg, maxWinSizeH, threshold)) - skip = true; - } - if (skip == false) - { - min_c = MIN(min_c, _dc); - max_c = MAX(max_c, _dc); - winPointMask.at(_dr, _dc) = 1; - noPoints++; - } - else - winPointMask.at(_dr, _dc) = 0; + min_c = MIN(min_c, _dc); + max_c = MAX(max_c, _dc); + winPointMask.at(_dr, _dc) = 1; + noPoints++; } + else + winPointMask.at(_dr, _dc) = 0; } + } - // get the initial small window - if (noPoints < minWinSize * minWinSize) - { - cv::Rect roi(winPointMask.cols / 2 - 
(minWinSize - 1) / 2, - winPointMask.rows / 2 - (minWinSize - 1) / 2, - minWinSize, minWinSize); - cv::Mat(winPointMask, roi).setTo(1); - min_c = MIN(MIN(min_c, roi.tl().x), roi.br().x - 1); - max_c = MAX(MAX(max_c, roi.tl().x), roi.br().x - 1); - min_r = MIN(MIN(min_r, roi.tl().y), roi.br().y - 1); - max_r = MAX(MAX(max_r, roi.tl().y), roi.br().y - 1); - noPoints += minWinSize * minWinSize; - } - winRoi.x = c_local_diff + min_c; - winRoi.y = r_local_diff + min_r; - winRoi.width = max_c - min_c + 1; - winRoi.height = max_r - min_r + 1; - winPointMask = winPointMask(cv::Rect(min_c, min_r, winRoi.width, winRoi.height)); + // get the initial small window + if (noPoints < minWinSize * minWinSize) + { + cv::Rect roi(winPointMask.cols / 2 - (minWinSize - 1) / 2, + winPointMask.rows / 2 - (minWinSize - 1) / 2, + minWinSize, minWinSize); + cv::Mat(winPointMask, roi).setTo(1); + min_c = MIN(MIN(min_c, roi.tl().x), roi.br().x - 1); + max_c = MAX(MAX(max_c, roi.tl().x), roi.br().x - 1); + min_r = MIN(MIN(min_r, roi.tl().y), roi.br().y - 1); + max_r = MAX(MAX(max_r, roi.tl().y), roi.br().y - 1); + noPoints += minWinSize * minWinSize; } + winRoi.x = c_local_diff + min_c; + winRoi.y = r_local_diff + min_r; + winRoi.width = max_c - min_c + 1; + winRoi.height = max_r - min_r + 1; + winPointMask = winPointMask(cv::Rect(min_c, min_r, winRoi.width, winRoi.height)); +} - inline bool calcWinMaskMat( const cv::Mat & BI, - const int windowType, - const cv::Point2i & iprevPt, - cv::Mat & winMaskMat, - cv::Size & winSize, - cv::Point2f & halfWin, - int & winArea, - const int minWinSize, - const int maxWinSize, - const int crossSegmentationThreshold) +inline bool calcWinMaskMat( + const cv::Mat & BI, + const int windowType, + const cv::Point2i & iprevPt, + cv::Mat & winMaskMat, + cv::Size & winSize, + cv::Point2f & halfWin, + int & winArea, + const int minWinSize, + const int maxWinSize, + const int crossSegmentationThreshold) +{ + if (windowType == SR_CROSS && maxWinSize != minWinSize) { - 
- if (windowType == SR_CROSS && maxWinSize != minWinSize) - { - // patch generation - cv::Rect winRoi; - getLocalPatch(BI, iprevPt, winMaskMat, winArea, winRoi, minWinSize, crossSegmentationThreshold); - if (winArea == 0) - return false; - winSize = winRoi.size(); - halfWin = Point2f(static_cast(iprevPt.x - winRoi.tl().x), - static_cast(iprevPt.y - winRoi.tl().y)); - } - else - { - winSize = cv::Size(maxWinSize, maxWinSize); - halfWin = Point2f((winSize.width - 1) / 2.f, (winSize.height - 1) / 2.f); - winMaskMat.setTo(1); - } - return true; + // patch generation + cv::Rect winRoi; + getLocalPatch(BI, iprevPt, winMaskMat, winArea, winRoi, minWinSize, crossSegmentationThreshold); + if (winArea == 0) + return false; + winSize = winRoi.size(); + halfWin = Point2f(static_cast(iprevPt.x - winRoi.tl().x), + static_cast(iprevPt.y - winRoi.tl().y)); } - - inline short estimateScale(cv::Mat & residuals) + else { - cv::Mat absMat = cv::abs(residuals); - return quickselect(absMat, absMat.rows / 2); + winSize = cv::Size(maxWinSize, maxWinSize); + halfWin = Point2f((winSize.width - 1) / 2.f, (winSize.height - 1) / 2.f); + winMaskMat.setTo(1); } + return true; } + +inline short estimateScale(cv::Mat & residuals) +{ + cv::Mat absMat = cv::abs(residuals); + return quickselect(absMat, absMat.rows / 2); } + +}} // namespace #endif diff --git a/modules/optflow/src/rlof/rlof_localflow.cpp b/modules/optflow/src/rlof/rlof_localflow.cpp index 4d18380ea77..666421f767f 100644 --- a/modules/optflow/src/rlof/rlof_localflow.cpp +++ b/modules/optflow/src/rlof/rlof_localflow.cpp @@ -11,542 +11,539 @@ using namespace std; using namespace cv; -namespace cv -{ - namespace detail - { +namespace cv { +namespace detail { +typedef short deriv_type; +} // namespace - typedef short deriv_type; - } -namespace +namespace { +static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst) { - static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst) - { - using namespace cv; - using cv::detail::deriv_type; - 
int rows = src.rows, cols = src.cols, cn = src.channels(), colsn = cols * cn, depth = src.depth(); - CV_Assert(depth == CV_8U); - dst.create(rows, cols, CV_MAKETYPE(DataType::depth, cn * 2)); + using namespace cv; + using cv::detail::deriv_type; + int rows = src.rows, cols = src.cols, cn = src.channels(), colsn = cols * cn, depth = src.depth(); + CV_Assert(depth == CV_8U); + dst.create(rows, cols, CV_MAKETYPE(DataType::depth, cn * 2)); - int x, y, delta = (int)alignSize((cols + 2)*cn, 16); - AutoBuffer _tempBuf(delta * 2 + 64); - deriv_type *trow0 = alignPtr(_tempBuf.data() + cn, 16), *trow1 = alignPtr(trow0 + delta, 16); + int x, y, delta = (int)alignSize((cols + 2)*cn, 16); + AutoBuffer _tempBuf(delta * 2 + 64); + deriv_type *trow0 = alignPtr(_tempBuf.data() + cn, 16), *trow1 = alignPtr(trow0 + delta, 16); #if CV_SIMD128 - v_int16x8 c3 = v_setall_s16(3), c10 = v_setall_s16(10); - bool haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); + v_int16x8 c3 = v_setall_s16(3), c10 = v_setall_s16(10); + bool haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); #endif - for (y = 0; y < rows; y++) - { - const uchar* srow0 = src.ptr(y > 0 ? y - 1 : rows > 1 ? 1 : 0); - const uchar* srow1 = src.ptr(y); - const uchar* srow2 = src.ptr(y < rows - 1 ? y + 1 : rows > 1 ? rows - 2 : 0); - deriv_type* drow = dst.ptr(y); + for (y = 0; y < rows; y++) + { + const uchar* srow0 = src.ptr(y > 0 ? y - 1 : rows > 1 ? 1 : 0); + const uchar* srow1 = src.ptr(y); + const uchar* srow2 = src.ptr(y < rows - 1 ? y + 1 : rows > 1 ? 
rows - 2 : 0); + deriv_type* drow = dst.ptr(y); - // do vertical convolution - x = 0; + // do vertical convolution + x = 0; #if CV_SIMD128 - if (haveSIMD) + if (haveSIMD) + { + for (; x <= colsn - 8; x += 8) { - for (; x <= colsn - 8; x += 8) - { - v_int16x8 s0 = v_reinterpret_as_s16(v_load_expand(srow0 + x)); - v_int16x8 s1 = v_reinterpret_as_s16(v_load_expand(srow1 + x)); - v_int16x8 s2 = v_reinterpret_as_s16(v_load_expand(srow2 + x)); + v_int16x8 s0 = v_reinterpret_as_s16(v_load_expand(srow0 + x)); + v_int16x8 s1 = v_reinterpret_as_s16(v_load_expand(srow1 + x)); + v_int16x8 s2 = v_reinterpret_as_s16(v_load_expand(srow2 + x)); - v_int16x8 t1 = s2 - s0; - v_int16x8 t0 = v_mul_wrap(s0 + s2, c3) + v_mul_wrap(s1, c10); + v_int16x8 t1 = s2 - s0; + v_int16x8 t0 = v_mul_wrap(s0 + s2, c3) + v_mul_wrap(s1, c10); - v_store(trow0 + x, t0); - v_store(trow1 + x, t1); - } + v_store(trow0 + x, t0); + v_store(trow1 + x, t1); } + } #endif - for (; x < colsn; x++) - { - int t0 = (srow0[x] + srow2[x]) * 3 + srow1[x] * 10; - int t1 = srow2[x] - srow0[x]; - trow0[x] = (deriv_type)t0; - trow1[x] = (deriv_type)t1; - } + for (; x < colsn; x++) + { + int t0 = (srow0[x] + srow2[x]) * 3 + srow1[x] * 10; + int t1 = srow2[x] - srow0[x]; + trow0[x] = (deriv_type)t0; + trow1[x] = (deriv_type)t1; + } - // make border - int x0 = (cols > 1 ? 1 : 0)*cn, x1 = (cols > 1 ? cols - 2 : 0)*cn; - for (int k = 0; k < cn; k++) - { - trow0[-cn + k] = trow0[x0 + k]; trow0[colsn + k] = trow0[x1 + k]; - trow1[-cn + k] = trow1[x0 + k]; trow1[colsn + k] = trow1[x1 + k]; - } + // make border + int x0 = (cols > 1 ? 1 : 0)*cn, x1 = (cols > 1 ? 
cols - 2 : 0)*cn; + for (int k = 0; k < cn; k++) + { + trow0[-cn + k] = trow0[x0 + k]; trow0[colsn + k] = trow0[x1 + k]; + trow1[-cn + k] = trow1[x0 + k]; trow1[colsn + k] = trow1[x1 + k]; + } - // do horizontal convolution, interleave the results and store them to dst - x = 0; + // do horizontal convolution, interleave the results and store them to dst + x = 0; #if CV_SIMD128 - if (haveSIMD) + if (haveSIMD) + { + for (; x <= colsn - 8; x += 8) { - for (; x <= colsn - 8; x += 8) - { - v_int16x8 s0 = v_load(trow0 + x - cn); - v_int16x8 s1 = v_load(trow0 + x + cn); - v_int16x8 s2 = v_load(trow1 + x - cn); - v_int16x8 s3 = v_load(trow1 + x); - v_int16x8 s4 = v_load(trow1 + x + cn); + v_int16x8 s0 = v_load(trow0 + x - cn); + v_int16x8 s1 = v_load(trow0 + x + cn); + v_int16x8 s2 = v_load(trow1 + x - cn); + v_int16x8 s3 = v_load(trow1 + x); + v_int16x8 s4 = v_load(trow1 + x + cn); - v_int16x8 t0 = s1 - s0; - v_int16x8 t1 = v_mul_wrap(s2 + s4, c3) + v_mul_wrap(s3, c10); + v_int16x8 t0 = s1 - s0; + v_int16x8 t1 = v_mul_wrap(s2 + s4, c3) + v_mul_wrap(s3, c10); - v_store_interleave((drow + x * 2), t0, t1); - } + v_store_interleave((drow + x * 2), t0, t1); } + } #endif - for (; x < colsn; x++) - { - deriv_type t0 = (deriv_type)(trow0[x + cn] - trow0[x - cn]); - deriv_type t1 = (deriv_type)((trow1[x + cn] + trow1[x - cn]) * 3 + trow1[x] * 10); - drow[x * 2] = t0; drow[x * 2 + 1] = t1; - } + for (; x < colsn; x++) + { + deriv_type t0 = (deriv_type)(trow0[x + cn] - trow0[x - cn]); + deriv_type t1 = (deriv_type)((trow1[x + cn] + trow1[x - cn]) * 3 + trow1[x] * 10); + drow[x * 2] = t0; drow[x * 2 + 1] = t1; } } +} + +} // namespace +namespace optflow { + +int buildOpticalFlowPyramidScale(InputArray _img, OutputArrayOfArrays pyramid, Size winSize, int maxLevel, bool withDerivatives, + int pyrBorder, int derivBorder, bool tryReuseInputImage, float levelScale[2]); +void calcLocalOpticalFlowCore(Ptr prevPyramids[2], Ptr currPyramids[2], InputArray _prevPts, + InputOutputArray 
_nextPts, const RLOFOpticalFlowParameter & param); -}//namespace -namespace optflow +void preprocess(Ptr prevPyramids[2], Ptr currPyramids[2], const std::vector & prevPoints, + std::vector & currPoints, const RLOFOpticalFlowParameter & param); + +inline bool isrobust(const RLOFOpticalFlowParameter & param) { - int buildOpticalFlowPyramidScale(InputArray _img, OutputArrayOfArrays pyramid, Size winSize, int maxLevel, bool withDerivatives, - int pyrBorder, int derivBorder, bool tryReuseInputImage, float levelScale[2]); - void calcLocalOpticalFlowCore(Ptr prevPyramids[2], Ptr currPyramids[2], InputArray _prevPts, - InputOutputArray _nextPts, const RLOFOpticalFlowParameter & param); + return (param.normSigma0 != std::numeric_limits::max()) + && (param.normSigma1 != std::numeric_limits::max()); +} +inline std::vector get_norm(float sigma0, float sigma1) +{ + std::vector result = { sigma0, sigma1, sigma0 / (sigma0 - sigma1), sigma0 * sigma1 }; + return result; +} - void preprocess(Ptr prevPyramids[2], Ptr currPyramids[2], const std::vector & prevPoints, - std::vector & currPoints, const RLOFOpticalFlowParameter & param); - inline bool isrobust(const RLOFOpticalFlowParameter & param) - { - return (param.normSigma0 != std::numeric_limits::max()) - && (param.normSigma1 != std::numeric_limits::max()); - } - inline std::vector get_norm(float sigma0, float sigma1) +int buildOpticalFlowPyramidScale(InputArray _img, OutputArrayOfArrays pyramid, Size winSize, int maxLevel, bool withDerivatives, + int pyrBorder, int derivBorder, bool tryReuseInputImage, float levelScale[2]) +{ + Mat img = _img.getMat(); + CV_Assert(img.depth() == CV_8U && winSize.width > 2 && winSize.height > 2); + int pyrstep = withDerivatives ? 
2 : 1; + + pyramid.create(1, (maxLevel + 1) * pyrstep, 0 /*type*/, -1, true); + + int derivType = CV_MAKETYPE(DataType::depth, img.channels() * 2); + + //level 0 + bool lvl0IsSet = false; + if (tryReuseInputImage && img.isSubmatrix() && (pyrBorder & BORDER_ISOLATED) == 0) { - std::vector result = { sigma0, sigma1, sigma0 / (sigma0 - sigma1), sigma0 * sigma1 }; - return result; + Size wholeSize; + Point ofs; + img.locateROI(wholeSize, ofs); + if (ofs.x >= winSize.width && ofs.y >= winSize.height + && ofs.x + img.cols + winSize.width <= wholeSize.width + && ofs.y + img.rows + winSize.height <= wholeSize.height) + { + pyramid.getMatRef(0) = img; + lvl0IsSet = true; + } } - - int buildOpticalFlowPyramidScale(InputArray _img, OutputArrayOfArrays pyramid, Size winSize, int maxLevel, bool withDerivatives, - int pyrBorder, int derivBorder, bool tryReuseInputImage, float levelScale[2]) + if (!lvl0IsSet) { - Mat img = _img.getMat(); - CV_Assert(img.depth() == CV_8U && winSize.width > 2 && winSize.height > 2); - int pyrstep = withDerivatives ? 
2 : 1; + Mat& temp = pyramid.getMatRef(0); - pyramid.create(1, (maxLevel + 1) * pyrstep, 0 /*type*/, -1, true); + if (!temp.empty()) + temp.adjustROI(winSize.height, winSize.height, winSize.width, winSize.width); + if (temp.type() != img.type() || temp.cols != winSize.width * 2 + img.cols || temp.rows != winSize.height * 2 + img.rows) + temp.create(img.rows + winSize.height * 2, img.cols + winSize.width * 2, img.type()); - int derivType = CV_MAKETYPE(DataType::depth, img.channels() * 2); + if (pyrBorder == BORDER_TRANSPARENT) + img.copyTo(temp(Rect(winSize.width, winSize.height, img.cols, img.rows))); + else + copyMakeBorder(img, temp, winSize.height, winSize.height, winSize.width, winSize.width, pyrBorder); + temp.adjustROI(-winSize.height, -winSize.height, -winSize.width, -winSize.width); + } - //level 0 - bool lvl0IsSet = false; - if (tryReuseInputImage && img.isSubmatrix() && (pyrBorder & BORDER_ISOLATED) == 0) - { - Size wholeSize; - Point ofs; - img.locateROI(wholeSize, ofs); - if (ofs.x >= winSize.width && ofs.y >= winSize.height - && ofs.x + img.cols + winSize.width <= wholeSize.width - && ofs.y + img.rows + winSize.height <= wholeSize.height) - { - pyramid.getMatRef(0) = img; - lvl0IsSet = true; - } - } + Size sz = img.size(); + Mat prevLevel = pyramid.getMatRef(0); + Mat thisLevel = prevLevel; - if (!lvl0IsSet) + for (int level = 0; level <= maxLevel; ++level) + { + if (level != 0) { - Mat& temp = pyramid.getMatRef(0); + Mat& temp = pyramid.getMatRef(level * pyrstep); if (!temp.empty()) temp.adjustROI(winSize.height, winSize.height, winSize.width, winSize.width); - if (temp.type() != img.type() || temp.cols != winSize.width * 2 + img.cols || temp.rows != winSize.height * 2 + img.rows) - temp.create(img.rows + winSize.height * 2, img.cols + winSize.width * 2, img.type()); + if (temp.type() != img.type() || temp.cols != winSize.width * 2 + sz.width || temp.rows != winSize.height * 2 + sz.height) + temp.create(sz.height + winSize.height * 2, sz.width + 
winSize.width * 2, img.type()); - if (pyrBorder == BORDER_TRANSPARENT) - img.copyTo(temp(Rect(winSize.width, winSize.height, img.cols, img.rows))); - else - copyMakeBorder(img, temp, winSize.height, winSize.height, winSize.width, winSize.width, pyrBorder); + thisLevel = temp(Rect(winSize.width, winSize.height, sz.width, sz.height)); + pyrDown(prevLevel, thisLevel, sz); + + if (pyrBorder != BORDER_TRANSPARENT) + copyMakeBorder(thisLevel, temp, winSize.height, winSize.height, winSize.width, winSize.width, pyrBorder | BORDER_ISOLATED); temp.adjustROI(-winSize.height, -winSize.height, -winSize.width, -winSize.width); } - Size sz = img.size(); - Mat prevLevel = pyramid.getMatRef(0); - Mat thisLevel = prevLevel; - - for (int level = 0; level <= maxLevel; ++level) + if (withDerivatives) { - if (level != 0) - { - Mat& temp = pyramid.getMatRef(level * pyrstep); - - if (!temp.empty()) - temp.adjustROI(winSize.height, winSize.height, winSize.width, winSize.width); - if (temp.type() != img.type() || temp.cols != winSize.width * 2 + sz.width || temp.rows != winSize.height * 2 + sz.height) - temp.create(sz.height + winSize.height * 2, sz.width + winSize.width * 2, img.type()); - - thisLevel = temp(Rect(winSize.width, winSize.height, sz.width, sz.height)); - pyrDown(prevLevel, thisLevel, sz); - - if (pyrBorder != BORDER_TRANSPARENT) - copyMakeBorder(thisLevel, temp, winSize.height, winSize.height, winSize.width, winSize.width, pyrBorder | BORDER_ISOLATED); - temp.adjustROI(-winSize.height, -winSize.height, -winSize.width, -winSize.width); - } - - if (withDerivatives) - { - Mat& deriv = pyramid.getMatRef(level * pyrstep + 1); + Mat& deriv = pyramid.getMatRef(level * pyrstep + 1); - if (!deriv.empty()) - deriv.adjustROI(winSize.height, winSize.height, winSize.width, winSize.width); - if (deriv.type() != derivType || deriv.cols != winSize.width * 2 + sz.width || deriv.rows != winSize.height * 2 + sz.height) - deriv.create(sz.height + winSize.height * 2, sz.width + winSize.width * 2, 
derivType); + if (!deriv.empty()) + deriv.adjustROI(winSize.height, winSize.height, winSize.width, winSize.width); + if (deriv.type() != derivType || deriv.cols != winSize.width * 2 + sz.width || deriv.rows != winSize.height * 2 + sz.height) + deriv.create(sz.height + winSize.height * 2, sz.width + winSize.width * 2, derivType); - Mat derivI = deriv(Rect(winSize.width, winSize.height, sz.width, sz.height)); - calcSharrDeriv(thisLevel, derivI); + Mat derivI = deriv(Rect(winSize.width, winSize.height, sz.width, sz.height)); + calcSharrDeriv(thisLevel, derivI); - if (derivBorder != BORDER_TRANSPARENT) - copyMakeBorder(derivI, deriv, winSize.height, winSize.height, winSize.width, winSize.width, derivBorder | BORDER_ISOLATED); - deriv.adjustROI(-winSize.height, -winSize.height, -winSize.width, -winSize.width); - } - - sz = Size(static_cast((sz.width + 1) / levelScale[0]), - static_cast((sz.height + 1) / levelScale[1])); - if (sz.width <= winSize.width || sz.height <= winSize.height) - { - pyramid.create(1, (level + 1) * pyrstep, 0 /*type*/, -1, true);//check this - return level; - } + if (derivBorder != BORDER_TRANSPARENT) + copyMakeBorder(derivI, deriv, winSize.height, winSize.height, winSize.width, winSize.width, derivBorder | BORDER_ISOLATED); + deriv.adjustROI(-winSize.height, -winSize.height, -winSize.width, -winSize.width); + } - prevLevel = thisLevel; + sz = Size(static_cast((sz.width + 1) / levelScale[0]), + static_cast((sz.height + 1) / levelScale[1])); + if (sz.width <= winSize.width || sz.height <= winSize.height) + { + pyramid.create(1, (level + 1) * pyrstep, 0 /*type*/, -1, true);//check this + return level; } - return maxLevel; + prevLevel = thisLevel; } - int CImageBuffer::buildPyramid(cv::Size winSize, int maxLevel, float levelScale[2]) - { - if (m_Overwrite == false) - return m_maxLevel; - m_maxLevel = buildOpticalFlowPyramidScale(m_Image, m_ImagePyramid, winSize, maxLevel, false, 4, 0, true, levelScale); + + return maxLevel; +} +int 
CImageBuffer::buildPyramid(cv::Size winSize, int maxLevel, float levelScale[2]) +{ + if (m_Overwrite == false) return m_maxLevel; - } + m_maxLevel = buildOpticalFlowPyramidScale(m_Image, m_ImagePyramid, winSize, maxLevel, false, 4, 0, true, levelScale); + return m_maxLevel; +} - void calcLocalOpticalFlowCore( - Ptr prevPyramids[2], - Ptr currPyramids[2], - InputArray _prevPts, - InputOutputArray _nextPts, - const RLOFOpticalFlowParameter & param) +void calcLocalOpticalFlowCore( + Ptr prevPyramids[2], + Ptr currPyramids[2], + InputArray _prevPts, + InputOutputArray _nextPts, + const RLOFOpticalFlowParameter & param) +{ + + bool useAdditionalRGB = param.supportRegionType == SR_CROSS; + int iWinSize = param.largeWinSize; + int winSizes[2] = { iWinSize, iWinSize }; + if (param.supportRegionType != SR_FIXED) { + winSizes[0] = param.smallWinSize; + } + //cv::Size winSize(iWinSize, iWinSize); + cv::TermCriteria criteria(cv::TermCriteria::COUNT | cv::TermCriteria::EPS, param.maxIteration, 0.01); + std::vector rlofNorm = get_norm(param.normSigma0, param.normSigma1); + CV_Assert(winSizes[0] <= winSizes[1]); - bool useAdditionalRGB = param.supportRegionType == SR_CROSS; - int iWinSize = param.largeWinSize; - int winSizes[2] = { iWinSize, iWinSize }; - if (param.supportRegionType != SR_FIXED) - { - winSizes[0] = param.smallWinSize; - } - //cv::Size winSize(iWinSize, iWinSize); - cv::TermCriteria criteria(cv::TermCriteria::COUNT | cv::TermCriteria::EPS, param.maxIteration, 0.01); - std::vector rlofNorm = get_norm(param.normSigma0, param.normSigma1); - CV_Assert(winSizes[0] <= winSizes[1]); + Mat prevPtsMat = _prevPts.getMat(); + const int derivDepth = DataType::depth; - Mat prevPtsMat = _prevPts.getMat(); - const int derivDepth = DataType::depth; + CV_Assert(param.maxLevel >= 0 && iWinSize > 2); - CV_Assert(param.maxLevel >= 0 && iWinSize > 2); + int level = 0, npoints; + CV_Assert((npoints = prevPtsMat.checkVector(2, CV_32F, true)) >= 0); - int level = 0, npoints; - 
CV_Assert((npoints = prevPtsMat.checkVector(2, CV_32F, true)) >= 0); + if (!(param.useInitialFlow)) + _nextPts.create(prevPtsMat.size(), prevPtsMat.type(), -1, true); - if (!(param.useInitialFlow)) - _nextPts.create(prevPtsMat.size(), prevPtsMat.type(), -1, true); + Mat nextPtsMat = _nextPts.getMat(); + CV_Assert(nextPtsMat.checkVector(2, CV_32F, true) == npoints); - Mat nextPtsMat = _nextPts.getMat(); - CV_Assert(nextPtsMat.checkVector(2, CV_32F, true) == npoints); + const Point2f* prevPts = (const Point2f*)prevPtsMat.data; + Point2f* nextPts = (Point2f*)nextPtsMat.data; + std::vector status(npoints); + std::vector err(npoints); + std::vector gainPts(npoints); - const Point2f* prevPts = (const Point2f*)prevPtsMat.data; - Point2f* nextPts = (Point2f*)nextPtsMat.data; - std::vector status(npoints); - std::vector err(npoints); - std::vector gainPts(npoints); + float levelScale[2] = { 2.f,2.f }; - float levelScale[2] = { 2.f,2.f }; + int maxLevel = prevPyramids[0]->buildPyramid(cv::Size(iWinSize, iWinSize), param.maxLevel, levelScale); - int maxLevel = prevPyramids[0]->buildPyramid(cv::Size(iWinSize, iWinSize), param.maxLevel, levelScale); + maxLevel = currPyramids[0]->buildPyramid(cv::Size(iWinSize, iWinSize), maxLevel, levelScale); + if (useAdditionalRGB) + { + prevPyramids[1]->buildPyramid(cv::Size(iWinSize, iWinSize), maxLevel, levelScale); + currPyramids[1]->buildPyramid(cv::Size(iWinSize, iWinSize), maxLevel, levelScale); + } - maxLevel = currPyramids[0]->buildPyramid(cv::Size(iWinSize, iWinSize), maxLevel, levelScale); + if ((criteria.type & TermCriteria::COUNT) == 0) + criteria.maxCount = 30; + else + criteria.maxCount = std::min(std::max(criteria.maxCount, 0), 100); + if ((criteria.type & TermCriteria::EPS) == 0) + criteria.epsilon = 0.001; + else + criteria.epsilon = std::min(std::max(criteria.epsilon, 0.), 10.); + criteria.epsilon *= criteria.epsilon; + + // dI/dx ~ Ix, dI/dy ~ Iy + Mat derivIBuf; + derivIBuf.create(prevPyramids[0]->m_ImagePyramid[0].rows + 
iWinSize * 2, prevPyramids[0]->m_ImagePyramid[0].cols + iWinSize * 2, CV_MAKETYPE(derivDepth, prevPyramids[0]->m_ImagePyramid[0].channels() * 2)); + + for (level = maxLevel; level >= 0; level--) + { + Mat derivI; + Size imgSize = prevPyramids[0]->getImage(level).size(); + Mat _derivI(imgSize.height + iWinSize * 2, imgSize.width + iWinSize * 2, derivIBuf.type(), derivIBuf.data); + derivI = _derivI(Rect(iWinSize, iWinSize, imgSize.width, imgSize.height)); + calcSharrDeriv(prevPyramids[0]->getImage(level), derivI); + copyMakeBorder(derivI, _derivI, iWinSize, iWinSize, iWinSize, iWinSize, BORDER_CONSTANT | BORDER_ISOLATED); + + cv::Mat tRGBPrevPyr; + cv::Mat tRGBNextPyr; if (useAdditionalRGB) { - prevPyramids[1]->buildPyramid(cv::Size(iWinSize, iWinSize), maxLevel, levelScale); - currPyramids[1]->buildPyramid(cv::Size(iWinSize, iWinSize), maxLevel, levelScale); - } + tRGBPrevPyr = prevPyramids[1]->getImage(level); + tRGBNextPyr = prevPyramids[1]->getImage(level); - if ((criteria.type & TermCriteria::COUNT) == 0) - criteria.maxCount = 30; - else - criteria.maxCount = std::min(std::max(criteria.maxCount, 0), 100); - if ((criteria.type & TermCriteria::EPS) == 0) - criteria.epsilon = 0.001; - else - criteria.epsilon = std::min(std::max(criteria.epsilon, 0.), 10.); - criteria.epsilon *= criteria.epsilon; + prevPyramids[1]->m_Overwrite = false; + currPyramids[1]->m_Overwrite = false; + } - // dI/dx ~ Ix, dI/dy ~ Iy - Mat derivIBuf; - derivIBuf.create(prevPyramids[0]->m_ImagePyramid[0].rows + iWinSize * 2, prevPyramids[0]->m_ImagePyramid[0].cols + iWinSize * 2, CV_MAKETYPE(derivDepth, prevPyramids[0]->m_ImagePyramid[0].channels() * 2)); + cv::Mat prevImage = prevPyramids[0]->getImage(level); + cv::Mat currImage = currPyramids[0]->getImage(level); - for (level = maxLevel; level >= 0; level--) + // apply plk like tracker + if (isrobust(param) == false) { - Mat derivI; - Size imgSize = prevPyramids[0]->getImage(level).size(); - Mat _derivI(imgSize.height + iWinSize * 2, 
imgSize.width + iWinSize * 2, derivIBuf.type(), derivIBuf.data); - derivI = _derivI(Rect(iWinSize, iWinSize, imgSize.width, imgSize.height)); - calcSharrDeriv(prevPyramids[0]->getImage(level), derivI); - copyMakeBorder(derivI, _derivI, iWinSize, iWinSize, iWinSize, iWinSize, BORDER_CONSTANT | BORDER_ISOLATED); - - cv::Mat tRGBPrevPyr; - cv::Mat tRGBNextPyr; - if (useAdditionalRGB) + if (param.useIlluminationModel) { - tRGBPrevPyr = prevPyramids[1]->getImage(level); - tRGBNextPyr = prevPyramids[1]->getImage(level); - - prevPyramids[1]->m_Overwrite = false; - currPyramids[1]->m_Overwrite = false; + cv::parallel_for_(cv::Range(0, npoints), + plk::radial::TrackerInvoker( + prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, + prevPts, nextPts, &status[0], &err[0], &gainPts[0], + level, maxLevel, winSizes, + param.maxIteration, + param.useInitialFlow, + param.supportRegionType, + param.minEigenValue, + param.crossSegmentationThreshold)); } - - cv::Mat prevImage = prevPyramids[0]->getImage(level); - cv::Mat currImage = currPyramids[0]->getImage(level); - - // apply plk like tracker - if (isrobust(param) == false) + else + { + if (param.solverType == SolverType::ST_STANDART) + { + cv::parallel_for_(cv::Range(0, npoints), + plk::ica::TrackerInvoker( + prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, + prevPts, nextPts, &status[0], &err[0], + level, maxLevel, winSizes, + param.maxIteration, + param.useInitialFlow, + param.supportRegionType, + param.crossSegmentationThreshold, + param.minEigenValue)); + } + else + { + cv::parallel_for_(cv::Range(0, npoints), + beplk::ica::TrackerInvoker(prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, + prevPts, nextPts, &status[0], &err[0], + level, maxLevel, winSizes, + param.maxIteration, + param.useInitialFlow, + param.supportRegionType, + param.crossSegmentationThreshold, + param.minEigenValue)); + } + } + } + // for robust models + else + { + if (param.useIlluminationModel) { - if (param.useIlluminationModel) + if 
(param.solverType == SolverType::ST_STANDART) { cv::parallel_for_(cv::Range(0, npoints), - plk::radial::TrackerInvoker( + rlof::radial::TrackerInvoker( prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, prevPts, nextPts, &status[0], &err[0], &gainPts[0], level, maxLevel, winSizes, param.maxIteration, param.useInitialFlow, param.supportRegionType, + rlofNorm, param.minEigenValue, param.crossSegmentationThreshold)); } else { - if (param.solverType == SolverType::ST_STANDART) - { - cv::parallel_for_(cv::Range(0, npoints), - plk::ica::TrackerInvoker( - prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, - prevPts, nextPts, &status[0], &err[0], - level, maxLevel, winSizes, - param.maxIteration, - param.useInitialFlow, - param.supportRegionType, - param.crossSegmentationThreshold, - param.minEigenValue)); - } - else - { - cv::parallel_for_(cv::Range(0, npoints), - beplk::ica::TrackerInvoker(prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, - prevPts, nextPts, &status[0], &err[0], - level, maxLevel, winSizes, - param.maxIteration, - param.useInitialFlow, - param.supportRegionType, - param.crossSegmentationThreshold, - param.minEigenValue)); - } + cv::parallel_for_(cv::Range(0, npoints), + berlof::radial::TrackerInvoker(prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, + prevPts, nextPts, &status[0], &err[0], &gainPts[0], + level, maxLevel, winSizes, + param.maxIteration, + param.useInitialFlow, + param.supportRegionType, + param.crossSegmentationThreshold, + rlofNorm, + param.minEigenValue)); } } - // for robust models else { - if (param.useIlluminationModel) + + if (param.solverType == SolverType::ST_STANDART) { - if (param.solverType == SolverType::ST_STANDART) - { - cv::parallel_for_(cv::Range(0, npoints), - rlof::radial::TrackerInvoker( - prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, - prevPts, nextPts, &status[0], &err[0], &gainPts[0], - level, maxLevel, winSizes, - param.maxIteration, - param.useInitialFlow, - param.supportRegionType, 
- rlofNorm, - param.minEigenValue, - param.crossSegmentationThreshold)); - } - else - { - cv::parallel_for_(cv::Range(0, npoints), - berlof::radial::TrackerInvoker(prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, - prevPts, nextPts, &status[0], &err[0], &gainPts[0], - level, maxLevel, winSizes, - param.maxIteration, - param.useInitialFlow, - param.supportRegionType, - param.crossSegmentationThreshold, - rlofNorm, - param.minEigenValue)); - } + cv::parallel_for_(cv::Range(0, npoints), + rlof::ica::TrackerInvoker( + prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, + prevPts, nextPts, &status[0], &err[0], + level, maxLevel, winSizes, + param.maxIteration, + param.useInitialFlow, + param.supportRegionType, + rlofNorm, + param.minEigenValue, + param.crossSegmentationThreshold)); } else { - - if (param.solverType == SolverType::ST_STANDART) - { - cv::parallel_for_(cv::Range(0, npoints), - rlof::ica::TrackerInvoker( - prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, - prevPts, nextPts, &status[0], &err[0], - level, maxLevel, winSizes, - param.maxIteration, - param.useInitialFlow, - param.supportRegionType, - rlofNorm, - param.minEigenValue, - param.crossSegmentationThreshold)); - } - else - { - cv::parallel_for_(cv::Range(0, npoints), - berlof::ica::TrackerInvoker(prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, - prevPts, nextPts, &status[0], &err[0], - level, maxLevel, winSizes, - param.maxIteration, - param.useInitialFlow, - param.supportRegionType, - param.crossSegmentationThreshold, - rlofNorm, - param.minEigenValue)); - } - + cv::parallel_for_(cv::Range(0, npoints), + berlof::ica::TrackerInvoker(prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, + prevPts, nextPts, &status[0], &err[0], + level, maxLevel, winSizes, + param.maxIteration, + param.useInitialFlow, + param.supportRegionType, + param.crossSegmentationThreshold, + rlofNorm, + param.minEigenValue)); } - } - prevPyramids[0]->m_Overwrite = true; - currPyramids[0]->m_Overwrite = 
true; + } } + + prevPyramids[0]->m_Overwrite = true; + currPyramids[0]->m_Overwrite = true; } +} - void preprocess(Ptr prevPyramids[2], - Ptr currPyramids[2], - const std::vector & prevPoints, - std::vector & currPoints, - const RLOFOpticalFlowParameter & param) +void preprocess(Ptr prevPyramids[2], + Ptr currPyramids[2], + const std::vector & prevPoints, + std::vector & currPoints, + const RLOFOpticalFlowParameter & param) +{ + cv::Mat mask, homography; + if (param.useGlobalMotionPrior == false) + return; + + currPoints.resize(prevPoints.size()); + + RLOFOpticalFlowParameter gmeTrackerParam = param; + gmeTrackerParam.useGlobalMotionPrior = false; + gmeTrackerParam.largeWinSize = 17; + // use none robust tracker for global motion estimation since it is faster + gmeTrackerParam.normSigma0 = std::numeric_limits::max(); + gmeTrackerParam.maxIteration = MAX(15, param.maxIteration); + gmeTrackerParam.minEigenValue = 0.000001f; + + std::vector gmPrevPoints, gmCurrPoints; + + // Initialize point grid + int stepr = prevPyramids[0]->m_Image.rows / 30; + int stepc = prevPyramids[0]->m_Image.rows / 40; + for (int r = stepr / 2; r < prevPyramids[0]->m_Image.rows; r += stepr) { - cv::Mat mask, homography; - if (param.useGlobalMotionPrior == false) - return; - - currPoints.resize(prevPoints.size()); - - RLOFOpticalFlowParameter gmeTrackerParam = param; - gmeTrackerParam.useGlobalMotionPrior = false; - gmeTrackerParam.largeWinSize = 17; - // use none robust tracker for global motion estimation since it is faster - gmeTrackerParam.normSigma0 = std::numeric_limits::max(); - gmeTrackerParam.maxIteration = MAX(15, param.maxIteration); - gmeTrackerParam.minEigenValue = 0.000001f; - - std::vector gmPrevPoints, gmCurrPoints; - - // Initialize point grid - int stepr = prevPyramids[0]->m_Image.rows / 30; - int stepc = prevPyramids[0]->m_Image.rows / 40; - for (int r = stepr / 2; r < prevPyramids[0]->m_Image.rows; r += stepr) + for (int c = stepc / 2; c < prevPyramids[0]->m_Image.cols; c 
+= stepc) { - for (int c = stepc / 2; c < prevPyramids[0]->m_Image.cols; c += stepc) - { - gmPrevPoints.push_back(cv::Point2f(static_cast(c), static_cast(r))); - } + gmPrevPoints.push_back(cv::Point2f(static_cast(c), static_cast(r))); } + } - // perform motion estimation - calcLocalOpticalFlowCore(prevPyramids, currPyramids, gmPrevPoints, gmCurrPoints, gmeTrackerParam); + // perform motion estimation + calcLocalOpticalFlowCore(prevPyramids, currPyramids, gmPrevPoints, gmCurrPoints, gmeTrackerParam); - cv::Mat prevPointsMat(static_cast(gmPrevPoints.size()), 1, CV_32FC2); - cv::Mat currPointsMat(static_cast(gmPrevPoints.size()), 1, CV_32FC2); - cv::Mat distMat(static_cast(gmPrevPoints.size()), 1, CV_32FC1); + cv::Mat prevPointsMat(static_cast(gmPrevPoints.size()), 1, CV_32FC2); + cv::Mat currPointsMat(static_cast(gmPrevPoints.size()), 1, CV_32FC2); + cv::Mat distMat(static_cast(gmPrevPoints.size()), 1, CV_32FC1); - // Forward backward confidence to estimate optimal ransac reprojection error - int noPoints = 0; - for (unsigned int n = 0; n < gmPrevPoints.size(); n++) - { - cv::Point2f flow = gmCurrPoints[n] - gmPrevPoints[n]; - prevPointsMat.at(noPoints) = gmPrevPoints[n]; - currPointsMat.at(noPoints) = gmCurrPoints[n]; - distMat.at(noPoints) = flow.x * flow.x + flow.y* flow.y; - if (isnan(distMat.at(noPoints)) == false) - noPoints++; - } + // Forward backward confidence to estimate optimal ransac reprojection error + int noPoints = 0; + for (unsigned int n = 0; n < gmPrevPoints.size(); n++) + { + cv::Point2f flow = gmCurrPoints[n] - gmPrevPoints[n]; + prevPointsMat.at(noPoints) = gmPrevPoints[n]; + currPointsMat.at(noPoints) = gmCurrPoints[n]; + distMat.at(noPoints) = flow.x * flow.x + flow.y* flow.y; + if (isnan(distMat.at(noPoints)) == false) + noPoints++; + } - float medianDist = (param.globalMotionRansacThreshold == 0) ? 
1.f : - quickselect(distMat, static_cast(noPoints * static_cast(param.globalMotionRansacThreshold) / 100.f)); - medianDist = sqrt(medianDist); + float medianDist = (param.globalMotionRansacThreshold == 0) ? 1.f : + quickselect(distMat, static_cast(noPoints * static_cast(param.globalMotionRansacThreshold) / 100.f)); + medianDist = sqrt(medianDist); - if (noPoints < 8) - return; + if (noPoints < 8) + return; - cv::findHomography(prevPointsMat(cv::Rect(0, 0, 1, noPoints)), currPointsMat(cv::Rect(0, 0, 1, noPoints)), cv::RANSAC, medianDist, mask).convertTo(homography, CV_32FC1); + cv::findHomography(prevPointsMat(cv::Rect(0, 0, 1, noPoints)), currPointsMat(cv::Rect(0, 0, 1, noPoints)), cv::RANSAC, medianDist, mask).convertTo(homography, CV_32FC1); - if (homography.empty()) - return; + if (homography.empty()) + return; - cv::perspectiveTransform(prevPoints, currPoints, homography); - } + cv::perspectiveTransform(prevPoints, currPoints, homography); +} - void calcLocalOpticalFlow( - const Mat prevImage, - const Mat currImage, - Ptr prevPyramids[2], - Ptr currPyramids[2], - const std::vector & prevPoints, - std::vector & currPoints, - const RLOFOpticalFlowParameter & param) +void calcLocalOpticalFlow( + const Mat prevImage, + const Mat currImage, + Ptr prevPyramids[2], + Ptr currPyramids[2], + const std::vector & prevPoints, + std::vector & currPoints, + const RLOFOpticalFlowParameter & param) +{ + if (prevImage.empty() == false && currImage.empty()== false) { - if (prevImage.empty() == false && currImage.empty()== false) + prevPyramids[0]->m_Overwrite = true; + currPyramids[0]->m_Overwrite = true; + prevPyramids[1]->m_Overwrite = true; + currPyramids[1]->m_Overwrite = true; + if (prevImage.type() == CV_8UC3) { - prevPyramids[0]->m_Overwrite = true; - currPyramids[0]->m_Overwrite = true; - prevPyramids[1]->m_Overwrite = true; - currPyramids[1]->m_Overwrite = true; - if (prevImage.type() == CV_8UC3) - { - prevPyramids[0]->setGrayFromRGB(prevImage); - 
currPyramids[0]->setGrayFromRGB(currImage); - prevPyramids[1]->setImage(prevImage); - currPyramids[1]->setImage(currImage); + prevPyramids[0]->setGrayFromRGB(prevImage); + currPyramids[0]->setGrayFromRGB(currImage); + prevPyramids[1]->setImage(prevImage); + currPyramids[1]->setImage(currImage); - if (param.supportRegionType == SR_CROSS) - { - prevPyramids[1]->setBlurFromRGB(prevImage); - currPyramids[1]->setBlurFromRGB(currImage); - } - } - else + if (param.supportRegionType == SR_CROSS) { - prevPyramids[0]->setImage(prevImage); - currPyramids[0]->setImage(currImage); + prevPyramids[1]->setBlurFromRGB(prevImage); + currPyramids[1]->setBlurFromRGB(currImage); } } - preprocess(prevPyramids, currPyramids, prevPoints, currPoints, param); - RLOFOpticalFlowParameter internParam = param; - if (param.useGlobalMotionPrior == true) - internParam.useInitialFlow = true; - calcLocalOpticalFlowCore(prevPyramids, currPyramids, prevPoints, currPoints, internParam); + else + { + prevPyramids[0]->setImage(prevImage); + currPyramids[0]->setImage(currImage); + } } + preprocess(prevPyramids, currPyramids, prevPoints, currPoints, param); + RLOFOpticalFlowParameter internParam = param; + if (param.useGlobalMotionPrior == true) + internParam.useInitialFlow = true; + calcLocalOpticalFlowCore(prevPyramids, currPyramids, prevPoints, currPoints, internParam); } -} \ No newline at end of file + +}} // namespace diff --git a/modules/optflow/src/rlof/rlof_localflow.h b/modules/optflow/src/rlof/rlof_localflow.h index 89ac2f8db21..00702e5323b 100644 --- a/modules/optflow/src/rlof/rlof_localflow.h +++ b/modules/optflow/src/rlof/rlof_localflow.h @@ -69,16 +69,15 @@ T quickselect(const Mat & inp, int k) } } -namespace cv -{ -namespace optflow -{ +namespace cv { +namespace optflow { + class CImageBuffer { public: CImageBuffer() : m_Overwrite(true) - {}; + {} void setGrayFromRGB(const cv::Mat & inp) { if(m_Overwrite) @@ -98,12 +97,12 @@ class CImageBuffer int buildPyramid(cv::Size winSize, int 
maxLevel, float levelScale[2]); cv::Mat & getImage(int level) {return m_ImagePyramid[level];} - std::vector m_ImagePyramid; - cv::Mat m_BlurredImage; - cv::Mat m_Image; - std::vector m_CrossPyramid; - int m_maxLevel; - bool m_Overwrite; + std::vector m_ImagePyramid; + cv::Mat m_BlurredImage; + cv::Mat m_Image; + std::vector m_CrossPyramid; + int m_maxLevel; + bool m_Overwrite; }; void calcLocalOpticalFlow( @@ -114,6 +113,6 @@ void calcLocalOpticalFlow( const std::vector & prevPoints, std::vector & currPoints, const RLOFOpticalFlowParameter & param); -} -} + +}} // namespace #endif diff --git a/modules/optflow/src/rlofflow.cpp b/modules/optflow/src/rlofflow.cpp index ff43fc39820..12a72cb035f 100644 --- a/modules/optflow/src/rlofflow.cpp +++ b/modules/optflow/src/rlofflow.cpp @@ -5,398 +5,395 @@ #include "rlof/rlof_localflow.h" #include "rlof/geo_interpolation.hpp" #include "opencv2/ximgproc.hpp" -namespace cv -{ -namespace optflow + +namespace cv { +namespace optflow { + +Ptr RLOFOpticalFlowParameter::create() { + return Ptr(new RLOFOpticalFlowParameter); +} - Ptr RLOFOpticalFlowParameter::create() - { - return Ptr(new RLOFOpticalFlowParameter); - } +void RLOFOpticalFlowParameter::setSolverType(SolverType val){ solverType = val;} +SolverType RLOFOpticalFlowParameter::getSolverType() const { return solverType;} - void RLOFOpticalFlowParameter::setSolverType(SolverType val){ solverType = val;} - SolverType RLOFOpticalFlowParameter::getSolverType() const { return solverType;} +void RLOFOpticalFlowParameter::setSupportRegionType(SupportRegionType val){ supportRegionType = val;} +SupportRegionType RLOFOpticalFlowParameter::getSupportRegionType() const { return supportRegionType;} - void RLOFOpticalFlowParameter::setSupportRegionType(SupportRegionType val){ supportRegionType = val;} - SupportRegionType RLOFOpticalFlowParameter::getSupportRegionType() const { return supportRegionType;} +void RLOFOpticalFlowParameter::setNormSigma0(float val){ normSigma0 = val;} +float 
RLOFOpticalFlowParameter::getNormSigma0() const { return normSigma0;} - void RLOFOpticalFlowParameter::setNormSigma0(float val){ normSigma0 = val;} - float RLOFOpticalFlowParameter::getNormSigma0() const { return normSigma0;} +void RLOFOpticalFlowParameter::setNormSigma1(float val){ normSigma1 = val;} +float RLOFOpticalFlowParameter::getNormSigma1() const { return normSigma1;} - void RLOFOpticalFlowParameter::setNormSigma1(float val){ normSigma1 = val;} - float RLOFOpticalFlowParameter::getNormSigma1() const { return normSigma1;} +void RLOFOpticalFlowParameter::setSmallWinSize(int val){ smallWinSize = val;} +int RLOFOpticalFlowParameter::getSmallWinSize() const { return smallWinSize;} - void RLOFOpticalFlowParameter::setSmallWinSize(int val){ smallWinSize = val;} - int RLOFOpticalFlowParameter::getSmallWinSize() const { return smallWinSize;} +void RLOFOpticalFlowParameter::setLargeWinSize(int val){ largeWinSize = val;} +int RLOFOpticalFlowParameter::getLargeWinSize() const { return largeWinSize;} - void RLOFOpticalFlowParameter::setLargeWinSize(int val){ largeWinSize = val;} - int RLOFOpticalFlowParameter::getLargeWinSize() const { return largeWinSize;} +void RLOFOpticalFlowParameter::setCrossSegmentationThreshold(int val){ crossSegmentationThreshold = val;} +int RLOFOpticalFlowParameter::getCrossSegmentationThreshold() const { return crossSegmentationThreshold;} - void RLOFOpticalFlowParameter::setCrossSegmentationThreshold(int val){ crossSegmentationThreshold = val;} - int RLOFOpticalFlowParameter::getCrossSegmentationThreshold() const { return crossSegmentationThreshold;} +void RLOFOpticalFlowParameter::setMaxLevel(int val){ maxLevel = val;} +int RLOFOpticalFlowParameter::getMaxLevel() const { return maxLevel;} - void RLOFOpticalFlowParameter::setMaxLevel(int val){ maxLevel = val;} - int RLOFOpticalFlowParameter::getMaxLevel() const { return maxLevel;} +void RLOFOpticalFlowParameter::setUseInitialFlow(bool val){ useInitialFlow = val;} +bool 
RLOFOpticalFlowParameter::getUseInitialFlow() const { return useInitialFlow;} - void RLOFOpticalFlowParameter::setUseInitialFlow(bool val){ useInitialFlow = val;} - bool RLOFOpticalFlowParameter::getUseInitialFlow() const { return useInitialFlow;} +void RLOFOpticalFlowParameter::setUseIlluminationModel(bool val){ useIlluminationModel = val;} +bool RLOFOpticalFlowParameter::getUseIlluminationModel() const { return useIlluminationModel;} - void RLOFOpticalFlowParameter::setUseIlluminationModel(bool val){ useIlluminationModel = val;} - bool RLOFOpticalFlowParameter::getUseIlluminationModel() const { return useIlluminationModel;} +void RLOFOpticalFlowParameter::setUseGlobalMotionPrior(bool val){ useGlobalMotionPrior = val;} +bool RLOFOpticalFlowParameter::getUseGlobalMotionPrior() const { return useGlobalMotionPrior;} - void RLOFOpticalFlowParameter::setUseGlobalMotionPrior(bool val){ useGlobalMotionPrior = val;} - bool RLOFOpticalFlowParameter::getUseGlobalMotionPrior() const { return useGlobalMotionPrior;} +void RLOFOpticalFlowParameter::setMaxIteration(int val){ maxIteration = val;} +int RLOFOpticalFlowParameter::getMaxIteration() const { return maxIteration;} - void RLOFOpticalFlowParameter::setMaxIteration(int val){ maxIteration = val;} - int RLOFOpticalFlowParameter::getMaxIteration() const { return maxIteration;} +void RLOFOpticalFlowParameter::setMinEigenValue(float val){ minEigenValue = val;} +float RLOFOpticalFlowParameter::getMinEigenValue() const { return minEigenValue;} - void RLOFOpticalFlowParameter::setMinEigenValue(float val){ minEigenValue = val;} - float RLOFOpticalFlowParameter::getMinEigenValue() const { return minEigenValue;} +void RLOFOpticalFlowParameter::setGlobalMotionRansacThreshold(float val){ globalMotionRansacThreshold = val;} +float RLOFOpticalFlowParameter::getGlobalMotionRansacThreshold() const { return globalMotionRansacThreshold;} - void RLOFOpticalFlowParameter::setGlobalMotionRansacThreshold(float val){ globalMotionRansacThreshold = 
val;} - float RLOFOpticalFlowParameter::getGlobalMotionRansacThreshold() const { return globalMotionRansacThreshold;} +class DenseOpticalFlowRLOFImpl : public DenseRLOFOpticalFlow +{ +public: + DenseOpticalFlowRLOFImpl() + : param(Ptr(new RLOFOpticalFlowParameter)) + , forwardBackwardThreshold(1.f) + , gridStep(6, 6) + , interp_type(InterpolationType::INTERP_GEO) + , k(128) + , sigma(0.05f) + , lambda(999.f) + , fgs_lambda(500.0f) + , fgs_sigma(1.5f) + , use_post_proc(true) - class DenseOpticalFlowRLOFImpl : public DenseRLOFOpticalFlow { - public: - DenseOpticalFlowRLOFImpl() - : param(Ptr(new RLOFOpticalFlowParameter)) - , forwardBackwardThreshold(1.f) - , gridStep(6, 6) - , interp_type(InterpolationType::INTERP_GEO) - , k(128) - , sigma(0.05f) - , lambda(999.f) - , fgs_lambda(500.0f) - , fgs_sigma(1.5f) - , use_post_proc(true) - - { - prevPyramid[0] = cv::Ptr< CImageBuffer>(new CImageBuffer); - prevPyramid[1] = cv::Ptr< CImageBuffer>(new CImageBuffer); - currPyramid[0] = cv::Ptr< CImageBuffer>(new CImageBuffer); - currPyramid[1] = cv::Ptr< CImageBuffer>(new CImageBuffer); - } - virtual void setRLOFOpticalFlowParameter(Ptr val) CV_OVERRIDE { param = val; } - virtual Ptr getRLOFOpticalFlowParameter() const CV_OVERRIDE { return param; } + prevPyramid[0] = cv::Ptr(new CImageBuffer); + prevPyramid[1] = cv::Ptr(new CImageBuffer); + currPyramid[0] = cv::Ptr(new CImageBuffer); + currPyramid[1] = cv::Ptr(new CImageBuffer); + } + virtual void setRLOFOpticalFlowParameter(Ptr val) CV_OVERRIDE { param = val; } + virtual Ptr getRLOFOpticalFlowParameter() const CV_OVERRIDE { return param; } - virtual float getForwardBackward() const CV_OVERRIDE { return forwardBackwardThreshold; } - virtual void setForwardBackward(float val) CV_OVERRIDE { forwardBackwardThreshold = val; } + virtual float getForwardBackward() const CV_OVERRIDE { return forwardBackwardThreshold; } + virtual void setForwardBackward(float val) CV_OVERRIDE { forwardBackwardThreshold = val; } - virtual void 
setInterpolation(InterpolationType val) CV_OVERRIDE { interp_type = val; } - virtual InterpolationType getInterpolation() const CV_OVERRIDE { return interp_type; } + virtual void setInterpolation(InterpolationType val) CV_OVERRIDE { interp_type = val; } + virtual InterpolationType getInterpolation() const CV_OVERRIDE { return interp_type; } - virtual Size getGridStep() const CV_OVERRIDE { return gridStep; } - virtual void setGridStep(Size val) CV_OVERRIDE { gridStep = val; } + virtual Size getGridStep() const CV_OVERRIDE { return gridStep; } + virtual void setGridStep(Size val) CV_OVERRIDE { gridStep = val; } - virtual int getEPICK() const CV_OVERRIDE { return k; } - virtual void setEPICK(int val) CV_OVERRIDE { k = val; } + virtual int getEPICK() const CV_OVERRIDE { return k; } + virtual void setEPICK(int val) CV_OVERRIDE { k = val; } - virtual float getEPICSigma() const CV_OVERRIDE { return sigma; } - virtual void setEPICSigma(float val) CV_OVERRIDE { sigma = val; } + virtual float getEPICSigma() const CV_OVERRIDE { return sigma; } + virtual void setEPICSigma(float val) CV_OVERRIDE { sigma = val; } - virtual float getEPICLambda() const CV_OVERRIDE { return lambda; } - virtual void setEPICLambda(float val) CV_OVERRIDE { lambda = val; } + virtual float getEPICLambda() const CV_OVERRIDE { return lambda; } + virtual void setEPICLambda(float val) CV_OVERRIDE { lambda = val; } - virtual float getFgsLambda() const CV_OVERRIDE { return fgs_lambda; } - virtual void setFgsLambda(float val) CV_OVERRIDE { fgs_lambda = val; } + virtual float getFgsLambda() const CV_OVERRIDE { return fgs_lambda; } + virtual void setFgsLambda(float val) CV_OVERRIDE { fgs_lambda = val; } - virtual float getFgsSigma() const CV_OVERRIDE { return fgs_sigma; } - virtual void setFgsSigma(float val) CV_OVERRIDE { fgs_sigma = val; } + virtual float getFgsSigma() const CV_OVERRIDE { return fgs_sigma; } + virtual void setFgsSigma(float val) CV_OVERRIDE { fgs_sigma = val; } - virtual bool getUsePostProc() 
const CV_OVERRIDE { return use_post_proc; } - virtual void setUsePostProc(bool val) CV_OVERRIDE { use_post_proc = val; } + virtual bool getUsePostProc() const CV_OVERRIDE { return use_post_proc; } + virtual void setUsePostProc(bool val) CV_OVERRIDE { use_post_proc = val; } - virtual void calc(InputArray I0, InputArray I1, InputOutputArray flow) CV_OVERRIDE + virtual void calc(InputArray I0, InputArray I1, InputOutputArray flow) CV_OVERRIDE + { + CV_Assert(!I0.empty() && I0.depth() == CV_8U && (I0.channels() == 3 || I0.channels() == 1)); + CV_Assert(!I1.empty() && I1.depth() == CV_8U && (I1.channels() == 3 || I1.channels() == 1)); + CV_Assert(I0.sameSize(I1)); + if (param.empty()) + param = Ptr(new RLOFOpticalFlowParameter()); + if (param->supportRegionType == SR_CROSS) + CV_Assert( I0.channels() == 3 && I1.channels() == 3); + CV_Assert(interp_type == InterpolationType::INTERP_EPIC || interp_type == InterpolationType::INTERP_GEO); + // if no parameter is used use the default parameter + + Mat prevImage = I0.getMat(); + Mat currImage = I1.getMat(); + int noPoints = prevImage.cols * prevImage.rows; + std::vector prevPoints(noPoints); + std::vector currPoints, refPoints; + noPoints = 0; + cv::Size grid_h = gridStep / 2; + for (int r = grid_h.height; r < prevImage.rows; r += gridStep.height) { - CV_Assert(!I0.empty() && I0.depth() == CV_8U && (I0.channels() == 3 || I0.channels() == 1)); - CV_Assert(!I1.empty() && I1.depth() == CV_8U && (I1.channels() == 3 || I1.channels() == 1)); - CV_Assert(I0.sameSize(I1)); - if (param.empty()) - param = Ptr(new RLOFOpticalFlowParameter()); - if (param->supportRegionType == SR_CROSS) - CV_Assert( I0.channels() == 3 && I1.channels() == 3); - CV_Assert(interp_type == InterpolationType::INTERP_EPIC || interp_type == InterpolationType::INTERP_GEO); - // if no parameter is used use the default parameter - - Mat prevImage = I0.getMat(); - Mat currImage = I1.getMat(); - int noPoints = prevImage.cols * prevImage.rows; - std::vector 
prevPoints(noPoints); - std::vector currPoints, refPoints; - noPoints = 0; - cv::Size grid_h = gridStep / 2; - for (int r = grid_h.height; r < prevImage.rows; r += gridStep.height) + for (int c = grid_h.width; c < prevImage.cols; c += gridStep.width) { - for (int c = grid_h.width; c < prevImage.cols; c += gridStep.width) - { - prevPoints[noPoints++] = cv::Point2f(static_cast(c), static_cast(r)); - } + prevPoints[noPoints++] = cv::Point2f(static_cast(c), static_cast(r)); } - prevPoints.erase(prevPoints.begin() + noPoints, prevPoints.end()); - currPoints.resize(prevPoints.size()); - calcLocalOpticalFlow(prevImage, currImage, prevPyramid, currPyramid, prevPoints, currPoints, *(param.get())); - flow.create(prevImage.size(), CV_32FC2); - Mat dense_flow = flow.getMat(); - - std::vector filtered_prevPoints; - std::vector filtered_currPoints; - if (gridStep == cv::Size(1, 1) && forwardBackwardThreshold <= 0) + } + prevPoints.erase(prevPoints.begin() + noPoints, prevPoints.end()); + currPoints.resize(prevPoints.size()); + calcLocalOpticalFlow(prevImage, currImage, prevPyramid, currPyramid, prevPoints, currPoints, *(param.get())); + flow.create(prevImage.size(), CV_32FC2); + Mat dense_flow = flow.getMat(); + + std::vector filtered_prevPoints; + std::vector filtered_currPoints; + if (gridStep == cv::Size(1, 1) && forwardBackwardThreshold <= 0) + { + for (unsigned int n = 0; n < prevPoints.size(); n++) { - for (unsigned int n = 0; n < prevPoints.size(); n++) - { - dense_flow.at(prevPoints[n]) = currPoints[n] - prevPoints[n]; - } - return; + dense_flow.at(prevPoints[n]) = currPoints[n] - prevPoints[n]; } - if (forwardBackwardThreshold > 0) + return; + } + if (forwardBackwardThreshold > 0) + { + // reuse image pyramids + calcLocalOpticalFlow(currImage, prevImage, currPyramid, prevPyramid, currPoints, refPoints, *(param.get())); + + filtered_prevPoints.resize(prevPoints.size()); + filtered_currPoints.resize(prevPoints.size()); + float sqrForwardBackwardThreshold = 
forwardBackwardThreshold * forwardBackwardThreshold; + noPoints = 0; + for (unsigned int r = 0; r < refPoints.size(); r++) { - // reuse image pyramids - calcLocalOpticalFlow(currImage, prevImage, currPyramid, prevPyramid, currPoints, refPoints, *(param.get())); - - filtered_prevPoints.resize(prevPoints.size()); - filtered_currPoints.resize(prevPoints.size()); - float sqrForwardBackwardThreshold = forwardBackwardThreshold * forwardBackwardThreshold; - noPoints = 0; - for (unsigned int r = 0; r < refPoints.size(); r++) + Point2f diff = refPoints[r] - prevPoints[r]; + if (diff.x * diff.x + diff.y * diff.y < sqrForwardBackwardThreshold) { - Point2f diff = refPoints[r] - prevPoints[r]; - if (diff.x * diff.x + diff.y * diff.y < sqrForwardBackwardThreshold) - { - filtered_prevPoints[noPoints] = prevPoints[r]; - filtered_currPoints[noPoints++] = currPoints[r]; - } + filtered_prevPoints[noPoints] = prevPoints[r]; + filtered_currPoints[noPoints++] = currPoints[r]; } - - filtered_prevPoints.erase(filtered_prevPoints.begin() + noPoints, filtered_prevPoints.end()); - filtered_currPoints.erase(filtered_currPoints.begin() + noPoints, filtered_currPoints.end()); - - } - else - { - filtered_prevPoints = prevPoints; - filtered_currPoints = currPoints; } - if (interp_type == InterpolationType::INTERP_EPIC) - { - Ptr gd = ximgproc::createEdgeAwareInterpolator(); - gd->setK(k); - gd->setSigma(sigma); - gd->setLambda(lambda); - gd->setUsePostProcessing(false); - gd->interpolate(prevImage, filtered_prevPoints, currImage, filtered_currPoints, dense_flow); - } - else - { - Mat blurredPrevImage, blurredCurrImage; - GaussianBlur(prevImage, blurredPrevImage, cv::Size(5, 5), -1); - std::vector status(filtered_currPoints.size(), 1); - interpolate_irregular_nn_raster(filtered_prevPoints, filtered_currPoints, status, blurredPrevImage).copyTo(dense_flow); - std::vector vecMats; - std::vector vecMats2(2); - cv::split(dense_flow, vecMats); - cv::bilateralFilter(vecMats[0], vecMats2[0], 5, 2, 20); - 
cv::bilateralFilter(vecMats[1], vecMats2[1], 5, 2, 20); - cv::merge(vecMats2, dense_flow); - } - if (use_post_proc) - { - ximgproc::fastGlobalSmootherFilter(prevImage, flow, flow, fgs_lambda, fgs_sigma); - } - } + filtered_prevPoints.erase(filtered_prevPoints.begin() + noPoints, filtered_prevPoints.end()); + filtered_currPoints.erase(filtered_currPoints.begin() + noPoints, filtered_currPoints.end()); - virtual void collectGarbage() CV_OVERRIDE + } + else { - prevPyramid[0].release(); - prevPyramid[1].release(); - currPyramid[0].release(); - currPyramid[1].release(); + filtered_prevPoints = prevPoints; + filtered_currPoints = currPoints; } - protected: - Ptr param; - float forwardBackwardThreshold; - Ptr prevPyramid[2]; - Ptr currPyramid[2]; - cv::Size gridStep; - InterpolationType interp_type; - int k; - float sigma; - float lambda; - float fgs_lambda; - float fgs_sigma; - bool use_post_proc; - }; - - Ptr DenseRLOFOpticalFlow::create( - Ptr rlofParam, - float forwardBackwardThreshold, - cv::Size gridStep, - InterpolationType interp_type, - int epicK, - float epicSigma, - float epicLambda, - bool use_post_proc, - float fgs_lambda, - float fgs_sigma) - { - Ptr algo = makePtr(); - algo->setRLOFOpticalFlowParameter(rlofParam); - algo->setForwardBackward(forwardBackwardThreshold); - algo->setGridStep(gridStep); - algo->setInterpolation(interp_type); - algo->setEPICK(epicK); - algo->setEPICSigma(epicSigma); - algo->setEPICLambda(epicLambda); - algo->setUsePostProc(use_post_proc); - algo->setFgsLambda(fgs_lambda); - algo->setFgsSigma(fgs_sigma); - return algo; + if (interp_type == InterpolationType::INTERP_EPIC) + { + Ptr gd = ximgproc::createEdgeAwareInterpolator(); + gd->setK(k); + gd->setSigma(sigma); + gd->setLambda(lambda); + gd->setUsePostProcessing(false); + gd->interpolate(prevImage, filtered_prevPoints, currImage, filtered_currPoints, dense_flow); + } + else + { + Mat blurredPrevImage, blurredCurrImage; + GaussianBlur(prevImage, blurredPrevImage, cv::Size(5, 5), 
-1); + std::vector status(filtered_currPoints.size(), 1); + interpolate_irregular_nn_raster(filtered_prevPoints, filtered_currPoints, status, blurredPrevImage).copyTo(dense_flow); + std::vector vecMats; + std::vector vecMats2(2); + cv::split(dense_flow, vecMats); + cv::bilateralFilter(vecMats[0], vecMats2[0], 5, 2, 20); + cv::bilateralFilter(vecMats[1], vecMats2[1], 5, 2, 20); + cv::merge(vecMats2, dense_flow); + } + if (use_post_proc) + { + ximgproc::fastGlobalSmootherFilter(prevImage, flow, flow, fgs_lambda, fgs_sigma); + } } - class SparseRLOFOpticalFlowImpl : public SparseRLOFOpticalFlow + virtual void collectGarbage() CV_OVERRIDE { - public: - SparseRLOFOpticalFlowImpl() - : param(Ptr(new RLOFOpticalFlowParameter)) - , forwardBackwardThreshold(1.f) - { - prevPyramid[0] = cv::Ptr< CImageBuffer>(new CImageBuffer); - prevPyramid[1] = cv::Ptr< CImageBuffer>(new CImageBuffer); - currPyramid[0] = cv::Ptr< CImageBuffer>(new CImageBuffer); - currPyramid[1] = cv::Ptr< CImageBuffer>(new CImageBuffer); - } - virtual void setRLOFOpticalFlowParameter(Ptr val) CV_OVERRIDE { param = val; } - virtual Ptr getRLOFOpticalFlowParameter() const CV_OVERRIDE { return param; } + prevPyramid[0].release(); + prevPyramid[1].release(); + currPyramid[0].release(); + currPyramid[1].release(); + } - virtual float getForwardBackward() const CV_OVERRIDE { return forwardBackwardThreshold; } - virtual void setForwardBackward(float val) CV_OVERRIDE { forwardBackwardThreshold = val; } +protected: + Ptr param; + float forwardBackwardThreshold; + Ptr prevPyramid[2]; + Ptr currPyramid[2]; + cv::Size gridStep; + InterpolationType interp_type; + int k; + float sigma; + float lambda; + float fgs_lambda; + float fgs_sigma; + bool use_post_proc; +}; + +Ptr DenseRLOFOpticalFlow::create( + Ptr rlofParam, + float forwardBackwardThreshold, + cv::Size gridStep, + InterpolationType interp_type, + int epicK, + float epicSigma, + float epicLambda, + bool use_post_proc, + float fgs_lambda, + float fgs_sigma) +{ + 
Ptr algo = makePtr(); + algo->setRLOFOpticalFlowParameter(rlofParam); + algo->setForwardBackward(forwardBackwardThreshold); + algo->setGridStep(gridStep); + algo->setInterpolation(interp_type); + algo->setEPICK(epicK); + algo->setEPICSigma(epicSigma); + algo->setEPICLambda(epicLambda); + algo->setUsePostProc(use_post_proc); + algo->setFgsLambda(fgs_lambda); + algo->setFgsSigma(fgs_sigma); + return algo; +} - virtual void calc(InputArray prevImg, InputArray nextImg, - InputArray prevPts, InputOutputArray nextPts, - OutputArray status, - OutputArray err) CV_OVERRIDE - { - CV_Assert(!prevImg.empty() && prevImg.depth() == CV_8U && (prevImg.channels() == 3 || prevImg.channels() == 1)); - CV_Assert(!nextImg.empty() && nextImg.depth() == CV_8U && (nextImg.channels() == 3 || nextImg.channels() == 1)); - CV_Assert(prevImg.sameSize(nextImg)); +class SparseRLOFOpticalFlowImpl : public SparseRLOFOpticalFlow +{ + public: + SparseRLOFOpticalFlowImpl() + : param(Ptr(new RLOFOpticalFlowParameter)) + , forwardBackwardThreshold(1.f) + { + prevPyramid[0] = cv::Ptr< CImageBuffer>(new CImageBuffer); + prevPyramid[1] = cv::Ptr< CImageBuffer>(new CImageBuffer); + currPyramid[0] = cv::Ptr< CImageBuffer>(new CImageBuffer); + currPyramid[1] = cv::Ptr< CImageBuffer>(new CImageBuffer); + } + virtual void setRLOFOpticalFlowParameter(Ptr val) CV_OVERRIDE { param = val; } + virtual Ptr getRLOFOpticalFlowParameter() const CV_OVERRIDE { return param; } - Mat prevImage = prevImg.getMat(); - Mat nextImage = nextImg.getMat(); - Mat prevPtsMat = prevPts.getMat(); + virtual float getForwardBackward() const CV_OVERRIDE { return forwardBackwardThreshold; } + virtual void setForwardBackward(float val) CV_OVERRIDE { forwardBackwardThreshold = val; } - if (param.empty()) - { - param = Ptr(new RLOFOpticalFlowParameter); - } + virtual void calc(InputArray prevImg, InputArray nextImg, + InputArray prevPts, InputOutputArray nextPts, + OutputArray status, + OutputArray err) CV_OVERRIDE + { + 
CV_Assert(!prevImg.empty() && prevImg.depth() == CV_8U && (prevImg.channels() == 3 || prevImg.channels() == 1)); + CV_Assert(!nextImg.empty() && nextImg.depth() == CV_8U && (nextImg.channels() == 3 || nextImg.channels() == 1)); + CV_Assert(prevImg.sameSize(nextImg)); - if (param->useInitialFlow == false) - nextPts.create(prevPtsMat.size(), prevPtsMat.type(), -1, true); + Mat prevImage = prevImg.getMat(); + Mat nextImage = nextImg.getMat(); + Mat prevPtsMat = prevPts.getMat(); - int npoints = 0; - CV_Assert((npoints = prevPtsMat.checkVector(2, CV_32F, true)) >= 0); + if (param.empty()) + { + param = Ptr(new RLOFOpticalFlowParameter); + } - if (npoints == 0) - { - nextPts.release(); - status.release(); - err.release(); - return; - } + if (param->useInitialFlow == false) + nextPts.create(prevPtsMat.size(), prevPtsMat.type(), -1, true); - Mat nextPtsMat = nextPts.getMat(); - CV_Assert(nextPtsMat.checkVector(2, CV_32F, true) == npoints); - std::vector prevPoints(npoints), nextPoints(npoints), refPoints; - prevPtsMat.copyTo(cv::Mat(1, npoints, CV_32FC2, &prevPoints[0])); - if (param->useInitialFlow ) - nextPtsMat.copyTo(cv::Mat(1, nextPtsMat.cols, CV_32FC2, &nextPoints[0])); + int npoints = 0; + CV_Assert((npoints = prevPtsMat.checkVector(2, CV_32F, true)) >= 0); - cv::Mat statusMat; - cv::Mat errorMat; - if (status.needed() || forwardBackwardThreshold > 0) - { - status.create((int)npoints, 1, CV_8U, -1, true); - statusMat = status.getMat(); - statusMat.setTo(1); - } + if (npoints == 0) + { + nextPts.release(); + status.release(); + err.release(); + return; + } - if (err.needed() || forwardBackwardThreshold > 0) - { - err.create((int)npoints, 1, CV_32F, -1, true); - errorMat = err.getMat(); - errorMat.setTo(0); - } + Mat nextPtsMat = nextPts.getMat(); + CV_Assert(nextPtsMat.checkVector(2, CV_32F, true) == npoints); + std::vector prevPoints(npoints), nextPoints(npoints), refPoints; + prevPtsMat.copyTo(cv::Mat(1, npoints, CV_32FC2, &prevPoints[0])); + if 
(param->useInitialFlow ) + nextPtsMat.copyTo(cv::Mat(1, nextPtsMat.cols, CV_32FC2, &nextPoints[0])); - calcLocalOpticalFlow(prevImage, nextImage, prevPyramid, currPyramid, prevPoints, nextPoints, *(param.get())); - cv::Mat(1,npoints , CV_32FC2, &nextPoints[0]).copyTo(nextPtsMat); - if (forwardBackwardThreshold > 0) - { - // reuse image pyramids - calcLocalOpticalFlow(nextImage, prevImage, currPyramid, prevPyramid, nextPoints, refPoints, *(param.get())); - } - for (unsigned int r = 0; r < refPoints.size(); r++) - { - Point2f diff = refPoints[r] - prevPoints[r]; - errorMat.at(r) = sqrt(diff.x * diff.x + diff.y * diff.y); - if (errorMat.at(r) <= forwardBackwardThreshold) - statusMat.at(r) = 0; - } + cv::Mat statusMat; + cv::Mat errorMat; + if (status.needed() || forwardBackwardThreshold > 0) + { + status.create((int)npoints, 1, CV_8U, -1, true); + statusMat = status.getMat(); + statusMat.setTo(1); + } + if (err.needed() || forwardBackwardThreshold > 0) + { + err.create((int)npoints, 1, CV_32F, -1, true); + errorMat = err.getMat(); + errorMat.setTo(0); } - protected: - Ptr param; - float forwardBackwardThreshold; - Ptr prevPyramid[2]; - Ptr currPyramid[2]; - }; + calcLocalOpticalFlow(prevImage, nextImage, prevPyramid, currPyramid, prevPoints, nextPoints, *(param.get())); + cv::Mat(1,npoints , CV_32FC2, &nextPoints[0]).copyTo(nextPtsMat); + if (forwardBackwardThreshold > 0) + { + // reuse image pyramids + calcLocalOpticalFlow(nextImage, prevImage, currPyramid, prevPyramid, nextPoints, refPoints, *(param.get())); + } + for (unsigned int r = 0; r < refPoints.size(); r++) + { + Point2f diff = refPoints[r] - prevPoints[r]; + errorMat.at(r) = sqrt(diff.x * diff.x + diff.y * diff.y); + if (errorMat.at(r) <= forwardBackwardThreshold) + statusMat.at(r) = 0; + } - Ptr SparseRLOFOpticalFlow::create( - Ptr rlofParam, - float forwardBackwardThreshold) - { - Ptr algo = makePtr(); - algo->setRLOFOpticalFlowParameter(rlofParam); - algo->setForwardBackward(forwardBackwardThreshold); - 
return algo; } - void calcOpticalFlowDenseRLOF(InputArray I0, InputArray I1, InputOutputArray flow, - Ptr rlofParam , - float forewardBackwardThreshold, Size gridStep, - InterpolationType interp_type, - int epicK, float epicSigma, float epicLambda, - bool use_post_proc, float fgsLambda, float fgsSigma) - { - Ptr algo = DenseRLOFOpticalFlow::create( - rlofParam, forewardBackwardThreshold, gridStep, interp_type, - epicK, epicSigma, epicLambda, use_post_proc, fgsLambda, fgsSigma); - algo->calc(I0, I1, flow); - algo->collectGarbage(); - } +protected: + Ptr param; + float forwardBackwardThreshold; + Ptr prevPyramid[2]; + Ptr currPyramid[2]; +}; - void calcOpticalFlowSparseRLOF(InputArray prevImg, InputArray nextImg, - InputArray prevPts, InputOutputArray nextPts, - OutputArray status, OutputArray err, - Ptr rlofParam, - float forewardBackwardThreshold) - { - Ptr algo = SparseRLOFOpticalFlow::create( - rlofParam, forewardBackwardThreshold); - algo->calc(prevImg, nextImg, prevPts, nextPts, status, err); - } - Ptr createOptFlow_DenseRLOF() - { - return DenseRLOFOpticalFlow::create(); - } +Ptr SparseRLOFOpticalFlow::create( + Ptr rlofParam, + float forwardBackwardThreshold) +{ + Ptr algo = makePtr(); + algo->setRLOFOpticalFlowParameter(rlofParam); + algo->setForwardBackward(forwardBackwardThreshold); + return algo; +} - Ptr createOptFlow_SparseRLOF() - { - return SparseRLOFOpticalFlow::create(); - } +void calcOpticalFlowDenseRLOF(InputArray I0, InputArray I1, InputOutputArray flow, + Ptr rlofParam , + float forewardBackwardThreshold, Size gridStep, + InterpolationType interp_type, + int epicK, float epicSigma, float epicLambda, + bool use_post_proc, float fgsLambda, float fgsSigma) +{ + Ptr algo = DenseRLOFOpticalFlow::create( + rlofParam, forewardBackwardThreshold, gridStep, interp_type, + epicK, epicSigma, epicLambda, use_post_proc, fgsLambda, fgsSigma); + algo->calc(I0, I1, flow); + algo->collectGarbage(); } +void calcOpticalFlowSparseRLOF(InputArray prevImg, InputArray 
nextImg, + InputArray prevPts, InputOutputArray nextPts, + OutputArray status, OutputArray err, + Ptr rlofParam, + float forewardBackwardThreshold) +{ + Ptr algo = SparseRLOFOpticalFlow::create( + rlofParam, forewardBackwardThreshold); + algo->calc(prevImg, nextImg, prevPts, nextPts, status, err); +} +Ptr createOptFlow_DenseRLOF() +{ + return DenseRLOFOpticalFlow::create(); +} +Ptr createOptFlow_SparseRLOF() +{ + return SparseRLOFOpticalFlow::create(); } + +}} // namespace From 2b4eb0bc9e4680144a413f0df2522676ac92360e Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Fri, 14 Dec 2018 14:05:31 +0300 Subject: [PATCH 13/25] fix perf test --- modules/optflow/perf/perf_rlof.cpp | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/modules/optflow/perf/perf_rlof.cpp b/modules/optflow/perf/perf_rlof.cpp index d8fb64eb40a..a273a9456a2 100644 --- a/modules/optflow/perf/perf_rlof.cpp +++ b/modules/optflow/perf/perf_rlof.cpp @@ -36,10 +36,9 @@ PERF_TEST_P(ST_SR_IM_Sparse, OpticalFlow_SparseRLOF, param->supportRegionType = SR_FIXED; param->useIlluminationModel = get<2>(GetParam()); - TEST_CYCLE_N(1) - { + PERF_SAMPLE_BEGIN() calcOpticalFlowSparseRLOF(frame1, frame2, prevPts, currPts, status, err, param, 1.f); - } + PERF_SAMPLE_END() SANITY_CHECK_NOTHING(); } @@ -64,10 +63,9 @@ PERF_TEST_P(INTERP_GRID_Dense, OpticalFlow_DenseRLOF, interp_type = INTERP_EPIC; if (get<0>(GetParam()) == "INTERP_GEO") interp_type = INTERP_GEO; - TEST_CYCLE_N(5) - { + PERF_SAMPLE_BEGIN() calcOpticalFlowDenseRLOF(frame1, frame2,flow, param, 1.0f, Size(get<1>(GetParam()), get<1>(GetParam())), interp_type); - } + PERF_SAMPLE_END() SANITY_CHECK_NOTHING(); } From 8ccab0bd5b483a505277e881b145be6406756cf1 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Fri, 14 Dec 2018 14:54:17 +0300 Subject: [PATCH 14/25] other changes: precomp.hpp / static --- modules/optflow/CMakeLists.txt | 2 +- modules/optflow/src/rlof/berlof_invoker.hpp | 10 ++----- 
.../optflow/src/rlof/geo_interpolation.cpp | 29 +++++++++---------- .../optflow/src/rlof/geo_interpolation.hpp | 3 +- modules/optflow/src/rlof/plk_invoker.hpp | 2 -- modules/optflow/src/rlof/rlof_invoker.hpp | 3 +- modules/optflow/src/rlof/rlof_invokerbase.hpp | 28 +++++++++--------- modules/optflow/src/rlof/rlof_localflow.cpp | 7 +++-- modules/optflow/src/rlof/rlof_localflow.h | 1 - 9 files changed, 38 insertions(+), 47 deletions(-) diff --git a/modules/optflow/CMakeLists.txt b/modules/optflow/CMakeLists.txt index 39a37f143dd..e4232749ea2 100644 --- a/modules/optflow/CMakeLists.txt +++ b/modules/optflow/CMakeLists.txt @@ -1,2 +1,2 @@ set(the_description "Optical Flow Algorithms") -ocv_define_module(optflow opencv_core opencv_imgproc opencv_video opencv_ximgproc opencv_imgcodecs opencv_flann WRAP python) +ocv_define_module(optflow opencv_core opencv_imgproc opencv_calib3d opencv_video opencv_ximgproc opencv_imgcodecs opencv_flann WRAP python) diff --git a/modules/optflow/src/rlof/berlof_invoker.hpp b/modules/optflow/src/rlof/berlof_invoker.hpp index e5cac1fa87c..d4e1228d3e1 100644 --- a/modules/optflow/src/rlof/berlof_invoker.hpp +++ b/modules/optflow/src/rlof/berlof_invoker.hpp @@ -10,7 +10,7 @@ namespace cv{ namespace optflow{ -inline bool checkSolution(float x, float y, float * c ) +static inline bool checkSolution(float x, float y, float * c ) { float _a = x - 0.002f; float _b = y - 0.002f; @@ -25,13 +25,13 @@ inline bool checkSolution(float x, float y, float * c ) && (bl.x >= 0 && bl.y <= 0) && (br.x <= 0 && br.y <= 0); } -inline cv::Point2f est_DeltaGain( const cv::Mat & src, const cv::Vec4f & val) +static inline cv::Point2f est_DeltaGain( const cv::Mat & src, const cv::Vec4f & val) { return cv::Point2f( src.at(2,0) * val[0] + src.at(2,1) * val[1] + src.at(2,2) * val[2] + src.at(2,3) * val[3], src.at(3,0) * val[0] + src.at(3,1) * val[1] + src.at(3,2) * val[2] + src.at(3,3) * val[3]); } -inline void est_Result( const cv::Mat & src, const cv::Vec4f & val, 
cv::Point2f & delta, cv::Point2f & gain) +static inline void est_Result( const cv::Mat & src, const cv::Vec4f & val, cv::Point2f & delta, cv::Point2f & gain) { delta = cv::Point2f( @@ -92,7 +92,6 @@ class TrackerInvoker : public cv::ParallelLoopBody crossSegmentationThreshold = _crossSegmentationThreshold; } - TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; void operator()(const cv::Range& range) const CV_OVERRIDE { #ifdef DEBUG_INVOKER @@ -610,7 +609,6 @@ class TrackerInvoker : public cv::ParallelLoopBody crossSegmentationThreshold = _crossSegmentationThreshold; } - TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; void operator()(const cv::Range& range) const CV_OVERRIDE { #ifdef DEBUG_INVOKER @@ -1576,8 +1574,6 @@ class TrackerInvoker : public cv::ParallelLoopBody crossSegmentationThreshold = _crossSegmentationThreshold; } - TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; - void operator()(const cv::Range& range) const CV_OVERRIDE { cv::Size winSize; diff --git a/modules/optflow/src/rlof/geo_interpolation.cpp b/modules/optflow/src/rlof/geo_interpolation.cpp index 048f65812d0..143b4f63a4e 100644 --- a/modules/optflow/src/rlof/geo_interpolation.cpp +++ b/modules/optflow/src/rlof/geo_interpolation.cpp @@ -2,46 +2,43 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. 
// This functions have been contributed by Jonas Geisters + +#include "../precomp.hpp" + #include "geo_interpolation.hpp" #include #include namespace cv { namespace optflow { -inline double K_h1(float x, double h2) -{ - return exp(-(0.5 / (h2)) * x); -} - struct Graph_helper { - int * mem; + std::vector mem; int e_size; Graph_helper(int k, int num_nodes) { e_size = (2 * k + 1); - mem = (int*)malloc(sizeof(int) * e_size * num_nodes); - memset(mem, 0, sizeof(int) * e_size * num_nodes); + mem.resize(e_size * num_nodes, 0); } inline int size(int id) { int r_addr = id * (e_size); - return ((int*)mem)[r_addr]; + return mem[r_addr]; } inline int * data(int id) { int r_addr = id * (e_size)+1; - return mem + r_addr; + return &mem[r_addr]; } inline void add(int id, std::pair data) { int r_addr = id * (e_size); - int size = ++((int*)mem)[r_addr]; + int size = ++mem[r_addr]; r_addr += 2 * size - 1;//== 1 + 2*(size-1); - ((float*)mem)[r_addr] = data.first; - ((int*)mem)[r_addr + 1] = data.second; + *(float*)&mem[r_addr] = data.first; + mem[r_addr + 1] = data.second; } inline bool color_in_target(int id, int color) { int r_addr = id * (e_size); - int size = (((int*)mem)[r_addr]); + int size = mem[r_addr]; r_addr += 2; for (int i = 0; i < size; i++) { - if (((int*)mem)[r_addr] == color) { + if (mem[r_addr] == color) { return true; } r_addr += 2; @@ -481,4 +478,4 @@ Mat interpolate_irregular_nn( return nnFlow; } -}} // namespace +}} // namespace diff --git a/modules/optflow/src/rlof/geo_interpolation.hpp b/modules/optflow/src/rlof/geo_interpolation.hpp index 0e13b44c493..526768903b5 100644 --- a/modules/optflow/src/rlof/geo_interpolation.hpp +++ b/modules/optflow/src/rlof/geo_interpolation.hpp @@ -3,7 +3,6 @@ // of this distribution and at http://opencv.org/license.html. 
#ifndef _GEO_INTERPOLATION_HPP_ #define _GEO_INTERPOLATION_HPP_ -#include namespace cv { namespace optflow { @@ -32,5 +31,5 @@ Mat interpolate_irregular_nn_raster(const std::vector & prevPoints, const std::vector & status, const Mat & i1); -}} // namespace +}} // namespace #endif diff --git a/modules/optflow/src/rlof/plk_invoker.hpp b/modules/optflow/src/rlof/plk_invoker.hpp index b7311edb86c..3a51b1ba087 100644 --- a/modules/optflow/src/rlof/plk_invoker.hpp +++ b/modules/optflow/src/rlof/plk_invoker.hpp @@ -59,7 +59,6 @@ class TrackerInvoker : public cv::ParallelLoopBody crossSegmentationThreshold = _crossSegmentationThreshold; } - TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; void operator()(const cv::Range& range) const CV_OVERRIDE { #ifdef DEBUG_INVOKER @@ -711,7 +710,6 @@ class TrackerInvoker : public cv::ParallelLoopBody crossSegmentationThreshold = _crossSegmentationThreshold; } - TrackerInvoker & operator=(const TrackerInvoker &) {return *this;}; void operator()(const cv::Range& range) const CV_OVERRIDE { #ifdef DEBUG_INVOKER diff --git a/modules/optflow/src/rlof/rlof_invoker.hpp b/modules/optflow/src/rlof/rlof_invoker.hpp index 14eabe4b948..14de2d04393 100644 --- a/modules/optflow/src/rlof/rlof_invoker.hpp +++ b/modules/optflow/src/rlof/rlof_invoker.hpp @@ -53,7 +53,7 @@ class TrackerInvoker : public cv::ParallelLoopBody useInitialFlow = _useInitialFlow; crossSegmentationThreshold = _crossSegmentationThreshold; } - TrackerInvoker & operator=(const TrackerInvoker &) { return *this; }; + void operator()(const cv::Range& range) const CV_OVERRIDE { #ifdef DEBUG_INVOKER @@ -675,7 +675,6 @@ class TrackerInvoker : public cv::ParallelLoopBody crossSegmentationThreshold = _crossSegmentationThreshold; } - TrackerInvoker & operator=(const TrackerInvoker &) { return *this; }; void operator()(const cv::Range& range) const CV_OVERRIDE { #ifdef DEBUG_INVOKER diff --git a/modules/optflow/src/rlof/rlof_invokerbase.hpp 
b/modules/optflow/src/rlof/rlof_invokerbase.hpp index 991531deef0..f65efeb673c 100644 --- a/modules/optflow/src/rlof/rlof_invokerbase.hpp +++ b/modules/optflow/src/rlof/rlof_invokerbase.hpp @@ -5,20 +5,19 @@ #define _RLOF_INVOKERBASE_HPP_ -#ifdef CV_CPU_COMPILE_SSE4_1 -#define RLOF_SSE -#endif -#ifdef CV_CPU_COMPILE_SSE4_2 +#if CV_SSE4_1 #define RLOF_SSE #endif //#define DEBUG_INVOKER -#define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n)) +#ifndef CV_DESCALE +#define CV_DESCALE(x, n) (((x) + (1 << ((n)-1))) >> (n)) +#endif + #define FLT_RESCALE 1 -#include "opencv2/core.hpp" -#include "opencv2/video.hpp" + #include "rlof_localflow.h" #include #include "opencv2/core/hal/intrin.hpp" @@ -31,7 +30,7 @@ namespace optflow { typedef short deriv_type; #ifdef RLOF_SSE -inline void get4BitMask(const int & width, __m128i & mask) +static inline void get4BitMask(const int & width, __m128i & mask) { int noBits = width - static_cast(floor(width / 4.f) * 4.f); unsigned int val[4]; @@ -42,7 +41,7 @@ inline void get4BitMask(const int & width, __m128i & mask) mask = _mm_set_epi32(val[3], val[2], val[1], val[0]); } -inline void getWBitMask(const int & width, __m128i & t0, __m128i & t1, __m128i & t2) +static inline void getWBitMask(const int & width, __m128i & t0, __m128i & t1, __m128i & t2) { int noBits = width - static_cast(floor(width / 8.f) * 8.f); unsigned short val[8]; @@ -59,7 +58,7 @@ typedef uchar tMaskType; #define tCVMaskType CV_8UC1 #define MaskSet 0xffffffff -inline bool notSameColor(const cv::Point3_ & ref, int r, int c, const cv::Mat & img, int winSize, int threshold) +static inline bool notSameColor(const cv::Point3_ & ref, int r, int c, const cv::Mat & img, int winSize, int threshold) { if (r >= img.rows + winSize || c >= img.cols + winSize || r < -winSize || c < -winSize) return true; int step = static_cast(img.step1()); @@ -86,7 +85,8 @@ inline bool notSameColor(const cv::Point3_ & ref, int r, int c, const cv: *\param threshold, *\param useBothImages */ -static 
void getLocalPatch( +static +void getLocalPatch( const cv::Mat & prevImg, const cv::Point2i & prevPoint, // feature points cv::Mat & winPointMask, @@ -232,7 +232,8 @@ static void getLocalPatch( winPointMask = winPointMask(cv::Rect(min_c, min_r, winRoi.width, winRoi.height)); } -inline bool calcWinMaskMat( +static inline +bool calcWinMaskMat( const cv::Mat & BI, const int windowType, const cv::Point2i & iprevPt, @@ -264,7 +265,8 @@ inline bool calcWinMaskMat( return true; } -inline short estimateScale(cv::Mat & residuals) +static inline +short estimateScale(cv::Mat & residuals) { cv::Mat absMat = cv::abs(residuals); return quickselect(absMat, absMat.rows / 2); diff --git a/modules/optflow/src/rlof/rlof_localflow.cpp b/modules/optflow/src/rlof/rlof_localflow.cpp index 666421f767f..211d999b0e7 100644 --- a/modules/optflow/src/rlof/rlof_localflow.cpp +++ b/modules/optflow/src/rlof/rlof_localflow.cpp @@ -1,10 +1,11 @@ // This file is part of OpenCV project. // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. +#include "../precomp.hpp" + +#include "opencv2/calib3d.hpp" // findHomography + #include "rlof_localflow.h" -#ifdef HAVE_OPENCV_CALIB3D -#include "opencv2/calib3d.hpp" -#endif #include "berlof_invoker.hpp" #include "rlof_invoker.hpp" #include "plk_invoker.hpp" diff --git a/modules/optflow/src/rlof/rlof_localflow.h b/modules/optflow/src/rlof/rlof_localflow.h index 00702e5323b..9ffe42cea3e 100644 --- a/modules/optflow/src/rlof/rlof_localflow.h +++ b/modules/optflow/src/rlof/rlof_localflow.h @@ -7,7 +7,6 @@ #include #include #include -#include "opencv2/core.hpp" #include "opencv2/imgproc.hpp" #include "opencv2/optflow/rlofflow.hpp" //! Fast median estimation method based on @cite Tibshirani2008. 
This implementation relates to http://www.stat.cmu.edu/~ryantibs/median/ From 70f11160795fd2fab1ff7b7ab161722ec062e63d Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Fri, 14 Dec 2018 15:31:54 +0300 Subject: [PATCH 15/25] use Matx44f and Vec4f instead of Mat --- modules/optflow/src/rlof/berlof_invoker.hpp | 52 ++++++++++----------- modules/optflow/src/rlof/plk_invoker.hpp | 49 +++++++++---------- modules/optflow/src/rlof/rlof_invoker.hpp | 49 +++++++++---------- 3 files changed, 75 insertions(+), 75 deletions(-) diff --git a/modules/optflow/src/rlof/berlof_invoker.hpp b/modules/optflow/src/rlof/berlof_invoker.hpp index d4e1228d3e1..31d2792b00e 100644 --- a/modules/optflow/src/rlof/berlof_invoker.hpp +++ b/modules/optflow/src/rlof/berlof_invoker.hpp @@ -25,22 +25,22 @@ static inline bool checkSolution(float x, float y, float * c ) && (bl.x >= 0 && bl.y <= 0) && (br.x <= 0 && br.y <= 0); } -static inline cv::Point2f est_DeltaGain( const cv::Mat & src, const cv::Vec4f & val) +static inline cv::Point2f est_DeltaGain(const cv::Matx44f& src, const cv::Vec4f& val) { return cv::Point2f( - src.at(2,0) * val[0] + src.at(2,1) * val[1] + src.at(2,2) * val[2] + src.at(2,3) * val[3], - src.at(3,0) * val[0] + src.at(3,1) * val[1] + src.at(3,2) * val[2] + src.at(3,3) * val[3]); + src(2,0) * val[0] + src(2,1) * val[1] + src(2,2) * val[2] + src(2,3) * val[3], + src(3,0) * val[0] + src(3,1) * val[1] + src(3,2) * val[2] + src(3,3) * val[3]); } -static inline void est_Result( const cv::Mat & src, const cv::Vec4f & val, cv::Point2f & delta, cv::Point2f & gain) +static inline void est_Result(const cv::Matx44f& src, const cv::Vec4f & val, cv::Point2f & delta, cv::Point2f & gain) { delta = cv::Point2f( - -(src.at(0,0) * val[0] + src.at(0,1) * val[1] + src.at(0,2) * val[2] + src.at(0,3) * val[3]), - -(src.at(1,0) * val[0] + src.at(1,1) * val[1] + src.at(1,2) * val[2] + src.at(1,3) * val[3])); + -(src(0,0) * val[0] + src(0,1) * val[1] + src(0,2) * val[2] + src(0,3) * val[3]), + 
-(src(1,0) * val[0] + src(1,1) * val[1] + src(1,2) * val[2] + src(1,3) * val[3])); gain = cv::Point2f( - src.at(2,0) * val[0] + src.at(2,1) * val[1] + src.at(2,2) * val[2] + src.at(2,3) * val[3], - src.at(3,0) * val[0] + src.at(3,1) * val[1] + src.at(3,2) * val[2] + src.at(3,3) * val[3]); + src(2,0) * val[0] + src(2,1) * val[1] + src(2,2) * val[2] + src(2,3) * val[3], + src(3,0) * val[0] + src(3,1) * val[1] + src(3,2) * val[2] + src(3,3) * val[3]); } namespace berlof { @@ -632,9 +632,7 @@ class TrackerInvoker : public cv::ParallelLoopBody cv::Size winBufSize(winbufwidth,winbufwidth); - cv::Mat invTensorMat(4,4, CV_32FC1); - cv::Mat mismatchMat(4, 1, CV_32FC1); - cv::Mat resultMat(4, 1, CV_32FC1); + cv::Matx44f invTensorMat; cv::AutoBuffer _buf(winBufSize.area()*(cn + cn2)); int derivDepth = DataType::depth; @@ -1344,22 +1342,22 @@ class TrackerInvoker : public cv::ParallelLoopBody D = (1.f / D); - invTensorMat.at(0,0) = (A22*sumI*sumI - 2*sumI*sumIy*w2 + dI*sumIy*sumIy + sumW*w2*w2 - A22*dI*sumW)* D; - invTensorMat.at(0,1) = (A12*dI*sumW - A12*sumI * sumI - dI*sumIx*sumIy + sumI*sumIx*w2 + sumI*sumIy*w1 - sumW*w1*w2)* D; - invTensorMat.at(0,2) = (A12*sumI*sumIy - sumIy*sumIy*w1 - A22*sumI*sumIx - A12*sumW*w2 + A22*sumW*w1 + sumIx*sumIy*w2)* D; - invTensorMat.at(0,3) = (A22*dI*sumIx - A12*dI*sumIy - sumIx*w2*w2 + A12*sumI*w2 - A22*sumI*w1 + sumIy*w1*w2) * D; - invTensorMat.at(1,0) = invTensorMat.at(0,1); - invTensorMat.at(1,1) = (A11*sumI * sumI - 2*sumI*sumIx*w1 + dI*sumIx * sumIx + sumW*w1*w1 - A11*dI*sumW) * D; - invTensorMat.at(1,2) = (A12*sumI*sumIx - A11*sumI*sumIy - sumIx * sumIx*w2 + A11*sumW*w2 - A12*sumW*w1 + sumIx*sumIy*w1) * D; - invTensorMat.at(1,3) = (A11*dI*sumIy - sumIy*w1*w1 - A12*dI*sumIx - A11*sumI*w2 + A12*sumI*w1 + sumIx*w1*w2)* D; - invTensorMat.at(2,0) = invTensorMat.at(0,2); - invTensorMat.at(2,1) = invTensorMat.at(1,2); - invTensorMat.at(2,2) = (sumW*A12*A12 - 2*A12*sumIx*sumIy + A22*sumIx*sumIx + A11*sumIy*sumIy - A11*A22*sumW)* D; - 
invTensorMat.at(2,3) = (A11*A22*sumI - A12*A12*sumI - A11*sumIy*w2 + A12*sumIx*w2 + A12*sumIy*w1 - A22*sumIx*w1)* D; - invTensorMat.at(3,0) = invTensorMat.at(0,3); - invTensorMat.at(3,1) = invTensorMat.at(1,3); - invTensorMat.at(3,2) = invTensorMat.at(2,3); - invTensorMat.at(3,3) = (dI*A12*A12 - 2*A12*w1*w2 + A22*w1*w1 + A11*w2*w2 - A11*A22*dI)* D; + invTensorMat(0,0) = (A22*sumI*sumI - 2*sumI*sumIy*w2 + dI*sumIy*sumIy + sumW*w2*w2 - A22*dI*sumW)* D; + invTensorMat(0,1) = (A12*dI*sumW - A12*sumI * sumI - dI*sumIx*sumIy + sumI*sumIx*w2 + sumI*sumIy*w1 - sumW*w1*w2)* D; + invTensorMat(0,2) = (A12*sumI*sumIy - sumIy*sumIy*w1 - A22*sumI*sumIx - A12*sumW*w2 + A22*sumW*w1 + sumIx*sumIy*w2)* D; + invTensorMat(0,3) = (A22*dI*sumIx - A12*dI*sumIy - sumIx*w2*w2 + A12*sumI*w2 - A22*sumI*w1 + sumIy*w1*w2) * D; + invTensorMat(1,0) = invTensorMat(0,1); + invTensorMat(1,1) = (A11*sumI * sumI - 2*sumI*sumIx*w1 + dI*sumIx * sumIx + sumW*w1*w1 - A11*dI*sumW) * D; + invTensorMat(1,2) = (A12*sumI*sumIx - A11*sumI*sumIy - sumIx * sumIx*w2 + A11*sumW*w2 - A12*sumW*w1 + sumIx*sumIy*w1) * D; + invTensorMat(1,3) = (A11*dI*sumIy - sumIy*w1*w1 - A12*dI*sumIx - A11*sumI*w2 + A12*sumI*w1 + sumIx*w1*w2)* D; + invTensorMat(2,0) = invTensorMat(0,2); + invTensorMat(2,1) = invTensorMat(1,2); + invTensorMat(2,2) = (sumW*A12*A12 - 2*A12*sumIx*sumIy + A22*sumIx*sumIx + A11*sumIy*sumIy - A11*A22*sumW)* D; + invTensorMat(2,3) = (A11*A22*sumI - A12*A12*sumI - A11*sumIy*w2 + A12*sumIx*w2 + A12*sumIy*w1 - A22*sumIx*w1)* D; + invTensorMat(3,0) = invTensorMat(0,3); + invTensorMat(3,1) = invTensorMat(1,3); + invTensorMat(3,2) = invTensorMat(2,3); + invTensorMat(3,3) = (dI*A12*A12 - 2*A12*w1*w2 + A22*w1*w1 + A11*w2*w2 - A11*A22*dI)* D; } diff --git a/modules/optflow/src/rlof/plk_invoker.hpp b/modules/optflow/src/rlof/plk_invoker.hpp index 3a51b1ba087..06a2972969d 100644 --- a/modules/optflow/src/rlof/plk_invoker.hpp +++ b/modules/optflow/src/rlof/plk_invoker.hpp @@ -82,9 +82,9 @@ class TrackerInvoker : public 
cv::ParallelLoopBody int winbufwidth = static_cast(ceil(winSize.width / 16.f)) * 16; cv::Size winBufSize(winbufwidth,winbufwidth); - cv::Mat invTensorMat(4,4, CV_32FC1); - cv::Mat mismatchMat(4, 1, CV_32FC1); - cv::Mat resultMat(4, 1, CV_32FC1); + cv::Matx44f invTensorMat; + Vec4f mismatchMat; + Vec4f resultMat; cv::AutoBuffer _buf(winBufSize.area()*(cn + cn2)); int derivDepth = DataType::depth; @@ -555,10 +555,10 @@ class TrackerInvoker : public cv::ParallelLoopBody b4 = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; #endif - mismatchMat.at(0,0) = b1 * FLT_SCALE; - mismatchMat.at(1,0) = b2 * FLT_SCALE; - mismatchMat.at(2,0) = -b3 * FLT_SCALE; - mismatchMat.at(3,0) = -b4 * FLT_SCALE; + mismatchMat(0) = b1 * FLT_SCALE; + mismatchMat(1) = b2 * FLT_SCALE; + mismatchMat(2) = -b3 * FLT_SCALE; + mismatchMat(3) = -b4 * FLT_SCALE; D = - A12*A12*sumI*sumI + dI*sumW*A12*A12 + 2*A12*sumI*sumIx*w2 + 2*A12*sumI*sumIy*w1 - 2*dI*A12*sumIx*sumIy - 2*sumW*A12*w1*w2 + A11*A22*sumI*sumI - 2*A22*sumI*sumIx*w1 @@ -583,27 +583,28 @@ class TrackerInvoker : public cv::ParallelLoopBody D = (1.f / D); - invTensorMat.at(0,0) = (A22*sumI*sumI - 2*sumI*sumIy*w2 + dI*sumIy*sumIy + sumW*w2*w2 - A22*dI*sumW)* D; invTensorMat.at(0,1) = (A12*dI*sumW - A12*sumI * sumI - dI*sumIx*sumIy + sumI*sumIx*w2 + sumI*sumIy*w1 - sumW*w1*w2)* D; - invTensorMat.at(0,2) = (A12*sumI*sumIy - sumIy*sumIy*w1 - A22*sumI*sumIx - A12*sumW*w2 + A22*sumW*w1 + sumIx*sumIy*w2)* D; - invTensorMat.at(0,3) = (A22*dI*sumIx - A12*dI*sumIy - sumIx*w2*w2 + A12*sumI*w2 - A22*sumI*w1 + sumIy*w1*w2) * D; - invTensorMat.at(1,0) = invTensorMat.at(0,1); - invTensorMat.at(1,1) = (A11*sumI * sumI - 2*sumI*sumIx*w1 + dI*sumIx * sumIx + sumW*w1*w1 - A11*dI*sumW) * D; - invTensorMat.at(1,2) = (A12*sumI*sumIx - A11*sumI*sumIy - sumIx * sumIx*w2 + A11*sumW*w2 - A12*sumW*w1 + sumIx*sumIy*w1) * D; - invTensorMat.at(1,3) = (A11*dI*sumIy - sumIy*w1*w1 - A12*dI*sumIx - A11*sumI*w2 + A12*sumI*w1 + sumIx*w1*w2)* D; - invTensorMat.at(2,0) = 
invTensorMat.at(0,2); - invTensorMat.at(2,1) = invTensorMat.at(1,2); - invTensorMat.at(2,2) = (sumW*A12*A12 - 2*A12*sumIx*sumIy + A22*sumIx*sumIx + A11*sumIy*sumIy - A11*A22*sumW)* D; - invTensorMat.at(2,3) = (A11*A22*sumI - A12*A12*sumI - A11*sumIy*w2 + A12*sumIx*w2 + A12*sumIy*w1 - A22*sumIx*w1)* D; - invTensorMat.at(3,0) = invTensorMat.at(0,3); - invTensorMat.at(3,1) = invTensorMat.at(1,3); - invTensorMat.at(3,2) = invTensorMat.at(2,3); - invTensorMat.at(3,3) = (dI*A12*A12 - 2*A12*w1*w2 + A22*w1*w1 + A11*w2*w2 - A11*A22*dI)* D; + invTensorMat(0,0) = (A22*sumI*sumI - 2*sumI*sumIy*w2 + dI*sumIy*sumIy + sumW*w2*w2 - A22*dI*sumW)* D; + invTensorMat(0,1) = (A12*dI*sumW - A12*sumI * sumI - dI*sumIx*sumIy + sumI*sumIx*w2 + sumI*sumIy*w1 - sumW*w1*w2)* D; + invTensorMat(0,2) = (A12*sumI*sumIy - sumIy*sumIy*w1 - A22*sumI*sumIx - A12*sumW*w2 + A22*sumW*w1 + sumIx*sumIy*w2)* D; + invTensorMat(0,3) = (A22*dI*sumIx - A12*dI*sumIy - sumIx*w2*w2 + A12*sumI*w2 - A22*sumI*w1 + sumIy*w1*w2) * D; + invTensorMat(1,0) = invTensorMat(0,1); + invTensorMat(1,1) = (A11*sumI * sumI - 2*sumI*sumIx*w1 + dI*sumIx * sumIx + sumW*w1*w1 - A11*dI*sumW) * D; + invTensorMat(1,2) = (A12*sumI*sumIx - A11*sumI*sumIy - sumIx * sumIx*w2 + A11*sumW*w2 - A12*sumW*w1 + sumIx*sumIy*w1) * D; + invTensorMat(1,3) = (A11*dI*sumIy - sumIy*w1*w1 - A12*dI*sumIx - A11*sumI*w2 + A12*sumI*w1 + sumIx*w1*w2)* D; + invTensorMat(2,0) = invTensorMat(0,2); + invTensorMat(2,1) = invTensorMat(1,2); + invTensorMat(2,2) = (sumW*A12*A12 - 2*A12*sumIx*sumIy + A22*sumIx*sumIx + A11*sumIy*sumIy - A11*A22*sumW)* D; + invTensorMat(2,3) = (A11*A22*sumI - A12*A12*sumI - A11*sumIy*w2 + A12*sumIx*w2 + A12*sumIy*w1 - A22*sumIx*w1)* D; + invTensorMat(3,0) = invTensorMat(0,3); + invTensorMat(3,1) = invTensorMat(1,3); + invTensorMat(3,2) = invTensorMat(2,3); + invTensorMat(3,3) = (dI*A12*A12 - 2*A12*w1*w2 + A22*w1*w1 + A11*w2*w2 - A11*A22*dI)* D; resultMat = invTensorMat * mismatchMat; // 0.08 -12.10 - Point2f delta(-resultMat.at(0), 
-resultMat.at(1)); - Point2f deltaGain(-resultMat.at(2), -resultMat.at(3)); + Point2f delta(-resultMat(0), -resultMat(1)); + Point2f deltaGain(-resultMat(2), -resultMat(3)); if( j == 0) diff --git a/modules/optflow/src/rlof/rlof_invoker.hpp b/modules/optflow/src/rlof/rlof_invoker.hpp index 14de2d04393..7a4bd343cbe 100644 --- a/modules/optflow/src/rlof/rlof_invoker.hpp +++ b/modules/optflow/src/rlof/rlof_invoker.hpp @@ -700,9 +700,9 @@ class TrackerInvoker : public cv::ParallelLoopBody int winbufwidth = static_cast(ceil(winSize.width / 16.f)) * 16; cv::Size winBufSize(winbufwidth, winbufwidth); - cv::Mat invTensorMat(4, 4, CV_32FC1); - cv::Mat mismatchMat(4, 1, CV_32FC1); - cv::Mat resultMat(4, 1, CV_32FC1); + cv::Matx44f invTensorMat; + cv::Vec4f mismatchMat; + cv::Vec4f resultMat; std::vector _buf(winBufSize.area()*(cn + cn2)); @@ -1332,10 +1332,10 @@ class TrackerInvoker : public cv::ParallelLoopBody _mm_store_ps(bbuf, qb3); b4 = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; #endif - mismatchMat.at(0, 0) = b1 * FLT_SCALE; - mismatchMat.at(1, 0) = b2 * FLT_SCALE; - mismatchMat.at(2, 0) = -b3 * FLT_SCALE; - mismatchMat.at(3, 0) = -b4 * FLT_SCALE; + mismatchMat(0) = b1 * FLT_SCALE; + mismatchMat(1) = b2 * FLT_SCALE; + mismatchMat(2) = -b3 * FLT_SCALE; + mismatchMat(3) = -b4 * FLT_SCALE; D = -A12 * A12*sumI*sumI + dI * sumW*A12*A12 + 2 * A12*sumI*sumIx*w2 + 2 * A12*sumI*sumIy*w1 - 2 * dI*A12*sumIx*sumIy - 2 * sumW*A12*w1*w2 + A11 * A22*sumI*sumI - 2 * A22*sumI*sumIx*w1 @@ -1358,26 +1358,27 @@ class TrackerInvoker : public cv::ParallelLoopBody D = (1.f / D); - invTensorMat.at(0, 0) = (A22*sumI*sumI - 2 * sumI*sumIy*w2 + dI * sumIy*sumIy + sumW * w2*w2 - A22 * dI*sumW)* D; invTensorMat.at(0, 1) = (A12*dI*sumW - A12 * sumI * sumI - dI * sumIx*sumIy + sumI * sumIx*w2 + sumI * sumIy*w1 - sumW * w1*w2)* D; - invTensorMat.at(0, 2) = (A12*sumI*sumIy - sumIy * sumIy*w1 - A22 * sumI*sumIx - A12 * sumW*w2 + A22 * sumW*w1 + sumIx * sumIy*w2)* D; - invTensorMat.at(0, 3) = (A22*dI*sumIx 
- A12 * dI*sumIy - sumIx * w2*w2 + A12 * sumI*w2 - A22 * sumI*w1 + sumIy * w1*w2) * D; - invTensorMat.at(1, 0) = invTensorMat.at(0, 1); - invTensorMat.at(1, 1) = (A11*sumI * sumI - 2 * sumI*sumIx*w1 + dI * sumIx * sumIx + sumW * w1*w1 - A11 * dI*sumW) * D; - invTensorMat.at(1, 2) = (A12*sumI*sumIx - A11 * sumI*sumIy - sumIx * sumIx*w2 + A11 * sumW*w2 - A12 * sumW*w1 + sumIx * sumIy*w1) * D; - invTensorMat.at(1, 3) = (A11*dI*sumIy - sumIy * w1*w1 - A12 * dI*sumIx - A11 * sumI*w2 + A12 * sumI*w1 + sumIx * w1*w2)* D; - invTensorMat.at(2, 0) = invTensorMat.at(0, 2); - invTensorMat.at(2, 1) = invTensorMat.at(1, 2); - invTensorMat.at(2, 2) = (sumW*A12*A12 - 2 * A12*sumIx*sumIy + A22 * sumIx*sumIx + A11 * sumIy*sumIy - A11 * A22*sumW)* D; - invTensorMat.at(2, 3) = (A11*A22*sumI - A12 * A12*sumI - A11 * sumIy*w2 + A12 * sumIx*w2 + A12 * sumIy*w1 - A22 * sumIx*w1)* D; - invTensorMat.at(3, 0) = invTensorMat.at(0, 3); - invTensorMat.at(3, 1) = invTensorMat.at(1, 3); - invTensorMat.at(3, 2) = invTensorMat.at(2, 3); - invTensorMat.at(3, 3) = (dI*A12*A12 - 2 * A12*w1*w2 + A22 * w1*w1 + A11 * w2*w2 - A11 * A22*dI)* D; + invTensorMat(0, 0) = (A22*sumI*sumI - 2 * sumI*sumIy*w2 + dI * sumIy*sumIy + sumW * w2*w2 - A22 * dI*sumW)* D; + invTensorMat(0, 1) = (A12*dI*sumW - A12 * sumI * sumI - dI * sumIx*sumIy + sumI * sumIx*w2 + sumI * sumIy*w1 - sumW * w1*w2)* D; + invTensorMat(0, 2) = (A12*sumI*sumIy - sumIy * sumIy*w1 - A22 * sumI*sumIx - A12 * sumW*w2 + A22 * sumW*w1 + sumIx * sumIy*w2)* D; + invTensorMat(0, 3) = (A22*dI*sumIx - A12 * dI*sumIy - sumIx * w2*w2 + A12 * sumI*w2 - A22 * sumI*w1 + sumIy * w1*w2) * D; + invTensorMat(1, 0) = invTensorMat(0, 1); + invTensorMat(1, 1) = (A11*sumI * sumI - 2 * sumI*sumIx*w1 + dI * sumIx * sumIx + sumW * w1*w1 - A11 * dI*sumW) * D; + invTensorMat(1, 2) = (A12*sumI*sumIx - A11 * sumI*sumIy - sumIx * sumIx*w2 + A11 * sumW*w2 - A12 * sumW*w1 + sumIx * sumIy*w1) * D; + invTensorMat(1, 3) = (A11*dI*sumIy - sumIy * w1*w1 - A12 * dI*sumIx - A11 * 
sumI*w2 + A12 * sumI*w1 + sumIx * w1*w2)* D; + invTensorMat(2, 0) = invTensorMat(0, 2); + invTensorMat(2, 1) = invTensorMat(1, 2); + invTensorMat(2, 2) = (sumW*A12*A12 - 2 * A12*sumIx*sumIy + A22 * sumIx*sumIx + A11 * sumIy*sumIy - A11 * A22*sumW)* D; + invTensorMat(2, 3) = (A11*A22*sumI - A12 * A12*sumI - A11 * sumIy*w2 + A12 * sumIx*w2 + A12 * sumIy*w1 - A22 * sumIx*w1)* D; + invTensorMat(3, 0) = invTensorMat(0, 3); + invTensorMat(3, 1) = invTensorMat(1, 3); + invTensorMat(3, 2) = invTensorMat(2, 3); + invTensorMat(3, 3) = (dI*A12*A12 - 2 * A12*w1*w2 + A22 * w1*w1 + A11 * w2*w2 - A11 * A22*dI)* D; resultMat = invTensorMat * mismatchMat; - Point2f delta(-resultMat.at(0), -resultMat.at(1)); - Point2f deltaGain(resultMat.at(2), resultMat.at(3)); + Point2f delta(-resultMat(0), -resultMat(1)); + Point2f deltaGain(resultMat(2), resultMat(3)); From 3b794f2fbdf5c2d805615c8a1836bf210b81cf7f Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Fri, 14 Dec 2018 16:12:26 +0300 Subject: [PATCH 16/25] normSigmas into constants --- modules/optflow/src/rlof/berlof_invoker.hpp | 90 +++++++++++---------- modules/optflow/src/rlof/rlof_invoker.hpp | 76 ++++++++--------- 2 files changed, 87 insertions(+), 79 deletions(-) diff --git a/modules/optflow/src/rlof/berlof_invoker.hpp b/modules/optflow/src/rlof/berlof_invoker.hpp index 31d2792b00e..87f973d1ac1 100644 --- a/modules/optflow/src/rlof/berlof_invoker.hpp +++ b/modules/optflow/src/rlof/berlof_invoker.hpp @@ -67,8 +67,12 @@ class TrackerInvoker : public cv::ParallelLoopBody bool _useInitialFlow, int _supportRegionType, int _crossSegmentationThreshold, - std::vector _normSigmas, - float _minEigenValue) + const std::vector& _normSigmas, + float _minEigenValue + ) : + normSigma0(_normSigmas[0]), + normSigma1(_normSigmas[1]), + normSigma2(_normSigmas[2]) { prevImg = &_prevImg; prevDeriv = &_prevDeriv; @@ -87,7 +91,6 @@ class TrackerInvoker : public cv::ParallelLoopBody maxLevel = _maxLevel; windowType = _supportRegionType; 
minEigThreshold = _minEigenValue; - paramReset = _normSigmas; useInitialFlow = _useInitialFlow; crossSegmentationThreshold = _crossSegmentationThreshold; } @@ -112,7 +115,6 @@ class TrackerInvoker : public cv::ParallelLoopBody cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); winMaskMatBuf.setTo(1); - std::vector param = paramReset; int j, cn = I.channels(), cn2 = cn*2; int winbufwidth = static_cast(ceil(winSize.width / 16.f)) * 16; cv::Size winBufSize(winbufwidth,winbufwidth); @@ -294,10 +296,10 @@ class TrackerInvoker : public cv::ParallelLoopBody } float eta = 1.f / winArea; - float fParam0 = param[0] * 32.f; - float fParam1 = param[1] * 32.f; - fParam0 = param[0] * MEstimatorScale; - fParam1 = param[1] * MEstimatorScale; + float fParam0 = normSigma0 * 32.f; + float fParam1 = normSigma1 * 32.f; + fParam0 = normSigma0 * MEstimatorScale; + fParam1 = normSigma1 * MEstimatorScale; buffIdx = 0; @@ -351,17 +353,17 @@ class TrackerInvoker : public cv::ParallelLoopBody } else if( abss > fParam0 && diff >= 0 ) { - It[0] = param[2] * (It[0] - fParam1); - It[1] = param[2] * (It[1] - fParam1); - It[2] = param[2] * (It[2] - fParam1); - It[3] = param[2] * (It[3] - fParam1); + It[0] = normSigma2 * (It[0] - fParam1); + It[1] = normSigma2 * (It[1] - fParam1); + It[2] = normSigma2 * (It[2] - fParam1); + It[3] = normSigma2 * (It[3] - fParam1); } else if( abss > fParam0 && diff < 0 ) { - It[0] = param[2] * (It[0] + fParam1); - It[1] = param[2] * (It[1] + fParam1); - It[2] = param[2] * (It[2] + fParam1); - It[3] = param[2] * (It[3] + fParam1); + It[0] = normSigma2 * (It[0] + fParam1); + It[1] = normSigma2 * (It[1] + fParam1); + It[2] = normSigma2 * (It[2] + fParam1); + It[3] = normSigma2 * (It[3] + fParam1); } } @@ -387,7 +389,7 @@ class TrackerInvoker : public cv::ParallelLoopBody // compute the Gradient Matrice if( j == 0) { - float tale = param[2] * FLT_RESCALE; + float tale = normSigma2 * FLT_RESCALE; if( abss < fParam0 || j < 0) { tale = FLT_RESCALE; @@ -398,7 
+400,7 @@ class TrackerInvoker : public cv::ParallelLoopBody } else { - tale *= param[2]; + tale *= normSigma2; } A11 += (float)(ixval*ixval)*tale; @@ -555,7 +557,7 @@ class TrackerInvoker : public cv::ParallelLoopBody int windowType; float minEigThreshold; bool useInitialFlow; - std::vector paramReset; + const float normSigma0, normSigma1, normSigma2; int crossSegmentationThreshold; }; @@ -583,8 +585,12 @@ class TrackerInvoker : public cv::ParallelLoopBody bool _useInitialFlow, int _supportRegionType, int _crossSegmentationThreshold, - std::vector _normSigmas, - float _minEigenValue) + const std::vector& _normSigmas, + float _minEigenValue + ) : + normSigma0(_normSigmas[0]), + normSigma1(_normSigmas[1]), + normSigma2(_normSigmas[2]) { prevImg = &_prevImg; prevDeriv = &_prevDeriv; @@ -604,7 +610,6 @@ class TrackerInvoker : public cv::ParallelLoopBody maxLevel = _maxLevel; windowType = _supportRegionType; minEigThreshold = _minEigenValue; - paramReset = _normSigmas; useInitialFlow = _useInitialFlow; crossSegmentationThreshold = _crossSegmentationThreshold; } @@ -626,7 +631,6 @@ class TrackerInvoker : public cv::ParallelLoopBody cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); winMaskMatBuf.setTo(1); - std::vector param = paramReset; int cn = I.channels(), cn2 = cn*2; int winbufwidth = static_cast(ceil(winSize.width / 16.f)) * 16; cv::Size winBufSize(winbufwidth,winbufwidth); @@ -901,10 +905,10 @@ class TrackerInvoker : public cv::ParallelLoopBody } float eta = 1.f / winArea; - float fParam0 = param[0] * 32.f; - float fParam1 = param[1] * 32.f; - fParam0 = param[0] * MEstimatorScale; - fParam1 = param[1] * MEstimatorScale; + float fParam0 = normSigma0 * 32.f; + float fParam1 = normSigma1 * 32.f; + fParam0 = normSigma0 * MEstimatorScale; + fParam1 = normSigma1 * MEstimatorScale; #ifdef RLOF_SSE @@ -922,12 +926,12 @@ class TrackerInvoker : public cv::ParallelLoopBody __m128i mmParam1 = _mm_set1_epi16(MIN(std::numeric_limits::max()- 1, 
static_cast(fParam1))); - float s2Val = param[2] > 0 ? param[2] : -param[2]; - int s2bitShift = param[2] == 0 ? 1 : static_cast(ceil(log(200.f / s2Val) / log(2.f))); - __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast(param[2] * (float)(1 << s2bitShift))); + float s2Val = std::fabs(normSigma2); + int s2bitShift = normSigma2 == 0 ? 1 : static_cast(ceil(log(200.f / s2Val) / log(2.f))); + __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast(normSigma2 * (float)(1 << s2bitShift))); __m128i mmOness_epi16 = _mm_set1_epi16(1 << s2bitShift); - __m128 mmParam2s = _mm_set1_ps(0.01f * param[2]); - __m128 mmParam2s2 = _mm_set1_ps(param[2] * param[2]); + __m128 mmParam2s = _mm_set1_ps(0.01f * normSigma2); + __m128 mmParam2s2 = _mm_set1_ps(normSigma2 * normSigma2); float gainVal = gainVec.x > 0 ? gainVec.x : -gainVec.x; int bitShift = gainVec.x == 0 ? 1 : static_cast(ceil(log(200.f / gainVal) / log(2.f))); __m128i mmGainValue_epi16 = _mm_set1_epi16(static_cast(gainVec.x * (float)(1 << bitShift))); @@ -1197,17 +1201,17 @@ class TrackerInvoker : public cv::ParallelLoopBody } else if( abss > static_cast(fParam0) && diff >= 0 ) { - It[0] = static_cast(param[2] * (It[0] - fParam1)); - It[1] = static_cast(param[2] * (It[1] - fParam1)); - It[2] = static_cast(param[2] * (It[2] - fParam1)); - It[3] = static_cast(param[2] * (It[3] - fParam1)); + It[0] = static_cast(normSigma2 * (It[0] - fParam1)); + It[1] = static_cast(normSigma2 * (It[1] - fParam1)); + It[2] = static_cast(normSigma2 * (It[2] - fParam1)); + It[3] = static_cast(normSigma2 * (It[3] - fParam1)); } else if( abss > static_cast(fParam0) && diff < 0 ) { - It[0] = static_cast(param[2] * (It[0] + fParam1)); - It[1] = static_cast(param[2] * (It[1] + fParam1)); - It[2] = static_cast(param[2] * (It[2] + fParam1)); - It[3] = static_cast(param[2] * (It[3] + fParam1)); + It[0] = static_cast(normSigma2 * (It[0] + fParam1)); + It[1] = static_cast(normSigma2 * (It[1] + fParam1)); + It[2] = static_cast(normSigma2 * (It[2] + fParam1)); + 
It[3] = static_cast(normSigma2 * (It[3] + fParam1)); } } _b0[0] += (float)(It[0]*dIptr[0]) ; @@ -1234,7 +1238,7 @@ class TrackerInvoker : public cv::ParallelLoopBody // compute the Gradient Matrice if( j == 0) { - float tale = param[2] * FLT_RESCALE; + float tale = normSigma2 * FLT_RESCALE; if( abss < fParam0 || j < 0) { tale = FLT_RESCALE; @@ -1245,7 +1249,7 @@ class TrackerInvoker : public cv::ParallelLoopBody } else { - tale *= param[2]; + tale *= normSigma2; } if( j == 0 ) { @@ -1521,7 +1525,7 @@ class TrackerInvoker : public cv::ParallelLoopBody int windowType; float minEigThreshold; bool useInitialFlow; - std::vector paramReset; + const float normSigma0, normSigma1, normSigma2; int crossSegmentationThreshold; }; diff --git a/modules/optflow/src/rlof/rlof_invoker.hpp b/modules/optflow/src/rlof/rlof_invoker.hpp index 7a4bd343cbe..cea0f7f6718 100644 --- a/modules/optflow/src/rlof/rlof_invoker.hpp +++ b/modules/optflow/src/rlof/rlof_invoker.hpp @@ -28,9 +28,13 @@ class TrackerInvoker : public cv::ParallelLoopBody int _maxIteration, bool _useInitialFlow, int _supportRegionType, - std::vector _normSigmas, + const std::vector& _normSigmas, float _minEigenValue, - int _crossSegmentationThreshold) + int _crossSegmentationThreshold + ) : + normSigma0(_normSigmas[0]), + normSigma1(_normSigmas[1]), + normSigma2(_normSigmas[2]) { prevImg = &_prevImg; prevDeriv = &_prevDeriv; @@ -49,7 +53,6 @@ class TrackerInvoker : public cv::ParallelLoopBody maxLevel = _maxLevel; windowType = _supportRegionType; minEigThreshold = _minEigenValue; - paramReset = _normSigmas; useInitialFlow = _useInitialFlow; crossSegmentationThreshold = _crossSegmentationThreshold; } @@ -74,7 +77,6 @@ class TrackerInvoker : public cv::ParallelLoopBody const float FLT_SCALE = (1.f / (1 << 20)); // 20 - std::vector param = paramReset; int cn = I.channels(), cn2 = cn * 2; int winbufwidth = static_cast(ceil(winSize.width / 8.f)) * 8; cv::Size winBufSize(winbufwidth, winbufwidth); @@ -304,10 +306,10 @@ class 
TrackerInvoker : public cv::ParallelLoopBody } float eta = 1.f / winArea; - float fParam0 = param[0] * 32.f; - float fParam1 = param[1] * 32.f; - fParam0 = param[0] * MEstimatorScale; - fParam1 = param[1] * MEstimatorScale; + float fParam0 = normSigma0 * 32.f; + float fParam1 = normSigma1 * 32.f; + fParam0 = normSigma0 * MEstimatorScale; + fParam1 = normSigma1 * MEstimatorScale; #ifdef RLOF_SSE qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); @@ -315,12 +317,12 @@ class TrackerInvoker : public cv::ParallelLoopBody __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps(); __m128i mmParam0 = _mm_set1_epi16(MIN(std::numeric_limits::max() - 1, static_cast(fParam0))); __m128i mmParam1 = _mm_set1_epi16(MIN(std::numeric_limits::max() - 1, static_cast(fParam1))); - float s2Val = param[2] > 0 ? param[2] : -param[2]; - int s2bitShift = param[2] == 0 ? 1 : static_cast(ceil(log(200.f / s2Val) / log(2.f))); - __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast(param[2] * (float)(1 << s2bitShift))); + float s2Val = std::fabs(normSigma2); + int s2bitShift = normSigma2 == 0 ? 
1 : static_cast(ceil(log(200.f / s2Val) / log(2.f))); + __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast(normSigma2 * (float)(1 << s2bitShift))); __m128i mmOness_epi16 = _mm_set1_epi16(1 << s2bitShift); - __m128 mmParam2s = _mm_set1_ps(0.01f * param[2]); - __m128 mmParam2s2 = _mm_set1_ps(param[2] * param[2]); + __m128 mmParam2s = _mm_set1_ps(0.01f * normSigma2); + __m128 mmParam2s2 = _mm_set1_ps(normSigma2 * normSigma2); __m128 mmOnes = _mm_set1_ps(1.f); __m128i mmEta = _mm_setzero_si128(); __m128i mmScale = _mm_set1_epi16(static_cast(MEstimatorScale)); @@ -487,12 +489,12 @@ class TrackerInvoker : public cv::ParallelLoopBody else if (abss > fParam0 && diff >= 0) { //diff = fParam1 * (diff - fParam1); - diff = static_cast(param[2] * (diff - fParam1)); + diff = static_cast(normSigma2 * (diff - fParam1)); } else if (abss > fParam0 && diff < 0) { //diff = fParam1 * (diff + fParam1); - diff = static_cast(param[2] * (diff + fParam1)); + diff = static_cast(normSigma2 * (diff + fParam1)); } @@ -503,7 +505,7 @@ class TrackerInvoker : public cv::ParallelLoopBody if ( j == 0) { - float tale = param[2] * FLT_RESCALE; + float tale = normSigma2 * FLT_RESCALE; if (abss < fParam0) { tale = FLT_RESCALE; @@ -514,7 +516,7 @@ class TrackerInvoker : public cv::ParallelLoopBody } else { - tale *= param[2]; + tale *= normSigma2; } A11 += (float)(ixval*ixval*tale); A12 += (float)(ixval*iyval*tale); @@ -621,7 +623,7 @@ class TrackerInvoker : public cv::ParallelLoopBody int windowType; float minEigThreshold; bool useInitialFlow; - std::vector paramReset; + const float normSigma0, normSigma1, normSigma2; int crossSegmentationThreshold; }; @@ -648,9 +650,13 @@ class TrackerInvoker : public cv::ParallelLoopBody int _maxIteration, bool _useInitialFlow, int _supportRegionType, - std::vector _normSigmas, + const std::vector& _normSigmas, float _minEigenValue, - int _crossSegmentationThreshold) + int _crossSegmentationThreshold + ) : + normSigma0(_normSigmas[0]), + normSigma1(_normSigmas[1]), + 
normSigma2(_normSigmas[2]) { prevImg = &_prevImg; prevDeriv = &_prevDeriv; @@ -670,7 +676,6 @@ class TrackerInvoker : public cv::ParallelLoopBody maxLevel = _maxLevel; windowType = _supportRegionType; minEigThreshold = _minEigenValue; - paramReset = _normSigmas; useInitialFlow = _useInitialFlow; crossSegmentationThreshold = _crossSegmentationThreshold; } @@ -695,7 +700,6 @@ class TrackerInvoker : public cv::ParallelLoopBody cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); winMaskMatBuf.setTo(1); - std::vector param = paramReset; int cn = I.channels(), cn2 = cn * 2; int winbufwidth = static_cast(ceil(winSize.width / 16.f)) * 16; cv::Size winBufSize(winbufwidth, winbufwidth); @@ -965,10 +969,10 @@ class TrackerInvoker : public cv::ParallelLoopBody } float eta = 1.f / winArea; - float fParam0 = param[0] * 32.f; - float fParam1 = param[1] * 32.f; - fParam0 = param[0] * MEstimatorScale; - fParam1 = param[1] * MEstimatorScale; + float fParam0 = normSigma0 * 32.f; + float fParam1 = normSigma1 * 32.f; + fParam0 = normSigma0 * MEstimatorScale; + fParam1 = normSigma1 * MEstimatorScale; #ifdef RLOF_SSE @@ -984,12 +988,12 @@ class TrackerInvoker : public cv::ParallelLoopBody __m128i mmParam1 = _mm_set1_epi16(MIN(std::numeric_limits::max() - 1, static_cast(fParam1))); - float s2Val = param[2] > 0 ? param[2] : -param[2]; - int s2bitShift = param[2] == 0 ? 1 : static_cast(ceil(log(200.f / s2Val) / log(2.f))); - __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast(param[2] * (float)(1 << s2bitShift))); + float s2Val = std::fabs(normSigma2); + int s2bitShift = normSigma2 == 0 ? 
1 : static_cast(ceil(log(200.f / s2Val) / log(2.f))); + __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast(normSigma2 * (float)(1 << s2bitShift))); __m128i mmOness_epi16 = _mm_set1_epi16(1 << s2bitShift); - __m128 mmParam2s = _mm_set1_ps(0.01f * param[2]); - __m128 mmParam2s2 = _mm_set1_ps(param[2] * param[2]); + __m128 mmParam2s = _mm_set1_ps(0.01f * normSigma2); + __m128 mmParam2s2 = _mm_set1_ps(normSigma2 * normSigma2); float gainVal = gainVec.x > 0 ? gainVec.x : -gainVec.x; int bitShift = gainVec.x == 0 ? 1 : static_cast(ceil(log(200.f / gainVal) / log(2.f))); @@ -1226,11 +1230,11 @@ class TrackerInvoker : public cv::ParallelLoopBody } else if (abss > static_cast(fParam0) && diff >= 0) { - diff = static_cast(param[2] * (diff - fParam1)); + diff = static_cast(normSigma2 * (diff - fParam1)); } else if (abss > static_cast(fParam0) && diff < 0) { - diff = static_cast(param[2] * (diff + fParam1)); + diff = static_cast(normSigma2 * (diff + fParam1)); } b1 += (float)(diff*ixval); b2 += (float)(diff*iyval); ; @@ -1241,7 +1245,7 @@ class TrackerInvoker : public cv::ParallelLoopBody // compute the Gradient Matrice if (j == 0) { - float tale = param[2] * FLT_RESCALE; + float tale = normSigma2 * FLT_RESCALE; if (abss < fParam0 || j < 0) { tale = FLT_RESCALE; @@ -1252,7 +1256,7 @@ class TrackerInvoker : public cv::ParallelLoopBody } else { - tale *= param[2]; + tale *= normSigma2; } if (j == 0) { @@ -1426,7 +1430,7 @@ class TrackerInvoker : public cv::ParallelLoopBody int windowType; float minEigThreshold; bool useInitialFlow; - std::vector paramReset; + const float normSigma0, normSigma1, normSigma2; int crossSegmentationThreshold; }; From 5acff7bcaa63a7ceed987eca3008b7cea6249419 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Fri, 14 Dec 2018 16:22:02 +0300 Subject: [PATCH 17/25] replace ceil() calls --- modules/optflow/src/rlof/berlof_invoker.hpp | 16 ++++++++-------- modules/optflow/src/rlof/plk_invoker.hpp | 10 +++++----- 
modules/optflow/src/rlof/rlof_invoker.hpp | 14 +++++++------- modules/optflow/src/rlof/rlof_invokerbase.hpp | 6 +++--- 4 files changed, 23 insertions(+), 23 deletions(-) diff --git a/modules/optflow/src/rlof/berlof_invoker.hpp b/modules/optflow/src/rlof/berlof_invoker.hpp index 87f973d1ac1..1d04715fe48 100644 --- a/modules/optflow/src/rlof/berlof_invoker.hpp +++ b/modules/optflow/src/rlof/berlof_invoker.hpp @@ -111,12 +111,12 @@ class TrackerInvoker : public cv::ParallelLoopBody const float FLT_SCALE = (1.f/(1 << 16)); winSize = cv::Size(maxWinSize,maxWinSize); - int winMaskwidth = static_cast(ceil(winSize.width / 16.f)) * 16; + int winMaskwidth = roundUp(winSize.width, 16); cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); winMaskMatBuf.setTo(1); int j, cn = I.channels(), cn2 = cn*2; - int winbufwidth = static_cast(ceil(winSize.width / 16.f)) * 16; + int winbufwidth = roundUp(winSize.width, 16); cv::Size winBufSize(winbufwidth,winbufwidth); cv::AutoBuffer _buf(winBufSize.area()*(cn + cn2)); @@ -627,12 +627,12 @@ class TrackerInvoker : public cv::ParallelLoopBody const Mat& BI = *rgbPrevImg; const float FLT_SCALE = (1.f/(1 << 16));//(1.f/(1 << 20)); // 20 winSize = cv::Size(maxWinSize,maxWinSize); - int winMaskwidth = static_cast(ceil(winSize.width / 16.f)) * 16; + int winMaskwidth = roundUp(winSize.width, 16); cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); winMaskMatBuf.setTo(1); int cn = I.channels(), cn2 = cn*2; - int winbufwidth = static_cast(ceil(winSize.width / 16.f)) * 16; + int winbufwidth = roundUp(winSize.width, 16); cv::Size winBufSize(winbufwidth,winbufwidth); @@ -927,13 +927,13 @@ class TrackerInvoker : public cv::ParallelLoopBody float s2Val = std::fabs(normSigma2); - int s2bitShift = normSigma2 == 0 ? 1 : static_cast(ceil(log(200.f / s2Val) / log(2.f))); + int s2bitShift = normSigma2 == 0 ? 
1 : cvCeil(log(200.f / s2Val) / log(2.f)); __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast(normSigma2 * (float)(1 << s2bitShift))); __m128i mmOness_epi16 = _mm_set1_epi16(1 << s2bitShift); __m128 mmParam2s = _mm_set1_ps(0.01f * normSigma2); __m128 mmParam2s2 = _mm_set1_ps(normSigma2 * normSigma2); float gainVal = gainVec.x > 0 ? gainVec.x : -gainVec.x; - int bitShift = gainVec.x == 0 ? 1 : static_cast(ceil(log(200.f / gainVal) / log(2.f))); + int bitShift = gainVec.x == 0 ? 1 : cvCeil(log(200.f / gainVal) / log(2.f)); __m128i mmGainValue_epi16 = _mm_set1_epi16(static_cast(gainVec.x * (float)(1 << bitShift))); __m128i mmConstValue_epi16 = _mm_set1_epi16(static_cast(gainVec.y)); __m128i mmEta = _mm_setzero_si128(); @@ -1587,14 +1587,14 @@ class TrackerInvoker : public cv::ParallelLoopBody const Mat& BI = *rgbPrevImg; winSize = cv::Size(maxWinSize,maxWinSize); - int winMaskwidth = static_cast(ceil(winSize.width / 8.f)) * 16; + int winMaskwidth = roundUp(winSize.width, 8) * 2; cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); winMaskMatBuf.setTo(1); const float FLT_SCALE = (1.f/(1 << 20)); // 20 int j, cn = I.channels(), cn2 = cn*2; - int winbufwidth = static_cast(ceil(winSize.width / 8.f)) * 8; + int winbufwidth = roundUp(winSize.width, 8); cv::Size winBufSize(winbufwidth,winbufwidth); std::vector _buf(winBufSize.area()*(cn + cn2)); diff --git a/modules/optflow/src/rlof/plk_invoker.hpp b/modules/optflow/src/rlof/plk_invoker.hpp index 06a2972969d..10c68e581cf 100644 --- a/modules/optflow/src/rlof/plk_invoker.hpp +++ b/modules/optflow/src/rlof/plk_invoker.hpp @@ -73,13 +73,13 @@ class TrackerInvoker : public cv::ParallelLoopBody const Mat& BI = *rgbPrevImg; winSize = cv::Size(maxWinSize,maxWinSize); - int winMaskwidth = static_cast(ceil(winSize.width / 16.f)) * 16; + int winMaskwidth = roundUp(winSize.width, 16); cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); winMaskMatBuf.setTo(1); const float FLT_SCALE = (1.f/(1 << 16));//(1.f/(1 << 
20)); // 20 int cn = I.channels(), cn2 = cn*2; - int winbufwidth = static_cast(ceil(winSize.width / 16.f)) * 16; + int winbufwidth = roundUp(winSize.width, 16); cv::Size winBufSize(winbufwidth,winbufwidth); cv::Matx44f invTensorMat; @@ -304,7 +304,7 @@ class TrackerInvoker : public cv::ParallelLoopBody __m128 mmSumIy = _mm_setzero_ps(), mmSumIx = _mm_setzero_ps(); __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps(); float gainVal = gainVec.x > 0 ? gainVec.x : -gainVec.x; - int bitShift = gainVec.x == 0 ? 1 : static_cast(ceil(log(200.f / gainVal) / log(2.f))); + int bitShift = gainVec.x == 0 ? 1 : cvCeil(log(200.f / gainVal) / log(2.f)); __m128i mmGainValue_epi16 = _mm_set1_epi16(static_cast(gainVec.x * (float)(1 << bitShift))); __m128i mmConstValue_epi16 = _mm_set1_epi16(static_cast(gainVec.y)); #endif @@ -725,7 +725,7 @@ class TrackerInvoker : public cv::ParallelLoopBody const Mat& BI = *rgbPrevImg; winSize = cv::Size(maxWinSize,maxWinSize); - int winMaskwidth = static_cast(ceil(winSize.width / 8.f)) * 16; + int winMaskwidth = roundUp(winSize.width, 8) * 2; cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); winMaskMatBuf.setTo(1); @@ -734,7 +734,7 @@ class TrackerInvoker : public cv::ParallelLoopBody const float FLT_SCALE = (1.f/(1 << 20)); // 20 int j, cn = I.channels(), cn2 = cn*2; - int winbufwidth = static_cast(ceil(winSize.width / 8.f)) * 8; + int winbufwidth = roundUp(winSize.width, 8); cv::Size winBufSize(winbufwidth,winbufwidth); std::vector _buf(winBufSize.area()*(cn + cn2)); diff --git a/modules/optflow/src/rlof/rlof_invoker.hpp b/modules/optflow/src/rlof/rlof_invoker.hpp index cea0f7f6718..bb3d589e663 100644 --- a/modules/optflow/src/rlof/rlof_invoker.hpp +++ b/modules/optflow/src/rlof/rlof_invoker.hpp @@ -71,14 +71,14 @@ class TrackerInvoker : public cv::ParallelLoopBody const Mat& BI = *rgbPrevImg; winSize = cv::Size(maxWinSize, maxWinSize); - int winMaskwidth = static_cast(ceil(winSize.width / 8.f)) * 16; 
+ int winMaskwidth = roundUp(winSize.width, 8) * 2; cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); winMaskMatBuf.setTo(1); const float FLT_SCALE = (1.f / (1 << 20)); // 20 int cn = I.channels(), cn2 = cn * 2; - int winbufwidth = static_cast(ceil(winSize.width / 8.f)) * 8; + int winbufwidth = roundUp(winSize.width, 8); cv::Size winBufSize(winbufwidth, winbufwidth); std::vector _buf(winBufSize.area()*(cn + cn2)); @@ -318,7 +318,7 @@ class TrackerInvoker : public cv::ParallelLoopBody __m128i mmParam0 = _mm_set1_epi16(MIN(std::numeric_limits::max() - 1, static_cast(fParam0))); __m128i mmParam1 = _mm_set1_epi16(MIN(std::numeric_limits::max() - 1, static_cast(fParam1))); float s2Val = std::fabs(normSigma2); - int s2bitShift = normSigma2 == 0 ? 1 : static_cast(ceil(log(200.f / s2Val) / log(2.f))); + int s2bitShift = normSigma2 == 0 ? 1 : cvCeil(log(200.f / s2Val) / log(2.f)); __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast(normSigma2 * (float)(1 << s2bitShift))); __m128i mmOness_epi16 = _mm_set1_epi16(1 << s2bitShift); __m128 mmParam2s = _mm_set1_ps(0.01f * normSigma2); @@ -696,12 +696,12 @@ class TrackerInvoker : public cv::ParallelLoopBody const float FLT_SCALE = (1.f / (1 << 16)); winSize = cv::Size(maxWinSize, maxWinSize); - int winMaskwidth = static_cast(ceil(winSize.width / 16.f)) * 16; + int winMaskwidth = roundUp(winSize.width, 16); cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); winMaskMatBuf.setTo(1); int cn = I.channels(), cn2 = cn * 2; - int winbufwidth = static_cast(ceil(winSize.width / 16.f)) * 16; + int winbufwidth = roundUp(winSize.width, 16); cv::Size winBufSize(winbufwidth, winbufwidth); cv::Matx44f invTensorMat; @@ -989,14 +989,14 @@ class TrackerInvoker : public cv::ParallelLoopBody float s2Val = std::fabs(normSigma2); - int s2bitShift = normSigma2 == 0 ? 1 : static_cast(ceil(log(200.f / s2Val) / log(2.f))); + int s2bitShift = normSigma2 == 0 ? 
1 : cvCeil(log(200.f / s2Val) / log(2.f)); __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast(normSigma2 * (float)(1 << s2bitShift))); __m128i mmOness_epi16 = _mm_set1_epi16(1 << s2bitShift); __m128 mmParam2s = _mm_set1_ps(0.01f * normSigma2); __m128 mmParam2s2 = _mm_set1_ps(normSigma2 * normSigma2); float gainVal = gainVec.x > 0 ? gainVec.x : -gainVec.x; - int bitShift = gainVec.x == 0 ? 1 : static_cast(ceil(log(200.f / gainVal) / log(2.f))); + int bitShift = gainVec.x == 0 ? 1 : cvCeil(log(200.f / gainVal) / log(2.f)); __m128i mmGainValue_epi16 = _mm_set1_epi16(static_cast(gainVec.x * (float)(1 << bitShift))); __m128i mmConstValue_epi16 = _mm_set1_epi16(static_cast(gainVec.y)); diff --git a/modules/optflow/src/rlof/rlof_invokerbase.hpp b/modules/optflow/src/rlof/rlof_invokerbase.hpp index f65efeb673c..17c83e026d7 100644 --- a/modules/optflow/src/rlof/rlof_invokerbase.hpp +++ b/modules/optflow/src/rlof/rlof_invokerbase.hpp @@ -216,8 +216,8 @@ void getLocalPatch( if (noPoints < minWinSize * minWinSize) { cv::Rect roi(winPointMask.cols / 2 - (minWinSize - 1) / 2, - winPointMask.rows / 2 - (minWinSize - 1) / 2, - minWinSize, minWinSize); + winPointMask.rows / 2 - (minWinSize - 1) / 2, + minWinSize, minWinSize); cv::Mat(winPointMask, roi).setTo(1); min_c = MIN(MIN(min_c, roi.tl().x), roi.br().x - 1); max_c = MAX(MAX(max_c, roi.tl().x), roi.br().x - 1); @@ -254,7 +254,7 @@ bool calcWinMaskMat( return false; winSize = winRoi.size(); halfWin = Point2f(static_cast(iprevPt.x - winRoi.tl().x), - static_cast(iprevPt.y - winRoi.tl().y)); + static_cast(iprevPt.y - winRoi.tl().y)); } else { From aa3cc4c7498d1c7bb89de2750ee696902e6acb22 Mon Sep 17 00:00:00 2001 From: senst Date: Fri, 14 Dec 2018 20:21:02 +0100 Subject: [PATCH 18/25] maximum level is set to 5 so that it is similar value used in the papers --- modules/optflow/include/opencv2/optflow/rlofflow.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/optflow/include/opencv2/optflow/rlofflow.hpp 
b/modules/optflow/include/opencv2/optflow/rlofflow.hpp index 4d602d2b28f..bbed7f0272c 100644 --- a/modules/optflow/include/opencv2/optflow/rlofflow.hpp +++ b/modules/optflow/include/opencv2/optflow/rlofflow.hpp @@ -68,7 +68,7 @@ class CV_EXPORTS_W RLOFOpticalFlowParameter{ ,smallWinSize(9) ,largeWinSize(21) ,crossSegmentationThreshold(25) - ,maxLevel(3) + ,maxLevel(5) ,useInitialFlow(false) ,useIlluminationModel(true) ,useGlobalMotionPrior(true) From 725e296eabcdab84b88b26451156b0c07820dfa2 Mon Sep 17 00:00:00 2001 From: senst Date: Fri, 14 Dec 2018 20:22:22 +0100 Subject: [PATCH 19/25] implement paralellized horizontal cross segmentation as used in Geistert2016 --- modules/optflow/src/rlof/berlof_invoker.hpp | 6 +- modules/optflow/src/rlof/plk_invoker.hpp | 4 +- modules/optflow/src/rlof/rlof_invoker.hpp | 4 +- modules/optflow/src/rlof/rlof_invokerbase.hpp | 190 +++++------------- modules/optflow/src/rlof/rlof_localflow.cpp | 143 +++++++++++++ 5 files changed, 203 insertions(+), 144 deletions(-) diff --git a/modules/optflow/src/rlof/berlof_invoker.hpp b/modules/optflow/src/rlof/berlof_invoker.hpp index 1d04715fe48..fb40fd2bd2d 100644 --- a/modules/optflow/src/rlof/berlof_invoker.hpp +++ b/modules/optflow/src/rlof/berlof_invoker.hpp @@ -149,7 +149,7 @@ class TrackerInvoker : public cv::ParallelLoopBody winMaskMatBuf.setTo(0); if( calcWinMaskMat(BI, windowType, iprevPt, winMaskMat,winSize,halfWin,winArea, - minWinSize,maxWinSize, crossSegmentationThreshold) == false) + minWinSize,maxWinSize) == false) continue; prevPt -= halfWin; @@ -667,7 +667,7 @@ class TrackerInvoker : public cv::ParallelLoopBody winMaskMatBuf.setTo(0); if( calcWinMaskMat(BI, windowType, iprevPt, winMaskMat,winSize,halfWin,winArea, - minWinSize,maxWinSize, crossSegmentationThreshold) == false) + minWinSize,maxWinSize) == false) continue; prevPt -= halfWin; @@ -1624,7 +1624,7 @@ class TrackerInvoker : public cv::ParallelLoopBody if( calcWinMaskMat(BI, windowType, iprevPt, 
winMaskMat,winSize,halfWin,winArea, - minWinSize,maxWinSize, crossSegmentationThreshold) == false) + minWinSize,maxWinSize) == false) continue; prevPt -= halfWin; diff --git a/modules/optflow/src/rlof/plk_invoker.hpp b/modules/optflow/src/rlof/plk_invoker.hpp index 10c68e581cf..b905cffe8d1 100644 --- a/modules/optflow/src/rlof/plk_invoker.hpp +++ b/modules/optflow/src/rlof/plk_invoker.hpp @@ -117,7 +117,7 @@ class TrackerInvoker : public cv::ParallelLoopBody winMaskMatBuf.setTo(0); if( calcWinMaskMat(BI, windowType, iprevPt, winMaskMat,winSize,halfWin,winArea, - minWinSize,maxWinSize, crossSegmentationThreshold) == false) + minWinSize,maxWinSize) == false) continue; prevPt -= halfWin; @@ -764,7 +764,7 @@ class TrackerInvoker : public cv::ParallelLoopBody if( calcWinMaskMat(BI, windowType, iprevPt, winMaskMat,winSize,halfWin,winArea, - minWinSize,maxWinSize, crossSegmentationThreshold) == false) + minWinSize,maxWinSize) == false) continue; diff --git a/modules/optflow/src/rlof/rlof_invoker.hpp b/modules/optflow/src/rlof/rlof_invoker.hpp index bb3d589e663..67692c4da50 100644 --- a/modules/optflow/src/rlof/rlof_invoker.hpp +++ b/modules/optflow/src/rlof/rlof_invoker.hpp @@ -108,7 +108,7 @@ class TrackerInvoker : public cv::ParallelLoopBody if (calcWinMaskMat(BI, windowType, iprevPt, winMaskMat, winSize, halfWin, winArea, - minWinSize, maxWinSize, crossSegmentationThreshold) == false) + minWinSize, maxWinSize) == false) continue; @@ -742,7 +742,7 @@ class TrackerInvoker : public cv::ParallelLoopBody winMaskMatBuf.setTo(0); if (calcWinMaskMat(BI, windowType, iprevPt, winMaskMat, winSize, halfWin, winArea, - minWinSize, maxWinSize, crossSegmentationThreshold) == false) + minWinSize, maxWinSize) == false) continue; prevPt -= halfWin; diff --git a/modules/optflow/src/rlof/rlof_invokerbase.hpp b/modules/optflow/src/rlof/rlof_invokerbase.hpp index 17c83e026d7..470f4d87ad7 100644 --- a/modules/optflow/src/rlof/rlof_invokerbase.hpp +++ 
b/modules/optflow/src/rlof/rlof_invokerbase.hpp @@ -72,36 +72,23 @@ static inline bool notSameColor(const cv::Point3_ & ref, int r, int c, co return diff > threshold; } -/*! Estimate the mask with points of the same color as the middle point -*\param prevGray input CV_8UC1 -*\param prevImg input CV_8UC3 -*\param nextImg input CV_8UC3 -*\param prevPoint global position of the middle point of the mask window -*\param nextPoint global position of the middle point of the mask window -*\param winPointMask mask matrice with 1 labeling points contained and 0 point is not contained by the segment -*\param noPoints number of points contained by the segment -*\param winRoi rectangle of the region of interesset in global coordinates -*\param minWinSize, -*\param threshold, -*\param useBothImages -*/ static void getLocalPatch( - const cv::Mat & prevImg, - const cv::Point2i & prevPoint, // feature points - cv::Mat & winPointMask, - int & noPoints, - cv::Rect & winRoi, - int minWinSize, - int threshold) + const cv::Mat & src, + const cv::Point2i & prevPoint, // feature points + cv::Mat & winPointMask, + int & noPoints, + cv::Rect & winRoi, + int minWinSize) { int maxWinSizeH = (winPointMask.cols - 1) / 2; - winRoi.x = prevPoint.x; - winRoi.y = prevPoint.y; - winRoi.width = winPointMask.cols; - winRoi.height = winPointMask.rows; + winRoi.x = prevPoint.x;// - maxWinSizeH; + winRoi.y = prevPoint.y;// - maxWinSizeH; + winRoi.width = winPointMask.cols; + winRoi.height = winPointMask.rows; - if (minWinSize == winPointMask.cols) + if( minWinSize == winPointMask.cols || prevPoint.x < 0 || prevPoint.y < 0 + || prevPoint.x + 2*maxWinSizeH >= src.cols || prevPoint.y + 2*maxWinSizeH >= src.rows) { winRoi.x = prevPoint.x - maxWinSizeH; winRoi.y = prevPoint.y - maxWinSizeH; @@ -109,127 +96,57 @@ void getLocalPatch( noPoints = winPointMask.size().area(); return; } + winPointMask.setTo(0); noPoints = 0; - int c = prevPoint.x; - int r = prevPoint.y; - int c_left = c - 1; - int c_right = c + 1; - 
int r_top = r - 1; - int r_bottom = r; - int border_left = c - maxWinSizeH; - int border_right = c + maxWinSizeH; - int border_top = r - maxWinSizeH; - int border_bottom = r + maxWinSizeH; - int c_local_diff = prevPoint.x - maxWinSizeH; - int r_local_diff = prevPoint.y - maxWinSizeH; - int _c = c - c_local_diff; - int _r = r - r_local_diff; - int min_r = _r; - int max_r = _r; - int min_c = _c; - int max_c = _c; - // horizontal line - if (r < 0 || r >= prevImg.rows || c < 0 || c >= prevImg.cols) + int c = prevPoint.x + maxWinSizeH; + int r = prevPoint.y + maxWinSizeH; + int min_c = c; + int max_c = c; + int border_left = c - maxWinSizeH; + int border_top = r - maxWinSizeH; + cv::Vec4i bounds = src.at(r,c); + int min_r = bounds.val[2]; + int max_r = bounds.val[3]; + + for( int _r = min_r; _r <= max_r; _r++) { - noPoints = 0; - return; - } - cv::Point3_ val1 = prevImg.at>(r, c); // middle grayvalue - cv::Point3_ tval; - - //vertical line - for (int dr = r_top; dr >= border_top; dr--) - { - if (notSameColor(val1, dr, c, prevImg, maxWinSizeH, threshold)) - break; - - int _dr = dr - r_local_diff; - min_r = MIN(min_r, _dr); - max_r = MAX(max_r, _dr); - winPointMask.at(_dr, _c) = 1; - noPoints++; - } - for (int dr = r_bottom; dr < border_bottom; dr++) - { - if (notSameColor(val1, dr, c, prevImg, maxWinSizeH, threshold) - ) - break; - - int _dr = dr - r_local_diff; - min_r = MIN(min_r, _dr); - max_r = MAX(max_r, _dr); - winPointMask.at(_dr, _c) = 1; - noPoints++; - } - // accumulate along the vertical line and the search line that was still labled - for (int dr = min_r + r_local_diff; dr <= max_r + r_local_diff; dr++) - { - int _dr = dr - r_local_diff; - if (winPointMask.at(_dr, _c) == 0) + cv::Rect roi(maxWinSizeH, _r - border_top, winPointMask.cols, 1); + if( _r >= 0 && _r < src.cols) { - winPointMask.row(_dr).setTo(0); - continue; + bounds = src.at(_r,c); + roi.x = bounds.val[0] - border_left; + roi.width = bounds.val[1] - bounds.val[0]; } - bool skip = false; - int _dc 
= c_right - c_local_diff; - for (int dc = c_right; dc < border_right; dc++, _dc++) + else { - if (skip == false) - { - if (notSameColor(val1, dr, dc, prevImg, maxWinSizeH, threshold)) - skip = true; - } - if (skip == false) - { - min_c = MIN(min_c, _dc); - max_c = MAX(max_c, _dc); - winPointMask.at(_dr, _dc) = 1; - noPoints++; - } - else - winPointMask.at(_dr, _dc) = 0; - } - - skip = false; - _dc = c_left - c_local_diff; - for (int dc = c_left; dc >= border_left; dc--, _dc--) - { - if (skip == false) - { - if (notSameColor(val1, dr, dc, prevImg, maxWinSizeH, threshold)) - skip = true; - } - if (skip == false) - { - min_c = MIN(min_c, _dc); - max_c = MAX(max_c, _dc); - winPointMask.at(_dr, _dc) = 1; - noPoints++; - } - else - winPointMask.at(_dr, _dc) = 0; + bounds.val[0] = border_left; + bounds.val[1] = border_left + roi.width; } + cv::Mat(winPointMask, roi).setTo(1); + min_c = MIN(min_c, bounds.val[0]); + max_c = MAX(max_c, bounds.val[1]); + noPoints += roi.width; } - - // get the initial small window - if (noPoints < minWinSize * minWinSize) + if( noPoints < minWinSize * minWinSize) { - cv::Rect roi(winPointMask.cols / 2 - (minWinSize - 1) / 2, - winPointMask.rows / 2 - (minWinSize - 1) / 2, - minWinSize, minWinSize); + cv::Rect roi( winPointMask.cols / 2 - (minWinSize-1)/2, + winPointMask.rows / 2 - (minWinSize-1)/2, + minWinSize, minWinSize); cv::Mat(winPointMask, roi).setTo(1); - min_c = MIN(MIN(min_c, roi.tl().x), roi.br().x - 1); - max_c = MAX(MAX(max_c, roi.tl().x), roi.br().x - 1); - min_r = MIN(MIN(min_r, roi.tl().y), roi.br().y - 1); - max_r = MAX(MAX(max_r, roi.tl().y), roi.br().y - 1); + roi.x += border_left; + roi.y += border_top; + min_c = MIN(MIN(min_c, roi.tl().x),roi.br().x); + max_c = MAX(MAX(max_c, roi.tl().x),roi.br().x); + min_r = MIN(MIN(min_r, roi.tl().y),roi.br().y); + max_r = MAX(MAX(max_r, roi.tl().y),roi.br().y); noPoints += minWinSize * minWinSize; } - winRoi.x = c_local_diff + min_c; - winRoi.y = r_local_diff + min_r; - winRoi.width = 
max_c - min_c + 1; - winRoi.height = max_r - min_r + 1; - winPointMask = winPointMask(cv::Rect(min_c, min_r, winRoi.width, winRoi.height)); + winRoi.x = min_c - maxWinSizeH; + winRoi.y = min_r - maxWinSizeH; + winRoi.width = max_c - min_c; + winRoi.height = max_r - min_r; + winPointMask = winPointMask(cv::Rect(min_c - border_left, min_r - border_top, winRoi.width, winRoi.height)); } static inline @@ -242,14 +159,13 @@ bool calcWinMaskMat( cv::Point2f & halfWin, int & winArea, const int minWinSize, - const int maxWinSize, - const int crossSegmentationThreshold) + const int maxWinSize) { if (windowType == SR_CROSS && maxWinSize != minWinSize) { // patch generation cv::Rect winRoi; - getLocalPatch(BI, iprevPt, winMaskMat, winArea, winRoi, minWinSize, crossSegmentationThreshold); + getLocalPatch(BI, iprevPt, winMaskMat, winArea, winRoi, minWinSize); if (winArea == 0) return false; winSize = winRoi.size(); diff --git a/modules/optflow/src/rlof/rlof_localflow.cpp b/modules/optflow/src/rlof/rlof_localflow.cpp index 211d999b0e7..7a6227902b7 100644 --- a/modules/optflow/src/rlof/rlof_localflow.cpp +++ b/modules/optflow/src/rlof/rlof_localflow.cpp @@ -110,6 +110,141 @@ static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst) } // namespace namespace optflow { +/*! Helper function for preCalcCrossSegmentation. 
Everything is performed on the large +*\param data CV_8UC3 image ( use extended image mit winSize) +*\param winSize +*\param dst CV_32SC1 bounding map +*\param threshold +*\param stride if true store into first two bounding maps +*/ +class HorizontalCrossSegmentation : public cv::ParallelLoopBody +{ +public: + HorizontalCrossSegmentation( + const cv::Point2f * ptList, + int npoints, + float pointScale, + const cv::Mat * data, + const int winSize, + cv::Mat * dst, + int threshold, + bool stride, + const cv::Mat * mask + ) + { + m_ptList = ptList; + m_npoints = npoints; + m_pointScale = pointScale; + m_data = data; + m_winSize = winSize; + m_dst = dst; + m_threshold = threshold; + m_stride = stride; + m_mask = mask; + } + + void operator()(const cv::Range& range) const CV_OVERRIDE + { + uchar channel[2]; + channel[0] = m_stride ? 2 : 0; + channel[1] = m_stride ? 3 : 1; + int hWinSize = (m_winSize - 1) / 2; + std::vector differenz(m_winSize); + for( int r = range.start; r < range.end; r++ ) + { + for(int c = hWinSize; c < m_data->cols - hWinSize; c++) + { + if( m_mask->at(r,c) == 0) + continue; + const cv::Point3_ & ucval = m_data->at>(r,c); + cv::Point3i val(static_cast(ucval.x), static_cast(ucval.y), static_cast(ucval.z)); + int x = c - hWinSize; + cv::Point dstPos = m_stride ? 
cv::Point(r,c) : cv::Point(c,r); + for(int ix = 0; ix < m_winSize; ix++, x++) + { + const cv::Point3_ & valref = m_data->at>(r,x); + differenz[ix] = MAX(std::abs(static_cast(valref.x) - val.x), + MAX(std::abs(static_cast(valref.y) - val.y), + (std::abs(static_cast(valref.z) - val.z)))); + + } + cv::Vec4i & bounds = m_dst->at(dstPos); + bounds.val[channel[0]] = c - hWinSize; + bounds.val[channel[1]] = c + hWinSize; + int * diffPtr = &differenz[hWinSize]; + bool useUpperBound = false; + bool useLowerBound = false; + for(int ix = 1; ix <= hWinSize; ix++) + { + if( !useUpperBound && diffPtr[-ix] > m_threshold) + { + useUpperBound = true; + bounds.val[channel[0]] = c - ix; + } + if( !useLowerBound && diffPtr[ix-1] > m_threshold) + { + useLowerBound = true; + bounds.val[channel[1]] = c + ix - 1; + } + if( useUpperBound && useLowerBound) + break; + } + } + } + } + + const cv::Point2f * m_ptList; + int m_npoints; + float m_pointScale; + const cv::Mat * m_data; + int m_winSize; + cv::Mat * m_dst; + int m_threshold; + bool m_stride; + const cv::Mat * m_mask; +}; + +static +void preCalcCrossSegmentation( + const cv::Point2f * ptList, + int npoints, + float pointScale, + const cv::Mat & img, + const int winSize, + cv::Mat & dst, + int threshold +) +{ + int hWinSize = (winSize - 1) / 2; + cv::Mat data = img; + data.adjustROI(hWinSize, hWinSize, hWinSize, hWinSize); + if( dst.size() != dst.size() || dst.type() != CV_32SC4) + { + dst.release(); + dst.create(data.size(), CV_32SC4); + } + cv::Mat mask(data.cols, data.rows, CV_8UC1); + mask.setTo(0); + for( unsigned int n = 0; n < static_cast(npoints); n++) + { + cv::Point ipos( static_cast(floor(ptList[n].y * pointScale)), + static_cast(floor(ptList[n].x * pointScale) + hWinSize)); + ipos.x = MAX( MIN(ipos.x, mask.cols - 1), 0); + int to = MIN( mask.cols - 1, ipos.x + winSize ); + int ypos = MAX( MIN(ipos.y, mask.rows - 1), 0); + for(int x = ipos.x; x <= to ; x++) + { + mask.at(ypos, x) = 255; + } + } + cv::Mat datat = data.t(); + 
cv::Mat maskt = mask.t(); + parallel_for_(cv::Range(0, datat.rows), HorizontalCrossSegmentation(ptList, npoints, pointScale, &datat, winSize, &dst, threshold, true, &mask)); + parallel_for_(cv::Range(0, data.rows), HorizontalCrossSegmentation(ptList, npoints, pointScale, &data, winSize, &dst, threshold, false, &maskt)); + +} + + int buildOpticalFlowPyramidScale(InputArray _img, OutputArrayOfArrays pyramid, Size winSize, int maxLevel, bool withDerivatives, int pyrBorder, int derivBorder, bool tryReuseInputImage, float levelScale[2]); void calcLocalOpticalFlowCore(Ptr prevPyramids[2], Ptr currPyramids[2], InputArray _prevPts, @@ -254,6 +389,7 @@ void calcLocalOpticalFlowCore( std::vector rlofNorm = get_norm(param.normSigma0, param.normSigma1); CV_Assert(winSizes[0] <= winSizes[1]); + bool usePreComputedCross = winSizes[0] != winSizes[1]; Mat prevPtsMat = _prevPts.getMat(); const int derivDepth = DataType::depth; @@ -322,6 +458,13 @@ void calcLocalOpticalFlowCore( cv::Mat prevImage = prevPyramids[0]->getImage(level); cv::Mat currImage = currPyramids[0]->getImage(level); + cv::Mat preCrossMap; + if( usePreComputedCross ) + { + preCalcCrossSegmentation(prevPts, npoints, (float)(1./(1 << level)), tRGBPrevPyr, winSizes[1], preCrossMap, param.crossSegmentationThreshold); + tRGBNextPyr = cv::Mat(); + tRGBPrevPyr = preCrossMap; + } // apply plk like tracker if (isrobust(param) == false) { From c1c94a2191c44aa043da7357083f4300a0c9cc67 Mon Sep 17 00:00:00 2001 From: senst Date: Fri, 14 Dec 2018 20:57:50 +0100 Subject: [PATCH 20/25] drop dead code --- .../optflow/src/rlof/geo_interpolation.cpp | 28 ------------------- 1 file changed, 28 deletions(-) diff --git a/modules/optflow/src/rlof/geo_interpolation.cpp b/modules/optflow/src/rlof/geo_interpolation.cpp index 143b4f63a4e..0ac2e1715da 100644 --- a/modules/optflow/src/rlof/geo_interpolation.cpp +++ b/modules/optflow/src/rlof/geo_interpolation.cpp @@ -358,34 +358,6 @@ Mat interpolate_irregular_knn( } } return ret; -/* - for 
(int y = 0; y < in.rows; y++) { - for (int x = 0; x < in.cols; x++) { - int v_id = y * in.cols + x; - std::vector > flow_s(k); - - for (int i = 0; i < k; i++) { - color = *((int*)(graph_helper.data(v_id) + 1 + 2 * i)); - int cy = flow_point_list[color][0]; - int cx = flow_point_list[color][1]; - Vec2f flow = in.at(cy, cx); - flow_s[i] = std::make_pair(static_cast(norm(flow)), flow); - - - - } - - nnFlow.at(y, x) = std::min_element(flow_s.begin(), flow_s.end(), [](const std::pair &left, const std::pair &right) { - return left.first < right.first; - })->second; - - - } - } - free(graph_helper.mem); - return nnFlow; -*/ - } Mat getGraph(const Mat &image, float edge_length) From b95339c940ee20fb7d9da5c62997221073abaf16 Mon Sep 17 00:00:00 2001 From: senst Date: Sat, 15 Dec 2018 01:39:43 +0100 Subject: [PATCH 21/25] Avoid using "data" and "step" calculations. Use .ptr(row, col) instead. --- modules/optflow/src/rlof/berlof_invoker.hpp | 89 +++++++++------------ modules/optflow/src/rlof/plk_invoker.hpp | 44 +++++----- modules/optflow/src/rlof/rlof_invoker.hpp | 59 ++++++-------- modules/optflow/src/rlof/rlof_localflow.cpp | 2 +- 4 files changed, 84 insertions(+), 110 deletions(-) diff --git a/modules/optflow/src/rlof/berlof_invoker.hpp b/modules/optflow/src/rlof/berlof_invoker.hpp index fb40fd2bd2d..1a7f6e10502 100644 --- a/modules/optflow/src/rlof/berlof_invoker.hpp +++ b/modules/optflow/src/rlof/berlof_invoker.hpp @@ -150,8 +150,9 @@ class TrackerInvoker : public cv::ParallelLoopBody if( calcWinMaskMat(BI, windowType, iprevPt, winMaskMat,winSize,halfWin,winArea, minWinSize,maxWinSize) == false) + { continue; - + } prevPt -= halfWin; iprevPt.x = cvFloor(prevPt.x); iprevPt.y = cvFloor(prevPt.y); @@ -180,7 +181,6 @@ class TrackerInvoker : public cv::ParallelLoopBody int dstep = (int)(derivI.step/derivI.elemSize1()); int step = (int)(I.step/I.elemSize1()); - int mStep = (int)(winMaskMat.step/winMaskMat.elemSize1()); CV_Assert( step == (int)(J.step/J.elemSize1()) ); float 
A11 = 0, A12 = 0, A22 = 0; @@ -190,11 +190,10 @@ class TrackerInvoker : public cv::ParallelLoopBody int x, y; for( y = 0; y < winSize.height; y++ ) { - const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; - const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; - - short* Iptr = (short*)(IWinBuf.data + y*IWinBuf.step); - short* dIptr = (short*)(derivIWinBuf.data + y*derivIWinBuf.step); + const uchar* src = I.ptr(y + iprevPt.y, 0) + iprevPt.x*cn; + const short* dsrc = derivI.ptr(y + iprevPt.y, 0) + iprevPt.x*cn2; + short* Iptr = IWinBuf.ptr(y, 0); + short* dIptr = derivIWinBuf.ptr(y, 0); x = 0; for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2 ) { @@ -276,10 +275,10 @@ class TrackerInvoker : public cv::ParallelLoopBody buffIdx = 0; for( y = 0; y < winSize.height; y++ ) { - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y )*step + (inextPt.x )*cn; - const short* Iptr = (const short*)(IWinBuf.data + (y)*IWinBuf.step); - const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const short* Iptr = IWinBuf.ptr(y, 0); + const short* dIptr = derivIWinBuf.ptr(y, 0); + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); x = 0; for( ; x < winSize.width*cn; x++, dIptr += 2) { @@ -310,11 +309,10 @@ class TrackerInvoker : public cv::ParallelLoopBody */ for( y = 0; y < _winSize.height; y++ ) { - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x )*cn; - const short* Iptr = (const short*)(IWinBuf.data + y*IWinBuf.step); - const short* dIptr = (const short*)(derivIWinBuf.data + y*derivIWinBuf.step); - + const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const short* Iptr = IWinBuf.ptr(y, 0); + const short* dIptr = derivIWinBuf.ptr(y, 0); + const 
tMaskType* maskPtr = winMaskMat.ptr(y, 0); x = 0; for( ; x < _winSize.width*cn; x++, dIptr += 2 ) { @@ -668,7 +666,7 @@ class TrackerInvoker : public cv::ParallelLoopBody if( calcWinMaskMat(BI, windowType, iprevPt, winMaskMat,winSize,halfWin,winArea, minWinSize,maxWinSize) == false) - continue; + continue; prevPt -= halfWin; iprevPt.x = cvFloor(prevPt.x); @@ -698,7 +696,6 @@ class TrackerInvoker : public cv::ParallelLoopBody int dstep = (int)(derivI.step/derivI.elemSize1()); int step = (int)(I.step/I.elemSize1()); - int mStep = (int)(winMaskMat.step/winMaskMat.elemSize1()); CV_Assert( step == (int)(J.step/J.elemSize1()) ); float A11 = 0, A12 = 0, A22 = 0; @@ -729,14 +726,12 @@ class TrackerInvoker : public cv::ParallelLoopBody for( y = 0; y < winSize.height; y++ ) { x = 0; - - const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; - const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; - - short* Iptr = (short*)(IWinBuf.data + y*IWinBuf.step); - short* dIptr = (short*)(derivIWinBuf.data + y*derivIWinBuf.step); + const uchar* src = I.ptr(y + iprevPt.y, 0) + iprevPt.x*cn; + const short* dsrc = derivI.ptr(y + iprevPt.y, 0) + iprevPt.x*cn2; + short* Iptr = IWinBuf.ptr(y, 0); + short* dIptr = derivIWinBuf.ptr(y, 0); #ifdef RLOF_SSE - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); for( ; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 ) { __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16); @@ -885,10 +880,10 @@ class TrackerInvoker : public cv::ParallelLoopBody buffIdx = 0; for( y = 0; y < winSize.height; y++ ) { - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y )*step + (inextPt.x )*cn; - const short* Iptr = (const short*)(IWinBuf.data + (y)*IWinBuf.step); - const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); - 
const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const short* Iptr = IWinBuf.ptr(y, 0); + const short* dIptr = derivIWinBuf.ptr(y, 0); + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); x = 0; for( ; x < winSize.width*cn; x++, dIptr += 2) { @@ -950,11 +945,10 @@ class TrackerInvoker : public cv::ParallelLoopBody */ for( y = 0; y < _winSize.height; y++ ) { - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x )*cn; - const short* Iptr = (const short*)(IWinBuf.data + y*IWinBuf.step); - const short* dIptr = (const short*)(derivIWinBuf.data + y*derivIWinBuf.step); - + const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const short* Iptr = IWinBuf.ptr(y, 0); + const short* dIptr = derivIWinBuf.ptr(y, 0); + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); x = 0; #ifdef RLOF_SSE for( ; x <= _winSize.width*cn; x += 8, dIptr += 8*2 ) @@ -1623,8 +1617,8 @@ class TrackerInvoker : public cv::ParallelLoopBody cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0,0, maxWinSize,maxWinSize)); if( calcWinMaskMat(BI, windowType, iprevPt, - winMaskMat,winSize,halfWin,winArea, - minWinSize,maxWinSize) == false) + winMaskMat,winSize,halfWin,winArea, + minWinSize,maxWinSize) == false) continue; prevPt -= halfWin; @@ -1655,7 +1649,6 @@ class TrackerInvoker : public cv::ParallelLoopBody int dstep = (int)(derivI.step/derivI.elemSize1()); int step = (int)(I.step/I.elemSize1()); - int mStep = (int)(winMaskMat.step/winMaskMat.elemSize1()); CV_Assert( step == (int)(J.step/J.elemSize1()) ); float A11 = 0, A12 = 0, A22 = 0; @@ -1674,15 +1667,12 @@ class TrackerInvoker : public cv::ParallelLoopBody int x, y; for( y = 0; y < winSize.height; y++ ) { - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; - const short* 
dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; - - short* Iptr = (short*)(IWinBuf.data + y*IWinBuf.step); - short* dIptr = (short*)(derivIWinBuf.data + y*derivIWinBuf.step); - + const uchar* src = I.ptr(y + iprevPt.y, 0) + iprevPt.x*cn; + const short* dsrc = derivI.ptr(y + iprevPt.y, 0) + iprevPt.x*cn2; + short* Iptr = IWinBuf.ptr(y, 0); + short* dIptr = derivIWinBuf.ptr(y, 0); + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); x = 0; - #ifdef RLOF_SSE for( ; x < winSize.width*cn; x += 4, dsrc += 4*2, dIptr += 4*2 ) { @@ -1845,12 +1835,13 @@ class TrackerInvoker : public cv::ParallelLoopBody #endif for( y = 0; y < winSize.height; y++ ) { - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x )*cn; - const short* Iptr = (const short*)(IWinBuf.data + ( y)*IWinBuf.step); - const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); + const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const short* Iptr = IWinBuf.ptr(y, 0); + const short* dIptr = derivIWinBuf.ptr(y, 0); x = 0; #ifdef RLOF_SSE - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); for( ; x <= winSize.width*cn; x += 8, dIptr += 8*2 ) { if( maskPtr[x ] == 0 && maskPtr[x+1] == 0 && maskPtr[x+2] == 0 && maskPtr[x+3] == 0 diff --git a/modules/optflow/src/rlof/plk_invoker.hpp b/modules/optflow/src/rlof/plk_invoker.hpp index b905cffe8d1..cabf61df104 100644 --- a/modules/optflow/src/rlof/plk_invoker.hpp +++ b/modules/optflow/src/rlof/plk_invoker.hpp @@ -147,7 +147,6 @@ class TrackerInvoker : public cv::ParallelLoopBody int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; int dstep = (int)(derivI.step/derivI.elemSize1()); int step = (int)(I.step/I.elemSize1()); - int mStep = (int)(winMaskMat.step/winMaskMat.elemSize1()); CV_Assert( step == (int)(J.step/J.elemSize1()) ); float A11 = 0, A12 = 0, A22 = 0; // tensor @@ -173,13 +172,11 @@ class TrackerInvoker : public 
cv::ParallelLoopBody int x, y; for( y = 0; y < winSize.height; y++ ) { - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; - const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; - - short* Iptr = (short*)(IWinBuf.data + y*IWinBuf.step); - short* dIptr = (short*)(derivIWinBuf.data + y*derivIWinBuf.step); - + const uchar* src = I.ptr(y + iprevPt.y, 0) + iprevPt.x*cn; + const short* dsrc = derivI.ptr(y + iprevPt.y, 0) + iprevPt.x*cn2; + short* Iptr = IWinBuf.ptr(y, 0); + short* dIptr = derivIWinBuf.ptr(y, 0); + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); x = 0; #ifdef RLOF_SSE for( ; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 ) @@ -310,11 +307,10 @@ class TrackerInvoker : public cv::ParallelLoopBody #endif for( y = 0; y < winSize.height; y++ ) { - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x )*cn; - const short* Iptr = (const short*)(IWinBuf.data + ( y)*IWinBuf.step); - const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); - + const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const short* Iptr = IWinBuf.ptr(y, 0); + const short* dIptr = derivIWinBuf.ptr(y, 0); + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); x = 0; #ifdef RLOF_SSE @@ -797,7 +793,6 @@ class TrackerInvoker : public cv::ParallelLoopBody int dstep = (int)(derivI.step/derivI.elemSize1()); int step = (int)(I.step/I.elemSize1()); - int mStep = (int)(winMaskMat.step/winMaskMat.elemSize1()); CV_Assert( step == (int)(J.step/J.elemSize1()) ); float A11 = 0, A12 = 0, A22 = 0; @@ -816,15 +811,12 @@ class TrackerInvoker : public cv::ParallelLoopBody int x, y; for( y = 0; y < winSize.height; y++ ) { - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - const uchar* src = (const 
uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; - const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; - - short* Iptr = (short*)(IWinBuf.data + y*IWinBuf.step); - short* dIptr = (short*)(derivIWinBuf.data + y*derivIWinBuf.step); - + const uchar* src = I.ptr(y + iprevPt.y, 0) + iprevPt.x*cn; + const short* dsrc = derivI.ptr(y + iprevPt.y, 0) + iprevPt.x*cn2; + short* Iptr = IWinBuf.ptr(y, 0); + short* dIptr = derivIWinBuf.ptr(y, 0); + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); x = 0; - #ifdef RLOF_SSE for( ; x <= winSize.width*cn; x += 4, dsrc += 4*2, dIptr += 4*2 ) { @@ -974,13 +966,13 @@ class TrackerInvoker : public cv::ParallelLoopBody #endif for( y = 0; y < winSize.height; y++ ) { - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x )*cn; - const short* Iptr = (const short*)(IWinBuf.data + ( y)*IWinBuf.step); - const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); + const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const short* Iptr = IWinBuf.ptr(y, 0); + const short* dIptr = derivIWinBuf.ptr(y, 0); x = 0; #ifdef RLOF_SSE - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); for( ; x <= winSize.width*cn; x += 8, dIptr += 8*2 ) { if( maskPtr[x ] == 0 && maskPtr[x+1] == 0 && maskPtr[x+2] == 0 && maskPtr[x+3] == 0 diff --git a/modules/optflow/src/rlof/rlof_invoker.hpp b/modules/optflow/src/rlof/rlof_invoker.hpp index 67692c4da50..da9fb20a8ab 100644 --- a/modules/optflow/src/rlof/rlof_invoker.hpp +++ b/modules/optflow/src/rlof/rlof_invoker.hpp @@ -140,7 +140,6 @@ class TrackerInvoker : public cv::ParallelLoopBody int dstep = (int)(derivI.step / derivI.elemSize1()); int step = (int)(I.step / I.elemSize1()); - int mStep = (int)(winMaskMat.step / winMaskMat.elemSize1()); CV_Assert(step == (int)(J.step / J.elemSize1())); float A11 = 0, A12 = 0, A22 = 0; float b1 = 0, b2 = 0; @@ -162,15 +161,12 
@@ class TrackerInvoker : public cv::ParallelLoopBody int x, y; for (y = 0; y < winSize.height; y++) { - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; - const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; - - short* Iptr = (short*)(IWinBuf.data + y * IWinBuf.step); - short* dIptr = (short*)(derivIWinBuf.data + y * derivIWinBuf.step); - + const uchar* src = I.ptr(y + iprevPt.y, 0) + iprevPt.x*cn; + const short* dsrc = derivI.ptr(y + iprevPt.y, 0) + iprevPt.x*cn2; + short* Iptr = IWinBuf.ptr(y, 0); + short* dIptr = derivIWinBuf.ptr(y, 0); + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); x = 0; - #ifdef RLOF_SSE for (; x <= winSize.width*cn; x += 4, dsrc += 4 * 2, dIptr += 4 * 2) @@ -287,10 +283,10 @@ class TrackerInvoker : public cv::ParallelLoopBody buffIdx = 0; for (y = 0; y < winSize.height; y++) { - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x)*cn; - const short* Iptr = (const short*)(IWinBuf.data + (y)*IWinBuf.step); - const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const short* Iptr = IWinBuf.ptr(y, 0); + const short* dIptr = derivIWinBuf.ptr(y, 0); + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); x = 0; for (; x < winSize.width*cn; x++, dIptr += 2) { @@ -332,11 +328,10 @@ class TrackerInvoker : public cv::ParallelLoopBody buffIdx = 0; for (y = 0; y < winSize.height; y++) { - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x)*cn; - const short* Iptr = (const short*)(IWinBuf.data + (y)*IWinBuf.step); - const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); - + const uchar* Jptr = J.ptr(y + 
inextPt.y, inextPt.x*cn); + const short* Iptr = IWinBuf.ptr(y, 0); + const short* dIptr = derivIWinBuf.ptr(y, 0); + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); x = 0; #ifdef RLOF_SSE for (; x <= winSize.width*cn; x += 8, dIptr += 8 * 2) @@ -773,7 +768,6 @@ class TrackerInvoker : public cv::ParallelLoopBody int dstep = (int)(derivI.step / derivI.elemSize1()); int step = (int)(I.step / I.elemSize1()); - int mStep = (int)(winMaskMat.step / winMaskMat.elemSize1()); CV_Assert(step == (int)(J.step / J.elemSize1())); float A11 = 0, A12 = 0, A22 = 0; @@ -802,16 +796,14 @@ class TrackerInvoker : public cv::ParallelLoopBody int x, y; for (y = 0; y < winSize.height; y++) { - const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; - const short* dsrc = (const short*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; - - short* Iptr = (short*)(IWinBuf.data + y * IWinBuf.step); - short* dIptr = (short*)(derivIWinBuf.data + y * derivIWinBuf.step); - + const uchar* src = I.ptr(y + iprevPt.y, 0) + iprevPt.x*cn; + const short* dsrc = derivI.ptr(y + iprevPt.y, 0) + iprevPt.x*cn2; + short* Iptr = IWinBuf.ptr(y, 0); + short* dIptr = derivIWinBuf.ptr(y, 0); x = 0; #ifdef RLOF_SSE - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); for (; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4 * 2, dIptr += 4 * 2) { __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16); @@ -947,9 +939,9 @@ class TrackerInvoker : public cv::ParallelLoopBody buffIdx = 0; for (y = 0; y < winSize.height; y++) { - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x)*cn; - const short* Iptr = (const short*)(IWinBuf.data + (y)*IWinBuf.step); - const short* dIptr = (const short*)(derivIWinBuf.data + (y)*derivIWinBuf.step); + const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const short* Iptr = IWinBuf.ptr(y, 0); + 
const short* dIptr = derivIWinBuf.ptr(y, 0); x = 0; for (; x < winSize.width*cn; x++, dIptr += 2) { @@ -1008,11 +1000,10 @@ class TrackerInvoker : public cv::ParallelLoopBody for (y = 0; y < _winSize.height; y++) { - const tMaskType* maskPtr = (const tMaskType*)winMaskMat.data + y * mStep; - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + (inextPt.x)*cn; - const short* Iptr = (const short*)(IWinBuf.data + y * IWinBuf.step); - const short* dIptr = (const short*)(derivIWinBuf.data + y * derivIWinBuf.step); - + const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const short* Iptr = IWinBuf.ptr(y, 0); + const short* dIptr = derivIWinBuf.ptr(y, 0); + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); x = 0; #ifdef RLOF_SSE diff --git a/modules/optflow/src/rlof/rlof_localflow.cpp b/modules/optflow/src/rlof/rlof_localflow.cpp index 7a6227902b7..1bc000d8fa1 100644 --- a/modules/optflow/src/rlof/rlof_localflow.cpp +++ b/modules/optflow/src/rlof/rlof_localflow.cpp @@ -606,7 +606,7 @@ void preprocess(Ptr prevPyramids[2], // Initialize point grid int stepr = prevPyramids[0]->m_Image.rows / 30; - int stepc = prevPyramids[0]->m_Image.rows / 40; + int stepc = prevPyramids[0]->m_Image.cols / 40; for (int r = stepr / 2; r < prevPyramids[0]->m_Image.rows; r += stepr) { for (int c = stepc / 2; c < prevPyramids[0]->m_Image.cols; c += stepc) From 873e03a4fc632909337a7888923245bd868221e6 Mon Sep 17 00:00:00 2001 From: senst Date: Sat, 15 Dec 2018 21:13:23 +0100 Subject: [PATCH 22/25] Avoid using "data" and "step" calculations. Use .ptr(row, col) instead. 
--- modules/optflow/src/rlof/berlof_invoker.hpp | 112 +++++++++----------- modules/optflow/src/rlof/plk_invoker.hpp | 65 ++++++------ modules/optflow/src/rlof/rlof_invoker.hpp | 74 ++++++------- modules/optflow/src/rlof/rlof_localflow.cpp | 3 +- modules/optflow/src/rlofflow.cpp | 4 +- 5 files changed, 125 insertions(+), 133 deletions(-) diff --git a/modules/optflow/src/rlof/berlof_invoker.hpp b/modules/optflow/src/rlof/berlof_invoker.hpp index 1a7f6e10502..1f7b3ac8384 100644 --- a/modules/optflow/src/rlof/berlof_invoker.hpp +++ b/modules/optflow/src/rlof/berlof_invoker.hpp @@ -179,10 +179,6 @@ class TrackerInvoker : public cv::ParallelLoopBody int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - int dstep = (int)(derivI.step/derivI.elemSize1()); - int step = (int)(I.step/I.elemSize1()); - CV_Assert( step == (int)(J.step/J.elemSize1()) ); - float A11 = 0, A12 = 0, A22 = 0; float D = 0; @@ -191,11 +187,13 @@ class TrackerInvoker : public cv::ParallelLoopBody for( y = 0; y < winSize.height; y++ ) { const uchar* src = I.ptr(y + iprevPt.y, 0) + iprevPt.x*cn; + const uchar* src1 = I.ptr(y + iprevPt.y + 1, 0) + iprevPt.x*cn; const short* dsrc = derivI.ptr(y + iprevPt.y, 0) + iprevPt.x*cn2; + const short* dsrc1 = derivI.ptr(y + iprevPt.y + 1, 0) + iprevPt.x*cn2; short* Iptr = IWinBuf.ptr(y, 0); short* dIptr = derivIWinBuf.ptr(y, 0); x = 0; - for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2 ) + for( ; x < winSize.width*cn; x++, dsrc += 2, dsrc1 += 2, dIptr += 2 ) { if( winMaskMat.at(y,x) == 0) { @@ -204,11 +202,11 @@ class TrackerInvoker : public cv::ParallelLoopBody continue; } int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + - src[x+step]*iw10 + src[x+step+cn]*iw11, W_BITS1-5); + src1[x]*iw10 + src1[x+cn]*iw11, W_BITS1-5); int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + - dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1); - int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 + - 
dsrc[dstep+cn2+1]*iw11, W_BITS1); + dsrc1[0]*iw10 + dsrc1[cn2]*iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + + dsrc1[1]*iw10 + dsrc1[cn2+1]*iw11, W_BITS1); Iptr[x] = (short)ival; dIptr[0] = (short)ixval; dIptr[1] = (short)iyval; @@ -276,6 +274,7 @@ class TrackerInvoker : public cv::ParallelLoopBody for( y = 0; y < winSize.height; y++ ) { const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const uchar* Jptr1 = J.ptr(y + inextPt.y + 1, inextPt.x*cn); const short* Iptr = IWinBuf.ptr(y, 0); const short* dIptr = derivIWinBuf.ptr(y, 0); const tMaskType* maskPtr = winMaskMat.ptr(y, 0); @@ -284,7 +283,7 @@ class TrackerInvoker : public cv::ParallelLoopBody { if( maskPtr[x] == 0) continue; - int diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, W_BITS1-5) + int diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + Jptr1[x]*iw10 + Jptr1[x+cn]*iw11, W_BITS1-5) - Iptr[x]; residualMat.at(buffIdx++) = static_cast(diff); } @@ -310,6 +309,7 @@ class TrackerInvoker : public cv::ParallelLoopBody for( y = 0; y < _winSize.height; y++ ) { const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const uchar* Jptr1 = J.ptr(y + inextPt.y + 1, inextPt.x*cn); const short* Iptr = IWinBuf.ptr(y, 0); const short* dIptr = derivIWinBuf.ptr(y, 0); const tMaskType* maskPtr = winMaskMat.ptr(y, 0); @@ -320,15 +320,15 @@ class TrackerInvoker : public cv::ParallelLoopBody continue; int illValue = - Iptr[x]; - float It[4] = {static_cast((Jptr[x+step+cn]<< 5) + illValue), + float It[4] = {static_cast((Jptr1[x+cn]<< 5) + illValue), static_cast((Jptr[x+cn]<< 5) + illValue), - static_cast((Jptr[x+step]<< 5) + illValue), + static_cast((Jptr1[x]<< 5) + illValue), static_cast((Jptr[x] << 5) + illValue)}; int J_val = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + - Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, + Jptr1[x]*iw10 + Jptr1[x+cn]*iw11, W_BITS1-5); @@ -693,11 +693,6 @@ class TrackerInvoker : public cv::ParallelLoopBody int iw01 = 
cvRound(a*(1.f - b)*(1 << W_BITS)); int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - - int dstep = (int)(derivI.step/derivI.elemSize1()); - int step = (int)(I.step/I.elemSize1()); - CV_Assert( step == (int)(J.step/J.elemSize1()) ); - float A11 = 0, A12 = 0, A22 = 0; // tensor @@ -726,13 +721,15 @@ class TrackerInvoker : public cv::ParallelLoopBody for( y = 0; y < winSize.height; y++ ) { x = 0; - const uchar* src = I.ptr(y + iprevPt.y, 0) + iprevPt.x*cn; - const short* dsrc = derivI.ptr(y + iprevPt.y, 0) + iprevPt.x*cn2; + const uchar* src = I.ptr(y + iprevPt.y, 0) + iprevPt.x*cn; + const uchar* src1 = I.ptr(y + iprevPt.y + 1, 0) + iprevPt.x*cn; + const short* dsrc = derivI.ptr(y + iprevPt.y, 0) + iprevPt.x*cn2; + const short* dsrc1 = derivI.ptr(y + iprevPt.y + 1, 0) + iprevPt.x*cn2; short* Iptr = IWinBuf.ptr(y, 0); short* dIptr = derivIWinBuf.ptr(y, 0); #ifdef RLOF_SSE const tMaskType* maskPtr = winMaskMat.ptr(y, 0); - for( ; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 ) + for( ; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4*2, dsrc1 += 8, dIptr += 4*2 ) { __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16); __m128i mask_0_3_epi16 = _mm_unpacklo_epi16(mask_0_7_epi16, mask_0_7_epi16); @@ -741,8 +738,8 @@ class TrackerInvoker : public cv::ParallelLoopBody __m128i v00, v01, v10, v11, t0, t1; v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); - v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); - v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); + v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x)), z); + v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x + cn)), z); t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), 
_mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); @@ -757,8 +754,8 @@ class TrackerInvoker : public cv::ParallelLoopBody v00 = _mm_loadu_si128((const __m128i*)(dsrc)); v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); - v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep)); - v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); + v10 = _mm_loadu_si128((const __m128i*)(dsrc1)); + v11 = _mm_loadu_si128((const __m128i*)(dsrc1 + cn2)); t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); @@ -776,7 +773,7 @@ class TrackerInvoker : public cv::ParallelLoopBody } #else - for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2 ) + for( ; x < winSize.width*cn; x++, dsrc += 2, dsrc1 += 2, dIptr += 2 ) { if( winMaskMat.at(y,x) == 0) { @@ -785,11 +782,11 @@ class TrackerInvoker : public cv::ParallelLoopBody continue; } int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + - src[x+step]*iw10 + src[x+step+cn]*iw11, W_BITS1-5); + src1[x]*iw10 + src1[x+cn]*iw11, W_BITS1-5); int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + - dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1); - int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 + - dsrc[dstep+cn2+1]*iw11, W_BITS1); + dsrc1[0]*iw10 + dsrc1[cn2]*iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + + dsrc1[1]*iw10 + dsrc1[cn2+1]*iw11, W_BITS1); Iptr[x] = (short)ival; dIptr[0] = (short)ixval; @@ -881,6 +878,7 @@ class TrackerInvoker : public cv::ParallelLoopBody for( y = 0; y < winSize.height; y++ ) { const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const uchar* Jptr1 = J.ptr(y + inextPt.y + 1, inextPt.x*cn); const short* Iptr = IWinBuf.ptr(y, 0); const short* dIptr = derivIWinBuf.ptr(y, 0); const tMaskType* maskPtr = winMaskMat.ptr(y, 0); @@ -889,7 +887,7 @@ class TrackerInvoker : public cv::ParallelLoopBody { if( maskPtr[x] == 0) continue; - int diff = static_cast(CV_DESCALE(Jptr[x]*iw00 + 
Jptr[x+cn]*iw01 + Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, W_BITS1-5) + int diff = static_cast(CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + Jptr1[x]*iw10 + Jptr1[x+cn]*iw11, W_BITS1-5) - Iptr[x] + Iptr[x] * gainVec.x +gainVec.y); residualMat.at(buffIdx++) = static_cast(diff); } @@ -945,7 +943,8 @@ class TrackerInvoker : public cv::ParallelLoopBody */ for( y = 0; y < _winSize.height; y++ ) { - const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const uchar* Jptr1 = J.ptr(y + inextPt.y + 1, inextPt.x*cn); const short* Iptr = IWinBuf.ptr(y, 0); const short* dIptr = derivIWinBuf.ptr(y, 0); const tMaskType* maskPtr = winMaskMat.ptr(y, 0); @@ -959,8 +958,8 @@ class TrackerInvoker : public cv::ParallelLoopBody __m128i v00 = _mm_unpacklo_epi8( _mm_loadl_epi64((const __m128i*)(Jptr + x)), z); __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); - __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); - __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); + __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x)), z); + __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x + cn)), z); __m128i t0 = _mm_add_epi32 (_mm_madd_epi16( @@ -1164,14 +1163,14 @@ class TrackerInvoker : public cv::ParallelLoopBody short iyval = dIptr[1]; int illValue = static_cast(Iptr[x] * gainVec.x + gainVec.y - Iptr[x]); - int It[4] = {(Jptr[x+step+cn]<< 5) + illValue, + int It[4] = {(Jptr1[x+cn]<< 5) + illValue, (Jptr[x+cn]<< 5) + illValue, - (Jptr[x+step]<< 5) + illValue, + (Jptr1[x]<< 5) + illValue, (Jptr[x] << 5) + illValue}; int J_val = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + - Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, + Jptr1[x]*iw10 + Jptr1[x+cn]*iw11, W_BITS1-5); int diff = J_val + illValue; @@ -1646,10 +1645,6 @@ class TrackerInvoker : public cv::ParallelLoopBody int iw01 = 
cvRound(a*(1.f - b)*(1 << W_BITS)); int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - - int dstep = (int)(derivI.step/derivI.elemSize1()); - int step = (int)(I.step/I.elemSize1()); - CV_Assert( step == (int)(J.step/J.elemSize1()) ); float A11 = 0, A12 = 0, A22 = 0; #ifdef RLOF_SSE @@ -1668,13 +1663,15 @@ class TrackerInvoker : public cv::ParallelLoopBody for( y = 0; y < winSize.height; y++ ) { const uchar* src = I.ptr(y + iprevPt.y, 0) + iprevPt.x*cn; + const uchar* src1 = I.ptr(y + iprevPt.y + 1, 0) + iprevPt.x*cn; const short* dsrc = derivI.ptr(y + iprevPt.y, 0) + iprevPt.x*cn2; + const short* dsrc1 = derivI.ptr(y + iprevPt.y + 1, 0) + iprevPt.x*cn2; short* Iptr = IWinBuf.ptr(y, 0); short* dIptr = derivIWinBuf.ptr(y, 0); const tMaskType* maskPtr = winMaskMat.ptr(y, 0); x = 0; #ifdef RLOF_SSE - for( ; x < winSize.width*cn; x += 4, dsrc += 4*2, dIptr += 4*2 ) + for( ; x < winSize.width*cn; x += 4, dsrc += 4*2, dsrc1 += 8,dIptr += 4*2 ) { __m128i wMask = _mm_set_epi32(MaskSet * maskPtr[x+3], MaskSet * maskPtr[x+2], @@ -1683,8 +1680,8 @@ class TrackerInvoker : public cv::ParallelLoopBody __m128i v00, v01, v10, v11, t0, t1; v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); - v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); - v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); + v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x)), z); + v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x + cn)), z); t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); @@ -1694,8 +1691,8 @@ class TrackerInvoker : public cv::ParallelLoopBody v00 = _mm_loadu_si128((const __m128i*)(dsrc)); v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); - v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep)); - v11
= _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); + v10 = _mm_loadu_si128((const __m128i*)(dsrc1)); + v11 = _mm_loadu_si128((const __m128i*)(dsrc1 + cn2)); t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); @@ -1724,7 +1721,7 @@ class TrackerInvoker : public cv::ParallelLoopBody } #else - for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2) + for( ; x < winSize.width*cn; x++, dsrc += 2, dsrc1 += 2, dIptr += 2) { if( maskPtr[x] == 0) { @@ -1734,11 +1731,11 @@ class TrackerInvoker : public cv::ParallelLoopBody continue; } int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + - src[x+step]*iw10 + src[x+step+cn]*iw11, W_BITS1-5); + src1[x]*iw10 + src1[x+cn]*iw11, W_BITS1-5); int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + - dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1); - int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 + - dsrc[dstep+cn2+1]*iw11, W_BITS1); + dsrc1[0]*iw10 + dsrc1[cn2]*iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc1[1]*iw10 + + dsrc1[cn2+1]*iw11, W_BITS1); Iptr[x] = (short)ival; dIptr[0] = (short)ixval; @@ -1836,6 +1833,7 @@ class TrackerInvoker : public cv::ParallelLoopBody for( y = 0; y < winSize.height; y++ ) { const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const uchar* Jptr1 = J.ptr(y + inextPt.y + 1, inextPt.x*cn); const short* Iptr = IWinBuf.ptr(y, 0); const short* dIptr = derivIWinBuf.ptr(y, 0); x = 0; @@ -1850,8 +1848,8 @@ class TrackerInvoker : public cv::ParallelLoopBody __m128i diff0 = _mm_loadu_si128((const __m128i*)(Iptr + x)); __m128i v00 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x)), z); __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); - __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); - __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); + __m128i
v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x)), z); + __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x + cn)), z); __m128i mmDiffc_epi16[4] = { _mm_subs_epi16(_mm_slli_epi16(v00, 5), diff0), @@ -1953,14 +1951,10 @@ class TrackerInvoker : public cv::ParallelLoopBody { if( dIptr[0] == 0 && dIptr[1] == 0) continue; - /*short It[4] = {CV_DESCALE((Jptr[x]) *(1 << W_BITS),W_BITS1-5) - Iptr[x], - CV_DESCALE((Jptr[x+cn]) *(1 << W_BITS),W_BITS1-5) - Iptr[x], - CV_DESCALE((Jptr[x+step]) *(1 << W_BITS),W_BITS1-5) - Iptr[x], - CV_DESCALE((Jptr[x+step+cn])*(1 << W_BITS),W_BITS1-5) - Iptr[x]};*/ short It[4] = {(Jptr[x] << 5) - Iptr[x], (Jptr[x+cn]<< 5) - Iptr[x], - (Jptr[x+step]<< 5) - Iptr[x], - (Jptr[x+step+cn]<< 5) - Iptr[x]}; + (Jptr1[x]<< 5) - Iptr[x], + (Jptr1[x+cn]<< 5) - Iptr[x]}; _b1[0] += (float)(It[0]*dIptr[0]); _b1[1] += (float)(It[1]*dIptr[0]); diff --git a/modules/optflow/src/rlof/plk_invoker.hpp b/modules/optflow/src/rlof/plk_invoker.hpp index cabf61df104..3f76ce460a4 100644 --- a/modules/optflow/src/rlof/plk_invoker.hpp +++ b/modules/optflow/src/rlof/plk_invoker.hpp @@ -145,9 +145,6 @@ class TrackerInvoker : public cv::ParallelLoopBody int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - int dstep = (int)(derivI.step/derivI.elemSize1()); - int step = (int)(I.step/I.elemSize1()); - CV_Assert( step == (int)(J.step/J.elemSize1()) ); float A11 = 0, A12 = 0, A22 = 0; // tensor float sumIx = 0; @@ -173,21 +170,23 @@ class TrackerInvoker : public cv::ParallelLoopBody for( y = 0; y < winSize.height; y++ ) { const uchar* src = I.ptr(y + iprevPt.y, 0) + iprevPt.x*cn; + const uchar* src1 = I.ptr(y + iprevPt.y + 1, 0) + iprevPt.x*cn; const short* dsrc = derivI.ptr(y + iprevPt.y, 0) + iprevPt.x*cn2; + const short* dsrc1 = derivI.ptr(y + iprevPt.y + 1, 0) + iprevPt.x*cn2; short* Iptr = IWinBuf.ptr(y, 0); short* dIptr = derivIWinBuf.ptr(y, 
0); const tMaskType* maskPtr = winMaskMat.ptr(y, 0); x = 0; #ifdef RLOF_SSE - for( ; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 ) + for( ; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4*2, dsrc1 += 8, dIptr += 4*2 ) { __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16); __m128i mask_0_3_epi16 = _mm_unpacklo_epi16(mask_0_7_epi16, mask_0_7_epi16); __m128i v00, v01, v10, v11, t0, t1; v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); - v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); - v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); + v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x)), z); + v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x + cn)), z); t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); @@ -202,8 +201,8 @@ class TrackerInvoker : public cv::ParallelLoopBody v00 = _mm_loadu_si128((const __m128i*)(dsrc)); v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); - v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep)); - v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); + v10 = _mm_loadu_si128((const __m128i*)(dsrc1)); + v11 = _mm_loadu_si128((const __m128i*)(dsrc1 + cn2)); t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); @@ -220,7 +219,7 @@ class TrackerInvoker : public cv::ParallelLoopBody _mm_storeu_si128((__m128i*)dIptr, v00); } #else - for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2 ) + for( ; x < winSize.width*cn; x++, dsrc += 2, dsrc1 += 2, dIptr += 2 ) { if( maskPtr[x] == 0) { @@ -229,11 +228,11 @@ class TrackerInvoker : public cv::ParallelLoopBody continue; } int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + - 
src[x+step]*iw10 + src[x+step+cn]*iw11, W_BITS1-5); + src1[x]*iw10 + src1[x+cn]*iw11, W_BITS1-5); int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + - dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1); - int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 + - dsrc[dstep+cn2+1]*iw11, W_BITS1); + dsrc1[0]*iw10 + dsrc1[cn2]*iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc1[1]*iw10 + + dsrc1[cn2+1]*iw11, W_BITS1); Iptr[x] = (short)ival; dIptr[0] = (short)ixval; @@ -308,6 +307,7 @@ class TrackerInvoker : public cv::ParallelLoopBody for( y = 0; y < winSize.height; y++ ) { const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const uchar* Jptr1 = J.ptr(y + inextPt.y + 1, inextPt.x*cn); const short* Iptr = IWinBuf.ptr(y, 0); const short* dIptr = derivIWinBuf.ptr(y, 0); const tMaskType* maskPtr = winMaskMat.ptr(y, 0); @@ -328,8 +328,8 @@ class TrackerInvoker : public cv::ParallelLoopBody _mm_loadl_epi64((const __m128i*)(Jptr + x)) // J0, J1, J2, ..., J7, 64 bit nullen , z); //J0 , 0, J1, 0, J2, 0 ... 
J7,0 __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); - __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); - __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); + __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x)), z); + __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x + cn)), z); __m128i t0 = _mm_add_epi32 @@ -465,7 +465,7 @@ class TrackerInvoker : public cv::ParallelLoopBody if( maskPtr[x] == 0) continue; int J_val = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + - Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, + Jptr1[x]*iw10 + Jptr1[x+cn]*iw11, W_BITS1-5); int diff = static_cast(J_val - Iptr[x] + Iptr[x] * gainVec.x + gainVec.y); @@ -790,10 +790,6 @@ class TrackerInvoker : public cv::ParallelLoopBody int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - - int dstep = (int)(derivI.step/derivI.elemSize1()); - int step = (int)(I.step/I.elemSize1()); - CV_Assert( step == (int)(J.step/J.elemSize1()) ); float A11 = 0, A12 = 0, A22 = 0; #ifdef RLOF_SSE @@ -812,13 +808,15 @@ class TrackerInvoker : public cv::ParallelLoopBody for( y = 0; y < winSize.height; y++ ) { const uchar* src = I.ptr(y + iprevPt.y, 0) + iprevPt.x*cn; + const uchar* src1 = I.ptr(y + iprevPt.y + 1, 0) + iprevPt.x*cn; const short* dsrc = derivI.ptr(y + iprevPt.y, 0) + iprevPt.x*cn2; + const short* dsrc1 = derivI.ptr(y + iprevPt.y + 1, 0) + iprevPt.x*cn2; short* Iptr = IWinBuf.ptr(y, 0); short* dIptr = derivIWinBuf.ptr(y, 0); const tMaskType* maskPtr = winMaskMat.ptr(y, 0); x = 0; #ifdef RLOF_SSE - for( ; x <= winSize.width*cn; x += 4, dsrc += 4*2, dIptr += 4*2 ) + for( ; x <= winSize.width*cn; x += 4, dsrc += 4*2, dsrc1 += 8, dIptr += 4*2 ) { __m128i wMask = _mm_set_epi32(MaskSet * maskPtr[x+3], MaskSet * maskPtr[x+2], @@ -827,8 +825,8 @@ class 
TrackerInvoker : public cv::ParallelLoopBody __m128i v00, v01, v10, v11, t0, t1; v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); - v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); - v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); + v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x)), z); + v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x + cn)), z); t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); @@ -843,8 +841,8 @@ class TrackerInvoker : public cv::ParallelLoopBody v00 = _mm_loadu_si128((const __m128i*)(dsrc)); v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); - v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep)); - v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); + v10 = _mm_loadu_si128((const __m128i*)(dsrc1)); + v11 = _mm_loadu_si128((const __m128i*)(dsrc1 + cn2)); t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); @@ -874,7 +872,7 @@ class TrackerInvoker : public cv::ParallelLoopBody } #else - for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2 ) + for( ; x < winSize.width*cn; x++, dsrc += 2, dsrc1 += 2, dIptr += 2 ) { if( maskPtr[x] == 0) { @@ -883,11 +881,11 @@ class TrackerInvoker : public cv::ParallelLoopBody continue; } int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + - src[x+step]*iw10 + src[x+step+cn]*iw11, W_BITS1-5); + src1[x]*iw10 + src1[x+cn]*iw11, W_BITS1-5); int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + - dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1); - int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 + - dsrc[dstep+cn2+1]*iw11, W_BITS1); + dsrc1[0]*iw10 + dsrc1[cn2]*iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + 
dsrc1[1]*iw10 + + dsrc1[cn2+1]*iw11, W_BITS1); Iptr[x] = (short)ival; dIptr[0] = (short)ixval; @@ -967,6 +965,7 @@ class TrackerInvoker : public cv::ParallelLoopBody for( y = 0; y < winSize.height; y++ ) { const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const uchar* Jptr1 = J.ptr(y + inextPt.y + 1, inextPt.x*cn); const short* Iptr = IWinBuf.ptr(y, 0); const short* dIptr = derivIWinBuf.ptr(y, 0); @@ -981,8 +980,8 @@ class TrackerInvoker : public cv::ParallelLoopBody __m128i diff0 = _mm_loadu_si128((const __m128i*)(Iptr + x)), diff1; __m128i v00 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x)), z); __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); - __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); - __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); + __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x)), z); + __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x + cn)), z); __m128i t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); @@ -1030,7 +1029,7 @@ class TrackerInvoker : public cv::ParallelLoopBody if( dIptr[0] == 0 && dIptr[1] == 0) continue; int diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + - Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, + Jptr1[x]*iw10 + Jptr1[x+cn]*iw11, W_BITS1-5) - Iptr[x]; b1 += (float)(diff*dIptr[0]) * FLT_RESCALE; diff --git a/modules/optflow/src/rlof/rlof_invoker.hpp b/modules/optflow/src/rlof/rlof_invoker.hpp index da9fb20a8ab..38648871273 100644 --- a/modules/optflow/src/rlof/rlof_invoker.hpp +++ b/modules/optflow/src/rlof/rlof_invoker.hpp @@ -137,10 +137,6 @@ class TrackerInvoker : public cv::ParallelLoopBody int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - - int dstep = 
(int)(derivI.step / derivI.elemSize1()); - int step = (int)(I.step / I.elemSize1()); - CV_Assert(step == (int)(J.step / J.elemSize1())); float A11 = 0, A12 = 0, A22 = 0; float b1 = 0, b2 = 0; float D = 0; @@ -162,14 +158,16 @@ class TrackerInvoker : public cv::ParallelLoopBody for (y = 0; y < winSize.height; y++) { const uchar* src = I.ptr(y + iprevPt.y, 0) + iprevPt.x*cn; + const uchar* src1 = I.ptr(y + iprevPt.y + 1, 0) + iprevPt.x*cn; const short* dsrc = derivI.ptr(y + iprevPt.y, 0) + iprevPt.x*cn2; + const short* dsrc1 = derivI.ptr(y + iprevPt.y + 1, 0) + iprevPt.x*cn2; short* Iptr = IWinBuf.ptr(y, 0); short* dIptr = derivIWinBuf.ptr(y, 0); const tMaskType* maskPtr = winMaskMat.ptr(y, 0); x = 0; #ifdef RLOF_SSE - for (; x <= winSize.width*cn; x += 4, dsrc += 4 * 2, dIptr += 4 * 2) + for (; x <= winSize.width*cn; x += 4, dsrc += 4 * 2, dsrc1 += 8, dIptr += 4 * 2) { __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16); __m128i mask_0_3_epi16 = _mm_unpacklo_epi16(mask_0_7_epi16, mask_0_7_epi16); @@ -177,8 +175,8 @@ class TrackerInvoker : public cv::ParallelLoopBody __m128i v00, v01, v10, v11, t0, t1; v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); - v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); - v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); + v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x)), z); + v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x + cn)), z); t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); @@ -192,8 +190,8 @@ class TrackerInvoker : public cv::ParallelLoopBody v00 = _mm_loadu_si128((const __m128i*)(dsrc)); v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); - v10 = _mm_loadu_si128((const __m128i*)(dsrc + 
dstep)); - v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); + v10 = _mm_loadu_si128((const __m128i*)(dsrc1)); + v11 = _mm_loadu_si128((const __m128i*)(dsrc1 + cn2)); t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); @@ -211,7 +209,7 @@ class TrackerInvoker : public cv::ParallelLoopBody } #else - for (; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2) + for (; x < winSize.width*cn; x++, dsrc += 2, dsrc1 += 2, dIptr += 2) { if (maskPtr[x] == 0) { @@ -220,11 +218,11 @@ class TrackerInvoker : public cv::ParallelLoopBody continue; } int ival = CV_DESCALE(src[x] * iw00 + src[x + cn] * iw01 + - src[x + step] * iw10 + src[x + step + cn] * iw11, W_BITS1 - 5); + src1[x] * iw10 + src1[x + cn] * iw11, W_BITS1 - 5); int ixval = CV_DESCALE(dsrc[0] * iw00 + dsrc[cn2] * iw01 + - dsrc[dstep] * iw10 + dsrc[dstep + cn2] * iw11, W_BITS1); - int iyval = CV_DESCALE(dsrc[1] * iw00 + dsrc[cn2 + 1] * iw01 + dsrc[dstep + 1] * iw10 + - dsrc[dstep + cn2 + 1] * iw11, W_BITS1); + dsrc1[0] * iw10 + dsrc1[cn2] * iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1] * iw00 + dsrc[cn2 + 1] * iw01 + dsrc1[1] * iw10 + + dsrc1[cn2 + 1] * iw11, W_BITS1); Iptr[x] = (short)ival; dIptr[0] = (short)ixval; @@ -284,6 +282,7 @@ class TrackerInvoker : public cv::ParallelLoopBody for (y = 0; y < winSize.height; y++) { const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const uchar* Jptr1 = J.ptr(y + inextPt.y + 1, inextPt.x*cn); const short* Iptr = IWinBuf.ptr(y, 0); const short* dIptr = derivIWinBuf.ptr(y, 0); const tMaskType* maskPtr = winMaskMat.ptr(y, 0); @@ -292,7 +291,7 @@ class TrackerInvoker : public cv::ParallelLoopBody { if (maskPtr[x] == 0) continue; - int diff = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + Jptr[x + step] * iw10 + Jptr[x + step + cn] * iw11, W_BITS1 - 5) - Iptr[x]; + int diff = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + Jptr1[x] * iw10 + Jptr1[x + cn] * iw11, W_BITS1 - 5) - Iptr[x]; 
residualMat.at(buffIdx++) = static_cast(diff); } } @@ -329,6 +328,7 @@ class TrackerInvoker : public cv::ParallelLoopBody for (y = 0; y < winSize.height; y++) { const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const uchar* Jptr1 = J.ptr(y + inextPt.y + 1, inextPt.x*cn); const short* Iptr = IWinBuf.ptr(y, 0); const short* dIptr = derivIWinBuf.ptr(y, 0); const tMaskType* maskPtr = winMaskMat.ptr(y, 0); @@ -342,8 +342,8 @@ class TrackerInvoker : public cv::ParallelLoopBody __m128i v00 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x)), z); __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); - __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); - __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); + __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x)), z); + __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x + cn)), z); __m128i t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); @@ -464,7 +464,7 @@ class TrackerInvoker : public cv::ParallelLoopBody if (maskPtr[x] == 0) continue; int diff = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + - Jptr[x + step] * iw10 + Jptr[x + step + cn] * iw11, + Jptr1[x] * iw10 + Jptr1[x + cn] * iw11, W_BITS1 - 5) - Iptr[x]; @@ -766,10 +766,6 @@ class TrackerInvoker : public cv::ParallelLoopBody int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - int dstep = (int)(derivI.step / derivI.elemSize1()); - int step = (int)(I.step / I.elemSize1()); - CV_Assert(step == (int)(J.step / J.elemSize1())); - float A11 = 0, A12 = 0, A22 = 0; float b1 = 0, b2 = 0, b3 = 0, b4 = 0; // tensor @@ -797,22 +793,24 @@ class TrackerInvoker : public cv::ParallelLoopBody for (y = 0; y < winSize.height; y++) { const uchar* src = I.ptr(y + iprevPt.y, 0) + iprevPt.x*cn; + 
const uchar* src1 = I.ptr(y + iprevPt.y + 1, 0) + iprevPt.x*cn; const short* dsrc = derivI.ptr(y + iprevPt.y, 0) + iprevPt.x*cn2; + const short* dsrc1 = derivI.ptr(y + iprevPt.y + 1, 0) + iprevPt.x*cn2; short* Iptr = IWinBuf.ptr(y, 0); short* dIptr = derivIWinBuf.ptr(y, 0); x = 0; #ifdef RLOF_SSE const tMaskType* maskPtr = winMaskMat.ptr(y, 0); - for (; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4 * 2, dIptr += 4 * 2) + for (; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4 * 2, dsrc1 += 8, dIptr += 4 * 2) { __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16); __m128i mask_0_3_epi16 = _mm_unpacklo_epi16(mask_0_7_epi16, mask_0_7_epi16); __m128i v00, v01, v10, v11, t0, t1; v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); - v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); - v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); + v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x)), z); + v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x + cn)), z); t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); @@ -827,8 +825,8 @@ class TrackerInvoker : public cv::ParallelLoopBody v00 = _mm_loadu_si128((const __m128i*)(dsrc)); v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); - v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep)); - v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); + v10 = _mm_loadu_si128((const __m128i*)(dsrc1)); + v11 = _mm_loadu_si128((const __m128i*)(dsrc1 + cn2)); t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); @@ -846,7 +844,7 @@ class TrackerInvoker : public cv::ParallelLoopBody } #else - for (; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2) 
+ for (; x < winSize.width*cn; x++, dsrc += 2, dsrc1 += 2, dIptr += 2) { if (winMaskMat.at(y, x) == 0) { @@ -855,11 +853,11 @@ class TrackerInvoker : public cv::ParallelLoopBody continue; } int ival = CV_DESCALE(src[x] * iw00 + src[x + cn] * iw01 + - src[x + step] * iw10 + src[x + step + cn] * iw11, W_BITS1 - 5); + src1[x] * iw10 + src1[x + cn] * iw11, W_BITS1 - 5); int ixval = CV_DESCALE(dsrc[0] * iw00 + dsrc[cn2] * iw01 + - dsrc[dstep] * iw10 + dsrc[dstep + cn2] * iw11, W_BITS1); - int iyval = CV_DESCALE(dsrc[1] * iw00 + dsrc[cn2 + 1] * iw01 + dsrc[dstep + 1] * iw10 + - dsrc[dstep + cn2 + 1] * iw11, W_BITS1); + dsrc1[0] * iw10 + dsrc1[ cn2] * iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1] * iw00 + dsrc[cn2 + 1] * iw01 + dsrc1[1] * iw10 + + dsrc1[cn2 + 1] * iw11, W_BITS1); Iptr[x] = (short)ival; dIptr[0] = (short)ixval; @@ -940,6 +938,7 @@ class TrackerInvoker : public cv::ParallelLoopBody for (y = 0; y < winSize.height; y++) { const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const uchar* Jptr1 = J.ptr(y + inextPt.y + 1, inextPt.x*cn); const short* Iptr = IWinBuf.ptr(y, 0); const short* dIptr = derivIWinBuf.ptr(y, 0); x = 0; @@ -949,8 +948,8 @@ class TrackerInvoker : public cv::ParallelLoopBody continue; int diff = static_cast(CV_DESCALE( Jptr[x] * iw00 + Jptr[x + cn] * iw01 + - Jptr[x + step] * iw10 + - Jptr[x + step + cn] * iw11, W_BITS1 - 5) + Jptr1[x] * iw10 + + Jptr1[x + cn] * iw11, W_BITS1 - 5) - Iptr[x] + Iptr[x] * gainVec.x + gainVec.y); residualMat.at(buffIdx++) = static_cast(diff); } @@ -1001,6 +1000,7 @@ class TrackerInvoker : public cv::ParallelLoopBody for (y = 0; y < _winSize.height; y++) { const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const uchar* Jptr1 = J.ptr(y + inextPt.y + 1, inextPt.x*cn); const short* Iptr = IWinBuf.ptr(y, 0); const short* dIptr = derivIWinBuf.ptr(y, 0); const tMaskType* maskPtr = winMaskMat.ptr(y, 0); @@ -1016,8 +1016,8 @@ class TrackerInvoker : public cv::ParallelLoopBody __m128i v00 = 
_mm_unpacklo_epi8( _mm_loadl_epi64((const __m128i*)(Jptr + x)) , z); //J0 , 0, J1, 0, J2, 0 ... J7,0 __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); - __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); - __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); + __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x)), z); + __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x + cn)), z); __m128i t0 = _mm_add_epi32 (_mm_madd_epi16( @@ -1205,7 +1205,7 @@ class TrackerInvoker : public cv::ParallelLoopBody { if (maskPtr[x] == 0) continue; - int J_val = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + Jptr[x + step] * iw10 + Jptr[x + step + cn] * iw11, + int J_val = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + Jptr1[x] * iw10 + Jptr1[x + cn] * iw11, W_BITS1 - 5); short ixval = static_cast(dIptr[0]); short iyval = static_cast(dIptr[1]); diff --git a/modules/optflow/src/rlof/rlof_localflow.cpp b/modules/optflow/src/rlof/rlof_localflow.cpp index 1bc000d8fa1..feb515d51d1 100644 --- a/modules/optflow/src/rlof/rlof_localflow.cpp +++ b/modules/optflow/src/rlof/rlof_localflow.cpp @@ -255,8 +255,7 @@ void preprocess(Ptr prevPyramids[2], Ptr currPyramid inline bool isrobust(const RLOFOpticalFlowParameter & param) { - return (param.normSigma0 != std::numeric_limits::max()) - && (param.normSigma1 != std::numeric_limits::max()); + return (param.normSigma0 < 255 && param.normSigma1 < 255); } inline std::vector get_norm(float sigma0, float sigma1) { diff --git a/modules/optflow/src/rlofflow.cpp b/modules/optflow/src/rlofflow.cpp index 12a72cb035f..5ee07005eb7 100644 --- a/modules/optflow/src/rlofflow.cpp +++ b/modules/optflow/src/rlofflow.cpp @@ -126,9 +126,9 @@ class DenseOpticalFlowRLOFImpl : public DenseRLOFOpticalFlow std::vector currPoints, refPoints; noPoints = 0; cv::Size grid_h = gridStep / 2; - for (int r = 
grid_h.height; r < prevImage.rows; r += gridStep.height) + for (int r = grid_h.height; r < prevImage.rows - grid_h.height; r += gridStep.height) { - for (int c = grid_h.width; c < prevImage.cols; c += gridStep.width) + for (int c = grid_h.width; c < prevImage.cols - grid_h.width; c += gridStep.width) { prevPoints[noPoints++] = cv::Point2f(static_cast(c), static_cast(r)); } From 0046f5438472508ecd4eadf079ba335cc26a4d31 Mon Sep 17 00:00:00 2001 From: senst Date: Sun, 16 Dec 2018 13:48:37 +0100 Subject: [PATCH 23/25] bugfix on BEPLK with ica and adapt the accuracy tests --- modules/optflow/src/rlof/berlof_invoker.hpp | 6 +++--- modules/optflow/test/test_OF_accuracy.cpp | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/modules/optflow/src/rlof/berlof_invoker.hpp b/modules/optflow/src/rlof/berlof_invoker.hpp index 1f7b3ac8384..5e618562159 100644 --- a/modules/optflow/src/rlof/berlof_invoker.hpp +++ b/modules/optflow/src/rlof/berlof_invoker.hpp @@ -1663,9 +1663,9 @@ class TrackerInvoker : public cv::ParallelLoopBody for( y = 0; y < winSize.height; y++ ) { const uchar* src = I.ptr(y + iprevPt.y, 0) + iprevPt.x*cn; - const uchar* src1 = I.ptr(y + iprevPt.y, 0) + iprevPt.x*cn; + const uchar* src1 = I.ptr(y + iprevPt.y + 1, 0) + iprevPt.x*cn; const short* dsrc = derivI.ptr(y + iprevPt.y, 0) + iprevPt.x*cn2; - const short* dsrc1 = derivI.ptr(y + iprevPt.y, 0) + iprevPt.x*cn2; + const short* dsrc1 = derivI.ptr(y + iprevPt.y + 1, 0) + iprevPt.x*cn2; short* Iptr = IWinBuf.ptr(y, 0); short* dIptr = derivIWinBuf.ptr(y, 0); const tMaskType* maskPtr = winMaskMat.ptr(y, 0); @@ -1833,7 +1833,7 @@ class TrackerInvoker : public cv::ParallelLoopBody for( y = 0; y < winSize.height; y++ ) { const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); - const uchar* Jptr1 = J.ptr(y + inextPt.y, inextPt.x*cn); + const uchar* Jptr1 = J.ptr(y + inextPt.y + 1, inextPt.x*cn); const short* Iptr = IWinBuf.ptr(y, 0); const short* dIptr = derivIWinBuf.ptr(y, 0); x = 0; diff 
--git a/modules/optflow/test/test_OF_accuracy.cpp b/modules/optflow/test/test_OF_accuracy.cpp index 26f6b3668c4..2e8940b3a2c 100644 --- a/modules/optflow/test/test_OF_accuracy.cpp +++ b/modules/optflow/test/test_OF_accuracy.cpp @@ -198,7 +198,7 @@ TEST(SparseOpticalFlow, ReferenceAccuracy) param->solverType = ST_BILINEAR; algo->setRLOFOpticalFlowParameter(param); algo->calc(frame1, frame2, prevPts, currPts, status, err); - EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.25f); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.27f); param->solverType = ST_STANDART; algo->setRLOFOpticalFlowParameter(param); @@ -209,12 +209,12 @@ TEST(SparseOpticalFlow, ReferenceAccuracy) param->solverType = ST_BILINEAR; algo->setRLOFOpticalFlowParameter(param); algo->calc(frame1, frame2, prevPts, currPts, status, err); - EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.25f); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.27f); param->solverType = ST_STANDART; algo->setRLOFOpticalFlowParameter(param); algo->calc(frame1, frame2, prevPts, currPts, status, err); - EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.25f); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.27f); param->normSigma0 = numeric_limits::max(); param->normSigma1 = numeric_limits::max(); @@ -235,7 +235,7 @@ TEST(SparseOpticalFlow, ReferenceAccuracy) param->solverType = ST_BILINEAR; algo->setRLOFOpticalFlowParameter(param); algo->calc(frame1, frame2, prevPts, currPts, status, err); - EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.25f); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.80f); param->solverType = ST_STANDART; algo->setRLOFOpticalFlowParameter(param); @@ -260,14 +260,14 @@ TEST(DenseOpticalFlow_RLOF, ReferenceAccuracy) ASSERT_EQ(GT.rows, flow.rows); ASSERT_EQ(GT.cols, flow.cols); - EXPECT_LE(calcRMSE(GT, flow), 0.37f); + EXPECT_LE(calcRMSE(GT, flow), 0.44f); algo->setInterpolation(INTERP_GEO); algo->calc(frame1, frame2, flow); ASSERT_EQ(GT.rows, flow.rows); ASSERT_EQ(GT.cols, flow.cols); - EXPECT_LE(calcRMSE(GT, flow), 0.49f); + 
EXPECT_LE(calcRMSE(GT, flow), 0.53f); } From 221b861afc68763da26ebc8861553cd3b03ba0aa Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Sun, 16 Dec 2018 23:26:37 +0000 Subject: [PATCH 24/25] more 'static' functions --- modules/optflow/src/rlof/rlof_localflow.cpp | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/modules/optflow/src/rlof/rlof_localflow.cpp b/modules/optflow/src/rlof/rlof_localflow.cpp index feb515d51d1..8689aa3ed9f 100644 --- a/modules/optflow/src/rlof/rlof_localflow.cpp +++ b/modules/optflow/src/rlof/rlof_localflow.cpp @@ -245,25 +245,19 @@ void preCalcCrossSegmentation( } -int buildOpticalFlowPyramidScale(InputArray _img, OutputArrayOfArrays pyramid, Size winSize, int maxLevel, bool withDerivatives, - int pyrBorder, int derivBorder, bool tryReuseInputImage, float levelScale[2]); -void calcLocalOpticalFlowCore(Ptr prevPyramids[2], Ptr currPyramids[2], InputArray _prevPts, - InputOutputArray _nextPts, const RLOFOpticalFlowParameter & param); - -void preprocess(Ptr prevPyramids[2], Ptr currPyramids[2], const std::vector & prevPoints, - std::vector & currPoints, const RLOFOpticalFlowParameter & param); - -inline bool isrobust(const RLOFOpticalFlowParameter & param) +static inline +bool isrobust(const RLOFOpticalFlowParameter & param) { return (param.normSigma0 < 255 && param.normSigma1 < 255); } -inline std::vector get_norm(float sigma0, float sigma1) +static inline +std::vector get_norm(float sigma0, float sigma1) { std::vector result = { sigma0, sigma1, sigma0 / (sigma0 - sigma1), sigma0 * sigma1 }; return result; } - +static int buildOpticalFlowPyramidScale(InputArray _img, OutputArrayOfArrays pyramid, Size winSize, int maxLevel, bool withDerivatives, int pyrBorder, int derivBorder, bool tryReuseInputImage, float levelScale[2]) { @@ -368,6 +362,7 @@ int CImageBuffer::buildPyramid(cv::Size winSize, int maxLevel, float levelScale[ return m_maxLevel; } +static void calcLocalOpticalFlowCore( Ptr prevPyramids[2], 
Ptr currPyramids[2], @@ -581,6 +576,7 @@ void calcLocalOpticalFlowCore( } } +static void preprocess(Ptr prevPyramids[2], Ptr currPyramids[2], const std::vector & prevPoints, From dcdc9c9ca94a7220df0e65fbdd95a8f2ec90890a Mon Sep 17 00:00:00 2001 From: senst Date: Mon, 17 Dec 2018 22:02:40 +0100 Subject: [PATCH 25/25] bugfix after changing ptr + step to .ptr(y,x) calls by adjusting ROI of prevImage, currImage and derivI as well as changing the offset of the points in the invoker classes. add some static_cast to avoid warning remove 50 grid size sample from perf test. This grid size is to sparse for the epic interpolation remove notSameColor function since it is not used anymore --- modules/optflow/perf/perf_rlof.cpp | 2 +- modules/optflow/src/rlof/berlof_invoker.hpp | 1560 ++++++++--------- modules/optflow/src/rlof/plk_invoker.hpp | 42 +- modules/optflow/src/rlof/rlof_invoker.hpp | 38 +- modules/optflow/src/rlof/rlof_invokerbase.hpp | 14 - modules/optflow/src/rlof/rlof_localflow.cpp | 13 +- modules/optflow/test/test_OF_accuracy.cpp | 4 +- 7 files changed, 818 insertions(+), 855 deletions(-) diff --git a/modules/optflow/perf/perf_rlof.cpp b/modules/optflow/perf/perf_rlof.cpp index a273a9456a2..1295c526a6e 100644 --- a/modules/optflow/perf/perf_rlof.cpp +++ b/modules/optflow/perf/perf_rlof.cpp @@ -48,7 +48,7 @@ typedef TestBaseWithParam INTERP_GRID_Dense; PERF_TEST_P(INTERP_GRID_Dense, OpticalFlow_DenseRLOF, testing::Combine( testing::Values("INTERP_EPIC", "INTERP_GEO"), - testing::Values(4,10,50)) + testing::Values(4,10)) ) { Mat flow; diff --git a/modules/optflow/src/rlof/berlof_invoker.hpp b/modules/optflow/src/rlof/berlof_invoker.hpp index 5e618562159..604305250a5 100644 --- a/modules/optflow/src/rlof/berlof_invoker.hpp +++ b/modules/optflow/src/rlof/berlof_invoker.hpp @@ -97,9 +97,6 @@ class TrackerInvoker : public cv::ParallelLoopBody void operator()(const cv::Range& range) const CV_OVERRIDE { -#ifdef DEBUG_INVOKER - printf("berlof::ica");fflush(stdout); -#endif 
Point2f halfWin; cv::Size winSize; const Mat& I = *prevImg; @@ -153,12 +150,12 @@ class TrackerInvoker : public cv::ParallelLoopBody { continue; } - prevPt -= halfWin; + halfWin = Point2f(static_cast(maxWinSize), static_cast(maxWinSize) ) - halfWin; + prevPt += halfWin; iprevPt.x = cvFloor(prevPt.x); iprevPt.y = cvFloor(prevPt.y); - - if( iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols || - iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows ) + if( iprevPt.x < 0 || iprevPt.x >= derivI.cols - winSize.width || + iprevPt.y < 0 || iprevPt.y >= derivI.rows - winSize.height - 1) { if( level == 0 ) { @@ -215,7 +212,7 @@ class TrackerInvoker : public cv::ParallelLoopBody cv::Mat residualMat = cv::Mat::zeros(winSize.height * (winSize.width + 8) * cn, 1, CV_16SC1); cv::Point2f backUpNextPt = nextPt; - nextPt -= halfWin; + nextPt += halfWin; Point2f prevDelta(0,0); //denotes h(t-1) cv::Size _winSize = winSize; #ifdef RLOF_SSE @@ -243,9 +240,8 @@ class TrackerInvoker : public cv::ParallelLoopBody { inextPt.x = cvFloor(nextPt.x); inextPt.y = cvFloor(nextPt.y); - - if( inextPt.x < -winSize.width || inextPt.x >= J.cols || - inextPt.y < -winSize.height || inextPt.y >= J.rows ) + if( inextPt.x < 0 || inextPt.x >= J.cols - winSize.width || + inextPt.y < 0 || inextPt.y >= J.rows - winSize.height - 1) { if( level == 0 && status ) status[ptidx] = 3; @@ -511,12 +507,12 @@ class TrackerInvoker : public cv::ParallelLoopBody delta.x = MAX(-1.f, MIN(1.f, delta.x)); delta.y = MAX(-1.f, MIN(1.f, delta.y)); nextPt += delta; - nextPts[ptidx] = nextPt + halfWin; + nextPts[ptidx] = nextPt - halfWin; } else { nextPt += delta; - nextPts[ptidx] = nextPt + halfWin; + nextPts[ptidx] = nextPt - halfWin; noSolvedIteration++; break; } @@ -614,9 +610,6 @@ class TrackerInvoker : public cv::ParallelLoopBody void operator()(const cv::Range& range) const CV_OVERRIDE { -#ifdef DEBUG_INVOKER - printf("berlof::radial");fflush(stdout); -#endif Point2f halfWin; cv::Size winSize; const Mat& I = 
*prevImg; @@ -667,13 +660,12 @@ class TrackerInvoker : public cv::ParallelLoopBody winMaskMat,winSize,halfWin,winArea, minWinSize,maxWinSize) == false) continue; - - prevPt -= halfWin; + halfWin = Point2f(static_cast(maxWinSize), static_cast(maxWinSize) ) - halfWin; + prevPt += halfWin; iprevPt.x = cvFloor(prevPt.x); iprevPt.y = cvFloor(prevPt.y); - - if( iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols || - iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows ) + if( iprevPt.x < 0 || iprevPt.x >= derivI.cols - winSize.width || + iprevPt.y < 0 || iprevPt.y >= derivI.rows - winSize.height - 1) { if( level == 0 ) { @@ -797,9 +789,8 @@ class TrackerInvoker : public cv::ParallelLoopBody } cv::Mat residualMat = cv::Mat::zeros(winSize.height * (winSize.width + 8) * cn, 1, CV_16SC1); - cv::Point2f backUpNextPt = nextPt; - nextPt -= halfWin; + nextPt += halfWin; Point2f prevDelta(0,0); //relates to h(t-1) Point2f prevGain(1,0); cv::Point2f gainVec = gainVecs[ptidx]; @@ -827,626 +818,626 @@ class TrackerInvoker : public cv::ParallelLoopBody a = nextPt.x - inextPt.x; b = nextPt.y - inextPt.y; float ab = a * b; - if( j == 0 - || (inextPt.x != cvFloor(nextPt.x) || inextPt.y != cvFloor(nextPt.y) || j % 2 != 0) ) + if (j == 0 + || (inextPt.x != cvFloor(nextPt.x) || inextPt.y != cvFloor(nextPt.y) || j % 2 != 0) ) { - inextPt.x = cvFloor(nextPt.x); - inextPt.y = cvFloor(nextPt.y); + inextPt.x = cvFloor(nextPt.x); + inextPt.y = cvFloor(nextPt.y); - if( inextPt.x < -winSize.width || inextPt.x >= J.cols || - inextPt.y < -winSize.height || inextPt.y >= J.rows ) - { - if( level == 0 && status ) - status[ptidx] = 3; - if (level > 0) + if( inextPt.x < 0 || inextPt.x >= J.cols - winSize.width || + inextPt.y < 0 || inextPt.y >= J.rows - winSize.height - 1) { - nextPts[ptidx] = backUpNextPt; - gainVecs[ptidx] = backUpGain; + if( level == 0 && status ) + status[ptidx] = 3; + if (level > 0) + { + nextPts[ptidx] = backUpNextPt; + gainVecs[ptidx] = backUpGain; + } + noIteration++; + 
break; } - noIteration++; - break; - } - a = nextPt.x - inextPt.x; - b = nextPt.y - inextPt.y; - ab = a * b; - iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); - iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); - iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); - iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - // mismatch - - if( j == 0) - { - // tensor - w1 = 0; // -IxI - w2 = 0; // -IyI - dI = 0; // I^2 - sumIx = 0; - sumIy = 0; - sumI = 0; - sumW = 0; - A11 = 0; - A12 = 0; - A22 = 0; - } + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + ab = a * b; + iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + // mismatch + + if( j == 0) + { + // tensor + w1 = 0; // -IxI + w2 = 0; // -IyI + dI = 0; // I^2 + sumIx = 0; + sumIy = 0; + sumI = 0; + sumW = 0; + A11 = 0; + A12 = 0; + A22 = 0; + } + + if ( j == 0 ) + { + buffIdx = 0; + for( y = 0; y < winSize.height; y++ ) + { + const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const uchar* Jptr1 = J.ptr(y + inextPt.y + 1, inextPt.x*cn); + const short* Iptr = IWinBuf.ptr(y, 0); + const short* dIptr = derivIWinBuf.ptr(y, 0); + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); + x = 0; + for( ; x < winSize.width*cn; x++, dIptr += 2) + { + if( maskPtr[x] == 0) + continue; + int diff = static_cast(CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + Jptr1[x]*iw10 + Jptr1[x+cn]*iw11, W_BITS1-5) + - Iptr[x] + Iptr[x] * gainVec.x +gainVec.y); + residualMat.at(buffIdx++) = static_cast(diff); + } + } + /*! Estimation for the residual */ + cv::Mat residualMatRoi(residualMat, cv::Rect(0,0,1, buffIdx)); + MEstimatorScale = (buffIdx == 0) ? 
1.f : estimateScale(residualMatRoi); + } + + float eta = 1.f / winArea; + float fParam0 = normSigma0 * 32.f; + float fParam1 = normSigma1 * 32.f; + fParam0 = normSigma0 * MEstimatorScale; + fParam1 = normSigma1 * MEstimatorScale; + + #ifdef RLOF_SSE + + qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128 qb0[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; + __m128 qb1[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; + __m128 qb2[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; + __m128 qb3[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; + __m128 mmSumW1 = _mm_setzero_ps(), mmSumW2 = _mm_setzero_ps(); + __m128 mmSumI = _mm_setzero_ps(), mmSumW = _mm_setzero_ps(), mmSumDI = _mm_setzero_ps(); + __m128 mmSumIy = _mm_setzero_ps(), mmSumIx = _mm_setzero_ps(); + __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps(); + __m128i mmParam0 = _mm_set1_epi16(MIN(std::numeric_limits::max() -1, static_cast(fParam0))); + __m128i mmParam1 = _mm_set1_epi16(MIN(std::numeric_limits::max()- 1, static_cast(fParam1))); + + + float s2Val = std::fabs(normSigma2); + int s2bitShift = normSigma2 == 0 ? 1 : cvCeil(log(200.f / s2Val) / log(2.f)); + __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast(normSigma2 * (float)(1 << s2bitShift))); + __m128i mmOness_epi16 = _mm_set1_epi16(1 << s2bitShift); + __m128 mmParam2s = _mm_set1_ps(0.01f * normSigma2); + __m128 mmParam2s2 = _mm_set1_ps(normSigma2 * normSigma2); + float gainVal = gainVec.x > 0 ? gainVec.x : -gainVec.x; + int bitShift = gainVec.x == 0 ? 
1 : cvCeil(log(200.f / gainVal) / log(2.f)); + __m128i mmGainValue_epi16 = _mm_set1_epi16(static_cast(gainVec.x * (float)(1 << bitShift))); + __m128i mmConstValue_epi16 = _mm_set1_epi16(static_cast(gainVec.y)); + __m128i mmEta = _mm_setzero_si128(); + __m128i mmScale = _mm_set1_epi16(static_cast(MEstimatorScale)); + + #endif - if ( j == 0 ) - { buffIdx = 0; - for( y = 0; y < winSize.height; y++ ) + float _b0[4] = {0,0,0,0}; + float _b1[4] = {0,0,0,0}; + float _b2[4] = {0,0,0,0}; + float _b3[4] = {0,0,0,0}; + /* + */ + for( y = 0; y < _winSize.height; y++ ) { - const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); const uchar* Jptr1 = J.ptr(y + inextPt.y + 1, inextPt.x*cn); const short* Iptr = IWinBuf.ptr(y, 0); const short* dIptr = derivIWinBuf.ptr(y, 0); const tMaskType* maskPtr = winMaskMat.ptr(y, 0); x = 0; - for( ; x < winSize.width*cn; x++, dIptr += 2) + #ifdef RLOF_SSE + for( ; x <= _winSize.width*cn; x += 8, dIptr += 8*2 ) { - if( maskPtr[x] == 0) - continue; - int diff = static_cast(CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + Jptr1[x]*iw10 + Jptr1[x+cn]*iw11, W_BITS1-5) - - Iptr[x] + Iptr[x] * gainVec.x +gainVec.y); - residualMat.at(buffIdx++) = static_cast(diff); - } - } - /*! Estimation for the residual */ - cv::Mat residualMatRoi(residualMat, cv::Rect(0,0,1, buffIdx)); - MEstimatorScale = (buffIdx == 0) ? 
1.f : estimateScale(residualMatRoi); - } - - float eta = 1.f / winArea; - float fParam0 = normSigma0 * 32.f; - float fParam1 = normSigma1 * 32.f; - fParam0 = normSigma0 * MEstimatorScale; - fParam1 = normSigma1 * MEstimatorScale; + __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16); + __m128i I_0_7_epi16 = _mm_loadu_si128((const __m128i*)(Iptr + x)); -#ifdef RLOF_SSE + __m128i v00 = _mm_unpacklo_epi8( + _mm_loadl_epi64((const __m128i*)(Jptr + x)), z); + __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); + __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x)), z); + __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x + cn)), z); - qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); - qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); - __m128 qb0[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; - __m128 qb1[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; - __m128 qb2[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; - __m128 qb3[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; - __m128 mmSumW1 = _mm_setzero_ps(), mmSumW2 = _mm_setzero_ps(); - __m128 mmSumI = _mm_setzero_ps(), mmSumW = _mm_setzero_ps(), mmSumDI = _mm_setzero_ps(); - __m128 mmSumIy = _mm_setzero_ps(), mmSumIx = _mm_setzero_ps(); - __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps(); - __m128i mmParam0 = _mm_set1_epi16(MIN(std::numeric_limits::max() -1, static_cast(fParam0))); - __m128i mmParam1 = _mm_set1_epi16(MIN(std::numeric_limits::max()- 1, static_cast(fParam1))); - - - float s2Val = std::fabs(normSigma2); - int s2bitShift = normSigma2 == 0 ? 
1 : cvCeil(log(200.f / s2Val) / log(2.f)); - __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast(normSigma2 * (float)(1 << s2bitShift))); - __m128i mmOness_epi16 = _mm_set1_epi16(1 << s2bitShift); - __m128 mmParam2s = _mm_set1_ps(0.01f * normSigma2); - __m128 mmParam2s2 = _mm_set1_ps(normSigma2 * normSigma2); - float gainVal = gainVec.x > 0 ? gainVec.x : -gainVec.x; - int bitShift = gainVec.x == 0 ? 1 : cvCeil(log(200.f / gainVal) / log(2.f)); - __m128i mmGainValue_epi16 = _mm_set1_epi16(static_cast(gainVec.x * (float)(1 << bitShift))); - __m128i mmConstValue_epi16 = _mm_set1_epi16(static_cast(gainVec.y)); - __m128i mmEta = _mm_setzero_si128(); - __m128i mmScale = _mm_set1_epi16(static_cast(MEstimatorScale)); + __m128i t0 = _mm_add_epi32 + (_mm_madd_epi16( + _mm_unpacklo_epi16(v00, v01), + qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); -#endif - - buffIdx = 0; - float _b0[4] = {0,0,0,0}; - float _b1[4] = {0,0,0,0}; - float _b2[4] = {0,0,0,0}; - float _b3[4] = {0,0,0,0}; - /* - */ - for( y = 0; y < _winSize.height; y++ ) - { - const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); - const uchar* Jptr1 = J.ptr(y + inextPt.y + 1, inextPt.x*cn); - const short* Iptr = IWinBuf.ptr(y, 0); - const short* dIptr = derivIWinBuf.ptr(y, 0); - const tMaskType* maskPtr = winMaskMat.ptr(y, 0); - x = 0; -#ifdef RLOF_SSE - for( ; x <= _winSize.width*cn; x += 8, dIptr += 8*2 ) - { - __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16); - __m128i I_0_7_epi16 = _mm_loadu_si128((const __m128i*)(Iptr + x)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1-5); - __m128i v00 = _mm_unpacklo_epi8( - _mm_loadl_epi64((const __m128i*)(Jptr + x)), z); - __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const 
__m128i*)(Jptr + x + cn)), z); - __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x)), z); - __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x + cn)), z); + __m128i lo = _mm_mullo_epi16(mmGainValue_epi16, I_0_7_epi16); + __m128i hi = _mm_mulhi_epi16(mmGainValue_epi16, I_0_7_epi16); - __m128i t0 = _mm_add_epi32 - (_mm_madd_epi16( - _mm_unpacklo_epi16(v00, v01), - qw0), - _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); - __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + __m128i Igain_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), bitShift); + __m128i Igain_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), bitShift); + __m128i Igain_epi16 = _mm_packs_epi32(Igain_0_3_epi32, Igain_4_7_epi32); - t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); - t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1-5); - __m128i lo = _mm_mullo_epi16(mmGainValue_epi16, I_0_7_epi16); - __m128i hi = _mm_mulhi_epi16(mmGainValue_epi16, I_0_7_epi16); + __m128i diffValue = _mm_subs_epi16(_mm_add_epi16(Igain_epi16, mmConstValue_epi16), I_0_7_epi16); + __m128i mmDiffc_epi16[4] = + { + _mm_add_epi16(_mm_slli_epi16(v11, 5), diffValue), + _mm_add_epi16(_mm_slli_epi16(v01, 5), diffValue), + _mm_add_epi16(_mm_slli_epi16(v10, 5), diffValue), + _mm_add_epi16(_mm_slli_epi16(v00, 5), diffValue) + }; - __m128i Igain_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), bitShift); - __m128i Igain_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), bitShift); - __m128i Igain_epi16 = _mm_packs_epi32(Igain_0_3_epi32, Igain_4_7_epi32); + __m128i mmDiff_epi16 = _mm_add_epi16( _mm_packs_epi32(t0, t1), diffValue); - __m128i diffValue = _mm_subs_epi16(_mm_add_epi16(Igain_epi16, mmConstValue_epi16), I_0_7_epi16); - __m128i mmDiffc_epi16[4] = - { - _mm_add_epi16(_mm_slli_epi16(v11, 5), diffValue), - _mm_add_epi16(_mm_slli_epi16(v01, 5), diffValue), - 
_mm_add_epi16(_mm_slli_epi16(v10, 5), diffValue), - _mm_add_epi16(_mm_slli_epi16(v00, 5), diffValue) - }; + mmDiff_epi16 = _mm_and_si128(mmDiff_epi16, mask_0_7_epi16); - __m128i mmDiff_epi16 = _mm_add_epi16( _mm_packs_epi32(t0, t1), diffValue); + __m128i scalediffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, mmScale); + mmEta = _mm_add_epi16(mmEta, _mm_add_epi16(_mm_and_si128(scalediffIsPos_epi16, _mm_set1_epi16(2)), _mm_set1_epi16(-1))); - mmDiff_epi16 = _mm_and_si128(mmDiff_epi16, mask_0_7_epi16); + __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); + __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); - __m128i scalediffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, mmScale); - mmEta = _mm_add_epi16(mmEta, _mm_add_epi16(_mm_and_si128(scalediffIsPos_epi16, _mm_set1_epi16(2)), _mm_set1_epi16(-1))); + __m128i abs_epi16 = _mm_abs_epi16(mmDiff_epi16); + __m128i bSet2_epi16, bSet1_epi16; + // |It| < sigma1 ? + bSet2_epi16 = _mm_cmplt_epi16(abs_epi16, mmParam1); + // It > 0 ? + __m128i diffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, _mm_setzero_si128()); + // sigma0 < |It| < sigma1 ? + bSet1_epi16 = _mm_and_si128(bSet2_epi16, _mm_cmpgt_epi16(abs_epi16, mmParam0)); + // val = |It| -/+ sigma1 + __m128i tmpParam1_epi16 = _mm_add_epi16(_mm_and_si128(diffIsPos_epi16, _mm_sub_epi16(mmDiff_epi16, mmParam1)), + _mm_andnot_si128(diffIsPos_epi16, _mm_add_epi16(mmDiff_epi16, mmParam1))); + // It == 0 ? |It| > sigma13 + mmDiff_epi16 = _mm_and_si128(bSet2_epi16, mmDiff_epi16); - __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); - __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); + for( unsigned int mmi = 0; mmi < 4; mmi++) + { + __m128i mmDiffc_epi16_t = _mm_and_si128(mmDiffc_epi16[mmi], mask_0_7_epi16); + mmDiffc_epi16_t = _mm_and_si128(bSet2_epi16, mmDiffc_epi16_t); + + // It == val ? 
sigma0 < |It| < sigma1 + mmDiffc_epi16_t = _mm_blendv_epi8(mmDiffc_epi16_t, tmpParam1_epi16, bSet1_epi16); + __m128i tale_epi16_ = _mm_blendv_epi8(mmOness_epi16, mmParam2_epi16, bSet1_epi16); // mask for 0 - 3 + // diff = diff * sigma2 + lo = _mm_mullo_epi16(tale_epi16_, mmDiffc_epi16_t); + hi = _mm_mulhi_epi16(tale_epi16_, mmDiffc_epi16_t); + __m128i diff_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), s2bitShift); + __m128i diff_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), s2bitShift); + + mmDiffc_epi16_t = _mm_packs_epi32(diff_0_3_epi32, diff_4_7_epi32); + __m128i diff1 = _mm_unpackhi_epi16(mmDiffc_epi16_t, mmDiffc_epi16_t); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13... + __m128i diff0 = _mm_unpacklo_epi16(mmDiffc_epi16_t, mmDiffc_epi16_t); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9... + + // Ix * diff / Iy * diff + v10 = _mm_mullo_epi16(Ixy_0, diff0); + v11 = _mm_mulhi_epi16(Ixy_0, diff0); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + + qb0[mmi] = _mm_add_ps(qb0[mmi], _mm_cvtepi32_ps(v00)); + qb1[mmi] = _mm_add_ps(qb1[mmi], _mm_cvtepi32_ps(v10)); + // It * Ix It * Iy [4 ... 7] + // for set 1 hi sigma 1 + v10 = _mm_mullo_epi16(Ixy_1, diff1); + v11 = _mm_mulhi_epi16(Ixy_1, diff1); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb0[mmi] = _mm_add_ps(qb0[mmi], _mm_cvtepi32_ps(v00)); + qb1[mmi] = _mm_add_ps(qb1[mmi], _mm_cvtepi32_ps(v10)); + // diff * J [0 ... 
7] + // for set 1 sigma 1 + // b3 += diff * Iptr[x] + v10 = _mm_mullo_epi16(mmDiffc_epi16_t, I_0_7_epi16); + v11 = _mm_mulhi_epi16(mmDiffc_epi16_t, I_0_7_epi16); + v00 = _mm_unpacklo_epi16(v10, v11); + + v10 = _mm_unpackhi_epi16(v10, v11); + qb2[mmi] = _mm_add_ps(qb2[mmi], _mm_cvtepi32_ps(v00)); + qb2[mmi] = _mm_add_ps(qb2[mmi], _mm_cvtepi32_ps(v10)); + qb3[mmi] = _mm_add_ps(qb3[mmi], _mm_cvtepi32_ps(diff_0_3_epi32)); + qb3[mmi] = _mm_add_ps(qb3[mmi], _mm_cvtepi32_ps(diff_4_7_epi32)); + } + if( j == 0 ) + { + __m128 bSet1_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet1_epi16)); + __m128 bSet1_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet1_epi16,bSet1_epi16), 16)); + __m128 mask_0_4_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mask_0_7_epi16)); + __m128 mask_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mask_0_7_epi16, mask_0_7_epi16),16))); - __m128i abs_epi16 = _mm_abs_epi16(mmDiff_epi16); - __m128i bSet2_epi16, bSet1_epi16; - // |It| < sigma1 ? - bSet2_epi16 = _mm_cmplt_epi16(abs_epi16, mmParam1); - // It > 0 ? - __m128i diffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, _mm_setzero_si128()); - // sigma0 < |It| < sigma1 ? - bSet1_epi16 = _mm_and_si128(bSet2_epi16, _mm_cmpgt_epi16(abs_epi16, mmParam0)); - // val = |It| -/+ sigma1 - __m128i tmpParam1_epi16 = _mm_add_epi16(_mm_and_si128(diffIsPos_epi16, _mm_sub_epi16(mmDiff_epi16, mmParam1)), - _mm_andnot_si128(diffIsPos_epi16, _mm_add_epi16(mmDiff_epi16, mmParam1))); - // It == 0 ? |It| > sigma13 - mmDiff_epi16 = _mm_and_si128(bSet2_epi16, mmDiff_epi16); + __m128 bSet2_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet2_epi16)); + __m128 bSet2_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet2_epi16, bSet2_epi16),16)); - for( unsigned int mmi = 0; mmi < 4; mmi++) - { - __m128i mmDiffc_epi16_t = _mm_and_si128(mmDiffc_epi16[mmi], mask_0_7_epi16); - mmDiffc_epi16_t = _mm_and_si128(bSet2_epi16, mmDiffc_epi16_t); - - // It == val ? 
sigma0 < |It| < sigma1 - mmDiffc_epi16_t = _mm_blendv_epi8(mmDiffc_epi16_t, tmpParam1_epi16, bSet1_epi16); - __m128i tale_epi16_ = _mm_blendv_epi8(mmOness_epi16, mmParam2_epi16, bSet1_epi16); // mask for 0 - 3 - // diff = diff * sigma2 - lo = _mm_mullo_epi16(tale_epi16_, mmDiffc_epi16_t); - hi = _mm_mulhi_epi16(tale_epi16_, mmDiffc_epi16_t); - __m128i diff_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), s2bitShift); - __m128i diff_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), s2bitShift); - - mmDiffc_epi16_t = _mm_packs_epi32(diff_0_3_epi32, diff_4_7_epi32); - __m128i diff1 = _mm_unpackhi_epi16(mmDiffc_epi16_t, mmDiffc_epi16_t); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13... - __m128i diff0 = _mm_unpacklo_epi16(mmDiffc_epi16_t, mmDiffc_epi16_t); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9... - - // Ix * diff / Iy * diff - v10 = _mm_mullo_epi16(Ixy_0, diff0); - v11 = _mm_mulhi_epi16(Ixy_0, diff0); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); + __m128 tale_0_3_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_0_3_ps); + __m128 tale_4_7_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_4_7_ps); + tale_0_3_ps = _mm_blendv_ps(mmParam2s, tale_0_3_ps, bSet2_0_3_ps); + tale_4_7_ps = _mm_blendv_ps(mmParam2s, tale_4_7_ps, bSet2_4_7_ps); - qb0[mmi] = _mm_add_ps(qb0[mmi], _mm_cvtepi32_ps(v00)); - qb1[mmi] = _mm_add_ps(qb1[mmi], _mm_cvtepi32_ps(v10)); - // It * Ix It * Iy [4 ... 7] - // for set 1 hi sigma 1 - v10 = _mm_mullo_epi16(Ixy_1, diff1); - v11 = _mm_mulhi_epi16(Ixy_1, diff1); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qb0[mmi] = _mm_add_ps(qb0[mmi], _mm_cvtepi32_ps(v00)); - qb1[mmi] = _mm_add_ps(qb1[mmi], _mm_cvtepi32_ps(v10)); - // diff * J [0 ... 
7] - // for set 1 sigma 1 - // b3 += diff * Iptr[x] - v10 = _mm_mullo_epi16(mmDiffc_epi16_t, I_0_7_epi16); - v11 = _mm_mulhi_epi16(mmDiffc_epi16_t, I_0_7_epi16); - v00 = _mm_unpacklo_epi16(v10, v11); + tale_0_3_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_0_3_ps, mask_0_4_ps); + tale_4_7_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_4_7_ps, mask_4_7_ps); - v10 = _mm_unpackhi_epi16(v10, v11); - qb2[mmi] = _mm_add_ps(qb2[mmi], _mm_cvtepi32_ps(v00)); - qb2[mmi] = _mm_add_ps(qb2[mmi], _mm_cvtepi32_ps(v10)); - qb3[mmi] = _mm_add_ps(qb3[mmi], _mm_cvtepi32_ps(diff_0_3_epi32)); - qb3[mmi] = _mm_add_ps(qb3[mmi], _mm_cvtepi32_ps(diff_4_7_epi32)); - } + t0 = _mm_srai_epi32(Ixy_0, 16); // Iy0 Iy1 Iy2 Iy3 + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16); // Ix0 Ix1 Ix2 Ix3 - if( j == 0 ) - { - __m128 bSet1_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet1_epi16)); - __m128 bSet1_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet1_epi16,bSet1_epi16), 16)); - __m128 mask_0_4_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mask_0_7_epi16)); - __m128 mask_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mask_0_7_epi16, mask_0_7_epi16),16))); + __m128 fy = _mm_cvtepi32_ps(t0); + __m128 fx = _mm_cvtepi32_ps(t1); - __m128 bSet2_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet2_epi16)); - __m128 bSet2_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet2_epi16, bSet2_epi16),16)); + // 0 ... 
3 + __m128 I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(I_0_7_epi16, I_0_7_epi16), 16)); - __m128 tale_0_3_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_0_3_ps); - __m128 tale_4_7_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_4_7_ps); - tale_0_3_ps = _mm_blendv_ps(mmParam2s, tale_0_3_ps, bSet2_0_3_ps); - tale_4_7_ps = _mm_blendv_ps(mmParam2s, tale_4_7_ps, bSet2_4_7_ps); + // A11 - A22 + __m128 fxtale = _mm_mul_ps(fx, tale_0_3_ps); + __m128 fytale = _mm_mul_ps(fy, tale_0_3_ps); - tale_0_3_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_0_3_ps, mask_0_4_ps); - tale_4_7_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_4_7_ps, mask_4_7_ps); + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); + mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); + mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); - t0 = _mm_srai_epi32(Ixy_0, 16); // Iy0 Iy1 Iy2 Iy3 - t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16); // Ix0 Ix1 Ix2 Ix3 + // sumIx und sumIy + mmSumIx = _mm_add_ps(mmSumIx, fxtale); + mmSumIy = _mm_add_ps(mmSumIy, fytale); - __m128 fy = _mm_cvtepi32_ps(t0); - __m128 fx = _mm_cvtepi32_ps(t1); + mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fxtale)); + mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fytale)); - // 0 ... 
3 - __m128 I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(I_0_7_epi16, I_0_7_epi16), 16)); + // sumI + __m128 I_tale_ps = _mm_mul_ps(I_ps, tale_0_3_ps); + mmSumI = _mm_add_ps(mmSumI,I_tale_ps); - // A11 - A22 - __m128 fxtale = _mm_mul_ps(fx, tale_0_3_ps); - __m128 fytale = _mm_mul_ps(fy, tale_0_3_ps); + // sumW + mmSumW = _mm_add_ps(mmSumW, tale_0_3_ps); - mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); - mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); - mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); + // sumDI + mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps( I_ps, I_tale_ps)); - // sumIx und sumIy - mmSumIx = _mm_add_ps(mmSumIx, fxtale); - mmSumIy = _mm_add_ps(mmSumIy, fytale); - mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fxtale)); - mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fytale)); + t0 = _mm_srai_epi32(Ixy_1, 16); // Iy8 Iy9 Iy10 Iy11 + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_1, 16), 16); // Ix0 Ix1 Ix2 Ix3 - // sumI - __m128 I_tale_ps = _mm_mul_ps(I_ps, tale_0_3_ps); - mmSumI = _mm_add_ps(mmSumI,I_tale_ps); + fy = _mm_cvtepi32_ps(t0); + fx = _mm_cvtepi32_ps(t1); - // sumW - mmSumW = _mm_add_ps(mmSumW, tale_0_3_ps); + // 4 ... 
7 + I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(I_0_7_epi16, I_0_7_epi16), 16)); - // sumDI - mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps( I_ps, I_tale_ps)); + // A11 - A22 + fxtale = _mm_mul_ps(fx, tale_4_7_ps); + fytale = _mm_mul_ps(fy, tale_4_7_ps); + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); + mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); + mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); - t0 = _mm_srai_epi32(Ixy_1, 16); // Iy8 Iy9 Iy10 Iy11 - t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_1, 16), 16); // Ix0 Ix1 Ix2 Ix3 + // sumIx und sumIy + mmSumIx = _mm_add_ps(mmSumIx, fxtale); + mmSumIy = _mm_add_ps(mmSumIy, fytale); - fy = _mm_cvtepi32_ps(t0); - fx = _mm_cvtepi32_ps(t1); + mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fxtale)); + mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fytale)); - // 4 ... 7 - I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(I_0_7_epi16, I_0_7_epi16), 16)); + // sumI + I_tale_ps = _mm_mul_ps(I_ps, tale_4_7_ps); + mmSumI = _mm_add_ps(mmSumI, I_tale_ps); - // A11 - A22 - fxtale = _mm_mul_ps(fx, tale_4_7_ps); - fytale = _mm_mul_ps(fy, tale_4_7_ps); + // sumW + mmSumW = _mm_add_ps(mmSumW, tale_4_7_ps); - mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); - mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); - mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); + // sumDI + mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps( I_ps, I_tale_ps)); + } - // sumIx und sumIy - mmSumIx = _mm_add_ps(mmSumIx, fxtale); - mmSumIy = _mm_add_ps(mmSumIy, fytale); + } + #else + for( ; x < _winSize.width*cn; x++, dIptr += 2 ) + { + if( maskPtr[x] == 0) + continue; - mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fxtale)); - mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fytale)); + short ixval = dIptr[0]; + short iyval = dIptr[1]; + int illValue = static_cast(Iptr[x] * gainVec.x + gainVec.y - Iptr[x]); - // sumI - I_tale_ps = _mm_mul_ps(I_ps, tale_4_7_ps); - mmSumI = _mm_add_ps(mmSumI, I_tale_ps); + int It[4] = {(Jptr1[x+cn]<< 5) + 
illValue, + (Jptr[x+cn]<< 5) + illValue, + (Jptr1[x]<< 5) + illValue, + (Jptr[x] << 5) + illValue}; - // sumW - mmSumW = _mm_add_ps(mmSumW, tale_4_7_ps); - // sumDI - mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps( I_ps, I_tale_ps)); - } + int J_val = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + + Jptr1[x]*iw10 + Jptr1[x+cn]*iw11, + W_BITS1-5); - } -#else - for( ; x < _winSize.width*cn; x++, dIptr += 2 ) - { - if( maskPtr[x] == 0) - continue; + int diff = J_val + illValue; - short ixval = dIptr[0]; - short iyval = dIptr[1]; - int illValue = static_cast(Iptr[x] * gainVec.x + gainVec.y - Iptr[x]); - int It[4] = {(Jptr1[x+cn]<< 5) + illValue, - (Jptr[x+cn]<< 5) + illValue, - (Jptr1[x]<< 5) + illValue, - (Jptr[x] << 5) + illValue}; + MEstimatorScale += (diff < MEstimatorScale) ? -eta : eta; - int J_val = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + - Jptr1[x]*iw10 + Jptr1[x+cn]*iw11, - W_BITS1-5); + int abss = (diff < 0) ? -diff : diff; - int diff = J_val + illValue; + // compute the missmatch vector + if( j >= 0) + { + if( abss > fParam1) + { + It[0] = 0; + It[1] = 0; + It[2] = 0; + It[3] = 0; + } + else if( abss > static_cast(fParam0) && diff >= 0 ) + { + It[0] = static_cast(normSigma2 * (It[0] - fParam1)); + It[1] = static_cast(normSigma2 * (It[1] - fParam1)); + It[2] = static_cast(normSigma2 * (It[2] - fParam1)); + It[3] = static_cast(normSigma2 * (It[3] - fParam1)); + } + else if( abss > static_cast(fParam0) && diff < 0 ) + { + It[0] = static_cast(normSigma2 * (It[0] + fParam1)); + It[1] = static_cast(normSigma2 * (It[1] + fParam1)); + It[2] = static_cast(normSigma2 * (It[2] + fParam1)); + It[3] = static_cast(normSigma2 * (It[3] + fParam1)); + } + } + _b0[0] += (float)(It[0]*dIptr[0]) ; + _b0[1] += (float)(It[1]*dIptr[0]) ; + _b0[2] += (float)(It[2]*dIptr[0]) ; + _b0[3] += (float)(It[3]*dIptr[0]) ; - MEstimatorScale += (diff < MEstimatorScale) ? 
-eta : eta; + _b1[0] += (float)(It[0]*dIptr[1]) ; + _b1[1] += (float)(It[1]*dIptr[1]) ; + _b1[2] += (float)(It[2]*dIptr[1]) ; + _b1[3] += (float)(It[3]*dIptr[1]) ; - int abss = (diff < 0) ? -diff : diff; + _b2[0] += (float)(It[0])*Iptr[x] ; + _b2[1] += (float)(It[1])*Iptr[x] ; + _b2[2] += (float)(It[2])*Iptr[x] ; + _b2[3] += (float)(It[3])*Iptr[x] ; + _b3[0] += (float)(It[0]); + _b3[1] += (float)(It[1]); + _b3[2] += (float)(It[2]); + _b3[3] += (float)(It[3]); - // compute the missmatch vector - if( j >= 0) - { - if( abss > fParam1) - { - It[0] = 0; - It[1] = 0; - It[2] = 0; - It[3] = 0; - } - else if( abss > static_cast(fParam0) && diff >= 0 ) - { - It[0] = static_cast(normSigma2 * (It[0] - fParam1)); - It[1] = static_cast(normSigma2 * (It[1] - fParam1)); - It[2] = static_cast(normSigma2 * (It[2] - fParam1)); - It[3] = static_cast(normSigma2 * (It[3] - fParam1)); - } - else if( abss > static_cast(fParam0) && diff < 0 ) + // compute the Gradient Matrice + if( j == 0) { - It[0] = static_cast(normSigma2 * (It[0] + fParam1)); - It[1] = static_cast(normSigma2 * (It[1] + fParam1)); - It[2] = static_cast(normSigma2 * (It[2] + fParam1)); - It[3] = static_cast(normSigma2 * (It[3] + fParam1)); - } - } - _b0[0] += (float)(It[0]*dIptr[0]) ; - _b0[1] += (float)(It[1]*dIptr[0]) ; - _b0[2] += (float)(It[2]*dIptr[0]) ; - _b0[3] += (float)(It[3]*dIptr[0]) ; + float tale = normSigma2 * FLT_RESCALE; + if( abss < fParam0 || j < 0) + { + tale = FLT_RESCALE; + } + else if( abss > fParam1) + { + tale *= 0.01f; + } + else + { + tale *= normSigma2; + } + if( j == 0 ) + { + A11 += (float)(ixval*ixval)*tale; + A12 += (float)(ixval*iyval)*tale; + A22 += (float)(iyval*iyval)*tale; + } + dI += Iptr[x] * Iptr[x] * tale; + float dx = static_cast(dIptr[0]) * tale; + float dy = static_cast(dIptr[1]) * tale; + sumIx += dx; + sumIy += dy; + w1 += dx * Iptr[x]; + w2 += dy * Iptr[x]; + sumI += Iptr[x] * tale; + sumW += tale; + } - _b1[0] += (float)(It[0]*dIptr[1]) ; - _b1[1] += (float)(It[1]*dIptr[1]) 
; - _b1[2] += (float)(It[2]*dIptr[1]) ; - _b1[3] += (float)(It[3]*dIptr[1]) ; + } + #endif + } - _b2[0] += (float)(It[0])*Iptr[x] ; - _b2[1] += (float)(It[1])*Iptr[x] ; - _b2[2] += (float)(It[2])*Iptr[x] ; - _b2[3] += (float)(It[3])*Iptr[x] ; + #ifdef RLOF_SSE + short etaValues[8]; + _mm_storeu_si128((__m128i*)(etaValues), mmEta); + MEstimatorScale += eta * (etaValues[0] + etaValues[1] + etaValues[2] + etaValues[3] + + etaValues[4] + etaValues[5] + etaValues[6] + etaValues[7]); + float CV_DECL_ALIGNED(32) wbuf[4]; + #endif + if( j == 0 ) + { + #ifdef RLOF_SSE + _mm_store_ps(wbuf, mmSumW1); + w1 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumW2); + w2 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumDI); + dI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumI); + sumI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumIx); + sumIx = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumIy); + sumIy = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumW); + sumW = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + #endif + sumIx *= -FLT_SCALE; + sumIy *= -FLT_SCALE; + sumI *=FLT_SCALE; + sumW *= FLT_SCALE; + w1 *= -FLT_SCALE; + w2 *= -FLT_SCALE; + dI *= FLT_SCALE; + + + #ifdef RLOF_SSE + float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];// + + _mm_store_ps(A11buf, mmAxx); + _mm_store_ps(A12buf, mmAxy); + _mm_store_ps(A22buf, mmAyy); + + + A11 = A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; + A12 = A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; + A22 = A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; + #endif + A11 *= FLT_SCALE; // 54866744. 
+ A12 *= FLT_SCALE; // -628764.00 + A22 *= FLT_SCALE; // 19730.000 - _b3[0] += (float)(It[0]); - _b3[1] += (float)(It[1]); - _b3[2] += (float)(It[2]); - _b3[3] += (float)(It[3]); + D = - A12*A12*sumI*sumI + dI*sumW*A12*A12 + 2*A12*sumI*sumIx*w2 + 2*A12*sumI*sumIy*w1 + - 2*dI*A12*sumIx*sumIy - 2*sumW*A12*w1*w2 + A11*A22*sumI*sumI - 2*A22*sumI*sumIx*w1 + - 2*A11*sumI*sumIy*w2 - sumIx*sumIx*w2*w2 + A22*dI*sumIx*sumIx + 2*sumIx*sumIy*w1*w2 + - sumIy*sumIy*w1*w1 + A11*dI*sumIy*sumIy + A22*sumW*w1*w1 + A11*sumW*w2*w2 - A11*A22*dI*sumW; - // compute the Gradient Matrice - if( j == 0) + float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + + 4.f*A12*A12))/(2*winArea); + if( minEig < minEigThreshold || std::abs(D) < FLT_EPSILON ) { - float tale = normSigma2 * FLT_RESCALE; - if( abss < fParam0 || j < 0) - { - tale = FLT_RESCALE; - } - else if( abss > fParam1) - { - tale *= 0.01f; - } - else - { - tale *= normSigma2; - } - if( j == 0 ) + if( level == 0 && status ) + status[ptidx] = 0; + if( level > 0) { - A11 += (float)(ixval*ixval)*tale; - A12 += (float)(ixval*iyval)*tale; - A22 += (float)(iyval*iyval)*tale; + nextPts[ptidx] = backUpNextPt; + gainVecs[ptidx] = backUpGain; } - - dI += Iptr[x] * Iptr[x] * tale; - float dx = static_cast(dIptr[0]) * tale; - float dy = static_cast(dIptr[1]) * tale; - sumIx += dx; - sumIy += dy; - w1 += dx * Iptr[x]; - w2 += dy * Iptr[x]; - sumI += Iptr[x] * tale; - sumW += tale; + noIteration++; + break; } - } -#endif - } -#ifdef RLOF_SSE - short etaValues[8]; - _mm_storeu_si128((__m128i*)(etaValues), mmEta); - MEstimatorScale += eta * (etaValues[0] + etaValues[1] + etaValues[2] + etaValues[3] - + etaValues[4] + etaValues[5] + etaValues[6] + etaValues[7]); - float CV_DECL_ALIGNED(32) wbuf[4]; -#endif - if( j == 0 ) - { -#ifdef RLOF_SSE - _mm_store_ps(wbuf, mmSumW1); - w1 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumW2); - w2 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumDI); - dI = wbuf[0] + 
wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumI); - sumI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumIx); - sumIx = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumIy); - sumIy = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; - _mm_store_ps(wbuf, mmSumW); - sumW = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; -#endif - sumIx *= -FLT_SCALE; - sumIy *= -FLT_SCALE; - sumI *=FLT_SCALE; - sumW *= FLT_SCALE; - w1 *= -FLT_SCALE; - w2 *= -FLT_SCALE; - dI *= FLT_SCALE; + D = (1.f / D); + invTensorMat(0,0) = (A22*sumI*sumI - 2*sumI*sumIy*w2 + dI*sumIy*sumIy + sumW*w2*w2 - A22*dI*sumW)* D; + invTensorMat(0,1) = (A12*dI*sumW - A12*sumI * sumI - dI*sumIx*sumIy + sumI*sumIx*w2 + sumI*sumIy*w1 - sumW*w1*w2)* D; + invTensorMat(0,2) = (A12*sumI*sumIy - sumIy*sumIy*w1 - A22*sumI*sumIx - A12*sumW*w2 + A22*sumW*w1 + sumIx*sumIy*w2)* D; + invTensorMat(0,3) = (A22*dI*sumIx - A12*dI*sumIy - sumIx*w2*w2 + A12*sumI*w2 - A22*sumI*w1 + sumIy*w1*w2) * D; + invTensorMat(1,0) = invTensorMat(0,1); + invTensorMat(1,1) = (A11*sumI * sumI - 2*sumI*sumIx*w1 + dI*sumIx * sumIx + sumW*w1*w1 - A11*dI*sumW) * D; + invTensorMat(1,2) = (A12*sumI*sumIx - A11*sumI*sumIy - sumIx * sumIx*w2 + A11*sumW*w2 - A12*sumW*w1 + sumIx*sumIy*w1) * D; + invTensorMat(1,3) = (A11*dI*sumIy - sumIy*w1*w1 - A12*dI*sumIx - A11*sumI*w2 + A12*sumI*w1 + sumIx*w1*w2)* D; + invTensorMat(2,0) = invTensorMat(0,2); + invTensorMat(2,1) = invTensorMat(1,2); + invTensorMat(2,2) = (sumW*A12*A12 - 2*A12*sumIx*sumIy + A22*sumIx*sumIx + A11*sumIy*sumIy - A11*A22*sumW)* D; + invTensorMat(2,3) = (A11*A22*sumI - A12*A12*sumI - A11*sumIy*w2 + A12*sumIx*w2 + A12*sumIy*w1 - A22*sumIx*w1)* D; + invTensorMat(3,0) = invTensorMat(0,3); + invTensorMat(3,1) = invTensorMat(1,3); + invTensorMat(3,2) = invTensorMat(2,3); + invTensorMat(3,3) = (dI*A12*A12 - 2*A12*w1*w2 + A22*w1*w1 + A11*w2*w2 - A11*A22*dI)* D; + } -#ifdef RLOF_SSE - float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];// - _mm_store_ps(A11buf, mmAxx); 
- _mm_store_ps(A12buf, mmAxy); - _mm_store_ps(A22buf, mmAyy); + #ifdef RLOF_SSE + float CV_DECL_ALIGNED(16) bbuf[4]; + for(int mmi = 0; mmi < 4; mmi++) + { + _mm_store_ps(bbuf, _mm_add_ps(qb0[mmi], qb1[mmi])); + _b0[mmi] = bbuf[0] + bbuf[2]; + _b1[mmi] = bbuf[1] + bbuf[3]; + _mm_store_ps(bbuf, qb2[mmi]); + _b2[mmi] = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; + _mm_store_ps(bbuf, qb3[mmi]); + _b3[mmi] = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; - A11 = A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; - A12 = A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; - A22 = A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; -#endif - A11 *= FLT_SCALE; // 54866744. - A12 *= FLT_SCALE; // -628764.00 - A22 *= FLT_SCALE; // 19730.000 - - D = - A12*A12*sumI*sumI + dI*sumW*A12*A12 + 2*A12*sumI*sumIx*w2 + 2*A12*sumI*sumIy*w1 - - 2*dI*A12*sumIx*sumIy - 2*sumW*A12*w1*w2 + A11*A22*sumI*sumI - 2*A22*sumI*sumIx*w1 - - 2*A11*sumI*sumIy*w2 - sumIx*sumIx*w2*w2 + A22*dI*sumIx*sumIx + 2*sumIx*sumIy*w1*w2 - - sumIy*sumIy*w1*w1 + A11*dI*sumIy*sumIy + A22*sumW*w1*w1 + A11*sumW*w2*w2 - A11*A22*dI*sumW; - - float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + - 4.f*A12*A12))/(2*winArea); - if( minEig < minEigThreshold || std::abs(D) < FLT_EPSILON ) - { - if( level == 0 && status ) - status[ptidx] = 0; - if( level > 0) - { - nextPts[ptidx] = backUpNextPt; - gainVecs[ptidx] = backUpGain; - } - noIteration++; - break; } + #endif + _b0[0] *= FLT_SCALE;_b0[1] *= FLT_SCALE;_b0[2] *= FLT_SCALE;_b0[3] *= FLT_SCALE; + _b1[0] *= FLT_SCALE;_b1[1] *= FLT_SCALE;_b1[2] *= FLT_SCALE;_b1[3] *= FLT_SCALE; + _b2[0] *= FLT_SCALE;_b2[1] *= FLT_SCALE;_b2[2] *= FLT_SCALE;_b2[3] *= FLT_SCALE; + _b3[0] *= FLT_SCALE;_b3[1] *= FLT_SCALE;_b3[2] *= FLT_SCALE;_b3[3] *= FLT_SCALE; - D = (1.f / D); - - invTensorMat(0,0) = (A22*sumI*sumI - 2*sumI*sumIy*w2 + dI*sumIy*sumIy + sumW*w2*w2 - A22*dI*sumW)* D; - invTensorMat(0,1) = (A12*dI*sumW - A12*sumI * sumI - dI*sumIx*sumIy + sumI*sumIx*w2 + sumI*sumIy*w1 - sumW*w1*w2)* D; - 
invTensorMat(0,2) = (A12*sumI*sumIy - sumIy*sumIy*w1 - A22*sumI*sumIx - A12*sumW*w2 + A22*sumW*w1 + sumIx*sumIy*w2)* D; - invTensorMat(0,3) = (A22*dI*sumIx - A12*dI*sumIy - sumIx*w2*w2 + A12*sumI*w2 - A22*sumI*w1 + sumIy*w1*w2) * D; - invTensorMat(1,0) = invTensorMat(0,1); - invTensorMat(1,1) = (A11*sumI * sumI - 2*sumI*sumIx*w1 + dI*sumIx * sumIx + sumW*w1*w1 - A11*dI*sumW) * D; - invTensorMat(1,2) = (A12*sumI*sumIx - A11*sumI*sumIy - sumIx * sumIx*w2 + A11*sumW*w2 - A12*sumW*w1 + sumIx*sumIy*w1) * D; - invTensorMat(1,3) = (A11*dI*sumIy - sumIy*w1*w1 - A12*dI*sumIx - A11*sumI*w2 + A12*sumI*w1 + sumIx*w1*w2)* D; - invTensorMat(2,0) = invTensorMat(0,2); - invTensorMat(2,1) = invTensorMat(1,2); - invTensorMat(2,2) = (sumW*A12*A12 - 2*A12*sumIx*sumIy + A22*sumIx*sumIx + A11*sumIy*sumIy - A11*A22*sumW)* D; - invTensorMat(2,3) = (A11*A22*sumI - A12*A12*sumI - A11*sumIy*w2 + A12*sumIx*w2 + A12*sumIy*w1 - A22*sumIx*w1)* D; - invTensorMat(3,0) = invTensorMat(0,3); - invTensorMat(3,1) = invTensorMat(1,3); - invTensorMat(3,2) = invTensorMat(2,3); - invTensorMat(3,3) = (dI*A12*A12 - 2*A12*w1*w2 + A22*w1*w1 + A11*w2*w2 - A11*A22*dI)* D; - } + Mc0[0] = _b0[0] - _b0[1] - _b0[2] + _b0[3]; + Mc0[1] = _b1[0] - _b1[1] - _b1[2] + _b1[3]; + Mc0[2] = -(_b2[0] - _b2[1] - _b2[2] + _b2[3]); + Mc0[3] = -(_b3[0] - _b3[1] - _b3[2] + _b3[3]); -#ifdef RLOF_SSE - float CV_DECL_ALIGNED(16) bbuf[4]; - for(int mmi = 0; mmi < 4; mmi++) - { + Mc1[0] = _b0[1] - _b0[3]; + Mc1[1] = _b1[1] - _b1[3]; + Mc1[2] = -(_b2[1] - _b2[3]); + Mc1[3] = -(_b3[1] - _b3[3]); - _mm_store_ps(bbuf, _mm_add_ps(qb0[mmi], qb1[mmi])); - _b0[mmi] = bbuf[0] + bbuf[2]; - _b1[mmi] = bbuf[1] + bbuf[3]; - _mm_store_ps(bbuf, qb2[mmi]); - _b2[mmi] = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; - _mm_store_ps(bbuf, qb3[mmi]); - _b3[mmi] = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; - } -#endif + Mc2[0] = _b0[2] - _b0[3]; + Mc2[1] = _b1[2] - _b1[3]; + Mc2[2] = -(_b2[2] - _b2[3]); + Mc2[3] = -(_b3[2] - _b3[3]); - _b0[0] *= FLT_SCALE;_b0[1] *= 
FLT_SCALE;_b0[2] *= FLT_SCALE;_b0[3] *= FLT_SCALE; - _b1[0] *= FLT_SCALE;_b1[1] *= FLT_SCALE;_b1[2] *= FLT_SCALE;_b1[3] *= FLT_SCALE; - _b2[0] *= FLT_SCALE;_b2[1] *= FLT_SCALE;_b2[2] *= FLT_SCALE;_b2[3] *= FLT_SCALE; - _b3[0] *= FLT_SCALE;_b3[1] *= FLT_SCALE;_b3[2] *= FLT_SCALE;_b3[3] *= FLT_SCALE; - - - Mc0[0] = _b0[0] - _b0[1] - _b0[2] + _b0[3]; - Mc0[1] = _b1[0] - _b1[1] - _b1[2] + _b1[3]; - Mc0[2] = -(_b2[0] - _b2[1] - _b2[2] + _b2[3]); - Mc0[3] = -(_b3[0] - _b3[1] - _b3[2] + _b3[3]); - - Mc1[0] = _b0[1] - _b0[3]; - Mc1[1] = _b1[1] - _b1[3]; - Mc1[2] = -(_b2[1] - _b2[3]); - Mc1[3] = -(_b3[1] - _b3[3]); - - - Mc2[0] = _b0[2] - _b0[3]; - Mc2[1] = _b1[2] - _b1[3]; - Mc2[2] = -(_b2[2] - _b2[3]); - Mc2[3] = -(_b3[2] - _b3[3]); - - - Mc3[0] = _b0[3]; - Mc3[1] = _b1[3]; - Mc3[2] = -_b2[3]; - Mc3[3] = -_b3[3]; - - // - c[0] = -Mc0[0]; - c[1] = -Mc1[0]; - c[2] = -Mc2[0]; - c[3] = -Mc3[0]; - c[4] = -Mc0[1]; - c[5] = -Mc1[1]; - c[6] = -Mc2[1]; - c[7] = -Mc3[1]; - - float e0 = 1.f / (c[6] * c[0] - c[4] * c[2]); - float e1 = e0 * 0.5f * (c[6] * c[1] + c[7] * c[0] - c[5] * c[2] - c[4] * c[3]); - float e2 = e0 * (c[1] * c[7] -c[3] * c[5]); - e0 = e1 * e1 - e2; - hasSolved = false; - if ( e0 > 0) - { - e0 = sqrt(e0); - float _y[2] = {-e1 - e0, e0 - e1}; - float c0yc1[2] = {c[0] * _y[0] + c[1], - c[0] * _y[1] + c[1]}; - float _x[2] = {-(c[2] * _y[0] + c[3]) / c0yc1[0], - -(c[2] * _y[1] + c[3]) / c0yc1[1]}; - bool isIn1 = (_x[0] >=0 && _x[0] <=1 && _y[0] >= 0 && _y[0] <= 1); - bool isIn2 = (_x[1] >=0 && _x[1] <=1 && _y[1] >= 0 && _y[1] <= 1); - - bool isSolution1 = checkSolution(_x[0], _y[0], c ); - bool isSolution2 = checkSolution(_x[1], _y[1], c ); - bool isSink1 = isIn1 && isSolution1; - bool isSink2 = isIn2 && isSolution2; - - if ( isSink1 != isSink2) + + Mc3[0] = _b0[3]; + Mc3[1] = _b1[3]; + Mc3[2] = -_b2[3]; + Mc3[3] = -_b3[3]; + + // + c[0] = -Mc0[0]; + c[1] = -Mc1[0]; + c[2] = -Mc2[0]; + c[3] = -Mc3[0]; + c[4] = -Mc0[1]; + c[5] = -Mc1[1]; + c[6] = -Mc2[1]; + c[7] = 
-Mc3[1]; + + float e0 = 1.f / (c[6] * c[0] - c[4] * c[2]); + float e1 = e0 * 0.5f * (c[6] * c[1] + c[7] * c[0] - c[5] * c[2] - c[4] * c[3]); + float e2 = e0 * (c[1] * c[7] -c[3] * c[5]); + e0 = e1 * e1 - e2; + hasSolved = false; + if ( e0 > 0) { - a = isSink1 ? _x[0] : _x[1]; - b = isSink1 ? _y[0] : _y[1]; - ab = a * b; - hasSolved = true; - delta.x = inextPt.x + a - nextPt.x; - delta.y = inextPt.y + b - nextPt.y; + e0 = sqrt(e0); + float _y[2] = {-e1 - e0, e0 - e1}; + float c0yc1[2] = {c[0] * _y[0] + c[1], + c[0] * _y[1] + c[1]}; + float _x[2] = {-(c[2] * _y[0] + c[3]) / c0yc1[0], + -(c[2] * _y[1] + c[3]) / c0yc1[1]}; + bool isIn1 = (_x[0] >=0 && _x[0] <=1 && _y[0] >= 0 && _y[0] <= 1); + bool isIn2 = (_x[1] >=0 && _x[1] <=1 && _y[1] >= 0 && _y[1] <= 1); - cv::Vec4f mismatchVec = ab * Mc0 + Mc1 *a + Mc2 * b + Mc3; - deltaGain = est_DeltaGain(invTensorMat, mismatchVec); + bool isSolution1 = checkSolution(_x[0], _y[0], c ); + bool isSolution2 = checkSolution(_x[1], _y[1], c ); + bool isSink1 = isIn1 && isSolution1; + bool isSink2 = isIn2 && isSolution2; - } // isIn1 != isIn2 - } + if ( isSink1 != isSink2) + { + a = isSink1 ? _x[0] : _x[1]; + b = isSink1 ? 
_y[0] : _y[1]; + ab = a * b; + hasSolved = true; + delta.x = inextPt.x + a - nextPt.x; + delta.y = inextPt.y + b - nextPt.y; + + cv::Vec4f mismatchVec = ab * Mc0 + Mc1 *a + Mc2 * b + Mc3; + deltaGain = est_DeltaGain(invTensorMat, mismatchVec); + + } // isIn1 != isIn2 + } if( hasSolved == false) noIteration++; } @@ -1469,14 +1460,14 @@ class TrackerInvoker : public cv::ParallelLoopBody prevGain = deltaGain; gainVec += deltaGain; nextPt += delta ; - nextPts[ptidx] = nextPt + halfWin; + nextPts[ptidx] = nextPt - halfWin; gainVecs[ptidx]= gainVec; } else { nextPt += delta; - nextPts[ptidx] = nextPt + halfWin; + nextPts[ptidx] = nextPt - halfWin; gainVecs[ptidx]= gainVec + deltaGain; noSolvedIteration++; break; @@ -1486,7 +1477,7 @@ class TrackerInvoker : public cv::ParallelLoopBody (std::abs(delta.x - prevDelta.x) < 0.01 && std::abs(delta.y - prevDelta.y) < 0.01) || ((delta.ddot(delta) <= 0.001) && std::abs(prevGain.x - deltaGain.x) < 0.01))) { - nextPts[ptidx] -= delta*0.5f; + nextPts[ptidx] += delta*0.5f; gainVecs[ptidx] -= deltaGain* 0.5f; break; } @@ -1620,12 +1611,13 @@ class TrackerInvoker : public cv::ParallelLoopBody minWinSize,maxWinSize) == false) continue; - prevPt -= halfWin; + halfWin = Point2f(static_cast(maxWinSize), static_cast(maxWinSize)) - halfWin; + prevPt += halfWin; iprevPt.x = cvFloor(prevPt.x); iprevPt.y = cvFloor(prevPt.y); - if( iprevPt.x + halfWin.x < 0 || iprevPt.x + halfWin.x>= derivI.cols || - iprevPt.y + halfWin.y < 0 || iprevPt.y + halfWin.y >= derivI.rows ) + if( iprevPt.x < 0 || iprevPt.x >= derivI.cols - winSize.width || + iprevPt.y < 0 || iprevPt.y >= derivI.rows - winSize.height - 1) { if( level == 0 ) { @@ -1783,7 +1775,7 @@ class TrackerInvoker : public cv::ParallelLoopBody D = 1.f/D; cv::Point2f backUpNextPt = nextPt; - nextPt -= halfWin; + nextPt += halfWin; Point2f prevDelta(0,0); float c[8]; @@ -1802,261 +1794,259 @@ class TrackerInvoker : public cv::ParallelLoopBody if( (inextPt.x != cvFloor(nextPt.x) || inextPt.y != 
cvFloor(nextPt.y) || j == 0)) { - inextPt.x = cvFloor(nextPt.x); - inextPt.y = cvFloor(nextPt.y); - - - if( inextPt.x < -winSize.width || inextPt.x >= J.cols || - inextPt.y < -winSize.height || inextPt.y >= J.rows ) - { - if( level == 0 && status ) - status[ptidx] = 3; - if (level > 0) - nextPts[ptidx] = backUpNextPt; - break; - } - + inextPt.x = cvFloor(nextPt.x); + inextPt.y = cvFloor(nextPt.y); + if( inextPt.x < 0 || inextPt.x >= J.cols - winSize.width || + inextPt.y < 0 || inextPt.y >= J.rows - winSize.height - 1) + { + if( level == 0 && status ) + status[ptidx] = 3; + if (level > 0) + nextPts[ptidx] = backUpNextPt; + break; + } - iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); - iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); - iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); - iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - float _b1[4] = {0,0,0,0}; - float _b2[4] = {0,0,0,0}; -#ifdef RLOF_SSE - __m128 qbc0[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; - __m128 qbc1[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; - qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); - qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); -#endif - for( y = 0; y < winSize.height; y++ ) - { - const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); - const uchar* Jptr1 = J.ptr(y + inextPt.y + 1, inextPt.x*cn); - const short* Iptr = IWinBuf.ptr(y, 0); - const short* dIptr = derivIWinBuf.ptr(y, 0); - x = 0; -#ifdef RLOF_SSE + iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - const tMaskType* maskPtr = winMaskMat.ptr(y, 0); - for( ; x <= winSize.width*cn; x += 8, dIptr += 8*2 ) + float _b1[4] = {0,0,0,0}; + float _b2[4] = {0,0,0,0}; + #ifdef RLOF_SSE + __m128 qbc0[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; + __m128 qbc1[4] = 
{_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; + qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + #endif + for( y = 0; y < winSize.height; y++ ) { - if( maskPtr[x ] == 0 && maskPtr[x+1] == 0 && maskPtr[x+2] == 0 && maskPtr[x+3] == 0 - && maskPtr[x+4] == 0 && maskPtr[x+5] == 0 && maskPtr[x+6] == 0 && maskPtr[x+7] == 0) - continue; - __m128i diff0 = _mm_loadu_si128((const __m128i*)(Iptr + x)); - __m128i v00 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x)), z); - __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); - __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x)), z); - __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x + cn)), z); - - __m128i mmDiffc_epi16[4] = - { _mm_subs_epi16(_mm_slli_epi16(v00, 5), diff0), - _mm_subs_epi16(_mm_slli_epi16(v01, 5), diff0), - _mm_subs_epi16(_mm_slli_epi16(v10, 5), diff0), - _mm_subs_epi16(_mm_slli_epi16(v11, 5), diff0) - }; - - __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... 
- __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); - - if( x > winSize.width*cn - 8) + const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const uchar* Jptr1 = J.ptr(y + inextPt.y + 1, inextPt.x*cn); + const short* Iptr = IWinBuf.ptr(y, 0); + const short* dIptr = derivIWinBuf.ptr(y, 0); + x = 0; + #ifdef RLOF_SSE + + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); + for( ; x <= winSize.width*cn; x += 8, dIptr += 8*2 ) { - Ixy_0 = _mm_and_si128(Ixy_0, mmMask0); - Ixy_1 = _mm_and_si128(Ixy_1, mmMask1); - } + if( maskPtr[x ] == 0 && maskPtr[x+1] == 0 && maskPtr[x+2] == 0 && maskPtr[x+3] == 0 + && maskPtr[x+4] == 0 && maskPtr[x+5] == 0 && maskPtr[x+6] == 0 && maskPtr[x+7] == 0) + continue; + __m128i diff0 = _mm_loadu_si128((const __m128i*)(Iptr + x)); + __m128i v00 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x)), z); + __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); + __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x)), z); + __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x + cn)), z); + + __m128i mmDiffc_epi16[4] = + { _mm_subs_epi16(_mm_slli_epi16(v00, 5), diff0), + _mm_subs_epi16(_mm_slli_epi16(v01, 5), diff0), + _mm_subs_epi16(_mm_slli_epi16(v10, 5), diff0), + _mm_subs_epi16(_mm_slli_epi16(v11, 5), diff0) + }; + + __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... 
+ __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); + + if( x > winSize.width*cn - 8) + { + Ixy_0 = _mm_and_si128(Ixy_0, mmMask0); + Ixy_1 = _mm_and_si128(Ixy_1, mmMask1); + } - __m128i diffc1[4] = - {_mm_unpackhi_epi16(mmDiffc_epi16[0],mmDiffc_epi16[0]), - _mm_unpackhi_epi16(mmDiffc_epi16[1],mmDiffc_epi16[1]), - _mm_unpackhi_epi16(mmDiffc_epi16[2],mmDiffc_epi16[2]), - _mm_unpackhi_epi16(mmDiffc_epi16[3],mmDiffc_epi16[3]) - }; - - __m128i diffc0[4] = - {_mm_unpacklo_epi16(mmDiffc_epi16[0],mmDiffc_epi16[0]), - _mm_unpacklo_epi16(mmDiffc_epi16[1],mmDiffc_epi16[1]), - _mm_unpacklo_epi16(mmDiffc_epi16[2],mmDiffc_epi16[2]), - _mm_unpacklo_epi16(mmDiffc_epi16[3],mmDiffc_epi16[3]) - }; - - - // It * Ix It * Iy [0 ... 3] - //mask split for multiplication - // for set 1 lo sigma 1 - - - v10 = _mm_mullo_epi16(Ixy_0, diffc0[0]); - v11 = _mm_mulhi_epi16(Ixy_0, diffc0[0]); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qbc0[0] = _mm_add_ps(qbc0[0], _mm_cvtepi32_ps(v00)); - qbc1[0] = _mm_add_ps(qbc1[0], _mm_cvtepi32_ps(v10)); - - v10 = _mm_mullo_epi16(Ixy_0, diffc0[1]); - v11 = _mm_mulhi_epi16(Ixy_0, diffc0[1]); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qbc0[1] = _mm_add_ps(qbc0[1], _mm_cvtepi32_ps(v00)); - qbc1[1] = _mm_add_ps(qbc1[1], _mm_cvtepi32_ps(v10)); - - v10 = _mm_mullo_epi16(Ixy_0, diffc0[2]); - v11 = _mm_mulhi_epi16(Ixy_0, diffc0[2]); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qbc0[2] = _mm_add_ps(qbc0[2], _mm_cvtepi32_ps(v00)); - qbc1[2] = _mm_add_ps(qbc1[2], _mm_cvtepi32_ps(v10)); - - v10 = _mm_mullo_epi16(Ixy_0, diffc0[3]); - v11 = _mm_mulhi_epi16(Ixy_0, diffc0[3]); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qbc0[3] = _mm_add_ps(qbc0[3], _mm_cvtepi32_ps(v00)); - qbc1[3] = _mm_add_ps(qbc1[3], _mm_cvtepi32_ps(v10)); - // It * Ix It * Iy [4 ... 
7] - // for set 1 hi sigma 1 - - v10 = _mm_mullo_epi16(Ixy_1, diffc1[0]); - v11 = _mm_mulhi_epi16(Ixy_1, diffc1[0]); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qbc0[0] = _mm_add_ps(qbc0[0], _mm_cvtepi32_ps(v00)); - qbc1[0] = _mm_add_ps(qbc1[0], _mm_cvtepi32_ps(v10)); - - v10 = _mm_mullo_epi16(Ixy_1, diffc1[1]); - v11 = _mm_mulhi_epi16(Ixy_1, diffc1[1]); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qbc0[1] = _mm_add_ps(qbc0[1], _mm_cvtepi32_ps(v00)); - qbc1[1] = _mm_add_ps(qbc1[1], _mm_cvtepi32_ps(v10)); - - v10 = _mm_mullo_epi16(Ixy_1, diffc1[2]); - v11 = _mm_mulhi_epi16(Ixy_1, diffc1[2]); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qbc0[2] = _mm_add_ps(qbc0[2], _mm_cvtepi32_ps(v00)); - qbc1[2] = _mm_add_ps(qbc1[2], _mm_cvtepi32_ps(v10)); - - v10 = _mm_mullo_epi16(Ixy_1, diffc1[3]); - v11 = _mm_mulhi_epi16(Ixy_1, diffc1[3]); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qbc0[3] = _mm_add_ps(qbc0[3], _mm_cvtepi32_ps(v00)); - qbc1[3] = _mm_add_ps(qbc1[3], _mm_cvtepi32_ps(v10)); - - } -#else - for( ; x < winSize.width*cn; x++, dIptr += 2 ) - { - if( dIptr[0] == 0 && dIptr[1] == 0) - continue; - short It[4] = {(Jptr[x] << 5) - Iptr[x], - (Jptr[x+cn]<< 5) - Iptr[x], - (Jptr1[x]<< 5) - Iptr[x], - (Jptr1[x+cn]<< 5) - Iptr[x]}; - - _b1[0] += (float)(It[0]*dIptr[0]); - _b1[1] += (float)(It[1]*dIptr[0]); - _b1[2] += (float)(It[2]*dIptr[0]); - _b1[3] += (float)(It[3]*dIptr[0]); - - _b2[0] += (float)(It[0]*dIptr[1]); - _b2[1] += (float)(It[1]*dIptr[1]); - _b2[2] += (float)(It[2]*dIptr[1]); - _b2[3] += (float)(It[3]*dIptr[1]); - } -#endif + __m128i diffc1[4] = + {_mm_unpackhi_epi16(mmDiffc_epi16[0],mmDiffc_epi16[0]), + _mm_unpackhi_epi16(mmDiffc_epi16[1],mmDiffc_epi16[1]), + _mm_unpackhi_epi16(mmDiffc_epi16[2],mmDiffc_epi16[2]), + _mm_unpackhi_epi16(mmDiffc_epi16[3],mmDiffc_epi16[3]) + }; - } + __m128i diffc0[4] = + 
{_mm_unpacklo_epi16(mmDiffc_epi16[0],mmDiffc_epi16[0]), + _mm_unpacklo_epi16(mmDiffc_epi16[1],mmDiffc_epi16[1]), + _mm_unpacklo_epi16(mmDiffc_epi16[2],mmDiffc_epi16[2]), + _mm_unpacklo_epi16(mmDiffc_epi16[3],mmDiffc_epi16[3]) + }; -#ifdef RLOF_SSE - float CV_DECL_ALIGNED(16) bbuf[4]; - _mm_store_ps(bbuf, _mm_add_ps(qbc0[0], qbc1[0])); - _b1[0] += bbuf[0] + bbuf[2]; - _b2[0] += bbuf[1] + bbuf[3]; - _mm_store_ps(bbuf, _mm_add_ps(qbc0[1], qbc1[1])); - _b1[1] += bbuf[0] + bbuf[2]; - _b2[1] += bbuf[1] + bbuf[3]; + // It * Ix It * Iy [0 ... 3] + //mask split for multiplication + // for set 1 lo sigma 1 - _mm_store_ps(bbuf, _mm_add_ps(qbc0[2], qbc1[2])); - _b1[2] += bbuf[0] + bbuf[2]; - _b2[2] += bbuf[1] + bbuf[3]; - _mm_store_ps(bbuf, _mm_add_ps(qbc0[3], qbc1[3])); - _b1[3] += bbuf[0] + bbuf[2]; - _b2[3] += bbuf[1] + bbuf[3]; + v10 = _mm_mullo_epi16(Ixy_0, diffc0[0]); + v11 = _mm_mulhi_epi16(Ixy_0, diffc0[0]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[0] = _mm_add_ps(qbc0[0], _mm_cvtepi32_ps(v00)); + qbc1[0] = _mm_add_ps(qbc1[0], _mm_cvtepi32_ps(v10)); -#endif - _b1[0] *= FLT_SCALE; - _b1[1] *= FLT_SCALE; - _b1[2] *= FLT_SCALE; - _b1[3] *= FLT_SCALE; - _b2[0] *= FLT_SCALE; - _b2[1] *= FLT_SCALE; - _b2[2] *= FLT_SCALE; - _b2[3] *= FLT_SCALE; - - float c0[4] = { _b1[3] + _b1[0] - _b1[2] - _b1[1], - _b1[1] - _b1[0], - _b1[2] - _b1[0], - _b1[0]}; - float c1[4] = { _b2[3] + _b2[0] - _b2[2] - _b2[1], - _b2[1] - _b2[0], - _b2[2] - _b2[0], - _b2[0]}; - - float DA12 = A12 * D ; - float DA22 = A22 * D ; - float DA11 = A11 * D ; - c[0] = DA12 * c1[0] - DA22 * c0[0]; - c[1] = DA12 * c1[1] - DA22 * c0[1]; - c[2] = DA12 * c1[2] - DA22 * c0[2]; - c[3] = DA12 * c1[3] - DA22 * c0[3]; - c[4] = DA12 * c0[0] - DA11 * c1[0]; - c[5] = DA12 * c0[1] - DA11 * c1[1]; - c[6] = DA12 * c0[2] - DA11 * c1[2]; - c[7] = DA12 * c0[3] - DA11 * c1[3]; - - float e0 = 1.f / (c[6] * c[0] - c[4] * c[2]); - float e1 = e0 * 0.5f * (c[6] * c[1] + c[7] * c[0] - c[5] * c[2] 
- c[4] * c[3]); - float e2 = e0 * (c[1] * c[7] -c[3] * c[5]); - e0 = e1 * e1 - e2; - if ( e0 >= 0) - { - e0 = sqrt(e0); - float _y[2] = {-e1 - e0, e0 - e1}; - float c0yc1[2] = {c[0] * _y[0] + c[1], - c[0] * _y[1] + c[1]}; + v10 = _mm_mullo_epi16(Ixy_0, diffc0[1]); + v11 = _mm_mulhi_epi16(Ixy_0, diffc0[1]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[1] = _mm_add_ps(qbc0[1], _mm_cvtepi32_ps(v00)); + qbc1[1] = _mm_add_ps(qbc1[1], _mm_cvtepi32_ps(v10)); + + v10 = _mm_mullo_epi16(Ixy_0, diffc0[2]); + v11 = _mm_mulhi_epi16(Ixy_0, diffc0[2]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[2] = _mm_add_ps(qbc0[2], _mm_cvtepi32_ps(v00)); + qbc1[2] = _mm_add_ps(qbc1[2], _mm_cvtepi32_ps(v10)); - float _x[2] = {-(c[2] * _y[0] + c[3]) / c0yc1[0], - -(c[2] * _y[1] + c[3]) / c0yc1[1]}; + v10 = _mm_mullo_epi16(Ixy_0, diffc0[3]); + v11 = _mm_mulhi_epi16(Ixy_0, diffc0[3]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[3] = _mm_add_ps(qbc0[3], _mm_cvtepi32_ps(v00)); + qbc1[3] = _mm_add_ps(qbc1[3], _mm_cvtepi32_ps(v10)); + // It * Ix It * Iy [4 ... 
7] + // for set 1 hi sigma 1 - bool isIn1 = (_x[0] >=0 && _x[0] <=1 && _y[0] >= 0 && _y[0] <= 1); - bool isIn2 = (_x[1] >=0 && _x[1] <=1 && _y[1] >= 0 && _y[1] <= 1); + v10 = _mm_mullo_epi16(Ixy_1, diffc1[0]); + v11 = _mm_mulhi_epi16(Ixy_1, diffc1[0]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[0] = _mm_add_ps(qbc0[0], _mm_cvtepi32_ps(v00)); + qbc1[0] = _mm_add_ps(qbc1[0], _mm_cvtepi32_ps(v10)); + v10 = _mm_mullo_epi16(Ixy_1, diffc1[1]); + v11 = _mm_mulhi_epi16(Ixy_1, diffc1[1]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[1] = _mm_add_ps(qbc0[1], _mm_cvtepi32_ps(v00)); + qbc1[1] = _mm_add_ps(qbc1[1], _mm_cvtepi32_ps(v10)); - bool isSink1 = isIn1 && checkSolution(_x[0], _y[0], c ); - bool isSink2 = isIn2 && checkSolution(_x[1], _y[1], c ); + v10 = _mm_mullo_epi16(Ixy_1, diffc1[2]); + v11 = _mm_mulhi_epi16(Ixy_1, diffc1[2]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[2] = _mm_add_ps(qbc0[2], _mm_cvtepi32_ps(v00)); + qbc1[2] = _mm_add_ps(qbc1[2], _mm_cvtepi32_ps(v10)); + v10 = _mm_mullo_epi16(Ixy_1, diffc1[3]); + v11 = _mm_mulhi_epi16(Ixy_1, diffc1[3]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[3] = _mm_add_ps(qbc0[3], _mm_cvtepi32_ps(v00)); + qbc1[3] = _mm_add_ps(qbc1[3], _mm_cvtepi32_ps(v10)); - //if( isSink1 != isSink2 ) - { - if( isSink1 ) + } + #else + for( ; x < winSize.width*cn; x++, dIptr += 2 ) { - hasSolved = true; - delta.x = inextPt.x + _x[0] - nextPt.x; - delta.y = inextPt.y + _y[0] - nextPt.y; + if( dIptr[0] == 0 && dIptr[1] == 0) + continue; + short It[4] = {(Jptr[x] << 5) - Iptr[x], + (Jptr[x+cn]<< 5) - Iptr[x], + (Jptr1[x]<< 5) - Iptr[x], + (Jptr1[x+cn]<< 5) - Iptr[x]}; + + _b1[0] += (float)(It[0]*dIptr[0]); + _b1[1] += (float)(It[1]*dIptr[0]); + _b1[2] += (float)(It[2]*dIptr[0]); + _b1[3] += (float)(It[3]*dIptr[0]); + + _b2[0] += (float)(It[0]*dIptr[1]); + _b2[1] += (float)(It[1]*dIptr[1]); + 
_b2[2] += (float)(It[2]*dIptr[1]); + _b2[3] += (float)(It[3]*dIptr[1]); } - if (isSink2 ) + #endif + + } + + #ifdef RLOF_SSE + float CV_DECL_ALIGNED(16) bbuf[4]; + _mm_store_ps(bbuf, _mm_add_ps(qbc0[0], qbc1[0])); + _b1[0] += bbuf[0] + bbuf[2]; + _b2[0] += bbuf[1] + bbuf[3]; + + _mm_store_ps(bbuf, _mm_add_ps(qbc0[1], qbc1[1])); + _b1[1] += bbuf[0] + bbuf[2]; + _b2[1] += bbuf[1] + bbuf[3]; + + _mm_store_ps(bbuf, _mm_add_ps(qbc0[2], qbc1[2])); + _b1[2] += bbuf[0] + bbuf[2]; + _b2[2] += bbuf[1] + bbuf[3]; + + _mm_store_ps(bbuf, _mm_add_ps(qbc0[3], qbc1[3])); + _b1[3] += bbuf[0] + bbuf[2]; + _b2[3] += bbuf[1] + bbuf[3]; + + #endif + _b1[0] *= FLT_SCALE; + _b1[1] *= FLT_SCALE; + _b1[2] *= FLT_SCALE; + _b1[3] *= FLT_SCALE; + _b2[0] *= FLT_SCALE; + _b2[1] *= FLT_SCALE; + _b2[2] *= FLT_SCALE; + _b2[3] *= FLT_SCALE; + + float c0[4] = { _b1[3] + _b1[0] - _b1[2] - _b1[1], + _b1[1] - _b1[0], + _b1[2] - _b1[0], + _b1[0]}; + float c1[4] = { _b2[3] + _b2[0] - _b2[2] - _b2[1], + _b2[1] - _b2[0], + _b2[2] - _b2[0], + _b2[0]}; + + float DA12 = A12 * D ; + float DA22 = A22 * D ; + float DA11 = A11 * D ; + c[0] = DA12 * c1[0] - DA22 * c0[0]; + c[1] = DA12 * c1[1] - DA22 * c0[1]; + c[2] = DA12 * c1[2] - DA22 * c0[2]; + c[3] = DA12 * c1[3] - DA22 * c0[3]; + c[4] = DA12 * c0[0] - DA11 * c1[0]; + c[5] = DA12 * c0[1] - DA11 * c1[1]; + c[6] = DA12 * c0[2] - DA11 * c1[2]; + c[7] = DA12 * c0[3] - DA11 * c1[3]; + + float e0 = 1.f / (c[6] * c[0] - c[4] * c[2]); + float e1 = e0 * 0.5f * (c[6] * c[1] + c[7] * c[0] - c[5] * c[2] - c[4] * c[3]); + float e2 = e0 * (c[1] * c[7] -c[3] * c[5]); + e0 = e1 * e1 - e2; + if ( e0 >= 0) + { + e0 = sqrt(e0); + float _y[2] = {-e1 - e0, e0 - e1}; + float c0yc1[2] = {c[0] * _y[0] + c[1], + c[0] * _y[1] + c[1]}; + + float _x[2] = {-(c[2] * _y[0] + c[3]) / c0yc1[0], + -(c[2] * _y[1] + c[3]) / c0yc1[1]}; + + bool isIn1 = (_x[0] >=0 && _x[0] <=1 && _y[0] >= 0 && _y[0] <= 1); + bool isIn2 = (_x[1] >=0 && _x[1] <=1 && _y[1] >= 0 && _y[1] <= 1); + + + bool isSink1 = 
isIn1 && checkSolution(_x[0], _y[0], c ); + bool isSink2 = isIn2 && checkSolution(_x[1], _y[1], c ); + + + //if( isSink1 != isSink2 ) { - hasSolved = true; - delta.x = inextPt.x + _x[1] - nextPt.x; - delta.y = inextPt.y + _y[1] - nextPt.y; - } - } // isIn1 != isIn2 - } // e0 >= 0 + if( isSink1 ) + { + hasSolved = true; + delta.x = inextPt.x + _x[0] - nextPt.x; + delta.y = inextPt.y + _y[0] - nextPt.y; + } + if (isSink2 ) + { + hasSolved = true; + delta.x = inextPt.x + _x[1] - nextPt.x; + delta.y = inextPt.y + _y[1] - nextPt.y; + } + } // isIn1 != isIn2 + } // e0 >= 0 } else hasSolved = false; @@ -2065,12 +2055,12 @@ class TrackerInvoker : public cv::ParallelLoopBody delta = Point2f( c[0] * ab + c[1] * a + c[2] * b + c[3], c[4] * ab + c[5] * a + c[6] * b + c[7]); nextPt += 0.7 * delta; - nextPts[ptidx] = nextPt + halfWin; + nextPts[ptidx] = nextPt - halfWin; } else { nextPt += delta; - nextPts[ptidx] = nextPt + halfWin; + nextPts[ptidx] = nextPt - halfWin; break; } diff --git a/modules/optflow/src/rlof/plk_invoker.hpp b/modules/optflow/src/rlof/plk_invoker.hpp index 3f76ce460a4..3c0bad6f963 100644 --- a/modules/optflow/src/rlof/plk_invoker.hpp +++ b/modules/optflow/src/rlof/plk_invoker.hpp @@ -61,9 +61,6 @@ class TrackerInvoker : public cv::ParallelLoopBody void operator()(const cv::Range& range) const CV_OVERRIDE { -#ifdef DEBUG_INVOKER - printf("plk::radial");fflush(stdout); -#endif cv::Size winSize; cv::Point2f halfWin; @@ -119,14 +116,14 @@ class TrackerInvoker : public cv::ParallelLoopBody winMaskMat,winSize,halfWin,winArea, minWinSize,maxWinSize) == false) continue; - - prevPt -= halfWin; + halfWin = Point2f(static_cast(maxWinSize) ,static_cast(maxWinSize) ) - halfWin; + prevPt += halfWin; iprevPt.x = cvFloor(prevPt.x); iprevPt.y = cvFloor(prevPt.y); - if( iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols || - iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows ) + if( iprevPt.x < 0 || iprevPt.x >= derivI.cols - winSize.width || + iprevPt.y < 0 || 
iprevPt.y >= derivI.rows - winSize.height - 1) { if( level == 0 ) { @@ -243,7 +240,7 @@ class TrackerInvoker : public cv::ParallelLoopBody } cv::Point2f backUpNextPt = nextPt; - nextPt -= halfWin; + nextPt += halfWin; Point2f prevDelta(0,0); //relates to h(t-1) Point2f prevGain(1,0); cv::Point2f gainVec = gainVecs[ptidx]; @@ -255,9 +252,8 @@ class TrackerInvoker : public cv::ParallelLoopBody inextPt.x = cvFloor(nextPt.x); inextPt.y = cvFloor(nextPt.y); - - if( inextPt.x + halfWin.x < 0 || inextPt.x + halfWin.x>= derivI.cols || - inextPt.y + halfWin.y < 0 || inextPt.y + halfWin.y >= derivI.rows ) + if( inextPt.x < 0 || inextPt.x >= J.cols - winSize.width || + inextPt.y < 0 || inextPt.y >= J.rows - winSize.height - 1) { if( level == 0 && status ) status[ptidx] = 3; @@ -606,7 +602,7 @@ class TrackerInvoker : public cv::ParallelLoopBody if( j == 0) prevGain = deltaGain; nextPt += delta; - nextPts[ptidx] = nextPt + halfWin; + nextPts[ptidx] = nextPt - halfWin; gainVecs[ptidx]= gainVec; if( delta.ddot(delta) <= criteria.epsilon) break; @@ -709,9 +705,6 @@ class TrackerInvoker : public cv::ParallelLoopBody void operator()(const cv::Range& range) const CV_OVERRIDE { -#ifdef DEBUG_INVOKER - printf("plk::ica");fflush(stdout); -#endif cv::Size winSize; cv::Point2f halfWin; @@ -762,15 +755,13 @@ class TrackerInvoker : public cv::ParallelLoopBody winMaskMat,winSize,halfWin,winArea, minWinSize,maxWinSize) == false) continue; - - - prevPt -= halfWin; + halfWin = Point2f(static_cast(maxWinSize), static_cast(maxWinSize)) - halfWin; + prevPt += halfWin; iprevPt.x = cvFloor(prevPt.x); iprevPt.y = cvFloor(prevPt.y); - - if( iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols || - iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows ) + if( iprevPt.x < 0 || iprevPt.x >= derivI.cols - winSize.width || + iprevPt.y < 0 || iprevPt.y >= derivI.rows - winSize.height - 1) { if( level == 0 ) { @@ -927,7 +918,7 @@ class TrackerInvoker : public cv::ParallelLoopBody D = 1.f/D; - nextPt -= 
halfWin; + nextPt += halfWin; Point2f prevDelta(0,0); //relates to h(t-1) #ifdef RLOF_SSE __m128i mmMask0, mmMask1, mmMask; @@ -939,9 +930,8 @@ class TrackerInvoker : public cv::ParallelLoopBody inextPt.x = cvFloor(nextPt.x); inextPt.y = cvFloor(nextPt.y); - - if( inextPt.x + halfWin.x < 0 || inextPt.x + halfWin.x>= derivI.cols || - inextPt.y + halfWin.y < 0 || inextPt.y + halfWin.y >= derivI.rows ) + if( inextPt.x < 0 || inextPt.x >= J.cols - winSize.width || + inextPt.y < 0 || inextPt.y >= J.rows - winSize.height - 1) { if( level == 0 && status ) status[ptidx] = 3; @@ -1054,7 +1044,7 @@ class TrackerInvoker : public cv::ParallelLoopBody nextPt += delta; - nextPts[ptidx] = nextPt + halfWin; + nextPts[ptidx] = nextPt - halfWin; if( delta.ddot(delta) <= criteria.epsilon) break; diff --git a/modules/optflow/src/rlof/rlof_invoker.hpp b/modules/optflow/src/rlof/rlof_invoker.hpp index 38648871273..8dd0ecccd75 100644 --- a/modules/optflow/src/rlof/rlof_invoker.hpp +++ b/modules/optflow/src/rlof/rlof_invoker.hpp @@ -59,9 +59,6 @@ class TrackerInvoker : public cv::ParallelLoopBody void operator()(const cv::Range& range) const CV_OVERRIDE { -#ifdef DEBUG_INVOKER - printf("rlof::ica");fflush(stdout); -#endif cv::Size winSize; cv::Point2f halfWin; @@ -110,14 +107,12 @@ class TrackerInvoker : public cv::ParallelLoopBody winMaskMat, winSize, halfWin, winArea, minWinSize, maxWinSize) == false) continue; - - - prevPt -= halfWin; + halfWin = Point2f(static_cast(maxWinSize) ,static_cast(maxWinSize) ) - halfWin; + prevPt += halfWin; iprevPt.x = cvFloor(prevPt.x); iprevPt.y = cvFloor(prevPt.y); - - if (iprevPt.x + halfWin.x < 0 || iprevPt.x + halfWin.x >= derivI.cols || - iprevPt.y + halfWin.y < 0 || iprevPt.y + halfWin.y >= derivI.rows) + if( iprevPt.x < 0 || iprevPt.x >= derivI.cols - winSize.width || + iprevPt.y < 0 || iprevPt.y >= derivI.rows - winSize.height - 1) { if (level == 0) { @@ -234,7 +229,7 @@ class TrackerInvoker : public cv::ParallelLoopBody cv::Mat residualMat = 
cv::Mat::zeros(winSize.height * (winSize.width + 8) * cn, 1, CV_16SC1); cv::Point2f backUpNextPt = nextPt; - nextPt -= halfWin; + nextPt += halfWin; int j; #ifdef RLOF_SSE __m128i mmMask0, mmMask1, mmMask; @@ -249,8 +244,8 @@ class TrackerInvoker : public cv::ParallelLoopBody inextPt.x = cvFloor(nextPt.x); inextPt.y = cvFloor(nextPt.y); - if (inextPt.x < -winSize.width || inextPt.x >= J.cols || - inextPt.y < -winSize.height || inextPt.y >= J.rows) + if( inextPt.x < 0 || inextPt.x >= J.cols - winSize.width || + inextPt.y < 0 || inextPt.y >= J.rows - winSize.height - 1) { if (level == 0 && status) status[ptidx] = 3; @@ -584,7 +579,7 @@ class TrackerInvoker : public cv::ParallelLoopBody delta.y = (delta.y != delta.y) ? 0 : delta.y; nextPt += delta * 0.7; - nextPts[ptidx] = nextPt + halfWin; + nextPts[ptidx] = nextPt - halfWin; if (j > 0 && std::abs(delta.x - prevDelta.x) < 0.01 && std::abs(delta.y - prevDelta.y) < 0.01) @@ -739,13 +734,12 @@ class TrackerInvoker : public cv::ParallelLoopBody winMaskMat, winSize, halfWin, winArea, minWinSize, maxWinSize) == false) continue; - - prevPt -= halfWin; + halfWin = Point2f(static_cast(maxWinSize) ,static_cast(maxWinSize) ) - halfWin; + prevPt += halfWin; iprevPt.x = cvFloor(prevPt.x); iprevPt.y = cvFloor(prevPt.y); - - if (iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols || - iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows) + if( iprevPt.x < 0 || iprevPt.x >= derivI.cols - winSize.width || + iprevPt.y < 0 || iprevPt.y >= derivI.rows - winSize.height - 1) { if (level == 0) { @@ -870,7 +864,7 @@ class TrackerInvoker : public cv::ParallelLoopBody cv::Mat residualMat = cv::Mat::zeros(winSize.height * (winSize.width + 8) * cn, 1, CV_16SC1); cv::Point2f backUpNextPt = nextPt; - nextPt -= halfWin; + nextPt += halfWin; Point2f prevDelta(0, 0); //related to h(t-1) Point2f prevGain(0, 0); cv::Point2f gainVec = gainVecs[ptidx]; @@ -892,8 +886,8 @@ class TrackerInvoker : public cv::ParallelLoopBody inextPt.x = 
cvFloor(nextPt.x); inextPt.y = cvFloor(nextPt.y); - if (inextPt.x < -winSize.width || inextPt.x >= J.cols || - inextPt.y < -winSize.height || inextPt.y >= J.rows) + if( inextPt.x < 0 || inextPt.x >= J.cols - winSize.width || + inextPt.y < 0 || inextPt.y >= J.rows - winSize.height - 1) { if (level == 0 && status) status[ptidx] = 3; @@ -1382,7 +1376,7 @@ class TrackerInvoker : public cv::ParallelLoopBody prevGain = deltaGain; gainVec += deltaGain * 0.8; nextPt += delta * 0.8; - nextPts[ptidx] = nextPt + halfWin; + nextPts[ptidx] = nextPt - halfWin; gainVecs[ptidx] = gainVec; if ( diff --git a/modules/optflow/src/rlof/rlof_invokerbase.hpp b/modules/optflow/src/rlof/rlof_invokerbase.hpp index 470f4d87ad7..a7d09ceaff3 100644 --- a/modules/optflow/src/rlof/rlof_invokerbase.hpp +++ b/modules/optflow/src/rlof/rlof_invokerbase.hpp @@ -58,20 +58,6 @@ typedef uchar tMaskType; #define tCVMaskType CV_8UC1 #define MaskSet 0xffffffff -static inline bool notSameColor(const cv::Point3_ & ref, int r, int c, const cv::Mat & img, int winSize, int threshold) -{ - if (r >= img.rows + winSize || c >= img.cols + winSize || r < -winSize || c < -winSize) return true; - int step = static_cast(img.step1()); - const cv::Point3_ * tval = img.ptr>(); - tval += c; - tval += r * step / 3; - int R = std::abs((int)ref.x - (int)tval->x); - int G = std::abs((int)ref.y - (int)tval->y); - int B = std::abs((int)ref.z - (int)tval->z); - int diff = MAX(R, MAX(G, B)); - return diff > threshold; -} - static void getLocalPatch( const cv::Mat & src, diff --git a/modules/optflow/src/rlof/rlof_localflow.cpp b/modules/optflow/src/rlof/rlof_localflow.cpp index 8689aa3ed9f..e1e59860926 100644 --- a/modules/optflow/src/rlof/rlof_localflow.cpp +++ b/modules/optflow/src/rlof/rlof_localflow.cpp @@ -4,7 +4,7 @@ #include "../precomp.hpp" #include "opencv2/calib3d.hpp" // findHomography - +#include "opencv2/highgui.hpp" #include "rlof_localflow.h" #include "berlof_invoker.hpp" #include "rlof_invoker.hpp" @@ -156,13 
+156,13 @@ class HorizontalCrossSegmentation : public cv::ParallelLoopBody { if( m_mask->at(r,c) == 0) continue; - const cv::Point3_ & ucval = m_data->at>(r,c); - cv::Point3i val(static_cast(ucval.x), static_cast(ucval.y), static_cast(ucval.z)); + const Point3_ & ucval = m_data->at>(r,c); + Point3i val(static_cast(ucval.x), static_cast(ucval.y), static_cast(ucval.z)); int x = c - hWinSize; - cv::Point dstPos = m_stride ? cv::Point(r,c) : cv::Point(c,r); + Point dstPos = m_stride ? Point(r,c) : Point(c,r); for(int ix = 0; ix < m_winSize; ix++, x++) { - const cv::Point3_ & valref = m_data->at>(r,x); + const Point3_ & valref = m_data->at>(r,x); differenz[ix] = MAX(std::abs(static_cast(valref.x) - val.x), MAX(std::abs(static_cast(valref.y) - val.y), (std::abs(static_cast(valref.z) - val.z)))); @@ -460,6 +460,9 @@ void calcLocalOpticalFlowCore( tRGBPrevPyr = preCrossMap; } // apply plk like tracker + prevImage.adjustROI(iWinSize, iWinSize, iWinSize, iWinSize); + currImage.adjustROI(iWinSize, iWinSize, iWinSize, iWinSize); + derivI.adjustROI(iWinSize, iWinSize, iWinSize, iWinSize); if (isrobust(param) == false) { if (param.useIlluminationModel) diff --git a/modules/optflow/test/test_OF_accuracy.cpp b/modules/optflow/test/test_OF_accuracy.cpp index 2e8940b3a2c..5d093cd49c3 100644 --- a/modules/optflow/test/test_OF_accuracy.cpp +++ b/modules/optflow/test/test_OF_accuracy.cpp @@ -198,7 +198,7 @@ TEST(SparseOpticalFlow, ReferenceAccuracy) param->solverType = ST_BILINEAR; algo->setRLOFOpticalFlowParameter(param); algo->calc(frame1, frame2, prevPts, currPts, status, err); - EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.27f); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.3f); param->solverType = ST_STANDART; algo->setRLOFOpticalFlowParameter(param); @@ -267,7 +267,7 @@ TEST(DenseOpticalFlow_RLOF, ReferenceAccuracy) ASSERT_EQ(GT.rows, flow.rows); ASSERT_EQ(GT.cols, flow.cols); - EXPECT_LE(calcRMSE(GT, flow), 0.53f); + EXPECT_LE(calcRMSE(GT, flow), 0.55f); }