diff --git a/modules/optflow/CMakeLists.txt b/modules/optflow/CMakeLists.txt
index 39a37f143dd..e4232749ea2 100644
--- a/modules/optflow/CMakeLists.txt
+++ b/modules/optflow/CMakeLists.txt
@@ -1,2 +1,2 @@
 set(the_description "Optical Flow Algorithms")
-ocv_define_module(optflow opencv_core opencv_imgproc opencv_video opencv_ximgproc opencv_imgcodecs opencv_flann WRAP python)
+ocv_define_module(optflow opencv_core opencv_imgproc opencv_calib3d opencv_video opencv_ximgproc opencv_imgcodecs opencv_flann WRAP python)
diff --git a/modules/optflow/doc/optflow.bib b/modules/optflow/doc/optflow.bib
index c36504b3e3c..5b4869564bd 100644
--- a/modules/optflow/doc/optflow.bib
+++ b/modules/optflow/doc/optflow.bib
@@ -61,3 +61,52 @@ @inproceedings{Wang_2016_CVPR
   month = {June},
   year = {2016}
 }
+
+@inproceedings{Geistert2016,
+  author = {Jonas Geistert and Tobias Senst and Thomas Sikora},
+  title = {Robust Local Optical Flow: Dense Motion Vector Field Interpolation},
+  booktitle = {Picture Coding Symposium},
+  year = {2016},
+  pages = {1--5},
+}
+
+@inproceedings{Senst2016,
+  author = {Tobias Senst and Jonas Geistert and Thomas Sikora},
+  title = {Robust local optical flow: Long-range motions and varying illuminations},
+  booktitle = {IEEE International Conference on Image Processing},
+  year = {2016},
+  pages = {4478--4482},
+}
+
+@inproceedings{Senst2014,
+  author = {Tobias Senst and Thilo Borgmann and Ivo Keller and Thomas Sikora},
+  title = {Cross based Robust Local Optical Flow},
+  booktitle = {21st IEEE International Conference on Image Processing},
+  year = {2014},
+  pages = {1967--1971},
+}
+
+@inproceedings{Senst2013,
+  author = {Tobias Senst and Jonas Geistert and Ivo Keller and Thomas Sikora},
+  title = {Robust Local Optical Flow Estimation using Bilinear Equations for Sparse Motion Estimation},
+  booktitle = {20th IEEE International Conference on Image Processing},
+  year = {2013},
+  pages = {2499--2503},
+}
+
+@article{Senst2012,
+  author = {Tobias Senst and Volker Eiselein and Thomas Sikora},
+  title = {Robust Local Optical Flow for Feature Tracking},
+  journal = {IEEE Transactions on Circuits and Systems for Video Technology},
+  year = {2012},
+  volume = {22},
+  number = {9},
+  pages = {1377--1387},
+}
+
+@article{Tibshirani2008,
+  author = {Tibshirani, Ryan J.},
+  title = {Fast computation of the median by successive binning},
+  journal = {arXiv preprint arXiv:0806.3301},
+  year = {2008}
+}
diff --git a/modules/optflow/include/opencv2/optflow.hpp b/modules/optflow/include/opencv2/optflow.hpp
index 65802750d1a..093b5fe2c7f 100644
--- a/modules/optflow/include/opencv2/optflow.hpp
+++ b/modules/optflow/include/opencv2/optflow.hpp
@@ -68,7 +68,7 @@ Functions reading and writing .flo files in "Middlebury" format, see: <http://vision.middlebury.edu/flow/code/flow-code/README.txt>
+    float normSigma0;
+    /**< &sigma parameter of the shrunken Hampel norm introduced in @cite Senst2012. If
+     * &sigma = std::numeric_limits<float>::max() the least-squares estimator will be used
+     * instead of the M-estimator. Although the M-estimator is more robust against outliers in the
+     * support region, the least-squares estimator is faster to compute.
+     */
+    float normSigma1;
+    /**< &sigma parameter of the shrunken Hampel norm introduced in @cite Senst2012. If
+     * &sigma = std::numeric_limits<float>::max() the least-squares estimator will be used
+     * instead of the M-estimator. Although the M-estimator is more robust against outliers in the
+     * support region, the least-squares estimator is faster to compute.
+     */
+    int smallWinSize;
+    /**< Minimal window size of the support region. This parameter is only used if supportRegionType is SR_CROSS.
+     */
+    int largeWinSize;
+    /**< Maximal window size of the support region.
+     * If supportRegionType is SR_FIXED this gives the exact support region size.
+     * The speed of the RLOF is related to the applied window sizes: the smaller the window size,
+     * the lower the runtime, but the more sensitive to noise the method becomes.
+     */
+    int crossSegmentationThreshold;
+    /**< Color similarity threshold used by the cross-based segmentation following @cite Senst2014 .
+     * (Only used if supportRegionType is SR_CROSS.) With the cross-based segmentation,
+     * motion boundaries can be computed more accurately.
+     */
+    int maxLevel;
+    /**< Maximal number of pyramid levels used. The larger this value is, the more likely it is
+     * to obtain accurate solutions for long-range motions. The runtime is linearly related to
+     * this parameter.
+     */
+    bool useInitialFlow;
+    /**< Use the next point list as initial values. A good initialization can improve the algorithm's
+     * accuracy and reduce the runtime by a faster convergence of the iterative refinement.
+     */
+    bool useIlluminationModel;
+    /**< Use the Gennert and Negahdaripour illumination model instead of the intensity brightness
+     * constraint (proposed in @cite Senst2016 ). This model is defined as follows:
+     * \f[ I(\mathbf{x},t) + m \cdot I(\mathbf{x},t) + c = I(\mathbf{x},t+1) \f]
+     * and contains, with m and c, a multiplicative and an additive term which make the estimate
+     * more robust against illumination changes. The computational complexity is increased by
+     * enabling the illumination model.
+     */
+    bool useGlobalMotionPrior;
+    /**< Use the global motion prior initialisation introduced in @cite Senst2016 . It
+     * allows the method to be more accurate for long-range motion. The computational complexity is
+     * slightly increased by enabling the global motion prior initialisation.
+     */
+    int maxIteration;
+    /**< Maximal number of iterations used for the iterative refinement. Lower values can
+     * reduce the runtime but also the accuracy.
+     */
+    float minEigenValue;
+    /**< Threshold for the minimal eigenvalue of the gradient matrix, defining when to abort the
+     * iterative refinement.
+     */
+    float globalMotionRansacThreshold;
+    /**< To apply the global motion prior, motion vectors are computed on a regularly sampled grid,
+     * which forms the basis for homography estimation using RANSAC. The reprojection threshold is
+     * based on the n-th percentile (given by this value [0 ... 100]) of the motion vector magnitudes.
+     * See @cite Senst2016 for more details.
+     */
+
+    CV_WRAP void setSolverType(SolverType val);
+    CV_WRAP SolverType getSolverType() const;
+
+    CV_WRAP void setSupportRegionType(SupportRegionType val);
+    CV_WRAP SupportRegionType getSupportRegionType() const;
+
+    CV_WRAP void setNormSigma0(float val);
+    CV_WRAP float getNormSigma0() const;
+
+    CV_WRAP void setNormSigma1(float val);
+    CV_WRAP float getNormSigma1() const;
+
+    CV_WRAP void setSmallWinSize(int val);
+    CV_WRAP int getSmallWinSize() const;
+
+    CV_WRAP void setLargeWinSize(int val);
+    CV_WRAP int getLargeWinSize() const;
+
+    CV_WRAP void setCrossSegmentationThreshold(int val);
+    CV_WRAP int getCrossSegmentationThreshold() const;
+
+    CV_WRAP void setMaxLevel(int val);
+    CV_WRAP int getMaxLevel() const;
+
+    CV_WRAP void setUseInitialFlow(bool val);
+    CV_WRAP bool getUseInitialFlow() const;
+
+    CV_WRAP void setUseIlluminationModel(bool val);
+    CV_WRAP bool getUseIlluminationModel() const;
+
+    CV_WRAP void setUseGlobalMotionPrior(bool val);
+    CV_WRAP bool getUseGlobalMotionPrior() const;
+
+    CV_WRAP void setMaxIteration(int val);
+    CV_WRAP int getMaxIteration() const;
+
+    CV_WRAP void setMinEigenValue(float val);
+    CV_WRAP float getMinEigenValue() const;
+
+    CV_WRAP void setGlobalMotionRansacThreshold(float val);
+    CV_WRAP float getGlobalMotionRansacThreshold() const;
+
+    //! @brief Creates an instance of optflow::RLOFOpticalFlowParameter
+    CV_WRAP static Ptr<RLOFOpticalFlowParameter> create();
+};
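For orientation, a minimal usage sketch of the parameter block declared above; it is not part of the patch, and `makeRlofParam` plus the chosen values are purely illustrative:

```cpp
#include <opencv2/optflow.hpp>

using namespace cv;
using namespace cv::optflow;

// Sketch: configure the RLOF parameters documented above.
Ptr<RLOFOpticalFlowParameter> makeRlofParam()  // hypothetical helper
{
    Ptr<RLOFOpticalFlowParameter> param = RLOFOpticalFlowParameter::create();
    param->setSupportRegionType(SR_CROSS);  // adaptive cross-based support region (needs 3-channel input)
    param->setSolverType(ST_BILINEAR);      // bilinear equation solver (see Senst2013)
    param->setLargeWinSize(21);             // larger window: more robust to noise, but slower
    param->setUseIlluminationModel(true);   // Gennert/Negahdaripour model (see Senst2016)
    param->setMaxLevel(4);                  // more pyramid levels help long-range motion
    return param;
}
```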
+
+/** @brief Fast dense optical flow computation based on robust local optical flow (RLOF) algorithms and a sparse-to-dense
+ * interpolation scheme.
+ *
+ * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as
+ * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK().
+ *
+ * The sparse-to-dense interpolation scheme allows for fast computation of dense optical flow using RLOF (see @cite Geistert2016).
+ * For this scheme the following steps are applied:
+ * -# motion vectors seeded at a regularly sampled grid are computed. The sparsity of this grid can be configured with setGridStep
+ * -# (optionally) erroneous motion vectors are filtered based on the forward backward confidence. The threshold can be configured
+ * with setForwardBackward. The filter is only applied if the threshold > 0, but then the runtime is doubled due to the estimation
+ * of the backward flow.
+ * -# vector field interpolation is applied to the motion vector set to obtain a dense vector field.
+ *
+ * For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details.
+ * Parameters have been described in @cite Senst2012 @cite Senst2013 @cite Senst2014 and @cite Senst2016.
+ *
+ * @note SIMD parallelization is only available when compiling with SSE4.1. If the grid size is set to (1,1) and the
+ * forward backward threshold <= 0, the dense optical flow field is computed purely with the RLOF.
+ *
+ * @see optflow::calcOpticalFlowDenseRLOF(), optflow::RLOFOpticalFlowParameter
+*/
+class CV_EXPORTS_W DenseRLOFOpticalFlow : public DenseOpticalFlow
+{
+public:
+    //! @brief Configuration of the RLOF algorithm.
+    /**
+    @see optflow::RLOFOpticalFlowParameter, getRLOFOpticalFlowParameter
+    */
+    CV_WRAP virtual void setRLOFOpticalFlowParameter(Ptr<RLOFOpticalFlowParameter> val) = 0;
+    /** @copybrief setRLOFOpticalFlowParameter
+    @see optflow::RLOFOpticalFlowParameter, setRLOFOpticalFlowParameter
+    */
+    CV_WRAP virtual Ptr<RLOFOpticalFlowParameter> getRLOFOpticalFlowParameter() const = 0;
+    //! @brief Threshold for the forward backward confidence check
+    /** For each grid point \f$ \mathbf{x} \f$ a motion vector \f$ d_{I0,I1}(\mathbf{x}) \f$ is computed.
+     * If the forward backward error \f[ EP_{FB} = || d_{I0,I1} + d_{I1,I0} || \f]
+     * is larger than the threshold given by this function, then the motion vector will not be used by the following
+     * vector field interpolation. \f$ d_{I1,I0} \f$ denotes the backward flow. Note that the forward backward test
+     * will only be applied if the threshold > 0. This may result in a doubled runtime for the motion estimation.
+     * @see getForwardBackward, setGridStep
+     */
+    CV_WRAP virtual void setForwardBackward(float val) = 0;
+    /** @copybrief setForwardBackward
+    @see setForwardBackward
+    */
+    CV_WRAP virtual float getForwardBackward() const = 0;
+    //! @brief Size of the grid to spawn the motion vectors.
+    /** For each grid point a motion vector is computed. Some motion vectors will be removed due to the forward backward
+     * threshold (if set > 0). The remaining ones are the base of the vector field interpolation.
+     * @see getForwardBackward, setGridStep
+     */
+    CV_WRAP virtual Size getGridStep() const = 0;
+    /** @copybrief getGridStep
+     * @see getGridStep
+     */
+    CV_WRAP virtual void setGridStep(Size val) = 0;
+
+    //! @brief Interpolation used to compute the dense optical flow.
+    /** Two interpolation algorithms are supported:
+     * - **INTERP_GEO** applies the fast geodesic interpolation, see @cite Geistert2016.
+     * - **INTERP_EPIC_RESIDUAL** applies the edge-preserving interpolation, see @cite Revaud2015,Geistert2016.
+     * @see ximgproc::EdgeAwareInterpolator, getInterpolation
+     */
+    CV_WRAP virtual void setInterpolation(InterpolationType val) = 0;
+    /** @copybrief setInterpolation
+     * @see ximgproc::EdgeAwareInterpolator, setInterpolation
+     */
+    CV_WRAP virtual InterpolationType getInterpolation() const = 0;
+    //! @brief see ximgproc::EdgeAwareInterpolator() K value.
+    /** K is the number of nearest-neighbor matches considered when fitting a locally affine
+     * model. Usually it should be around 128. However, lower values would make the interpolation noticeably faster.
+     * @see ximgproc::EdgeAwareInterpolator, setEPICK
+     */
+    CV_WRAP virtual int getEPICK() const = 0;
+    /** @copybrief getEPICK
+     * @see ximgproc::EdgeAwareInterpolator, getEPICK
+     */
+    CV_WRAP virtual void setEPICK(int val) = 0;
+    //! @brief see ximgproc::EdgeAwareInterpolator() sigma value.
+    /** Sigma is a parameter defining how fast the weights decrease in the locally-weighted affine
+     * fitting. Higher values can help preserve fine details, lower values can help to get rid of noise in the
+     * output flow.
+     * @see ximgproc::EdgeAwareInterpolator, setEPICSigma
+     */
+    CV_WRAP virtual float getEPICSigma() const = 0;
+    /** @copybrief getEPICSigma
+     * @see ximgproc::EdgeAwareInterpolator, getEPICSigma
+     */
+    CV_WRAP virtual void setEPICSigma(float val) = 0;
+    //! @brief see ximgproc::EdgeAwareInterpolator() lambda value.
+    /** Lambda is a parameter defining the weight of the edge-aware term in geodesic distance;
+     * it should be in the range of 0 to 1000.
+     * @see ximgproc::EdgeAwareInterpolator, setEPICLambda
+     */
+    CV_WRAP virtual float getEPICLambda() const = 0;
+    /** @copybrief getEPICLambda
+     * @see ximgproc::EdgeAwareInterpolator, getEPICLambda
+     */
+    CV_WRAP virtual void setEPICLambda(float val) = 0;
+    //! @brief see ximgproc::EdgeAwareInterpolator().
+    /** Sets the respective fastGlobalSmootherFilter() parameter.
+     * @see ximgproc::EdgeAwareInterpolator, setFgsLambda
+     */
+    CV_WRAP virtual float getFgsLambda() const = 0;
+    /** @copybrief getFgsLambda
+     * @see ximgproc::EdgeAwareInterpolator, ximgproc::fastGlobalSmootherFilter, getFgsLambda
+     */
+    CV_WRAP virtual void setFgsLambda(float val) = 0;
+    //! @brief see ximgproc::EdgeAwareInterpolator().
+    /** Sets the respective fastGlobalSmootherFilter() parameter.
+     * @see ximgproc::EdgeAwareInterpolator, ximgproc::fastGlobalSmootherFilter, setFgsSigma
+     */
+    CV_WRAP virtual float getFgsSigma() const = 0;
+    /** @copybrief getFgsSigma
+     * @see ximgproc::EdgeAwareInterpolator, ximgproc::fastGlobalSmootherFilter, getFgsSigma
+     */
+    CV_WRAP virtual void setFgsSigma(float val) = 0;
+    //! @brief enables ximgproc::fastGlobalSmootherFilter
+    /**
+     * @see getUsePostProc
+     */
+    CV_WRAP virtual void setUsePostProc(bool val) = 0;
+    /** @copybrief setUsePostProc
+     * @see ximgproc::fastGlobalSmootherFilter, setUsePostProc
+     */
+    CV_WRAP virtual bool getUsePostProc() const = 0;
+    //! @brief Creates an instance of optflow::DenseRLOFOpticalFlow
+    /**
+     * @param rlofParam see optflow::RLOFOpticalFlowParameter
+     * @param forwardBackwardThreshold see setForwardBackward
+     * @param gridStep see setGridStep
+     * @param interp_type see setInterpolation
+     * @param epicK see setEPICK
+     * @param epicSigma see setEPICSigma
+     * @param epicLambda see setEPICLambda
+     * @param use_post_proc see setUsePostProc
+     * @param fgsLambda see setFgsLambda
+     * @param fgsSigma see setFgsSigma
+     */
+    CV_WRAP static Ptr<DenseRLOFOpticalFlow> create(
+        Ptr<RLOFOpticalFlowParameter> rlofParam = Ptr<RLOFOpticalFlowParameter>(),
+        float forwardBackwardThreshold = 1.f,
+        Size gridStep = Size(6, 6),
+        InterpolationType interp_type = InterpolationType::INTERP_EPIC,
+        int epicK = 128,
+        float epicSigma = 0.05f,
+        float epicLambda = 999.0f,
+        bool use_post_proc = true,
+        float fgsLambda = 500.0f,
+        float fgsSigma = 1.5f);
+};
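A hedged usage sketch of the class interface just declared; `denseRlofDemo` and the file names are illustrative, not part of the patch:

```cpp
#include <opencv2/imgcodecs.hpp>
#include <opencv2/optflow.hpp>

using namespace cv;
using namespace cv::optflow;

void denseRlofDemo()  // hypothetical
{
    Mat I0 = imread("frame0.png");  // 8-bit, 3-channel (required by the default SR_CROSS)
    Mat I1 = imread("frame1.png");
    Mat flow;                       // output: CV_32FC2, same size as I0

    Ptr<DenseRLOFOpticalFlow> algo = DenseRLOFOpticalFlow::create();
    algo->setGridStep(Size(6, 6));        // seed a motion vector at every 6th pixel
    algo->setForwardBackward(1.f);        // > 0 enables the FB check (doubles the runtime)
    algo->setInterpolation(INTERP_EPIC);  // edge-preserving sparse-to-dense interpolation
    algo->calc(I0, I1, flow);
}
```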
+
+/** @brief Class used for calculating a sparse optical flow and feature tracking with robust local optical flow (RLOF) algorithms.
+*
+* The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as
+* proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK().
+*
+* For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details.
+* Parameters have been described in @cite Senst2012, @cite Senst2013, @cite Senst2014 and @cite Senst2016.
+*
+* @note SIMD parallelization is only available when compiling with SSE4.1.
+* @see optflow::calcOpticalFlowSparseRLOF(), optflow::RLOFOpticalFlowParameter
+*/
+class CV_EXPORTS_W SparseRLOFOpticalFlow : public SparseOpticalFlow
+{
+public:
+    /** @copydoc DenseRLOFOpticalFlow::setRLOFOpticalFlowParameter
+     */
+    CV_WRAP virtual void setRLOFOpticalFlowParameter(Ptr<RLOFOpticalFlowParameter> val) = 0;
+    /** @copybrief setRLOFOpticalFlowParameter
+     * @see setRLOFOpticalFlowParameter
+     */
+    CV_WRAP virtual Ptr<RLOFOpticalFlowParameter> getRLOFOpticalFlowParameter() const = 0;
+    //! @brief Threshold for the forward backward confidence check
+    /** For each feature point a motion vector \f$ d_{I0,I1}(\mathbf{x}) \f$ is computed.
+     * If the forward backward error \f[ EP_{FB} = || d_{I0,I1} + d_{I1,I0} || \f]
+     * is larger than the threshold given by this function, then the corresponding feature is marked as not
+     * successfully tracked. \f$ d_{I1,I0} \f$ denotes the backward flow. Note that the forward backward test
+     * will only be applied if the threshold > 0. This may result in a doubled runtime for the motion estimation.
+     * @see setForwardBackward
+     */
+    CV_WRAP virtual void setForwardBackward(float val) = 0;
+    /** @copybrief setForwardBackward
+     * @see setForwardBackward
+     */
+    CV_WRAP virtual float getForwardBackward() const = 0;
+
+    //! @brief Creates an instance of SparseRLOFOpticalFlow
+    /**
+     * @param rlofParam see setRLOFOpticalFlowParameter
+     * @param forwardBackwardThreshold see setForwardBackward
+     */
+    CV_WRAP static Ptr<SparseRLOFOpticalFlow> create(
+        Ptr<RLOFOpticalFlowParameter> rlofParam = Ptr<RLOFOpticalFlowParameter>(),
+        float forwardBackwardThreshold = 1.f);
+
+};
+
+/** @brief Fast dense optical flow computation based on robust local optical flow (RLOF) algorithms and a sparse-to-dense interpolation scheme.
+ *
+ * The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as
+ * proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK().
+ *
+ * The sparse-to-dense interpolation scheme allows for fast computation of dense optical flow using RLOF (see @cite Geistert2016).
+ * For this scheme the following steps are applied:
+ * -# motion vectors seeded at a regularly sampled grid are computed. The sparsity of this grid can be configured with setGridStep
+ * -# (optionally) erroneous motion vectors are filtered based on the forward backward confidence. The threshold can be configured
+ * with setForwardBackward. The filter is only applied if the threshold > 0, but then the runtime is doubled due to the estimation
+ * of the backward flow.
+ * -# vector field interpolation is applied to the motion vector set to obtain a dense vector field.
+ *
+ * @param I0 first 8-bit input image. If the cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType
+ * = SupportRegionType::SR_CROSS) the image has to be an 8-bit 3-channel image.
+ * @param I1 second 8-bit input image. If the cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType
+ * = SupportRegionType::SR_CROSS) the image has to be an 8-bit 3-channel image.
+ * @param flow computed flow image that has the same size as I0 and type CV_32FC2.
+ * @param rlofParam see optflow::RLOFOpticalFlowParameter
+ * @param forwardBackwardThreshold Threshold for the forward backward confidence check.
+ * For each grid point \f$ \mathbf{x} \f$ a motion vector \f$ d_{I0,I1}(\mathbf{x}) \f$ is computed.
+ * If the forward backward error \f[ EP_{FB} = || d_{I0,I1} + d_{I1,I0} || \f]
+ * is larger than the threshold given by this function, then the motion vector will not be used by the following
+ * vector field interpolation. \f$ d_{I1,I0} \f$ denotes the backward flow. Note that the forward backward test
+ * will only be applied if the threshold > 0. This may result in a doubled runtime for the motion estimation.
+ * @param gridStep Size of the grid to spawn the motion vectors. For each grid point a motion vector is computed.
+ * Some motion vectors will be removed due to the forward backward threshold (if set > 0). The remaining ones are
+ * the base of the vector field interpolation.
+ * @param interp_type interpolation method used to compute the dense optical flow. Two interpolation algorithms are
+ * supported:
+ * - **INTERP_GEO** applies the fast geodesic interpolation, see @cite Geistert2016.
+ * - **INTERP_EPIC_RESIDUAL** applies the edge-preserving interpolation, see @cite Revaud2015,Geistert2016.
+ * @param epicK see ximgproc::EdgeAwareInterpolator(); sets the respective parameter.
+ * @param epicSigma see ximgproc::EdgeAwareInterpolator(); sets the respective parameter.
+ * @param epicLambda see ximgproc::EdgeAwareInterpolator(); sets the respective parameter.
+ * @param use_post_proc enables ximgproc::fastGlobalSmootherFilter() post-processing.
+ * @param fgsLambda sets the respective ximgproc::fastGlobalSmootherFilter() parameter.
+ * @param fgsSigma sets the respective ximgproc::fastGlobalSmootherFilter() parameter.
+ *
+ * Parameters have been described in @cite Senst2012, @cite Senst2013, @cite Senst2014, @cite Senst2016.
+ * For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details.
+ * @note If the grid size is set to (1,1) and the forward backward threshold <= 0, the dense optical flow field is
+ * computed purely with the RLOF.
+ *
+ * @note SIMD parallelization is only available when compiling with SSE4.1.
+ *
+ * @sa optflow::DenseRLOFOpticalFlow, optflow::RLOFOpticalFlowParameter
+*/
+CV_EXPORTS_W void calcOpticalFlowDenseRLOF(InputArray I0, InputArray I1, InputOutputArray flow,
+    Ptr<RLOFOpticalFlowParameter> rlofParam = Ptr<RLOFOpticalFlowParameter>(),
+    float forwardBackwardThreshold = 0, Size gridStep = Size(6, 6),
+    InterpolationType interp_type = InterpolationType::INTERP_EPIC,
+    int epicK = 128, float epicSigma = 0.05f, float epicLambda = 100.f,
+    bool use_post_proc = true, float fgsLambda = 500.0f, float fgsSigma = 1.5f);
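The same dense computation through the convenience function just declared, continuing the earlier sketch (`I0`/`I1` as before):

```cpp
// Sketch: one-call dense RLOF; trailing arguments keep their defaults.
Mat flow;
Ptr<RLOFOpticalFlowParameter> param = RLOFOpticalFlowParameter::create();
calcOpticalFlowDenseRLOF(I0, I1, flow, param,
                         1.f,                              // forwardBackwardThreshold
                         Size(6, 6),                       // gridStep
                         InterpolationType::INTERP_EPIC);  // interp_type
```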
+
+/** @brief Calculates fast optical flow for a sparse feature set using the robust local optical flow (RLOF) similar
+* to optflow::calcOpticalFlowPyrLK().
+*
+* The RLOF is a fast local optical flow approach similar to the pyramidal iterative Lucas-Kanade method as
+* proposed by @cite Bouguet00. The implementation is derived from optflow::calcOpticalFlowPyrLK().
+*
+* @param prevImg first 8-bit input image. If the cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType
+* = SupportRegionType::SR_CROSS) the image has to be an 8-bit 3-channel image.
+* @param nextImg second 8-bit input image. If the cross-based RLOF is used (by selecting optflow::RLOFOpticalFlowParameter::supportRegionType
+* = SupportRegionType::SR_CROSS) the image has to be an 8-bit 3-channel image.
+* @param prevPts vector of 2D points for which the flow needs to be found; point coordinates must be single-precision
+* floating-point numbers.
+* @param nextPts output vector of 2D points (with single-precision floating-point coordinates) containing the calculated
+* new positions of the input features in the second image; when optflow::RLOFOpticalFlowParameter::useInitialFlow is true the vector must
+* have the same size as the input and contain the initialization point correspondences.
+* @param status output status vector (of unsigned chars); each element of the vector is set to 1 if the flow for the
+* corresponding feature has passed the forward backward check.
+* @param err output vector of errors; each element of the vector is set to the forward backward error for the corresponding feature.
+* @param rlofParam see optflow::RLOFOpticalFlowParameter
+* @param forwardBackwardThreshold Threshold for the forward backward confidence check. If forwardBackwardThreshold <= 0 the forward
+* backward check is not applied.
+*
+* @note SIMD parallelization is only available when compiling with SSE4.1.
+*
+* Parameters have been described in @cite Senst2012, @cite Senst2013, @cite Senst2014 and @cite Senst2016.
+* For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details.
+*/
+CV_EXPORTS_W void calcOpticalFlowSparseRLOF(InputArray prevImg, InputArray nextImg,
+    InputArray prevPts, InputOutputArray nextPts,
+    OutputArray status, OutputArray err,
+    Ptr<RLOFOpticalFlowParameter> rlofParam = Ptr<RLOFOpticalFlowParameter>(),
+    float forwardBackwardThreshold = 0);
+
+//! Additional interface to the Dense RLOF algorithm - optflow::calcOpticalFlowDenseRLOF()
+CV_EXPORTS_W Ptr<DenseOpticalFlow> createOptFlow_DenseRLOF();
+
+//! Additional interface to the Sparse RLOF algorithm - optflow::calcOpticalFlowSparseRLOF()
+CV_EXPORTS_W Ptr<SparseOpticalFlow> createOptFlow_SparseRLOF();
+//! @}
+
+} // namespace
+} // namespace
+#endif
diff --git a/modules/optflow/perf/perf_rlof.cpp b/modules/optflow/perf/perf_rlof.cpp
new file mode 100644
index 00000000000..1295c526a6e
--- /dev/null
+++ b/modules/optflow/perf/perf_rlof.cpp
@@ -0,0 +1,72 @@
+#include "perf_precomp.hpp"
+namespace opencv_test { namespace {
+
+typedef tuple<std::string, std::string, bool> ST_SR_IM_Sparse_t;
+typedef TestBaseWithParam<ST_SR_IM_Sparse_t> ST_SR_IM_Sparse;
+PERF_TEST_P(ST_SR_IM_Sparse, OpticalFlow_SparseRLOF,
+    testing::Combine(
+        testing::Values("ST_BILINEAR", "ST_STANDART"),
+        testing::Values("SR_CROSS", "SR_FIXED"),
+        testing::Values(true, false))
+)
+{
+    Mat frame1 = imread(getDataPath("cv/optflow/RubberWhale1.png"));
+    Mat frame2 = imread(getDataPath("cv/optflow/RubberWhale2.png"));
+    ASSERT_FALSE(frame1.empty());
+    ASSERT_FALSE(frame2.empty());
+    vector<Point2f> prevPts, currPts;
+    for (int r = 0; r < frame1.rows; r += 10)
+    {
+        for (int c = 0; c < frame1.cols; c += 10)
+        {
+            prevPts.push_back(Point2f(static_cast<float>(c), static_cast<float>(r)));
+        }
+    }
+    vector<uchar> status(prevPts.size());
+    vector<float> err(prevPts.size());
+
+    Ptr<RLOFOpticalFlowParameter> param = Ptr<RLOFOpticalFlowParameter>(new RLOFOpticalFlowParameter);
+    if (get<0>(GetParam()) == "ST_BILINEAR")
+        param->solverType = ST_BILINEAR;
+    if (get<0>(GetParam()) == "ST_STANDART")
+        param->solverType = ST_STANDART;
+    if (get<1>(GetParam()) == "SR_CROSS")
+        param->supportRegionType = SR_CROSS;
+    if (get<1>(GetParam()) == "SR_FIXED")
+        param->supportRegionType = SR_FIXED;
+    param->useIlluminationModel = get<2>(GetParam());
+
+    PERF_SAMPLE_BEGIN()
+        calcOpticalFlowSparseRLOF(frame1, frame2, prevPts, currPts, status, err, param, 1.f);
+    PERF_SAMPLE_END()
+
+    SANITY_CHECK_NOTHING();
+}
+
+typedef tuple<std::string, int> INTERP_GRID_Dense_t;
+typedef TestBaseWithParam<INTERP_GRID_Dense_t> INTERP_GRID_Dense;
+PERF_TEST_P(INTERP_GRID_Dense, OpticalFlow_DenseRLOF,
+    testing::Combine(
+        testing::Values("INTERP_EPIC", "INTERP_GEO"),
+        testing::Values(4, 10))
+)
+{
+    Mat flow;
+    Mat frame1 = imread(getDataPath("cv/optflow/RubberWhale1.png"));
+    Mat frame2 = imread(getDataPath("cv/optflow/RubberWhale2.png"));
+    ASSERT_FALSE(frame1.empty());
+    ASSERT_FALSE(frame2.empty());
+    Ptr<RLOFOpticalFlowParameter> param = Ptr<RLOFOpticalFlowParameter>(new RLOFOpticalFlowParameter);
+    Ptr<DenseRLOFOpticalFlow> algo = DenseRLOFOpticalFlow::create();
+    InterpolationType interp_type = INTERP_EPIC;
+    if (get<0>(GetParam()) == "INTERP_EPIC")
+        interp_type = INTERP_EPIC;
+    if (get<0>(GetParam()) == "INTERP_GEO")
+        interp_type = INTERP_GEO;
+    PERF_SAMPLE_BEGIN()
+        calcOpticalFlowDenseRLOF(frame1, frame2, flow, param, 1.0f, Size(get<1>(GetParam()), get<1>(GetParam())), interp_type);
+    PERF_SAMPLE_END()
+    SANITY_CHECK_NOTHING();
+}
+
+}} // namespace
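A sparse-tracking sketch mirroring the perf test above (`frame1`/`frame2` as loaded there; `nextPts` starts empty, as in the test):

```cpp
// Sketch: grid-sampled sparse RLOF tracking.
std::vector<cv::Point2f> prevPts, nextPts;
for (int r = 0; r < frame1.rows; r += 10)      // one point every 10 pixels
    for (int c = 0; c < frame1.cols; c += 10)
        prevPts.push_back(cv::Point2f((float)c, (float)r));

std::vector<uchar> status(prevPts.size());
std::vector<float> err(prevPts.size());
cv::Ptr<cv::optflow::RLOFOpticalFlowParameter> param =
    cv::optflow::RLOFOpticalFlowParameter::create();
cv::optflow::calcOpticalFlowSparseRLOF(frame1, frame2, prevPts, nextPts,
                                       status, err, param, 1.f /* FB threshold */);
// status[i] == 1 means point i passed the forward backward check;
// err[i] then holds its forward backward error.
```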
diff --git a/modules/optflow/src/rlof/berlof_invoker.hpp b/modules/optflow/src/rlof/berlof_invoker.hpp
new file mode 100644
index 00000000000..604305250a5
--- /dev/null
+++ b/modules/optflow/src/rlof/berlof_invoker.hpp
@@ -0,0 +1,2107 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+#ifndef _BERLOF_INVOKER_HPP_
+#define _BERLOF_INVOKER_HPP_
+#include "rlof_invokerbase.hpp"
+
+
+namespace cv {
+namespace optflow {
+
+
+static inline bool checkSolution(float x, float y, float* c)
+{
+    float _a = x - 0.002f;
+    float _b = y - 0.002f;
+    cv::Point2f tl(c[0] * _a * _b + c[1] * _a + c[2] * _b + c[3], c[4] * _a * _b + c[5] * _a + c[6] * _b + c[7]);
+    _a = x + 0.002f;
+    cv::Point2f tr(c[0] * _a * _b + c[1] * _a + c[2] * _b + c[3], c[4] * _a * _b + c[5] * _a + c[6] * _b + c[7]);
+    _b = y + 0.002f;
+    cv::Point2f br(c[0] * _a * _b + c[1] * _a + c[2] * _b + c[3], c[4] * _a * _b + c[5] * _a + c[6] * _b + c[7]);
+    _a = x - 0.002f;
+    cv::Point2f bl(c[0] * _a * _b + c[1] * _a + c[2] * _b + c[3], c[4] * _a * _b + c[5] * _a + c[6] * _b + c[7]);
+    return (tl.x >= 0 && tl.y >= 0) && (tr.x <= 0 && tr.y >= 0)
+        && (bl.x >= 0 && bl.y <= 0) && (br.x <= 0 && br.y <= 0);
+}
+
+static inline cv::Point2f est_DeltaGain(const cv::Matx44f& src, const cv::Vec4f& val)
+{
+    return cv::Point2f(
+        src(2,0) * val[0] + src(2,1) * val[1] + src(2,2) * val[2] + src(2,3) * val[3],
+        src(3,0) * val[0] + src(3,1) * val[1] + src(3,2) * val[2] + src(3,3) * val[3]);
+}
+static inline void est_Result(const cv::Matx44f& src, const cv::Vec4f& val, cv::Point2f& delta, cv::Point2f& gain)
+{
+    delta = cv::Point2f(
+        -(src(0,0) * val[0] + src(0,1) * val[1] + src(0,2) * val[2] + src(0,3) * val[3]),
+        -(src(1,0) * val[0] + src(1,1) * val[1] + src(1,2) * val[2] + src(1,3) * val[3]));
+
+    gain = cv::Point2f(
+        src(2,0) * val[0] + src(2,1) * val[1] + src(2,2) * val[2] + src(2,3) * val[3],
+        src(3,0) * val[0] + src(3,1) * val[1] + src(3,2) * val[2] + src(3,3) * val[3]);
+}
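+
+// Note: est_DeltaGain and est_Result read the solution of the 4x4 system out of the
+// inverted tensor: rows 0-1 give the (negated) displacement update, rows 2-3 the
+// illumination gain/offset update. checkSolution samples the bilinear residual c at
+// four points offset by +/-0.002 around (x, y) and accepts (x, y) only if the sign
+// pattern shows the residual field pointing inward, i.e. (x, y) is a sink of the
+// bilinear system.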
+
+namespace berlof {
+
+namespace ica {
+
+class TrackerInvoker : public cv::ParallelLoopBody
+{
+public:
+    TrackerInvoker(
+        const Mat& _prevImg,
+        const Mat& _prevDeriv,
+        const Mat& _nextImg,
+        const Mat& _rgbPrevImg,
+        const Mat& _rgbNextImg,
+        const Point2f* _prevPts,
+        Point2f* _nextPts,
+        uchar* _status,
+        float* _err,
+        int _level,
+        int _maxLevel,
+        int _winSize[2],
+        int _maxIteration,
+        bool _useInitialFlow,
+        int _supportRegionType,
+        int _crossSegmentationThreshold,
+        const std::vector<float>& _normSigmas,
+        float _minEigenValue
+    ) :
+        normSigma0(_normSigmas[0]),
+        normSigma1(_normSigmas[1]),
+        normSigma2(_normSigmas[2])
+    {
+        prevImg = &_prevImg;
+        prevDeriv = &_prevDeriv;
+        nextImg = &_nextImg;
+        rgbPrevImg = &_rgbPrevImg;
+        rgbNextImg = &_rgbNextImg;
+        prevPts = _prevPts;
+        nextPts = _nextPts;
+        status = _status;
+        err = _err;
+        minWinSize = _winSize[0];
+        maxWinSize = _winSize[1];
+        criteria.maxCount = _maxIteration;
+        criteria.epsilon = 0.01;
+        level = _level;
+        maxLevel = _maxLevel;
+        windowType = _supportRegionType;
+        minEigThreshold = _minEigenValue;
+        useInitialFlow = _useInitialFlow;
+        crossSegmentationThreshold = _crossSegmentationThreshold;
+    }
+
+    void operator()(const cv::Range& range) const CV_OVERRIDE
+    {
+        Point2f halfWin;
+        cv::Size winSize;
+        const Mat& I = *prevImg;
+        const Mat& J = *nextImg;
+        const Mat& derivI = *prevDeriv;
+        const Mat& BI = *rgbPrevImg;
+
+        const float FLT_SCALE = (1.f/(1 << 16));
+
+        winSize = cv::Size(maxWinSize, maxWinSize);
+        int winMaskwidth = roundUp(winSize.width, 16);
+        cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType);
+        winMaskMatBuf.setTo(1);
+
+        int j, cn = I.channels(), cn2 = cn*2;
+        int winbufwidth = roundUp(winSize.width, 16);
+        cv::Size winBufSize(winbufwidth, winbufwidth);
+
+        cv::AutoBuffer<deriv_type> _buf(winBufSize.area()*(cn + cn2));
+        int derivDepth = DataType<deriv_type>::depth;
+
+        Mat IWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn), (deriv_type*)_buf.data());
+        Mat derivIWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn2), (deriv_type*)_buf.data() + winBufSize.area()*cn);
+
+        for( int ptidx = range.start; ptidx < range.end; ptidx++ )
+        {
+            Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level));
+            Point2f nextPt;
+            if( level == maxLevel )
+            {
+                if( useInitialFlow )
+                    nextPt = nextPts[ptidx]*(float)(1./(1 << level));
+                else
+                    nextPt = prevPt;
+            }
+            else
+                nextPt = nextPts[ptidx]*2.f;
+            nextPts[ptidx] = nextPt;
+
+            Point2i iprevPt, inextPt;
+            iprevPt.x = cvFloor(prevPt.x);
+            iprevPt.y = cvFloor(prevPt.y);
+            int winArea = maxWinSize * maxWinSize;
+            cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0, 0, maxWinSize, maxWinSize));
+            winMaskMatBuf.setTo(0);
+            if( calcWinMaskMat(BI, windowType, iprevPt,
+                               winMaskMat, winSize, halfWin, winArea,
+                               minWinSize, maxWinSize) == false)
+            {
+                continue;
+            }
+            halfWin = Point2f(static_cast<float>(maxWinSize), static_cast<float>(maxWinSize)) - halfWin;
+            prevPt += halfWin;
+            iprevPt.x = cvFloor(prevPt.x);
+            iprevPt.y = cvFloor(prevPt.y);
+            if( iprevPt.x < 0 || iprevPt.x >= derivI.cols - winSize.width ||
+                iprevPt.y < 0 || iprevPt.y >= derivI.rows - winSize.height - 1)
+            {
+                if( level == 0 )
+                {
+                    if( status )
+                        status[ptidx] = 3;
+                    if( err )
+                        err[ptidx] = 0;
+                }
+                continue;
+            }
+
+            float a = prevPt.x - iprevPt.x;
+            float b = prevPt.y - iprevPt.y;
+            const int W_BITS = 14, W_BITS1 = 14;
+
+            int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS));
+            int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS));
+            int iw10 = cvRound((1.f - a)*b*(1 << W_BITS));
+            int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
+
+            float A11 = 0, A12 = 0, A22 = 0;
+            float D = 0;
+
+            // extract the patch from the first image, compute covariance matrix of derivatives
+            int x, y;
+            for( y = 0; y < winSize.height; y++ )
+            {
+                const uchar* src = I.ptr<uchar>(y + iprevPt.y, 0) + iprevPt.x*cn;
+                const uchar* src1 = I.ptr<uchar>(y + iprevPt.y + 1, 0) + iprevPt.x*cn;
+                const short* dsrc = derivI.ptr<short>(y + iprevPt.y, 0) + iprevPt.x*cn2;
+                const short* dsrc1 = derivI.ptr<short>(y + iprevPt.y + 1, 0) + iprevPt.x*cn2;
+                short* Iptr = IWinBuf.ptr<short>(y, 0);
+                short* dIptr = derivIWinBuf.ptr<short>(y, 0);
+                x = 0;
+                for( ; x < winSize.width*cn; x++, dsrc += 2, dsrc1 += 2, dIptr += 2 )
+                {
+                    if( winMaskMat.at<tMaskType>(y, x) == 0)
+                    {
+                        dIptr[0] = 0;
+                        dIptr[1] = 0;
+                        continue;
+                    }
+                    int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 +
+                                          src1[x]*iw10 + src1[x+cn]*iw11, W_BITS1-5);
+                    int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 +
+                                           dsrc1[0]*iw10 + dsrc1[cn2]*iw11, W_BITS1);
+                    int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 +
+                                           dsrc1[1]*iw10 + dsrc1[cn2+1]*iw11, W_BITS1);
+                    Iptr[x] = (short)ival;
+                    dIptr[0] = (short)ixval;
+                    dIptr[1] = (short)iyval;
+                }
+            }
+
+            cv::Mat residualMat = cv::Mat::zeros(winSize.height * (winSize.width + 8) * cn, 1, CV_16SC1);
+            cv::Point2f backUpNextPt = nextPt;
+            nextPt += halfWin;
+            Point2f prevDelta(0, 0);    // denotes h(t-1)
+            cv::Size _winSize = winSize;
+#ifdef RLOF_SSE
+            __m128i mmMask0, mmMask1, mmMask;
+            getWBitMask(_winSize.width, mmMask0, mmMask1, mmMask);
+#endif
+            float MEstimatorScale = 1;
+            int buffIdx = 0;
+            float c[8];
+            cv::Mat GMc0, GMc1, GMc2, GMc3;
+            cv::Vec2f Mc0, Mc1, Mc2, Mc3;
+            int noIteration = 0;
+            int noReusedIteration = 0;
+            int noSolvedIteration = 0;
+            for( j = 0; j < criteria.maxCount; j++ )
+            {
+                cv::Point2f delta(0, 0);
+                cv::Point2f
deltaGain(0,0); + bool hasSolved = false; + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + float ab = a * b; + if( j == 0 + || ( inextPt.x != cvFloor(nextPt.x) || inextPt.y != cvFloor(nextPt.y) || j % 2 != 0 )) + { + inextPt.x = cvFloor(nextPt.x); + inextPt.y = cvFloor(nextPt.y); + if( inextPt.x < 0 || inextPt.x >= J.cols - winSize.width || + inextPt.y < 0 || inextPt.y >= J.rows - winSize.height - 1) + { + if( level == 0 && status ) + status[ptidx] = 3; + break; + } + + + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + ab = a * b; + iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + // mismatch + if( j == 0 ) + { + A11 = 0; + A12 = 0; + A22 = 0; + } + + if ( j == 0 ) + { + buffIdx = 0; + for( y = 0; y < winSize.height; y++ ) + { + const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const uchar* Jptr1 = J.ptr(y + inextPt.y + 1, inextPt.x*cn); + const short* Iptr = IWinBuf.ptr(y, 0); + const short* dIptr = derivIWinBuf.ptr(y, 0); + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); + x = 0; + for( ; x < winSize.width*cn; x++, dIptr += 2) + { + if( maskPtr[x] == 0) + continue; + int diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + Jptr1[x]*iw10 + Jptr1[x+cn]*iw11, W_BITS1-5) + - Iptr[x]; + residualMat.at(buffIdx++) = static_cast(diff); + } + } + /*! Estimation for the residual */ + cv::Mat residualMatRoi(residualMat, cv::Rect(0,0,1, buffIdx)); + MEstimatorScale = (buffIdx == 0) ? 1.f : estimateScale(residualMatRoi); + } + + float eta = 1.f / winArea; + float fParam0 = normSigma0 * 32.f; + float fParam1 = normSigma1 * 32.f; + fParam0 = normSigma0 * MEstimatorScale; + fParam1 = normSigma1 * MEstimatorScale; + + + buffIdx = 0; + float _b0[4] = {0,0,0,0}; + float _b1[4] = {0,0,0,0}; + + /* + */ + for( y = 0; y < _winSize.height; y++ ) + { + const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const uchar* Jptr1 = J.ptr(y + inextPt.y + 1, inextPt.x*cn); + const short* Iptr = IWinBuf.ptr(y, 0); + const short* dIptr = derivIWinBuf.ptr(y, 0); + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); + x = 0; + for( ; x < _winSize.width*cn; x++, dIptr += 2 ) + { + if( maskPtr[x] == 0) + continue; + int illValue = - Iptr[x]; + + float It[4] = {static_cast((Jptr1[x+cn]<< 5) + illValue), + static_cast((Jptr[x+cn]<< 5) + illValue), + static_cast((Jptr1[x]<< 5) + illValue), + static_cast((Jptr[x] << 5) + illValue)}; + + + + int J_val = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + + Jptr1[x]*iw10 + Jptr1[x+cn]*iw11, + W_BITS1-5); + + + int diff = J_val + illValue; + + + MEstimatorScale += (diff < MEstimatorScale) ? -eta : eta; + + int abss = (diff < 0) ? 
-diff : diff; + + // compute the missmatch vector + if( j >= 0) + { + if( abss > fParam1) + { + It[0] = 0; + It[1] = 0; + It[2] = 0; + It[3] = 0; + } + else if( abss > fParam0 && diff >= 0 ) + { + It[0] = normSigma2 * (It[0] - fParam1); + It[1] = normSigma2 * (It[1] - fParam1); + It[2] = normSigma2 * (It[2] - fParam1); + It[3] = normSigma2 * (It[3] - fParam1); + } + else if( abss > fParam0 && diff < 0 ) + { + It[0] = normSigma2 * (It[0] + fParam1); + It[1] = normSigma2 * (It[1] + fParam1); + It[2] = normSigma2 * (It[2] + fParam1); + It[3] = normSigma2 * (It[3] + fParam1); + } + } + + float It0 = It[0]; + float It1 = It[1]; + float It2 = It[2]; + float It3 = It[3]; + + float ixval = static_cast(dIptr[0]); + float iyval = static_cast(dIptr[1]); + _b0[0] += It0 * ixval; + _b0[1] += It1 * ixval; + _b0[2] += It2 * ixval; + _b0[3] += It3 * ixval; + + + _b1[0] += It0*iyval; + _b1[1] += It1*iyval; + _b1[2] += It2*iyval; + _b1[3] += It3*iyval; + + + // compute the Gradient Matrice + if( j == 0) + { + float tale = normSigma2 * FLT_RESCALE; + if( abss < fParam0 || j < 0) + { + tale = FLT_RESCALE; + } + else if( abss > fParam1) + { + tale *= 0.01f; + } + else + { + tale *= normSigma2; + } + + A11 += (float)(ixval*ixval)*tale; + A12 += (float)(ixval*iyval)*tale; + A22 += (float)(iyval*iyval)*tale; + + } + } + } + + if( j == 0 ) + { + + A11 *= FLT_SCALE; // 54866744. + A12 *= FLT_SCALE; // -628764.00 + A22 *= FLT_SCALE; // 19730.000 + + D = A11 * A22 - A12 * A12; + float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + + 4.f*A12*A12))/(2*winArea); + + if( minEig < minEigThreshold || std::abs(D) < FLT_EPSILON) + { + if( level == 0 && status ) + status[ptidx] = 0; + if( level > 0) + { + nextPts[ptidx] = backUpNextPt; + } + noIteration++; + break; + } + + D = (1.f / D); + + } + + _b0[0] *= FLT_SCALE;_b0[1] *= FLT_SCALE;_b0[2] *= FLT_SCALE;_b0[3] *= FLT_SCALE; + _b1[0] *= FLT_SCALE;_b1[1] *= FLT_SCALE;_b1[2] *= FLT_SCALE;_b1[3] *= FLT_SCALE; + + + Mc0[0] = _b0[0] - _b0[1] - _b0[2] + _b0[3]; + Mc0[1] = _b1[0] - _b1[1] - _b1[2] + _b1[3]; + + Mc1[0] = _b0[1] - _b0[3]; + Mc1[1] = _b1[1] - _b1[3]; + + + Mc2[0] = _b0[2] - _b0[3]; + Mc2[1] = _b1[2] - _b1[3]; + + + Mc3[0] = _b0[3]; + Mc3[1] = _b1[3]; + + c[0] = -Mc0[0]; + c[1] = -Mc1[0]; + c[2] = -Mc2[0]; + c[3] = -Mc3[0]; + c[4] = -Mc0[1]; + c[5] = -Mc1[1]; + c[6] = -Mc2[1]; + c[7] = -Mc3[1]; + + float e0 = 1.f / (c[6] * c[0] - c[4] * c[2]); + float e1 = e0 * 0.5f * (c[6] * c[1] + c[7] * c[0] - c[5] * c[2] - c[4] * c[3]); + float e2 = e0 * (c[1] * c[7] -c[3] * c[5]); + e0 = e1 * e1 - e2; + hasSolved = false; + if ( e0 > 0) + { + e0 = sqrt(e0); + float _y[2] = {-e1 - e0, e0 - e1}; + float c0yc1[2] = {c[0] * _y[0] + c[1], + c[0] * _y[1] + c[1]}; + float _x[2] = {-(c[2] * _y[0] + c[3]) / c0yc1[0], + -(c[2] * _y[1] + c[3]) / c0yc1[1]}; + bool isIn1 = (_x[0] >=0 && _x[0] <=1 && _y[0] >= 0 && _y[0] <= 1); + bool isIn2 = (_x[1] >=0 && _x[1] <=1 && _y[1] >= 0 && _y[1] <= 1); + + bool isSolution1 = checkSolution(_x[0], _y[0], c ); + bool isSolution2 = checkSolution(_x[1], _y[1], c ); + bool isSink1 = isIn1 && isSolution1; + bool isSink2 = isIn2 && isSolution2; + + if ( isSink1 != isSink2) + { + a = isSink1 ? _x[0] : _x[1]; + b = isSink1 ? 
_y[0] : _y[1]; + ab = a * b; + hasSolved = true; + delta.x = inextPt.x + a - nextPt.x; + delta.y = inextPt.y + b - nextPt.y; + } // isIn1 != isIn2 + } + if( hasSolved == false) + noIteration++; + } + else + { + hasSolved = false; + noReusedIteration++; + } + if( hasSolved == false ) + { + + cv::Vec2f mismatchVec = ab * Mc0 + Mc1 *a + Mc2 * b + Mc3; + delta.x = (A12 * mismatchVec.val[1] - A22 * mismatchVec.val[0]) * D; + delta.y = (A12 * mismatchVec.val[0] - A11 * mismatchVec.val[1]) * D; + delta.x = MAX(-1.f, MIN(1.f, delta.x)); + delta.y = MAX(-1.f, MIN(1.f, delta.y)); + nextPt += delta; + nextPts[ptidx] = nextPt - halfWin; + } + else + { + nextPt += delta; + nextPts[ptidx] = nextPt - halfWin; + noSolvedIteration++; + break; + } + + delta.x = ( delta.x != delta.x) ? 0 : delta.x; + delta.y = ( delta.y != delta.y) ? 0 : delta.y; + + if(j > 0 && ( + (std::abs(delta.x - prevDelta.x) < 0.01 && std::abs(delta.y - prevDelta.y) < 0.01))) + { + nextPts[ptidx] -= delta*0.5f; + break; + } + + prevDelta = delta; + } + + } + + } + + const Mat* prevImg; + const Mat* nextImg; + const Mat* prevDeriv; + const Mat* rgbPrevImg; + const Mat* rgbNextImg; + const Point2f* prevPts; + Point2f* nextPts; + uchar* status; + float* err; + int maxWinSize; + int minWinSize; + TermCriteria criteria; + int level; + int maxLevel; + int windowType; + float minEigThreshold; + bool useInitialFlow; + const float normSigma0, normSigma1, normSigma2; + int crossSegmentationThreshold; + +}; + +} // namespace +namespace radial { +class TrackerInvoker : public cv::ParallelLoopBody +{ +public: + TrackerInvoker( + const Mat& _prevImg, + const Mat& _prevDeriv, + const Mat& _nextImg, + const Mat& _rgbPrevImg, + const Mat& _rgbNextImg, + const Point2f* _prevPts, + Point2f* _nextPts, + uchar* _status, + float* _err, + Point2f* _gainVecs, + int _level, + int _maxLevel, + int _winSize[2], + int _maxIteration, + bool _useInitialFlow, + int _supportRegionType, + int _crossSegmentationThreshold, + const std::vector& _normSigmas, + float _minEigenValue + ) : + normSigma0(_normSigmas[0]), + normSigma1(_normSigmas[1]), + normSigma2(_normSigmas[2]) + { + prevImg = &_prevImg; + prevDeriv = &_prevDeriv; + nextImg = &_nextImg; + rgbPrevImg = &_rgbPrevImg; + rgbNextImg = &_rgbNextImg; + prevPts = _prevPts; + nextPts = _nextPts; + status = _status; + err = _err; + gainVecs = _gainVecs; + minWinSize = _winSize[0]; + maxWinSize = _winSize[1]; + criteria.maxCount = _maxIteration; + criteria.epsilon = 0.01; + level = _level; + maxLevel = _maxLevel; + windowType = _supportRegionType; + minEigThreshold = _minEigenValue; + useInitialFlow = _useInitialFlow; + crossSegmentationThreshold = _crossSegmentationThreshold; + } + + void operator()(const cv::Range& range) const CV_OVERRIDE + { + Point2f halfWin; + cv::Size winSize; + const Mat& I = *prevImg; + const Mat& J = *nextImg; + const Mat& derivI = *prevDeriv; + const Mat& BI = *rgbPrevImg; + const float FLT_SCALE = (1.f/(1 << 16));//(1.f/(1 << 20)); // 20 + winSize = cv::Size(maxWinSize,maxWinSize); + int winMaskwidth = roundUp(winSize.width, 16); + cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); + winMaskMatBuf.setTo(1); + + int cn = I.channels(), cn2 = cn*2; + int winbufwidth = roundUp(winSize.width, 16); + cv::Size winBufSize(winbufwidth,winbufwidth); + + + cv::Matx44f invTensorMat; + + cv::AutoBuffer _buf(winBufSize.area()*(cn + cn2)); + int derivDepth = DataType::depth; + + Mat IWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn), (deriv_type*)_buf.data()); + Mat derivIWinBuf(winBufSize, 
CV_MAKETYPE(derivDepth, cn2), (deriv_type*)_buf.data() + winBufSize.area()*cn); + + for( int ptidx = range.start; ptidx < range.end; ptidx++ ) + { + Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level)); + Point2f nextPt; + if( level == maxLevel ) + { + if( useInitialFlow ) + nextPt = nextPts[ptidx]*(float)(1./(1 << level)); + else + nextPt = prevPt; + } + else + nextPt = nextPts[ptidx]*2.f; + nextPts[ptidx] = nextPt; + + Point2i iprevPt, inextPt; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + int winArea = maxWinSize * maxWinSize; + cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0,0, maxWinSize,maxWinSize)); + winMaskMatBuf.setTo(0); + if( calcWinMaskMat(BI, windowType, iprevPt, + winMaskMat,winSize,halfWin,winArea, + minWinSize,maxWinSize) == false) + continue; + halfWin = Point2f(static_cast(maxWinSize), static_cast(maxWinSize) ) - halfWin; + prevPt += halfWin; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + if( iprevPt.x < 0 || iprevPt.x >= derivI.cols - winSize.width || + iprevPt.y < 0 || iprevPt.y >= derivI.rows - winSize.height - 1) + { + if( level == 0 ) + { + if( status ) + status[ptidx] = 3; + if( err ) + err[ptidx] = 0; + } + continue; + } + + float a = prevPt.x - iprevPt.x; + float b = prevPt.y - iprevPt.y; + const int W_BITS = 14, W_BITS1 = 14; + + int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + float A11 = 0, A12 = 0, A22 = 0; + + // tensor + float sumIx = 0; + float sumIy = 0; + float sumI = 0; + float sumW = 0; + float w1 = 0, w2 = 0; // -IyI + float dI = 0; // I^2 + float D = 0; + +#ifdef RLOF_SSE + + __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128i z = _mm_setzero_si128(); + __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1-1)); + __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1-5-1)); + __m128i mmMask4_epi32; + __m128i mmMaskSet_epi16 = _mm_set1_epi16(std::numeric_limits::max()); + get4BitMask(winSize.width, mmMask4_epi32); +#endif + + // extract the patch from the first image, compute covariation matrix of derivatives + int x, y; + for( y = 0; y < winSize.height; y++ ) + { + x = 0; + const uchar* src = I.ptr(y + iprevPt.y, 0) + iprevPt.x*cn; + const uchar* src1 = I.ptr(y + iprevPt.y + 1, 0) + iprevPt.x*cn; + const short* dsrc = derivI.ptr(y + iprevPt.y, 0) + iprevPt.x*cn2; + const short* dsrc1 = derivI.ptr(y + iprevPt.y + 1, 0) + iprevPt.x*cn2; + short* Iptr = IWinBuf.ptr(y, 0); + short* dIptr = derivIWinBuf.ptr(y, 0); +#ifdef RLOF_SSE + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); + for( ; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4*2, dsrc1 += 8, dIptr += 4*2 ) + { + __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16); + __m128i mask_0_3_epi16 = _mm_unpacklo_epi16(mask_0_7_epi16, mask_0_7_epi16); + + + __m128i v00, v01, v10, v11, t0, t1; + v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); + v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); + v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x)), z); + v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x + cn)), z); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); + if( x + 4 > 
winSize.width) + { + t0 = _mm_and_si128(t0, mmMask4_epi32); + } + t0 = _mm_and_si128(t0, mask_0_3_epi16); + _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0,t0)); + + + v00 = _mm_loadu_si128((const __m128i*)(dsrc)); + v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); + v10 = _mm_loadu_si128((const __m128i*)(dsrc1)); + v11 = _mm_loadu_si128((const __m128i*)(dsrc1 + cn2)); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); + v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... + if( x + 4 > winSize.width) + { + v00 = _mm_and_si128(v00, mmMask4_epi32); + } + v00 = _mm_and_si128(v00, mask_0_3_epi16); + _mm_storeu_si128((__m128i*)dIptr, v00); + } +#else + + for( ; x < winSize.width*cn; x++, dsrc += 2, dsrc1 += 2, dIptr += 2 ) + { + if( winMaskMat.at(y,x) == 0) + { + dIptr[0] = 0; + dIptr[1] = 0; + continue; + } + int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + + src1[x]*iw10 + src1[x+cn]*iw11, W_BITS1-5); + int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + + dsrc1[0]*iw10 + dsrc1[cn2]*iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + + dsrc1[1]*iw10 + dsrc1[cn2+1]*iw11, W_BITS1); + + Iptr[x] = (short)ival; + dIptr[0] = (short)ixval; + dIptr[1] = (short)iyval; + + } +#endif + } + + cv::Mat residualMat = cv::Mat::zeros(winSize.height * (winSize.width + 8) * cn, 1, CV_16SC1); + cv::Point2f backUpNextPt = nextPt; + nextPt += halfWin; + Point2f prevDelta(0,0); //relates to h(t-1) + Point2f prevGain(1,0); + cv::Point2f gainVec = gainVecs[ptidx]; + cv::Point2f backUpGain = gainVec; + cv::Size _winSize = winSize; + int j; +#ifdef RLOF_SSE + __m128i mmMask0, mmMask1, mmMask; + getWBitMask(_winSize.width, mmMask0, mmMask1, mmMask); + __m128 mmOnes = _mm_set1_ps(1.f ); +#endif + float MEstimatorScale = 1; + int buffIdx = 0; + float c[8]; + cv::Mat GMc0, GMc1, GMc2, GMc3; + cv::Vec4f Mc0, Mc1, Mc2, Mc3; + int noIteration = 0; + int noReusedIteration = 0; + int noSolvedIteration = 0; + for( j = 0; j < criteria.maxCount; j++ ) + { + cv::Point2f delta(0,0); + cv::Point2f deltaGain(0,0); + bool hasSolved = false; + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + float ab = a * b; + if (j == 0 + || (inextPt.x != cvFloor(nextPt.x) || inextPt.y != cvFloor(nextPt.y) || j % 2 != 0) ) + { + inextPt.x = cvFloor(nextPt.x); + inextPt.y = cvFloor(nextPt.y); + + if( inextPt.x < 0 || inextPt.x >= J.cols - winSize.width || + inextPt.y < 0 || inextPt.y >= J.rows - winSize.height - 1) + { + if( level == 0 && status ) + status[ptidx] = 3; + if (level > 0) + { + nextPts[ptidx] = backUpNextPt; + gainVecs[ptidx] = backUpGain; + } + noIteration++; + break; + } + + + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + ab = a * b; + iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + // mismatch + + if( j == 0) + { + // tensor + w1 = 0; // -IxI + w2 = 0; // -IyI + dI = 0; // I^2 + sumIx = 0; + sumIy = 0; + sumI = 0; + sumW = 0; + A11 = 0; + A12 = 0; + A22 = 0; + } + + if ( j == 0 ) + { + buffIdx = 0; + for( y = 0; y < winSize.height; y++ ) + { + const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const uchar* Jptr1 
= J.ptr(y + inextPt.y + 1, inextPt.x*cn); + const short* Iptr = IWinBuf.ptr(y, 0); + const short* dIptr = derivIWinBuf.ptr(y, 0); + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); + x = 0; + for( ; x < winSize.width*cn; x++, dIptr += 2) + { + if( maskPtr[x] == 0) + continue; + int diff = static_cast(CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + Jptr1[x]*iw10 + Jptr1[x+cn]*iw11, W_BITS1-5) + - Iptr[x] + Iptr[x] * gainVec.x +gainVec.y); + residualMat.at(buffIdx++) = static_cast(diff); + } + } + /*! Estimation for the residual */ + cv::Mat residualMatRoi(residualMat, cv::Rect(0,0,1, buffIdx)); + MEstimatorScale = (buffIdx == 0) ? 1.f : estimateScale(residualMatRoi); + } + + float eta = 1.f / winArea; + float fParam0 = normSigma0 * 32.f; + float fParam1 = normSigma1 * 32.f; + fParam0 = normSigma0 * MEstimatorScale; + fParam1 = normSigma1 * MEstimatorScale; + + #ifdef RLOF_SSE + + qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128 qb0[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; + __m128 qb1[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; + __m128 qb2[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; + __m128 qb3[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; + __m128 mmSumW1 = _mm_setzero_ps(), mmSumW2 = _mm_setzero_ps(); + __m128 mmSumI = _mm_setzero_ps(), mmSumW = _mm_setzero_ps(), mmSumDI = _mm_setzero_ps(); + __m128 mmSumIy = _mm_setzero_ps(), mmSumIx = _mm_setzero_ps(); + __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps(); + __m128i mmParam0 = _mm_set1_epi16(MIN(std::numeric_limits::max() -1, static_cast(fParam0))); + __m128i mmParam1 = _mm_set1_epi16(MIN(std::numeric_limits::max()- 1, static_cast(fParam1))); + + + float s2Val = std::fabs(normSigma2); + int s2bitShift = normSigma2 == 0 ? 1 : cvCeil(log(200.f / s2Val) / log(2.f)); + __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast(normSigma2 * (float)(1 << s2bitShift))); + __m128i mmOness_epi16 = _mm_set1_epi16(1 << s2bitShift); + __m128 mmParam2s = _mm_set1_ps(0.01f * normSigma2); + __m128 mmParam2s2 = _mm_set1_ps(normSigma2 * normSigma2); + float gainVal = gainVec.x > 0 ? gainVec.x : -gainVec.x; + int bitShift = gainVec.x == 0 ? 
1 : cvCeil(log(200.f / gainVal) / log(2.f)); + __m128i mmGainValue_epi16 = _mm_set1_epi16(static_cast(gainVec.x * (float)(1 << bitShift))); + __m128i mmConstValue_epi16 = _mm_set1_epi16(static_cast(gainVec.y)); + __m128i mmEta = _mm_setzero_si128(); + __m128i mmScale = _mm_set1_epi16(static_cast(MEstimatorScale)); + + #endif + + buffIdx = 0; + float _b0[4] = {0,0,0,0}; + float _b1[4] = {0,0,0,0}; + float _b2[4] = {0,0,0,0}; + float _b3[4] = {0,0,0,0}; + /* + */ + for( y = 0; y < _winSize.height; y++ ) + { + const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const uchar* Jptr1 = J.ptr(y + inextPt.y + 1, inextPt.x*cn); + const short* Iptr = IWinBuf.ptr(y, 0); + const short* dIptr = derivIWinBuf.ptr(y, 0); + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); + x = 0; + #ifdef RLOF_SSE + for( ; x <= _winSize.width*cn; x += 8, dIptr += 8*2 ) + { + __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16); + __m128i I_0_7_epi16 = _mm_loadu_si128((const __m128i*)(Iptr + x)); + + __m128i v00 = _mm_unpacklo_epi8( + _mm_loadl_epi64((const __m128i*)(Jptr + x)), z); + __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); + __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x)), z); + __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x + cn)), z); + + __m128i t0 = _mm_add_epi32 + (_mm_madd_epi16( + _mm_unpacklo_epi16(v00, v01), + qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1-5); + + __m128i lo = _mm_mullo_epi16(mmGainValue_epi16, I_0_7_epi16); + __m128i hi = _mm_mulhi_epi16(mmGainValue_epi16, I_0_7_epi16); + + __m128i Igain_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), bitShift); + __m128i Igain_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), bitShift); + __m128i Igain_epi16 = _mm_packs_epi32(Igain_0_3_epi32, Igain_4_7_epi32); + + + __m128i diffValue = _mm_subs_epi16(_mm_add_epi16(Igain_epi16, mmConstValue_epi16), I_0_7_epi16); + __m128i mmDiffc_epi16[4] = + { + _mm_add_epi16(_mm_slli_epi16(v11, 5), diffValue), + _mm_add_epi16(_mm_slli_epi16(v01, 5), diffValue), + _mm_add_epi16(_mm_slli_epi16(v10, 5), diffValue), + _mm_add_epi16(_mm_slli_epi16(v00, 5), diffValue) + }; + + __m128i mmDiff_epi16 = _mm_add_epi16( _mm_packs_epi32(t0, t1), diffValue); + + + mmDiff_epi16 = _mm_and_si128(mmDiff_epi16, mask_0_7_epi16); + + __m128i scalediffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, mmScale); + mmEta = _mm_add_epi16(mmEta, _mm_add_epi16(_mm_and_si128(scalediffIsPos_epi16, _mm_set1_epi16(2)), _mm_set1_epi16(-1))); + + + __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); + __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); + + + __m128i abs_epi16 = _mm_abs_epi16(mmDiff_epi16); + __m128i bSet2_epi16, bSet1_epi16; + // |It| < sigma1 ? + bSet2_epi16 = _mm_cmplt_epi16(abs_epi16, mmParam1); + // It > 0 ? + __m128i diffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, _mm_setzero_si128()); + // sigma0 < |It| < sigma1 ? 
+ bSet1_epi16 = _mm_and_si128(bSet2_epi16, _mm_cmpgt_epi16(abs_epi16, mmParam0)); + // val = |It| -/+ sigma1 + __m128i tmpParam1_epi16 = _mm_add_epi16(_mm_and_si128(diffIsPos_epi16, _mm_sub_epi16(mmDiff_epi16, mmParam1)), + _mm_andnot_si128(diffIsPos_epi16, _mm_add_epi16(mmDiff_epi16, mmParam1))); + // It == 0 ? |It| > sigma13 + mmDiff_epi16 = _mm_and_si128(bSet2_epi16, mmDiff_epi16); + + for( unsigned int mmi = 0; mmi < 4; mmi++) + { + __m128i mmDiffc_epi16_t = _mm_and_si128(mmDiffc_epi16[mmi], mask_0_7_epi16); + mmDiffc_epi16_t = _mm_and_si128(bSet2_epi16, mmDiffc_epi16_t); + + // It == val ? sigma0 < |It| < sigma1 + mmDiffc_epi16_t = _mm_blendv_epi8(mmDiffc_epi16_t, tmpParam1_epi16, bSet1_epi16); + __m128i tale_epi16_ = _mm_blendv_epi8(mmOness_epi16, mmParam2_epi16, bSet1_epi16); // mask for 0 - 3 + // diff = diff * sigma2 + lo = _mm_mullo_epi16(tale_epi16_, mmDiffc_epi16_t); + hi = _mm_mulhi_epi16(tale_epi16_, mmDiffc_epi16_t); + __m128i diff_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), s2bitShift); + __m128i diff_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), s2bitShift); + + mmDiffc_epi16_t = _mm_packs_epi32(diff_0_3_epi32, diff_4_7_epi32); + __m128i diff1 = _mm_unpackhi_epi16(mmDiffc_epi16_t, mmDiffc_epi16_t); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13... + __m128i diff0 = _mm_unpacklo_epi16(mmDiffc_epi16_t, mmDiffc_epi16_t); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9... + + // Ix * diff / Iy * diff + v10 = _mm_mullo_epi16(Ixy_0, diff0); + v11 = _mm_mulhi_epi16(Ixy_0, diff0); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + + qb0[mmi] = _mm_add_ps(qb0[mmi], _mm_cvtepi32_ps(v00)); + qb1[mmi] = _mm_add_ps(qb1[mmi], _mm_cvtepi32_ps(v10)); + // It * Ix It * Iy [4 ... 7] + // for set 1 hi sigma 1 + v10 = _mm_mullo_epi16(Ixy_1, diff1); + v11 = _mm_mulhi_epi16(Ixy_1, diff1); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb0[mmi] = _mm_add_ps(qb0[mmi], _mm_cvtepi32_ps(v00)); + qb1[mmi] = _mm_add_ps(qb1[mmi], _mm_cvtepi32_ps(v10)); + // diff * J [0 ... 
7] + // for set 1 sigma 1 + // b3 += diff * Iptr[x] + v10 = _mm_mullo_epi16(mmDiffc_epi16_t, I_0_7_epi16); + v11 = _mm_mulhi_epi16(mmDiffc_epi16_t, I_0_7_epi16); + v00 = _mm_unpacklo_epi16(v10, v11); + + v10 = _mm_unpackhi_epi16(v10, v11); + qb2[mmi] = _mm_add_ps(qb2[mmi], _mm_cvtepi32_ps(v00)); + qb2[mmi] = _mm_add_ps(qb2[mmi], _mm_cvtepi32_ps(v10)); + qb3[mmi] = _mm_add_ps(qb3[mmi], _mm_cvtepi32_ps(diff_0_3_epi32)); + qb3[mmi] = _mm_add_ps(qb3[mmi], _mm_cvtepi32_ps(diff_4_7_epi32)); + } + + if( j == 0 ) + { + __m128 bSet1_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet1_epi16)); + __m128 bSet1_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet1_epi16,bSet1_epi16), 16)); + __m128 mask_0_4_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mask_0_7_epi16)); + __m128 mask_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mask_0_7_epi16, mask_0_7_epi16),16))); + + __m128 bSet2_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet2_epi16)); + __m128 bSet2_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet2_epi16, bSet2_epi16),16)); + + __m128 tale_0_3_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_0_3_ps); + __m128 tale_4_7_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_4_7_ps); + tale_0_3_ps = _mm_blendv_ps(mmParam2s, tale_0_3_ps, bSet2_0_3_ps); + tale_4_7_ps = _mm_blendv_ps(mmParam2s, tale_4_7_ps, bSet2_4_7_ps); + + tale_0_3_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_0_3_ps, mask_0_4_ps); + tale_4_7_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_4_7_ps, mask_4_7_ps); + + t0 = _mm_srai_epi32(Ixy_0, 16); // Iy0 Iy1 Iy2 Iy3 + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16); // Ix0 Ix1 Ix2 Ix3 + + __m128 fy = _mm_cvtepi32_ps(t0); + __m128 fx = _mm_cvtepi32_ps(t1); + + // 0 ... 3 + __m128 I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(I_0_7_epi16, I_0_7_epi16), 16)); + + // A11 - A22 + __m128 fxtale = _mm_mul_ps(fx, tale_0_3_ps); + __m128 fytale = _mm_mul_ps(fy, tale_0_3_ps); + + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); + mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); + mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); + + // sumIx und sumIy + mmSumIx = _mm_add_ps(mmSumIx, fxtale); + mmSumIy = _mm_add_ps(mmSumIy, fytale); + + mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fxtale)); + mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fytale)); + + // sumI + __m128 I_tale_ps = _mm_mul_ps(I_ps, tale_0_3_ps); + mmSumI = _mm_add_ps(mmSumI,I_tale_ps); + + // sumW + mmSumW = _mm_add_ps(mmSumW, tale_0_3_ps); + + // sumDI + mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps( I_ps, I_tale_ps)); + + + t0 = _mm_srai_epi32(Ixy_1, 16); // Iy8 Iy9 Iy10 Iy11 + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_1, 16), 16); // Ix0 Ix1 Ix2 Ix3 + + fy = _mm_cvtepi32_ps(t0); + fx = _mm_cvtepi32_ps(t1); + + // 4 ... 
7 + I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(I_0_7_epi16, I_0_7_epi16), 16)); + + // A11 - A22 + fxtale = _mm_mul_ps(fx, tale_4_7_ps); + fytale = _mm_mul_ps(fy, tale_4_7_ps); + + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); + mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); + mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); + + // sumIx und sumIy + mmSumIx = _mm_add_ps(mmSumIx, fxtale); + mmSumIy = _mm_add_ps(mmSumIy, fytale); + + mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fxtale)); + mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fytale)); + + // sumI + I_tale_ps = _mm_mul_ps(I_ps, tale_4_7_ps); + mmSumI = _mm_add_ps(mmSumI, I_tale_ps); + + // sumW + mmSumW = _mm_add_ps(mmSumW, tale_4_7_ps); + + // sumDI + mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps( I_ps, I_tale_ps)); + } + + } + #else + for( ; x < _winSize.width*cn; x++, dIptr += 2 ) + { + if( maskPtr[x] == 0) + continue; + + short ixval = dIptr[0]; + short iyval = dIptr[1]; + int illValue = static_cast(Iptr[x] * gainVec.x + gainVec.y - Iptr[x]); + + int It[4] = {(Jptr1[x+cn]<< 5) + illValue, + (Jptr[x+cn]<< 5) + illValue, + (Jptr1[x]<< 5) + illValue, + (Jptr[x] << 5) + illValue}; + + + int J_val = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + + Jptr1[x]*iw10 + Jptr1[x+cn]*iw11, + W_BITS1-5); + + int diff = J_val + illValue; + + + + MEstimatorScale += (diff < MEstimatorScale) ? -eta : eta; + + int abss = (diff < 0) ? -diff : diff; + + + // compute the missmatch vector + if( j >= 0) + { + if( abss > fParam1) + { + It[0] = 0; + It[1] = 0; + It[2] = 0; + It[3] = 0; + } + else if( abss > static_cast(fParam0) && diff >= 0 ) + { + It[0] = static_cast(normSigma2 * (It[0] - fParam1)); + It[1] = static_cast(normSigma2 * (It[1] - fParam1)); + It[2] = static_cast(normSigma2 * (It[2] - fParam1)); + It[3] = static_cast(normSigma2 * (It[3] - fParam1)); + } + else if( abss > static_cast(fParam0) && diff < 0 ) + { + It[0] = static_cast(normSigma2 * (It[0] + fParam1)); + It[1] = static_cast(normSigma2 * (It[1] + fParam1)); + It[2] = static_cast(normSigma2 * (It[2] + fParam1)); + It[3] = static_cast(normSigma2 * (It[3] + fParam1)); + } + } + _b0[0] += (float)(It[0]*dIptr[0]) ; + _b0[1] += (float)(It[1]*dIptr[0]) ; + _b0[2] += (float)(It[2]*dIptr[0]) ; + _b0[3] += (float)(It[3]*dIptr[0]) ; + + + _b1[0] += (float)(It[0]*dIptr[1]) ; + _b1[1] += (float)(It[1]*dIptr[1]) ; + _b1[2] += (float)(It[2]*dIptr[1]) ; + _b1[3] += (float)(It[3]*dIptr[1]) ; + + _b2[0] += (float)(It[0])*Iptr[x] ; + _b2[1] += (float)(It[1])*Iptr[x] ; + _b2[2] += (float)(It[2])*Iptr[x] ; + _b2[3] += (float)(It[3])*Iptr[x] ; + + _b3[0] += (float)(It[0]); + _b3[1] += (float)(It[1]); + _b3[2] += (float)(It[2]); + _b3[3] += (float)(It[3]); + + // compute the Gradient Matrice + if( j == 0) + { + float tale = normSigma2 * FLT_RESCALE; + if( abss < fParam0 || j < 0) + { + tale = FLT_RESCALE; + } + else if( abss > fParam1) + { + tale *= 0.01f; + } + else + { + tale *= normSigma2; + } + if( j == 0 ) + { + A11 += (float)(ixval*ixval)*tale; + A12 += (float)(ixval*iyval)*tale; + A22 += (float)(iyval*iyval)*tale; + } + + dI += Iptr[x] * Iptr[x] * tale; + float dx = static_cast(dIptr[0]) * tale; + float dy = static_cast(dIptr[1]) * tale; + sumIx += dx; + sumIy += dy; + w1 += dx * Iptr[x]; + w2 += dy * Iptr[x]; + sumI += Iptr[x] * tale; + sumW += tale; + } + + } + #endif + } + + #ifdef RLOF_SSE + short etaValues[8]; + _mm_storeu_si128((__m128i*)(etaValues), mmEta); + MEstimatorScale += eta * (etaValues[0] + etaValues[1] + etaValues[2] + etaValues[3] + + etaValues[4] + 
etaValues[5] + etaValues[6] + etaValues[7]); + float CV_DECL_ALIGNED(32) wbuf[4]; + #endif + if( j == 0 ) + { + #ifdef RLOF_SSE + _mm_store_ps(wbuf, mmSumW1); + w1 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumW2); + w2 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumDI); + dI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumI); + sumI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumIx); + sumIx = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumIy); + sumIy = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumW); + sumW = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + #endif + sumIx *= -FLT_SCALE; + sumIy *= -FLT_SCALE; + sumI *=FLT_SCALE; + sumW *= FLT_SCALE; + w1 *= -FLT_SCALE; + w2 *= -FLT_SCALE; + dI *= FLT_SCALE; + + + #ifdef RLOF_SSE + float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];// + + _mm_store_ps(A11buf, mmAxx); + _mm_store_ps(A12buf, mmAxy); + _mm_store_ps(A22buf, mmAyy); + + + A11 = A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; + A12 = A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; + A22 = A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; + #endif + A11 *= FLT_SCALE; // 54866744. + A12 *= FLT_SCALE; // -628764.00 + A22 *= FLT_SCALE; // 19730.000 + + D = - A12*A12*sumI*sumI + dI*sumW*A12*A12 + 2*A12*sumI*sumIx*w2 + 2*A12*sumI*sumIy*w1 + - 2*dI*A12*sumIx*sumIy - 2*sumW*A12*w1*w2 + A11*A22*sumI*sumI - 2*A22*sumI*sumIx*w1 + - 2*A11*sumI*sumIy*w2 - sumIx*sumIx*w2*w2 + A22*dI*sumIx*sumIx + 2*sumIx*sumIy*w1*w2 + - sumIy*sumIy*w1*w1 + A11*dI*sumIy*sumIy + A22*sumW*w1*w1 + A11*sumW*w2*w2 - A11*A22*dI*sumW; + + float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + + 4.f*A12*A12))/(2*winArea); + if( minEig < minEigThreshold || std::abs(D) < FLT_EPSILON ) + { + if( level == 0 && status ) + status[ptidx] = 0; + if( level > 0) + { + nextPts[ptidx] = backUpNextPt; + gainVecs[ptidx] = backUpGain; + } + noIteration++; + break; + } + + + D = (1.f / D); + + invTensorMat(0,0) = (A22*sumI*sumI - 2*sumI*sumIy*w2 + dI*sumIy*sumIy + sumW*w2*w2 - A22*dI*sumW)* D; + invTensorMat(0,1) = (A12*dI*sumW - A12*sumI * sumI - dI*sumIx*sumIy + sumI*sumIx*w2 + sumI*sumIy*w1 - sumW*w1*w2)* D; + invTensorMat(0,2) = (A12*sumI*sumIy - sumIy*sumIy*w1 - A22*sumI*sumIx - A12*sumW*w2 + A22*sumW*w1 + sumIx*sumIy*w2)* D; + invTensorMat(0,3) = (A22*dI*sumIx - A12*dI*sumIy - sumIx*w2*w2 + A12*sumI*w2 - A22*sumI*w1 + sumIy*w1*w2) * D; + invTensorMat(1,0) = invTensorMat(0,1); + invTensorMat(1,1) = (A11*sumI * sumI - 2*sumI*sumIx*w1 + dI*sumIx * sumIx + sumW*w1*w1 - A11*dI*sumW) * D; + invTensorMat(1,2) = (A12*sumI*sumIx - A11*sumI*sumIy - sumIx * sumIx*w2 + A11*sumW*w2 - A12*sumW*w1 + sumIx*sumIy*w1) * D; + invTensorMat(1,3) = (A11*dI*sumIy - sumIy*w1*w1 - A12*dI*sumIx - A11*sumI*w2 + A12*sumI*w1 + sumIx*w1*w2)* D; + invTensorMat(2,0) = invTensorMat(0,2); + invTensorMat(2,1) = invTensorMat(1,2); + invTensorMat(2,2) = (sumW*A12*A12 - 2*A12*sumIx*sumIy + A22*sumIx*sumIx + A11*sumIy*sumIy - A11*A22*sumW)* D; + invTensorMat(2,3) = (A11*A22*sumI - A12*A12*sumI - A11*sumIy*w2 + A12*sumIx*w2 + A12*sumIy*w1 - A22*sumIx*w1)* D; + invTensorMat(3,0) = invTensorMat(0,3); + invTensorMat(3,1) = invTensorMat(1,3); + invTensorMat(3,2) = invTensorMat(2,3); + invTensorMat(3,3) = (dI*A12*A12 - 2*A12*w1*w2 + A22*w1*w1 + A11*w2*w2 - A11*A22*dI)* D; + } + + + #ifdef RLOF_SSE + float CV_DECL_ALIGNED(16) bbuf[4]; + for(int mmi = 0; mmi < 4; mmi++) + { + + _mm_store_ps(bbuf, _mm_add_ps(qb0[mmi], qb1[mmi])); + _b0[mmi] = bbuf[0] + 
bbuf[2]; + _b1[mmi] = bbuf[1] + bbuf[3]; + _mm_store_ps(bbuf, qb2[mmi]); + _b2[mmi] = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; + _mm_store_ps(bbuf, qb3[mmi]); + _b3[mmi] = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; + + } + #endif + + _b0[0] *= FLT_SCALE;_b0[1] *= FLT_SCALE;_b0[2] *= FLT_SCALE;_b0[3] *= FLT_SCALE; + _b1[0] *= FLT_SCALE;_b1[1] *= FLT_SCALE;_b1[2] *= FLT_SCALE;_b1[3] *= FLT_SCALE; + _b2[0] *= FLT_SCALE;_b2[1] *= FLT_SCALE;_b2[2] *= FLT_SCALE;_b2[3] *= FLT_SCALE; + _b3[0] *= FLT_SCALE;_b3[1] *= FLT_SCALE;_b3[2] *= FLT_SCALE;_b3[3] *= FLT_SCALE; + + + Mc0[0] = _b0[0] - _b0[1] - _b0[2] + _b0[3]; + Mc0[1] = _b1[0] - _b1[1] - _b1[2] + _b1[3]; + Mc0[2] = -(_b2[0] - _b2[1] - _b2[2] + _b2[3]); + Mc0[3] = -(_b3[0] - _b3[1] - _b3[2] + _b3[3]); + + Mc1[0] = _b0[1] - _b0[3]; + Mc1[1] = _b1[1] - _b1[3]; + Mc1[2] = -(_b2[1] - _b2[3]); + Mc1[3] = -(_b3[1] - _b3[3]); + + + Mc2[0] = _b0[2] - _b0[3]; + Mc2[1] = _b1[2] - _b1[3]; + Mc2[2] = -(_b2[2] - _b2[3]); + Mc2[3] = -(_b3[2] - _b3[3]); + + + Mc3[0] = _b0[3]; + Mc3[1] = _b1[3]; + Mc3[2] = -_b2[3]; + Mc3[3] = -_b3[3]; + + // + c[0] = -Mc0[0]; + c[1] = -Mc1[0]; + c[2] = -Mc2[0]; + c[3] = -Mc3[0]; + c[4] = -Mc0[1]; + c[5] = -Mc1[1]; + c[6] = -Mc2[1]; + c[7] = -Mc3[1]; + + float e0 = 1.f / (c[6] * c[0] - c[4] * c[2]); + float e1 = e0 * 0.5f * (c[6] * c[1] + c[7] * c[0] - c[5] * c[2] - c[4] * c[3]); + float e2 = e0 * (c[1] * c[7] -c[3] * c[5]); + e0 = e1 * e1 - e2; + hasSolved = false; + if ( e0 > 0) + { + e0 = sqrt(e0); + float _y[2] = {-e1 - e0, e0 - e1}; + float c0yc1[2] = {c[0] * _y[0] + c[1], + c[0] * _y[1] + c[1]}; + float _x[2] = {-(c[2] * _y[0] + c[3]) / c0yc1[0], + -(c[2] * _y[1] + c[3]) / c0yc1[1]}; + bool isIn1 = (_x[0] >=0 && _x[0] <=1 && _y[0] >= 0 && _y[0] <= 1); + bool isIn2 = (_x[1] >=0 && _x[1] <=1 && _y[1] >= 0 && _y[1] <= 1); + + bool isSolution1 = checkSolution(_x[0], _y[0], c ); + bool isSolution2 = checkSolution(_x[1], _y[1], c ); + bool isSink1 = isIn1 && isSolution1; + bool isSink2 = isIn2 && isSolution2; + + if ( isSink1 != isSink2) + { + a = isSink1 ? _x[0] : _x[1]; + b = isSink1 ? 
_y[0] : _y[1]; + ab = a * b; + hasSolved = true; + delta.x = inextPt.x + a - nextPt.x; + delta.y = inextPt.y + b - nextPt.y; + + cv::Vec4f mismatchVec = ab * Mc0 + Mc1 *a + Mc2 * b + Mc3; + deltaGain = est_DeltaGain(invTensorMat, mismatchVec); + + } // isIn1 != isIn2 + } + if( hasSolved == false) + noIteration++; + } + else + { + hasSolved = false; + noReusedIteration++; + } + if( hasSolved == false ) + { + + cv::Vec4f mismatchVec = ab * Mc0 + Mc1 *a + Mc2 * b + Mc3; + est_Result(invTensorMat, mismatchVec, delta, deltaGain); + + delta.x = MAX(-1.f, MIN( 1.f , delta.x)); + delta.y = MAX(-1.f, MIN( 1.f , delta.y)); + + + if( j == 0) + prevGain = deltaGain; + gainVec += deltaGain; + nextPt += delta ; + nextPts[ptidx] = nextPt - halfWin; + gainVecs[ptidx]= gainVec; + + } + else + { + nextPt += delta; + nextPts[ptidx] = nextPt - halfWin; + gainVecs[ptidx]= gainVec + deltaGain; + noSolvedIteration++; + break; + } + + if(j > 0 && ( + (std::abs(delta.x - prevDelta.x) < 0.01 && std::abs(delta.y - prevDelta.y) < 0.01) + || ((delta.ddot(delta) <= 0.001) && std::abs(prevGain.x - deltaGain.x) < 0.01))) + { + nextPts[ptidx] += delta*0.5f; + gainVecs[ptidx] -= deltaGain* 0.5f; + break; + } + + + prevDelta = delta; + prevGain = deltaGain; + } + + } + + } + + const Mat* prevImg; + const Mat* nextImg; + const Mat* prevDeriv; + const Mat* rgbPrevImg; + const Mat* rgbNextImg; + const Point2f* prevPts; + Point2f* nextPts; + uchar* status; + cv::Point2f* gainVecs; // gain vector x -> multiplier y -> offset + float* err; + int maxWinSize; + int minWinSize; + TermCriteria criteria; + int level; + int maxLevel; + int windowType; + float minEigThreshold; + bool useInitialFlow; + const float normSigma0, normSigma1, normSigma2; + int crossSegmentationThreshold; +}; + +}} // namespace +namespace beplk { +namespace ica { + +class TrackerInvoker : public cv::ParallelLoopBody +{ +public: + TrackerInvoker( + const Mat& _prevImg, + const Mat& _prevDeriv, + const Mat& _nextImg, + const Mat& _rgbPrevImg, + const Mat& _rgbNextImg, + const Point2f* _prevPts, + Point2f* _nextPts, + uchar* _status, + float* _err, + int _level, + int _maxLevel, + int _winSize[2], + int _maxIteration, + bool _useInitialFlow, + int _supportRegionType, + int _crossSegmentationThreshold, + float _minEigenValue) + { + prevImg = &_prevImg; + prevDeriv = &_prevDeriv; + nextImg = &_nextImg; + rgbPrevImg = &_rgbPrevImg; + rgbNextImg = &_rgbNextImg; + prevPts = _prevPts; + nextPts = _nextPts; + status = _status; + err = _err; + minWinSize = _winSize[0]; + maxWinSize = _winSize[1]; + criteria.maxCount = _maxIteration; + criteria.epsilon = 0.01; + level = _level; + maxLevel = _maxLevel; + windowType = _supportRegionType; + minEigThreshold = _minEigenValue; + useInitialFlow = _useInitialFlow; + crossSegmentationThreshold = _crossSegmentationThreshold; + } + + void operator()(const cv::Range& range) const CV_OVERRIDE + { + cv::Size winSize; + cv::Point2f halfWin; + + const Mat& I = *prevImg; + const Mat& J = *nextImg; + const Mat& derivI = *prevDeriv; + const Mat& BI = *rgbPrevImg; + + winSize = cv::Size(maxWinSize,maxWinSize); + int winMaskwidth = roundUp(winSize.width, 8) * 2; + cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); + winMaskMatBuf.setTo(1); + + const float FLT_SCALE = (1.f/(1 << 20)); // 20 + + int j, cn = I.channels(), cn2 = cn*2; + int winbufwidth = roundUp(winSize.width, 8); + cv::Size winBufSize(winbufwidth,winbufwidth); + + std::vector _buf(winBufSize.area()*(cn + cn2)); + Mat IWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn), 
&_buf[0]); + Mat derivIWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn2), &_buf[winBufSize.area()*cn]); + + for( int ptidx = range.start; ptidx < range.end; ptidx++ ) + { + Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level)); + Point2f nextPt; + if( level == maxLevel ) + { + if( useInitialFlow ) + nextPt = nextPts[ptidx]*(float)(1./(1 << level)); + else + nextPt = prevPt; + } + else + nextPt = nextPts[ptidx]*2.f; + nextPts[ptidx] = nextPt; + + Point2i iprevPt, inextPt; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + int winArea = maxWinSize * maxWinSize; + cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0,0, maxWinSize,maxWinSize)); + + if( calcWinMaskMat(BI, windowType, iprevPt, + winMaskMat,winSize,halfWin,winArea, + minWinSize,maxWinSize) == false) + continue; + + halfWin = Point2f(static_cast(maxWinSize), static_cast(maxWinSize)) - halfWin; + prevPt += halfWin; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + + if( iprevPt.x < 0 || iprevPt.x >= derivI.cols - winSize.width || + iprevPt.y < 0 || iprevPt.y >= derivI.rows - winSize.height - 1) + { + if( level == 0 ) + { + if( status ) + status[ptidx] = 3; + if( err ) + err[ptidx] = 0; + } + continue; + } + + float a = prevPt.x - iprevPt.x; + float b = prevPt.y - iprevPt.y; + const int W_BITS = 14, W_BITS1 = 14; + + int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + float A11 = 0, A12 = 0, A22 = 0; + +#ifdef RLOF_SSE + __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128i z = _mm_setzero_si128(); + __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1-1)); + __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1-5-1)); + __m128 qA11 = _mm_setzero_ps(), qA12 = _mm_setzero_ps(), qA22 = _mm_setzero_ps(); + __m128i mmMask4_epi32; + get4BitMask(winSize.width, mmMask4_epi32); +#endif + + + int x, y; + for( y = 0; y < winSize.height; y++ ) + { + const uchar* src = I.ptr(y + iprevPt.y, 0) + iprevPt.x*cn; + const uchar* src1 = I.ptr(y + iprevPt.y + 1, 0) + iprevPt.x*cn; + const short* dsrc = derivI.ptr(y + iprevPt.y, 0) + iprevPt.x*cn2; + const short* dsrc1 = derivI.ptr(y + iprevPt.y + 1, 0) + iprevPt.x*cn2; + short* Iptr = IWinBuf.ptr(y, 0); + short* dIptr = derivIWinBuf.ptr(y, 0); + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); + x = 0; +#ifdef RLOF_SSE + for( ; x < winSize.width*cn; x += 4, dsrc += 4*2, dsrc1 += 8,dIptr += 4*2 ) + { + __m128i wMask = _mm_set_epi32(MaskSet * maskPtr[x+3], + MaskSet * maskPtr[x+2], + MaskSet * maskPtr[x+1], + MaskSet * maskPtr[x]); + __m128i v00, v01, v10, v11, t0, t1; + v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); + v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); + v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x)), z); + v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x + cn)), z); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); + _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0,t0)); + + v00 = _mm_loadu_si128((const __m128i*)(dsrc)); + v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); + v10 = _mm_loadu_si128((const __m128i*)(dsrc1)); + v11 = _mm_loadu_si128((const __m128i*)(dsrc1 + cn2)); + + t0 = 
_mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); + v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... + + if( x + 4 > winSize.width) + { + v00 = _mm_and_si128(v00, mmMask4_epi32); + } + v00 = _mm_and_si128(v00, wMask); + + _mm_storeu_si128((__m128i*)dIptr, v00); + t0 = _mm_srai_epi32(v00, 16); // Iy0 Iy1 Iy2 Iy3 + t1 = _mm_srai_epi32(_mm_slli_epi32(v00, 16), 16); // Ix0 Ix1 Ix2 Ix3 + + __m128 fy = _mm_cvtepi32_ps(t0); + __m128 fx = _mm_cvtepi32_ps(t1); + + qA22 = _mm_add_ps(qA22, _mm_mul_ps(fy, fy)); + qA12 = _mm_add_ps(qA12, _mm_mul_ps(fx, fy)); + qA11 = _mm_add_ps(qA11, _mm_mul_ps(fx, fx)); + } +#else + + for( ; x < winSize.width*cn; x++, dsrc += 2, dsrc1 += 2, dIptr += 2) + { + if( maskPtr[x] == 0) + { + dIptr[0] = (short)0; + dIptr[1] = (short)0; + + continue; + } + int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + + src1[x]*iw10 + src1[x+cn]*iw11, W_BITS1-5); + int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + + dsrc1[0]*iw10 + dsrc1[cn2]*iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc1[1]*iw10 + + dsrc1[cn2+1]*iw11, W_BITS1); + + Iptr[x] = (short)ival; + dIptr[0] = (short)ixval; + dIptr[1] = (short)iyval; + A11 += (float)(ixval*ixval); + A12 += (float)(ixval*iyval); + A22 += (float)(iyval*iyval); + + } + + +#endif + + } + +#ifdef RLOF_SSE + float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4]; + _mm_store_ps(A11buf, qA11); + _mm_store_ps(A12buf, qA12); + _mm_store_ps(A22buf, qA22); + A11 += A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; + A12 += A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; + A22 += A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; +#endif + + A11 *= FLT_SCALE; + A12 *= FLT_SCALE; + A22 *= FLT_SCALE; + + + float D = A11*A22 - A12*A12; + float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + + 4.f*A12*A12))/(2 * winArea); + + if( err ) + err[ptidx] = (float)minEig; + + if( D < FLT_EPSILON ) + { + if( level == 0 && status ) + status[ptidx] = 0; + continue; + } + + D = 1.f/D; + + cv::Point2f backUpNextPt = nextPt; + nextPt += halfWin; + Point2f prevDelta(0,0); + + float c[8]; +#ifdef RLOF_SSE + __m128i mmMask0, mmMask1, mmMask; + getWBitMask(winSize.width, mmMask0, mmMask1, mmMask); +#endif + for( j = 0; j < criteria.maxCount; j++ ) + { + cv::Point2f delta; + bool hasSolved = false; + a = nextPt.x - cvFloor(nextPt.x); + b = nextPt.y - cvFloor(nextPt.y); + float ab = a * b; + + + if( (inextPt.x != cvFloor(nextPt.x) || inextPt.y != cvFloor(nextPt.y) || j == 0)) + { + inextPt.x = cvFloor(nextPt.x); + inextPt.y = cvFloor(nextPt.y); + if( inextPt.x < 0 || inextPt.x >= J.cols - winSize.width || + inextPt.y < 0 || inextPt.y >= J.rows - winSize.height - 1) + { + if( level == 0 && status ) + status[ptidx] = 3; + if (level > 0) + nextPts[ptidx] = backUpNextPt; + break; + } + + + iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + + float _b1[4] = {0,0,0,0}; + float _b2[4] = {0,0,0,0}; + #ifdef RLOF_SSE + __m128 qbc0[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; + __m128 qbc1[4] = {_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps()}; + qw0 
= _mm_set1_epi32(iw00 + (iw01 << 16)); + qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + #endif + for( y = 0; y < winSize.height; y++ ) + { + const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const uchar* Jptr1 = J.ptr(y + inextPt.y + 1, inextPt.x*cn); + const short* Iptr = IWinBuf.ptr(y, 0); + const short* dIptr = derivIWinBuf.ptr(y, 0); + x = 0; + #ifdef RLOF_SSE + + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); + for( ; x <= winSize.width*cn; x += 8, dIptr += 8*2 ) + { + if( maskPtr[x ] == 0 && maskPtr[x+1] == 0 && maskPtr[x+2] == 0 && maskPtr[x+3] == 0 + && maskPtr[x+4] == 0 && maskPtr[x+5] == 0 && maskPtr[x+6] == 0 && maskPtr[x+7] == 0) + continue; + __m128i diff0 = _mm_loadu_si128((const __m128i*)(Iptr + x)); + __m128i v00 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x)), z); + __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); + __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x)), z); + __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x + cn)), z); + + __m128i mmDiffc_epi16[4] = + { _mm_subs_epi16(_mm_slli_epi16(v00, 5), diff0), + _mm_subs_epi16(_mm_slli_epi16(v01, 5), diff0), + _mm_subs_epi16(_mm_slli_epi16(v10, 5), diff0), + _mm_subs_epi16(_mm_slli_epi16(v11, 5), diff0) + }; + + __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... + __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); + + if( x > winSize.width*cn - 8) + { + Ixy_0 = _mm_and_si128(Ixy_0, mmMask0); + Ixy_1 = _mm_and_si128(Ixy_1, mmMask1); + } + + __m128i diffc1[4] = + {_mm_unpackhi_epi16(mmDiffc_epi16[0],mmDiffc_epi16[0]), + _mm_unpackhi_epi16(mmDiffc_epi16[1],mmDiffc_epi16[1]), + _mm_unpackhi_epi16(mmDiffc_epi16[2],mmDiffc_epi16[2]), + _mm_unpackhi_epi16(mmDiffc_epi16[3],mmDiffc_epi16[3]) + }; + + __m128i diffc0[4] = + {_mm_unpacklo_epi16(mmDiffc_epi16[0],mmDiffc_epi16[0]), + _mm_unpacklo_epi16(mmDiffc_epi16[1],mmDiffc_epi16[1]), + _mm_unpacklo_epi16(mmDiffc_epi16[2],mmDiffc_epi16[2]), + _mm_unpacklo_epi16(mmDiffc_epi16[3],mmDiffc_epi16[3]) + }; + + + // It * Ix It * Iy [0 ... 3] + //mask split for multiplication + // for set 1 lo sigma 1 + + + v10 = _mm_mullo_epi16(Ixy_0, diffc0[0]); + v11 = _mm_mulhi_epi16(Ixy_0, diffc0[0]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[0] = _mm_add_ps(qbc0[0], _mm_cvtepi32_ps(v00)); + qbc1[0] = _mm_add_ps(qbc1[0], _mm_cvtepi32_ps(v10)); + + v10 = _mm_mullo_epi16(Ixy_0, diffc0[1]); + v11 = _mm_mulhi_epi16(Ixy_0, diffc0[1]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[1] = _mm_add_ps(qbc0[1], _mm_cvtepi32_ps(v00)); + qbc1[1] = _mm_add_ps(qbc1[1], _mm_cvtepi32_ps(v10)); + + v10 = _mm_mullo_epi16(Ixy_0, diffc0[2]); + v11 = _mm_mulhi_epi16(Ixy_0, diffc0[2]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[2] = _mm_add_ps(qbc0[2], _mm_cvtepi32_ps(v00)); + qbc1[2] = _mm_add_ps(qbc1[2], _mm_cvtepi32_ps(v10)); + + v10 = _mm_mullo_epi16(Ixy_0, diffc0[3]); + v11 = _mm_mulhi_epi16(Ixy_0, diffc0[3]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[3] = _mm_add_ps(qbc0[3], _mm_cvtepi32_ps(v00)); + qbc1[3] = _mm_add_ps(qbc1[3], _mm_cvtepi32_ps(v10)); + // It * Ix It * Iy [4 ... 
7] + // for set 1 hi sigma 1 + + v10 = _mm_mullo_epi16(Ixy_1, diffc1[0]); + v11 = _mm_mulhi_epi16(Ixy_1, diffc1[0]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[0] = _mm_add_ps(qbc0[0], _mm_cvtepi32_ps(v00)); + qbc1[0] = _mm_add_ps(qbc1[0], _mm_cvtepi32_ps(v10)); + + v10 = _mm_mullo_epi16(Ixy_1, diffc1[1]); + v11 = _mm_mulhi_epi16(Ixy_1, diffc1[1]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[1] = _mm_add_ps(qbc0[1], _mm_cvtepi32_ps(v00)); + qbc1[1] = _mm_add_ps(qbc1[1], _mm_cvtepi32_ps(v10)); + + v10 = _mm_mullo_epi16(Ixy_1, diffc1[2]); + v11 = _mm_mulhi_epi16(Ixy_1, diffc1[2]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[2] = _mm_add_ps(qbc0[2], _mm_cvtepi32_ps(v00)); + qbc1[2] = _mm_add_ps(qbc1[2], _mm_cvtepi32_ps(v10)); + + v10 = _mm_mullo_epi16(Ixy_1, diffc1[3]); + v11 = _mm_mulhi_epi16(Ixy_1, diffc1[3]); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qbc0[3] = _mm_add_ps(qbc0[3], _mm_cvtepi32_ps(v00)); + qbc1[3] = _mm_add_ps(qbc1[3], _mm_cvtepi32_ps(v10)); + + } + #else + for( ; x < winSize.width*cn; x++, dIptr += 2 ) + { + if( dIptr[0] == 0 && dIptr[1] == 0) + continue; + short It[4] = {(Jptr[x] << 5) - Iptr[x], + (Jptr[x+cn]<< 5) - Iptr[x], + (Jptr1[x]<< 5) - Iptr[x], + (Jptr1[x+cn]<< 5) - Iptr[x]}; + + _b1[0] += (float)(It[0]*dIptr[0]); + _b1[1] += (float)(It[1]*dIptr[0]); + _b1[2] += (float)(It[2]*dIptr[0]); + _b1[3] += (float)(It[3]*dIptr[0]); + + _b2[0] += (float)(It[0]*dIptr[1]); + _b2[1] += (float)(It[1]*dIptr[1]); + _b2[2] += (float)(It[2]*dIptr[1]); + _b2[3] += (float)(It[3]*dIptr[1]); + } + #endif + + } + + #ifdef RLOF_SSE + float CV_DECL_ALIGNED(16) bbuf[4]; + _mm_store_ps(bbuf, _mm_add_ps(qbc0[0], qbc1[0])); + _b1[0] += bbuf[0] + bbuf[2]; + _b2[0] += bbuf[1] + bbuf[3]; + + _mm_store_ps(bbuf, _mm_add_ps(qbc0[1], qbc1[1])); + _b1[1] += bbuf[0] + bbuf[2]; + _b2[1] += bbuf[1] + bbuf[3]; + + _mm_store_ps(bbuf, _mm_add_ps(qbc0[2], qbc1[2])); + _b1[2] += bbuf[0] + bbuf[2]; + _b2[2] += bbuf[1] + bbuf[3]; + + _mm_store_ps(bbuf, _mm_add_ps(qbc0[3], qbc1[3])); + _b1[3] += bbuf[0] + bbuf[2]; + _b2[3] += bbuf[1] + bbuf[3]; + + #endif + _b1[0] *= FLT_SCALE; + _b1[1] *= FLT_SCALE; + _b1[2] *= FLT_SCALE; + _b1[3] *= FLT_SCALE; + _b2[0] *= FLT_SCALE; + _b2[1] *= FLT_SCALE; + _b2[2] *= FLT_SCALE; + _b2[3] *= FLT_SCALE; + + float c0[4] = { _b1[3] + _b1[0] - _b1[2] - _b1[1], + _b1[1] - _b1[0], + _b1[2] - _b1[0], + _b1[0]}; + float c1[4] = { _b2[3] + _b2[0] - _b2[2] - _b2[1], + _b2[1] - _b2[0], + _b2[2] - _b2[0], + _b2[0]}; + + float DA12 = A12 * D ; + float DA22 = A22 * D ; + float DA11 = A11 * D ; + c[0] = DA12 * c1[0] - DA22 * c0[0]; + c[1] = DA12 * c1[1] - DA22 * c0[1]; + c[2] = DA12 * c1[2] - DA22 * c0[2]; + c[3] = DA12 * c1[3] - DA22 * c0[3]; + c[4] = DA12 * c0[0] - DA11 * c1[0]; + c[5] = DA12 * c0[1] - DA11 * c1[1]; + c[6] = DA12 * c0[2] - DA11 * c1[2]; + c[7] = DA12 * c0[3] - DA11 * c1[3]; + + float e0 = 1.f / (c[6] * c[0] - c[4] * c[2]); + float e1 = e0 * 0.5f * (c[6] * c[1] + c[7] * c[0] - c[5] * c[2] - c[4] * c[3]); + float e2 = e0 * (c[1] * c[7] -c[3] * c[5]); + e0 = e1 * e1 - e2; + if ( e0 >= 0) + { + e0 = sqrt(e0); + float _y[2] = {-e1 - e0, e0 - e1}; + float c0yc1[2] = {c[0] * _y[0] + c[1], + c[0] * _y[1] + c[1]}; + + float _x[2] = {-(c[2] * _y[0] + c[3]) / c0yc1[0], + -(c[2] * _y[1] + c[3]) / c0yc1[1]}; + + bool isIn1 = (_x[0] >=0 && _x[0] <=1 && _y[0] >= 0 && _y[0] <= 1); + bool isIn2 = (_x[1] >=0 && _x[1] <=1 && _y[1] >= 0 && 
_y[1] <= 1); + + + bool isSink1 = isIn1 && checkSolution(_x[0], _y[0], c ); + bool isSink2 = isIn2 && checkSolution(_x[1], _y[1], c ); + + + //if( isSink1 != isSink2 ) + { + if( isSink1 ) + { + hasSolved = true; + delta.x = inextPt.x + _x[0] - nextPt.x; + delta.y = inextPt.y + _y[0] - nextPt.y; + } + if (isSink2 ) + { + hasSolved = true; + delta.x = inextPt.x + _x[1] - nextPt.x; + delta.y = inextPt.y + _y[1] - nextPt.y; + } + } // isIn1 != isIn2 + } // e0 >= 0 + } + else + hasSolved = false; + if(hasSolved == false) + { + delta = Point2f( c[0] * ab + c[1] * a + c[2] * b + c[3], + c[4] * ab + c[5] * a + c[6] * b + c[7]); + nextPt += 0.7 * delta; + nextPts[ptidx] = nextPt - halfWin; + } + else + { + nextPt += delta; + nextPts[ptidx] = nextPt - halfWin; + break; + } + + + if( delta.ddot(delta) <= criteria.epsilon) + break; + + if(j > 0 && std::abs(delta.x - prevDelta.x) < 0.01 && + std::abs(delta.y - prevDelta.y) < 0.01) + { + nextPts[ptidx] -= delta*0.35f; + break; + } + + prevDelta = delta; + + } + + } + } + + + const Mat* prevImg; + const Mat* nextImg; + const Mat* prevDeriv; + const Mat* rgbPrevImg; + const Mat* rgbNextImg; + const Point2f* prevPts; + Point2f* nextPts; + uchar* status; + float* err; + int maxWinSize; + int minWinSize; + TermCriteria criteria; + int level; + int maxLevel; + int windowType; + float minEigThreshold; + bool useInitialFlow; + int crossSegmentationThreshold; +}; + +}}}} // namespace +#endif diff --git a/modules/optflow/src/rlof/geo_interpolation.cpp b/modules/optflow/src/rlof/geo_interpolation.cpp new file mode 100644 index 00000000000..0ac2e1715da --- /dev/null +++ b/modules/optflow/src/rlof/geo_interpolation.cpp @@ -0,0 +1,453 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
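+//
+// Overview (summarizing the code below): this file provides the geodesic
+// sparse-to-dense interpolation helpers of the RLOF module. getGraph() builds an
+// 8-connected pixel graph whose edge weights combine spatial distance and color
+// difference; sgeo_dist() runs a Dijkstra-style sweep over that graph to obtain
+// geodesic distance maps; and the interpolate_irregular_* functions propagate the
+// flow vector of the geodesically nearest (or k-nearest) seed point to every pixel.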
+// These functions have been contributed by Jonas Geistert
+
+#include "../precomp.hpp"
+
+#include "geo_interpolation.hpp"
+#include <map>
+#include <vector>
+namespace cv {
+namespace optflow {
+
+struct Graph_helper {
+    std::vector<int> mem;
+    int e_size;
+    Graph_helper(int k, int num_nodes) {
+        e_size = (2 * k + 1);
+        mem.resize(e_size * num_nodes, 0);
+    }
+    inline int size(int id) {
+        int r_addr = id * (e_size);
+        return mem[r_addr];
+    }
+    inline int * data(int id) {
+        int r_addr = id * (e_size) + 1;
+        return &mem[r_addr];
+    }
+    inline void add(int id, std::pair<float, int> data) {
+        int r_addr = id * (e_size);
+        int size = ++mem[r_addr];
+        r_addr += 2 * size - 1; //== 1 + 2*(size-1);
+        *(float*)&mem[r_addr] = data.first;
+        mem[r_addr + 1] = data.second;
+    }
+    inline bool color_in_target(int id, int color) {
+        int r_addr = id * (e_size);
+        int size = mem[r_addr];
+        r_addr += 2;
+        for (int i = 0; i < size; i++) {
+            if (mem[r_addr] == color) {
+                return true;
+            }
+            r_addr += 2;
+        }
+        return false;
+    }
+
+};
+
+Mat sgeo_dist(const Mat& gra, int y, int x, float max, Mat &prev)
+{
+    std::vector<Point2f> points;
+    points.push_back(Point2f(static_cast<float>(x), static_cast<float>(y)));
+    return sgeo_dist(gra, points, max, prev);
+}
+Mat sgeo_dist(const Mat& gra, const std::vector<Point2f> & points, float max, Mat &prev)
+{
+    int Dx[] = { -1,0,1,-1,1,-1,0,1 };
+    int Dy[] = { -1,-1,-1,0,0,1,1,1 };
+    Mat dm(gra.rows, gra.cols, CV_32F, Scalar(max));
+    prev = Mat(gra.rows, gra.cols, CV_8U, Scalar(255));
+
+    std::multimap<float, Vec2i> not_visited_with_value;
+
+    for (auto i = points.begin(); i != points.end(); i++)
+    {
+        int y = static_cast<int>(i->y);
+        int x = static_cast<int>(i->x);
+        not_visited_with_value.insert(std::pair<float, Vec2i>(0.f, Vec2i(y, x)));
+        dm.at<float>(y, x) = 0;
+    }
+
+    bool done = false;
+    while (!done)
+    {
+        if (not_visited_with_value.begin() == not_visited_with_value.end()) {
+            done = true;
+            break;
+        }
+        std::multimap<float, Vec2i>::iterator current_it = not_visited_with_value.begin();
+        std::pair<float, Vec2i> current_p = *current_it;
+        not_visited_with_value.erase(current_it);
+
+        int y = current_p.second[0];
+        int x = current_p.second[1];
+        float cur_d = current_p.first;
+
+        if (dm.at<float>(y, x) != cur_d) {
+            continue;
+        }
+
+        Vec8f gra_e = gra.at<Vec8f>(y, x);
+
+        for (int i = 0; i < 8; i++) {
+            if (gra_e[i] < 0) {
+                continue;
+            }
+            int dx = Dx[i];
+            int dy = Dy[i];
+
+            if (dm.at<float>(y + dy, x + dx) > cur_d + gra_e[i]) {
+                dm.at<float>(y + dy, x + dx) = cur_d + gra_e[i];
+                prev.at<uchar>(y + dy, x + dx) = static_cast<uchar>(7 - i);
+                not_visited_with_value.insert(std::pair<float, Vec2i>(cur_d + gra_e[i], Vec2i(y + dy, x + dx)));
+            }
+
+        }
+    }
+
+
+
+
+    return dm;
+}
+
+Mat interpolate_irregular_nn_raster(const std::vector<Point2f> & prevPoints,
+    const std::vector<Point2f> & nextPoints,
+    const std::vector<uchar> & status,
+    const Mat & i1)
+{
+    Mat gra = getGraph(i1, 0.1f);
+    int Dx[] = { -1,0,1,-1,1,-1,0,1 };
+    int Dy[] = { -1,-1,-1,0,0,1,1,1 };
+    int max_rounds = 10;
+    Mat dirt = Mat(gra.rows, gra.cols, CV_8U, Scalar(0));
+    Mat quellknoten = Mat(gra.rows, gra.cols, CV_32S, Scalar(-1));
+    Mat dist = Mat(gra.rows, gra.cols, CV_32F, Scalar(std::numeric_limits<float>::max()));
+    /*
+     * assign quellknoten (source node) ids.
+     */
+    for (int i = 0; i < static_cast<int>(prevPoints.size()); i++)
+    {
+        int x = (int)prevPoints[i].x;
+        int y = (int)prevPoints[i].y;
+        if (status[i] == 0)
+            continue;
+        dirt.at<uchar>(y, x) = 1;
+        dist.at<float>(y, x) = 0;
+        quellknoten.at<int>(y, x) = i;
+    }
+
+    bool clean = true;
+    bool done = false;
+    int x = 0;
+    int y = 0;
+    int rounds = 0;
+    while (!done) {
+        /*
+         * Update x and y:
+         * on even rounds traverse in raster-scan order, on odd rounds in inverse raster-scan order
+         */
+        if (rounds % 2 == 0) {
+            x++;
+            if (x >= gra.cols) {
+                x = 0;
+                y++;
+                if (y >= gra.rows) {
+                    y = 0;
+                    rounds++;
+                    y = gra.rows - 1;
+                    x = gra.cols - 1;
+                    if (rounds >= max_rounds || clean) {
+                        done = true;
+                        break;
+                    }
+                }
+            }
+        }
+        else {
+            x--;
+            if (x < 0) {
+                x = gra.cols - 1;
+                y--;
+                if (y < 0) {
+                    y = gra.rows - 1;
+                    rounds++;
+                    y = 0;
+                    x = 0;
+                    if (rounds >= max_rounds || clean) {
+                        done = true;
+                        break;
+                    }
+                }
+            }
+        }
+        if (dirt.at<uchar>(y, x) == 0) {
+            continue;
+        }
+        dirt.at<uchar>(y, x) = 0;
+
+        float c_dist = dist.at<float>(y, x);
+        Vec8f gra_e = gra.at<Vec8f>(y, x);
+
+        for (int i = 0; i < 8; i++) {
+            int tx = Dx[i];
+            int ty = Dy[i];
+            if (ty == 0 && tx == 0) {
+                continue;
+            }
+            if (x + tx < 0 || x + tx >= gra.cols) {
+                continue;
+            }
+            if (y + ty < 0 || y + ty >= gra.rows) {
+                continue;
+            }
+            if (c_dist > dist.at<float>(y + ty, x + tx)) {
+                if (c_dist > dist.at<float>(y + ty, x + tx) + gra_e[i]) {
+                    quellknoten.at<int>(y, x) = quellknoten.at<int>(y + ty, x + tx);
+                    dist.at<float>(y, x) = dist.at<float>(y + ty, x + tx) + gra_e[i];
+                    dirt.at<uchar>(y, x) = 1;
+                    clean = false;
+                }
+            }
+            else {
+                if (c_dist + gra_e[i] < dist.at<float>(y + ty, x + tx)) {
+                    quellknoten.at<int>(y + ty, x + tx) = quellknoten.at<int>(y, x);
+                    dist.at<float>(y + ty, x + tx) = dist.at<float>(y, x) + gra_e[i];
+                    dirt.at<uchar>(y + ty, x + tx) = 1;
+                    clean = false;
+                }
+            }
+        }
+
+
+
+    }
+
+    Mat nnFlow(i1.rows, i1.cols, CV_32FC2, Scalar(0));
+    for (y = 0; y < i1.rows; y++) {
+        for (x = 0; x < i1.cols; x++) {
+
+            int id = quellknoten.at<int>(y, x);
+            if (id != -1)
+            {
+                nnFlow.at<Point2f>(y, x) = nextPoints[id] - prevPoints[id];
+            }
+        }
+    }
+    return nnFlow;
+}
+
+Mat interpolate_irregular_knn(
+    const std::vector<Point2f> & _prevPoints,
+    const std::vector<Point2f> & _nextPoints,
+    const std::vector<uchar> & status,
+    const Mat &color_img,
+    int k,
+    float pixeldistance)
+{
+    Mat in(color_img.rows, color_img.cols, CV_32FC2);
+    Mat mask = Mat::zeros(color_img.rows, color_img.cols, CV_8UC1);
+
+    for (unsigned n = 0; n < _prevPoints.size(); n++)
+    {
+        if (_prevPoints[n].x >= 0 && _prevPoints[n].y >= 0 && _prevPoints[n].x < color_img.cols && _prevPoints[n].y < color_img.rows)
+        {
+            in.at<Point2f>(_prevPoints[n]) = _nextPoints[n] - _prevPoints[n];
+            mask.at<uchar>(_prevPoints[n]) = status[n];
+        }
+
+    }
+    int Dx[] = { -1,0,1,-1,1,-1,0,1 };
+    int Dy[] = { -1,-1,-1,0,0,1,1,1 };
+    Mat gra = getGraph(color_img, pixeldistance);
+    Mat nnFlow(in.rows, in.cols, CV_32FC2, Scalar(0));
+
+    std::multimap<float, Vec2i> my_agents; // <arrivaltime, <target, color>>
+    Graph_helper graph_helper(k, in.rows*in.cols); //< arrivaltime, color>
+
+
+    int color = 0;
+    std::vector<Vec2i> flow_point_list;
+    for (int y = 0; y < in.rows; y++) {
+        for (int x = 0; x < in.cols; x++) {
+            if (mask.at<uchar>(y, x) > 0) {
+                flow_point_list.push_back(Vec2i(y, x));
+                nnFlow.at<Point2f>(y, x) = in.at<Point2f>(y, x);
+
+                int v_id = (y * in.cols + x);
+                graph_helper.add(v_id, std::pair<float, int>(0.f, color));
+
+
+                Vec8f gra_e = gra.at<Vec8f>(y, x);
+                for (int i = 0; i < 8; i++) {
+                    if (gra_e[i] < 0)
+                        continue;
+                    int dx = Dx[i];
+                    int dy = Dy[i];
+                    int target = (y + dy) * in.cols + (x + dx);
+                    Vec2i agent(target, color);
+                    my_agents.insert(std::pair<float, Vec2i>(gra_e[i], agent));
+
+                }
+                color++;
+            }
+        }
+    }
+
+    int global_time = 0;
+
+    bool done = false;
+    while (!done) {
+        if (my_agents.size() == 0) {
+            done = true;
+            break;
+        }
+        global_time++;
+
+        std::multimap<float, Vec2i>::iterator current_it = my_agents.begin();
+        std::pair<float, Vec2i> current_p = *current_it;
+        my_agents.erase(current_it);
+
+        int target = current_p.second[0];
+        color = current_p.second[1];
+        float arriv_time = current_p.first;
+
+        Vec8f gra_e = gra.at<Vec8f>(target); // (y*cols+x)
+        if (graph_helper.size(target) >= k) {
+            continue;
+        }
+
+        bool color_found_in_target = graph_helper.color_in_target(target, color);
+        if (color_found_in_target) {
+            continue;
+        }
+        graph_helper.add(target, std::pair<float, int>(arriv_time, color));
+
+
+        for (int i = 0; i < 8; i++) {
+            if (gra_e[i] < 0)
+                continue;
+
+            int dx = Dx[i];
+            int dy = Dy[i];
+            int new_target = target + dx + (dy*in.cols);
+            if (graph_helper.size(new_target) >= k) {
+                continue;
+            }
+            color_found_in_target = graph_helper.color_in_target(new_target, color);
+            if (color_found_in_target) {
+                continue;
+            }
+            Vec2i new_agent(new_target, color);
+            my_agents.insert(std::pair<float, Vec2i>(arriv_time + gra_e[i], new_agent));
+
+        }
+    }
+
+    Mat ret(in.rows, in.cols*k, CV_32FC2);
+    for (int y = 0; y < in.rows; y++) {
+        for (int x = 0; x < in.cols; x++) {
+            for (int i = 0; i < k; i++) {
+                float dist = *((float*)(graph_helper.data(y*in.cols + x) + 2 * i));
+                float id = *((float*)(graph_helper.data(y*in.cols + x) + 2 * i + 1));
+                ret.at<Vec2f>(y, k*x + i) = Vec2f(dist, id);
+            }
+        }
+    }
+    return ret;
+}
+
+Mat getGraph(const Mat &image, float edge_length)
+{
+
+    int Dx[] = { -1,0,1,-1,1,-1,0,1 };
+    int Dy[] = { -1,-1,-1,0,0,1,1,1 };
+    Mat gra(image.rows, image.cols, CV_32FC(8));
+
+    for (int y = 0; y < gra.rows; y++) {
+        for (int x = 0; x < gra.cols; x++) {
+
+            for (int i = 0; i < 8; i++) {
+                int dx = Dx[i];
+                int dy = Dy[i];
+                gra.at<Vec8f>(y, x)[i] = -1;
+
+                if (x + dx < 0 || y + dy < 0 || x + dx >= gra.cols || y + dy >= gra.rows) {
+                    continue;
+                }
+
+                if (i < 4) {
+                    int si = 7 - i;
+                    gra.at<Vec8f>(y, x)[i] = gra.at<Vec8f>(y + dy, x + dx)[si];
+                }
+                else {
+                    float p1 = dx * dx*edge_length*edge_length + dy * dy*edge_length*edge_length;
+                    float p2 = static_cast<float>(image.at<Vec3b>(y, x)[0] - image.at<Vec3b>(y + dy, x + dx)[0]);
+                    float p3 = static_cast<float>(image.at<Vec3b>(y, x)[1] - image.at<Vec3b>(y + dy, x + dx)[1]);
+                    float p4 = static_cast<float>(image.at<Vec3b>(y, x)[2] - image.at<Vec3b>(y + dy, x + dx)[2]);
+                    gra.at<Vec8f>(y, x)[i] = sqrt(p1 + p2 * p2 + p3 * p3 + p4 * p4);
+                }
+
+            }
+
+        }
+    }
+
+    return gra;
+}
+
+
+Mat interpolate_irregular_nn(
+    const std::vector<Point2f> & _prevPoints,
+    const std::vector<Point2f> & _nextPoints,
+    const std::vector<uchar> & status,
+    const Mat &color_img,
+    float pixeldistance)
+{
+    int Dx[] = { -1,0,1,-1,1,-1,0,1 };
+    int Dy[] = { -1,-1,-1,0,0,1,1,1 };
+    std::vector<Point2f> prevPoints, nextPoints;
+    std::map<std::pair<float, float>, std::pair<float, float> > flowMap;
+    for (unsigned n = 0; n < _prevPoints.size(); n++)
+    {
+        if (status[n] != 0)
+        {
+            flowMap.insert(std::make_pair(
+                std::make_pair(_prevPoints[n].x, _prevPoints[n].y),
+                std::make_pair(_nextPoints[n].x, _nextPoints[n].y)));
+            prevPoints.push_back(_prevPoints[n]);
+            nextPoints.push_back(_nextPoints[n]);
+        }
+
+    }
+
+    Mat gra = getGraph(color_img, pixeldistance);
+
+    Mat prev;
+    Mat geo_dist = sgeo_dist(gra, prevPoints, std::numeric_limits<float>::max(), prev);
+
+
+    Mat nnFlow = Mat::zeros(color_img.size(), CV_32FC2);
+    for (int y = 0; y < nnFlow.rows; y++)
+    {
+        for (int x = 0; x < nnFlow.cols; x++)
+        {
+            int cx = x;
+            int cy = y;
+            while (prev.at<uchar>(cy, cx) != 255)
+            {
+                int i = prev.at<uchar>(cy, cx);
+                cx += Dx[i];
+                cy += Dy[i];
+            }
+            auto val = flowMap[std::make_pair(static_cast<float>(cx), static_cast<float>(cy))];
+            nnFlow.at<Vec2f>(y, x) = Vec2f(val.first - cx, val.second - cy);
+        }
+    }
+    return nnFlow;
+}
+
+}} // namespace
diff --git a/modules/optflow/src/rlof/geo_interpolation.hpp b/modules/optflow/src/rlof/geo_interpolation.hpp
new file mode 100644
index 00000000000..526768903b5
--- /dev/null
+++ b/modules/optflow/src/rlof/geo_interpolation.hpp
@@ -0,0 +1,35 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+#ifndef _GEO_INTERPOLATION_HPP_
+#define _GEO_INTERPOLATION_HPP_
+
+namespace cv {
+namespace optflow {
+
+typedef Vec<float, 8> Vec8f;
+Mat getGraph(const Mat & image, float edge_length);
+Mat sgeo_dist(const Mat& gra, int y, int x, float max, Mat &prev);
+Mat sgeo_dist(const Mat& gra, const std::vector<Point2f> & points, float max, Mat &prev);
+Mat interpolate_irregular_nw(const Mat &in, const Mat &mask, const Mat &color_img, float max_d, float bandwidth, float pixeldistance);
+Mat interpolate_irregular_nn(
+    const std::vector<Point2f> & prevPoints,
+    const std::vector<Point2f> & nextPoints,
+    const std::vector<uchar> & status,
+    const Mat &color_img,
+    float pixeldistance);
+Mat interpolate_irregular_knn(
+    const std::vector<Point2f> & _prevPoints,
+    const std::vector<Point2f> & _nextPoints,
+    const std::vector<uchar> & status,
+    const Mat &color_img,
+    int k,
+    float pixeldistance);
+
+Mat interpolate_irregular_nn_raster(const std::vector<Point2f> & prevPoints,
+    const std::vector<Point2f> & nextPoints,
+    const std::vector<uchar> & status,
+    const Mat & i1);
+
+}} // namespace
+#endif
diff --git a/modules/optflow/src/rlof/plk_invoker.hpp b/modules/optflow/src/rlof/plk_invoker.hpp
new file mode 100644
index 00000000000..3c0bad6f963
--- /dev/null
+++ b/modules/optflow/src/rlof/plk_invoker.hpp
@@ -0,0 +1,1087 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
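+//
+// Overview (summarizing the code below): the TrackerInvoker classes in this header
+// implement one pyramid level of the gain-adaptive pyramidal Lucas-Kanade refinement.
+// The per-pixel residual follows the linear illumination model of @cite Senst2016,
+//     diff = J(x + d) - I(x) + I(x) * gain.x + gain.y,
+// so each iteration estimates four unknowns (the displacement d and the gain/bias
+// pair) by inverting the 4x4 extended structure tensor (invTensorMat) accumulated
+// from Ix, Iy, I and the support-region sums.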
+#ifndef _PLK_INVOKER_HPP_ +#define _PLK_INVOKER_HPP_ +#include "rlof_invokerbase.hpp" + + +namespace cv { +namespace optflow { +namespace plk { + +// implementierung ohne SSE +namespace radial { + +class TrackerInvoker : public cv::ParallelLoopBody +{ +public: + TrackerInvoker( + + const Mat& _prevImg, + const Mat& _prevDeriv, + const Mat& _nextImg, + const Mat& _rgbPrevImg, + const Mat& _rgbNextImg, + const Point2f* _prevPts, + Point2f* _nextPts, + uchar* _status, + float* _err, + Point2f* _gainVecs, + int _level, + int _maxLevel, + int _winSize[2], + int _maxIteration, + bool _useInitialFlow, + int _supportRegionType, + float _minEigenValue, + int _crossSegmentationThreshold) + { + prevImg = &_prevImg; + prevDeriv = &_prevDeriv; + nextImg = &_nextImg; + rgbPrevImg = &_rgbPrevImg; + rgbNextImg = &_rgbNextImg; + prevPts = _prevPts; + nextPts = _nextPts; + status = _status; + err = _err; + gainVecs = _gainVecs; + minWinSize = _winSize[0]; + maxWinSize = _winSize[1]; + criteria.maxCount = _maxIteration; + criteria.epsilon = 0.01; + level = _level; + maxLevel = _maxLevel; + windowType = _supportRegionType; + minEigThreshold = _minEigenValue; + useInitialFlow = _useInitialFlow; + crossSegmentationThreshold = _crossSegmentationThreshold; + } + + void operator()(const cv::Range& range) const CV_OVERRIDE + { + cv::Size winSize; + cv::Point2f halfWin; + + const Mat& I = *prevImg; + const Mat& J = *nextImg; + const Mat& derivI = *prevDeriv; + const Mat& BI = *rgbPrevImg; + + winSize = cv::Size(maxWinSize,maxWinSize); + int winMaskwidth = roundUp(winSize.width, 16); + cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); + winMaskMatBuf.setTo(1); + const float FLT_SCALE = (1.f/(1 << 16));//(1.f/(1 << 20)); // 20 + + int cn = I.channels(), cn2 = cn*2; + int winbufwidth = roundUp(winSize.width, 16); + cv::Size winBufSize(winbufwidth,winbufwidth); + + cv::Matx44f invTensorMat; + Vec4f mismatchMat; + Vec4f resultMat; + + cv::AutoBuffer _buf(winBufSize.area()*(cn + cn2)); + int derivDepth = DataType::depth; + + Mat IWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn), (deriv_type*)_buf.data()); + Mat derivIWinBuf(winBufSize, CV_MAKETYPE(derivDepth, cn2), (deriv_type*)_buf.data() + winBufSize.area()*cn); + + for( int ptidx = range.start; ptidx < range.end; ptidx++ ) + { + Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level)); + Point2f nextPt; + if( level == maxLevel ) + { + if( useInitialFlow ) + { + nextPt = nextPts[ptidx]*(float)(1./(1 << level)); + } + else + nextPt = prevPt; + } + else + nextPt = nextPts[ptidx]*2.f; + nextPts[ptidx] = nextPt; + + Point2i iprevPt, inextPt; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + int winArea = maxWinSize * maxWinSize; + cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0,0, maxWinSize,maxWinSize)); + winMaskMatBuf.setTo(0); + if( calcWinMaskMat(BI, windowType, iprevPt, + winMaskMat,winSize,halfWin,winArea, + minWinSize,maxWinSize) == false) + continue; + halfWin = Point2f(static_cast(maxWinSize) ,static_cast(maxWinSize) ) - halfWin; + prevPt += halfWin; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + + + if( iprevPt.x < 0 || iprevPt.x >= derivI.cols - winSize.width || + iprevPt.y < 0 || iprevPt.y >= derivI.rows - winSize.height - 1) + { + if( level == 0 ) + { + if( status ) + status[ptidx] = 3; + if( err ) + err[ptidx] = 0; + } + continue; + } + + float a = prevPt.x - iprevPt.x; + float b = prevPt.y - iprevPt.y; + const int W_BITS = 14, W_BITS1 = 14; + int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + int iw01 = 
cvRound(a*(1.f - b)*(1 << W_BITS)); + int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + float A11 = 0, A12 = 0, A22 = 0; + // tensor + float sumIx = 0; + float sumIy = 0; + float sumI = 0; + float sumW = 0; + float w1 = 0, w2 = 0; // -IyI + float dI = 0; // I^2 + float D = 0; + #ifdef RLOF_SSE + + __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128i z = _mm_setzero_si128(); + __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1-1)); + __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1-5-1)); + __m128i mmMask4_epi32; + __m128i mmMaskSet_epi16 = _mm_set1_epi16(std::numeric_limits::max()); + get4BitMask(winSize.width, mmMask4_epi32); +#endif + // extract the patch from the first image, compute covariation matrix of derivatives + int x, y; + for( y = 0; y < winSize.height; y++ ) + { + const uchar* src = I.ptr(y + iprevPt.y, 0) + iprevPt.x*cn; + const uchar* src1 = I.ptr(y + iprevPt.y + 1, 0) + iprevPt.x*cn; + const short* dsrc = derivI.ptr(y + iprevPt.y, 0) + iprevPt.x*cn2; + const short* dsrc1 = derivI.ptr(y + iprevPt.y + 1, 0) + iprevPt.x*cn2; + short* Iptr = IWinBuf.ptr(y, 0); + short* dIptr = derivIWinBuf.ptr(y, 0); + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); + x = 0; +#ifdef RLOF_SSE + for( ; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4*2, dsrc1 += 8, dIptr += 4*2 ) + { + __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16); + __m128i mask_0_3_epi16 = _mm_unpacklo_epi16(mask_0_7_epi16, mask_0_7_epi16); + __m128i v00, v01, v10, v11, t0, t1; + v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); + v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); + v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x)), z); + v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x + cn)), z); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); + if( x + 4 > winSize.width) + { + t0 = _mm_and_si128(t0, mmMask4_epi32); + } + t0 = _mm_and_si128(t0, mask_0_3_epi16); + _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0,t0)); + + + v00 = _mm_loadu_si128((const __m128i*)(dsrc)); + v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); + v10 = _mm_loadu_si128((const __m128i*)(dsrc1)); + v11 = _mm_loadu_si128((const __m128i*)(dsrc1 + cn2)); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); + v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... 
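+                // v00 now holds the interleaved Ix/Iy pairs of four pixels. The two
+                // mask steps below zero the lanes beyond winSize.width when the row
+                // length is not a multiple of 4 and then apply the per-pixel
+                // support-region mask, so masked pixels contribute neither to the
+                // gradient matrix nor to the mismatch vector.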
+ if( x + 4 > winSize.width) + { + v00 = _mm_and_si128(v00, mmMask4_epi32); + } + v00 = _mm_and_si128(v00, mask_0_3_epi16); + _mm_storeu_si128((__m128i*)dIptr, v00); + } +#else + for( ; x < winSize.width*cn; x++, dsrc += 2, dsrc1 += 2, dIptr += 2 ) + { + if( maskPtr[x] == 0) + { + dIptr[0] = 0; + dIptr[1] = 0; + continue; + } + int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + + src1[x]*iw10 + src1[x+cn]*iw11, W_BITS1-5); + int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + + dsrc1[0]*iw10 + dsrc1[cn2]*iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc1[1]*iw10 + + dsrc1[cn2+1]*iw11, W_BITS1); + + Iptr[x] = (short)ival; + dIptr[0] = (short)ixval; + dIptr[1] = (short)iyval; + } +#endif + + } + + cv::Point2f backUpNextPt = nextPt; + nextPt += halfWin; + Point2f prevDelta(0,0); //relates to h(t-1) + Point2f prevGain(1,0); + cv::Point2f gainVec = gainVecs[ptidx]; + cv::Point2f backUpGain = gainVec; + int j; + for( j = 0; j < criteria.maxCount; j++ ) + { + status[ptidx] = static_cast(j); + inextPt.x = cvFloor(nextPt.x); + inextPt.y = cvFloor(nextPt.y); + + if( inextPt.x < 0 || inextPt.x >= J.cols - winSize.width || + inextPt.y < 0 || inextPt.y >= J.rows - winSize.height - 1) + { + if( level == 0 && status ) + status[ptidx] = 3; + break; + } + + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + float b1,b2,b3,b4; + b1 = 0; + b2 = 0; + b3 = 0; + b4 = 0; + + if( j == 0 ) + { + // tensor + w1 = 0; // -IxI + w2 = 0; // -IyI + dI = 0; // I^2 + sumIx = 0; + sumIy = 0; + sumI = 0; + sumW = 0; + A11 = 0; + A12 = 0; + A22 = 0; + } +#ifdef RLOF_SSE + qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps(), qb2 = _mm_setzero_ps(); + __m128 qb3 = _mm_setzero_ps(); + __m128 mmSumW1 = _mm_setzero_ps(), mmSumW2 = _mm_setzero_ps(); + __m128 mmSumI = _mm_setzero_ps(), mmSumW = _mm_setzero_ps(), mmSumDI = _mm_setzero_ps(); + __m128 mmSumIy = _mm_setzero_ps(), mmSumIx = _mm_setzero_ps(); + __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps(); + float gainVal = gainVec.x > 0 ? gainVec.x : -gainVec.x; + int bitShift = gainVec.x == 0 ? 1 : cvCeil(log(200.f / gainVal) / log(2.f)); + __m128i mmGainValue_epi16 = _mm_set1_epi16(static_cast(gainVec.x * (float)(1 << bitShift))); + __m128i mmConstValue_epi16 = _mm_set1_epi16(static_cast(gainVec.y)); +#endif + for( y = 0; y < winSize.height; y++ ) + { + const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const uchar* Jptr1 = J.ptr(y + inextPt.y + 1, inextPt.x*cn); + const short* Iptr = IWinBuf.ptr(y, 0); + const short* dIptr = derivIWinBuf.ptr(y, 0); + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); + x = 0; +#ifdef RLOF_SSE + + for( ; x <= winSize.width*cn; x += 8, dIptr += 8*2 ) + { + // iw = iw << 14 (16384 short / 2) + // iq0 = iw01 |iw00, iw01 |iw00, iw01 |iw00, iw01 |iw00 16 bit + __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16); + + // I [0...8] + __m128i diff0, diff1; + __m128i I_0_7_epi16 = _mm_loadu_si128((const __m128i*)(Iptr + x)); // element 0 to 7 + + __m128i v00 = _mm_unpacklo_epi8( + _mm_loadl_epi64((const __m128i*)(Jptr + x)) // J0, J1, J2, ..., J7, 64 bit nullen + , z); //J0 , 0, J1, 0, J2, 0 ... 
J7,0 + __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); + __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x)), z); + __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x + cn)), z); + + + __m128i t0 = _mm_add_epi32 + (_mm_madd_epi16( + _mm_unpacklo_epi16(v00, v01), //J0, , J1, J1, J2, J2, ... J3 , J4 + qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + + __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1-5); + + + __m128i Ilo = _mm_mullo_epi16(mmGainValue_epi16, I_0_7_epi16); + __m128i Ihi = _mm_mulhi_epi16(mmGainValue_epi16, I_0_7_epi16); + + __m128i Igain_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(Ilo, Ihi), bitShift); + __m128i Igain_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(Ilo, Ihi), bitShift); + __m128i Igain_epi16 = _mm_packs_epi32(Igain_0_3_epi32, Igain_4_7_epi32); + + // diff = J - I + I*gain.x + gain.y + __m128i mmDiff_epi16 = _mm_add_epi16( + _mm_subs_epi16(_mm_packs_epi32(t0, t1), I_0_7_epi16), // J - I + _mm_add_epi16(Igain_epi16, mmConstValue_epi16)); + + + mmDiff_epi16 = _mm_and_si128(mmDiff_epi16, mask_0_7_epi16); + + + __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... + __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); + + diff1 = _mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13... + diff0 = _mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9... + + v10 = _mm_mullo_epi16(Ixy_0, diff0); + v11 = _mm_mulhi_epi16(Ixy_0, diff0); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); + qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); + + v10 = _mm_mullo_epi16(Ixy_1, diff1); + v11 = _mm_mulhi_epi16(Ixy_1, diff1); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); + qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); + // for set 1 sigma 1 + // b3 += diff * Iptr[x] + Ilo = _mm_mullo_epi16(mmDiff_epi16, I_0_7_epi16); + Ihi = _mm_mulhi_epi16(mmDiff_epi16, I_0_7_epi16); + v00 = _mm_unpacklo_epi16(Ilo, Ihi); + v10 = _mm_unpackhi_epi16(Ilo, Ihi); + qb2 = _mm_add_ps(qb2, _mm_cvtepi32_ps(v00)); + qb2 = _mm_add_ps(qb2, _mm_cvtepi32_ps(v10)); + + // b4+= diff + __m128 mmDiff_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mmDiff_epi16)); // (_mm_srai_epi32(_mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16),16)); + __m128 mmDiff_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16),16))); + qb3 = _mm_add_ps(qb3, mmDiff_0_3_ps); + qb3 = _mm_add_ps(qb3, mmDiff_4_7_ps); + if( j == 0 ) + { + __m128 mask_0_4_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mask_0_7_epi16)); + __m128 mask_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mask_0_7_epi16, mask_0_7_epi16), 16))); + + t0 = _mm_srai_epi32(Ixy_0, 16); // Iy0 Iy1 Iy2 Iy3 + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16); // Ix0 Ix1 Ix2 Ix3 + + __m128 fy = _mm_blendv_ps(_mm_set1_ps(0), _mm_cvtepi32_ps(t0), mask_0_4_ps); + __m128 fx = _mm_blendv_ps(_mm_set1_ps(0), _mm_cvtepi32_ps(t1), mask_0_4_ps); + + // 0 ... 
3 + __m128 I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(I_0_7_epi16, I_0_7_epi16), 16)); + + // A11 - A22 + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fy)); + mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fy)); + mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fx)); + + // sumIx und sumIy + mmSumIx = _mm_add_ps(mmSumIx, fx); + mmSumIy = _mm_add_ps(mmSumIy, fy); + + mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fx)); + mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fy)); + + // sumI + I_ps = _mm_blendv_ps(_mm_set1_ps(0), I_ps, mask_0_4_ps); + mmSumI = _mm_add_ps(mmSumI,I_ps); + + // sumDI + mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps( I_ps, I_ps)); + + t0 = _mm_srai_epi32(Ixy_1, 16); // Iy8 Iy9 Iy10 Iy11 + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_1, 16), 16); // Ix0 Ix1 Ix2 Ix3 + + fy = _mm_blendv_ps(_mm_set1_ps(0), _mm_cvtepi32_ps(t0), mask_4_7_ps); + fx = _mm_blendv_ps(_mm_set1_ps(0), _mm_cvtepi32_ps(t1), mask_4_7_ps); + + // 4 ... 7 + I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(I_0_7_epi16, I_0_7_epi16), 16)); + + + + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fy)); + mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fy)); + mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fx)); + + // sumIx und sumIy + mmSumIx = _mm_add_ps(mmSumIx, fx); + mmSumIy = _mm_add_ps(mmSumIy, fy); + + mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fx)); + mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fy)); + + // sumI + I_ps = _mm_blendv_ps(_mm_set1_ps(0), I_ps, mask_4_7_ps); + mmSumI = _mm_add_ps(mmSumI, I_ps); + + // sumW + // sumDI + mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps( I_ps, I_ps)); + } + + } +#else + for( ; x < winSize.width*cn; x++, dIptr += 2 ) + { + if( maskPtr[x] == 0) + continue; + int J_val = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + + Jptr1[x]*iw10 + Jptr1[x+cn]*iw11, + W_BITS1-5); + int diff = static_cast(J_val - Iptr[x] + Iptr[x] * gainVec.x + gainVec.y); + + b1 += (float)(diff*dIptr[0]) * FLT_RESCALE; + b2 += (float)(diff*dIptr[1]) * FLT_RESCALE; + b3 += (float)(diff) * Iptr[x] * FLT_RESCALE; + b4 += (float)(diff); + + // compute the Gradient Matrice + if( j == 0 ) + { + A11 += (float)(dIptr[0]*dIptr[0]); + A12 += (float)(dIptr[0]*dIptr[1]); + A22 += (float)(dIptr[1]*dIptr[1]); + + + dI += Iptr[x] * Iptr[x] * FLT_RESCALE; + float dx = static_cast(dIptr[0]) * FLT_RESCALE; + float dy = static_cast(dIptr[1]) * FLT_RESCALE; + sumIx += dx; + sumIy += dy; + w1 += dx * Iptr[x]; + w2 += dy * Iptr[x]; + sumI += Iptr[x] * FLT_RESCALE; + //sumW += FLT_RESCALE; + + } + } +#endif + } +#ifdef RLOF_SSE + float CV_DECL_ALIGNED(16) wbuf[4]; +#endif + if( j == 0 ) + { +#ifdef RLOF_SSE + _mm_store_ps(wbuf, mmSumW1); + w1 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumW2); + w2 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumDI); + dI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumI); + sumI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumIx); + sumIx = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumIy); + sumIy = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; + _mm_store_ps(wbuf, mmSumW); + sumW = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3]; +#endif + sumIx *= -FLT_SCALE; + sumIy *= -FLT_SCALE; + sumI *=FLT_SCALE; + sumW = winArea * FLT_SCALE; + w1 *= -FLT_SCALE; + w2 *= -FLT_SCALE; + dI *= FLT_SCALE; +#ifdef RLOF_SSE + float CV_DECL_ALIGNED(32) A11buf[4], A12buf[4], A22buf[4];// + + _mm_store_ps(A11buf, mmAxx); + _mm_store_ps(A12buf, mmAxy); + _mm_store_ps(A22buf, mmAyy); + + A11 = A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; + A12 = 
A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; + A22 = A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; +#endif + A11 *= FLT_SCALE; + A12 *= FLT_SCALE; + A22 *= FLT_SCALE; + } + +#ifdef RLOF_SSE + float CV_DECL_ALIGNED(16) bbuf[4]; + _mm_store_ps(bbuf, _mm_add_ps(qb0, qb1)); + b1 = bbuf[0] + bbuf[2]; + b2 = bbuf[1] + bbuf[3]; + _mm_store_ps(bbuf, qb2); + b3 = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; + _mm_store_ps(bbuf, qb3); + b4 = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; +#endif + + mismatchMat(0) = b1 * FLT_SCALE; + mismatchMat(1) = b2 * FLT_SCALE; + mismatchMat(2) = -b3 * FLT_SCALE; + mismatchMat(3) = -b4 * FLT_SCALE; + + D = - A12*A12*sumI*sumI + dI*sumW*A12*A12 + 2*A12*sumI*sumIx*w2 + 2*A12*sumI*sumIy*w1 + - 2*dI*A12*sumIx*sumIy - 2*sumW*A12*w1*w2 + A11*A22*sumI*sumI - 2*A22*sumI*sumIx*w1 + - 2*A11*sumI*sumIy*w2 - sumIx*sumIx*w2*w2 + A22*dI*sumIx*sumIx + 2*sumIx*sumIy*w1*w2 + - sumIy*sumIy*w1*w1 + A11*dI*sumIy*sumIy + A22*sumW*w1*w1 + A11*sumW*w2*w2 - A11*A22*dI*sumW; + + + float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + + 4.f*A12*A12))/(2*winArea); + if( minEig < minEigThreshold ) + { + if( (level == 0 && status) || std::abs(D) < FLT_EPSILON) + status[ptidx] = 0; + if( level > 0) + { + nextPts[ptidx] = backUpNextPt; + gainVecs[ptidx] = backUpGain; + } + break; + } + + + + D = (1.f / D); + invTensorMat(0,0) = (A22*sumI*sumI - 2*sumI*sumIy*w2 + dI*sumIy*sumIy + sumW*w2*w2 - A22*dI*sumW)* D; + invTensorMat(0,1) = (A12*dI*sumW - A12*sumI * sumI - dI*sumIx*sumIy + sumI*sumIx*w2 + sumI*sumIy*w1 - sumW*w1*w2)* D; + invTensorMat(0,2) = (A12*sumI*sumIy - sumIy*sumIy*w1 - A22*sumI*sumIx - A12*sumW*w2 + A22*sumW*w1 + sumIx*sumIy*w2)* D; + invTensorMat(0,3) = (A22*dI*sumIx - A12*dI*sumIy - sumIx*w2*w2 + A12*sumI*w2 - A22*sumI*w1 + sumIy*w1*w2) * D; + invTensorMat(1,0) = invTensorMat(0,1); + invTensorMat(1,1) = (A11*sumI * sumI - 2*sumI*sumIx*w1 + dI*sumIx * sumIx + sumW*w1*w1 - A11*dI*sumW) * D; + invTensorMat(1,2) = (A12*sumI*sumIx - A11*sumI*sumIy - sumIx * sumIx*w2 + A11*sumW*w2 - A12*sumW*w1 + sumIx*sumIy*w1) * D; + invTensorMat(1,3) = (A11*dI*sumIy - sumIy*w1*w1 - A12*dI*sumIx - A11*sumI*w2 + A12*sumI*w1 + sumIx*w1*w2)* D; + invTensorMat(2,0) = invTensorMat(0,2); + invTensorMat(2,1) = invTensorMat(1,2); + invTensorMat(2,2) = (sumW*A12*A12 - 2*A12*sumIx*sumIy + A22*sumIx*sumIx + A11*sumIy*sumIy - A11*A22*sumW)* D; + invTensorMat(2,3) = (A11*A22*sumI - A12*A12*sumI - A11*sumIy*w2 + A12*sumIx*w2 + A12*sumIy*w1 - A22*sumIx*w1)* D; + invTensorMat(3,0) = invTensorMat(0,3); + invTensorMat(3,1) = invTensorMat(1,3); + invTensorMat(3,2) = invTensorMat(2,3); + invTensorMat(3,3) = (dI*A12*A12 - 2*A12*w1*w2 + A22*w1*w1 + A11*w2*w2 - A11*A22*dI)* D; + + resultMat = invTensorMat * mismatchMat; + + // 0.08 -12.10 + Point2f delta(-resultMat(0), -resultMat(1)); + Point2f deltaGain(-resultMat(2), -resultMat(3)); + + + if( j == 0) + prevGain = deltaGain; + nextPt += delta; + nextPts[ptidx] = nextPt - halfWin; + gainVecs[ptidx]= gainVec; + if( delta.ddot(delta) <= criteria.epsilon) + break; + if (( + std::abs(delta.x - prevDelta.x) < 0.01 && + std::abs(delta.y - prevDelta.y) < 0.01 + ) || ( + delta.ddot(delta) <= 0.001 && + std::abs(prevGain.x - deltaGain.x) < 0.01 + )) + { + nextPts[ptidx] -= delta*0.5f; + gainVecs[ptidx] -= deltaGain* 0.5f; + break; + } + + if(j > 0 && std::abs(delta.x - prevDelta.x) < 0.01 && + std::abs(delta.y - prevDelta.y) < 0.01) + { + nextPts[ptidx] -= delta*0.5f; + break; + } + + prevDelta = delta; + prevGain = deltaGain; + + } + + + } + + } + + const Mat* prevImg; + const Mat* 
nextImg; + const Mat* prevDeriv; + const Mat* rgbPrevImg; + const Mat* rgbNextImg; + const Point2f* prevPts; + Point2f* nextPts; + uchar* status; + cv::Point2f* gainVecs; // gain vector x -> multiplier y -> offset + float* err; + int maxWinSize; + int minWinSize; + TermCriteria criteria; + int level; + int maxLevel; + int windowType; + float minEigThreshold; + bool useInitialFlow; + int crossSegmentationThreshold; +}; + +} // namespace +namespace ica { + +class TrackerInvoker : public cv::ParallelLoopBody +{ +public: + TrackerInvoker( + const Mat& _prevImg, + const Mat& _prevDeriv, + const Mat& _nextImg, + const Mat& _rgbPrevImg, + const Mat& _rgbNextImg, + const Point2f* _prevPts, + Point2f* _nextPts, + uchar* _status, + float* _err, + int _level, + int _maxLevel, + int _winSize[2], + int _maxIteration, + bool _useInitialFlow, + int _supportRegionType, + int _crossSegmentationThreshold, + float _minEigenValue) + { + prevImg = &_prevImg; + prevDeriv = &_prevDeriv; + nextImg = &_nextImg; + rgbPrevImg = &_rgbPrevImg; + rgbNextImg = &_rgbNextImg; + prevPts = _prevPts; + nextPts = _nextPts; + status = _status; + err = _err; + minWinSize = _winSize[0]; + maxWinSize = _winSize[1]; + criteria.maxCount = _maxIteration; + criteria.epsilon = 0.01; + level = _level; + maxLevel = _maxLevel; + windowType = _supportRegionType; + minEigThreshold = _minEigenValue; + useInitialFlow = _useInitialFlow; + crossSegmentationThreshold = _crossSegmentationThreshold; + } + + void operator()(const cv::Range& range) const CV_OVERRIDE + { + cv::Size winSize; + cv::Point2f halfWin; + + const Mat& I = *prevImg; + const Mat& J = *nextImg; + const Mat& derivI = *prevDeriv; + const Mat& BI = *rgbPrevImg; + + winSize = cv::Size(maxWinSize,maxWinSize); + int winMaskwidth = roundUp(winSize.width, 8) * 2; + cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); + winMaskMatBuf.setTo(1); + + + + const float FLT_SCALE = (1.f/(1 << 20)); // 20 + + int j, cn = I.channels(), cn2 = cn*2; + int winbufwidth = roundUp(winSize.width, 8); + cv::Size winBufSize(winbufwidth,winbufwidth); + + std::vector _buf(winBufSize.area()*(cn + cn2)); + Mat IWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn), &_buf[0]); + Mat derivIWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn2), &_buf[winBufSize.area()*cn]); + + for( int ptidx = range.start; ptidx < range.end; ptidx++ ) + { + Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level)); + Point2f nextPt; + if( level == maxLevel ) + { + if( useInitialFlow ) + nextPt = nextPts[ptidx]*(float)(1./(1 << level)); + else + nextPt = prevPt; + } + else + nextPt = nextPts[ptidx]*2.f; + nextPts[ptidx] = nextPt; + + Point2i iprevPt, inextPt; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + int winArea = maxWinSize * maxWinSize; + cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0,0, maxWinSize,maxWinSize)); + + if( calcWinMaskMat(BI, windowType, iprevPt, + winMaskMat,winSize,halfWin,winArea, + minWinSize,maxWinSize) == false) + continue; + halfWin = Point2f(static_cast(maxWinSize), static_cast(maxWinSize)) - halfWin; + prevPt += halfWin; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + + if( iprevPt.x < 0 || iprevPt.x >= derivI.cols - winSize.width || + iprevPt.y < 0 || iprevPt.y >= derivI.rows - winSize.height - 1) + { + if( level == 0 ) + { + if( status ) + status[ptidx] = 3; + if( err ) + err[ptidx] = 0; + } + continue; + } + + float a = prevPt.x - iprevPt.x; + float b = prevPt.y - iprevPt.y; + const int W_BITS = 14, W_BITS1 = 14; + + int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << 
W_BITS)); + int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + float A11 = 0, A12 = 0, A22 = 0; + +#ifdef RLOF_SSE + __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128i z = _mm_setzero_si128(); + __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1-1)); + __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1-5-1)); + __m128 qA11 = _mm_setzero_ps(), qA12 = _mm_setzero_ps(), qA22 = _mm_setzero_ps(); + __m128i mmMask4_epi32; + get4BitMask(winSize.width, mmMask4_epi32); +#endif + + // extract the patch from the first image, compute covariation matrix of derivatives + int x, y; + for( y = 0; y < winSize.height; y++ ) + { + const uchar* src = I.ptr(y + iprevPt.y, 0) + iprevPt.x*cn; + const uchar* src1 = I.ptr(y + iprevPt.y + 1, 0) + iprevPt.x*cn; + const short* dsrc = derivI.ptr(y + iprevPt.y, 0) + iprevPt.x*cn2; + const short* dsrc1 = derivI.ptr(y + iprevPt.y + 1, 0) + iprevPt.x*cn2; + short* Iptr = IWinBuf.ptr(y, 0); + short* dIptr = derivIWinBuf.ptr(y, 0); + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); + x = 0; +#ifdef RLOF_SSE + for( ; x <= winSize.width*cn; x += 4, dsrc += 4*2, dsrc1 += 8, dIptr += 4*2 ) + { + __m128i wMask = _mm_set_epi32(MaskSet * maskPtr[x+3], + MaskSet * maskPtr[x+2], + MaskSet * maskPtr[x+1], + MaskSet * maskPtr[x]); + __m128i v00, v01, v10, v11, t0, t1; + v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); + v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); + v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x)), z); + v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x + cn)), z); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + + + + + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); + _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0,t0)); + + + + v00 = _mm_loadu_si128((const __m128i*)(dsrc)); + v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); + v10 = _mm_loadu_si128((const __m128i*)(dsrc1)); + v11 = _mm_loadu_si128((const __m128i*)(dsrc1 + cn2)); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); + v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... 
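+                    // The packed result holds the interpolated derivatives interleaved as
+                    // (Ix0, Iy0, Ix1, Iy1, ...). A scalar sketch of the de-interleave used
+                    // just below (arithmetic shifts preserve the sign of the 16 bit words):
+                    //   int iy = v00 >> 16;            // high word: Iy
+                    //   int ix = (v00 << 16) >> 16;    // sign-extended low word: Ix
+                    // The masking that follows zeroes the lanes past the window border and
+                    // outside the support region, so they do not contribute to the Gram
+                    // matrix entries A11, A12 and A22.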
+ + if( x + 4 > winSize.width) + { + v00 = _mm_and_si128(v00, mmMask4_epi32); + } + v00 = _mm_and_si128(v00, wMask); + + _mm_storeu_si128((__m128i*)dIptr, v00); + t0 = _mm_srai_epi32(v00, 16); // Iy0 Iy1 Iy2 Iy3 + t1 = _mm_srai_epi32(_mm_slli_epi32(v00, 16), 16); // Ix0 Ix1 Ix2 Ix3 + + __m128 fy = _mm_cvtepi32_ps(t0); + __m128 fx = _mm_cvtepi32_ps(t1); + + + qA22 = _mm_add_ps(qA22, _mm_mul_ps(fy, fy)); + qA12 = _mm_add_ps(qA12, _mm_mul_ps(fx, fy)); + qA11 = _mm_add_ps(qA11, _mm_mul_ps(fx, fx)); + } +#else + + for( ; x < winSize.width*cn; x++, dsrc += 2, dsrc1 += 2, dIptr += 2 ) + { + if( maskPtr[x] == 0) + { + dIptr[0] = 0; + dIptr[1] = 0; + continue; + } + int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + + src1[x]*iw10 + src1[x+cn]*iw11, W_BITS1-5); + int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + + dsrc1[0]*iw10 + dsrc1[cn2]*iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc1[1]*iw10 + + dsrc1[cn2+1]*iw11, W_BITS1); + + Iptr[x] = (short)ival; + dIptr[0] = (short)ixval; + dIptr[1] = (short)iyval; + A11 += (float)(ixval*ixval); + A12 += (float)(ixval*iyval); + A22 += (float)(iyval*iyval); + } +#endif + } + +#ifdef RLOF_SSE + float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4]; + _mm_store_ps(A11buf, qA11); + _mm_store_ps(A12buf, qA12); + _mm_store_ps(A22buf, qA22); + A11 += A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; + A12 += A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; + A22 += A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; +#endif + + A11 *= FLT_SCALE; + A12 *= FLT_SCALE; + A22 *= FLT_SCALE; + + float D = A11*A22 - A12*A12; + float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + + 4.f*A12*A12))/(2 * winArea); + + if( err ) + err[ptidx] = (float)minEig; + + if( minEig < minEigThreshold || D < FLT_EPSILON ) + { + if( level == 0 && status ) + status[ptidx] = 0; + continue; + } + + D = 1.f/D; + + nextPt += halfWin; + Point2f prevDelta(0,0); //relates to h(t-1) +#ifdef RLOF_SSE + __m128i mmMask0, mmMask1, mmMask; + getWBitMask(winSize.width, mmMask0, mmMask1, mmMask); +#endif + for( j = 0; j < criteria.maxCount; j++ ) + { + status[ptidx] = static_cast(j); + inextPt.x = cvFloor(nextPt.x); + inextPt.y = cvFloor(nextPt.y); + + if( inextPt.x < 0 || inextPt.x >= J.cols - winSize.width || + inextPt.y < 0 || inextPt.y >= J.rows - winSize.height - 1) + { + if( level == 0 && status ) + status[ptidx] = 3; + break; + } + + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + float b1 = 0, b2 = 0; +#ifdef RLOF_SSE + qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps(); + + +#endif + for( y = 0; y < winSize.height; y++ ) + { + const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const uchar* Jptr1 = J.ptr(y + inextPt.y + 1, inextPt.x*cn); + const short* Iptr = IWinBuf.ptr(y, 0); + const short* dIptr = derivIWinBuf.ptr(y, 0); + + x = 0; +#ifdef RLOF_SSE + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); + for( ; x <= winSize.width*cn; x += 8, dIptr += 8*2 ) + { + if( maskPtr[x ] == 0 && maskPtr[x+1] == 0 && maskPtr[x+2] == 0 && maskPtr[x+3] == 0 + && maskPtr[x+4] == 0 && maskPtr[x+5] == 0 && maskPtr[x+6] == 0 && maskPtr[x+7] == 0) + continue; + __m128i diff0 = _mm_loadu_si128((const __m128i*)(Iptr + x)), diff1; + __m128i v00 = _mm_unpacklo_epi8(_mm_loadl_epi64((const 
__m128i*)(Jptr + x)), z); + __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); + __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x)), z); + __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x + cn)), z); + + __m128i t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1-5); + + __m128i mmDiff_epi16 = _mm_subs_epi16(_mm_packs_epi32(t0, t1), diff0); + + __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... + __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); + + if( x > winSize.width*cn - 8) + { + Ixy_0 = _mm_and_si128(Ixy_0, mmMask0); + Ixy_1 = _mm_and_si128(Ixy_1, mmMask1); + } + + + diff1 = _mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13... + diff0 = _mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9... + + + v10 = _mm_mullo_epi16(Ixy_0, diff0); + v11 = _mm_mulhi_epi16(Ixy_0, diff0); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); + qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); + + // It * Ix It * Iy [4 ... 7] + // for set 1 hi sigma 1 + + v10 = _mm_mullo_epi16(Ixy_1, diff1); + v11 = _mm_mulhi_epi16(Ixy_1, diff1); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); + qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); + } +#else + for( ; x < winSize.width*cn; x++, dIptr += 2 ) + { + if( dIptr[0] == 0 && dIptr[1] == 0) + continue; + int diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + + Jptr1[x]*iw10 + Jptr1[x+cn]*iw11, + W_BITS1-5) - Iptr[x]; + + b1 += (float)(diff*dIptr[0]) * FLT_RESCALE; + b2 += (float)(diff*dIptr[1]) * FLT_RESCALE; + } +#endif + } + +#ifdef RLOF_SSE + float CV_DECL_ALIGNED(16) bbuf[4]; + _mm_store_ps(bbuf, _mm_add_ps(qb0, qb1)); + b1 += bbuf[0] + bbuf[2]; + b2 += bbuf[1] + bbuf[3]; +#endif + + b1 *= FLT_SCALE; + b2 *= FLT_SCALE; + + Point2f delta( (float)((A12*b2 - A22*b1) * D), + (float)((A12*b1 - A11*b2) * D)); + + + + nextPt += delta; + nextPts[ptidx] = nextPt - halfWin; + + if( delta.ddot(delta) <= criteria.epsilon) + break; + if(j > 0 && std::abs(delta.x - prevDelta.x) < 0.01 && + std::abs(delta.y - prevDelta.y) < 0.01) + { + nextPts[ptidx] -= delta*0.5f; + break; + } + + prevDelta = delta; + + } + + } + + } + + const Mat* prevImg; + const Mat* nextImg; + const Mat* prevDeriv; + const Mat* rgbPrevImg; + const Mat* rgbNextImg; + const Point2f* prevPts; + Point2f* nextPts; + uchar* status; + float* err; + int maxWinSize; + int minWinSize; + TermCriteria criteria; + int level; + int maxLevel; + int windowType; + float minEigThreshold; + bool useInitialFlow; + int crossSegmentationThreshold; +}; + +}}}} // namespace +#endif diff --git a/modules/optflow/src/rlof/rlof_invoker.hpp b/modules/optflow/src/rlof/rlof_invoker.hpp new file mode 100644 index 00000000000..8dd0ecccd75 --- /dev/null +++ b/modules/optflow/src/rlof/rlof_invoker.hpp @@ -0,0 +1,1423 @@ +// This file is part of OpenCV project. 
+// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +#ifndef _RLOF_INVOKER_HPP_ +#define _RLOF_INVOKER_HPP_ +#include "rlof_invokerbase.hpp" +namespace cv { +namespace optflow { +namespace rlof { +namespace ica { + +class TrackerInvoker : public cv::ParallelLoopBody +{ +public: + TrackerInvoker( + const Mat& _prevImg, + const Mat& _prevDeriv, + const Mat& _nextImg, + const Mat& _rgbPrevImg, + const Mat& _rgbNextImg, + const Point2f* _prevPts, + Point2f* _nextPts, + uchar* _status, + float* _err, + int _level, + int _maxLevel, + int _winSize[2], + int _maxIteration, + bool _useInitialFlow, + int _supportRegionType, + const std::vector& _normSigmas, + float _minEigenValue, + int _crossSegmentationThreshold + ) : + normSigma0(_normSigmas[0]), + normSigma1(_normSigmas[1]), + normSigma2(_normSigmas[2]) + { + prevImg = &_prevImg; + prevDeriv = &_prevDeriv; + nextImg = &_nextImg; + rgbPrevImg = &_rgbPrevImg; + rgbNextImg = &_rgbNextImg; + prevPts = _prevPts; + nextPts = _nextPts; + status = _status; + err = _err; + minWinSize = _winSize[0]; + maxWinSize = _winSize[1]; + criteria.maxCount = _maxIteration; + criteria.epsilon = 0.01; + level = _level; + maxLevel = _maxLevel; + windowType = _supportRegionType; + minEigThreshold = _minEigenValue; + useInitialFlow = _useInitialFlow; + crossSegmentationThreshold = _crossSegmentationThreshold; + } + + void operator()(const cv::Range& range) const CV_OVERRIDE + { + cv::Size winSize; + cv::Point2f halfWin; + + const Mat& I = *prevImg; + const Mat& J = *nextImg; + const Mat& derivI = *prevDeriv; + const Mat& BI = *rgbPrevImg; + + winSize = cv::Size(maxWinSize, maxWinSize); + int winMaskwidth = roundUp(winSize.width, 8) * 2; + cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); + winMaskMatBuf.setTo(1); + + const float FLT_SCALE = (1.f / (1 << 20)); // 20 + + int cn = I.channels(), cn2 = cn * 2; + int winbufwidth = roundUp(winSize.width, 8); + cv::Size winBufSize(winbufwidth, winbufwidth); + + std::vector _buf(winBufSize.area()*(cn + cn2)); + Mat IWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn), &_buf[0]); + Mat derivIWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn2), &_buf[winBufSize.area()*cn]); + + for (int ptidx = range.start; ptidx < range.end; ptidx++) + { + Point2f prevPt = prevPts[ptidx] * (float)(1. / (1 << level)); + Point2f nextPt; + if (level == maxLevel) + { + if ( useInitialFlow) + nextPt = nextPts[ptidx] * (float)(1. 
/ (1 << level)); + else + nextPt = prevPt; + } + else + nextPt = nextPts[ptidx] * 2.f; + nextPts[ptidx] = nextPt; + + Point2i iprevPt, inextPt; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + int winArea = maxWinSize * maxWinSize; + cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0, 0, maxWinSize, maxWinSize)); + + if (calcWinMaskMat(BI, windowType, iprevPt, + winMaskMat, winSize, halfWin, winArea, + minWinSize, maxWinSize) == false) + continue; + halfWin = Point2f(static_cast(maxWinSize) ,static_cast(maxWinSize) ) - halfWin; + prevPt += halfWin; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + if( iprevPt.x < 0 || iprevPt.x >= derivI.cols - winSize.width || + iprevPt.y < 0 || iprevPt.y >= derivI.rows - winSize.height - 1) + { + if (level == 0) + { + if (status) + status[ptidx] = 3; + if (err) + err[ptidx] = 0; + } + continue; + } + + float a = prevPt.x - iprevPt.x; + float b = prevPt.y - iprevPt.y; + const int W_BITS = 14, W_BITS1 = 14; + + int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + float A11 = 0, A12 = 0, A22 = 0; + float b1 = 0, b2 = 0; + float D = 0; + float minEig; + +#ifdef RLOF_SSE + __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128i z = _mm_setzero_si128(); + __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1 - 1)); + __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1 - 5 - 1)); + __m128i mmMask4_epi32; + __m128i mmMaskSet_epi16 = _mm_set1_epi16(std::numeric_limits::max()); + get4BitMask(winSize.width, mmMask4_epi32); +#endif + + // extract the patch from the first image, compute covariation matrix of derivatives + int x, y; + for (y = 0; y < winSize.height; y++) + { + const uchar* src = I.ptr(y + iprevPt.y, 0) + iprevPt.x*cn; + const uchar* src1 = I.ptr(y + iprevPt.y + 1, 0) + iprevPt.x*cn; + const short* dsrc = derivI.ptr(y + iprevPt.y, 0) + iprevPt.x*cn2; + const short* dsrc1 = derivI.ptr(y + iprevPt.y + 1, 0) + iprevPt.x*cn2; + short* Iptr = IWinBuf.ptr(y, 0); + short* dIptr = derivIWinBuf.ptr(y, 0); + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); + x = 0; +#ifdef RLOF_SSE + + for (; x <= winSize.width*cn; x += 4, dsrc += 4 * 2, dsrc1 += 8, dIptr += 4 * 2) + { + __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16); + __m128i mask_0_3_epi16 = _mm_unpacklo_epi16(mask_0_7_epi16, mask_0_7_epi16); + + __m128i v00, v01, v10, v11, t0, t1; + v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); + v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); + v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x)), z); + v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x + cn)), z); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1 - 5); + if (x + 4 > winSize.width) + { + t0 = _mm_and_si128(t0, mmMask4_epi32); + } + t0 = _mm_and_si128(t0, mask_0_3_epi16); + _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0, t0)); + + v00 = _mm_loadu_si128((const __m128i*)(dsrc)); + v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); + v10 = _mm_loadu_si128((const __m128i*)(dsrc1)); + v11 = _mm_loadu_si128((const __m128i*)(dsrc1 + cn2)); + + t0 = 
_mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); + v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... + if (x + 4 > winSize.width) + { + v00 = _mm_and_si128(v00, mmMask4_epi32); + } + v00 = _mm_and_si128(v00, mask_0_3_epi16); + _mm_storeu_si128((__m128i*)dIptr, v00); + } +#else + + for (; x < winSize.width*cn; x++, dsrc += 2, dsrc1 += 2, dIptr += 2) + { + if (maskPtr[x] == 0) + { + dIptr[0] = 0; + dIptr[1] = 0; + continue; + } + int ival = CV_DESCALE(src[x] * iw00 + src[x + cn] * iw01 + + src1[x] * iw10 + src1[x + cn] * iw11, W_BITS1 - 5); + int ixval = CV_DESCALE(dsrc[0] * iw00 + dsrc[cn2] * iw01 + + dsrc1[0] * iw10 + dsrc1[cn2] * iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1] * iw00 + dsrc[cn2 + 1] * iw01 + dsrc1[1] * iw10 + + dsrc1[cn2 + 1] * iw11, W_BITS1); + + Iptr[x] = (short)ival; + dIptr[0] = (short)ixval; + dIptr[1] = (short)iyval; + } +#endif + } + + cv::Mat residualMat = cv::Mat::zeros(winSize.height * (winSize.width + 8) * cn, 1, CV_16SC1); + + cv::Point2f backUpNextPt = nextPt; + nextPt += halfWin; + int j; +#ifdef RLOF_SSE + __m128i mmMask0, mmMask1, mmMask; + getWBitMask(winSize.width, mmMask0, mmMask1, mmMask); +#endif + float MEstimatorScale = 1; + int buffIdx = 0; + cv::Point2f prevDelta(0, 0); + + for (j = 0; j < criteria.maxCount; j++) + { + inextPt.x = cvFloor(nextPt.x); + inextPt.y = cvFloor(nextPt.y); + + if( inextPt.x < 0 || inextPt.x >= J.cols - winSize.width || + inextPt.y < 0 || inextPt.y >= J.rows - winSize.height - 1) + { + if (level == 0 && status) + status[ptidx] = 3; + if (level > 0) + nextPts[ptidx] = backUpNextPt; + break; + } + + + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + b1 = 0; + b2 = 0; + + if (j == 0 ) + { + A11 = 0; + A12 = 0; + A22 = 0; + } + + if (j == 0 ) + { + buffIdx = 0; + for (y = 0; y < winSize.height; y++) + { + const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const uchar* Jptr1 = J.ptr(y + inextPt.y + 1, inextPt.x*cn); + const short* Iptr = IWinBuf.ptr(y, 0); + const short* dIptr = derivIWinBuf.ptr(y, 0); + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); + x = 0; + for (; x < winSize.width*cn; x++, dIptr += 2) + { + if (maskPtr[x] == 0) + continue; + int diff = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + Jptr1[x] * iw10 + Jptr1[x + cn] * iw11, W_BITS1 - 5) - Iptr[x]; + residualMat.at(buffIdx++) = static_cast(diff); + } + } + /*! Estimation for the residual */ + cv::Mat residualMatRoi(residualMat, cv::Rect(0, 0, 1, buffIdx)); + MEstimatorScale = (buffIdx == 0) ? 
1.f : estimateScale(residualMatRoi);
+                }
+
+                float eta = 1.f / winArea;
+                float fParam0 = normSigma0 * MEstimatorScale;
+                float fParam1 = normSigma1 * MEstimatorScale;
+#ifdef RLOF_SSE
+                qw0 = _mm_set1_epi32(iw00 + (iw01 << 16));
+                qw1 = _mm_set1_epi32(iw10 + (iw11 << 16));
+                __m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps();
+                __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps();
+                __m128i mmParam0 = _mm_set1_epi16(MIN(std::numeric_limits<short>::max() - 1, static_cast<short>(fParam0)));
+                __m128i mmParam1 = _mm_set1_epi16(MIN(std::numeric_limits<short>::max() - 1, static_cast<short>(fParam1)));
+                float s2Val = std::fabs(normSigma2);
+                int s2bitShift = normSigma2 == 0 ? 1 : cvCeil(log(200.f / s2Val) / log(2.f));
+                __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast<short>(normSigma2 * (float)(1 << s2bitShift)));
+                __m128i mmOness_epi16 = _mm_set1_epi16(1 << s2bitShift);
+                __m128 mmParam2s = _mm_set1_ps(0.01f * normSigma2);
+                __m128 mmParam2s2 = _mm_set1_ps(normSigma2 * normSigma2);
+                __m128 mmOnes = _mm_set1_ps(1.f);
+                __m128i mmEta = _mm_setzero_si128();
+                __m128i mmScale = _mm_set1_epi16(static_cast<short>(MEstimatorScale));
+
+#endif
+
+                buffIdx = 0;
+                for (y = 0; y < winSize.height; y++)
+                {
+                    const uchar* Jptr = J.ptr<uchar>(y + inextPt.y, inextPt.x*cn);
+                    const uchar* Jptr1 = J.ptr<uchar>(y + inextPt.y + 1, inextPt.x*cn);
+                    const short* Iptr = IWinBuf.ptr<short>(y, 0);
+                    const short* dIptr = derivIWinBuf.ptr<short>(y, 0);
+                    const tMaskType* maskPtr = winMaskMat.ptr<tMaskType>(y, 0);
+                    x = 0;
+#ifdef RLOF_SSE
+                    for (; x <= winSize.width*cn; x += 8, dIptr += 8 * 2)
+                    {
+                        __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16);
+                        __m128i diff0, diff1;
+                        __m128i I_0_7_epi16 = _mm_loadu_si128((const __m128i*)(Iptr + x)); // elements 0 to 7
+
+                        __m128i v00 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x)), z);
+                        __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z);
+                        __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x)), z);
+                        __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x + cn)), z);
+
+                        __m128i t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0),
+                            _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1));
+                        __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0),
+                            _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1));
+                        t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1 - 5);
+                        t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1 - 5);
+
+                        __m128i mmDiff_epi16 = _mm_subs_epi16(_mm_packs_epi32(t0, t1), I_0_7_epi16);
+
+                        mmDiff_epi16 = _mm_and_si128(mmDiff_epi16, mask_0_7_epi16);
+
+                        __m128i scalediffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, mmScale);
+                        mmEta = _mm_add_epi16(mmEta, _mm_add_epi16(_mm_and_si128(scalediffIsPos_epi16, _mm_set1_epi16(2)), _mm_set1_epi16(-1)));
+
+                        __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ...
+                        __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8));
+
+                        __m128i abs_epi16 = _mm_abs_epi16(mmDiff_epi16);
+                        __m128i bSet2_epi16, bSet1_epi16;
+                        // |It| < sigma1 ?
+                        bSet2_epi16 = _mm_cmplt_epi16(abs_epi16, mmParam1);
+                        // It > 0 ?
+                        __m128i diffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, _mm_setzero_si128());
+                        // sigma0 < |It| < sigma1 ?
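+                        // (bSet1 below marks exactly this middle band.) Together the two
+                        // masks realize the shrunk Hampel norm of @cite Senst2012 on the
+                        // residual It, with thresholds rescaled by the current M-estimator
+                        // scale: fParam0 = normSigma0 * MEstimatorScale and fParam1 =
+                        // normSigma1 * MEstimatorScale. A scalar sketch mirroring the
+                        // non-SSE branch of this file:
+                        //   if      (|It| > fParam1)             It = 0;                           // reject outliers
+                        //   else if (|It| > fParam0 && It >= 0)  It = normSigma2 * (It - fParam1); // redescending part
+                        //   else if (|It| > fParam0 && It <  0)  It = normSigma2 * (It + fParam1);
+                        //   else                                 It is kept unchanged;            // least-squares region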
+ bSet1_epi16 = _mm_and_si128(bSet2_epi16, _mm_cmpgt_epi16(abs_epi16, mmParam0)); + // val = |It| -/+ sigma1 + __m128i tmpParam1_epi16 = _mm_add_epi16(_mm_and_si128(diffIsPos_epi16, _mm_sub_epi16(mmDiff_epi16, mmParam1)), + _mm_andnot_si128(diffIsPos_epi16, _mm_add_epi16(mmDiff_epi16, mmParam1))); + // It == 0 ? |It| > sigma13 + mmDiff_epi16 = _mm_and_si128(bSet2_epi16, mmDiff_epi16); + // It == val ? sigma0 < |It| < sigma1 + mmDiff_epi16 = _mm_blendv_epi8(mmDiff_epi16, tmpParam1_epi16, bSet1_epi16); + + + __m128i tale_epi16_ = _mm_blendv_epi8(mmOness_epi16, mmParam2_epi16, bSet1_epi16); // mask for 0 - 3 + // diff = diff * sigma2 + __m128i lo = _mm_mullo_epi16(tale_epi16_, mmDiff_epi16); + __m128i hi = _mm_mulhi_epi16(tale_epi16_, mmDiff_epi16); + __m128i diff_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), s2bitShift); //diff 0_3_epi32 + __m128i diff_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), s2bitShift); // diff 4_7_epi32 + + mmDiff_epi16 = _mm_packs_epi32(diff_0_3_epi32, diff_4_7_epi32); + diff1 = _mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13... + diff0 = _mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9... + + // Ix * diff / Iy * diff + v10 = _mm_mullo_epi16(Ixy_0, diff0); + v11 = _mm_mulhi_epi16(Ixy_0, diff0); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); + qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); + + // It * Ix It * Iy [4 ... 7] + // for set 1 hi sigma 1 + + v10 = _mm_mullo_epi16(Ixy_1, diff1); + v11 = _mm_mulhi_epi16(Ixy_1, diff1); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); + qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); + + + if (j == 0) + { + __m128 bSet1_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet1_epi16)); + __m128 bSet1_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet1_epi16, bSet1_epi16), 16)); + __m128 mask_0_4_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mask_0_7_epi16)); + __m128 mask_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mask_0_7_epi16, mask_0_7_epi16), 16))); + + __m128 bSet2_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet2_epi16)); + __m128 bSet2_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet2_epi16, bSet2_epi16), 16)); + + __m128 tale_0_3_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_0_3_ps); + __m128 tale_4_7_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_4_7_ps); + tale_0_3_ps = _mm_blendv_ps(mmParam2s, tale_0_3_ps, bSet2_0_3_ps); + tale_4_7_ps = _mm_blendv_ps(mmParam2s, tale_4_7_ps, bSet2_4_7_ps); + + tale_0_3_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_0_3_ps, mask_0_4_ps); + tale_4_7_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_4_7_ps, mask_4_7_ps); + + t0 = _mm_srai_epi32(Ixy_0, 16); // Iy0 Iy1 Iy2 Iy3 + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16); // Ix0 Ix1 Ix2 Ix3 + + __m128 fy = _mm_cvtepi32_ps(t0); + __m128 fx = _mm_cvtepi32_ps(t1); + + // A11 - A22 + __m128 fxtale = _mm_mul_ps(fx, tale_0_3_ps); + __m128 fytale = _mm_mul_ps(fy, tale_0_3_ps); + + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); + mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); + mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); + + t0 = _mm_srai_epi32(Ixy_1, 16); // Iy8 Iy9 Iy10 Iy11 + t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_1, 16), 16); // Ix0 Ix1 Ix2 Ix3 + + fy = _mm_cvtepi32_ps(t0); + fx = _mm_cvtepi32_ps(t1); + + // A11 - A22 + fxtale = _mm_mul_ps(fx, 
tale_4_7_ps); + fytale = _mm_mul_ps(fy, tale_4_7_ps); + + mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale)); + mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale)); + mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale)); + } + } +#else + for (; x < winSize.width*cn; x++, dIptr += 2) + { + if (maskPtr[x] == 0) + continue; + int diff = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + + Jptr1[x] * iw10 + Jptr1[x + cn] * iw11, + W_BITS1 - 5) - Iptr[x]; + + + + + + if (diff > MEstimatorScale) + MEstimatorScale += eta; + if (diff < MEstimatorScale) + MEstimatorScale -= eta; + + int abss = (diff < 0) ? -diff : diff; + if (abss > fParam1) + { + diff = 0; + } + else if (abss > fParam0 && diff >= 0) + { + //diff = fParam1 * (diff - fParam1); + diff = static_cast(normSigma2 * (diff - fParam1)); + } + else if (abss > fParam0 && diff < 0) + { + //diff = fParam1 * (diff + fParam1); + diff = static_cast(normSigma2 * (diff + fParam1)); + + } + + float ixval = (float)(dIptr[0]); + float iyval = (float)(dIptr[1]); + b1 += (float)(diff*ixval); + b2 += (float)(diff*iyval); + + if ( j == 0) + { + float tale = normSigma2 * FLT_RESCALE; + if (abss < fParam0) + { + tale = FLT_RESCALE; + } + else if (abss > fParam1) + { + tale *= 0.01f; + } + else + { + tale *= normSigma2; + } + A11 += (float)(ixval*ixval*tale); + A12 += (float)(ixval*iyval*tale); + A22 += (float)(iyval*iyval*tale); + } + + + } +#endif + } + +#ifdef RLOF_SSE + short etaValues[8]; + _mm_storeu_si128((__m128i*)(etaValues), mmEta); + MEstimatorScale += eta * (etaValues[0] + etaValues[1] + etaValues[2] + etaValues[3] + + etaValues[4] + etaValues[5] + etaValues[6] + etaValues[7]); + + +#endif + if (j == 0) + { +#ifdef RLOF_SSE + float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];// + + _mm_store_ps(A11buf, mmAxx); + _mm_store_ps(A12buf, mmAxy); + _mm_store_ps(A22buf, mmAyy); + + + A11 = A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; + A12 = A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; + A22 = A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; +#endif + A11 *= FLT_SCALE; + A12 *= FLT_SCALE; + A22 *= FLT_SCALE; + + + D = A11 * A22 - A12 * A12; + minEig = (A22 + A11 - std::sqrt((A11 - A22)*(A11 - A22) + + 4.f*A12*A12)) / (2 * winArea); + D = 1.f / D; + + if (minEig < minEigThreshold || std::abs(D) < FLT_EPSILON) + { + if (level == 0 && status) + status[ptidx] = 0; + if (level > 0) + { + nextPts[ptidx] = backUpNextPt; + } + break; + } + } + +#ifdef RLOF_SSE + float CV_DECL_ALIGNED(16) bbuf[4]; + _mm_store_ps(bbuf, _mm_add_ps(qb0, qb1)); + b1 += bbuf[0] + bbuf[2]; + b2 += bbuf[1] + bbuf[3]; +#endif + + b1 *= FLT_SCALE; + b2 *= FLT_SCALE; + + + Point2f delta((float)((A12*b2 - A22 * b1) * D), (float)((A12*b1 - A11 * b2) * D)); + + delta.x = (delta.x != delta.x) ? 0 : delta.x; + delta.y = (delta.y != delta.y) ? 
0 : delta.y; + + nextPt += delta * 0.7; + nextPts[ptidx] = nextPt - halfWin; + + if (j > 0 && std::abs(delta.x - prevDelta.x) < 0.01 && + std::abs(delta.y - prevDelta.y) < 0.01) + { + nextPts[ptidx] -= delta * 0.5f; + break; + } + + prevDelta = delta; + + } + + } + + } + + const Mat* prevImg; + const Mat* nextImg; + const Mat* prevDeriv; + const Mat* rgbPrevImg; + const Mat* rgbNextImg; + const Point2f* prevPts; + Point2f* nextPts; + uchar* status; + float* err; + int maxWinSize; + int minWinSize; + TermCriteria criteria; + int level; + int maxLevel; + int windowType; + float minEigThreshold; + bool useInitialFlow; + const float normSigma0, normSigma1, normSigma2; + int crossSegmentationThreshold; +}; + +} // namespace +namespace radial { + +class TrackerInvoker : public cv::ParallelLoopBody +{ +public: + TrackerInvoker( + const Mat& _prevImg, + const Mat& _prevDeriv, + const Mat& _nextImg, + const Mat& _rgbPrevImg, + const Mat& _rgbNextImg, + const Point2f* _prevPts, + Point2f* _nextPts, + uchar* _status, + float* _err, + Point2f* _gainVecs, + int _level, + int _maxLevel, + int _winSize[2], + int _maxIteration, + bool _useInitialFlow, + int _supportRegionType, + const std::vector& _normSigmas, + float _minEigenValue, + int _crossSegmentationThreshold + ) : + normSigma0(_normSigmas[0]), + normSigma1(_normSigmas[1]), + normSigma2(_normSigmas[2]) + { + prevImg = &_prevImg; + prevDeriv = &_prevDeriv; + nextImg = &_nextImg; + rgbPrevImg = &_rgbPrevImg; + rgbNextImg = &_rgbNextImg; + prevPts = _prevPts; + nextPts = _nextPts; + status = _status; + err = _err; + gainVecs = _gainVecs; + minWinSize = _winSize[0]; + maxWinSize = _winSize[1]; + criteria.maxCount = _maxIteration; + criteria.epsilon = 0.01; + level = _level; + maxLevel = _maxLevel; + windowType = _supportRegionType; + minEigThreshold = _minEigenValue; + useInitialFlow = _useInitialFlow; + crossSegmentationThreshold = _crossSegmentationThreshold; + } + + void operator()(const cv::Range& range) const CV_OVERRIDE + { +#ifdef DEBUG_INVOKER + printf("rlof::radial");fflush(stdout); +#endif + Point2f halfWin; + cv::Size winSize; + const Mat& I = *prevImg; + const Mat& J = *nextImg; + const Mat& derivI = *prevDeriv; + const Mat& BI = *rgbPrevImg; + + + const float FLT_SCALE = (1.f / (1 << 16)); + + winSize = cv::Size(maxWinSize, maxWinSize); + int winMaskwidth = roundUp(winSize.width, 16); + cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType); + winMaskMatBuf.setTo(1); + + int cn = I.channels(), cn2 = cn * 2; + int winbufwidth = roundUp(winSize.width, 16); + cv::Size winBufSize(winbufwidth, winbufwidth); + + cv::Matx44f invTensorMat; + cv::Vec4f mismatchMat; + cv::Vec4f resultMat; + + + std::vector _buf(winBufSize.area()*(cn + cn2)); + Mat IWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn), &_buf[0]); + Mat derivIWinBuf(winBufSize, CV_MAKETYPE(CV_16S, cn2), &_buf[winBufSize.area()*cn]); + + for (int ptidx = range.start; ptidx < range.end; ptidx++) + { + + Point2f prevPt = prevPts[ptidx] * (float)(1. / (1 << level)); + Point2f nextPt; + if (level == maxLevel) + { + if (useInitialFlow) + { + nextPt = nextPts[ptidx] * (float)(1. 
/ (1 << level)); + } + else + nextPt = prevPt; + } + else + { + nextPt = nextPts[ptidx] * 2.f; + + } + nextPts[ptidx] = nextPt; + + Point2i iprevPt, inextPt; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + int winArea = maxWinSize * maxWinSize; + cv::Mat winMaskMat(winMaskMatBuf, cv::Rect(0, 0, maxWinSize, maxWinSize)); + winMaskMatBuf.setTo(0); + if (calcWinMaskMat(BI, windowType, iprevPt, + winMaskMat, winSize, halfWin, winArea, + minWinSize, maxWinSize) == false) + continue; + halfWin = Point2f(static_cast(maxWinSize) ,static_cast(maxWinSize) ) - halfWin; + prevPt += halfWin; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + if( iprevPt.x < 0 || iprevPt.x >= derivI.cols - winSize.width || + iprevPt.y < 0 || iprevPt.y >= derivI.rows - winSize.height - 1) + { + if (level == 0) + { + if (status) + status[ptidx] = 3; + if (err) + err[ptidx] = 0; + } + continue; + } + + float a = prevPt.x - iprevPt.x; + float b = prevPt.y - iprevPt.y; + const int W_BITS = 14, W_BITS1 = 14; + + int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + + float A11 = 0, A12 = 0, A22 = 0; + float b1 = 0, b2 = 0, b3 = 0, b4 = 0; + // tensor + float sumIx = 0; + float sumIy = 0; + float sumI = 0; + float sumW = 0; + float w1 = 0, w2 = 0; // -IyI + float dI = 0; // I^2 + float D = 0; + +#ifdef RLOF_SSE + __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128i z = _mm_setzero_si128(); + __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1 - 1)); + __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1 - 5 - 1)); + __m128i mmMask4_epi32; + __m128i mmMaskSet_epi16 = _mm_set1_epi16(std::numeric_limits::max()); + get4BitMask(winSize.width, mmMask4_epi32); +#endif + + // extract the patch from the first image, compute covariation matrix of derivatives + int x, y; + for (y = 0; y < winSize.height; y++) + { + const uchar* src = I.ptr(y + iprevPt.y, 0) + iprevPt.x*cn; + const uchar* src1 = I.ptr(y + iprevPt.y + 1, 0) + iprevPt.x*cn; + const short* dsrc = derivI.ptr(y + iprevPt.y, 0) + iprevPt.x*cn2; + const short* dsrc1 = derivI.ptr(y + iprevPt.y + 1, 0) + iprevPt.x*cn2; + short* Iptr = IWinBuf.ptr(y, 0); + short* dIptr = derivIWinBuf.ptr(y, 0); + x = 0; + +#ifdef RLOF_SSE + const tMaskType* maskPtr = winMaskMat.ptr(y, 0); + for (; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4 * 2, dsrc1 += 8, dIptr += 4 * 2) + { + __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16); + __m128i mask_0_3_epi16 = _mm_unpacklo_epi16(mask_0_7_epi16, mask_0_7_epi16); + __m128i v00, v01, v10, v11, t0, t1; + v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); + v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); + v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x)), z); + v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x + cn)), z); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1 - 5); + if (x + 4 > winSize.width) + { + t0 = _mm_and_si128(t0, mmMask4_epi32); + } + t0 = _mm_and_si128(t0, mask_0_3_epi16); + _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0, t0)); + + + v00 = _mm_loadu_si128((const __m128i*)(dsrc)); + v01 = 
_mm_loadu_si128((const __m128i*)(dsrc + cn2)); + v10 = _mm_loadu_si128((const __m128i*)(dsrc1)); + v11 = _mm_loadu_si128((const __m128i*)(dsrc1 + cn2)); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); + v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... + if (x + 4 > winSize.width) + { + v00 = _mm_and_si128(v00, mmMask4_epi32); + } + v00 = _mm_and_si128(v00, mask_0_3_epi16); + _mm_storeu_si128((__m128i*)dIptr, v00); + } +#else + + for (; x < winSize.width*cn; x++, dsrc += 2, dsrc1 += 2, dIptr += 2) + { + if (winMaskMat.at(y, x) == 0) + { + dIptr[0] = 0; + dIptr[1] = 0; + continue; + } + int ival = CV_DESCALE(src[x] * iw00 + src[x + cn] * iw01 + + src1[x] * iw10 + src1[x + cn] * iw11, W_BITS1 - 5); + int ixval = CV_DESCALE(dsrc[0] * iw00 + dsrc[cn2] * iw01 + + dsrc1[0] * iw10 + dsrc1[ cn2] * iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1] * iw00 + dsrc[cn2 + 1] * iw01 + dsrc1[1] * iw10 + + dsrc1[cn2 + 1] * iw11, W_BITS1); + + Iptr[x] = (short)ival; + dIptr[0] = (short)ixval; + dIptr[1] = (short)iyval; + + } +#endif + } + + cv::Mat residualMat = cv::Mat::zeros(winSize.height * (winSize.width + 8) * cn, 1, CV_16SC1); + + cv::Point2f backUpNextPt = nextPt; + nextPt += halfWin; + Point2f prevDelta(0, 0); //related to h(t-1) + Point2f prevGain(0, 0); + cv::Point2f gainVec = gainVecs[ptidx]; + cv::Point2f backUpGain = gainVec; + cv::Size _winSize = winSize; + int j; +#ifdef RLOF_SSE + __m128i mmMask0, mmMask1, mmMask; + getWBitMask(_winSize.width, mmMask0, mmMask1, mmMask); + __m128 mmOnes = _mm_set1_ps(1.f); +#endif + float MEstimatorScale = 1; + int buffIdx = 0; + float minEigValue; + + for (j = 0; j < criteria.maxCount; j++) + { + + inextPt.x = cvFloor(nextPt.x); + inextPt.y = cvFloor(nextPt.y); + + if( inextPt.x < 0 || inextPt.x >= J.cols - winSize.width || + inextPt.y < 0 || inextPt.y >= J.rows - winSize.height - 1) + { + if (level == 0 && status) + status[ptidx] = 3; + if (level > 0) + { + nextPts[ptidx] = backUpNextPt; + gainVecs[ptidx] = backUpGain; + } + break; + } + + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + + // mismatch + b1 = 0; + b2 = 0; + b3 = 0; + b4 = 0; + if (j == 0) + { + // tensor + w1 = 0; // -IxI + w2 = 0; // -IyI + dI = 0; // I^2 + sumIx = 0; + sumIy = 0; + sumI = 0; + sumW = 0; + A11 = 0; + A12 = 0; + A22 = 0; + } + + if (j == 0 ) + { + buffIdx = 0; + for (y = 0; y < winSize.height; y++) + { + const uchar* Jptr = J.ptr(y + inextPt.y, inextPt.x*cn); + const uchar* Jptr1 = J.ptr(y + inextPt.y + 1, inextPt.x*cn); + const short* Iptr = IWinBuf.ptr(y, 0); + const short* dIptr = derivIWinBuf.ptr(y, 0); + x = 0; + for (; x < winSize.width*cn; x++, dIptr += 2) + { + if (dIptr[0] == 0 && dIptr[1] == 0) + continue; + int diff = static_cast(CV_DESCALE( Jptr[x] * iw00 + + Jptr[x + cn] * iw01 + + Jptr1[x] * iw10 + + Jptr1[x + cn] * iw11, W_BITS1 - 5) + - Iptr[x] + Iptr[x] * gainVec.x + gainVec.y); + residualMat.at(buffIdx++) = static_cast(diff); + } + } + /*! 
Estimation for the residual */
+                    cv::Mat residualMatRoi(residualMat, cv::Rect(0, 0, 1, buffIdx));
+                    MEstimatorScale = (buffIdx == 0) ? 1.f : estimateScale(residualMatRoi);
+                }
+
+                float eta = 1.f / winArea;
+                float fParam0 = normSigma0 * MEstimatorScale;
+                float fParam1 = normSigma1 * MEstimatorScale;
+
+#ifdef RLOF_SSE
+
+                qw0 = _mm_set1_epi32(iw00 + (iw01 << 16));
+                qw1 = _mm_set1_epi32(iw10 + (iw11 << 16));
+                __m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps(), qb2 = _mm_setzero_ps();
+                __m128 qb3 = _mm_setzero_ps();
+                __m128 mmSumW1 = _mm_setzero_ps(), mmSumW2 = _mm_setzero_ps();
+                __m128 mmSumI = _mm_setzero_ps(), mmSumW = _mm_setzero_ps(), mmSumDI = _mm_setzero_ps();
+                __m128 mmSumIy = _mm_setzero_ps(), mmSumIx = _mm_setzero_ps();
+                __m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps();
+                __m128i mmParam0 = _mm_set1_epi16(MIN(std::numeric_limits<short>::max() - 1, static_cast<short>(fParam0)));
+                __m128i mmParam1 = _mm_set1_epi16(MIN(std::numeric_limits<short>::max() - 1, static_cast<short>(fParam1)));
+
+                float s2Val = std::fabs(normSigma2);
+                int s2bitShift = normSigma2 == 0 ? 1 : cvCeil(log(200.f / s2Val) / log(2.f));
+                __m128i mmParam2_epi16 = _mm_set1_epi16(static_cast<short>(normSigma2 * (float)(1 << s2bitShift)));
+                __m128i mmOness_epi16 = _mm_set1_epi16(1 << s2bitShift);
+                __m128 mmParam2s = _mm_set1_ps(0.01f * normSigma2);
+                __m128 mmParam2s2 = _mm_set1_ps(normSigma2 * normSigma2);
+
+                float gainVal = gainVec.x > 0 ? gainVec.x : -gainVec.x;
+                int bitShift = gainVec.x == 0 ? 1 : cvCeil(log(200.f / gainVal) / log(2.f));
+
+                __m128i mmGainValue_epi16 = _mm_set1_epi16(static_cast<short>(gainVec.x * (float)(1 << bitShift)));
+                __m128i mmConstValue_epi16 = _mm_set1_epi16(static_cast<short>(gainVec.y));
+                __m128i mmEta = _mm_setzero_si128();
+                __m128i mmScale = _mm_set1_epi16(static_cast<short>(MEstimatorScale));
+
+#endif
+                buffIdx = 0;
+
+                for (y = 0; y < _winSize.height; y++)
+                {
+                    const uchar* Jptr = J.ptr<uchar>(y + inextPt.y, inextPt.x*cn);
+                    const uchar* Jptr1 = J.ptr<uchar>(y + inextPt.y + 1, inextPt.x*cn);
+                    const short* Iptr = IWinBuf.ptr<short>(y, 0);
+                    const short* dIptr = derivIWinBuf.ptr<short>(y, 0);
+                    const tMaskType* maskPtr = winMaskMat.ptr<tMaskType>(y, 0);
+                    x = 0;
+#ifdef RLOF_SSE
+
+                    for (; x <= _winSize.width*cn; x += 8, dIptr += 8 * 2)
+                    {
+                        __m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16);
+                        __m128i diff0, diff1;
+                        __m128i I_0_7_epi16 = _mm_loadu_si128((const __m128i*)(Iptr + x)); // elements 0 to 7
+                        __m128i v00 = _mm_unpacklo_epi8(
+                            _mm_loadl_epi64((const __m128i*)(Jptr + x)), z); // J0, 0, J1, 0, J2, 0, ..., J7, 0
+                        __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z);
+                        __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x)), z);
+                        __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x + cn)), z);
+
+                        __m128i t0 = _mm_add_epi32(
+                            _mm_madd_epi16(
+                                _mm_unpacklo_epi16(v00, v01), // J0, J0', J1, J1', J2, J2', J3, J3' (J' = pixel at x+cn)
+                                qw0),
+                            _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1));
+                        __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0),
+                            _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1));
+
+                        t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1 - 5);
+                        t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1 - 5);
+
+                        __m128i lo = _mm_mullo_epi16(mmGainValue_epi16, I_0_7_epi16);
+                        __m128i hi = _mm_mulhi_epi16(mmGainValue_epi16, I_0_7_epi16);
+
+                        __m128i Igain_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), bitShift);
+                        __m128i Igain_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), bitShift);
+                        __m128i Igain_epi16 = _mm_packs_epi32(Igain_0_3_epi32, Igain_4_7_epi32);
+
+                        // diff = J - I + I*gain.x + gain.y
+                        __m128i mmDiff_epi16 = _mm_add_epi16(
+                            _mm_subs_epi16(_mm_packs_epi32(t0, t1), I_0_7_epi16), // J - I
+                            _mm_add_epi16(Igain_epi16, mmConstValue_epi16));
+
+                        mmDiff_epi16 = _mm_and_si128(mmDiff_epi16, mask_0_7_epi16);
+
+                        __m128i scalediffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, mmScale);
+                        mmEta = _mm_add_epi16(mmEta, _mm_add_epi16(_mm_and_si128(scalediffIsPos_epi16, _mm_set1_epi16(2)), _mm_set1_epi16(-1)));
+
+                        __m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ...
+                        __m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8));
+
+                        __m128i abs_epi16 = _mm_abs_epi16(mmDiff_epi16);
+                        __m128i bSet2_epi16, bSet1_epi16;
+                        // |It| < sigma1 ?
+                        bSet2_epi16 = _mm_cmplt_epi16(abs_epi16, mmParam1);
+                        // It > 0 ?
+                        __m128i diffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, _mm_setzero_si128());
+                        // sigma0 < |It| < sigma1 ?
+                        bSet1_epi16 = _mm_and_si128(bSet2_epi16, _mm_cmpgt_epi16(abs_epi16, mmParam0));
+                        // val = |It| -/+ sigma1
+                        __m128i tmpParam1_epi16 = _mm_add_epi16(_mm_and_si128(diffIsPos_epi16, _mm_sub_epi16(mmDiff_epi16, mmParam1)),
+                            _mm_andnot_si128(diffIsPos_epi16, _mm_add_epi16(mmDiff_epi16, mmParam1)));
+                        // It = 0 where |It| > sigma1
+                        mmDiff_epi16 = _mm_and_si128(bSet2_epi16, mmDiff_epi16);
+                        // It = val where sigma0 < |It| < sigma1
+                        mmDiff_epi16 = _mm_blendv_epi8(mmDiff_epi16, tmpParam1_epi16, bSet1_epi16);
+
+                        __m128i tale_epi16_ = _mm_blendv_epi8(mmOness_epi16, mmParam2_epi16, bSet1_epi16); // mask for 0 - 3
+                        // diff = diff * sigma2
+                        lo = _mm_mullo_epi16(tale_epi16_, mmDiff_epi16);
+                        hi = _mm_mulhi_epi16(tale_epi16_, mmDiff_epi16);
+                        __m128i diff_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), s2bitShift); // diff 0_3_epi32
+                        __m128i diff_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), s2bitShift); // diff 4_7_epi32
+
+                        mmDiff_epi16 = _mm_packs_epi32(diff_0_3_epi32, diff_4_7_epi32); // the result should fit into 16 bit
+
+                        diff1 = _mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13...
+                        diff0 = _mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9...
+
+                        // Ix * diff / Iy * diff
+                        v10 = _mm_mullo_epi16(Ixy_0, diff0);
+                        v11 = _mm_mulhi_epi16(Ixy_0, diff0);
+                        v00 = _mm_unpacklo_epi16(v10, v11);
+                        v10 = _mm_unpackhi_epi16(v10, v11);
+                        qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00));
+                        qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10));
+
+                        // It * Ix, It * Iy [4 ... 7]
+                        // for set 1 hi sigma 1
+
+                        v10 = _mm_mullo_epi16(Ixy_1, diff1);
+                        v11 = _mm_mulhi_epi16(Ixy_1, diff1);
+                        v00 = _mm_unpacklo_epi16(v10, v11);
+                        v10 = _mm_unpackhi_epi16(v10, v11);
+                        qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00));
+                        qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10));
+                        // diff * I [0 ... 7]
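+                        // The two extra right-hand-side terms below come from the linear
+                        // illumination model of @cite Senst2016: besides b1/b2 (It*Ix and
+                        // It*Iy), the gain update needs
+                        //   b3 += It * I   // drives the multiplicative gain gainVec.x
+                        //   b4 += It       // drives the additive offset gainVec.y
+                        // qb2 and qb3 accumulate these two sums in packed floats.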
+                        // for set 1 sigma 1
+                        // b3 += diff * Iptr[x]
+
+                        v10 = _mm_mullo_epi16(mmDiff_epi16, I_0_7_epi16);
+                        v11 = _mm_mulhi_epi16(mmDiff_epi16, I_0_7_epi16);
+                        v00 = _mm_unpacklo_epi16(v10, v11);
+                        v10 = _mm_unpackhi_epi16(v10, v11);
+                        qb2 = _mm_add_ps(qb2, _mm_cvtepi32_ps(v00));
+                        qb2 = _mm_add_ps(qb2, _mm_cvtepi32_ps(v10));
+
+                        qb3 = _mm_add_ps(qb3, _mm_cvtepi32_ps(diff_0_3_epi32));
+                        qb3 = _mm_add_ps(qb3, _mm_cvtepi32_ps(diff_4_7_epi32));
+
+                        if (j == 0)
+                        {
+                            __m128 bSet1_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet1_epi16));
+                            __m128 bSet1_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet1_epi16, bSet1_epi16), 16));
+                            __m128 mask_0_4_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mask_0_7_epi16));
+                            __m128 mask_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mask_0_7_epi16, mask_0_7_epi16), 16)));
+
+                            __m128 bSet2_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet2_epi16));
+                            __m128 bSet2_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet2_epi16, bSet2_epi16), 16));
+
+                            __m128 tale_0_3_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_0_3_ps);
+                            __m128 tale_4_7_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_4_7_ps);
+                            tale_0_3_ps = _mm_blendv_ps(mmParam2s, tale_0_3_ps, bSet2_0_3_ps);
+                            tale_4_7_ps = _mm_blendv_ps(mmParam2s, tale_4_7_ps, bSet2_4_7_ps);
+
+                            tale_0_3_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_0_3_ps, mask_0_4_ps);
+                            tale_4_7_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_4_7_ps, mask_4_7_ps);
+                            t0 = _mm_srai_epi32(Ixy_0, 16);
+                            t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16);
+
+                            __m128 fy = _mm_cvtepi32_ps(t0);
+                            __m128 fx = _mm_cvtepi32_ps(t1);
+
+                            // 0 ... 3
+                            __m128 I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(I_0_7_epi16, I_0_7_epi16), 16));
+
+                            // A11 - A22
+                            __m128 fxtale = _mm_mul_ps(fx, tale_0_3_ps);
+                            __m128 fytale = _mm_mul_ps(fy, tale_0_3_ps);
+
+                            mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale));
+                            mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale));
+                            mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale));
+
+                            // sumIx and sumIy
+                            mmSumIx = _mm_add_ps(mmSumIx, fxtale);
+                            mmSumIy = _mm_add_ps(mmSumIy, fytale);
+
+                            mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fxtale));
+                            mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fytale));
+
+                            // sumI
+                            __m128 I_tale_ps = _mm_mul_ps(I_ps, tale_0_3_ps);
+                            mmSumI = _mm_add_ps(mmSumI, I_tale_ps);
+
+                            // sumW
+                            mmSumW = _mm_add_ps(mmSumW, tale_0_3_ps);
+
+                            // sumDI
+                            mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps(I_ps, I_tale_ps));
+
+                            t0 = _mm_srai_epi32(Ixy_1, 16); // Iy4 Iy5 Iy6 Iy7
+                            t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_1, 16), 16); // Ix4 Ix5 Ix6 Ix7
+
+                            fy = _mm_cvtepi32_ps(t0);
+                            fx = _mm_cvtepi32_ps(t1);
+
+                            // 4 ... 7
+                            I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(I_0_7_epi16, I_0_7_epi16), 16));
+
+                            // A11 - A22
+                            fxtale = _mm_mul_ps(fx, tale_4_7_ps);
+                            fytale = _mm_mul_ps(fy, tale_4_7_ps);
+
+                            mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale));
+                            mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale));
+                            mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale));
+
+                            // sumIx and sumIy
+                            mmSumIx = _mm_add_ps(mmSumIx, fxtale);
+                            mmSumIy = _mm_add_ps(mmSumIy, fytale);
+
+                            mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fxtale));
+                            mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fytale));
+
+                            // sumI
+                            I_tale_ps = _mm_mul_ps(I_ps, tale_4_7_ps);
+                            mmSumI = _mm_add_ps(mmSumI, I_tale_ps);
+
+                            mmSumW = _mm_add_ps(mmSumW, tale_4_7_ps);
+
+                            mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps(I_ps, I_tale_ps));
+                        }
+
+                    }
+#else
+                    for (; x < _winSize.width*cn; x++, dIptr += 2)
+                    {
+                        if (maskPtr[x] == 0)
+                            continue;
+                        int J_val = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + Jptr1[x] * iw10 + Jptr1[x + cn] * iw11,
+                            W_BITS1 - 5);
+                        short ixval = static_cast<short>(dIptr[0]);
+                        short iyval = static_cast<short>(dIptr[1]);
+                        int diff = static_cast<int>(J_val - Iptr[x] + Iptr[x] * gainVec.x + gainVec.y);
+                        int abss = (diff < 0) ? -diff : diff;
+                        if (diff > MEstimatorScale)
+                            MEstimatorScale += eta;
+                        if (diff < MEstimatorScale)
+                            MEstimatorScale -= eta;
+                        if (abss > static_cast<int>(fParam1))
+                        {
+                            diff = 0;
+                        }
+                        else if (abss > static_cast<int>(fParam0) && diff >= 0)
+                        {
+                            diff = static_cast<int>(normSigma2 * (diff - fParam1));
+                        }
+                        else if (abss > static_cast<int>(fParam0) && diff < 0)
+                        {
+                            diff = static_cast<int>(normSigma2 * (diff + fParam1));
+                        }
+                        b1 += (float)(diff*ixval);
+                        b2 += (float)(diff*iyval);
+                        b3 += (float)(diff) * Iptr[x];
+                        b4 += (float)(diff);
+
+                        // compute the gradient matrix
+                        if (j == 0)
+                        {
+                            float tale = normSigma2 * FLT_RESCALE;
+                            if (abss < fParam0)
+                            {
+                                tale = FLT_RESCALE;
+                            }
+                            else if (abss > fParam1)
+                            {
+                                tale *= 0.01f;
+                            }
+                            else
+                            {
+                                tale *= normSigma2;
+                            }
+                            A11 += (float)(ixval*ixval)*tale;
+                            A12 += (float)(ixval*iyval)*tale;
+                            A22 += (float)(iyval*iyval)*tale;
+
+                            dI += Iptr[x] * Iptr[x] * tale;
+                            float dx = static_cast<float>(dIptr[0]) * tale;
+                            float dy = static_cast<float>(dIptr[1]) * tale;
+                            sumIx += dx;
+                            sumIy += dy;
+                            w1 += dx * Iptr[x];
+                            w2 += dy * Iptr[x];
+                            sumI += Iptr[x] * tale;
+                            sumW += tale;
+                        }
+
+                    }
+#endif
+                }
+#ifdef RLOF_SSE
+                short etaValues[8];
+                _mm_storeu_si128((__m128i*)(etaValues), mmEta);
+                MEstimatorScale += eta * (etaValues[0] + etaValues[1] + etaValues[2] + etaValues[3]
+                    + etaValues[4] + etaValues[5] + etaValues[6] + etaValues[7]);
+                float CV_DECL_ALIGNED(32) wbuf[4];
+#endif
+                if (j == 0)
+                {
+#ifdef RLOF_SSE
+                    _mm_store_ps(wbuf, mmSumW1);
+                    w1 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3];
+                    _mm_store_ps(wbuf, mmSumW2);
+                    w2 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3];
+                    _mm_store_ps(wbuf, mmSumDI);
+                    dI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3];
+                    _mm_store_ps(wbuf, mmSumI);
+                    sumI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3];
+                    _mm_store_ps(wbuf, mmSumIx);
+                    sumIx = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3];
+                    _mm_store_ps(wbuf, mmSumIy);
+                    sumIy = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3];
+                    _mm_store_ps(wbuf, mmSumW);
+                    sumW = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3];
+#endif
+                    sumIx *= -FLT_SCALE;
+                    sumIy *= -FLT_SCALE;
+                    sumI *= FLT_SCALE;
+                    sumW *= FLT_SCALE;
+                    w1 *= -FLT_SCALE;
+                    w2 *= -FLT_SCALE;
+                    dI *= FLT_SCALE;
+#ifdef RLOF_SSE
+                    float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];
+                    _mm_store_ps(A11buf, mmAxx);
+                    _mm_store_ps(A12buf, mmAxy);
+                    _mm_store_ps(A22buf, mmAyy);
+
+                    A11
= A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; + A12 = A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; + A22 = A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; +#endif + A11 *= FLT_SCALE; + A12 *= FLT_SCALE; + A22 *= FLT_SCALE; + } +#ifdef RLOF_SSE + float CV_DECL_ALIGNED(16) bbuf[4]; + _mm_store_ps(bbuf, _mm_add_ps(qb0, qb1)); + b1 = bbuf[0] + bbuf[2]; + b2 = bbuf[1] + bbuf[3]; + _mm_store_ps(bbuf, qb2); + b3 = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; + _mm_store_ps(bbuf, qb3); + b4 = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3]; +#endif + mismatchMat(0) = b1 * FLT_SCALE; + mismatchMat(1) = b2 * FLT_SCALE; + mismatchMat(2) = -b3 * FLT_SCALE; + mismatchMat(3) = -b4 * FLT_SCALE; + + D = -A12 * A12*sumI*sumI + dI * sumW*A12*A12 + 2 * A12*sumI*sumIx*w2 + 2 * A12*sumI*sumIy*w1 + - 2 * dI*A12*sumIx*sumIy - 2 * sumW*A12*w1*w2 + A11 * A22*sumI*sumI - 2 * A22*sumI*sumIx*w1 + - 2 * A11*sumI*sumIy*w2 - sumIx * sumIx*w2*w2 + A22 * dI*sumIx*sumIx + 2 * sumIx*sumIy*w1*w2 + - sumIy * sumIy*w1*w1 + A11 * dI*sumIy*sumIy + A22 * sumW*w1*w1 + A11 * sumW*w2*w2 - A11 * A22*dI*sumW; + + float sqrtVal = std::sqrt((A11 - A22)*(A11 - A22) + 4.f*A12*A12); + minEigValue = (A22 + A11 - sqrtVal) / (2.0f*winArea); + if (minEigValue < minEigThreshold || std::abs(D) < FLT_EPSILON) + { + if (level == 0 && status) + status[ptidx] = 0; + if (level > 0) + { + nextPts[ptidx] = backUpNextPt; + gainVecs[ptidx] = backUpGain; + } + break; + } + + D = (1.f / D); + + invTensorMat(0, 0) = (A22*sumI*sumI - 2 * sumI*sumIy*w2 + dI * sumIy*sumIy + sumW * w2*w2 - A22 * dI*sumW)* D; + invTensorMat(0, 1) = (A12*dI*sumW - A12 * sumI * sumI - dI * sumIx*sumIy + sumI * sumIx*w2 + sumI * sumIy*w1 - sumW * w1*w2)* D; + invTensorMat(0, 2) = (A12*sumI*sumIy - sumIy * sumIy*w1 - A22 * sumI*sumIx - A12 * sumW*w2 + A22 * sumW*w1 + sumIx * sumIy*w2)* D; + invTensorMat(0, 3) = (A22*dI*sumIx - A12 * dI*sumIy - sumIx * w2*w2 + A12 * sumI*w2 - A22 * sumI*w1 + sumIy * w1*w2) * D; + invTensorMat(1, 0) = invTensorMat(0, 1); + invTensorMat(1, 1) = (A11*sumI * sumI - 2 * sumI*sumIx*w1 + dI * sumIx * sumIx + sumW * w1*w1 - A11 * dI*sumW) * D; + invTensorMat(1, 2) = (A12*sumI*sumIx - A11 * sumI*sumIy - sumIx * sumIx*w2 + A11 * sumW*w2 - A12 * sumW*w1 + sumIx * sumIy*w1) * D; + invTensorMat(1, 3) = (A11*dI*sumIy - sumIy * w1*w1 - A12 * dI*sumIx - A11 * sumI*w2 + A12 * sumI*w1 + sumIx * w1*w2)* D; + invTensorMat(2, 0) = invTensorMat(0, 2); + invTensorMat(2, 1) = invTensorMat(1, 2); + invTensorMat(2, 2) = (sumW*A12*A12 - 2 * A12*sumIx*sumIy + A22 * sumIx*sumIx + A11 * sumIy*sumIy - A11 * A22*sumW)* D; + invTensorMat(2, 3) = (A11*A22*sumI - A12 * A12*sumI - A11 * sumIy*w2 + A12 * sumIx*w2 + A12 * sumIy*w1 - A22 * sumIx*w1)* D; + invTensorMat(3, 0) = invTensorMat(0, 3); + invTensorMat(3, 1) = invTensorMat(1, 3); + invTensorMat(3, 2) = invTensorMat(2, 3); + invTensorMat(3, 3) = (dI*A12*A12 - 2 * A12*w1*w2 + A22 * w1*w1 + A11 * w2*w2 - A11 * A22*dI)* D; + + resultMat = invTensorMat * mismatchMat; + + Point2f delta(-resultMat(0), -resultMat(1)); + Point2f deltaGain(resultMat(2), resultMat(3)); + + + + + if (j == 0) + prevGain = deltaGain; + gainVec += deltaGain * 0.8; + nextPt += delta * 0.8; + nextPts[ptidx] = nextPt - halfWin; + gainVecs[ptidx] = gainVec; + + if ( + (std::abs(delta.x - prevDelta.x) < 0.01 && std::abs(delta.y - prevDelta.y) < 0.01) + || ((delta.ddot(delta) <= 0.001) && std::abs(prevGain.x - deltaGain.x) < 0.01) + ) + { + nextPts[ptidx] -= delta * 0.5f; + gainVecs[ptidx] -= deltaGain * 0.5f; + break; + } + + prevDelta = delta; + prevGain = deltaGain; + } + + } 
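// ---------------------------------------------------------------------------
// Editorial sketch (not part of the patch): the scalar fallback above weights
// each residual with the shrinked Hampel norm before it enters the normal
// equations. Written as a standalone function, where fParam0/fParam1 are the
// scaled sigma0/sigma1 of the norm and normSigma2 is the precomputed factor
// sigma0 / (sigma0 - sigma1) returned by get_norm() in rlof_localflow.cpp:
static inline float residualWeight(int absResidual, float fParam0, float fParam1,
                                   float normSigma2, float rescale)
{
    float w = normSigma2 * rescale;
    if (absResidual < fParam0)       // quadratic (inlier) region: full weight
        w = rescale;
    else if (absResidual > fParam1)  // saturated region: outliers barely count
        w *= 0.01f;
    else                             // transition region of the shrinked norm
        w *= normSigma2;
    return w;
}
// ---------------------------------------------------------------------------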
+
+    }
+
+    const Mat* prevImg;
+    const Mat* nextImg;
+    const Mat* prevDeriv;
+    const Mat* rgbPrevImg;
+    const Mat* rgbNextImg;
+    const Point2f* prevPts;
+    Point2f* nextPts;
+    uchar* status;
+    cv::Point2f* gainVecs;
+    float* err;
+    int maxWinSize;
+    int minWinSize;
+    TermCriteria criteria;
+    int level;
+    int maxLevel;
+    int windowType;
+    float minEigThreshold;
+    bool useInitialFlow;
+    const float normSigma0, normSigma1, normSigma2;
+    int crossSegmentationThreshold;
+};
+
+}}}} // namespace
+#endif
diff --git a/modules/optflow/src/rlof/rlof_invokerbase.hpp b/modules/optflow/src/rlof/rlof_invokerbase.hpp
new file mode 100644
index 00000000000..a7d09ceaff3
--- /dev/null
+++ b/modules/optflow/src/rlof/rlof_invokerbase.hpp
@@ -0,0 +1,178 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+#ifndef _RLOF_INVOKERBASE_HPP_
+#define _RLOF_INVOKERBASE_HPP_
+
+
+#if CV_SSE4_1
+#define RLOF_SSE
+#endif
+
+//#define DEBUG_INVOKER
+
+#ifndef CV_DESCALE
+#define CV_DESCALE(x, n) (((x) + (1 << ((n)-1))) >> (n))
+#endif
+
+#define FLT_RESCALE 1
+
+
+#include "rlof_localflow.h"
+#include <limits>
+#include "opencv2/core/hal/intrin.hpp"
+using namespace std;
+using namespace cv;
+
+
+namespace cv {
+namespace optflow {
+
+typedef short deriv_type;
+#ifdef RLOF_SSE
+static inline void get4BitMask(const int & width, __m128i & mask)
+{
+    int noBits = width - static_cast<int>(floor(width / 4.f) * 4.f);
+    unsigned int val[4];
+    for (int n = 0; n < 4; n++)
+    {
+        val[n] = (noBits > n) ? (std::numeric_limits<unsigned int>::max()) : 0;
+    }
+    mask = _mm_set_epi32(val[3], val[2], val[1], val[0]);
+
+}
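// ---------------------------------------------------------------------------
// Editorial sketch (not part of the patch): lane n of the mask produced by
// get4BitMask() above is all-ones iff n < width % 4, so a masked operation
// touches only the leftover elements of the final, partial 4-float block.
// A scalar equivalent:
#include <array>
#include <cstdint>
static std::array<uint32_t, 4> tailMask4(int width)
{
    std::array<uint32_t, 4> mask = { { 0, 0, 0, 0 } };
    for (int n = 0; n < 4; ++n)
        mask[n] = ((width % 4) > n) ? 0xffffffffu : 0u;
    return mask; // e.g. width == 6 -> { ~0u, ~0u, 0, 0 }
}
// ---------------------------------------------------------------------------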
+static inline void getWBitMask(const int & width, __m128i & t0, __m128i & t1, __m128i & t2)
+{
+    int noBits = width - static_cast<int>(floor(width / 8.f) * 8.f);
+    unsigned short val[8];
+    for (int n = 0; n < 8; n++)
+    {
+        val[n] = (noBits > n) ? (0xffff) : 0;
+    }
+    t1 = _mm_set_epi16(val[7], val[7], val[6], val[6], val[5], val[5], val[4], val[4]);
+    t0 = _mm_set_epi16(val[3], val[3], val[2], val[2], val[1], val[1], val[0], val[0]);
+    t2 = _mm_set_epi16(val[7], val[6], val[5], val[4], val[3], val[2], val[1], val[0]);
+}
+#endif
+typedef uchar tMaskType;
+#define tCVMaskType CV_8UC1
+#define MaskSet 0xffffffff
+
+static
+void getLocalPatch(
+    const cv::Mat & src,
+    const cv::Point2i & prevPoint, // feature point
+    cv::Mat & winPointMask,
+    int & noPoints,
+    cv::Rect & winRoi,
+    int minWinSize)
+{
+    int maxWinSizeH = (winPointMask.cols - 1) / 2;
+    winRoi.x = prevPoint.x;// - maxWinSizeH;
+    winRoi.y = prevPoint.y;// - maxWinSizeH;
+    winRoi.width  = winPointMask.cols;
+    winRoi.height = winPointMask.rows;
+
+    if( minWinSize == winPointMask.cols || prevPoint.x < 0 || prevPoint.y < 0
+        || prevPoint.x + 2*maxWinSizeH >= src.cols || prevPoint.y + 2*maxWinSizeH >= src.rows)
+    {
+        winRoi.x = prevPoint.x - maxWinSizeH;
+        winRoi.y = prevPoint.y - maxWinSizeH;
+        winPointMask.setTo(1);
+        noPoints = winPointMask.size().area();
+        return;
+    }
+    winPointMask.setTo(0);
+    noPoints = 0;
+    int c = prevPoint.x + maxWinSizeH;
+    int r = prevPoint.y + maxWinSizeH;
+    int min_c = c;
+    int max_c = c;
+    int border_left = c - maxWinSizeH;
+    int border_top  = r - maxWinSizeH;
+    cv::Vec4i bounds = src.at<cv::Vec4i>(r,c);
+    int min_r = bounds.val[2];
+    int max_r = bounds.val[3];
+
+    for( int _r = min_r; _r <= max_r; _r++)
+    {
+        cv::Rect roi(maxWinSizeH, _r - border_top, winPointMask.cols, 1);
+        if( _r >= 0 && _r < src.rows)
+        {
+            bounds = src.at<cv::Vec4i>(_r,c);
+            roi.x = bounds.val[0] - border_left;
+            roi.width = bounds.val[1] - bounds.val[0];
+        }
+        else
+        {
+            bounds.val[0] = border_left;
+            bounds.val[1] = border_left + roi.width;
+        }
+        cv::Mat(winPointMask, roi).setTo(1);
+        min_c = MIN(min_c, bounds.val[0]);
+        max_c = MAX(max_c, bounds.val[1]);
+        noPoints += roi.width;
+    }
+
+    if( noPoints < minWinSize * minWinSize)
+    {
+        cv::Rect roi( winPointMask.cols / 2 - (minWinSize-1)/2,
+                      winPointMask.rows / 2 - (minWinSize-1)/2,
+                      minWinSize, minWinSize);
+        cv::Mat(winPointMask, roi).setTo(1);
+        roi.x += border_left;
+        roi.y += border_top;
+        min_c = MIN(MIN(min_c, roi.tl().x),roi.br().x);
+        max_c = MAX(MAX(max_c, roi.tl().x),roi.br().x);
+        min_r = MIN(MIN(min_r, roi.tl().y),roi.br().y);
+        max_r = MAX(MAX(max_r, roi.tl().y),roi.br().y);
+        noPoints += minWinSize * minWinSize;
+    }
+    winRoi.x = min_c - maxWinSizeH;
+    winRoi.y = min_r - maxWinSizeH;
+    winRoi.width  = max_c - min_c;
+    winRoi.height = max_r - min_r;
+    winPointMask = winPointMask(cv::Rect(min_c - border_left, min_r - border_top, winRoi.width, winRoi.height));
+}
+
+static inline
+bool calcWinMaskMat(
+    const cv::Mat & BI,
+    const int windowType,
+    const cv::Point2i & iprevPt,
+    cv::Mat & winMaskMat,
+    cv::Size & winSize,
+    cv::Point2f & halfWin,
+    int & winArea,
+    const int minWinSize,
+    const int maxWinSize)
+{
+    if (windowType == SR_CROSS && maxWinSize != minWinSize)
+    {
+        // patch generation
+        cv::Rect winRoi;
+        getLocalPatch(BI, iprevPt, winMaskMat, winArea, winRoi, minWinSize);
+        if (winArea == 0)
+            return false;
+        winSize = winRoi.size();
+        halfWin = Point2f(static_cast<float>(iprevPt.x - winRoi.tl().x),
+                          static_cast<float>(iprevPt.y - winRoi.tl().y));
+    }
+    else
+    {
+        winSize = cv::Size(maxWinSize, maxWinSize);
+        halfWin = Point2f((winSize.width - 1) / 2.f, (winSize.height - 1) / 2.f);
+        winMaskMat.setTo(1);
+    }
+    return true;
+}
+
+static inline
+short estimateScale(cv::Mat & residuals)
+{
+    cv::Mat absMat = cv::abs(residuals);
+    return quickselect<short>(absMat, absMat.rows / 2);
+}
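// ---------------------------------------------------------------------------
// Editorial sketch (not part of the patch): estimateScale() seeds the
// M-estimator scale with the median absolute residual, computed by the
// quickselect() routine from rlof_localflow.h. Inside the invoker loops the
// scale is then tracked incrementally: each residual above the current
// estimate raises it by a small step eta, each residual below lowers it, so
// the estimate drifts toward the running median without re-sorting:
static inline float updateScale(float scale, float residual, float eta)
{
    if (residual > scale) scale += eta; // estimate too low  -> raise it
    if (residual < scale) scale -= eta; // estimate too high -> lower it
    return scale;
}
// ---------------------------------------------------------------------------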
+
+}} // namespace
+#endif
diff --git a/modules/optflow/src/rlof/rlof_localflow.cpp b/modules/optflow/src/rlof/rlof_localflow.cpp
new file mode 100644
index 00000000000..e1e59860926
--- /dev/null
+++ b/modules/optflow/src/rlof/rlof_localflow.cpp
@@ -0,0 +1,691 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+#include "../precomp.hpp"
+
+#include "opencv2/calib3d.hpp" // findHomography
+#include "opencv2/highgui.hpp"
+#include "rlof_localflow.h"
+#include "berlof_invoker.hpp"
+#include "rlof_invoker.hpp"
+#include "plk_invoker.hpp"
+using namespace std;
+using namespace cv;
+
+namespace cv {
+namespace detail {
+typedef short deriv_type;
+} // namespace
+
+namespace {
+static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst)
+{
+    using namespace cv;
+    using cv::detail::deriv_type;
+    int rows = src.rows, cols = src.cols, cn = src.channels(), colsn = cols * cn, depth = src.depth();
+    CV_Assert(depth == CV_8U);
+    dst.create(rows, cols, CV_MAKETYPE(DataType<deriv_type>::depth, cn * 2));
+
+    int x, y, delta = (int)alignSize((cols + 2)*cn, 16);
+    AutoBuffer<deriv_type> _tempBuf(delta * 2 + 64);
+    deriv_type *trow0 = alignPtr(_tempBuf.data() + cn, 16), *trow1 = alignPtr(trow0 + delta, 16);
+
+#if CV_SIMD128
+    v_int16x8 c3 = v_setall_s16(3), c10 = v_setall_s16(10);
+    bool haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON);
+#endif
+
+    for (y = 0; y < rows; y++)
+    {
+        const uchar* srow0 = src.ptr(y > 0 ? y - 1 : rows > 1 ? 1 : 0);
+        const uchar* srow1 = src.ptr(y);
+        const uchar* srow2 = src.ptr(y < rows - 1 ? y + 1 : rows > 1 ? rows - 2 : 0);
+        deriv_type* drow = dst.ptr<deriv_type>(y);
+
+        // do vertical convolution
+        x = 0;
+#if CV_SIMD128
+        if (haveSIMD)
+        {
+            for (; x <= colsn - 8; x += 8)
+            {
+                v_int16x8 s0 = v_reinterpret_as_s16(v_load_expand(srow0 + x));
+                v_int16x8 s1 = v_reinterpret_as_s16(v_load_expand(srow1 + x));
+                v_int16x8 s2 = v_reinterpret_as_s16(v_load_expand(srow2 + x));
+
+                v_int16x8 t1 = s2 - s0;
+                v_int16x8 t0 = v_mul_wrap(s0 + s2, c3) + v_mul_wrap(s1, c10);
+
+                v_store(trow0 + x, t0);
+                v_store(trow1 + x, t1);
+            }
+        }
+#endif
+
+        for (; x < colsn; x++)
+        {
+            int t0 = (srow0[x] + srow2[x]) * 3 + srow1[x] * 10;
+            int t1 = srow2[x] - srow0[x];
+            trow0[x] = (deriv_type)t0;
+            trow1[x] = (deriv_type)t1;
+        }
+
+        // make border
+        int x0 = (cols > 1 ? 1 : 0)*cn, x1 = (cols > 1 ? cols - 2 : 0)*cn;
+        for (int k = 0; k < cn; k++)
+        {
+            trow0[-cn + k] = trow0[x0 + k]; trow0[colsn + k] = trow0[x1 + k];
+            trow1[-cn + k] = trow1[x0 + k]; trow1[colsn + k] = trow1[x1 + k];
+        }
+
+        // do horizontal convolution, interleave the results and store them to dst
+        x = 0;
+#if CV_SIMD128
+        if (haveSIMD)
+        {
+            for (; x <= colsn - 8; x += 8)
+            {
+                v_int16x8 s0 = v_load(trow0 + x - cn);
+                v_int16x8 s1 = v_load(trow0 + x + cn);
+                v_int16x8 s2 = v_load(trow1 + x - cn);
+                v_int16x8 s3 = v_load(trow1 + x);
+                v_int16x8 s4 = v_load(trow1 + x + cn);
+
+                v_int16x8 t0 = s1 - s0;
+                v_int16x8 t1 = v_mul_wrap(s2 + s4, c3) + v_mul_wrap(s3, c10);
+
+                v_store_interleave((drow + x * 2), t0, t1);
+            }
+        }
+#endif
+        for (; x < colsn; x++)
+        {
+            deriv_type t0 = (deriv_type)(trow0[x + cn] - trow0[x - cn]);
+            deriv_type t1 = (deriv_type)((trow1[x + cn] + trow1[x - cn]) * 3 + trow1[x] * 10);
+            drow[x * 2] = t0; drow[x * 2 + 1] = t1;
+        }
+    }
+}
+
+} // namespace
+namespace optflow {
+
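// ---------------------------------------------------------------------------
// Editorial sketch (not part of the patch): the CV_32SC4 bounding map built
// below stores, per pixel, the borders of the colour-similar cross segment;
// channels 0/1 come from the horizontal pass and channels 2/3 from the pass
// over the transposed image. getLocalPatch() in rlof_invokerbase.hpp reads it
// essentially like this:
#include <opencv2/core.hpp>
static cv::Rect crossBounds(const cv::Mat& boundingMap, int r, int c)
{
    const cv::Vec4i b = boundingMap.at<cv::Vec4i>(r, c);
    // b[0]/b[1]: left/right column, b[2]/b[3]: top/bottom row of the segment
    return cv::Rect(b[0], b[2], b[1] - b[0], b[3] - b[2]);
}
// ---------------------------------------------------------------------------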
+/*! Helper class for preCalcCrossSegmentation. Everything is computed on the
+* image extended by winSize.
+*\param data CV_8UC3 image (extended by winSize)
+*\param winSize support region window size
+*\param dst CV_32SC4 bounding map
+*\param threshold color similarity threshold
+*\param stride if true, write the interval borders into channels 2 and 3 of
+*       the bounding map (used for the pass over the transposed image)
+*/
+class HorizontalCrossSegmentation : public cv::ParallelLoopBody
+{
+public:
+    HorizontalCrossSegmentation(
+        const cv::Point2f * ptList,
+        int npoints,
+        float pointScale,
+        const cv::Mat * data,
+        const int winSize,
+        cv::Mat * dst,
+        int threshold,
+        bool stride,
+        const cv::Mat * mask
+    )
+    {
+        m_ptList = ptList;
+        m_npoints = npoints;
+        m_pointScale = pointScale;
+        m_data = data;
+        m_winSize = winSize;
+        m_dst = dst;
+        m_threshold = threshold;
+        m_stride = stride;
+        m_mask = mask;
+    }
+
+    void operator()(const cv::Range& range) const CV_OVERRIDE
+    {
+        uchar channel[2];
+        channel[0] = m_stride ? 2 : 0;
+        channel[1] = m_stride ? 3 : 1;
+        int hWinSize = (m_winSize - 1) / 2;
+        std::vector<int> differenz(m_winSize);
+        for( int r = range.start; r < range.end; r++ )
+        {
+            for(int c = hWinSize; c < m_data->cols - hWinSize; c++)
+            {
+                if( m_mask->at<uchar>(r,c) == 0)
+                    continue;
+                const Point3_<uchar> & ucval = m_data->at<Point3_<uchar> >(r,c);
+                Point3i val(static_cast<int>(ucval.x), static_cast<int>(ucval.y), static_cast<int>(ucval.z));
+                int x = c - hWinSize;
+                Point dstPos = m_stride ? Point(r,c) : Point(c,r);
+                for(int ix = 0; ix < m_winSize; ix++, x++)
+                {
+                    const Point3_<uchar> & valref = m_data->at<Point3_<uchar> >(r,x);
+                    differenz[ix] = MAX(std::abs(static_cast<int>(valref.x) - val.x),
+                                    MAX(std::abs(static_cast<int>(valref.y) - val.y),
+                                       (std::abs(static_cast<int>(valref.z) - val.z))));
+
+                }
+                cv::Vec4i & bounds = m_dst->at<cv::Vec4i>(dstPos);
+                bounds.val[channel[0]] = c - hWinSize;
+                bounds.val[channel[1]] = c + hWinSize;
+                int * diffPtr = &differenz[hWinSize];
+                bool useUpperBound = false;
+                bool useLowerBound = false;
+                for(int ix = 1; ix <= hWinSize; ix++)
+                {
+                    if( !useUpperBound && diffPtr[-ix] > m_threshold)
+                    {
+                        useUpperBound = true;
+                        bounds.val[channel[0]] = c - ix;
+                    }
+                    if( !useLowerBound && diffPtr[ix-1] > m_threshold)
+                    {
+                        useLowerBound = true;
+                        bounds.val[channel[1]] = c + ix - 1;
+                    }
+                    if( useUpperBound && useLowerBound)
+                        break;
+                }
+            }
+        }
+    }
+
+    const cv::Point2f * m_ptList;
+    int m_npoints;
+    float m_pointScale;
+    const cv::Mat * m_data;
+    int m_winSize;
+    cv::Mat * m_dst;
+    int m_threshold;
+    bool m_stride;
+    const cv::Mat * m_mask;
+};
+
+static
+void preCalcCrossSegmentation(
+    const cv::Point2f * ptList,
+    int npoints,
+    float pointScale,
+    const cv::Mat & img,
+    const int winSize,
+    cv::Mat & dst,
+    int threshold
+)
+{
+    int hWinSize = (winSize - 1) / 2;
+    cv::Mat data = img;
+    data.adjustROI(hWinSize, hWinSize, hWinSize, hWinSize);
+    if( dst.size() != data.size() || dst.type() != CV_32SC4)
+    {
+        dst.release();
+        dst.create(data.size(), CV_32SC4);
+    }
+    cv::Mat mask(data.cols, data.rows, CV_8UC1);
+    mask.setTo(0);
+    for( unsigned int n = 0; n < static_cast<unsigned int>(npoints); n++)
+    {
+        cv::Point ipos( static_cast<int>(floor(ptList[n].y * pointScale)),
+                        static_cast<int>(floor(ptList[n].x * pointScale) + hWinSize));
+        ipos.x = MAX( MIN(ipos.x, mask.cols - 1), 0);
+        int to   = MIN( mask.cols - 1, ipos.x + winSize );
+        int ypos = MAX( MIN(ipos.y, mask.rows - 1), 0);
+        for(int x = ipos.x; x <= to ; x++)
+        {
+            mask.at<uchar>(ypos, x) = 255;
+        }
+    }
+    cv::Mat datat = data.t();
+    cv::Mat maskt = mask.t();
+    parallel_for_(cv::Range(0, datat.rows), HorizontalCrossSegmentation(ptList, npoints, pointScale, &datat, winSize, &dst, threshold, true, &mask));
+    parallel_for_(cv::Range(0, data.rows), HorizontalCrossSegmentation(ptList, npoints, pointScale, &data,
winSize, &dst, threshold, false, &maskt)); + +} + + +static inline +bool isrobust(const RLOFOpticalFlowParameter & param) +{ + return (param.normSigma0 < 255 && param.normSigma1 < 255); +} +static inline +std::vector get_norm(float sigma0, float sigma1) +{ + std::vector result = { sigma0, sigma1, sigma0 / (sigma0 - sigma1), sigma0 * sigma1 }; + return result; +} + +static +int buildOpticalFlowPyramidScale(InputArray _img, OutputArrayOfArrays pyramid, Size winSize, int maxLevel, bool withDerivatives, + int pyrBorder, int derivBorder, bool tryReuseInputImage, float levelScale[2]) +{ + Mat img = _img.getMat(); + CV_Assert(img.depth() == CV_8U && winSize.width > 2 && winSize.height > 2); + int pyrstep = withDerivatives ? 2 : 1; + + pyramid.create(1, (maxLevel + 1) * pyrstep, 0 /*type*/, -1, true); + + int derivType = CV_MAKETYPE(DataType::depth, img.channels() * 2); + + //level 0 + bool lvl0IsSet = false; + if (tryReuseInputImage && img.isSubmatrix() && (pyrBorder & BORDER_ISOLATED) == 0) + { + Size wholeSize; + Point ofs; + img.locateROI(wholeSize, ofs); + if (ofs.x >= winSize.width && ofs.y >= winSize.height + && ofs.x + img.cols + winSize.width <= wholeSize.width + && ofs.y + img.rows + winSize.height <= wholeSize.height) + { + pyramid.getMatRef(0) = img; + lvl0IsSet = true; + } + } + + if (!lvl0IsSet) + { + Mat& temp = pyramid.getMatRef(0); + + if (!temp.empty()) + temp.adjustROI(winSize.height, winSize.height, winSize.width, winSize.width); + if (temp.type() != img.type() || temp.cols != winSize.width * 2 + img.cols || temp.rows != winSize.height * 2 + img.rows) + temp.create(img.rows + winSize.height * 2, img.cols + winSize.width * 2, img.type()); + + if (pyrBorder == BORDER_TRANSPARENT) + img.copyTo(temp(Rect(winSize.width, winSize.height, img.cols, img.rows))); + else + copyMakeBorder(img, temp, winSize.height, winSize.height, winSize.width, winSize.width, pyrBorder); + temp.adjustROI(-winSize.height, -winSize.height, -winSize.width, -winSize.width); + } + + Size sz = img.size(); + Mat prevLevel = pyramid.getMatRef(0); + Mat thisLevel = prevLevel; + + for (int level = 0; level <= maxLevel; ++level) + { + if (level != 0) + { + Mat& temp = pyramid.getMatRef(level * pyrstep); + + if (!temp.empty()) + temp.adjustROI(winSize.height, winSize.height, winSize.width, winSize.width); + if (temp.type() != img.type() || temp.cols != winSize.width * 2 + sz.width || temp.rows != winSize.height * 2 + sz.height) + temp.create(sz.height + winSize.height * 2, sz.width + winSize.width * 2, img.type()); + + thisLevel = temp(Rect(winSize.width, winSize.height, sz.width, sz.height)); + pyrDown(prevLevel, thisLevel, sz); + + if (pyrBorder != BORDER_TRANSPARENT) + copyMakeBorder(thisLevel, temp, winSize.height, winSize.height, winSize.width, winSize.width, pyrBorder | BORDER_ISOLATED); + temp.adjustROI(-winSize.height, -winSize.height, -winSize.width, -winSize.width); + } + + if (withDerivatives) + { + Mat& deriv = pyramid.getMatRef(level * pyrstep + 1); + + if (!deriv.empty()) + deriv.adjustROI(winSize.height, winSize.height, winSize.width, winSize.width); + if (deriv.type() != derivType || deriv.cols != winSize.width * 2 + sz.width || deriv.rows != winSize.height * 2 + sz.height) + deriv.create(sz.height + winSize.height * 2, sz.width + winSize.width * 2, derivType); + + Mat derivI = deriv(Rect(winSize.width, winSize.height, sz.width, sz.height)); + calcSharrDeriv(thisLevel, derivI); + + if (derivBorder != BORDER_TRANSPARENT) + copyMakeBorder(derivI, deriv, winSize.height, winSize.height, winSize.width, 
winSize.width, derivBorder | BORDER_ISOLATED); + deriv.adjustROI(-winSize.height, -winSize.height, -winSize.width, -winSize.width); + } + + sz = Size(static_cast((sz.width + 1) / levelScale[0]), + static_cast((sz.height + 1) / levelScale[1])); + if (sz.width <= winSize.width || sz.height <= winSize.height) + { + pyramid.create(1, (level + 1) * pyrstep, 0 /*type*/, -1, true);//check this + return level; + } + + prevLevel = thisLevel; + } + + return maxLevel; +} +int CImageBuffer::buildPyramid(cv::Size winSize, int maxLevel, float levelScale[2]) +{ + if (m_Overwrite == false) + return m_maxLevel; + m_maxLevel = buildOpticalFlowPyramidScale(m_Image, m_ImagePyramid, winSize, maxLevel, false, 4, 0, true, levelScale); + return m_maxLevel; +} + +static +void calcLocalOpticalFlowCore( + Ptr prevPyramids[2], + Ptr currPyramids[2], + InputArray _prevPts, + InputOutputArray _nextPts, + const RLOFOpticalFlowParameter & param) +{ + + bool useAdditionalRGB = param.supportRegionType == SR_CROSS; + int iWinSize = param.largeWinSize; + int winSizes[2] = { iWinSize, iWinSize }; + if (param.supportRegionType != SR_FIXED) + { + winSizes[0] = param.smallWinSize; + } + //cv::Size winSize(iWinSize, iWinSize); + cv::TermCriteria criteria(cv::TermCriteria::COUNT | cv::TermCriteria::EPS, param.maxIteration, 0.01); + std::vector rlofNorm = get_norm(param.normSigma0, param.normSigma1); + CV_Assert(winSizes[0] <= winSizes[1]); + + bool usePreComputedCross = winSizes[0] != winSizes[1]; + Mat prevPtsMat = _prevPts.getMat(); + const int derivDepth = DataType::depth; + + CV_Assert(param.maxLevel >= 0 && iWinSize > 2); + + int level = 0, npoints; + CV_Assert((npoints = prevPtsMat.checkVector(2, CV_32F, true)) >= 0); + + if (!(param.useInitialFlow)) + _nextPts.create(prevPtsMat.size(), prevPtsMat.type(), -1, true); + + Mat nextPtsMat = _nextPts.getMat(); + CV_Assert(nextPtsMat.checkVector(2, CV_32F, true) == npoints); + + const Point2f* prevPts = (const Point2f*)prevPtsMat.data; + Point2f* nextPts = (Point2f*)nextPtsMat.data; + std::vector status(npoints); + std::vector err(npoints); + std::vector gainPts(npoints); + + float levelScale[2] = { 2.f,2.f }; + + int maxLevel = prevPyramids[0]->buildPyramid(cv::Size(iWinSize, iWinSize), param.maxLevel, levelScale); + + maxLevel = currPyramids[0]->buildPyramid(cv::Size(iWinSize, iWinSize), maxLevel, levelScale); + if (useAdditionalRGB) + { + prevPyramids[1]->buildPyramid(cv::Size(iWinSize, iWinSize), maxLevel, levelScale); + currPyramids[1]->buildPyramid(cv::Size(iWinSize, iWinSize), maxLevel, levelScale); + } + + if ((criteria.type & TermCriteria::COUNT) == 0) + criteria.maxCount = 30; + else + criteria.maxCount = std::min(std::max(criteria.maxCount, 0), 100); + if ((criteria.type & TermCriteria::EPS) == 0) + criteria.epsilon = 0.001; + else + criteria.epsilon = std::min(std::max(criteria.epsilon, 0.), 10.); + criteria.epsilon *= criteria.epsilon; + + // dI/dx ~ Ix, dI/dy ~ Iy + Mat derivIBuf; + derivIBuf.create(prevPyramids[0]->m_ImagePyramid[0].rows + iWinSize * 2, prevPyramids[0]->m_ImagePyramid[0].cols + iWinSize * 2, CV_MAKETYPE(derivDepth, prevPyramids[0]->m_ImagePyramid[0].channels() * 2)); + + for (level = maxLevel; level >= 0; level--) + { + Mat derivI; + Size imgSize = prevPyramids[0]->getImage(level).size(); + Mat _derivI(imgSize.height + iWinSize * 2, imgSize.width + iWinSize * 2, derivIBuf.type(), derivIBuf.data); + derivI = _derivI(Rect(iWinSize, iWinSize, imgSize.width, imgSize.height)); + calcSharrDeriv(prevPyramids[0]->getImage(level), derivI); + 
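        // Editorial note: derivI is a view into derivIBuf with an iWinSize margin
+        // on every side; the constant border added below fills that margin so the
+        // trackers can read gradients just outside the image without bounds checks.
+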
copyMakeBorder(derivI, _derivI, iWinSize, iWinSize, iWinSize, iWinSize, BORDER_CONSTANT | BORDER_ISOLATED); + + cv::Mat tRGBPrevPyr; + cv::Mat tRGBNextPyr; + if (useAdditionalRGB) + { + tRGBPrevPyr = prevPyramids[1]->getImage(level); + tRGBNextPyr = prevPyramids[1]->getImage(level); + + prevPyramids[1]->m_Overwrite = false; + currPyramids[1]->m_Overwrite = false; + } + + cv::Mat prevImage = prevPyramids[0]->getImage(level); + cv::Mat currImage = currPyramids[0]->getImage(level); + + cv::Mat preCrossMap; + if( usePreComputedCross ) + { + preCalcCrossSegmentation(prevPts, npoints, (float)(1./(1 << level)), tRGBPrevPyr, winSizes[1], preCrossMap, param.crossSegmentationThreshold); + tRGBNextPyr = cv::Mat(); + tRGBPrevPyr = preCrossMap; + } + // apply plk like tracker + prevImage.adjustROI(iWinSize, iWinSize, iWinSize, iWinSize); + currImage.adjustROI(iWinSize, iWinSize, iWinSize, iWinSize); + derivI.adjustROI(iWinSize, iWinSize, iWinSize, iWinSize); + if (isrobust(param) == false) + { + if (param.useIlluminationModel) + { + cv::parallel_for_(cv::Range(0, npoints), + plk::radial::TrackerInvoker( + prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, + prevPts, nextPts, &status[0], &err[0], &gainPts[0], + level, maxLevel, winSizes, + param.maxIteration, + param.useInitialFlow, + param.supportRegionType, + param.minEigenValue, + param.crossSegmentationThreshold)); + } + else + { + if (param.solverType == SolverType::ST_STANDART) + { + cv::parallel_for_(cv::Range(0, npoints), + plk::ica::TrackerInvoker( + prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, + prevPts, nextPts, &status[0], &err[0], + level, maxLevel, winSizes, + param.maxIteration, + param.useInitialFlow, + param.supportRegionType, + param.crossSegmentationThreshold, + param.minEigenValue)); + } + else + { + cv::parallel_for_(cv::Range(0, npoints), + beplk::ica::TrackerInvoker(prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, + prevPts, nextPts, &status[0], &err[0], + level, maxLevel, winSizes, + param.maxIteration, + param.useInitialFlow, + param.supportRegionType, + param.crossSegmentationThreshold, + param.minEigenValue)); + } + } + } + // for robust models + else + { + if (param.useIlluminationModel) + { + if (param.solverType == SolverType::ST_STANDART) + { + cv::parallel_for_(cv::Range(0, npoints), + rlof::radial::TrackerInvoker( + prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, + prevPts, nextPts, &status[0], &err[0], &gainPts[0], + level, maxLevel, winSizes, + param.maxIteration, + param.useInitialFlow, + param.supportRegionType, + rlofNorm, + param.minEigenValue, + param.crossSegmentationThreshold)); + } + else + { + cv::parallel_for_(cv::Range(0, npoints), + berlof::radial::TrackerInvoker(prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, + prevPts, nextPts, &status[0], &err[0], &gainPts[0], + level, maxLevel, winSizes, + param.maxIteration, + param.useInitialFlow, + param.supportRegionType, + param.crossSegmentationThreshold, + rlofNorm, + param.minEigenValue)); + } + } + else + { + + if (param.solverType == SolverType::ST_STANDART) + { + cv::parallel_for_(cv::Range(0, npoints), + rlof::ica::TrackerInvoker( + prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr, + prevPts, nextPts, &status[0], &err[0], + level, maxLevel, winSizes, + param.maxIteration, + param.useInitialFlow, + param.supportRegionType, + rlofNorm, + param.minEigenValue, + param.crossSegmentationThreshold)); + } + else + { + cv::parallel_for_(cv::Range(0, npoints), + berlof::ica::TrackerInvoker(prevImage, derivI, currImage, 
tRGBPrevPyr, tRGBNextPyr, + prevPts, nextPts, &status[0], &err[0], + level, maxLevel, winSizes, + param.maxIteration, + param.useInitialFlow, + param.supportRegionType, + param.crossSegmentationThreshold, + rlofNorm, + param.minEigenValue)); + } + + } + } + + prevPyramids[0]->m_Overwrite = true; + currPyramids[0]->m_Overwrite = true; + } +} + +static +void preprocess(Ptr prevPyramids[2], + Ptr currPyramids[2], + const std::vector & prevPoints, + std::vector & currPoints, + const RLOFOpticalFlowParameter & param) +{ + cv::Mat mask, homography; + if (param.useGlobalMotionPrior == false) + return; + + currPoints.resize(prevPoints.size()); + + RLOFOpticalFlowParameter gmeTrackerParam = param; + gmeTrackerParam.useGlobalMotionPrior = false; + gmeTrackerParam.largeWinSize = 17; + // use none robust tracker for global motion estimation since it is faster + gmeTrackerParam.normSigma0 = std::numeric_limits::max(); + gmeTrackerParam.maxIteration = MAX(15, param.maxIteration); + gmeTrackerParam.minEigenValue = 0.000001f; + + std::vector gmPrevPoints, gmCurrPoints; + + // Initialize point grid + int stepr = prevPyramids[0]->m_Image.rows / 30; + int stepc = prevPyramids[0]->m_Image.cols / 40; + for (int r = stepr / 2; r < prevPyramids[0]->m_Image.rows; r += stepr) + { + for (int c = stepc / 2; c < prevPyramids[0]->m_Image.cols; c += stepc) + { + gmPrevPoints.push_back(cv::Point2f(static_cast(c), static_cast(r))); + } + } + + // perform motion estimation + calcLocalOpticalFlowCore(prevPyramids, currPyramids, gmPrevPoints, gmCurrPoints, gmeTrackerParam); + + cv::Mat prevPointsMat(static_cast(gmPrevPoints.size()), 1, CV_32FC2); + cv::Mat currPointsMat(static_cast(gmPrevPoints.size()), 1, CV_32FC2); + cv::Mat distMat(static_cast(gmPrevPoints.size()), 1, CV_32FC1); + + // Forward backward confidence to estimate optimal ransac reprojection error + int noPoints = 0; + for (unsigned int n = 0; n < gmPrevPoints.size(); n++) + { + cv::Point2f flow = gmCurrPoints[n] - gmPrevPoints[n]; + prevPointsMat.at(noPoints) = gmPrevPoints[n]; + currPointsMat.at(noPoints) = gmCurrPoints[n]; + distMat.at(noPoints) = flow.x * flow.x + flow.y* flow.y; + if (isnan(distMat.at(noPoints)) == false) + noPoints++; + } + + float medianDist = (param.globalMotionRansacThreshold == 0) ? 
1.f : + quickselect(distMat, static_cast(noPoints * static_cast(param.globalMotionRansacThreshold) / 100.f)); + medianDist = sqrt(medianDist); + + if (noPoints < 8) + return; + + cv::findHomography(prevPointsMat(cv::Rect(0, 0, 1, noPoints)), currPointsMat(cv::Rect(0, 0, 1, noPoints)), cv::RANSAC, medianDist, mask).convertTo(homography, CV_32FC1); + + if (homography.empty()) + return; + + cv::perspectiveTransform(prevPoints, currPoints, homography); +} + +void calcLocalOpticalFlow( + const Mat prevImage, + const Mat currImage, + Ptr prevPyramids[2], + Ptr currPyramids[2], + const std::vector & prevPoints, + std::vector & currPoints, + const RLOFOpticalFlowParameter & param) +{ + if (prevImage.empty() == false && currImage.empty()== false) + { + prevPyramids[0]->m_Overwrite = true; + currPyramids[0]->m_Overwrite = true; + prevPyramids[1]->m_Overwrite = true; + currPyramids[1]->m_Overwrite = true; + if (prevImage.type() == CV_8UC3) + { + prevPyramids[0]->setGrayFromRGB(prevImage); + currPyramids[0]->setGrayFromRGB(currImage); + prevPyramids[1]->setImage(prevImage); + currPyramids[1]->setImage(currImage); + + if (param.supportRegionType == SR_CROSS) + { + prevPyramids[1]->setBlurFromRGB(prevImage); + currPyramids[1]->setBlurFromRGB(currImage); + } + } + else + { + prevPyramids[0]->setImage(prevImage); + currPyramids[0]->setImage(currImage); + } + } + preprocess(prevPyramids, currPyramids, prevPoints, currPoints, param); + RLOFOpticalFlowParameter internParam = param; + if (param.useGlobalMotionPrior == true) + internParam.useInitialFlow = true; + calcLocalOpticalFlowCore(prevPyramids, currPyramids, prevPoints, currPoints, internParam); +} + +}} // namespace diff --git a/modules/optflow/src/rlof/rlof_localflow.h b/modules/optflow/src/rlof/rlof_localflow.h new file mode 100644 index 00000000000..9ffe42cea3e --- /dev/null +++ b/modules/optflow/src/rlof/rlof_localflow.h @@ -0,0 +1,117 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +#ifndef _RLOF_LOCALFLOW_H_ +#define _RLOF_LOCALFLOW_H_ +#include +#include +#include +#include +#include "opencv2/imgproc.hpp" +#include "opencv2/optflow/rlofflow.hpp" +//! Fast median estimation method based on @cite Tibshirani2008. 
This implementation relates to http://www.stat.cmu.edu/~ryantibs/median/
+using namespace cv;
+template<typename T>
+T quickselect(const Mat & inp, int k)
+{
+    unsigned long i;
+    unsigned long ir;
+    unsigned long j;
+    unsigned long l;
+    unsigned long mid;
+    Mat values = inp.clone();
+    T a;
+
+    l = 0;
+    ir = MAX(values.rows, values.cols) - 1;
+    while(true)
+    {
+        if (ir <= l + 1)
+        {
+            if (ir == l + 1 && values.at<T>(ir) < values.at<T>(l))
+                std::swap(values.at<T>(l), values.at<T>(ir));
+            return values.at<T>(k);
+        }
+        else
+        {
+            mid = (l + ir) >> 1;
+            std::swap(values.at<T>(mid), values.at<T>(l+1));
+            if (values.at<T>(l) > values.at<T>(ir))
+                std::swap(values.at<T>(l), values.at<T>(ir));
+            if (values.at<T>(l+1) > values.at<T>(ir))
+                std::swap(values.at<T>(l+1), values.at<T>(ir));
+            if (values.at<T>(l) > values.at<T>(l+1))
+                std::swap(values.at<T>(l), values.at<T>(l+1));
+            i = l + 1;
+            j = ir;
+            a = values.at<T>(l+1);
+            while (true)
+            {
+                do
+                {
+                    i++;
+                }
+                while (values.at<T>(i) < a);
+                do
+                {
+                    j--;
+                }
+                while (values.at<T>(j) > a);
+                if (j < i) break;
+                std::swap(values.at<T>(i), values.at<T>(j));
+            }
+            values.at<T>(l+1) = values.at<T>(j);
+            values.at<T>(j) = a;
+            if (j >= static_cast<unsigned long>(k)) ir = j - 1;
+            if (j <= static_cast<unsigned long>(k)) l = i;
+        }
+    }
+}
+
+namespace cv {
+namespace optflow {
+
+class CImageBuffer
+{
+public:
+    CImageBuffer()
+        : m_Overwrite(true)
+    {}
+    void setGrayFromRGB(const cv::Mat & inp)
+    {
+        if(m_Overwrite)
+            cv::cvtColor(inp, m_Image, cv::COLOR_BGR2GRAY);
+    }
+    void setImage(const cv::Mat & inp)
+    {
+        if(m_Overwrite)
+            inp.copyTo(m_Image);
+    }
+    void setBlurFromRGB(const cv::Mat & inp)
+    {
+        if(m_Overwrite)
+            cv::GaussianBlur(inp, m_BlurredImage, cv::Size(7,7), -1);
+    }
+
+    int buildPyramid(cv::Size winSize, int maxLevel, float levelScale[2]);
+    cv::Mat & getImage(int level) {return m_ImagePyramid[level];}
+
+    std::vector<cv::Mat> m_ImagePyramid;
+    cv::Mat m_BlurredImage;
+    cv::Mat m_Image;
+    std::vector<cv::Mat> m_CrossPyramid;
+    int m_maxLevel;
+    bool m_Overwrite;
+};
+
+void calcLocalOpticalFlow(
+    const Mat prevImage,
+    const Mat currImage,
+    Ptr<CImageBuffer> prevPyramids[2],
+    Ptr<CImageBuffer> currPyramids[2],
+    const std::vector<Point2f> & prevPoints,
+    std::vector<Point2f> & currPoints,
+    const RLOFOpticalFlowParameter & param);
+
+}} // namespace
+#endif
diff --git a/modules/optflow/src/rlofflow.cpp b/modules/optflow/src/rlofflow.cpp
new file mode 100644
index 00000000000..5ee07005eb7
--- /dev/null
+++ b/modules/optflow/src/rlofflow.cpp
@@ -0,0 +1,399 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
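// ---------------------------------------------------------------------------
// Editorial sketch (not part of the patch): the quickselect() template above
// returns the k-th smallest element of a row or column vector; with
// k = n / 2 it yields the median used for the residual scale and the RANSAC
// threshold. It clones the input and partitions in place, so the source Mat
// is untouched. Typical use, assuming rlof_localflow.h is included:
#include <algorithm>
#include <opencv2/core.hpp>
static float medianOf(const cv::Mat& values) // expects an n x 1 CV_32FC1 Mat
{
    const int n = std::max(values.rows, values.cols);
    return quickselect<float>(values, n / 2);
}
// ---------------------------------------------------------------------------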
+#include "precomp.hpp"
+#include "rlof/rlof_localflow.h"
+#include "rlof/geo_interpolation.hpp"
+#include "opencv2/ximgproc.hpp"
+
+namespace cv {
+namespace optflow {
+
+Ptr<RLOFOpticalFlowParameter> RLOFOpticalFlowParameter::create()
+{
+    return Ptr<RLOFOpticalFlowParameter>(new RLOFOpticalFlowParameter);
+}
+
+void RLOFOpticalFlowParameter::setSolverType(SolverType val){ solverType = val;}
+SolverType RLOFOpticalFlowParameter::getSolverType() const { return solverType;}
+
+void RLOFOpticalFlowParameter::setSupportRegionType(SupportRegionType val){ supportRegionType = val;}
+SupportRegionType RLOFOpticalFlowParameter::getSupportRegionType() const { return supportRegionType;}
+
+void RLOFOpticalFlowParameter::setNormSigma0(float val){ normSigma0 = val;}
+float RLOFOpticalFlowParameter::getNormSigma0() const { return normSigma0;}
+
+void RLOFOpticalFlowParameter::setNormSigma1(float val){ normSigma1 = val;}
+float RLOFOpticalFlowParameter::getNormSigma1() const { return normSigma1;}
+
+void RLOFOpticalFlowParameter::setSmallWinSize(int val){ smallWinSize = val;}
+int RLOFOpticalFlowParameter::getSmallWinSize() const { return smallWinSize;}
+
+void RLOFOpticalFlowParameter::setLargeWinSize(int val){ largeWinSize = val;}
+int RLOFOpticalFlowParameter::getLargeWinSize() const { return largeWinSize;}
+
+void RLOFOpticalFlowParameter::setCrossSegmentationThreshold(int val){ crossSegmentationThreshold = val;}
+int RLOFOpticalFlowParameter::getCrossSegmentationThreshold() const { return crossSegmentationThreshold;}
+
+void RLOFOpticalFlowParameter::setMaxLevel(int val){ maxLevel = val;}
+int RLOFOpticalFlowParameter::getMaxLevel() const { return maxLevel;}
+
+void RLOFOpticalFlowParameter::setUseInitialFlow(bool val){ useInitialFlow = val;}
+bool RLOFOpticalFlowParameter::getUseInitialFlow() const { return useInitialFlow;}
+
+void RLOFOpticalFlowParameter::setUseIlluminationModel(bool val){ useIlluminationModel = val;}
+bool RLOFOpticalFlowParameter::getUseIlluminationModel() const { return useIlluminationModel;}
+
+void RLOFOpticalFlowParameter::setUseGlobalMotionPrior(bool val){ useGlobalMotionPrior = val;}
+bool RLOFOpticalFlowParameter::getUseGlobalMotionPrior() const { return useGlobalMotionPrior;}
+
+void RLOFOpticalFlowParameter::setMaxIteration(int val){ maxIteration = val;}
+int RLOFOpticalFlowParameter::getMaxIteration() const { return maxIteration;}
+
+void RLOFOpticalFlowParameter::setMinEigenValue(float val){ minEigenValue = val;}
+float RLOFOpticalFlowParameter::getMinEigenValue() const { return minEigenValue;}
+
+void RLOFOpticalFlowParameter::setGlobalMotionRansacThreshold(float val){ globalMotionRansacThreshold = val;}
+float RLOFOpticalFlowParameter::getGlobalMotionRansacThreshold() const { return globalMotionRansacThreshold;}
+
+class DenseOpticalFlowRLOFImpl : public DenseRLOFOpticalFlow
+{
+public:
+    DenseOpticalFlowRLOFImpl()
+        : param(Ptr<RLOFOpticalFlowParameter>(new RLOFOpticalFlowParameter))
+        , forwardBackwardThreshold(1.f)
+        , gridStep(6, 6)
+        , interp_type(InterpolationType::INTERP_GEO)
+        , k(128)
+        , sigma(0.05f)
+        , lambda(999.f)
+        , fgs_lambda(500.0f)
+        , fgs_sigma(1.5f)
+        , use_post_proc(true)
+
+    {
+        prevPyramid[0] = cv::Ptr<CImageBuffer>(new CImageBuffer);
+        prevPyramid[1] = cv::Ptr<CImageBuffer>(new CImageBuffer);
+        currPyramid[0] = cv::Ptr<CImageBuffer>(new CImageBuffer);
+        currPyramid[1] = cv::Ptr<CImageBuffer>(new CImageBuffer);
+    }
+    virtual void setRLOFOpticalFlowParameter(Ptr<RLOFOpticalFlowParameter> val) CV_OVERRIDE { param = val; }
+    virtual Ptr<RLOFOpticalFlowParameter> getRLOFOpticalFlowParameter() const CV_OVERRIDE { return param; }
+
+    virtual float getForwardBackward() const CV_OVERRIDE { return forwardBackwardThreshold; }
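+    // Editorial note: points whose forward-backward distance exceeds this
+    // threshold are rejected before densification; a value <= 0 disables the
+    // consistency check (see calc() below).
+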
virtual void setForwardBackward(float val) CV_OVERRIDE { forwardBackwardThreshold = val; } + + virtual void setInterpolation(InterpolationType val) CV_OVERRIDE { interp_type = val; } + virtual InterpolationType getInterpolation() const CV_OVERRIDE { return interp_type; } + + virtual Size getGridStep() const CV_OVERRIDE { return gridStep; } + virtual void setGridStep(Size val) CV_OVERRIDE { gridStep = val; } + + virtual int getEPICK() const CV_OVERRIDE { return k; } + virtual void setEPICK(int val) CV_OVERRIDE { k = val; } + + virtual float getEPICSigma() const CV_OVERRIDE { return sigma; } + virtual void setEPICSigma(float val) CV_OVERRIDE { sigma = val; } + + virtual float getEPICLambda() const CV_OVERRIDE { return lambda; } + virtual void setEPICLambda(float val) CV_OVERRIDE { lambda = val; } + + virtual float getFgsLambda() const CV_OVERRIDE { return fgs_lambda; } + virtual void setFgsLambda(float val) CV_OVERRIDE { fgs_lambda = val; } + + virtual float getFgsSigma() const CV_OVERRIDE { return fgs_sigma; } + virtual void setFgsSigma(float val) CV_OVERRIDE { fgs_sigma = val; } + + virtual bool getUsePostProc() const CV_OVERRIDE { return use_post_proc; } + virtual void setUsePostProc(bool val) CV_OVERRIDE { use_post_proc = val; } + + virtual void calc(InputArray I0, InputArray I1, InputOutputArray flow) CV_OVERRIDE + { + CV_Assert(!I0.empty() && I0.depth() == CV_8U && (I0.channels() == 3 || I0.channels() == 1)); + CV_Assert(!I1.empty() && I1.depth() == CV_8U && (I1.channels() == 3 || I1.channels() == 1)); + CV_Assert(I0.sameSize(I1)); + if (param.empty()) + param = Ptr(new RLOFOpticalFlowParameter()); + if (param->supportRegionType == SR_CROSS) + CV_Assert( I0.channels() == 3 && I1.channels() == 3); + CV_Assert(interp_type == InterpolationType::INTERP_EPIC || interp_type == InterpolationType::INTERP_GEO); + // if no parameter is used use the default parameter + + Mat prevImage = I0.getMat(); + Mat currImage = I1.getMat(); + int noPoints = prevImage.cols * prevImage.rows; + std::vector prevPoints(noPoints); + std::vector currPoints, refPoints; + noPoints = 0; + cv::Size grid_h = gridStep / 2; + for (int r = grid_h.height; r < prevImage.rows - grid_h.height; r += gridStep.height) + { + for (int c = grid_h.width; c < prevImage.cols - grid_h.width; c += gridStep.width) + { + prevPoints[noPoints++] = cv::Point2f(static_cast(c), static_cast(r)); + } + } + prevPoints.erase(prevPoints.begin() + noPoints, prevPoints.end()); + currPoints.resize(prevPoints.size()); + calcLocalOpticalFlow(prevImage, currImage, prevPyramid, currPyramid, prevPoints, currPoints, *(param.get())); + flow.create(prevImage.size(), CV_32FC2); + Mat dense_flow = flow.getMat(); + + std::vector filtered_prevPoints; + std::vector filtered_currPoints; + if (gridStep == cv::Size(1, 1) && forwardBackwardThreshold <= 0) + { + for (unsigned int n = 0; n < prevPoints.size(); n++) + { + dense_flow.at(prevPoints[n]) = currPoints[n] - prevPoints[n]; + } + return; + } + if (forwardBackwardThreshold > 0) + { + // reuse image pyramids + calcLocalOpticalFlow(currImage, prevImage, currPyramid, prevPyramid, currPoints, refPoints, *(param.get())); + + filtered_prevPoints.resize(prevPoints.size()); + filtered_currPoints.resize(prevPoints.size()); + float sqrForwardBackwardThreshold = forwardBackwardThreshold * forwardBackwardThreshold; + noPoints = 0; + for (unsigned int r = 0; r < refPoints.size(); r++) + { + Point2f diff = refPoints[r] - prevPoints[r]; + if (diff.x * diff.x + diff.y * diff.y < sqrForwardBackwardThreshold) + { + 
filtered_prevPoints[noPoints] = prevPoints[r]; + filtered_currPoints[noPoints++] = currPoints[r]; + } + } + + filtered_prevPoints.erase(filtered_prevPoints.begin() + noPoints, filtered_prevPoints.end()); + filtered_currPoints.erase(filtered_currPoints.begin() + noPoints, filtered_currPoints.end()); + + } + else + { + filtered_prevPoints = prevPoints; + filtered_currPoints = currPoints; + } + + if (interp_type == InterpolationType::INTERP_EPIC) + { + Ptr gd = ximgproc::createEdgeAwareInterpolator(); + gd->setK(k); + gd->setSigma(sigma); + gd->setLambda(lambda); + gd->setUsePostProcessing(false); + gd->interpolate(prevImage, filtered_prevPoints, currImage, filtered_currPoints, dense_flow); + } + else + { + Mat blurredPrevImage, blurredCurrImage; + GaussianBlur(prevImage, blurredPrevImage, cv::Size(5, 5), -1); + std::vector status(filtered_currPoints.size(), 1); + interpolate_irregular_nn_raster(filtered_prevPoints, filtered_currPoints, status, blurredPrevImage).copyTo(dense_flow); + std::vector vecMats; + std::vector vecMats2(2); + cv::split(dense_flow, vecMats); + cv::bilateralFilter(vecMats[0], vecMats2[0], 5, 2, 20); + cv::bilateralFilter(vecMats[1], vecMats2[1], 5, 2, 20); + cv::merge(vecMats2, dense_flow); + } + if (use_post_proc) + { + ximgproc::fastGlobalSmootherFilter(prevImage, flow, flow, fgs_lambda, fgs_sigma); + } + } + + virtual void collectGarbage() CV_OVERRIDE + { + prevPyramid[0].release(); + prevPyramid[1].release(); + currPyramid[0].release(); + currPyramid[1].release(); + } + +protected: + Ptr param; + float forwardBackwardThreshold; + Ptr prevPyramid[2]; + Ptr currPyramid[2]; + cv::Size gridStep; + InterpolationType interp_type; + int k; + float sigma; + float lambda; + float fgs_lambda; + float fgs_sigma; + bool use_post_proc; +}; + +Ptr DenseRLOFOpticalFlow::create( + Ptr rlofParam, + float forwardBackwardThreshold, + cv::Size gridStep, + InterpolationType interp_type, + int epicK, + float epicSigma, + float epicLambda, + bool use_post_proc, + float fgs_lambda, + float fgs_sigma) +{ + Ptr algo = makePtr(); + algo->setRLOFOpticalFlowParameter(rlofParam); + algo->setForwardBackward(forwardBackwardThreshold); + algo->setGridStep(gridStep); + algo->setInterpolation(interp_type); + algo->setEPICK(epicK); + algo->setEPICSigma(epicSigma); + algo->setEPICLambda(epicLambda); + algo->setUsePostProc(use_post_proc); + algo->setFgsLambda(fgs_lambda); + algo->setFgsSigma(fgs_sigma); + return algo; +} + +class SparseRLOFOpticalFlowImpl : public SparseRLOFOpticalFlow +{ + public: + SparseRLOFOpticalFlowImpl() + : param(Ptr(new RLOFOpticalFlowParameter)) + , forwardBackwardThreshold(1.f) + { + prevPyramid[0] = cv::Ptr< CImageBuffer>(new CImageBuffer); + prevPyramid[1] = cv::Ptr< CImageBuffer>(new CImageBuffer); + currPyramid[0] = cv::Ptr< CImageBuffer>(new CImageBuffer); + currPyramid[1] = cv::Ptr< CImageBuffer>(new CImageBuffer); + } + virtual void setRLOFOpticalFlowParameter(Ptr val) CV_OVERRIDE { param = val; } + virtual Ptr getRLOFOpticalFlowParameter() const CV_OVERRIDE { return param; } + + virtual float getForwardBackward() const CV_OVERRIDE { return forwardBackwardThreshold; } + virtual void setForwardBackward(float val) CV_OVERRIDE { forwardBackwardThreshold = val; } + + virtual void calc(InputArray prevImg, InputArray nextImg, + InputArray prevPts, InputOutputArray nextPts, + OutputArray status, + OutputArray err) CV_OVERRIDE + { + CV_Assert(!prevImg.empty() && prevImg.depth() == CV_8U && (prevImg.channels() == 3 || prevImg.channels() == 1)); + CV_Assert(!nextImg.empty() && 
nextImg.depth() == CV_8U && (nextImg.channels() == 3 || nextImg.channels() == 1));
+        CV_Assert(prevImg.sameSize(nextImg));
+
+        Mat prevImage = prevImg.getMat();
+        Mat nextImage = nextImg.getMat();
+        Mat prevPtsMat = prevPts.getMat();
+
+        if (param.empty())
+        {
+            param = Ptr<RLOFOpticalFlowParameter>(new RLOFOpticalFlowParameter);
+        }
+
+        if (param->useInitialFlow == false)
+            nextPts.create(prevPtsMat.size(), prevPtsMat.type(), -1, true);
+
+        int npoints = 0;
+        CV_Assert((npoints = prevPtsMat.checkVector(2, CV_32F, true)) >= 0);
+
+        if (npoints == 0)
+        {
+            nextPts.release();
+            status.release();
+            err.release();
+            return;
+        }
+
+        Mat nextPtsMat = nextPts.getMat();
+        CV_Assert(nextPtsMat.checkVector(2, CV_32F, true) == npoints);
+        std::vector<cv::Point2f> prevPoints(npoints), nextPoints(npoints), refPoints;
+        prevPtsMat.copyTo(cv::Mat(1, npoints, CV_32FC2, &prevPoints[0]));
+        if (param->useInitialFlow )
+            nextPtsMat.copyTo(cv::Mat(1, nextPtsMat.cols, CV_32FC2, &nextPoints[0]));
+
+        cv::Mat statusMat;
+        cv::Mat errorMat;
+        if (status.needed() || forwardBackwardThreshold > 0)
+        {
+            status.create((int)npoints, 1, CV_8U, -1, true);
+            statusMat = status.getMat();
+            statusMat.setTo(1);
+        }
+
+        if (err.needed() || forwardBackwardThreshold > 0)
+        {
+            err.create((int)npoints, 1, CV_32F, -1, true);
+            errorMat = err.getMat();
+            errorMat.setTo(0);
+        }
+
+        calcLocalOpticalFlow(prevImage, nextImage, prevPyramid, currPyramid, prevPoints, nextPoints, *(param.get()));
+        cv::Mat(1, npoints, CV_32FC2, &nextPoints[0]).copyTo(nextPtsMat);
+        if (forwardBackwardThreshold > 0)
+        {
+            // reuse image pyramids
+            calcLocalOpticalFlow(nextImage, prevImage, currPyramid, prevPyramid, nextPoints, refPoints, *(param.get()));
+        }
+        for (unsigned int r = 0; r < refPoints.size(); r++)
+        {
+            Point2f diff = refPoints[r] - prevPoints[r];
+            errorMat.at<float>(r) = sqrt(diff.x * diff.x + diff.y * diff.y);
+            if (errorMat.at<float>(r) > forwardBackwardThreshold)
+                statusMat.at<uchar>(r) = 0;
+        }
+
+    }
+
+protected:
+    Ptr<RLOFOpticalFlowParameter> param;
+    float forwardBackwardThreshold;
+    Ptr<CImageBuffer> prevPyramid[2];
+    Ptr<CImageBuffer> currPyramid[2];
+};
+
+Ptr<SparseRLOFOpticalFlow> SparseRLOFOpticalFlow::create(
+    Ptr<RLOFOpticalFlowParameter> rlofParam,
+    float forwardBackwardThreshold)
+{
+    Ptr<SparseRLOFOpticalFlow> algo = makePtr<SparseRLOFOpticalFlowImpl>();
+    algo->setRLOFOpticalFlowParameter(rlofParam);
+    algo->setForwardBackward(forwardBackwardThreshold);
+    return algo;
+}
+
+void calcOpticalFlowDenseRLOF(InputArray I0, InputArray I1, InputOutputArray flow,
+    Ptr<RLOFOpticalFlowParameter> rlofParam,
+    float forwardBackwardThreshold, Size gridStep,
+    InterpolationType interp_type,
+    int epicK, float epicSigma, float epicLambda,
+    bool use_post_proc, float fgsLambda, float fgsSigma)
+{
+    Ptr<DenseRLOFOpticalFlow> algo = DenseRLOFOpticalFlow::create(
+        rlofParam, forwardBackwardThreshold, gridStep, interp_type,
+        epicK, epicSigma, epicLambda, use_post_proc, fgsLambda, fgsSigma);
+    algo->calc(I0, I1, flow);
+    algo->collectGarbage();
+}
+
+void calcOpticalFlowSparseRLOF(InputArray prevImg, InputArray nextImg,
+    InputArray prevPts, InputOutputArray nextPts,
+    OutputArray status, OutputArray err,
+    Ptr<RLOFOpticalFlowParameter> rlofParam,
+    float forwardBackwardThreshold)
+{
+    Ptr<SparseRLOFOpticalFlow> algo = SparseRLOFOpticalFlow::create(
+        rlofParam, forwardBackwardThreshold);
+    algo->calc(prevImg, nextImg, prevPts, nextPts, status, err);
+}
+Ptr<DenseOpticalFlow> createOptFlow_DenseRLOF()
+{
+    return DenseRLOFOpticalFlow::create();
+}
+
+Ptr<SparseOpticalFlow> createOptFlow_SparseRLOF()
+{
+    return SparseRLOFOpticalFlow::create();
+}
+
+}} // namespace
diff --git a/modules/optflow/test/test_OF_accuracy.cpp b/modules/optflow/test/test_OF_accuracy.cpp
index 6a5ed702e48..5d093cd49c3 100644
--- a/modules/optflow/test/test_OF_accuracy.cpp
+++
b/modules/optflow/test/test_OF_accuracy.cpp @@ -83,7 +83,20 @@ static float calcRMSE(Mat flow1, Mat flow2) } return (float)sqrt(sum / (1e-9 + counter)); } - +static float calcRMSE(vector prevPts, vector currPts, Mat flow) +{ + vector ee; + for (unsigned int n = 0; n < prevPts.size(); n++) + { + Point2f gtFlow = flow.at(prevPts[n]); + if (isFlowCorrect(gtFlow.x) && isFlowCorrect(gtFlow.y)) + { + Point2f diffFlow = (currPts[n] - prevPts[n]) - gtFlow; + ee.push_back(sqrt(diffFlow.x * diffFlow.x + diffFlow.y * diffFlow.y)); + } + } + return static_cast(mean(ee).val[0]); +} static float calcAvgEPE(vector< pair > corr, Mat flow) { double sum = 0; @@ -111,9 +124,13 @@ static float calcAvgEPE(vector< pair > corr, Mat flow) bool readRubberWhale(Mat &dst_frame_1, Mat &dst_frame_2, Mat &dst_GT) { - const string frame1_path = getRubberWhaleFrame1(); - const string frame2_path = getRubberWhaleFrame2(); - const string gt_flow_path = getRubberWhaleGroundTruth(); + string frame1_path = getRubberWhaleFrame1(); + string frame2_path = getRubberWhaleFrame2(); + string gt_flow_path = getRubberWhaleGroundTruth(); + // removing space may be an issue on windows machines + frame1_path.erase(std::remove_if(frame1_path.begin(), frame1_path.end(), isspace), frame1_path.end()); + frame2_path.erase(std::remove_if(frame2_path.begin(), frame2_path.end(), isspace), frame2_path.end()); + gt_flow_path.erase(std::remove_if(gt_flow_path.begin(), gt_flow_path.end(), isspace), gt_flow_path.end()); dst_frame_1 = imread(frame1_path); dst_frame_2 = imread(frame2_path); @@ -125,6 +142,7 @@ bool readRubberWhale(Mat &dst_frame_1, Mat &dst_frame_2, Mat &dst_GT) return true; } + TEST(DenseOpticalFlow_SimpleFlow, ReferenceAccuracy) { Mat frame1, frame2, GT; @@ -157,6 +175,102 @@ TEST(DenseOpticalFlow_DeepFlow, ReferenceAccuracy) EXPECT_LE(calcRMSE(GT, flow), target_RMSE); } +TEST(SparseOpticalFlow, ReferenceAccuracy) +{ + // with the following test each invoker class should be tested once + Mat frame1, frame2, GT; + ASSERT_TRUE(readRubberWhale(frame1, frame2, GT)); + vector prevPts, currPts; + for (int r = 0; r < frame1.rows; r+=10) + { + for (int c = 0; c < frame1.cols; c+=10) + { + prevPts.push_back(Point2f(static_cast(c), static_cast(r))); + } + } + vector status(prevPts.size()); + vector err(prevPts.size()); + Ptr algo = SparseRLOFOpticalFlow::create(); + algo->setForwardBackward(0.0f); + Ptr param = Ptr(new RLOFOpticalFlowParameter); + param->supportRegionType = SR_CROSS; + param->useIlluminationModel = true; + param->solverType = ST_BILINEAR; + algo->setRLOFOpticalFlowParameter(param); + algo->calc(frame1, frame2, prevPts, currPts, status, err); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.3f); + + param->solverType = ST_STANDART; + algo->setRLOFOpticalFlowParameter(param); + algo->calc(frame1, frame2, prevPts, currPts, status, err); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.34f); + + param->useIlluminationModel = false; + param->solverType = ST_BILINEAR; + algo->setRLOFOpticalFlowParameter(param); + algo->calc(frame1, frame2, prevPts, currPts, status, err); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.27f); + + param->solverType = ST_STANDART; + algo->setRLOFOpticalFlowParameter(param); + algo->calc(frame1, frame2, prevPts, currPts, status, err); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.27f); + + param->normSigma0 = numeric_limits::max(); + param->normSigma1 = numeric_limits::max(); + param->useIlluminationModel = true; + + param->solverType = ST_BILINEAR; + algo->setRLOFOpticalFlowParameter(param); + 
algo->calc(frame1, frame2, prevPts, currPts, status, err); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.28f); + + param->solverType = ST_STANDART; + algo->setRLOFOpticalFlowParameter(param); + algo->calc(frame1, frame2, prevPts, currPts, status, err); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.28f); + + param->useIlluminationModel = false; + + param->solverType = ST_BILINEAR; + algo->setRLOFOpticalFlowParameter(param); + algo->calc(frame1, frame2, prevPts, currPts, status, err); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.80f); + + param->solverType = ST_STANDART; + algo->setRLOFOpticalFlowParameter(param); + algo->calc(frame1, frame2, prevPts, currPts, status, err); + EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.28f); +} + +TEST(DenseOpticalFlow_RLOF, ReferenceAccuracy) +{ + Mat frame1, frame2, GT; + ASSERT_TRUE(readRubberWhale(frame1, frame2, GT)); + Mat flow; + Ptr algo = DenseRLOFOpticalFlow::create(); + Ptr param = Ptr(new RLOFOpticalFlowParameter); + param->supportRegionType = SR_CROSS; + param->solverType = ST_BILINEAR; + algo->setRLOFOpticalFlowParameter(param); + algo->setForwardBackward(1.0f); + algo->setGridStep(cv::Size(4, 4)); + algo->setInterpolation(INTERP_EPIC); + algo->calc(frame1, frame2, flow); + + ASSERT_EQ(GT.rows, flow.rows); + ASSERT_EQ(GT.cols, flow.cols); + EXPECT_LE(calcRMSE(GT, flow), 0.44f); + + algo->setInterpolation(INTERP_GEO); + algo->calc(frame1, frame2, flow); + + ASSERT_EQ(GT.rows, flow.rows); + ASSERT_EQ(GT.cols, flow.cols); + EXPECT_LE(calcRMSE(GT, flow), 0.55f); + +} + TEST(DenseOpticalFlow_SparseToDenseFlow, ReferenceAccuracy) { Mat frame1, frame2, GT;