diff --git a/modules/cudaimgproc/src/cuda/gftt.cu b/modules/cudaimgproc/src/cuda/gftt.cu index ab8713f868a..66bd6e0dbc2 100644 --- a/modules/cudaimgproc/src/cuda/gftt.cu +++ b/modules/cudaimgproc/src/cuda/gftt.cu @@ -52,37 +52,33 @@ namespace cv { namespace cuda { namespace device { namespace gfft { - texture eigTex(0, cudaFilterModePoint, cudaAddressModeClamp); - - __device__ int g_counter = 0; - - template __global__ void findCorners(float threshold, const Mask mask, float2* corners, int max_count, int rows, int cols) + template __global__ void findCorners(float threshold, const Mask mask, float2* corners, int max_count, int rows, int cols, cudaTextureObject_t eigTex, int *g_counter) { const int j = blockIdx.x * blockDim.x + threadIdx.x; const int i = blockIdx.y * blockDim.y + threadIdx.y; if (i > 0 && i < rows - 1 && j > 0 && j < cols - 1 && mask(i, j)) { - float val = tex2D(eigTex, j, i); + float val = tex2D(eigTex, j, i); if (val > threshold) { float maxVal = val; - maxVal = ::fmax(tex2D(eigTex, j - 1, i - 1), maxVal); - maxVal = ::fmax(tex2D(eigTex, j , i - 1), maxVal); - maxVal = ::fmax(tex2D(eigTex, j + 1, i - 1), maxVal); + maxVal = ::fmax(tex2D(eigTex, j - 1, i - 1), maxVal); + maxVal = ::fmax(tex2D(eigTex, j , i - 1), maxVal); + maxVal = ::fmax(tex2D(eigTex, j + 1, i - 1), maxVal); - maxVal = ::fmax(tex2D(eigTex, j - 1, i), maxVal); - maxVal = ::fmax(tex2D(eigTex, j + 1, i), maxVal); + maxVal = ::fmax(tex2D(eigTex, j - 1, i), maxVal); + maxVal = ::fmax(tex2D(eigTex, j + 1, i), maxVal); - maxVal = ::fmax(tex2D(eigTex, j - 1, i + 1), maxVal); - maxVal = ::fmax(tex2D(eigTex, j , i + 1), maxVal); - maxVal = ::fmax(tex2D(eigTex, j + 1, i + 1), maxVal); + maxVal = ::fmax(tex2D(eigTex, j - 1, i + 1), maxVal); + maxVal = ::fmax(tex2D(eigTex, j , i + 1), maxVal); + maxVal = ::fmax(tex2D(eigTex, j + 1, i + 1), maxVal); if (val == maxVal) { - const int ind = ::atomicAdd(&g_counter, 1); + const int ind = ::atomicAdd(g_counter, 1); if (ind < max_count) corners[ind] = make_float2(j, i); @@ -91,22 +87,20 @@ namespace cv { namespace cuda { namespace device } } - int findCorners_gpu(PtrStepSzf eig, float threshold, PtrStepSzb mask, float2* corners, int max_count, cudaStream_t stream) + int findCorners_gpu(const cudaTextureObject_t &eigTex, const int &rows, const int &cols, float threshold, PtrStepSzb mask, float2* corners, int max_count, cudaStream_t stream) { - void* counter_ptr; - cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) ); + int* counter_ptr; + cudaSafeCall( cudaMalloc(&counter_ptr, sizeof(int)) ); cudaSafeCall( cudaMemsetAsync(counter_ptr, 0, sizeof(int), stream) ); - bindTexture(&eigTex, eig); - dim3 block(16, 16); - dim3 grid(divUp(eig.cols, block.x), divUp(eig.rows, block.y)); + dim3 grid(divUp(cols, block.x), divUp(rows, block.y)); if (mask.data) - findCorners<<>>(threshold, SingleMask(mask), corners, max_count, eig.rows, eig.cols); + findCorners<<>>(threshold, SingleMask(mask), corners, max_count, rows, cols, eigTex, counter_ptr); else - findCorners<<>>(threshold, WithOutMask(), corners, max_count, eig.rows, eig.cols); + findCorners<<>>(threshold, WithOutMask(), corners, max_count, rows, cols, eigTex, counter_ptr); cudaSafeCall( cudaGetLastError() ); @@ -122,25 +116,27 @@ namespace cv { namespace cuda { namespace device class EigGreater { public: + EigGreater(const cudaTextureObject_t &eigTex_) : eigTex(eigTex_) + { + } __device__ __forceinline__ bool operator()(float2 a, float2 b) const { - return tex2D(eigTex, a.x, a.y) > tex2D(eigTex, b.x, b.y); + return tex2D(eigTex, a.x, a.y) > tex2D(eigTex, b.x, b.y); } - }; + cudaTextureObject_t eigTex; + }; - void sortCorners_gpu(PtrStepSzf eig, float2* corners, int count, cudaStream_t stream) + void sortCorners_gpu(const cudaTextureObject_t &eigTex, float2* corners, int count, cudaStream_t stream) { - bindTexture(&eigTex, eig); - thrust::device_ptr ptr(corners); #if THRUST_VERSION >= 100802 if (stream) - thrust::sort(thrust::cuda::par(ThrustAllocator::getAllocator()).on(stream), ptr, ptr + count, EigGreater()); + thrust::sort(thrust::cuda::par(ThrustAllocator::getAllocator()).on(stream), ptr, ptr + count, EigGreater(eigTex)); else - thrust::sort(thrust::cuda::par(ThrustAllocator::getAllocator()), ptr, ptr + count, EigGreater()); + thrust::sort(thrust::cuda::par(ThrustAllocator::getAllocator()), ptr, ptr + count, EigGreater(eigTex)); #else - thrust::sort(ptr, ptr + count, EigGreater()); + thrust::sort(ptr, ptr + count, EigGreater(eigTex)); #endif } } // namespace optical_flow diff --git a/modules/cudaimgproc/src/gftt.cpp b/modules/cudaimgproc/src/gftt.cpp index bf5d01b1174..f25158a68d8 100644 --- a/modules/cudaimgproc/src/gftt.cpp +++ b/modules/cudaimgproc/src/gftt.cpp @@ -55,8 +55,8 @@ namespace cv { namespace cuda { namespace device { namespace gfft { - int findCorners_gpu(PtrStepSzf eig, float threshold, PtrStepSzb mask, float2* corners, int max_count, cudaStream_t stream); - void sortCorners_gpu(PtrStepSzf eig, float2* corners, int count, cudaStream_t stream); + int findCorners_gpu(const cudaTextureObject_t &eigTex_, const int &rows, const int &cols, float threshold, PtrStepSzb mask, float2* corners, int max_count, cudaStream_t stream); + void sortCorners_gpu(const cudaTextureObject_t &eigTex_, float2* corners, int count, cudaStream_t stream); } }}} @@ -112,7 +112,21 @@ namespace cudaStream_t stream_ = StreamAccessor::getStream(stream); ensureSizeIsEnough(1, std::max(1000, static_cast(image.size().area() * 0.05)), CV_32FC2, tmpCorners_); - int total = findCorners_gpu(eig_, static_cast(maxVal * qualityLevel_), mask, tmpCorners_.ptr(), tmpCorners_.cols, stream_); + //create texture object for findCorners_gpu and sortCorners_gpu + cudaTextureDesc texDesc; + memset(&texDesc, 0, sizeof(texDesc)); + texDesc.readMode = cudaReadModeElementType; + texDesc.filterMode = cudaFilterModePoint; + texDesc.addressMode[0] = cudaAddressModeClamp; + texDesc.addressMode[1] = cudaAddressModeClamp; + texDesc.addressMode[2] = cudaAddressModeClamp; + + cudaTextureObject_t eigTex_; + PtrStepSzf eig = eig_; + cv::cuda::device::createTextureObjectPitch2D(&eigTex_, eig, texDesc); + + int total = findCorners_gpu(eigTex_, eig_.rows, eig_.cols, static_cast(maxVal * qualityLevel_), mask, tmpCorners_.ptr(), tmpCorners_.cols, stream_); + if (total == 0) { @@ -120,7 +134,7 @@ namespace return; } - sortCorners_gpu(eig_, tmpCorners_.ptr(), total, stream_); + sortCorners_gpu(eigTex_, tmpCorners_.ptr(), total, stream_); if (minDistance_ < 1) {