opencv
diff --git a/modules/cudaimgproc/src/cuda/build_point_list.cu b/modules/cudaimgproc/src/cuda/build_point_list.cu
+7 -13
diff --git a/modules/cudaimgproc/src/cuda/hough_circles.cu b/modules/cudaimgproc/src/cuda/hough_circles.cu
+18 -28
diff --git a/modules/cudaimgproc/src/cuda/hough_lines.cu b/modules/cudaimgproc/src/cuda/hough_lines.cu
+14 -19
diff --git a/modules/cudaimgproc/src/cuda/hough_segments.cu b/modules/cudaimgproc/src/cuda/hough_segments.cu
+12 -15
@@ -49,10 +49,8 @@ namespace cv { namespace cuda { namespace device
 {
     namespace hough
     {
-        __device__ int g_counter;
-
         template <int PIXELS_PER_THREAD>
-        __global__ void buildPointList(const PtrStepSzb src, unsigned int* list)
+        __global__ void buildPointList(const PtrStepSzb src, unsigned int* list, int* counterPtr)
         {
             __shared__ unsigned int s_queues[4][32 * PIXELS_PER_THREAD];
             __shared__ int s_qsize[4];
@@ -94,7 +92,7 @@ namespace cv { namespace cuda { namespace device
                 }
 
                 // calculate the offset in the global list
-                const int globalOffset = atomicAdd(&g_counter, totalSize);
+                const int globalOffset = atomicAdd(counterPtr, totalSize);
                 for (int i = 0; i < blockDim.y; ++i)
                     s_globStart[i] += globalOffset;
             }
@@ -108,27 +106,23 @@ namespace cv { namespace cuda { namespace device
                 list[gidx] = s_queues[threadIdx.y][i];
         }
 
-        int buildPointList_gpu(PtrStepSzb src, unsigned int* list)
+        int buildPointList_gpu(PtrStepSzb src, unsigned int* list, int* counterPtr, cudaStream_t stream)
         {
             const int PIXELS_PER_THREAD = 16;
 
-            void* counterPtr;
-            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
-
-            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );
+            cudaSafeCall( cudaMemsetAsync(counterPtr, 0, sizeof(int), stream) );
 
             const dim3 block(32, 4);
             const dim3 grid(divUp(src.cols, block.x * PIXELS_PER_THREAD), divUp(src.rows, block.y));
 
             cudaSafeCall( cudaFuncSetCacheConfig(buildPointList<PIXELS_PER_THREAD>, cudaFuncCachePreferShared) );
 
-            buildPointList<PIXELS_PER_THREAD><<<grid, block>>>(src, list);
+            buildPointList<PIXELS_PER_THREAD><<<grid, block, 0, stream>>>(src, list, counterPtr);
             cudaSafeCall( cudaGetLastError() );
 
-            cudaSafeCall( cudaDeviceSynchronize() );
-
             int totalCount;
-            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
+            cudaSafeCall( cudaMemcpyAsync(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost, stream) );
+            cudaSafeCall( cudaStreamSynchronize(stream) );
 
             return totalCount;
         }
 
@@ -54,8 +54,6 @@ namespace cv { namespace cuda { namespace device
 {
     namespace hough_circles
     {
-        __device__ int g_counter;
-
         ////////////////////////////////////////////////////////////////////////
         // circlesAccumCenters
 
@@ -111,23 +109,22 @@ namespace cv { namespace cuda { namespace device
             }
         }
 
-        void circlesAccumCenters_gpu(const unsigned int* list, int count, PtrStepi dx, PtrStepi dy, PtrStepSzi accum, int minRadius, int maxRadius, float idp)
+        void circlesAccumCenters_gpu(const unsigned int* list, int count, PtrStepi dx, PtrStepi dy, PtrStepSzi accum, int minRadius, int maxRadius, float idp, cudaStream_t stream)
         {
             const dim3 block(256);
             const dim3 grid(divUp(count, block.x));
 
             cudaSafeCall( cudaFuncSetCacheConfig(circlesAccumCenters, cudaFuncCachePreferL1) );
 
-            circlesAccumCenters<<<grid, block>>>(list, count, dx, dy, accum, accum.cols - 2, accum.rows - 2, minRadius, maxRadius, idp);
+            circlesAccumCenters<<<grid, block, 0, stream>>>(list, count, dx, dy, accum, accum.cols - 2, accum.rows - 2, minRadius, maxRadius, idp);
             cudaSafeCall( cudaGetLastError() );
 
-            cudaSafeCall( cudaDeviceSynchronize() );
+            cudaSafeCall( cudaStreamSynchronize(stream) );
         }
 
         ////////////////////////////////////////////////////////////////////////
         // buildCentersList
-
-        __global__ void buildCentersList(const PtrStepSzi accum, unsigned int* centers, const int threshold)
+        __global__ void buildCentersList(const PtrStepSzi accum, unsigned int* centers, const int threshold, int* counterPtr)
         {
             const int x = blockIdx.x * blockDim.x + threadIdx.x;
             const int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -145,31 +142,27 @@ namespace cv { namespace cuda { namespace device
                 if (cur > threshold && cur > top && cur >= bottom && cur >  left && cur >= right)
                 {
                     const unsigned int val = (y << 16) | x;
-                    const int idx = ::atomicAdd(&g_counter, 1);
+                    const int idx = ::atomicAdd(counterPtr, 1);
                     centers[idx] = val;
                 }
             }
         }
 
-        int buildCentersList_gpu(PtrStepSzi accum, unsigned int* centers, int threshold)
+        int buildCentersList_gpu(PtrStepSzi accum, unsigned int* centers, int threshold, int* counterPtr, cudaStream_t stream)
         {
-            void* counterPtr;
-            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
-
-            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );
+            cudaSafeCall( cudaMemsetAsync(counterPtr, 0, sizeof(int), stream) );
 
             const dim3 block(32, 8);
             const dim3 grid(divUp(accum.cols - 2, block.x), divUp(accum.rows - 2, block.y));
 
             cudaSafeCall( cudaFuncSetCacheConfig(buildCentersList, cudaFuncCachePreferL1) );
 
-            buildCentersList<<<grid, block>>>(accum, centers, threshold);
+            buildCentersList<<<grid, block, 0, stream>>>(accum, centers, threshold, counterPtr);
             cudaSafeCall( cudaGetLastError() );
 
-            cudaSafeCall( cudaDeviceSynchronize() );
-
             int totalCount;
-            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
+            cudaSafeCall( cudaMemcpyAsync(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost, stream) );
+            cudaSafeCall( cudaStreamSynchronize(stream) );
 
             return totalCount;
         }
@@ -179,7 +172,8 @@ namespace cv { namespace cuda { namespace device
 
         __global__ void circlesAccumRadius(const unsigned int* centers, const unsigned int* list, const int count,
                                            float3* circles, const int maxCircles, const float dp,
-                                           const int minRadius, const int maxRadius, const int histSize, const int threshold)
+                                           const int minRadius, const int maxRadius, const int histSize, const int threshold,
+                                           int* counterPtr)
         {
             int* smem = DynamicSharedMem<int>();
 
@@ -219,34 +213,30 @@ namespace cv { namespace cuda { namespace device
 
                 if (curVotes >= threshold && curVotes > smem[i] && curVotes >= smem[i + 2])
                 {
-                    const int ind = ::atomicAdd(&g_counter, 1);
+                    const int ind = ::atomicAdd(counterPtr, 1);
                     if (ind < maxCircles)
                         circles[ind] = make_float3(cx, cy, i + minRadius);
                 }
             }
         }
 
         int circlesAccumRadius_gpu(const unsigned int* centers, int centersCount, const unsigned int* list, int count,
-                                   float3* circles, int maxCircles, float dp, int minRadius, int maxRadius, int threshold, bool has20)
+                                   float3* circles, int maxCircles, float dp, int minRadius, int maxRadius, int threshold, bool has20, int* counterPtr, cudaStream_t stream)
         {
-            void* counterPtr;
-            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
-
-            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );
+            cudaSafeCall( cudaMemsetAsync(counterPtr, 0, sizeof(int), stream) );
 
             const dim3 block(has20 ? 1024 : 512);
             const dim3 grid(centersCount);
 
             const int histSize = maxRadius - minRadius + 1;
             size_t smemSize = (histSize + 2) * sizeof(int);
 
-            circlesAccumRadius<<<grid, block, smemSize>>>(centers, list, count, circles, maxCircles, dp, minRadius, maxRadius, histSize, threshold);
+            circlesAccumRadius<<<grid, block, smemSize, stream>>>(centers, list, count, circles, maxCircles, dp, minRadius, maxRadius, histSize, threshold, counterPtr);
             cudaSafeCall( cudaGetLastError() );
 
-            cudaSafeCall( cudaDeviceSynchronize() );
-
             int totalCount;
-            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
+            cudaSafeCall( cudaMemcpyAsync(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost, stream) );
+            cudaSafeCall( cudaStreamSynchronize(stream) );
 
             totalCount = ::min(totalCount, maxCircles);
 
 
@@ -44,6 +44,7 @@
 
 #include <thrust/device_ptr.h>
 #include <thrust/sort.h>
+#include <thrust/system/cuda/execution_policy.h>
 
 #include "opencv2/core/cuda/common.hpp"
 #include "opencv2/core/cuda/emulation.hpp"
@@ -53,8 +54,6 @@ namespace cv { namespace cuda { namespace device
 {
     namespace hough_lines
     {
-        __device__ int g_counter;
-
         ////////////////////////////////////////////////////////////////////////
         // linesAccum
 
@@ -126,27 +125,26 @@ namespace cv { namespace cuda { namespace device
                 accumRow[i] = smem[i];
         }
 
-        void linesAccum_gpu(const unsigned int* list, int count, PtrStepSzi accum, float rho, float theta, size_t sharedMemPerBlock, bool has20)
+        void linesAccum_gpu(const unsigned int* list, int count, PtrStepSzi accum, float rho, float theta, size_t sharedMemPerBlock, bool has20, cudaStream_t stream)
         {
             const dim3 block(has20 ? 1024 : 512);
             const dim3 grid(accum.rows - 2);
 
             size_t smemSize = (accum.cols - 1) * sizeof(int);
 
             if (smemSize < sharedMemPerBlock - 1000)
-                linesAccumShared<<<grid, block, smemSize>>>(list, count, accum, 1.0f / rho, theta, accum.cols - 2);
+                linesAccumShared<<<grid, block, smemSize, stream>>>(list, count, accum, 1.0f / rho, theta, accum.cols - 2);
             else
-                linesAccumGlobal<<<grid, block>>>(list, count, accum, 1.0f / rho, theta, accum.cols - 2);
+                linesAccumGlobal<<<grid, block, 0, stream>>>(list, count, accum, 1.0f / rho, theta, accum.cols - 2);
 
             cudaSafeCall( cudaGetLastError() );
-
-            cudaSafeCall( cudaDeviceSynchronize() );
+            cudaSafeCall( cudaStreamSynchronize(stream) );
         }
 
         ////////////////////////////////////////////////////////////////////////
         // linesGetResult
 
-        __global__ void linesGetResult(const PtrStepSzi accum, float2* out, int* votes, const int maxSize, const float rho, const float theta, const int threshold, const int numrho)
+        __global__ void linesGetResult(const PtrStepSzi accum, float2* out, int* votes, const int maxSize, const float rho, const float theta, const int threshold, const int numrho, int* counterPtr)
         {
             const int r = blockIdx.x * blockDim.x + threadIdx.x;
             const int n = blockIdx.y * blockDim.y + threadIdx.y;
@@ -165,7 +163,7 @@ namespace cv { namespace cuda { namespace device
                 const float radius = (r - (numrho - 1) * 0.5f) * rho;
                 const float angle = n * theta;
 
-                const int ind = ::atomicAdd(&g_counter, 1);
+                const int ind = ::atomicAdd(counterPtr, 1);
                 if (ind < maxSize)
                 {
                     out[ind] = make_float2(radius, angle);
@@ -174,33 +172,30 @@ namespace cv { namespace cuda { namespace device
             }
         }
 
-        int linesGetResult_gpu(PtrStepSzi accum, float2* out, int* votes, int maxSize, float rho, float theta, int threshold, bool doSort)
+        int linesGetResult_gpu(PtrStepSzi accum, float2* out, int* votes, int maxSize, float rho, float theta, int threshold, bool doSort, int* counterPtr, cudaStream_t stream)
         {
-            void* counterPtr;
-            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
-
-            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );
+            cudaSafeCall( cudaMemsetAsync(counterPtr, 0, sizeof(int), stream) );
 
             const dim3 block(32, 8);
             const dim3 grid(divUp(accum.cols - 2, block.x), divUp(accum.rows - 2, block.y));
 
             cudaSafeCall( cudaFuncSetCacheConfig(linesGetResult, cudaFuncCachePreferL1) );
 
-            linesGetResult<<<grid, block>>>(accum, out, votes, maxSize, rho, theta, threshold, accum.cols - 2);
+            linesGetResult<<<grid, block, 0, stream>>>(accum, out, votes, maxSize, rho, theta, threshold, accum.cols - 2, counterPtr);
             cudaSafeCall( cudaGetLastError() );
 
-            cudaSafeCall( cudaDeviceSynchronize() );
-
             int totalCount;
-            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
+            cudaSafeCall( cudaMemcpyAsync(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost, stream) );
+
+            cudaSafeCall( cudaStreamSynchronize(stream) );
 
             totalCount = ::min(totalCount, maxSize);
 
             if (doSort && totalCount > 0)
             {
                 thrust::device_ptr<float2> outPtr(out);
                 thrust::device_ptr<int> votesPtr(votes);
-                thrust::sort_by_key(votesPtr, votesPtr + totalCount, outPtr, thrust::greater<int>());
+                thrust::sort_by_key(thrust::cuda::par.on(stream), votesPtr, votesPtr + totalCount, outPtr, thrust::greater<int>());
             }
 
             return totalCount;
 
@@ -49,15 +49,14 @@ namespace cv { namespace cuda { namespace device
 {
     namespace hough_segments
     {
-        __device__ int g_counter;
-
         texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_mask(false, cudaFilterModePoint, cudaAddressModeClamp);
 
         __global__ void houghLinesProbabilistic(const PtrStepSzi accum,
                                                 int4* out, const int maxSize,
                                                 const float rho, const float theta,
                                                 const int lineGap, const int lineLength,
-                                                const int rows, const int cols)
+                                                const int rows, const int cols,
+                                                int* counterPtr)
         {
             const int r = blockIdx.x * blockDim.x + threadIdx.x;
             const int n = blockIdx.y * blockDim.y + threadIdx.y;
@@ -182,7 +181,7 @@ namespace cv { namespace cuda { namespace device
 
                             if (good_line)
                             {
-                                const int ind = ::atomicAdd(&g_counter, 1);
+                                const int ind = ::atomicAdd(counterPtr, 1);
                                 if (ind < maxSize)
                                     out[ind] = make_int4(line_end[0].x, line_end[0].y, line_end[1].x, line_end[1].y);
                             }
@@ -202,7 +201,7 @@ namespace cv { namespace cuda { namespace device
 
                             if (good_line)
                             {
-                                const int ind = ::atomicAdd(&g_counter, 1);
+                                const int ind = ::atomicAdd(counterPtr, 1);
                                 if (ind < maxSize)
                                     out[ind] = make_int4(line_end[0].x, line_end[0].y, line_end[1].x, line_end[1].y);
                             }
@@ -214,29 +213,27 @@ namespace cv { namespace cuda { namespace device
             }
         }
 
-        int houghLinesProbabilistic_gpu(PtrStepSzb mask, PtrStepSzi accum, int4* out, int maxSize, float rho, float theta, int lineGap, int lineLength)
+        int houghLinesProbabilistic_gpu(PtrStepSzb mask, PtrStepSzi accum, int4* out, int maxSize, float rho, float theta, int lineGap, int lineLength, int* counterPtr, cudaStream_t stream)
         {
-            void* counterPtr;
-            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
-
-            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );
+            cudaSafeCall( cudaMemsetAsync(counterPtr, 0, sizeof(int), stream) );
 
             const dim3 block(32, 8);
             const dim3 grid(divUp(accum.cols - 2, block.x), divUp(accum.rows - 2, block.y));
 
             bindTexture(&tex_mask, mask);
 
-            houghLinesProbabilistic<<<grid, block>>>(accum,
+            houghLinesProbabilistic<<<grid, block, 0, stream>>>(accum,
                                                      out, maxSize,
                                                      rho, theta,
                                                      lineGap, lineLength,
-                                                     mask.rows, mask.cols);
+                                                     mask.rows, mask.cols,
+                                                     counterPtr);
             cudaSafeCall( cudaGetLastError() );
 
-            cudaSafeCall( cudaDeviceSynchronize() );
-
             int totalCount;
-            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
+            cudaSafeCall( cudaMemcpyAsync(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost, stream) );
+
+            cudaSafeCall( cudaStreamSynchronize(stream) );
 
             totalCount = ::min(totalCount, maxSize);
Original file line number	Diff line number	Diff line change
`@@ -49,10 +49,8 @@ namespace cv { namespace cuda { namespace device`
`49`	`49`	`{`
`50`	`50`	`namespace hough`
`51`	`51`	`{`
`52`		`- __device__ int g_counter;`
`53`		`-`
`54`	`52`	`template <int PIXELS_PER_THREAD>`
`55`		`- __global__ void buildPointList(const PtrStepSzb src, unsigned int* list)`
	`53`	`+ __global__ void buildPointList(const PtrStepSzb src, unsigned int* list, int* counterPtr)`
`56`	`54`	`{`
`57`	`55`	`__shared__ unsigned int s_queues[4][32 * PIXELS_PER_THREAD];`
`58`	`56`	`__shared__ int s_qsize[4];`
`@@ -94,7 +92,7 @@ namespace cv { namespace cuda { namespace device`
`94`	`92`	`}`
`95`	`93`
`96`	`94`	`// calculate the offset in the global list`
`97`		`- const int globalOffset = atomicAdd(&g_counter, totalSize);`
	`95`	`+ const int globalOffset = atomicAdd(counterPtr, totalSize);`
`98`	`96`	`for (int i = 0; i < blockDim.y; ++i)`
`99`	`97`	`s_globStart[i] += globalOffset;`
`100`	`98`	`}`
`@@ -108,27 +106,23 @@ namespace cv { namespace cuda { namespace device`
`108`	`106`	`list[gidx] = s_queues[threadIdx.y][i];`
`109`	`107`	`}`
`110`	`108`
`111`		`- int buildPointList_gpu(PtrStepSzb src, unsigned int* list)`
	`109`	`+ int buildPointList_gpu(PtrStepSzb src, unsigned int* list, int* counterPtr, cudaStream_t stream)`
`112`	`110`	`{`
`113`	`111`	`const int PIXELS_PER_THREAD = 16;`
`114`	`112`
`115`		`- void* counterPtr;`
`116`		`- cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );`
`117`		`-`
`118`		`- cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );`
	`113`	`+ cudaSafeCall( cudaMemsetAsync(counterPtr, 0, sizeof(int), stream) );`
`119`	`114`
`120`	`115`	`const dim3 block(32, 4);`
`121`	`116`	`const dim3 grid(divUp(src.cols, block.x * PIXELS_PER_THREAD), divUp(src.rows, block.y));`
`122`	`117`
`123`	`118`	`cudaSafeCall( cudaFuncSetCacheConfig(buildPointList<PIXELS_PER_THREAD>, cudaFuncCachePreferShared) );`
`124`	`119`
`125`		`- buildPointList<PIXELS_PER_THREAD><<<grid, block>>>(src, list);`
	`120`	`+ buildPointList<PIXELS_PER_THREAD><<<grid, block, 0, stream>>>(src, list, counterPtr);`
`126`	`121`	`cudaSafeCall( cudaGetLastError() );`
`127`	`122`
`128`		`- cudaSafeCall( cudaDeviceSynchronize() );`
`129`		`-`
`130`	`123`	`int totalCount;`
`131`		`- cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );`
	`124`	`+ cudaSafeCall( cudaMemcpyAsync(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost, stream) );`
	`125`	`+ cudaSafeCall( cudaStreamSynchronize(stream) );`
`132`	`126`
`133`	`127`	`return totalCount;`
`134`	`128`	`}`