@@ -54,8 +54,6 @@ namespace cv { namespace cuda { namespace device
54
54
{
55
55
namespace hough_circles
56
56
{
57
- __device__ int g_counter;
58
-
59
57
// //////////////////////////////////////////////////////////////////////
60
58
// circlesAccumCenters
61
59
@@ -111,23 +109,22 @@ namespace cv { namespace cuda { namespace device
111
109
}
112
110
}
113
111
114
- void circlesAccumCenters_gpu (const unsigned int * list, int count, PtrStepi dx, PtrStepi dy, PtrStepSzi accum, int minRadius, int maxRadius, float idp)
// Host-side launcher for the circlesAccumCenters kernel.
// NOTE(review): this text is a diff rendering: '-' lines are the pre-change code and
// '+' lines the post-change code. The change adds a `stream` parameter and uses it for
// both the kernel launch and the final synchronization.
// The kernel is launched with 1-D blocks of 256 threads and divUp(count, 256) blocks, and
// is handed accum.cols - 2 / accum.rows - 2 as extents (presumably to leave a one-element
// border of the accumulator untouched; confirm against the kernel body).
// The call blocks the host until the work has finished.
112
+ void circlesAccumCenters_gpu (const unsigned int * list, int count, PtrStepi dx, PtrStepi dy, PtrStepSzi accum, int minRadius, int maxRadius, float idp, cudaStream_t stream )
115
113
{
116
114
const dim3 block (256 );
// NOTE(review): if divUp(count, block.x) evaluates to 0 for count == 0, the launch below
// is an invalid configuration; confirm that callers never pass an empty list.
117
115
const dim3 grid (divUp (count, block.x ));
118
116
// Request the L1-preferring cache configuration for this kernel before launching it.
119
117
cudaSafeCall ( cudaFuncSetCacheConfig (circlesAccumCenters, cudaFuncCachePreferL1) );
120
118
121
- circlesAccumCenters<<<grid, block>>> (list, count, dx, dy, accum, accum.cols - 2 , accum.rows - 2 , minRadius, maxRadius, idp);
// '+' version: no dynamic shared memory (third launch argument is 0), enqueued on `stream`.
119
+ circlesAccumCenters<<<grid, block, 0 , stream >>> (list, count, dx, dy, accum, accum.cols - 2 , accum.rows - 2 , minRadius, maxRadius, idp);
// Launch errors are not returned by the <<<>>> syntax, so query them explicitly.
122
120
cudaSafeCall ( cudaGetLastError () );
123
121
124
- cudaSafeCall ( cudaDeviceSynchronize ( ) );
// '+' version waits only for `stream` instead of the whole device.
122
+ cudaSafeCall ( cudaStreamSynchronize (stream ) );
125
123
}
126
124
127
125
// //////////////////////////////////////////////////////////////////////
128
126
// buildCentersList
129
-
130
- __global__ void buildCentersList (const PtrStepSzi accum, unsigned int * centers, const int threshold)
// Kernel that appends packed (y, x) coordinates to `centers` for accumulator cells passing
// the test below. In the '+' version the output counter is passed in as `counterPtr`
// rather than being the file-level g_counter symbol used by the '-' version.
// NOTE(review): part of the body is elided by the diff hunk boundary, so only the visible
// lines are described here.
127
+ __global__ void buildCentersList (const PtrStepSzi accum, unsigned int * centers, const int threshold, int * counterPtr)
131
128
{
// Global 2-D coordinates of this thread within the launch grid.
132
129
const int x = blockIdx .x * blockDim .x + threadIdx .x ;
133
130
const int y = blockIdx .y * blockDim .y + threadIdx .y ;
@@ -145,31 +142,27 @@ namespace cv { namespace cuda { namespace device
// Keep `cur` only when it exceeds the threshold, is strictly greater than `top` and
// `left`, and is greater than or equal to `bottom` and `right`.
145
142
if (cur > threshold && cur > top && cur >= bottom && cur > left && cur >= right)
146
143
{
// Pack the coordinates into one word: y shifted left by 16 bits, OR-ed with x.
147
144
const unsigned int val = (y << 16 ) | x;
148
- const int idx = ::atomicAdd (&g_counter , 1 );
// Atomically reserve the next output slot; atomicAdd returns the pre-increment value.
// NOTE(review): idx is not checked against the capacity of `centers` in the visible
// lines; confirm the buffer is sized for the worst case.
145
+ const int idx = ::atomicAdd (counterPtr , 1 );
149
146
centers[idx] = val;
150
147
}
151
148
}
152
149
}
153
150
154
- int buildCentersList_gpu (PtrStepSzi accum, unsigned int * centers, int threshold)
// Host-side launcher for the buildCentersList kernel; returns the value of the device
// counter after the kernel has run.
// NOTE(review): this text is a diff rendering: '-' lines are the pre-change code and
// '+' lines the post-change code. The '-' version looked up the address of the g_counter
// symbol; the '+' version receives `counterPtr` and `stream` from the caller.
// `counterPtr` is zeroed here with a CUDA memset and atomically incremented by the kernel,
// so it must refer to memory the device can write.
// NOTE(review): the returned count is the raw counter value and is not clamped here to
// the capacity of `centers`; confirm the caller handles that.
151
+ int buildCentersList_gpu (PtrStepSzi accum, unsigned int * centers, int threshold, int * counterPtr, cudaStream_t stream )
155
152
{
156
- void * counterPtr;
157
- cudaSafeCall ( cudaGetSymbolAddress (&counterPtr, g_counter) );
158
-
159
- cudaSafeCall ( cudaMemset (counterPtr, 0 , sizeof (int )) );
// Reset the counter on `stream`, so the reset is ordered before the kernel launched below.
153
+ cudaSafeCall ( cudaMemsetAsync (counterPtr, 0 , sizeof (int ), stream) );
160
154
// 32x8-thread blocks tiling an (accum.cols - 2) x (accum.rows - 2) region.
161
155
const dim3 block (32 , 8 );
162
156
const dim3 grid (divUp (accum.cols - 2 , block.x ), divUp (accum.rows - 2 , block.y ));
163
157
164
158
cudaSafeCall ( cudaFuncSetCacheConfig (buildCentersList, cudaFuncCachePreferL1) );
165
159
166
- buildCentersList<<<grid, block>>> (accum, centers, threshold);
160
+ buildCentersList<<<grid, block, 0 , stream >>> (accum, centers, threshold, counterPtr );
// Launch errors are not returned by the <<<>>> syntax, so query them explicitly.
167
161
cudaSafeCall ( cudaGetLastError () );
168
162
169
- cudaSafeCall ( cudaDeviceSynchronize () );
170
-
171
163
int totalCount;
172
- cudaSafeCall ( cudaMemcpy (&totalCount, counterPtr, sizeof (int ), cudaMemcpyDeviceToHost) );
// '+' version: the copy into the local variable is enqueued on `stream`, and the stream
// is synchronized right after, before totalCount is read by the return statement.
164
+ cudaSafeCall ( cudaMemcpyAsync (&totalCount, counterPtr, sizeof (int ), cudaMemcpyDeviceToHost, stream) );
165
+ cudaSafeCall ( cudaStreamSynchronize (stream) );
173
166
174
167
return totalCount;
175
168
}
@@ -179,7 +172,8 @@ namespace cv { namespace cuda { namespace device
179
172
180
173
__global__ void circlesAccumRadius (const unsigned int * centers, const unsigned int * list, const int count,
181
174
float3 * circles, const int maxCircles, const float dp,
182
- const int minRadius, const int maxRadius, const int histSize, const int threshold)
175
+ const int minRadius, const int maxRadius, const int histSize, const int threshold,
176
+ int * counterPtr)
// Kernel that writes (cx, cy, radius) triples into `circles`. In the '+' version the
// output counter is passed in as `counterPtr` instead of the g_counter symbol.
// NOTE(review): most of the body is elided by the diff hunk boundary, so only the visible
// lines are described here.
183
177
{
// Scratch array of ints in dynamic shared memory; the host wrapper circlesAccumRadius_gpu
// launches this kernel with (histSize + 2) * sizeof(int) bytes.
184
178
int * smem = DynamicSharedMem<int >();
185
179
@@ -219,34 +213,30 @@ namespace cv { namespace cuda { namespace device
219
213
// Emit a circle only when curVotes reaches the threshold, is strictly greater than
// smem[i], and is greater than or equal to smem[i + 2].
220
214
if (curVotes >= threshold && curVotes > smem[i] && curVotes >= smem[i + 2 ])
221
215
{
222
- const int ind = ::atomicAdd (&g_counter , 1 );
// The counter is incremented for every accepted candidate, including those beyond
// maxCircles; only slots below maxCircles are written.
216
+ const int ind = ::atomicAdd (counterPtr , 1 );
223
217
if (ind < maxCircles)
// The third component stored is i + minRadius.
224
218
circles[ind] = make_float3 (cx, cy, i + minRadius);
225
219
}
226
220
}
227
221
}
228
222
229
223
int circlesAccumRadius_gpu (const unsigned int * centers, int centersCount, const unsigned int * list, int count,
230
- float3 * circles, int maxCircles, float dp, int minRadius, int maxRadius, int threshold, bool has20)
224
+ float3 * circles, int maxCircles, float dp, int minRadius, int maxRadius, int threshold, bool has20, int * counterPtr, cudaStream_t stream )
231
225
{
232
- void * counterPtr;
233
- cudaSafeCall ( cudaGetSymbolAddress (&counterPtr, g_counter) );
234
-
235
- cudaSafeCall ( cudaMemset (counterPtr, 0 , sizeof (int )) );
226
+ cudaSafeCall ( cudaMemsetAsync (counterPtr, 0 , sizeof (int ), stream) );
236
227
237
228
const dim3 block (has20 ? 1024 : 512 );
238
229
const dim3 grid (centersCount);
239
230
240
231
const int histSize = maxRadius - minRadius + 1 ;
241
232
size_t smemSize = (histSize + 2 ) * sizeof (int );
242
233
243
- circlesAccumRadius<<<grid, block, smemSize>>> (centers, list, count, circles, maxCircles, dp, minRadius, maxRadius, histSize, threshold);
234
+ circlesAccumRadius<<<grid, block, smemSize, stream >>> (centers, list, count, circles, maxCircles, dp, minRadius, maxRadius, histSize, threshold, counterPtr );
244
235
cudaSafeCall ( cudaGetLastError () );
245
236
246
- cudaSafeCall ( cudaDeviceSynchronize () );
247
-
248
237
int totalCount;
249
- cudaSafeCall ( cudaMemcpy (&totalCount, counterPtr, sizeof (int ), cudaMemcpyDeviceToHost) );
238
+ cudaSafeCall ( cudaMemcpyAsync (&totalCount, counterPtr, sizeof (int ), cudaMemcpyDeviceToHost, stream) );
239
+ cudaSafeCall ( cudaStreamSynchronize (stream) );
250
240
251
241
totalCount = ::min (totalCount, maxCircles);
252
242
0 commit comments