Skip to content

Commit 33ae078

Browse files
Atlas42william.fink
and
william.fink
authored
Merge pull request #2801 from Atlas42:cuda-hough-stream-fix
Added stream support on hough circles, lines and segments * Added stream support on hough circles lines and segments - Passed the stream to the different cuda, OpenCV and thurst library calls - Replace all device by cuda synchronizes - Added extra synchronize calls after device to host transfers - Replaced the cuda globals by allocated values * Fixed missing include for CUDA 8 Co-authored-by: william.fink <[email protected]>
1 parent 6d5f440 commit 33ae078

File tree

7 files changed

+127
-119
lines changed

7 files changed

+127
-119
lines changed

modules/cudaimgproc/src/cuda/build_point_list.cu

+7-13
Original file line numberDiff line numberDiff line change
@@ -49,10 +49,8 @@ namespace cv { namespace cuda { namespace device
4949
{
5050
namespace hough
5151
{
52-
__device__ int g_counter;
53-
5452
template <int PIXELS_PER_THREAD>
55-
__global__ void buildPointList(const PtrStepSzb src, unsigned int* list)
53+
__global__ void buildPointList(const PtrStepSzb src, unsigned int* list, int* counterPtr)
5654
{
5755
__shared__ unsigned int s_queues[4][32 * PIXELS_PER_THREAD];
5856
__shared__ int s_qsize[4];
@@ -94,7 +92,7 @@ namespace cv { namespace cuda { namespace device
9492
}
9593

9694
// calculate the offset in the global list
97-
const int globalOffset = atomicAdd(&g_counter, totalSize);
95+
const int globalOffset = atomicAdd(counterPtr, totalSize);
9896
for (int i = 0; i < blockDim.y; ++i)
9997
s_globStart[i] += globalOffset;
10098
}
@@ -108,27 +106,23 @@ namespace cv { namespace cuda { namespace device
108106
list[gidx] = s_queues[threadIdx.y][i];
109107
}
110108

111-
int buildPointList_gpu(PtrStepSzb src, unsigned int* list)
109+
int buildPointList_gpu(PtrStepSzb src, unsigned int* list, int* counterPtr, cudaStream_t stream)
112110
{
113111
const int PIXELS_PER_THREAD = 16;
114112

115-
void* counterPtr;
116-
cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
117-
118-
cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );
113+
cudaSafeCall( cudaMemsetAsync(counterPtr, 0, sizeof(int), stream) );
119114

120115
const dim3 block(32, 4);
121116
const dim3 grid(divUp(src.cols, block.x * PIXELS_PER_THREAD), divUp(src.rows, block.y));
122117

123118
cudaSafeCall( cudaFuncSetCacheConfig(buildPointList<PIXELS_PER_THREAD>, cudaFuncCachePreferShared) );
124119

125-
buildPointList<PIXELS_PER_THREAD><<<grid, block>>>(src, list);
120+
buildPointList<PIXELS_PER_THREAD><<<grid, block, 0, stream>>>(src, list, counterPtr);
126121
cudaSafeCall( cudaGetLastError() );
127122

128-
cudaSafeCall( cudaDeviceSynchronize() );
129-
130123
int totalCount;
131-
cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
124+
cudaSafeCall( cudaMemcpyAsync(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost, stream) );
125+
cudaSafeCall( cudaStreamSynchronize(stream) );
132126

133127
return totalCount;
134128
}

modules/cudaimgproc/src/cuda/hough_circles.cu

+18-28
Original file line numberDiff line numberDiff line change
@@ -54,8 +54,6 @@ namespace cv { namespace cuda { namespace device
5454
{
5555
namespace hough_circles
5656
{
57-
__device__ int g_counter;
58-
5957
////////////////////////////////////////////////////////////////////////
6058
// circlesAccumCenters
6159

@@ -111,23 +109,22 @@ namespace cv { namespace cuda { namespace device
111109
}
112110
}
113111

114-
void circlesAccumCenters_gpu(const unsigned int* list, int count, PtrStepi dx, PtrStepi dy, PtrStepSzi accum, int minRadius, int maxRadius, float idp)
112+
void circlesAccumCenters_gpu(const unsigned int* list, int count, PtrStepi dx, PtrStepi dy, PtrStepSzi accum, int minRadius, int maxRadius, float idp, cudaStream_t stream)
115113
{
116114
const dim3 block(256);
117115
const dim3 grid(divUp(count, block.x));
118116

119117
cudaSafeCall( cudaFuncSetCacheConfig(circlesAccumCenters, cudaFuncCachePreferL1) );
120118

121-
circlesAccumCenters<<<grid, block>>>(list, count, dx, dy, accum, accum.cols - 2, accum.rows - 2, minRadius, maxRadius, idp);
119+
circlesAccumCenters<<<grid, block, 0, stream>>>(list, count, dx, dy, accum, accum.cols - 2, accum.rows - 2, minRadius, maxRadius, idp);
122120
cudaSafeCall( cudaGetLastError() );
123121

124-
cudaSafeCall( cudaDeviceSynchronize() );
122+
cudaSafeCall( cudaStreamSynchronize(stream) );
125123
}
126124

127125
////////////////////////////////////////////////////////////////////////
128126
// buildCentersList
129-
130-
__global__ void buildCentersList(const PtrStepSzi accum, unsigned int* centers, const int threshold)
127+
__global__ void buildCentersList(const PtrStepSzi accum, unsigned int* centers, const int threshold, int* counterPtr)
131128
{
132129
const int x = blockIdx.x * blockDim.x + threadIdx.x;
133130
const int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -145,31 +142,27 @@ namespace cv { namespace cuda { namespace device
145142
if (cur > threshold && cur > top && cur >= bottom && cur > left && cur >= right)
146143
{
147144
const unsigned int val = (y << 16) | x;
148-
const int idx = ::atomicAdd(&g_counter, 1);
145+
const int idx = ::atomicAdd(counterPtr, 1);
149146
centers[idx] = val;
150147
}
151148
}
152149
}
153150

154-
int buildCentersList_gpu(PtrStepSzi accum, unsigned int* centers, int threshold)
151+
int buildCentersList_gpu(PtrStepSzi accum, unsigned int* centers, int threshold, int* counterPtr, cudaStream_t stream)
155152
{
156-
void* counterPtr;
157-
cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
158-
159-
cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );
153+
cudaSafeCall( cudaMemsetAsync(counterPtr, 0, sizeof(int), stream) );
160154

161155
const dim3 block(32, 8);
162156
const dim3 grid(divUp(accum.cols - 2, block.x), divUp(accum.rows - 2, block.y));
163157

164158
cudaSafeCall( cudaFuncSetCacheConfig(buildCentersList, cudaFuncCachePreferL1) );
165159

166-
buildCentersList<<<grid, block>>>(accum, centers, threshold);
160+
buildCentersList<<<grid, block, 0, stream>>>(accum, centers, threshold, counterPtr);
167161
cudaSafeCall( cudaGetLastError() );
168162

169-
cudaSafeCall( cudaDeviceSynchronize() );
170-
171163
int totalCount;
172-
cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
164+
cudaSafeCall( cudaMemcpyAsync(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost, stream) );
165+
cudaSafeCall( cudaStreamSynchronize(stream) );
173166

174167
return totalCount;
175168
}
@@ -179,7 +172,8 @@ namespace cv { namespace cuda { namespace device
179172

180173
__global__ void circlesAccumRadius(const unsigned int* centers, const unsigned int* list, const int count,
181174
float3* circles, const int maxCircles, const float dp,
182-
const int minRadius, const int maxRadius, const int histSize, const int threshold)
175+
const int minRadius, const int maxRadius, const int histSize, const int threshold,
176+
int* counterPtr)
183177
{
184178
int* smem = DynamicSharedMem<int>();
185179

@@ -219,34 +213,30 @@ namespace cv { namespace cuda { namespace device
219213

220214
if (curVotes >= threshold && curVotes > smem[i] && curVotes >= smem[i + 2])
221215
{
222-
const int ind = ::atomicAdd(&g_counter, 1);
216+
const int ind = ::atomicAdd(counterPtr, 1);
223217
if (ind < maxCircles)
224218
circles[ind] = make_float3(cx, cy, i + minRadius);
225219
}
226220
}
227221
}
228222

229223
int circlesAccumRadius_gpu(const unsigned int* centers, int centersCount, const unsigned int* list, int count,
230-
float3* circles, int maxCircles, float dp, int minRadius, int maxRadius, int threshold, bool has20)
224+
float3* circles, int maxCircles, float dp, int minRadius, int maxRadius, int threshold, bool has20, int* counterPtr, cudaStream_t stream)
231225
{
232-
void* counterPtr;
233-
cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
234-
235-
cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );
226+
cudaSafeCall( cudaMemsetAsync(counterPtr, 0, sizeof(int), stream) );
236227

237228
const dim3 block(has20 ? 1024 : 512);
238229
const dim3 grid(centersCount);
239230

240231
const int histSize = maxRadius - minRadius + 1;
241232
size_t smemSize = (histSize + 2) * sizeof(int);
242233

243-
circlesAccumRadius<<<grid, block, smemSize>>>(centers, list, count, circles, maxCircles, dp, minRadius, maxRadius, histSize, threshold);
234+
circlesAccumRadius<<<grid, block, smemSize, stream>>>(centers, list, count, circles, maxCircles, dp, minRadius, maxRadius, histSize, threshold, counterPtr);
244235
cudaSafeCall( cudaGetLastError() );
245236

246-
cudaSafeCall( cudaDeviceSynchronize() );
247-
248237
int totalCount;
249-
cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
238+
cudaSafeCall( cudaMemcpyAsync(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost, stream) );
239+
cudaSafeCall( cudaStreamSynchronize(stream) );
250240

251241
totalCount = ::min(totalCount, maxCircles);
252242

modules/cudaimgproc/src/cuda/hough_lines.cu

+14-19
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444

4545
#include <thrust/device_ptr.h>
4646
#include <thrust/sort.h>
47+
#include <thrust/system/cuda/execution_policy.h>
4748

4849
#include "opencv2/core/cuda/common.hpp"
4950
#include "opencv2/core/cuda/emulation.hpp"
@@ -53,8 +54,6 @@ namespace cv { namespace cuda { namespace device
5354
{
5455
namespace hough_lines
5556
{
56-
__device__ int g_counter;
57-
5857
////////////////////////////////////////////////////////////////////////
5958
// linesAccum
6059

@@ -126,27 +125,26 @@ namespace cv { namespace cuda { namespace device
126125
accumRow[i] = smem[i];
127126
}
128127

129-
void linesAccum_gpu(const unsigned int* list, int count, PtrStepSzi accum, float rho, float theta, size_t sharedMemPerBlock, bool has20)
128+
void linesAccum_gpu(const unsigned int* list, int count, PtrStepSzi accum, float rho, float theta, size_t sharedMemPerBlock, bool has20, cudaStream_t stream)
130129
{
131130
const dim3 block(has20 ? 1024 : 512);
132131
const dim3 grid(accum.rows - 2);
133132

134133
size_t smemSize = (accum.cols - 1) * sizeof(int);
135134

136135
if (smemSize < sharedMemPerBlock - 1000)
137-
linesAccumShared<<<grid, block, smemSize>>>(list, count, accum, 1.0f / rho, theta, accum.cols - 2);
136+
linesAccumShared<<<grid, block, smemSize, stream>>>(list, count, accum, 1.0f / rho, theta, accum.cols - 2);
138137
else
139-
linesAccumGlobal<<<grid, block>>>(list, count, accum, 1.0f / rho, theta, accum.cols - 2);
138+
linesAccumGlobal<<<grid, block, 0, stream>>>(list, count, accum, 1.0f / rho, theta, accum.cols - 2);
140139

141140
cudaSafeCall( cudaGetLastError() );
142-
143-
cudaSafeCall( cudaDeviceSynchronize() );
141+
cudaSafeCall( cudaStreamSynchronize(stream) );
144142
}
145143

146144
////////////////////////////////////////////////////////////////////////
147145
// linesGetResult
148146

149-
__global__ void linesGetResult(const PtrStepSzi accum, float2* out, int* votes, const int maxSize, const float rho, const float theta, const int threshold, const int numrho)
147+
__global__ void linesGetResult(const PtrStepSzi accum, float2* out, int* votes, const int maxSize, const float rho, const float theta, const int threshold, const int numrho, int* counterPtr)
150148
{
151149
const int r = blockIdx.x * blockDim.x + threadIdx.x;
152150
const int n = blockIdx.y * blockDim.y + threadIdx.y;
@@ -165,7 +163,7 @@ namespace cv { namespace cuda { namespace device
165163
const float radius = (r - (numrho - 1) * 0.5f) * rho;
166164
const float angle = n * theta;
167165

168-
const int ind = ::atomicAdd(&g_counter, 1);
166+
const int ind = ::atomicAdd(counterPtr, 1);
169167
if (ind < maxSize)
170168
{
171169
out[ind] = make_float2(radius, angle);
@@ -174,33 +172,30 @@ namespace cv { namespace cuda { namespace device
174172
}
175173
}
176174

177-
int linesGetResult_gpu(PtrStepSzi accum, float2* out, int* votes, int maxSize, float rho, float theta, int threshold, bool doSort)
175+
int linesGetResult_gpu(PtrStepSzi accum, float2* out, int* votes, int maxSize, float rho, float theta, int threshold, bool doSort, int* counterPtr, cudaStream_t stream)
178176
{
179-
void* counterPtr;
180-
cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
181-
182-
cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );
177+
cudaSafeCall( cudaMemsetAsync(counterPtr, 0, sizeof(int), stream) );
183178

184179
const dim3 block(32, 8);
185180
const dim3 grid(divUp(accum.cols - 2, block.x), divUp(accum.rows - 2, block.y));
186181

187182
cudaSafeCall( cudaFuncSetCacheConfig(linesGetResult, cudaFuncCachePreferL1) );
188183

189-
linesGetResult<<<grid, block>>>(accum, out, votes, maxSize, rho, theta, threshold, accum.cols - 2);
184+
linesGetResult<<<grid, block, 0, stream>>>(accum, out, votes, maxSize, rho, theta, threshold, accum.cols - 2, counterPtr);
190185
cudaSafeCall( cudaGetLastError() );
191186

192-
cudaSafeCall( cudaDeviceSynchronize() );
193-
194187
int totalCount;
195-
cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
188+
cudaSafeCall( cudaMemcpyAsync(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost, stream) );
189+
190+
cudaSafeCall( cudaStreamSynchronize(stream) );
196191

197192
totalCount = ::min(totalCount, maxSize);
198193

199194
if (doSort && totalCount > 0)
200195
{
201196
thrust::device_ptr<float2> outPtr(out);
202197
thrust::device_ptr<int> votesPtr(votes);
203-
thrust::sort_by_key(votesPtr, votesPtr + totalCount, outPtr, thrust::greater<int>());
198+
thrust::sort_by_key(thrust::cuda::par.on(stream), votesPtr, votesPtr + totalCount, outPtr, thrust::greater<int>());
204199
}
205200

206201
return totalCount;

modules/cudaimgproc/src/cuda/hough_segments.cu

+12-15
Original file line numberDiff line numberDiff line change
@@ -49,15 +49,14 @@ namespace cv { namespace cuda { namespace device
4949
{
5050
namespace hough_segments
5151
{
52-
__device__ int g_counter;
53-
5452
texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_mask(false, cudaFilterModePoint, cudaAddressModeClamp);
5553

5654
__global__ void houghLinesProbabilistic(const PtrStepSzi accum,
5755
int4* out, const int maxSize,
5856
const float rho, const float theta,
5957
const int lineGap, const int lineLength,
60-
const int rows, const int cols)
58+
const int rows, const int cols,
59+
int* counterPtr)
6160
{
6261
const int r = blockIdx.x * blockDim.x + threadIdx.x;
6362
const int n = blockIdx.y * blockDim.y + threadIdx.y;
@@ -182,7 +181,7 @@ namespace cv { namespace cuda { namespace device
182181

183182
if (good_line)
184183
{
185-
const int ind = ::atomicAdd(&g_counter, 1);
184+
const int ind = ::atomicAdd(counterPtr, 1);
186185
if (ind < maxSize)
187186
out[ind] = make_int4(line_end[0].x, line_end[0].y, line_end[1].x, line_end[1].y);
188187
}
@@ -202,7 +201,7 @@ namespace cv { namespace cuda { namespace device
202201

203202
if (good_line)
204203
{
205-
const int ind = ::atomicAdd(&g_counter, 1);
204+
const int ind = ::atomicAdd(counterPtr, 1);
206205
if (ind < maxSize)
207206
out[ind] = make_int4(line_end[0].x, line_end[0].y, line_end[1].x, line_end[1].y);
208207
}
@@ -214,29 +213,27 @@ namespace cv { namespace cuda { namespace device
214213
}
215214
}
216215

217-
int houghLinesProbabilistic_gpu(PtrStepSzb mask, PtrStepSzi accum, int4* out, int maxSize, float rho, float theta, int lineGap, int lineLength)
216+
int houghLinesProbabilistic_gpu(PtrStepSzb mask, PtrStepSzi accum, int4* out, int maxSize, float rho, float theta, int lineGap, int lineLength, int* counterPtr, cudaStream_t stream)
218217
{
219-
void* counterPtr;
220-
cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
221-
222-
cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );
218+
cudaSafeCall( cudaMemsetAsync(counterPtr, 0, sizeof(int), stream) );
223219

224220
const dim3 block(32, 8);
225221
const dim3 grid(divUp(accum.cols - 2, block.x), divUp(accum.rows - 2, block.y));
226222

227223
bindTexture(&tex_mask, mask);
228224

229-
houghLinesProbabilistic<<<grid, block>>>(accum,
225+
houghLinesProbabilistic<<<grid, block, 0, stream>>>(accum,
230226
out, maxSize,
231227
rho, theta,
232228
lineGap, lineLength,
233-
mask.rows, mask.cols);
229+
mask.rows, mask.cols,
230+
counterPtr);
234231
cudaSafeCall( cudaGetLastError() );
235232

236-
cudaSafeCall( cudaDeviceSynchronize() );
237-
238233
int totalCount;
239-
cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
234+
cudaSafeCall( cudaMemcpyAsync(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost, stream) );
235+
236+
cudaSafeCall( cudaStreamSynchronize(stream) );
240237

241238
totalCount = ::min(totalCount, maxSize);
242239

0 commit comments

Comments
 (0)