Skip to content

Commit bdc0101

Browse files
committed
Merge pull request #2612 from nglee:dev_cudaFlipInplace
2 parents 5fae408 + 3ba40a5 commit bdc0101

File tree

2 files changed

+56
-1
lines changed

2 files changed

+56
-1
lines changed

modules/cudaarithm/src/core.cpp

+43-1
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,34 @@ namespace
102102
cudaSafeCall( cudaDeviceSynchronize() );
103103
}
104104
};
105+
106+
template <int DEPTH> struct NppMirrorIFunc
107+
{
108+
typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
109+
110+
typedef NppStatus (*func_t)(npp_t* pSrcDst, int nSrcDstStep, NppiSize oROI, NppiAxis flip);
111+
};
112+
113+
template <int DEPTH, typename NppMirrorIFunc<DEPTH>::func_t func> struct NppMirrorI
114+
{
115+
typedef typename NppMirrorIFunc<DEPTH>::npp_t npp_t;
116+
117+
static void call(GpuMat& srcDst, int flipCode, cudaStream_t stream)
118+
{
119+
NppStreamHandler h(stream);
120+
121+
NppiSize sz;
122+
sz.width = srcDst.cols;
123+
sz.height = srcDst.rows;
124+
125+
nppSafeCall( func(srcDst.ptr<npp_t>(), static_cast<int>(srcDst.step),
126+
sz,
127+
(flipCode == 0 ? NPP_HORIZONTAL_AXIS : (flipCode > 0 ? NPP_VERTICAL_AXIS : NPP_BOTH_AXIS))) );
128+
129+
if (stream == 0)
130+
cudaSafeCall( cudaDeviceSynchronize() );
131+
}
132+
};
105133
}
106134

107135
void cv::cuda::flip(InputArray _src, OutputArray _dst, int flipCode, Stream& stream)
@@ -117,6 +145,17 @@ void cv::cuda::flip(InputArray _src, OutputArray _dst, int flipCode, Stream& str
117145
{NppMirror<CV_32F, nppiMirror_32f_C1R>::call, 0, NppMirror<CV_32F, nppiMirror_32f_C3R>::call, NppMirror<CV_32F, nppiMirror_32f_C4R>::call}
118146
};
119147

148+
typedef void (*ifunc_t)(GpuMat& srcDst, int flipCode, cudaStream_t stream);
149+
static const ifunc_t ifuncs[6][4] =
150+
{
151+
{NppMirrorI<CV_8U, nppiMirror_8u_C1IR>::call, 0, NppMirrorI<CV_8U, nppiMirror_8u_C3IR>::call, NppMirrorI<CV_8U, nppiMirror_8u_C4IR>::call},
152+
{0,0,0,0},
153+
{NppMirrorI<CV_16U, nppiMirror_16u_C1IR>::call, 0, NppMirrorI<CV_16U, nppiMirror_16u_C3IR>::call, NppMirrorI<CV_16U, nppiMirror_16u_C4IR>::call},
154+
{0,0,0,0},
155+
{NppMirrorI<CV_32S, nppiMirror_32s_C1IR>::call, 0, NppMirrorI<CV_32S, nppiMirror_32s_C3IR>::call, NppMirrorI<CV_32S, nppiMirror_32s_C4IR>::call},
156+
{NppMirrorI<CV_32F, nppiMirror_32f_C1IR>::call, 0, NppMirrorI<CV_32F, nppiMirror_32f_C3IR>::call, NppMirrorI<CV_32F, nppiMirror_32f_C4IR>::call}
157+
};
158+
120159
GpuMat src = getInputMat(_src, stream);
121160

122161
CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32S || src.depth() == CV_32F);
@@ -125,7 +164,10 @@ void cv::cuda::flip(InputArray _src, OutputArray _dst, int flipCode, Stream& str
125164
_dst.create(src.size(), src.type());
126165
GpuMat dst = getOutputMat(_dst, src.size(), src.type(), stream);
127166

128-
funcs[src.depth()][src.channels() - 1](src, dst, flipCode, StreamAccessor::getStream(stream));
167+
if (src.refcount != dst.refcount)
168+
funcs[src.depth()][src.channels() - 1](src, dst, flipCode, StreamAccessor::getStream(stream));
169+
else // in-place
170+
ifuncs[src.depth()][src.channels() - 1](src, flipCode, StreamAccessor::getStream(stream));
129171

130172
syncOutput(dst, _dst, stream);
131173
}

modules/cudaarithm/test/test_core.cpp

+13
Original file line numberDiff line numberDiff line change
@@ -279,6 +279,19 @@ CUDA_TEST_P(Flip, Accuracy)
279279
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
280280
}
281281

282+
CUDA_TEST_P(Flip, AccuracyInplace)
283+
{
284+
cv::Mat src = randomMat(size, type);
285+
286+
cv::cuda::GpuMat srcDst = loadMat(src, useRoi);
287+
cv::cuda::flip(srcDst, srcDst, flip_code);
288+
289+
cv::Mat dst_gold;
290+
cv::flip(src, dst_gold, flip_code);
291+
292+
EXPECT_MAT_NEAR(dst_gold, srcDst, 0.0);
293+
}
294+
282295
INSTANTIATE_TEST_CASE_P(CUDA_Arithm, Flip, testing::Combine(
283296
ALL_DEVICES,
284297
DIFFERENT_SIZES,

0 commit comments

Comments
 (0)