
Commit c3d636d

nikhilaravi authored and facebook-github-bot committed
Cuda updates
Summary: Updates to:
- enable cuda kernel launches on any GPU (not just the default)
- cuda and contiguous checks for all kernels
- checks to ensure all tensors are on the same device
- error reporting in the cuda kernels
- cuda tests now run on a random device not just the default

Reviewed By: jcjohnson, gkioxari

Differential Revision: D21215280

fbshipit-source-id: 1bedc9fe6c35e9e920bdc4d78ed12865b1005519
1 parent c9267ab commit c3d636d

33 files changed: +979 −240 lines
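All of the CUDA wrappers touched below follow one pattern: check that the input tensors live on the same GPU, guard that device, fetch the current stream, return early when the output is empty, launch the kernel on that stream, and surface launch errors with AT_CUDA_CHECK. The following is a minimal sketch of that pattern only; the op, kernel, and tensor names (MyScaleCudaForward, MyScaleKernel, input, scale) are hypothetical and not part of this commit, and it assumes float32 inputs like the compositing kernels.

// Sketch only: illustrates the same-device check / device guard / stream /
// error-check pattern applied throughout this commit.
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>

__global__ void MyScaleKernel(const float* in, const float* scale, float* out, int64_t n) {
  // Grid-stride loop so any launch configuration covers all n elements.
  for (int64_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += gridDim.x * blockDim.x) {
    out[i] = in[i] * scale[0];
  }
}

at::Tensor MyScaleCudaForward(const at::Tensor& input, const at::Tensor& scale) {
  // Check inputs are on the same device and have the same dtype.
  at::TensorArg input_t{input, "input", 1}, scale_t{scale, "scale", 2};
  at::CheckedFrom c = "MyScaleCudaForward";
  at::checkAllSameGPU(c, {input_t, scale_t});
  at::checkAllSameType(c, {input_t, scale_t});

  // Launch on the device (and stream) of the input, not on the default GPU.
  at::cuda::CUDAGuard device_guard(input.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  at::Tensor out = at::zeros_like(input);
  if (out.numel() == 0) {
    AT_CUDA_CHECK(cudaGetLastError());
    return out;  // nothing to launch; still surface any pending CUDA error
  }

  const int threads = 256;
  const int blocks = 64;
  MyScaleKernel<<<blocks, threads, 0, stream>>>(
      input.data_ptr<float>(), scale.data_ptr<float>(), out.data_ptr<float>(),
      input.numel());

  // Report kernel launch errors instead of failing silently.
  AT_CUDA_CHECK(cudaGetLastError());
  return out;
}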

pytorch3d/csrc/compositing/alpha_composite.cu

+39 −4
@@ -2,6 +2,8 @@
 
 #include <ATen/ATen.h>
 #include <ATen/core/TensorAccessor.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
 
 #include <cuda.h>
 #include <cuda_runtime.h>
@@ -136,26 +138,42 @@ at::Tensor alphaCompositeCudaForward(
     const at::Tensor& features,
     const at::Tensor& alphas,
     const at::Tensor& points_idx) {
+  // Check inputs are on the same device
+  at::TensorArg features_t{features, "features", 1},
+      alphas_t{alphas, "alphas", 2}, points_idx_t{points_idx, "points_idx", 3};
+  at::CheckedFrom c = "alphaCompositeCudaForward";
+  at::checkAllSameGPU(c, {features_t, alphas_t, points_idx_t});
+  at::checkAllSameType(c, {features_t, alphas_t});
+
+  // Set the device for the kernel launch based on the device of the input
+  at::cuda::CUDAGuard device_guard(features.device());
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
   const int64_t batch_size = points_idx.size(0);
   const int64_t C = features.size(0);
   const int64_t H = points_idx.size(2);
   const int64_t W = points_idx.size(3);
 
   auto result = at::zeros({batch_size, C, H, W}, features.options());
 
+  if (result.numel() == 0) {
+    AT_CUDA_CHECK(cudaGetLastError());
+    return result;
+  }
+
   const dim3 threadsPerBlock(64);
   const dim3 numBlocks(batch_size, 1024 / batch_size + 1);
 
   // TODO(gkioxari) add AT_DISPATCH_FLOATING_TYPES once atomicAdd supports
   // doubles. Currently, support is for floats only.
-  alphaCompositeCudaForwardKernel<<<numBlocks, threadsPerBlock>>>(
+  alphaCompositeCudaForwardKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(
      // clang-format off
      result.packed_accessor64<float, 4, at::RestrictPtrTraits>(),
      features.packed_accessor64<float, 2, at::RestrictPtrTraits>(),
      alphas.packed_accessor64<float, 4, at::RestrictPtrTraits>(),
      points_idx.packed_accessor64<int64_t, 4, at::RestrictPtrTraits>());
   // clang-format on
-
+  AT_CUDA_CHECK(cudaGetLastError());
   return result;
 }
 
@@ -164,17 +182,34 @@ std::tuple<at::Tensor, at::Tensor> alphaCompositeCudaBackward(
     const at::Tensor& features,
     const at::Tensor& alphas,
     const at::Tensor& points_idx) {
+  // Check inputs are on the same device
+  at::TensorArg grad_outputs_t{grad_outputs, "grad_outputs", 1},
+      features_t{features, "features", 2}, alphas_t{alphas, "alphas", 3},
+      points_idx_t{points_idx, "points_idx", 4};
+  at::CheckedFrom c = "alphaCompositeCudaBackward";
+  at::checkAllSameGPU(c, {grad_outputs_t, features_t, alphas_t, points_idx_t});
+  at::checkAllSameType(c, {grad_outputs_t, features_t, alphas_t});
+
+  // Set the device for the kernel launch based on the device of the input
+  at::cuda::CUDAGuard device_guard(features.device());
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
   auto grad_features = at::zeros_like(features);
   auto grad_alphas = at::zeros_like(alphas);
 
+  if (grad_features.numel() == 0 || grad_alphas.numel() == 0) {
+    AT_CUDA_CHECK(cudaGetLastError());
+    return std::make_tuple(grad_features, grad_alphas);
+  }
+
   const int64_t bs = alphas.size(0);
 
   const dim3 threadsPerBlock(64);
   const dim3 numBlocks(bs, 1024 / bs + 1);
 
   // TODO(gkioxari) add AT_DISPATCH_FLOATING_TYPES once atomicAdd supports
   // doubles. Currently, support is for floats only.
-  alphaCompositeCudaBackwardKernel<<<numBlocks, threadsPerBlock>>>(
+  alphaCompositeCudaBackwardKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(
      // clang-format off
      grad_features.packed_accessor64<float, 2, at::RestrictPtrTraits>(),
      grad_alphas.packed_accessor64<float, 4, at::RestrictPtrTraits>(),
@@ -183,6 +218,6 @@ std::tuple<at::Tensor, at::Tensor> alphaCompositeCudaBackward(
      alphas.packed_accessor64<float, 4, at::RestrictPtrTraits>(),
      points_idx.packed_accessor64<int64_t, 4, at::RestrictPtrTraits>());
   // clang-format on
-
+  AT_CUDA_CHECK(cudaGetLastError());
   return std::make_tuple(grad_features, grad_alphas);
 }

pytorch3d/csrc/compositing/norm_weighted_sum.cu

+39 −3
@@ -2,6 +2,8 @@
 
 #include <ATen/ATen.h>
 #include <ATen/core/TensorAccessor.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
 
 #include <cuda.h>
 #include <cuda_runtime.h>
@@ -151,26 +153,43 @@ at::Tensor weightedSumNormCudaForward(
     const at::Tensor& features,
     const at::Tensor& alphas,
     const at::Tensor& points_idx) {
+  // Check inputs are on the same device
+  at::TensorArg features_t{features, "features", 1},
+      alphas_t{alphas, "alphas", 2}, points_idx_t{points_idx, "points_idx", 3};
+  at::CheckedFrom c = "weightedSumNormCudaForward";
+  at::checkAllSameGPU(c, {features_t, alphas_t, points_idx_t});
+  at::checkAllSameType(c, {features_t, alphas_t});
+
+  // Set the device for the kernel launch based on the device of the input
+  at::cuda::CUDAGuard device_guard(features.device());
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
   const int64_t batch_size = points_idx.size(0);
   const int64_t C = features.size(0);
   const int64_t H = points_idx.size(2);
   const int64_t W = points_idx.size(3);
 
   auto result = at::zeros({batch_size, C, H, W}, features.options());
 
+  if (result.numel() == 0) {
+    AT_CUDA_CHECK(cudaGetLastError());
+    return result;
+  }
+
   const dim3 threadsPerBlock(64);
   const dim3 numBlocks(batch_size, 1024 / batch_size + 1);
 
   // TODO(gkioxari) add AT_DISPATCH_FLOATING_TYPES once atomicAdd supports
   // doubles. Currently, support is for floats only.
   // clang-format off
-  weightedSumNormCudaForwardKernel<<<numBlocks, threadsPerBlock>>>(
+  weightedSumNormCudaForwardKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(
      result.packed_accessor64<float, 4, at::RestrictPtrTraits>(),
      features.packed_accessor64<float, 2, at::RestrictPtrTraits>(),
      alphas.packed_accessor64<float, 4, at::RestrictPtrTraits>(),
      points_idx.packed_accessor64<int64_t, 4, at::RestrictPtrTraits>());
   // clang-format on
 
+  AT_CUDA_CHECK(cudaGetLastError());
   return result;
 }
 
@@ -179,17 +198,34 @@ std::tuple<at::Tensor, at::Tensor> weightedSumNormCudaBackward(
     const at::Tensor& features,
     const at::Tensor& alphas,
     const at::Tensor& points_idx) {
+  // Check inputs are on the same device
+  at::TensorArg grad_outputs_t{grad_outputs, "grad_outputs", 1},
+      features_t{features, "features", 2}, alphas_t{alphas, "alphas", 3},
+      points_idx_t{points_idx, "points_idx", 4};
+  at::CheckedFrom c = "weightedSumNormCudaBackward";
+  at::checkAllSameGPU(c, {grad_outputs_t, features_t, alphas_t, points_idx_t});
+  at::checkAllSameType(c, {grad_outputs_t, features_t, alphas_t});
+
+  // Set the device for the kernel launch based on the device of the input
+  at::cuda::CUDAGuard device_guard(features.device());
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
   auto grad_features = at::zeros_like(features);
   auto grad_alphas = at::zeros_like(alphas);
 
+  if (grad_features.numel() == 0 || grad_alphas.numel() == 0) {
+    AT_CUDA_CHECK(cudaGetLastError());
+    return std::make_tuple(grad_features, grad_alphas);
+  }
+
   const int64_t bs = points_idx.size(0);
 
   const dim3 threadsPerBlock(64);
   const dim3 numBlocks(bs, 1024 / bs + 1);
 
   // TODO(gkioxari) add AT_DISPATCH_FLOATING_TYPES once atomicAdd supports
   // doubles. Currently, support is for floats only.
-  weightedSumNormCudaBackwardKernel<<<numBlocks, threadsPerBlock>>>(
+  weightedSumNormCudaBackwardKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(
      // clang-format off
      grad_features.packed_accessor64<float, 2, at::RestrictPtrTraits>(),
      grad_alphas.packed_accessor64<float, 4, at::RestrictPtrTraits>(),
@@ -198,6 +234,6 @@ std::tuple<at::Tensor, at::Tensor> weightedSumNormCudaBackward(
      alphas.packed_accessor64<float, 4, at::RestrictPtrTraits>(),
      points_idx.packed_accessor64<int64_t, 4, at::RestrictPtrTraits>());
   // clang-format on
-
+  AT_CUDA_CHECK(cudaGetLastError());
   return std::make_tuple(grad_features, grad_alphas);
 }

pytorch3d/csrc/compositing/weighted_sum.cu

+39 −4
@@ -2,6 +2,8 @@
 
 #include <ATen/ATen.h>
 #include <ATen/core/TensorAccessor.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
 
 #include <cuda.h>
 #include <cuda_runtime.h>
@@ -110,26 +112,42 @@ at::Tensor weightedSumCudaForward(
     const at::Tensor& features,
     const at::Tensor& alphas,
     const at::Tensor& points_idx) {
+  // Check inputs are on the same device
+  at::TensorArg features_t{features, "features", 1},
+      alphas_t{alphas, "alphas", 2}, points_idx_t{points_idx, "points_idx", 3};
+  at::CheckedFrom c = "weightedSumCudaForward";
+  at::checkAllSameGPU(c, {features_t, alphas_t, points_idx_t});
+  at::checkAllSameType(c, {features_t, alphas_t});
+
+  // Set the device for the kernel launch based on the device of the input
+  at::cuda::CUDAGuard device_guard(features.device());
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
   const int64_t batch_size = points_idx.size(0);
   const int64_t C = features.size(0);
   const int64_t H = points_idx.size(2);
   const int64_t W = points_idx.size(3);
 
   auto result = at::zeros({batch_size, C, H, W}, features.options());
 
+  if (result.numel() == 0) {
+    AT_CUDA_CHECK(cudaGetLastError());
+    return result;
+  }
+
   const dim3 threadsPerBlock(64);
   const dim3 numBlocks(batch_size, 1024 / batch_size + 1);
 
   // TODO(gkioxari) add AT_DISPATCH_FLOATING_TYPES once atomicAdd supports
   // doubles. Currently, support is for floats only.
-  weightedSumCudaForwardKernel<<<numBlocks, threadsPerBlock>>>(
+  weightedSumCudaForwardKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(
      // clang-format off
      result.packed_accessor64<float, 4, at::RestrictPtrTraits>(),
      features.packed_accessor64<float, 2, at::RestrictPtrTraits>(),
      alphas.packed_accessor64<float, 4, at::RestrictPtrTraits>(),
      points_idx.packed_accessor64<int64_t, 4, at::RestrictPtrTraits>());
   // clang-format on
-
+  AT_CUDA_CHECK(cudaGetLastError());
   return result;
 }
 
@@ -138,17 +156,34 @@ std::tuple<at::Tensor, at::Tensor> weightedSumCudaBackward(
     const at::Tensor& features,
     const at::Tensor& alphas,
     const at::Tensor& points_idx) {
+  // Check inputs are on the same device
+  at::TensorArg grad_outputs_t{grad_outputs, "grad_outputs", 1},
+      features_t{features, "features", 2}, alphas_t{alphas, "alphas", 3},
+      points_idx_t{points_idx, "points_idx", 4};
+  at::CheckedFrom c = "weightedSumCudaBackward";
+  at::checkAllSameGPU(c, {grad_outputs_t, features_t, alphas_t, points_idx_t});
+  at::checkAllSameType(c, {grad_outputs_t, features_t, alphas_t});
+
+  // Set the device for the kernel launch based on the device of the input
+  at::cuda::CUDAGuard device_guard(features.device());
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
   auto grad_features = at::zeros_like(features);
   auto grad_alphas = at::zeros_like(alphas);
 
+  if (grad_features.numel() == 0 || grad_alphas.numel() == 0) {
+    AT_CUDA_CHECK(cudaGetLastError());
+    return std::make_tuple(grad_features, grad_alphas);
+  }
+
   const int64_t bs = points_idx.size(0);
 
   const dim3 threadsPerBlock(64);
   const dim3 numBlocks(bs, 1024 / bs + 1);
 
   // TODO(gkioxari) add AT_DISPATCH_FLOATING_TYPES once atomicAdd supports
   // doubles. Currently, support is for floats only.
-  weightedSumCudaBackwardKernel<<<numBlocks, threadsPerBlock>>>(
+  weightedSumCudaBackwardKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(
      // clang-format off
      grad_features.packed_accessor64<float, 2, at::RestrictPtrTraits>(),
      grad_alphas.packed_accessor64<float, 4, at::RestrictPtrTraits>(),
@@ -157,6 +192,6 @@ std::tuple<at::Tensor, at::Tensor> weightedSumCudaBackward(
      alphas.packed_accessor64<float, 4, at::RestrictPtrTraits>(),
      points_idx.packed_accessor64<int64_t, 4, at::RestrictPtrTraits>());
   // clang-format on
-
+  AT_CUDA_CHECK(cudaGetLastError());
   return std::make_tuple(grad_features, grad_alphas);
 }

pytorch3d/csrc/ext.cpp

+1 −1
@@ -23,7 +23,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
 #endif
   m.def("knn_points_idx", &KNearestNeighborIdx);
   m.def("knn_points_backward", &KNearestNeighborBackward);
-  m.def("gather_scatter", &gather_scatter);
+  m.def("gather_scatter", &GatherScatter);
   m.def("rasterize_points", &RasterizePoints);
   m.def("rasterize_points_backward", &RasterizePointsBackward);
   m.def("rasterize_meshes_backward", &RasterizeMeshesBackward);

pytorch3d/csrc/face_areas_normals/face_areas_normals.cu

+39 −3
@@ -1,6 +1,8 @@
 // Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
 
 #include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
 #include <tuple>
 
 template <typename scalar_t>
@@ -213,22 +215,38 @@ std::tuple<at::Tensor, at::Tensor> FaceAreasNormalsForwardCuda(
   const auto V = verts.size(0);
   const auto F = faces.size(0);
 
+  // Check inputs are on the same device
+  at::TensorArg verts_t{verts, "verts", 1}, faces_t{verts, "faces", 2};
+  at::CheckedFrom c = "FaceAreasNormalsForwardCuda";
+  at::checkAllSameGPU(c, {verts_t, faces_t});
+  at::checkAllSameType(c, {verts_t, faces_t});
+
+  // Set the device for the kernel launch based on the device of verts
+  at::cuda::CUDAGuard device_guard(verts.device());
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
   at::Tensor areas = at::empty({F}, verts.options());
   at::Tensor normals = at::empty({F, 3}, verts.options());
 
+  if (areas.numel() == 0) {
+    AT_CUDA_CHECK(cudaGetLastError());
+    return std::make_tuple(areas, normals);
+  }
+
   const int blocks = 64;
   const int threads = 512;
+
   AT_DISPATCH_FLOATING_TYPES(
       verts.scalar_type(), "face_areas_normals_forward_cuda", ([&] {
-        FaceAreasNormalsForwardKernel<scalar_t><<<blocks, threads>>>(
+        FaceAreasNormalsForwardKernel<scalar_t><<<blocks, threads, 0, stream>>>(
            verts.data_ptr<scalar_t>(),
            faces.data_ptr<int64_t>(),
            areas.data_ptr<scalar_t>(),
            normals.data_ptr<scalar_t>(),
            V,
            F);
      }));
-
+  AT_CUDA_CHECK(cudaGetLastError());
   return std::make_tuple(areas, normals);
 }
 
@@ -237,16 +255,33 @@ at::Tensor FaceAreasNormalsBackwardCuda(
     const at::Tensor grad_normals,
     const at::Tensor verts,
     const at::Tensor faces) {
+  // Check inputs are on the same device
+  at::TensorArg verts_t{verts, "verts", 1}, faces_t{verts, "faces", 2},
+      grad_areas_t{verts, "grad_areas", 3},
+      grad_normals_t{verts, "grad_normals", 4};
+  at::CheckedFrom c = "FaceAreasNormalsBackwardCuda";
+  at::checkAllSameGPU(c, {verts_t, faces_t, grad_areas_t, grad_normals_t});
+  at::checkAllSameType(c, {verts_t, faces_t, grad_areas_t, grad_normals_t});
+
+  // Set the device for the kernel launch based on the device of verts
+  at::cuda::CUDAGuard device_guard(verts.device());
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
   const auto V = verts.size(0);
   const auto F = faces.size(0);
 
   at::Tensor grad_verts = at::zeros({V, 3}, grad_areas.options());
 
+  if (grad_verts.numel() == 0) {
+    AT_CUDA_CHECK(cudaGetLastError());
+    return grad_verts;
+  }
+
   const int blocks = 64;
   const int threads = 512;
   // TODO(gkioxari) add AT_DISPATCH_FLOATING_TYPES once atomicAdd supports
   // doubles. Currently, support is for floats only.
-  FaceAreasNormalsBackwardKernel<<<blocks, threads>>>(
+  FaceAreasNormalsBackwardKernel<<<blocks, threads, 0, stream>>>(
      grad_areas.data_ptr<float>(),
      grad_normals.data_ptr<float>(),
      verts.data_ptr<float>(),
@@ -255,5 +290,6 @@ at::Tensor FaceAreasNormalsBackwardCuda(
      V,
      F);
 
+  AT_CUDA_CHECK(cudaGetLastError());
   return grad_verts;
 }
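FaceAreasNormalsForwardCuda above combines the same device guard and stream with AT_DISPATCH_FLOATING_TYPES around a templated kernel. Below is a minimal hedged sketch of that combination with a hypothetical elementwise op; ScaleKernel and ScaleCuda are illustrative names, not part of this commit.

// Sketch only: AT_DISPATCH_FLOATING_TYPES with a templated kernel launched on
// the current stream under a device guard, mirroring the forward pass above.
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>

template <typename scalar_t>
__global__ void ScaleKernel(const scalar_t* in, scalar_t* out, scalar_t s, int64_t n) {
  // Grid-stride loop over all n elements.
  for (int64_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += gridDim.x * blockDim.x) {
    out[i] = in[i] * s;
  }
}

at::Tensor ScaleCuda(const at::Tensor& input, double s) {
  // Guard the input's device and take its current stream.
  at::cuda::CUDAGuard device_guard(input.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  at::Tensor out = at::empty_like(input);
  if (out.numel() == 0) {
    AT_CUDA_CHECK(cudaGetLastError());
    return out;
  }

  const int blocks = 64;
  const int threads = 512;
  // The dispatch macro instantiates the kernel for float and double;
  // scalar_t is defined inside the lambda by the macro.
  AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "scale_cuda", ([&] {
    ScaleKernel<scalar_t><<<blocks, threads, 0, stream>>>(
        input.data_ptr<scalar_t>(),
        out.data_ptr<scalar_t>(),
        static_cast<scalar_t>(s),
        input.numel());
  }));
  AT_CUDA_CHECK(cudaGetLastError());
  return out;
}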
