
Commit c3d636d

nikhilaravi authored and facebook-github-bot committed
Cuda updates
Summary: Updates to:
- enable cuda kernel launches on any GPU (not just the default)
- cuda and contiguous checks for all kernels
- checks to ensure all tensors are on the same device
- error reporting in the cuda kernels
- cuda tests now run on a random device not just the default

Reviewed By: jcjohnson, gkioxari

Differential Revision: D21215280

fbshipit-source-id: 1bedc9fe6c35e9e920bdc4d78ed12865b1005519
1 parent c9267ab commit c3d636d

33 files changed: +979 −240 lines
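All of the CUDA wrappers touched below follow one pattern: check that the input tensors live on the same GPU, guard that device, fetch the current stream, return early when the output is empty, launch the kernel on that stream, and surface launch errors with AT_CUDA_CHECK. The following is a minimal sketch of that pattern only; the op, kernel, and tensor names (MyScaleCudaForward, MyScaleKernel, input, scale) are hypothetical and not part of this commit, and it assumes float32 inputs like the compositing kernels.

// Sketch only: illustrates the same-device check / device guard / stream /
// error-check pattern applied throughout this commit.
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>

__global__ void MyScaleKernel(const float* in, const float* scale, float* out, int64_t n) {
  // Grid-stride loop so any launch configuration covers all n elements.
  for (int64_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += gridDim.x * blockDim.x) {
    out[i] = in[i] * scale[0];
  }
}

at::Tensor MyScaleCudaForward(const at::Tensor& input, const at::Tensor& scale) {
  // Check inputs are on the same device and have the same dtype.
  at::TensorArg input_t{input, "input", 1}, scale_t{scale, "scale", 2};
  at::CheckedFrom c = "MyScaleCudaForward";
  at::checkAllSameGPU(c, {input_t, scale_t});
  at::checkAllSameType(c, {input_t, scale_t});

  // Launch on the device (and stream) of the input, not on the default GPU.
  at::cuda::CUDAGuard device_guard(input.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  at::Tensor out = at::zeros_like(input);
  if (out.numel() == 0) {
    AT_CUDA_CHECK(cudaGetLastError());
    return out;  // nothing to launch; still surface any pending CUDA error
  }

  const int threads = 256;
  const int blocks = 64;
  MyScaleKernel<<<blocks, threads, 0, stream>>>(
      input.data_ptr<float>(), scale.data_ptr<float>(), out.data_ptr<float>(),
      input.numel());

  // Report kernel launch errors instead of failing silently.
  AT_CUDA_CHECK(cudaGetLastError());
  return out;
}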

pytorch3d/csrc/compositing/alpha_composite.cu

+39 −4
@@ -2,6 +2,8 @@
 
 #include <ATen/ATen.h>
 #include <ATen/core/TensorAccessor.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
 
 #include <cuda.h>
 #include <cuda_runtime.h>
@@ -136,26 +138,42 @@ at::Tensor alphaCompositeCudaForward(
     const at::Tensor& features,
     const at::Tensor& alphas,
     const at::Tensor& points_idx) {
+  // Check inputs are on the same device
+  at::TensorArg features_t{features, "features", 1},
+      alphas_t{alphas, "alphas", 2}, points_idx_t{points_idx, "points_idx", 3};
+  at::CheckedFrom c = "alphaCompositeCudaForward";
+  at::checkAllSameGPU(c, {features_t, alphas_t, points_idx_t});
+  at::checkAllSameType(c, {features_t, alphas_t});
+
+  // Set the device for the kernel launch based on the device of the input
+  at::cuda::CUDAGuard device_guard(features.device());
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
   const int64_t batch_size = points_idx.size(0);
   const int64_t C = features.size(0);
   const int64_t H = points_idx.size(2);
   const int64_t W = points_idx.size(3);
 
   auto result = at::zeros({batch_size, C, H, W}, features.options());
 
+  if (result.numel() == 0) {
+    AT_CUDA_CHECK(cudaGetLastError());
+    return result;
+  }
+
   const dim3 threadsPerBlock(64);
   const dim3 numBlocks(batch_size, 1024 / batch_size + 1);
 
   // TODO(gkioxari) add AT_DISPATCH_FLOATING_TYPES once atomicAdd supports
   // doubles. Currently, support is for floats only.
-  alphaCompositeCudaForwardKernel<<<numBlocks, threadsPerBlock>>>(
+  alphaCompositeCudaForwardKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(
      // clang-format off
      result.packed_accessor64<float, 4, at::RestrictPtrTraits>(),
      features.packed_accessor64<float, 2, at::RestrictPtrTraits>(),
      alphas.packed_accessor64<float, 4, at::RestrictPtrTraits>(),
      points_idx.packed_accessor64<int64_t, 4, at::RestrictPtrTraits>());
   // clang-format on
-
+  AT_CUDA_CHECK(cudaGetLastError());
   return result;
 }
 
@@ -164,17 +182,34 @@ std::tuple<at::Tensor, at::Tensor> alphaCompositeCudaBackward(
     const at::Tensor& features,
     const at::Tensor& alphas,
     const at::Tensor& points_idx) {
+  // Check inputs are on the same device
+  at::TensorArg grad_outputs_t{grad_outputs, "grad_outputs", 1},
+      features_t{features, "features", 2}, alphas_t{alphas, "alphas", 3},
+      points_idx_t{points_idx, "points_idx", 4};
+  at::CheckedFrom c = "alphaCompositeCudaBackward";
+  at::checkAllSameGPU(c, {grad_outputs_t, features_t, alphas_t, points_idx_t});
+  at::checkAllSameType(c, {grad_outputs_t, features_t, alphas_t});
+
+  // Set the device for the kernel launch based on the device of the input
+  at::cuda::CUDAGuard device_guard(features.device());
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
   auto grad_features = at::zeros_like(features);
   auto grad_alphas = at::zeros_like(alphas);
 
+  if (grad_features.numel() == 0 || grad_alphas.numel() == 0) {
+    AT_CUDA_CHECK(cudaGetLastError());
+    return std::make_tuple(grad_features, grad_alphas);
+  }
+
   const int64_t bs = alphas.size(0);
 
   const dim3 threadsPerBlock(64);
   const dim3 numBlocks(bs, 1024 / bs + 1);
 
   // TODO(gkioxari) add AT_DISPATCH_FLOATING_TYPES once atomicAdd supports
   // doubles. Currently, support is for floats only.
-  alphaCompositeCudaBackwardKernel<<<numBlocks, threadsPerBlock>>>(
+  alphaCompositeCudaBackwardKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(
      // clang-format off
      grad_features.packed_accessor64<float, 2, at::RestrictPtrTraits>(),
      grad_alphas.packed_accessor64<float, 4, at::RestrictPtrTraits>(),
@@ -183,6 +218,6 @@ std::tuple<at::Tensor, at::Tensor> alphaCompositeCudaBackward(
      alphas.packed_accessor64<float, 4, at::RestrictPtrTraits>(),
      points_idx.packed_accessor64<int64_t, 4, at::RestrictPtrTraits>());
   // clang-format on
-
+  AT_CUDA_CHECK(cudaGetLastError());
   return std::make_tuple(grad_features, grad_alphas);
 }

pytorch3d/csrc/compositing/norm_weighted_sum.cu

+39 −3
@@ -2,6 +2,8 @@
 
 #include <ATen/ATen.h>
 #include <ATen/core/TensorAccessor.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
 
 #include <cuda.h>
 #include <cuda_runtime.h>
@@ -151,26 +153,43 @@ at::Tensor weightedSumNormCudaForward(
     const at::Tensor& features,
     const at::Tensor& alphas,
     const at::Tensor& points_idx) {
+  // Check inputs are on the same device
+  at::TensorArg features_t{features, "features", 1},
+      alphas_t{alphas, "alphas", 2}, points_idx_t{points_idx, "points_idx", 3};
+  at::CheckedFrom c = "weightedSumNormCudaForward";
+  at::checkAllSameGPU(c, {features_t, alphas_t, points_idx_t});
+  at::checkAllSameType(c, {features_t, alphas_t});
+
+  // Set the device for the kernel launch based on the device of the input
+  at::cuda::CUDAGuard device_guard(features.device());
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
   const int64_t batch_size = points_idx.size(0);
   const int64_t C = features.size(0);
   const int64_t H = points_idx.size(2);
   const int64_t W = points_idx.size(3);
 
   auto result = at::zeros({batch_size, C, H, W}, features.options());
 
+  if (result.numel() == 0) {
+    AT_CUDA_CHECK(cudaGetLastError());
+    return result;
+  }
+
   const dim3 threadsPerBlock(64);
   const dim3 numBlocks(batch_size, 1024 / batch_size + 1);
 
   // TODO(gkioxari) add AT_DISPATCH_FLOATING_TYPES once atomicAdd supports
   // doubles. Currently, support is for floats only.
   // clang-format off
-  weightedSumNormCudaForwardKernel<<<numBlocks, threadsPerBlock>>>(
+  weightedSumNormCudaForwardKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(
      result.packed_accessor64<float, 4, at::RestrictPtrTraits>(),
      features.packed_accessor64<float, 2, at::RestrictPtrTraits>(),
      alphas.packed_accessor64<float, 4, at::RestrictPtrTraits>(),
      points_idx.packed_accessor64<int64_t, 4, at::RestrictPtrTraits>());
   // clang-format on
 
+  AT_CUDA_CHECK(cudaGetLastError());
   return result;
 }
 
@@ -179,17 +198,34 @@ std::tuple<at::Tensor, at::Tensor> weightedSumNormCudaBackward(
     const at::Tensor& features,
     const at::Tensor& alphas,
     const at::Tensor& points_idx) {
+  // Check inputs are on the same device
+  at::TensorArg grad_outputs_t{grad_outputs, "grad_outputs", 1},
+      features_t{features, "features", 2}, alphas_t{alphas, "alphas", 3},
+      points_idx_t{points_idx, "points_idx", 4};
+  at::CheckedFrom c = "weightedSumNormCudaBackward";
+  at::checkAllSameGPU(c, {grad_outputs_t, features_t, alphas_t, points_idx_t});
+  at::checkAllSameType(c, {grad_outputs_t, features_t, alphas_t});
+
+  // Set the device for the kernel launch based on the device of the input
+  at::cuda::CUDAGuard device_guard(features.device());
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
   auto grad_features = at::zeros_like(features);
   auto grad_alphas = at::zeros_like(alphas);
 
+  if (grad_features.numel() == 0 || grad_alphas.numel() == 0) {
+    AT_CUDA_CHECK(cudaGetLastError());
+    return std::make_tuple(grad_features, grad_alphas);
+  }
+
   const int64_t bs = points_idx.size(0);
 
   const dim3 threadsPerBlock(64);
   const dim3 numBlocks(bs, 1024 / bs + 1);
 
   // TODO(gkioxari) add AT_DISPATCH_FLOATING_TYPES once atomicAdd supports
   // doubles. Currently, support is for floats only.
-  weightedSumNormCudaBackwardKernel<<<numBlocks, threadsPerBlock>>>(
+  weightedSumNormCudaBackwardKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(
      // clang-format off
      grad_features.packed_accessor64<float, 2, at::RestrictPtrTraits>(),
      grad_alphas.packed_accessor64<float, 4, at::RestrictPtrTraits>(),
@@ -198,6 +234,6 @@ std::tuple<at::Tensor, at::Tensor> weightedSumNormCudaBackward(
      alphas.packed_accessor64<float, 4, at::RestrictPtrTraits>(),
      points_idx.packed_accessor64<int64_t, 4, at::RestrictPtrTraits>());
   // clang-format on
-
+  AT_CUDA_CHECK(cudaGetLastError());
   return std::make_tuple(grad_features, grad_alphas);
 }

pytorch3d/csrc/compositing/weighted_sum.cu

+39 −4
@@ -2,6 +2,8 @@
 
 #include <ATen/ATen.h>
 #include <ATen/core/TensorAccessor.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
 
 #include <cuda.h>
 #include <cuda_runtime.h>
@@ -110,26 +112,42 @@ at::Tensor weightedSumCudaForward(
     const at::Tensor& features,
     const at::Tensor& alphas,
     const at::Tensor& points_idx) {
+  // Check inputs are on the same device
+  at::TensorArg features_t{features, "features", 1},
+      alphas_t{alphas, "alphas", 2}, points_idx_t{points_idx, "points_idx", 3};
+  at::CheckedFrom c = "weightedSumCudaForward";
+  at::checkAllSameGPU(c, {features_t, alphas_t, points_idx_t});
+  at::checkAllSameType(c, {features_t, alphas_t});
+
+  // Set the device for the kernel launch based on the device of the input
+  at::cuda::CUDAGuard device_guard(features.device());
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
   const int64_t batch_size = points_idx.size(0);
   const int64_t C = features.size(0);
   const int64_t H = points_idx.size(2);
   const int64_t W = points_idx.size(3);
 
   auto result = at::zeros({batch_size, C, H, W}, features.options());
 
+  if (result.numel() == 0) {
+    AT_CUDA_CHECK(cudaGetLastError());
+    return result;
+  }
+
   const dim3 threadsPerBlock(64);
   const dim3 numBlocks(batch_size, 1024 / batch_size + 1);
 
   // TODO(gkioxari) add AT_DISPATCH_FLOATING_TYPES once atomicAdd supports
   // doubles. Currently, support is for floats only.
-  weightedSumCudaForwardKernel<<<numBlocks, threadsPerBlock>>>(
+  weightedSumCudaForwardKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(
      // clang-format off
      result.packed_accessor64<float, 4, at::RestrictPtrTraits>(),
      features.packed_accessor64<float, 2, at::RestrictPtrTraits>(),
      alphas.packed_accessor64<float, 4, at::RestrictPtrTraits>(),
      points_idx.packed_accessor64<int64_t, 4, at::RestrictPtrTraits>());
   // clang-format on
-
+  AT_CUDA_CHECK(cudaGetLastError());
   return result;
 }
 
@@ -138,17 +156,34 @@ std::tuple<at::Tensor, at::Tensor> weightedSumCudaBackward(
     const at::Tensor& features,
     const at::Tensor& alphas,
     const at::Tensor& points_idx) {
+  // Check inputs are on the same device
+  at::TensorArg grad_outputs_t{grad_outputs, "grad_outputs", 1},
+      features_t{features, "features", 2}, alphas_t{alphas, "alphas", 3},
+      points_idx_t{points_idx, "points_idx", 4};
+  at::CheckedFrom c = "weightedSumCudaBackward";
+  at::checkAllSameGPU(c, {grad_outputs_t, features_t, alphas_t, points_idx_t});
+  at::checkAllSameType(c, {grad_outputs_t, features_t, alphas_t});
+
+  // Set the device for the kernel launch based on the device of the input
+  at::cuda::CUDAGuard device_guard(features.device());
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
   auto grad_features = at::zeros_like(features);
   auto grad_alphas = at::zeros_like(alphas);
 
+  if (grad_features.numel() == 0 || grad_alphas.numel() == 0) {
+    AT_CUDA_CHECK(cudaGetLastError());
+    return std::make_tuple(grad_features, grad_alphas);
+  }
+
   const int64_t bs = points_idx.size(0);
 
   const dim3 threadsPerBlock(64);
   const dim3 numBlocks(bs, 1024 / bs + 1);
 
   // TODO(gkioxari) add AT_DISPATCH_FLOATING_TYPES once atomicAdd supports
   // doubles. Currently, support is for floats only.
-  weightedSumCudaBackwardKernel<<<numBlocks, threadsPerBlock>>>(
+  weightedSumCudaBackwardKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(
      // clang-format off
      grad_features.packed_accessor64<float, 2, at::RestrictPtrTraits>(),
      grad_alphas.packed_accessor64<float, 4, at::RestrictPtrTraits>(),
@@ -157,6 +192,6 @@ std::tuple<at::Tensor, at::Tensor> weightedSumCudaBackward(
      alphas.packed_accessor64<float, 4, at::RestrictPtrTraits>(),
      points_idx.packed_accessor64<int64_t, 4, at::RestrictPtrTraits>());
   // clang-format on
-
+  AT_CUDA_CHECK(cudaGetLastError());
   return std::make_tuple(grad_features, grad_alphas);
 }

pytorch3d/csrc/ext.cpp

+1 −1
@@ -23,7 +23,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
 #endif
   m.def("knn_points_idx", &KNearestNeighborIdx);
   m.def("knn_points_backward", &KNearestNeighborBackward);
-  m.def("gather_scatter", &gather_scatter);
+  m.def("gather_scatter", &GatherScatter);
   m.def("rasterize_points", &RasterizePoints);
   m.def("rasterize_points_backward", &RasterizePointsBackward);
   m.def("rasterize_meshes_backward", &RasterizeMeshesBackward);

pytorch3d/csrc/face_areas_normals/face_areas_normals.cu

+39 −3
@@ -1,6 +1,8 @@
 // Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
 
 #include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
 #include <tuple>
 
 template <typename scalar_t>
@@ -213,22 +215,38 @@ std::tuple<at::Tensor, at::Tensor> FaceAreasNormalsForwardCuda(
   const auto V = verts.size(0);
   const auto F = faces.size(0);
 
+  // Check inputs are on the same device
+  at::TensorArg verts_t{verts, "verts", 1}, faces_t{verts, "faces", 2};
+  at::CheckedFrom c = "FaceAreasNormalsForwardCuda";
+  at::checkAllSameGPU(c, {verts_t, faces_t});
+  at::checkAllSameType(c, {verts_t, faces_t});
+
+  // Set the device for the kernel launch based on the device of verts
+  at::cuda::CUDAGuard device_guard(verts.device());
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
   at::Tensor areas = at::empty({F}, verts.options());
   at::Tensor normals = at::empty({F, 3}, verts.options());
 
+  if (areas.numel() == 0) {
+    AT_CUDA_CHECK(cudaGetLastError());
+    return std::make_tuple(areas, normals);
+  }
+
   const int blocks = 64;
   const int threads = 512;
+
   AT_DISPATCH_FLOATING_TYPES(
       verts.scalar_type(), "face_areas_normals_forward_cuda", ([&] {
-        FaceAreasNormalsForwardKernel<scalar_t><<<blocks, threads>>>(
+        FaceAreasNormalsForwardKernel<scalar_t><<<blocks, threads, 0, stream>>>(
            verts.data_ptr<scalar_t>(),
            faces.data_ptr<int64_t>(),
            areas.data_ptr<scalar_t>(),
            normals.data_ptr<scalar_t>(),
            V,
            F);
      }));
-
+  AT_CUDA_CHECK(cudaGetLastError());
   return std::make_tuple(areas, normals);
 }
 
@@ -237,16 +255,33 @@ at::Tensor FaceAreasNormalsBackwardCuda(
     const at::Tensor grad_normals,
     const at::Tensor verts,
     const at::Tensor faces) {
+  // Check inputs are on the same device
+  at::TensorArg verts_t{verts, "verts", 1}, faces_t{verts, "faces", 2},
+      grad_areas_t{verts, "grad_areas", 3},
+      grad_normals_t{verts, "grad_normals", 4};
+  at::CheckedFrom c = "FaceAreasNormalsBackwardCuda";
+  at::checkAllSameGPU(c, {verts_t, faces_t, grad_areas_t, grad_normals_t});
+  at::checkAllSameType(c, {verts_t, faces_t, grad_areas_t, grad_normals_t});
+
+  // Set the device for the kernel launch based on the device of verts
+  at::cuda::CUDAGuard device_guard(verts.device());
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
   const auto V = verts.size(0);
   const auto F = faces.size(0);
 
   at::Tensor grad_verts = at::zeros({V, 3}, grad_areas.options());
 
+  if (grad_verts.numel() == 0) {
+    AT_CUDA_CHECK(cudaGetLastError());
+    return grad_verts;
+  }
+
   const int blocks = 64;
   const int threads = 512;
   // TODO(gkioxari) add AT_DISPATCH_FLOATING_TYPES once atomicAdd supports
   // doubles. Currently, support is for floats only.
-  FaceAreasNormalsBackwardKernel<<<blocks, threads>>>(
+  FaceAreasNormalsBackwardKernel<<<blocks, threads, 0, stream>>>(
      grad_areas.data_ptr<float>(),
      grad_normals.data_ptr<float>(),
      verts.data_ptr<float>(),
@@ -255,5 +290,6 @@ at::Tensor FaceAreasNormalsBackwardCuda(
      V,
      F);
 
+  AT_CUDA_CHECK(cudaGetLastError());
   return grad_verts;
 }
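FaceAreasNormalsForwardCuda above combines the same device guard and stream with AT_DISPATCH_FLOATING_TYPES around a templated kernel. Below is a minimal hedged sketch of that combination with a hypothetical elementwise op; ScaleKernel and ScaleCuda are illustrative names, not part of this commit.

// Sketch only: AT_DISPATCH_FLOATING_TYPES with a templated kernel launched on
// the current stream under a device guard, mirroring the forward pass above.
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>

template <typename scalar_t>
__global__ void ScaleKernel(const scalar_t* in, scalar_t* out, scalar_t s, int64_t n) {
  // Grid-stride loop over all n elements.
  for (int64_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += gridDim.x * blockDim.x) {
    out[i] = in[i] * s;
  }
}

at::Tensor ScaleCuda(const at::Tensor& input, double s) {
  // Guard the input's device and take its current stream.
  at::cuda::CUDAGuard device_guard(input.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  at::Tensor out = at::empty_like(input);
  if (out.numel() == 0) {
    AT_CUDA_CHECK(cudaGetLastError());
    return out;
  }

  const int blocks = 64;
  const int threads = 512;
  // The dispatch macro instantiates the kernel for float and double;
  // scalar_t is defined inside the lambda by the macro.
  AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "scale_cuda", ([&] {
    ScaleKernel<scalar_t><<<blocks, threads, 0, stream>>>(
        input.data_ptr<scalar_t>(),
        out.data_ptr<scalar_t>(),
        static_cast<scalar_t>(s),
        input.numel());
  }));
  AT_CUDA_CHECK(cudaGetLastError());
  return out;
}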
