[CIR][CUDA] Fix destructor behaviour (#1422)

AdUhTkJm · web-flow · commit 994dee0d22cf · 2025-03-03T19:16:05.000-08:00
CIR didn't work on structs that have a destructor but no constructor.
This is now fixed.

Moreover, a CUDA kernel must be emitted if it is referenced in the
destructor of a non-device variable. This already seems to work, so I
just unblocked the code path.
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
@@ -373,9 +373,9 @@ bool CIRGenModule::MayBeEmittedEagerly(const ValueDecl *global) {
   if (fd) {
     // Implicit template instantiations may change linkage if they are later
     // explicitly instantiated, so they should not be emitted eagerly.
-    // TODO(cir): do we care?
-    assert(fd->getTemplateSpecializationKind() != TSK_ImplicitInstantiation &&
-           "not implemented");
+    if (fd->getTemplateSpecializationKind() == TSK_ImplicitInstantiation)
+      return false;
+
     assert(!fd->isTemplated() && "Templates NYI");
   }
   const auto *vd = dyn_cast<VarDecl>(global);
diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
@@ -227,10 +227,12 @@ FuncOp LoweringPreparePass::buildCXXGlobalVarDeclInitFunc(GlobalOp op) {
                                   cir::GlobalLinkageKind::InternalLinkage);
 
   // Move over the initialzation code of the ctor region.
-  auto &block = op.getCtorRegion().front();
   mlir::Block *entryBB = f.addEntryBlock();
-  entryBB->getOperations().splice(entryBB->begin(), block.getOperations(),
-                                  block.begin(), std::prev(block.end()));
+  if (!op.getCtorRegion().empty()) {
+    auto &block = op.getCtorRegion().front();
+    entryBB->getOperations().splice(entryBB->begin(), block.getOperations(),
+                                    block.begin(), std::prev(block.end()));
+  }
 
   // Register the destructor call with __cxa_atexit
   auto &dtorRegion = op.getDtorRegion();
@@ -294,9 +296,18 @@ FuncOp LoweringPreparePass::buildCXXGlobalVarDeclInitFunc(GlobalOp op) {
 
   // Replace cir.yield with cir.return
   builder.setInsertionPointToEnd(entryBB);
-  auto &yieldOp = block.getOperations().back();
-  assert(isa<YieldOp>(yieldOp));
-  builder.create<ReturnOp>(yieldOp.getLoc());
+  mlir::Operation *yieldOp = nullptr;
+  if (!op.getCtorRegion().empty()) {
+    auto &block = op.getCtorRegion().front();
+    yieldOp = &block.getOperations().back();
+  } else {
+    assert(!dtorRegion.empty());
+    auto &block = dtorRegion.front();
+    yieldOp = &block.getOperations().back();
+  }
+
+  assert(isa<YieldOp>(*yieldOp));
+  builder.create<ReturnOp>(yieldOp->getLoc());
   return f;
 }
 
diff --git a/clang/test/CIR/CodeGen/CUDA/destructor.cu b/clang/test/CIR/CodeGen/CUDA/destructor.cu
@@ -0,0 +1,27 @@
+#include "../Inputs/cuda.h"
+
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir \
+// RUN:            -x cuda -emit-cir -target-sdk-version=12.3 \
+// RUN:            %s -o %t.cir
+// RUN: FileCheck --check-prefix=CIR-HOST --input-file=%t.cir %s
+
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fclangir \
+// RUN:            -fcuda-is-device -emit-cir -target-sdk-version=12.3 \
+// RUN:            %s -o %t.cir
+// RUN: FileCheck --check-prefix=CIR-DEVICE --input-file=%t.cir %s
+
+// Make sure we emit the device-side kernel even if it is only referenced
+// by the destructor of a variable that is not present on the device.
+template<typename T> __global__ void f(T) {}
+template<typename T> struct A {
+  ~A() { f<<<1, 1>>>(T()); }
+};
+
+// CIR-DEVICE: cir.func @_Z1fIiEvT_
+
+// CIR-HOST: cir.func {{.*}} @_ZN1AIiED2Ev{{.*}} {
+// CIR-HOST:   cir.call @__cudaPushCallConfiguration
+// CIR-HOST:   cir.call @_Z16__device_stub__fIiEvT_
+// CIR-HOST: }
+
+A<int> a;