diff --git a/offload/CMakeLists.txt b/offload/CMakeLists.txt
index 25c879710645c..954d561eb7cd8 100644
--- a/offload/CMakeLists.txt
+++ b/offload/CMakeLists.txt
@@ -113,6 +113,15 @@ else()
   set(CMAKE_CXX_EXTENSIONS NO)
 endif()
 
+# Emit a warning for people who haven't updated their build: neither GPU
+# runtimes target lists "openmp", so no device library is being built.
+if(NOT "openmp" IN_LIST RUNTIMES_amdgcn-amd-amdhsa_LLVM_ENABLE_RUNTIMES AND
+   NOT "openmp" IN_LIST RUNTIMES_nvptx64-nvidia-cuda_LLVM_ENABLE_RUNTIMES)
+  message(WARNING "Building the offloading runtime with no device library. See "
+                  "https://openmp.llvm.org/SupportAndFAQ.html#q-how-to-build-an-openmp-gpu-offload-capable-compiler "
+                  "for more information.")
+endif()
+
 # Set the path of all resulting libraries to a unified location so that it can
 # be used for testing.
 set(LIBOMPTARGET_LIBRARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
@@ -373,7 +382,6 @@ set(LIBOMPTARGET_LLVM_LIBRARY_INTDIR "${LIBOMPTARGET_INTDIR}" CACHE STRING
 
 # Build offloading plugins and device RTLs if they are available.
 add_subdirectory(plugins-nextgen)
-add_subdirectory(DeviceRTL)
 add_subdirectory(tools)
 
 # Build target agnostic offloading library.
diff --git a/offload/cmake/caches/AMDGPUBot.cmake b/offload/cmake/caches/AMDGPUBot.cmake
index 0236f5f0b6987..ce7c28c4a1144 100644
--- a/offload/cmake/caches/AMDGPUBot.cmake
+++ b/offload/cmake/caches/AMDGPUBot.cmake
@@ -19,3 +19,6 @@ set(LLVM_LIT_ARGS "-v --show-unsupported --timeout 100 --show-xfail -j 32" CACHE
 
 set(CLANG_DEFAULT_LINKER "lld" CACHE STRING "")
 set(CLANG_DEFAULT_RTLIB "compiler-rt" STRING "")
+
+set(LLVM_RUNTIME_TARGETS default;amdgcn-amd-amdhsa CACHE STRING "")
+set(RUNTIMES_amdgcn-amd-amdhsa_LLVM_ENABLE_RUNTIMES "openmp" CACHE STRING "")
diff --git a/offload/cmake/caches/Offload.cmake b/offload/cmake/caches/Offload.cmake
index 5533a6508f5d5..3747a1d3eb299 100644
--- a/offload/cmake/caches/Offload.cmake
+++ b/offload/cmake/caches/Offload.cmake
@@ -5,5 +5,5 @@ set(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR ON CACHE BOOL "")
 set(LLVM_RUNTIME_TARGETS default;amdgcn-amd-amdhsa;nvptx64-nvidia-cuda CACHE STRING "")
 set(RUNTIMES_nvptx64-nvidia-cuda_CACHE_FILES "${CMAKE_SOURCE_DIR}/../libcxx/cmake/caches/NVPTX.cmake" CACHE STRING "")
 set(RUNTIMES_amdgcn-amd-amdhsa_CACHE_FILES "${CMAKE_SOURCE_DIR}/../libcxx/cmake/caches/AMDGPU.cmake" CACHE STRING "")
-set(RUNTIMES_nvptx64-nvidia-cuda_LLVM_ENABLE_RUNTIMES "compiler-rt;libc;libcxx;libcxxabi" CACHE STRING "")
-set(RUNTIMES_amdgcn-amd-amdhsa_LLVM_ENABLE_RUNTIMES "compiler-rt;libc;libcxx;libcxxabi" CACHE STRING "")
+set(RUNTIMES_nvptx64-nvidia-cuda_LLVM_ENABLE_RUNTIMES "compiler-rt;libc;openmp;libcxx;libcxxabi" CACHE STRING "")
+set(RUNTIMES_amdgcn-amd-amdhsa_LLVM_ENABLE_RUNTIMES "compiler-rt;libc;openmp;libcxx;libcxxabi" CACHE STRING "")
diff --git a/openmp/CMakeLists.txt b/openmp/CMakeLists.txt
index c206386fa6b61..829a44913c124 100644
--- a/openmp/CMakeLists.txt
+++ b/openmp/CMakeLists.txt
@@ -88,6 +88,14 @@ else()
   set(CMAKE_CXX_EXTENSIONS NO)
 endif()
 
+# Targeting the GPU directly requires a few flags to make CMake happy.
+if("${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^amdgcn")
+  set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -nogpulib")
+elseif("${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^nvptx")
+  set(CMAKE_REQUIRED_FLAGS
+      "${CMAKE_REQUIRED_FLAGS} -flto -c -Wno-unused-command-line-argument")
+endif()
+
 # Check and set up common compiler flags.
 include(config-ix)
 include(HandleOpenMPOptions)
@@ -122,35 +130,41 @@ else()
   get_clang_resource_dir(LIBOMP_HEADERS_INSTALL_PATH SUBDIR include)
 endif()
 
-# Build host runtime library, after LIBOMPTARGET variables are set since they are needed
-# to enable time profiling support in the OpenMP runtime.
-add_subdirectory(runtime)
-
-set(ENABLE_OMPT_TOOLS ON)
-# Currently tools are not tested well on Windows or MacOS X.
-if (APPLE OR WIN32)
-  set(ENABLE_OMPT_TOOLS OFF)
-endif()
+# Use the current compiler target to determine the appropriate runtime to build.
+if("${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^amdgcn|^nvptx" OR
+   "${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^amdgcn|^nvptx")
+  add_subdirectory(device)
+else()
+  # Build host runtime library, after LIBOMPTARGET variables are set since they
+  # are needed to enable time profiling support in the OpenMP runtime.
+  add_subdirectory(runtime)
+
+  set(ENABLE_OMPT_TOOLS ON)
+  # Currently tools are not tested well on Windows or MacOS X.
+  if (APPLE OR WIN32)
+    set(ENABLE_OMPT_TOOLS OFF)
+  endif()
 
-option(OPENMP_ENABLE_OMPT_TOOLS "Enable building ompt based tools for OpenMP."
-       ${ENABLE_OMPT_TOOLS})
-if (OPENMP_ENABLE_OMPT_TOOLS)
-  add_subdirectory(tools)
-endif()
+  option(OPENMP_ENABLE_OMPT_TOOLS "Enable building ompt based tools for OpenMP."
+         ${ENABLE_OMPT_TOOLS})
+  if (OPENMP_ENABLE_OMPT_TOOLS)
+    add_subdirectory(tools)
+  endif()
 
-# Propagate OMPT support to offload
-if(NOT ${OPENMP_STANDALONE_BUILD})
-  set(LIBOMP_HAVE_OMPT_SUPPORT ${LIBOMP_HAVE_OMPT_SUPPORT} PARENT_SCOPE)
-  set(LIBOMP_OMP_TOOLS_INCLUDE_DIR ${LIBOMP_OMP_TOOLS_INCLUDE_DIR} PARENT_SCOPE)
-endif()
+  # Propagate OMPT support to offload
+  if(NOT ${OPENMP_STANDALONE_BUILD})
+    set(LIBOMP_HAVE_OMPT_SUPPORT ${LIBOMP_HAVE_OMPT_SUPPORT} PARENT_SCOPE)
+    set(LIBOMP_OMP_TOOLS_INCLUDE_DIR ${LIBOMP_OMP_TOOLS_INCLUDE_DIR} PARENT_SCOPE)
+  endif()
 
-option(OPENMP_MSVC_NAME_SCHEME "Build dll with MSVC naming scheme." OFF)
+  option(OPENMP_MSVC_NAME_SCHEME "Build dll with MSVC naming scheme." OFF)
 
-# Build libompd.so
-add_subdirectory(libompd)
+  # Build libompd.so
+  add_subdirectory(libompd)
 
-# Build documentation
-add_subdirectory(docs)
+  # Build documentation
+  add_subdirectory(docs)
 
-# Now that we have seen all testsuites, create the check-openmp target.
-construct_check_openmp_target()
+  # Now that we have seen all testsuites, create the check-openmp target.
+  construct_check_openmp_target()
+endif()
diff --git a/openmp/device/CMakeLists.txt b/openmp/device/CMakeLists.txt
new file mode 100644
index 0000000000000..619890863ca0c
--- /dev/null
+++ b/openmp/device/CMakeLists.txt
@@ -0,0 +1,106 @@
+# Builds the OpenMP GPU device runtime as a single LTO bitcode file and wraps
+# it in a static archive (ompdevice) for installation.
+
+# Ensure the compiler is a valid clang when building the GPU target. The check
+# is skipped when LLVM_VERSION_MAJOR is not set.
+set(req_ver "${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH}")
+if(LLVM_VERSION_MAJOR AND NOT (CMAKE_CXX_COMPILER_ID MATCHES "[Cc]lang" AND
+   "${CMAKE_CXX_COMPILER_VERSION}" VERSION_EQUAL "${req_ver}"))
+  message(FATAL_ERROR "Cannot build GPU device runtime. CMake compiler "
+                      "'${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}' "
+                      "is not 'Clang ${req_ver}'.")
+endif()
+
+set(src_files
+  ${CMAKE_CURRENT_SOURCE_DIR}/src/Allocator.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/src/Configuration.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/src/Debug.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/src/Kernel.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/src/LibC.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/src/Mapping.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/src/Misc.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/src/Parallelism.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/src/Profiling.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/src/Reduction.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/src/State.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/src/Synchronization.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/src/Tasking.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/src/DeviceUtils.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/src/Workshare.cpp
+)
+
+# Flags used to compile every device runtime source.
+list(APPEND compile_options -flto)
+list(APPEND compile_options -fvisibility=hidden)
+list(APPEND compile_options -nogpulib)
+list(APPEND compile_options -nostdlibinc)
+list(APPEND compile_options -fno-rtti)
+list(APPEND compile_options -fno-exceptions)
+list(APPEND compile_options -fconvergent-functions)
+list(APPEND compile_options -Wno-unknown-cuda-version)
+if(LLVM_DEFAULT_TARGET_TRIPLE)
+  list(APPEND compile_options --target=${LLVM_DEFAULT_TARGET_TRIPLE})
+endif()
+
+# We disable the slp vectorizer during the runtime optimization to avoid
+# vectorized accesses to the shared state. Generally, those are "good" but
+# the optimizer pipeline (esp. Attributor) does not fully support vectorized
+# instructions yet and we end up missing out on way more important constant
+# propagation. That said, we will run the vectorizer again after the runtime
+# has been linked into the user program.
+list(APPEND compile_options "SHELL:-mllvm -vectorize-slp=false")
+# Select the output name and target-specific flags from the target triple.
+if("${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^amdgcn" OR
+   "${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^amdgcn")
+  set(target_name "amdgpu")
+  list(APPEND compile_options "SHELL:-Xclang -mcode-object-version=none")
+elseif("${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^nvptx" OR
+       "${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^nvptx")
+  set(target_name "nvptx")
+  list(APPEND compile_options --cuda-feature=+ptx63)
+endif()
+
+# Trick to combine these into a bitcode file via the linker's LTO pass.
+add_executable(libompdevice ${src_files})
+set_target_properties(libompdevice PROPERTIES
+  RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+  LINKER_LANGUAGE CXX
+  BUILD_RPATH ""
+  INSTALL_RPATH ""
+  RUNTIME_OUTPUT_NAME libomptarget-${target_name}.bc)
+
+# If the user built with the GPU C library enabled we will use that instead.
+if(LIBOMPTARGET_GPU_LIBC_SUPPORT)
+  target_compile_definitions(libompdevice PRIVATE OMPTARGET_HAS_LIBC)
+endif()
+target_compile_definitions(libompdevice PRIVATE SHARED_SCRATCHPAD_SIZE=512)
+
+target_include_directories(libompdevice PRIVATE
+  ${CMAKE_CURRENT_SOURCE_DIR}/include
+  ${CMAKE_CURRENT_SOURCE_DIR}/../../libc
+  ${CMAKE_CURRENT_SOURCE_DIR}/../../offload/include)
+target_compile_options(libompdevice PRIVATE ${compile_options})
+target_link_options(libompdevice PRIVATE
+  "-flto" "-r" "-nostdlib" "-Wl,--lto-emit-llvm")
+if(LLVM_DEFAULT_TARGET_TRIPLE)
+  target_link_options(libompdevice PRIVATE "--target=${LLVM_DEFAULT_TARGET_TRIPLE}")
+endif()
+install(TARGETS libompdevice
+  PERMISSIONS OWNER_WRITE OWNER_READ GROUP_READ WORLD_READ
+  DESTINATION ${OPENMP_INSTALL_LIBDIR})
+
+# Expose the emitted bitcode file as an imported object so it can be archived.
+add_library(ompdevice.all_objs OBJECT IMPORTED)
+set_property(TARGET ompdevice.all_objs APPEND PROPERTY IMPORTED_OBJECTS
+  ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-${target_name}.bc)
+
+# Archive all the object files generated above into a static library
+add_library(ompdevice STATIC)
+add_dependencies(ompdevice libompdevice)
+set_target_properties(ompdevice PROPERTIES
+  ARCHIVE_OUTPUT_DIRECTORY "${OPENMP_INSTALL_LIBDIR}"
+  ARCHIVE_OUTPUT_NAME ompdevice
+  LINKER_LANGUAGE CXX
+)
+target_link_libraries(ompdevice PRIVATE ompdevice.all_objs)
+install(TARGETS ompdevice ARCHIVE DESTINATION "${OPENMP_INSTALL_LIBDIR}")
diff --git a/offload/DeviceRTL/include/Allocator.h b/openmp/device/include/Allocator.h
similarity index 100%
rename from offload/DeviceRTL/include/Allocator.h
rename to openmp/device/include/Allocator.h
diff --git a/offload/DeviceRTL/include/Configuration.h b/openmp/device/include/Configuration.h
similarity index 100%
rename from offload/DeviceRTL/include/Configuration.h
rename to openmp/device/include/Configuration.h
diff --git a/offload/DeviceRTL/include/Debug.h b/openmp/device/include/Debug.h
similarity index 100%
rename from offload/DeviceRTL/include/Debug.h
rename to openmp/device/include/Debug.h
diff --git a/offload/DeviceRTL/include/DeviceTypes.h b/openmp/device/include/DeviceTypes.h
similarity index 100%
rename from offload/DeviceRTL/include/DeviceTypes.h
rename to openmp/device/include/DeviceTypes.h
diff --git a/offload/DeviceRTL/include/DeviceUtils.h b/openmp/device/include/DeviceUtils.h
similarity index 100%
rename from offload/DeviceRTL/include/DeviceUtils.h
rename to openmp/device/include/DeviceUtils.h
diff --git a/offload/DeviceRTL/include/Interface.h b/openmp/device/include/Interface.h
similarity index 100%
rename from offload/DeviceRTL/include/Interface.h
rename to openmp/device/include/Interface.h
diff --git a/offload/DeviceRTL/include/LibC.h b/openmp/device/include/LibC.h
similarity index 100%
rename from offload/DeviceRTL/include/LibC.h
rename to openmp/device/include/LibC.h
diff --git a/offload/DeviceRTL/include/Mapping.h b/openmp/device/include/Mapping.h
similarity index 100%
rename from offload/DeviceRTL/include/Mapping.h
rename to openmp/device/include/Mapping.h
diff --git a/offload/DeviceRTL/include/Profiling.h
b/openmp/device/include/Profiling.h similarity index 100% rename from offload/DeviceRTL/include/Profiling.h rename to openmp/device/include/Profiling.h diff --git a/offload/DeviceRTL/include/State.h b/openmp/device/include/State.h similarity index 100% rename from offload/DeviceRTL/include/State.h rename to openmp/device/include/State.h diff --git a/offload/DeviceRTL/include/Synchronization.h b/openmp/device/include/Synchronization.h similarity index 100% rename from offload/DeviceRTL/include/Synchronization.h rename to openmp/device/include/Synchronization.h diff --git a/offload/DeviceRTL/include/Workshare.h b/openmp/device/include/Workshare.h similarity index 100% rename from offload/DeviceRTL/include/Workshare.h rename to openmp/device/include/Workshare.h diff --git a/offload/DeviceRTL/include/generated_microtask_cases.gen b/openmp/device/include/generated_microtask_cases.gen similarity index 100% rename from offload/DeviceRTL/include/generated_microtask_cases.gen rename to openmp/device/include/generated_microtask_cases.gen diff --git a/offload/DeviceRTL/src/Allocator.cpp b/openmp/device/src/Allocator.cpp similarity index 100% rename from offload/DeviceRTL/src/Allocator.cpp rename to openmp/device/src/Allocator.cpp diff --git a/offload/DeviceRTL/src/Configuration.cpp b/openmp/device/src/Configuration.cpp similarity index 100% rename from offload/DeviceRTL/src/Configuration.cpp rename to openmp/device/src/Configuration.cpp diff --git a/offload/DeviceRTL/src/Debug.cpp b/openmp/device/src/Debug.cpp similarity index 100% rename from offload/DeviceRTL/src/Debug.cpp rename to openmp/device/src/Debug.cpp diff --git a/offload/DeviceRTL/src/DeviceUtils.cpp b/openmp/device/src/DeviceUtils.cpp similarity index 100% rename from offload/DeviceRTL/src/DeviceUtils.cpp rename to openmp/device/src/DeviceUtils.cpp diff --git a/offload/DeviceRTL/src/Kernel.cpp b/openmp/device/src/Kernel.cpp similarity index 100% rename from offload/DeviceRTL/src/Kernel.cpp rename to 
openmp/device/src/Kernel.cpp diff --git a/offload/DeviceRTL/src/LibC.cpp b/openmp/device/src/LibC.cpp similarity index 100% rename from offload/DeviceRTL/src/LibC.cpp rename to openmp/device/src/LibC.cpp diff --git a/offload/DeviceRTL/src/Mapping.cpp b/openmp/device/src/Mapping.cpp similarity index 100% rename from offload/DeviceRTL/src/Mapping.cpp rename to openmp/device/src/Mapping.cpp diff --git a/offload/DeviceRTL/src/Misc.cpp b/openmp/device/src/Misc.cpp similarity index 100% rename from offload/DeviceRTL/src/Misc.cpp rename to openmp/device/src/Misc.cpp diff --git a/offload/DeviceRTL/src/Parallelism.cpp b/openmp/device/src/Parallelism.cpp similarity index 100% rename from offload/DeviceRTL/src/Parallelism.cpp rename to openmp/device/src/Parallelism.cpp diff --git a/offload/DeviceRTL/src/Profiling.cpp b/openmp/device/src/Profiling.cpp similarity index 100% rename from offload/DeviceRTL/src/Profiling.cpp rename to openmp/device/src/Profiling.cpp diff --git a/offload/DeviceRTL/src/Reduction.cpp b/openmp/device/src/Reduction.cpp similarity index 100% rename from offload/DeviceRTL/src/Reduction.cpp rename to openmp/device/src/Reduction.cpp diff --git a/offload/DeviceRTL/src/State.cpp b/openmp/device/src/State.cpp similarity index 100% rename from offload/DeviceRTL/src/State.cpp rename to openmp/device/src/State.cpp diff --git a/offload/DeviceRTL/src/Stub.cpp b/openmp/device/src/Stub.cpp similarity index 100% rename from offload/DeviceRTL/src/Stub.cpp rename to openmp/device/src/Stub.cpp diff --git a/offload/DeviceRTL/src/Synchronization.cpp b/openmp/device/src/Synchronization.cpp similarity index 100% rename from offload/DeviceRTL/src/Synchronization.cpp rename to openmp/device/src/Synchronization.cpp diff --git a/offload/DeviceRTL/src/Tasking.cpp b/openmp/device/src/Tasking.cpp similarity index 100% rename from offload/DeviceRTL/src/Tasking.cpp rename to openmp/device/src/Tasking.cpp diff --git a/offload/DeviceRTL/src/Workshare.cpp 
b/openmp/device/src/Workshare.cpp similarity index 100% rename from offload/DeviceRTL/src/Workshare.cpp rename to openmp/device/src/Workshare.cpp diff --git a/openmp/docs/SupportAndFAQ.rst b/openmp/docs/SupportAndFAQ.rst index b645723dcfd5e..9e8473ce384d1 100644 --- a/openmp/docs/SupportAndFAQ.rst +++ b/openmp/docs/SupportAndFAQ.rst @@ -78,6 +78,13 @@ Clang will be built with all backends enabled. When building with ``LLVM_ENABLE_RUNTIMES="openmp"`` OpenMP should not be enabled in ``LLVM_ENABLE_PROJECTS`` because it is enabled by default. +Support for the device library comes from a separate build of the OpenMP library +that targets the GPU architecture. Building it requires enabling the runtime +targets, or setting the target manually when doing a standalone build. This is +done with the ``LLVM_RUNTIME_TARGETS`` option and then enabling the OpenMP +runtime for the GPU target via ``RUNTIMES_<triple>_LLVM_ENABLE_RUNTIMES``. Refer to +the cache file for the specific invocation. + For Nvidia offload, please see :ref:`build_nvidia_offload_capable_compiler`. For AMDGPU offload, please see :ref:`build_amdgpu_offload_capable_compiler`.