Browse Source

CUDA: Static libraries can now explicitly resolve device symbols

If a static library has the property CUDA_RESOLVE_DEVICE_SYMBOLS enabled
it will now perform the device link step. The normal behavior is
to delay calling device link until the static library is consumed by
a shared library or an executable.
Robert Maynard 8 years ago
parent
commit
493671a521

+ 1 - 0
Help/manual/cmake-properties.7.rst

@@ -154,6 +154,7 @@ Properties on Targets
    /prop_tgt/CROSSCOMPILING_EMULATOR
    /prop_tgt/CUDA_PTX_COMPILATION
    /prop_tgt/CUDA_SEPARABLE_COMPILATION
+   /prop_tgt/CUDA_RESOLVE_DEVICE_SYMBOLS
    /prop_tgt/CUDA_EXTENSIONS
    /prop_tgt/CUDA_STANDARD
    /prop_tgt/CUDA_STANDARD_REQUIRED

+ 15 - 0
Help/prop_tgt/CUDA_RESOLVE_DEVICE_SYMBOLS.rst

@@ -0,0 +1,15 @@
+CUDA_RESOLVE_DEVICE_SYMBOLS
+---------------------------
+
+CUDA only: Enables device linking for the specific static library target
+
+If set this will enable device linking on this static library target. Normally
+device linking is deferred until a shared library or executable is generated,
+allowing for multiple static libraries to resolve device symbols at the same
+time.
+
+For instance:
+
+.. code-block:: cmake
+
+  set_property(TARGET mystaticlib PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)

+ 18 - 3
Source/cmLinkLineDeviceComputer.cxx

@@ -39,9 +39,24 @@ std::string cmLinkLineDeviceComputer::ComputeLinkLibraries(
       continue;
     }
 
-    if (li->Target->GetType() == cmStateEnums::INTERFACE_LIBRARY ||
-        li->Target->GetType() == cmStateEnums::SHARED_LIBRARY ||
-        li->Target->GetType() == cmStateEnums::MODULE_LIBRARY) {
+    bool skippable = false;
+    switch (li->Target->GetType()) {
+      case cmStateEnums::SHARED_LIBRARY:
+      case cmStateEnums::MODULE_LIBRARY:
+      case cmStateEnums::INTERFACE_LIBRARY:
+        skippable = true;
+        break;
+      case cmStateEnums::STATIC_LIBRARY:
+        // If a static library is resolving its device linking, it should
+        // be removed for other device linking
+        skippable =
+          li->Target->GetPropertyAsBool("CUDA_RESOLVE_DEVICE_SYMBOLS");
+        break;
+      default:
+        break;
+    }
+
+    if (skippable) {
       continue;
     }
 

+ 28 - 0
Source/cmMakefileLibraryTargetGenerator.cxx

@@ -127,6 +127,24 @@ void cmMakefileLibraryTargetGenerator::WriteObjectLibraryRules()
 
 void cmMakefileLibraryTargetGenerator::WriteStaticLibraryRules()
 {
+  const std::string cuda_lang("CUDA");
+  cmGeneratorTarget::LinkClosure const* closure =
+    this->GeneratorTarget->GetLinkClosure(this->ConfigName);
+
+  const bool hasCUDA =
+    (std::find(closure->Languages.begin(), closure->Languages.end(),
+               cuda_lang) != closure->Languages.end());
+
+  const bool resolveDeviceSymbols =
+    this->GeneratorTarget->GetPropertyAsBool("CUDA_RESOLVE_DEVICE_SYMBOLS");
+  if (hasCUDA && resolveDeviceSymbols) {
+    std::string linkRuleVar = "CMAKE_CUDA_DEVICE_LINK_LIBRARY";
+    std::string extraFlags;
+    this->LocalGenerator->AppendFlags(
+      extraFlags, this->GeneratorTarget->GetProperty("LINK_FLAGS"));
+    this->WriteDeviceLibraryRules(linkRuleVar, extraFlags, false);
+  }
+
   std::string linkLanguage =
     this->GeneratorTarget->GetLinkerLanguage(this->ConfigName);
 
@@ -861,6 +879,16 @@ void cmMakefileLibraryTargetGenerator::WriteLibraryRules(
       std::vector<std::string> object_strings;
       this->WriteObjectsStrings(object_strings, archiveCommandLimit);
 
+      // Add the cuda device object to the list of archive files. This will
+      // only occur on archives which have CUDA_RESOLVE_DEVICE_SYMBOLS enabled
+      if (!this->DeviceLinkObject.empty()) {
+        object_strings.push_back(this->LocalGenerator->ConvertToOutputFormat(
+          this->LocalGenerator->MaybeConvertToRelativePath(
+            this->LocalGenerator->GetCurrentBinaryDirectory(),
+            this->DeviceLinkObject),
+          cmOutputConverter::SHELL));
+      }
+
       // Create the archive with the first set of objects.
       std::vector<std::string>::iterator osi = object_strings.begin();
       {

+ 6 - 1
Source/cmNinjaNormalTargetGenerator.cxx

@@ -447,6 +447,7 @@ std::vector<std::string> cmNinjaNormalTargetGenerator::ComputeDeviceLinkCmd()
   // an executable or a dynamic library.
   std::string linkCmd;
   switch (this->GetGeneratorTarget()->GetType()) {
+    case cmStateEnums::STATIC_LIBRARY:
     case cmStateEnums::SHARED_LIBRARY:
     case cmStateEnums::MODULE_LIBRARY: {
       const std::string cudaLinkCmd(
@@ -559,11 +560,15 @@ void cmNinjaNormalTargetGenerator::WriteDeviceLinkStatement()
     case cmStateEnums::EXECUTABLE:
       shouldHaveDeviceLinking = true;
       break;
+    case cmStateEnums::STATIC_LIBRARY:
+      shouldHaveDeviceLinking =
+        genTarget.GetPropertyAsBool("CUDA_RESOLVE_DEVICE_SYMBOLS");
+      break;
     default:
       break;
   }
 
-  if (!shouldHaveDeviceLinking || !hasCUDA) {
+  if (!(shouldHaveDeviceLinking && hasCUDA)) {
     return;
   }
 

+ 4 - 0
Source/cmVisualStudio10TargetGenerator.cxx

@@ -2563,6 +2563,10 @@ bool cmVisualStudio10TargetGenerator::ComputeCudaLinkOptions(
     case cmStateEnums::EXECUTABLE:
       doDeviceLinking = true;
       break;
+    case cmStateEnums::STATIC_LIBRARY:
+      doDeviceLinking = this->GeneratorTarget->GetPropertyAsBool(
+        "CUDA_RESOLVE_DEVICE_SYMBOLS");
+      break;
     default:
       break;
   }

+ 1 - 0
Tests/CudaOnly/CMakeLists.txt

@@ -3,3 +3,4 @@ ADD_TEST_MACRO(CudaOnly.EnableStandard CudaOnlyEnableStandard)
 ADD_TEST_MACRO(CudaOnly.ExportPTX CudaOnlyExportPTX)
 ADD_TEST_MACRO(CudaOnly.SeparateCompilation CudaOnlySeparateCompilation)
 ADD_TEST_MACRO(CudaOnly.WithDefs CudaOnlyWithDefs)
+ADD_TEST_MACRO(CudaOnly.ResolveDeviceSymbols CudaOnlyResolveDeviceSymbols)

+ 52 - 0
Tests/CudaOnly/ResolveDeviceSymbols/CMakeLists.txt

@@ -0,0 +1,52 @@
+cmake_minimum_required(VERSION 3.7)
+project (CudaOnlyResolveDeviceSymbols CUDA)
+
+# Find nm and dumpbin
+if(CMAKE_NM)
+  set(dump_command ${CMAKE_NM})
+  set(dump_args -g)
+else()
+  include(GetPrerequisites)
+  message(STATUS "calling list_prerequisites to find dumpbin")
+  list_prerequisites("${CMAKE_COMMAND}" 0 0 0)
+  if(gp_dumpbin)
+    set(dump_command ${gp_dumpbin})
+    set(dump_args /ARCHIVEMEMBERS)
+  endif()
+endif()
+
+#Goal for this example:
+#Build a static library that defines multiple methods and kernels that
+#use each other.
+#Use a custom command to build an executable that uses this static library
+#We do these together to verify that we can get a static library to do
+#device symbol linking, and not have it done when the executable is made
+string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_30,code=compute_30")
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CUDA_STANDARD 11)
+
+add_library(CUDAResolveDeviceLib STATIC file1.cu file2.cu)
+set_target_properties(CUDAResolveDeviceLib
+                      PROPERTIES
+                      CUDA_SEPARABLE_COMPILATION ON
+                      CUDA_RESOLVE_DEVICE_SYMBOLS ON
+                      POSITION_INDEPENDENT_CODE ON)
+
+if(dump_command)
+add_custom_command(TARGET CUDAResolveDeviceLib POST_BUILD
+  COMMAND ${CMAKE_COMMAND}
+  -DDUMP_COMMAND=${dump_command}
+  -DDUMP_ARGS=${dump_args}
+  -DTEST_LIBRARY_PATH=$<TARGET_FILE:CUDAResolveDeviceLib>
+  -P ${CMAKE_CURRENT_SOURCE_DIR}/verify.cmake
+  )
+endif()
+
+add_executable(CudaOnlyResolveDeviceSymbols main.cu)
+target_link_libraries(CudaOnlyResolveDeviceSymbols PRIVATE CUDAResolveDeviceLib)
+
+if(APPLE)
+  # We need to add the default path to the driver (libcuda.dylib) as an rpath, so that
+  # the static cuda runtime can find it at runtime.
+  target_link_libraries(CudaOnlyResolveDeviceSymbols PRIVATE -Wl,-rpath,/usr/local/cuda/lib)
+endif()

+ 10 - 0
Tests/CudaOnly/ResolveDeviceSymbols/file1.cu

@@ -0,0 +1,10 @@
+
+#include "file1.h"
+
+result_type __device__ file1_func(int x)
+{
+  result_type r;
+  r.input = x;
+  r.sum = x * x;
+  return r;
+}

+ 7 - 0
Tests/CudaOnly/ResolveDeviceSymbols/file1.h

@@ -0,0 +1,7 @@
+
+#pragma once
+struct result_type
+{
+  int input;
+  int sum;
+};

+ 25 - 0
Tests/CudaOnly/ResolveDeviceSymbols/file2.cu

@@ -0,0 +1,25 @@
+
+#include "file2.h"
+
+result_type __device__ file1_func(int x);
+
+result_type_dynamic __device__ file2_func(int x)
+{
+  const result_type r = file1_func(x);
+  const result_type_dynamic rd{ r.input, r.sum, true };
+  return rd;
+}
+
+static __global__ void file2_kernel(result_type_dynamic& r, int x)
+{
+  // call static_func which is a method that is defined in the
+  // static library that is always out of date
+  r = file2_func(x);
+}
+
+int file2_launch_kernel(int x)
+{
+  result_type_dynamic r;
+  file2_kernel<<<1, 1>>>(r, x);
+  return r.sum;
+}

+ 10 - 0
Tests/CudaOnly/ResolveDeviceSymbols/file2.h

@@ -0,0 +1,10 @@
+
+#pragma once
+#include "file1.h"
+
+struct result_type_dynamic
+{
+  int input;
+  int sum;
+  bool from_static;
+};

+ 85 - 0
Tests/CudaOnly/ResolveDeviceSymbols/main.cu

@@ -0,0 +1,85 @@
+
+#include <iostream>
+
+#include "file1.h"
+#include "file2.h"
+
+int file2_launch_kernel(int x);
+
+result_type_dynamic __device__ file2_func(int x);
+static __global__ void main_kernel(result_type_dynamic& r, int x)
+{
+  // call function that was not device linked to us, this will cause
+  // a runtime failure of "invalid device function"
+  r = file2_func(x);
+}
+
+int main_launch_kernel(int x)
+{
+  result_type_dynamic r;
+  main_kernel<<<1, 1>>>(r, x);
+  return r.sum;
+}
+
+int choose_cuda_device()
+{
+  int nDevices = 0;
+  cudaError_t err = cudaGetDeviceCount(&nDevices);
+  if (err != cudaSuccess) {
+    std::cerr << "Failed to retrieve the number of CUDA enabled devices"
+              << std::endl;
+    return 1;
+  }
+  for (int i = 0; i < nDevices; ++i) {
+    cudaDeviceProp prop;
+    cudaError_t err = cudaGetDeviceProperties(&prop, i);
+    if (err != cudaSuccess) {
+      std::cerr << "Could not retrieve properties from CUDA device " << i
+                << std::endl;
+      return 1;
+    }
+    std::cout << "prop.major: " << prop.major << std::endl;
+    if (prop.major >= 3) {
+      err = cudaSetDevice(i);
+      if (err != cudaSuccess) {
+        std::cout << "Could not select CUDA device " << i << std::endl;
+      } else {
+        return 0;
+      }
+    }
+  }
+
+  std::cout << "Could not find a CUDA enabled card supporting compute >=3.0"
+            << std::endl;
+
+  return 1;
+}
+
+int main(int argc, char** argv)
+{
+  int ret = choose_cuda_device();
+  if (ret) {
+    return 0;
+  }
+
+  cudaError_t err;
+  file2_launch_kernel(42);
+  err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    std::cerr << "file2_launch_kernel: kernel launch failed: "
+              << cudaGetErrorString(err) << std::endl;
+    return 1;
+  }
+
+  main_launch_kernel(1);
+  err = cudaGetLastError();
+  if (err == cudaSuccess) {
+    // This kernel launch should fail as the file2_func was device linked
+    // into the static library and is not usable by the executable
+    std::cerr << "main_launch_kernel: kernel launch should have failed"
+              << std::endl;
+    return 1;
+  }
+
+  return 0;
+}

+ 14 - 0
Tests/CudaOnly/ResolveDeviceSymbols/verify.cmake

@@ -0,0 +1,14 @@
+execute_process(COMMAND ${DUMP_COMMAND} ${DUMP_ARGS} ${TEST_LIBRARY_PATH}
+  RESULT_VARIABLE RESULT
+  OUTPUT_VARIABLE OUTPUT
+  ERROR_VARIABLE ERROR
+)
+
+if(NOT "${RESULT}" STREQUAL "0")
+  message(FATAL_ERROR "${DUMP_COMMAND} failed [${RESULT}] [${OUTPUT}] [${ERROR}]")
+endif()
+
+if(NOT "${OUTPUT}" MATCHES "(cmake_device_link|device-link)")
+  message(FATAL_ERROR
+    "No cuda device objects found, device linking did not occur")
+endif()