github.com/johnnyeven/libtools@v0.0.0-20191126065708-61829c1adf46/third_party/nccl/build_defs.bzl.tpl

github.com/johnnyeven/libtools@v0.0.0-20191126065708-61829c1adf46/third_party/nccl/build_defs.bzl.tpl (about)

"""Build rules and macros for compiling NCCL with relocatable device code."""
     2  
     3  load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts")
     4  load("@bazel_tools//tools/cpp:toolchain_utils.bzl", "find_cpp_toolchain")
     5  
     6  def _gen_device_srcs_impl(ctx):
     7      ops = ["sum", "prod", "min", "max"]
     8      types = ["i8", "u8", "i32", "u32", "i64", "u64", "f16", "f32", "f64"]
     9      hdr_tail = "****************************************/"
    10      defines = "\n\n#define NCCL_OP %d\n#define NCCL_TYPE %d"
    11  
    12      files = []
    13      for NCCL_OP, op in enumerate(ops):
    14          for NCCL_TYPE, dt in enumerate(types):
    15              substitutions = {
    16                  hdr_tail: hdr_tail + defines % (NCCL_OP, NCCL_TYPE),
    17              }
    18              for src in ctx.files.srcs:
    19                  name = "%s_%s_%s" % (op, dt, src.basename)
    20                  file = ctx.actions.declare_file(name, sibling = src)
    21                  ctx.actions.expand_template(
    22                      output = file,
    23                      template = src,
    24                      substitutions = substitutions,
    25                  )
    26                  files.append(file)
    27      return [DefaultInfo(files = depset(files))]
    28  
# Rule wrapping _gen_device_srcs_impl: for every file in 'srcs' it emits one
# copy per (op, type) pair, prefixed with "<op>_<type>_" and carrying
# '#define NCCL_OP' and '#define NCCL_TYPE' lines.
gen_device_srcs = rule(
    implementation = _gen_device_srcs_impl,
    attrs = {
        # Template files to instantiate; any file type is accepted.
        "srcs": attr.label_list(allow_files = True),
    },
)
"""Adds prefix to each file name in srcs and adds #define NCCL_OP."""
    36  
    37  def _rdc_copts():
    38      """Returns copts for compiling relocatable device code."""
    39  
    40      # The global functions can not have a lower register count than the
    41      # device functions. This is enforced by setting a fixed register count.
    42      # https://github.com/NVIDIA/nccl/blob/f93fe9bfd94884cec2ba711897222e0df5569a53/makefiles/common.mk#L48
    43      maxrregcount = "-maxrregcount=96"
    44  
    45      return cuda_default_copts() + select({
    46          "@local_config_cuda//cuda:using_nvcc": [
    47              "-nvcc_options",
    48              "relocatable-device-code=true",
    49              "-nvcc_options",
    50              "ptxas-options=" + maxrregcount,
    51          ],
    52          "@local_config_cuda//cuda:using_clang": [
    53              "-fcuda-rdc",
    54              "-Xcuda-ptxas",
    55              maxrregcount,
    56          ],
    57          "//conditions:default": [],
    58      })
    59  
    60  def _lookup_file(filegroup, path):
    61      """Extracts file at (relative) path in filegroup."""
    62      for file in filegroup.files:
    63          if file.path.endswith(path):
    64              return file
    65      return None
    66  
    67  def _pic_only(files):
    68      """Returns the PIC files if there are any in 'files', otherwise 'files'."""
    69      pic_only = [f for f in files if f.basename.find(".pic.") >= 0]
    70      return pic_only if pic_only else files
    71  
    72  def _device_link_impl(ctx):
    73      if not ctx.attr.gpu_archs:
    74          fail("No GPU architecture specified. NCCL requires --config=cuda or similar.")
    75  
    76      inputs = []
    77      for dep in ctx.attr.deps:
    78          inputs += dep.files.to_list()
    79      inputs = _pic_only(inputs)
    80  
    81      # Device-link to cubins for each architecture.
    82      name = ctx.attr.name
    83      register_h = None
    84      cubins = []
    85      images = []
    86      for arch in ctx.attr.gpu_archs:
    87          cubin = ctx.actions.declare_file("%s_%s.cubin" % (name, arch))
    88          register_h = ctx.actions.declare_file("%s_register_%s.h" % (name, arch))
    89          ctx.actions.run(
    90              outputs = [register_h, cubin],
    91              inputs = inputs,
    92              executable = ctx.file._nvlink,
    93              arguments = ctx.attr.nvlink_args + [
    94                  "--arch=%s" % arch,
    95                  "--register-link-binaries=%s" % register_h.path,
    96                  "--output-file=%s" % cubin.path,
    97              ] + [file.path for file in inputs],
    98              mnemonic = "nvlink",
    99          )
   100          cubins.append(cubin)
   101          images.append("--image=profile=%s,file=%s" % (arch, cubin.path))
   102  
   103      # Generate fatbin header from all cubins.
   104      tmp_fatbin = ctx.actions.declare_file("%s.fatbin" % name)
   105      fatbin_h = ctx.actions.declare_file("%s_fatbin.h" % name)
   106      bin2c = ctx.file._bin2c
   107      ctx.actions.run(
   108          outputs = [tmp_fatbin, fatbin_h],
   109          inputs = cubins,
   110          executable = ctx.file._fatbinary,
   111          arguments = [
   112              "-64",
   113              "--cmdline=--compile-only",
   114              "--link",
   115              "--compress-all",
   116              "--bin2c-path=%s" % bin2c.dirname,
   117              "--create=%s" % tmp_fatbin.path,
   118              "--embedded-fatbin=%s" % fatbin_h.path,
   119          ] + images,
   120          tools = [bin2c],
   121          mnemonic = "fatbinary",
   122      )
   123  
   124      # Generate the source file #including the headers generated above.
   125      ctx.actions.expand_template(
   126          output = ctx.outputs.out,
   127          template = ctx.file._link_stub,
   128          substitutions = {
   129              "REGISTERLINKBINARYFILE": '"%s"' % register_h.short_path,
   130              "FATBINFILE": '"%s"' % fatbin_h.short_path,
   131          },
   132      )
   133  
   134      return [DefaultInfo(files = depset([register_h, fatbin_h]))]
   135  
# Private rule wrapping _device_link_impl. The underscore-prefixed attributes
# are implicit dependencies on tools and files from the CUDA installation
# exposed by @local_config_cuda.
_device_link = rule(
    implementation = _device_link_impl,
    attrs = {
        # Targets whose files are handed to nvlink.
        "deps": attr.label_list(),
        # Source file expanded from the link stub template.
        "out": attr.output(mandatory = True),
        # Architectures to link for; the implementation fails if this is empty.
        "gpu_archs": attr.string_list(),
        # Extra arguments placed first on every nvlink command line.
        "nvlink_args": attr.string_list(),
        # Device linker, run once per GPU architecture.
        "_nvlink": attr.label(
            default = Label("@local_config_cuda//cuda:cuda/bin/nvlink"),
            allow_single_file = True,
            executable = True,
            cfg = "host",
        ),
        # Packs the per-architecture cubins into one fatbin header.
        "_fatbinary": attr.label(
            default = Label("@local_config_cuda//cuda:cuda/bin/fatbinary"),
            allow_single_file = True,
            executable = True,
            cfg = "host",
        ),
        # Not run directly; its directory is passed to fatbinary as
        # --bin2c-path.
        "_bin2c": attr.label(
            default = Label("@local_config_cuda//cuda:cuda/bin/bin2c"),
            allow_single_file = True,
            executable = True,
            cfg = "host",
        ),
        # Template for the generated 'out' source file.
        "_link_stub": attr.label(
            default = Label("@local_config_cuda//cuda:cuda/bin/crt/link.stub"),
            allow_single_file = True,
        ),
    },
)
"""Links device code and generates source code for kernel registration."""
   168  
   169  def _merge_archive_impl(ctx):
   170      # Generate an mri script to the merge archives in srcs and pass it to 'ar'.
   171      # See https://stackoverflow.com/a/23621751.
   172      files = _pic_only(ctx.files.srcs)
   173      mri_script = "create " + ctx.outputs.out.path
   174      for f in files:
   175          mri_script += "\\naddlib " + f.path
   176      mri_script += "\\nsave\\nend"
   177  
   178      cc_toolchain = find_cpp_toolchain(ctx)
   179      ctx.actions.run_shell(
   180          inputs = ctx.files.srcs,  # + ctx.files._crosstool,
   181          outputs = [ctx.outputs.out],
   182          command = "printf \"%s\" | %s -M" % (mri_script, cc_toolchain.ar_executable),
   183      )
   184  
# Private rule wrapping _merge_archive_impl. The merged archive is the
# predeclared output 'out', named "lib<name>.a".
_merge_archive = rule(
    implementation = _merge_archive_impl,
    attrs = {
        # Archives to merge.
        "srcs": attr.label_list(mandatory = True, allow_files = True),
        # C++ toolchain from which the implementation takes the 'ar' executable.
        "_cc_toolchain": attr.label(default = "@bazel_tools//tools/cpp:current_cc_toolchain"),
        # "_crosstool": attr.label_list(cfg = "host", default = ["@bazel_tools//tools/cpp:crosstool"]),
    },
    outputs = {"out": "lib%{name}.a"},
)
"""Merges srcs into a single archive."""
   195  
   196  def cuda_rdc_library(name, hdrs = None, copts = None, linkstatic = True, **kwargs):
   197      """Produces a cuda_library using separate compilation and linking.
   198  
   199      CUDA separate compilation and linking allows device function calls across
   200      translation units. This is different from the normal whole program
   201      compilation where each translation unit contains all device code. For more
   202      background, see
   203      https://devblogs.nvidia.com/separate-compilation-linking-cuda-device-code/,
   204      https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#nvcc-options-for-separate-compilation
   205  
   206      During separate compilation, the different CUDA source files are compiled
   207      to 'relocatable device code' (RDC) and embedded in the host object files.
   208      When using nvcc, linking the device code for each supported GPU
   209      architecture and generating kernel registration code for the CUDA runtime
   210      is handled automatically. Clang supports generating relocatable device
   211      code, but it can't link it. We therefore rely on tools provided by the CUDA
   212      SDK to link the device code and generate the host code to register the
   213      kernels.
   214  
   215      The nvlink tool extracts the RDC code from the object files and links it
   216      into cubin files, one per GPU architecture. It also produces a header file
   217      with a list of kernel names to register. The cubins are merged into a
   218      binary blob using the fatbinary tool, and converted to a C header file with
   219      the help of the bin2c tool. The registration header file, the fatbinary
   220      header file, and the link.stub file (shipped with the CUDA SDK) are
   221      compiled as ordinary host code.
   222  
   223      Here is a diagram of the CUDA separate compilation trajectory:
   224  
   225       x.cu.cc    y.cu.cc
   226             \    /            cc_library (compile RDC and archive)
   227              xy.a
   228             /    \            * nvlink
   229      register.h  xy.cubin
   230            :      |           * fatbinary and bin2c
   231            :     xy.fatbin.h
   232            :      :           * #include
   233            dlink.cc           * Expanded from crt/dlink.stub template
   234               |               cc_library (host compile and archive)
   235            dlink.a
   236  
   237      The steps marked with '*' are implemented in the _device_link rule.
   238  
   239      The object files in both xy.a and dlink.a reference symbols defined in the
   240      other archive. The separate archives are a side effect of using two
   241      cc_library targets to implement a single compilation trajectory. We could
   242      fix this once bazel supports C++ sandwich. For now, we just merge the two
   243      archives to avoid unresolved symbols:
   244  
   245      xy.a      dlink.a
   246          \    /           merge archive
   247        xy_dlink.a
   248             |             cc_library (or alternatively, cc_import)
   249       final target
   250  
   251      Another complication is that cc_library produces (depending on the
   252      configuration) both PIC and non-PIC archives, but the distinction
   253      is hidden from Starlark until C++ sandwich becomes available. We work
   254      around this by dropping the non-PIC files if PIC files are available.
   255  
   256      Args:
   257        name: Target name.
   258        hdrs: Header files.
   259        copts: Compiler options.
   260        linkstatic: Must be true.
   261        **kwargs: Any other arguments.
   262      """
   263  
   264      if not hdrs:
   265          hdrs = []
   266      if not copts:
   267          copts = []
   268  
   269      # Compile host and device code into library.
   270      lib = name + "_lib"
   271      native.cc_library(
   272          name = lib,
   273          hdrs = hdrs,
   274          copts = _rdc_copts() + copts,
   275          linkstatic = linkstatic,
   276          **kwargs
   277      )
   278  
   279      # Generate source file containing linked device code.
   280      dlink_hdrs = name + "_dlink_hdrs"
   281      dlink_cc = name + "_dlink.cc"
   282      _device_link(
   283          name = dlink_hdrs,
   284          deps = [lib],
   285          out = dlink_cc,
   286          gpu_archs = %{gpu_architectures},
   287          nvlink_args = select({
   288              "@org_tensorflow//tensorflow:linux_x86_64": ["--cpu-arch=X86_64"],
   289              "@org_tensorflow//tensorflow:linux_ppc64le": ["--cpu-arch=PPC64LE"],
   290              "//conditions:default": [],
   291          }),
   292      )
   293  
   294      # Compile the source file into a library.
   295      dlink = name + "_dlink"
   296      native.cc_library(
   297          name = dlink,
   298          srcs = [dlink_cc],
   299          textual_hdrs = [dlink_hdrs],
   300          deps = [
   301              "@local_config_cuda//cuda:cuda_headers",
   302          ],
   303          defines = [
   304              # Silence warning about including internal header.
   305              "__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__",
   306              # Macros that need to be defined starting with CUDA 10.
   307              "__NV_EXTRA_INITIALIZATION=",
   308              "__NV_EXTRA_FINALIZATION=",
   309          ],
   310          linkstatic = linkstatic,
   311      )
   312  
   313      # Repackage the two libs into a single archive. This is required because
   314      # both libs reference symbols defined in the other one. For details, see
   315      # https://eli.thegreenplace.net/2013/07/09/library-order-in-static-linking
   316      archive = name + "_a"
   317      _merge_archive(
   318          name = archive,
   319          srcs = [lib, dlink],
   320      )
   321  
   322      # Create cc target from archive.
   323      native.cc_library(
   324          name = name,
   325          srcs = [archive],
   326          hdrs = hdrs,
   327          linkstatic = linkstatic,
   328      )