"""Repository rule for NCCL."""

load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts")
load("@bazel_tools//tools/cpp:toolchain_utils.bzl", "find_cpp_toolchain")

def _gen_device_srcs_impl(ctx):
    """Expands every file in srcs once per (reduction op, data type) pair.

    Each output is declared next to its source as "<op>_<type>_<basename>".
    The template expansion appends "#define NCCL_OP <op index>" and
    "#define NCCL_TYPE <type index>" right after the header-tail marker
    found in the source file.

    Args:
      ctx: Rule context; reads ctx.files.srcs.

    Returns:
      A list with a DefaultInfo carrying all generated files.
    """
    ops = ["sum", "prod", "min", "max"]
    types = ["i8", "u8", "i32", "u32", "i64", "u64", "f16", "f32", "f64"]

    # Text that the templates are searched for; the defines are inserted
    # directly after it.
    hdr_tail = "****************************************/"
    defines = "\n\n#define NCCL_OP %d\n#define NCCL_TYPE %d"

    files = []
    for op_index, op in enumerate(ops):
        for type_index, dt in enumerate(types):
            substitutions = {
                hdr_tail: hdr_tail + defines % (op_index, type_index),
            }
            for src in ctx.files.srcs:
                name = "%s_%s_%s" % (op, dt, src.basename)
                file = ctx.actions.declare_file(name, sibling = src)
                ctx.actions.expand_template(
                    output = file,
                    template = src,
                    substitutions = substitutions,
                )
                files.append(file)
    return [DefaultInfo(files = depset(files))]

gen_device_srcs = rule(
    implementation = _gen_device_srcs_impl,
    attrs = {
        "srcs": attr.label_list(allow_files = True),
    },
)
"""Adds prefix to each file name in srcs and adds #define NCCL_OP."""

def _rdc_copts():
    """Returns copts for compiling relocatable device code."""

    # The global functions can not have a lower register count than the
    # device functions. This is enforced by setting a fixed register count.
    # https://github.com/NVIDIA/nccl/blob/f93fe9bfd94884cec2ba711897222e0df5569a53/makefiles/common.mk#L48
    maxrregcount = "-maxrregcount=96"

    return cuda_default_copts() + select({
        "@local_config_cuda//cuda:using_nvcc": [
            "-nvcc_options",
            "relocatable-device-code=true",
            "-nvcc_options",
            "ptxas-options=" + maxrregcount,
        ],
        "@local_config_cuda//cuda:using_clang": [
            "-fcuda-rdc",
            "-Xcuda-ptxas",
            maxrregcount,
        ],
        "//conditions:default": [],
    })

def _lookup_file(filegroup, path):
    """Extracts file at (relative) path in filegroup.

    Returns the first file whose path ends with 'path', or None if no file
    matches.
    """
    for file in filegroup.files:
        if file.path.endswith(path):
            return file
    return None

def _pic_only(files):
    """Returns the PIC files if there are any in 'files', otherwise 'files'."""
    pic_only = [f for f in files if ".pic." in f.basename]
    return pic_only if pic_only else files

def _device_link_impl(ctx):
    """Device-links the deps and generates the kernel registration source.

    Runs nvlink once per entry of gpu_archs to produce a cubin and a
    registration header, merges all cubins into a fatbin header with the
    fatbinary tool, and expands the link stub template into ctx.outputs.out
    so that it includes the registration header and the fatbin header.

    Args:
      ctx: Rule context; reads the deps, gpu_archs and nvlink_args attributes
        and the private tool labels.

    Returns:
      A list with a DefaultInfo carrying the registration header and the
      fatbin header.
    """
    if not ctx.attr.gpu_archs:
        fail("No GPU architecture specified. NCCL requires --config=cuda or similar.")

    inputs = []
    for dep in ctx.attr.deps:
        inputs += dep.files.to_list()
    inputs = _pic_only(inputs)

    # Device-link to cubins for each architecture.
    name = ctx.attr.name
    register_h = None
    cubins = []
    images = []
    for arch in ctx.attr.gpu_archs:
        cubin = ctx.actions.declare_file("%s_%s.cubin" % (name, arch))
        register_h = ctx.actions.declare_file("%s_register_%s.h" % (name, arch))
        ctx.actions.run(
            outputs = [register_h, cubin],
            inputs = inputs,
            executable = ctx.file._nvlink,
            arguments = ctx.attr.nvlink_args + [
                "--arch=%s" % arch,
                "--register-link-binaries=%s" % register_h.path,
                "--output-file=%s" % cubin.path,
            ] + [file.path for file in inputs],
            mnemonic = "nvlink",
        )
        cubins.append(cubin)
        images.append("--image=profile=%s,file=%s" % (arch, cubin.path))

    # Generate fatbin header from all cubins.
    tmp_fatbin = ctx.actions.declare_file("%s.fatbin" % name)
    fatbin_h = ctx.actions.declare_file("%s_fatbin.h" % name)
    bin2c = ctx.file._bin2c
    ctx.actions.run(
        outputs = [tmp_fatbin, fatbin_h],
        inputs = cubins,
        executable = ctx.file._fatbinary,
        arguments = [
            "-64",
            "--cmdline=--compile-only",
            "--link",
            "--compress-all",
            "--bin2c-path=%s" % bin2c.dirname,
            "--create=%s" % tmp_fatbin.path,
            "--embedded-fatbin=%s" % fatbin_h.path,
        ] + images,
        tools = [bin2c],
        mnemonic = "fatbinary",
    )

    # Generate the source file #including the headers generated above.
    # At this point register_h is the header declared for the last entry of
    # gpu_archs; that is the one referenced by the stub and returned below.
    ctx.actions.expand_template(
        output = ctx.outputs.out,
        template = ctx.file._link_stub,
        substitutions = {
            "REGISTERLINKBINARYFILE": '"%s"' % register_h.short_path,
            "FATBINFILE": '"%s"' % fatbin_h.short_path,
        },
    )

    return [DefaultInfo(files = depset([register_h, fatbin_h]))]

_device_link = rule(
    implementation = _device_link_impl,
    attrs = {
        "deps": attr.label_list(),
        "out": attr.output(mandatory = True),
        "gpu_archs": attr.string_list(),
        "nvlink_args": attr.string_list(),
        "_nvlink": attr.label(
            default = Label("@local_config_cuda//cuda:cuda/bin/nvlink"),
            allow_single_file = True,
            executable = True,
            cfg = "host",
        ),
        "_fatbinary": attr.label(
            default = Label("@local_config_cuda//cuda:cuda/bin/fatbinary"),
            allow_single_file = True,
            executable = True,
            cfg = "host",
        ),
        "_bin2c": attr.label(
            default = Label("@local_config_cuda//cuda:cuda/bin/bin2c"),
            allow_single_file = True,
            executable = True,
            cfg = "host",
        ),
        "_link_stub": attr.label(
            default = Label("@local_config_cuda//cuda:cuda/bin/crt/link.stub"),
            allow_single_file = True,
        ),
    },
)
"""Links device code and generates source code for kernel registration."""

def _merge_archive_impl(ctx):
    """Merges the archives in srcs into ctx.outputs.out using 'ar -M'.

    Args:
      ctx: Rule context; reads ctx.files.srcs and the C++ toolchain.
    """

    # Generate an mri script to merge the archives in srcs and pass it to 'ar'.
    # See https://stackoverflow.com/a/23621751.
    files = _pic_only(ctx.files.srcs)

    # The script lines are separated by a literal backslash-n; printf turns
    # those into real newlines when the command runs.
    mri_script = "\\n".join(
        ["create " + ctx.outputs.out.path] +
        ["addlib " + f.path for f in files] +
        ["save", "end"],
    )

    cc_toolchain = find_cpp_toolchain(ctx)
    ctx.actions.run_shell(
        inputs = ctx.files.srcs,  # + ctx.files._crosstool,
        outputs = [ctx.outputs.out],
        command = "printf \"%s\" | %s -M" % (mri_script, cc_toolchain.ar_executable),
    )

_merge_archive = rule(
    implementation = _merge_archive_impl,
    attrs = {
        "srcs": attr.label_list(mandatory = True, allow_files = True),
        "_cc_toolchain": attr.label(default = "@bazel_tools//tools/cpp:current_cc_toolchain"),
        # "_crosstool": attr.label_list(cfg = "host", default = ["@bazel_tools//tools/cpp:crosstool"]),
    },
    outputs = {"out": "lib%{name}.a"},
)
"""Merges srcs into a single archive."""

def cuda_rdc_library(name, hdrs = None, copts = None, linkstatic = True, **kwargs):
    """Produces a cuda_library using separate compilation and linking.

    CUDA separate compilation and linking allows device function calls across
    translation units. This is different from the normal whole program
    compilation where each translation unit contains all device code. For more
    background, see
    https://devblogs.nvidia.com/separate-compilation-linking-cuda-device-code/,
    https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#nvcc-options-for-separate-compilation

    During separate compilation, the different CUDA source files are compiled
    to 'relocatable device code' (RDC) and embedded in the host object files.
    When using nvcc, linking the device code for each supported GPU
    architecture and generating kernel registration code for the CUDA runtime
    is handled automatically. Clang supports generating relocatable device
    code, but it can't link it. We therefore rely on tools provided by the CUDA
    SDK to link the device code and generate the host code to register the
    kernels.

    The nvlink tool extracts the RDC code from the object files and links it
    into cubin files, one per GPU architecture. It also produces a header file
    with a list of kernel names to register. The cubins are merged into a
    binary blob using the fatbinary tool, and converted to a C header file with
    the help of the bin2c tool. The registration header file, the fatbinary
    header file, and the link.stub file (shipped with the CUDA SDK) are
    compiled as ordinary host code.

    Here is a diagram of the CUDA separate compilation trajectory:

     x.cu.cc    y.cu.cc
           \\   /            cc_library (compile RDC and archive)
            xy.a
           /    \\           * nvlink
    register.h  xy.cubin
          :      |           * fatbinary and bin2c
          :     xy.fatbin.h
          :      :           * #include
          dlink.cc           * Expanded from crt/link.stub template
             |               cc_library (host compile and archive)
          dlink.a

    The steps marked with '*' are implemented in the _device_link rule.

    The object files in both xy.a and dlink.a reference symbols defined in the
    other archive. The separate archives are a side effect of using two
    cc_library targets to implement a single compilation trajectory. We could
    fix this once bazel supports C++ sandwich. For now, we just merge the two
    archives to avoid unresolved symbols:

                    xy.a     dlink.a
                       \\    /      merge archive
                      xy_dlink.a
                          |         cc_library (or alternatively, cc_import)
                     final target

    Another complication is that cc_library produces (depending on the
    configuration) both PIC and non-PIC archives, but the distinction
    is hidden from Starlark until C++ sandwich becomes available. We work
    around this by dropping the non-PIC files if PIC files are available.

    Args:
      name: Target name.
      hdrs: Header files.
      copts: Compiler options.
      linkstatic: Must be true.
      **kwargs: Any other arguments.
    """

    if not hdrs:
        hdrs = []
    if not copts:
        copts = []

    # Compile host and device code into library.
    lib = name + "_lib"
    native.cc_library(
        name = lib,
        hdrs = hdrs,
        copts = _rdc_copts() + copts,
        linkstatic = linkstatic,
        **kwargs
    )

    # Generate source file containing linked device code.
    dlink_hdrs = name + "_dlink_hdrs"
    dlink_cc = name + "_dlink.cc"
    _device_link(
        name = dlink_hdrs,
        deps = [lib],
        out = dlink_cc,
        gpu_archs = %{gpu_architectures},
        nvlink_args = select({
            "@org_tensorflow//tensorflow:linux_x86_64": ["--cpu-arch=X86_64"],
            "@org_tensorflow//tensorflow:linux_ppc64le": ["--cpu-arch=PPC64LE"],
            "//conditions:default": [],
        }),
    )

    # Compile the source file into a library.
    dlink = name + "_dlink"
    native.cc_library(
        name = dlink,
        srcs = [dlink_cc],
        textual_hdrs = [dlink_hdrs],
        deps = [
            "@local_config_cuda//cuda:cuda_headers",
        ],
        defines = [
            # Silence warning about including internal header.
            "__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__",
            # Macros that need to be defined starting with CUDA 10.
            "__NV_EXTRA_INITIALIZATION=",
            "__NV_EXTRA_FINALIZATION=",
        ],
        linkstatic = linkstatic,
    )

    # Repackage the two libs into a single archive. This is required because
    # both libs reference symbols defined in the other one. For details, see
    # https://eli.thegreenplace.net/2013/07/09/library-order-in-static-linking
    archive = name + "_a"
    _merge_archive(
        name = archive,
        srcs = [lib, dlink],
    )

    # Create cc target from archive.
    native.cc_library(
        name = name,
        srcs = [archive],
        hdrs = hdrs,
        linkstatic = linkstatic,
    )