github.com/johnnyeven/libtools@v0.0.0-20191126065708-61829c1adf46/third_party/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp (about)

     1  //===- ConvertLaunchFuncToCudaCalls.cpp - MLIR CUDA lowering passes -------===//
     2  //
     3  // Copyright 2019 The MLIR Authors.
     4  //
     5  // Licensed under the Apache License, Version 2.0 (the "License");
     6  // you may not use this file except in compliance with the License.
     7  // You may obtain a copy of the License at
     8  //
     9  //   http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  // =============================================================================
    17  //
    18  // This file implements a pass to convert gpu.launch_func op into a sequence of
    19  // CUDA runtime calls. As the CUDA runtime does not have a stable published ABI,
    20  // this pass uses a slim runtime layer that builds on top of the public API from
    21  // the CUDA headers.
    22  //
    23  //===----------------------------------------------------------------------===//
    24  
    25  #include "mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h"
    26  
    27  #include "mlir/Dialect/GPU/GPUDialect.h"
    28  #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
    29  #include "mlir/IR/Attributes.h"
    30  #include "mlir/IR/Builders.h"
    31  #include "mlir/IR/Function.h"
    32  #include "mlir/IR/Module.h"
    33  #include "mlir/IR/StandardTypes.h"
    34  #include "mlir/Pass/Pass.h"
    35  
    36  #include "llvm/ADT/STLExtras.h"
    37  #include "llvm/IR/DataLayout.h"
    38  #include "llvm/IR/DerivedTypes.h"
    39  #include "llvm/IR/Module.h"
    40  #include "llvm/IR/Type.h"
    41  #include "llvm/Support/Error.h"
    42  #include "llvm/Support/FormatVariadic.h"
    43  
    44  using namespace mlir;
    45  
    // Symbol names of the wrapper functions used as call targets. They are
    // defined in the mini-runtime file, which avoids C++ name mangling.
    static constexpr auto cuModuleLoadName = "mcuModuleLoad";
    static constexpr auto cuModuleGetFunctionName = "mcuModuleGetFunction";
    static constexpr auto cuLaunchKernelName = "mcuLaunchKernel";
    static constexpr auto cuGetStreamHelperName = "mcuGetStreamHelper";
    static constexpr auto cuStreamSynchronizeName = "mcuStreamSynchronize";

    // Name of the kernel function attribute that references the cubin getter.
    static constexpr auto kCubinGetterAnnotation = "nvvm.cubingetter";
    54  
    55  namespace {
    56  
    57  /// A pass to convert gpu.launch_func operations into a sequence of CUDA
    58  /// runtime calls.
    59  ///
    60  /// In essence, a gpu.launch_func operations gets compiled into the following
    61  /// sequence of runtime calls:
    62  ///
    63  /// * mcuModuleLoad        -- loads the module given the cubin data
    64  /// * mcuModuleGetFunction -- gets a handle to the actual kernel function
    65  /// * mcuGetStreamHelper   -- initializes a new CUDA stream
    66  /// * mcuLaunchKernelName  -- launches the kernel on a stream
    67  /// * mcuStreamSynchronize -- waits for operations on the stream to finish
    68  ///
    69  /// Intermediate data structures are allocated on the stack.
    70  class GpuLaunchFuncToCudaCallsPass
    71      : public ModulePass<GpuLaunchFuncToCudaCallsPass> {
    72  private:
    73    LLVM::LLVMDialect *getLLVMDialect() { return llvmDialect; }
    74  
    75    llvm::LLVMContext &getLLVMContext() {
    76      return getLLVMDialect()->getLLVMContext();
    77    }
    78  
    79    void initializeCachedTypes() {
    80      const llvm::Module &module = llvmDialect->getLLVMModule();
    81      llvmPointerType = LLVM::LLVMType::getInt8PtrTy(llvmDialect);
    82      llvmPointerPointerType = llvmPointerType.getPointerTo();
    83      llvmInt8Type = LLVM::LLVMType::getInt8Ty(llvmDialect);
    84      llvmInt32Type = LLVM::LLVMType::getInt32Ty(llvmDialect);
    85      llvmInt64Type = LLVM::LLVMType::getInt64Ty(llvmDialect);
    86      llvmIntPtrType = LLVM::LLVMType::getIntNTy(
    87          llvmDialect, module.getDataLayout().getPointerSizeInBits());
    88    }
    89  
    90    LLVM::LLVMType getPointerType() { return llvmPointerType; }
    91  
    92    LLVM::LLVMType getPointerPointerType() { return llvmPointerPointerType; }
    93  
    94    LLVM::LLVMType getInt8Type() { return llvmInt8Type; }
    95  
    96    LLVM::LLVMType getInt32Type() { return llvmInt32Type; }
    97  
    98    LLVM::LLVMType getInt64Type() { return llvmInt64Type; }
    99  
   100    LLVM::LLVMType getIntPtrType() {
   101      const llvm::Module &module = getLLVMDialect()->getLLVMModule();
   102      return LLVM::LLVMType::getIntNTy(
   103          getLLVMDialect(), module.getDataLayout().getPointerSizeInBits());
   104    }
   105  
   106    LLVM::LLVMType getCUResultType() {
   107      // This is declared as an enum in CUDA but helpers use i32.
   108      return getInt32Type();
   109    }
   110  
   111    // Allocate a void pointer on the stack.
   112    Value *allocatePointer(OpBuilder &builder, Location loc) {
   113      auto one = builder.create<LLVM::ConstantOp>(loc, getInt32Type(),
   114                                                  builder.getI32IntegerAttr(1));
   115      return builder.create<LLVM::AllocaOp>(loc, getPointerPointerType(), one,
   116                                            /*alignment=*/0);
   117    }
   118  
   119    void declareCudaFunctions(Location loc);
   120    Value *setupParamsArray(gpu::LaunchFuncOp launchOp, OpBuilder &builder);
   121    Value *generateKernelNameConstant(FuncOp kernelFunction, Location &loc,
   122                                      OpBuilder &builder);
   123    void translateGpuLaunchCalls(mlir::gpu::LaunchFuncOp launchOp);
   124  
   125  public:
   126    // Run the dialect converter on the module.
   127    void runOnModule() override {
   128      // Cache the LLVMDialect for the current module.
   129      llvmDialect = getContext().getRegisteredDialect<LLVM::LLVMDialect>();
   130      // Cache the used LLVM types.
   131      initializeCachedTypes();
   132  
   133      for (auto func : getModule().getOps<FuncOp>()) {
   134        func.walk(
   135            [this](mlir::gpu::LaunchFuncOp op) { translateGpuLaunchCalls(op); });
   136      }
   137    }
   138  
   139  private:
   140    LLVM::LLVMDialect *llvmDialect;
   141    LLVM::LLVMType llvmPointerType;
   142    LLVM::LLVMType llvmPointerPointerType;
   143    LLVM::LLVMType llvmInt8Type;
   144    LLVM::LLVMType llvmInt32Type;
   145    LLVM::LLVMType llvmInt64Type;
   146    LLVM::LLVMType llvmIntPtrType;
   147  };
   148  
   149  } // anonymous namespace
   150  
   151  // Adds declarations for the needed helper functions from the CUDA wrapper.
   152  // The types in comments give the actual types expected/returned but the API
   153  // uses void pointers. This is fine as they have the same linkage in C.
   154  void GpuLaunchFuncToCudaCallsPass::declareCudaFunctions(Location loc) {
   155    ModuleOp module = getModule();
   156    Builder builder(module);
   157    if (!module.lookupSymbol<FuncOp>(cuModuleLoadName)) {
   158      module.push_back(
   159          FuncOp::create(loc, cuModuleLoadName,
   160                         builder.getFunctionType(
   161                             {
   162                                 getPointerPointerType(), /* CUmodule *module */
   163                                 getPointerType()         /* void *cubin */
   164                             },
   165                             getCUResultType())));
   166    }
   167    if (!module.lookupSymbol<FuncOp>(cuModuleGetFunctionName)) {
   168      // The helper uses void* instead of CUDA's opaque CUmodule and
   169      // CUfunction.
   170      module.push_back(
   171          FuncOp::create(loc, cuModuleGetFunctionName,
   172                         builder.getFunctionType(
   173                             {
   174                                 getPointerPointerType(), /* void **function */
   175                                 getPointerType(),        /* void *module */
   176                                 getPointerType()         /* char *name */
   177                             },
   178                             getCUResultType())));
   179    }
   180    if (!module.lookupSymbol<FuncOp>(cuLaunchKernelName)) {
   181      // Other than the CUDA api, the wrappers use uintptr_t to match the
   182      // LLVM type if MLIR's index type, which the GPU dialect uses.
   183      // Furthermore, they use void* instead of CUDA's opaque CUfunction and
   184      // CUstream.
   185      module.push_back(FuncOp::create(
   186          loc, cuLaunchKernelName,
   187          builder.getFunctionType(
   188              {
   189                  getPointerType(),        /* void* f */
   190                  getIntPtrType(),         /* intptr_t gridXDim */
   191                  getIntPtrType(),         /* intptr_t gridyDim */
   192                  getIntPtrType(),         /* intptr_t gridZDim */
   193                  getIntPtrType(),         /* intptr_t blockXDim */
   194                  getIntPtrType(),         /* intptr_t blockYDim */
   195                  getIntPtrType(),         /* intptr_t blockZDim */
   196                  getInt32Type(),          /* unsigned int sharedMemBytes */
   197                  getPointerType(),        /* void *hstream */
   198                  getPointerPointerType(), /* void **kernelParams */
   199                  getPointerPointerType()  /* void **extra */
   200              },
   201              getCUResultType())));
   202    }
   203    if (!module.lookupSymbol<FuncOp>(cuGetStreamHelperName)) {
   204      // Helper function to get the current CUDA stream. Uses void* instead of
   205      // CUDAs opaque CUstream.
   206      module.push_back(FuncOp::create(
   207          loc, cuGetStreamHelperName,
   208          builder.getFunctionType({}, getPointerType() /* void *stream */)));
   209    }
   210    if (!module.lookupSymbol<FuncOp>(cuStreamSynchronizeName)) {
   211      module.push_back(
   212          FuncOp::create(loc, cuStreamSynchronizeName,
   213                         builder.getFunctionType(
   214                             {
   215                                 getPointerType() /* CUstream stream */
   216                             },
   217                             getCUResultType())));
   218    }
   219  }
   220  
   221  // Generates a parameters array to be used with a CUDA kernel launch call. The
   222  // arguments are extracted from the launchOp.
   223  // The generated code is essentially as follows:
   224  //
   225  // %array = alloca(numparams * sizeof(void *))
   226  // for (i : [0, NumKernelOperands))
   227  //   %array[i] = cast<void*>(KernelOperand[i])
   228  // return %array
   229  Value *
   230  GpuLaunchFuncToCudaCallsPass::setupParamsArray(gpu::LaunchFuncOp launchOp,
   231                                                 OpBuilder &builder) {
   232    Location loc = launchOp.getLoc();
   233    auto one = builder.create<LLVM::ConstantOp>(loc, getInt32Type(),
   234                                                builder.getI32IntegerAttr(1));
   235    auto arraySize = builder.create<LLVM::ConstantOp>(
   236        loc, getInt32Type(),
   237        builder.getI32IntegerAttr(launchOp.getNumKernelOperands()));
   238    auto array = builder.create<LLVM::AllocaOp>(loc, getPointerPointerType(),
   239                                                arraySize, /*alignment=*/0);
   240    for (int idx = 0, e = launchOp.getNumKernelOperands(); idx < e; ++idx) {
   241      auto operand = launchOp.getKernelOperand(idx);
   242      auto llvmType = operand->getType().cast<LLVM::LLVMType>();
   243      auto memLocation = builder.create<LLVM::AllocaOp>(
   244          loc, llvmType.getPointerTo(), one, /*alignment=*/1);
   245      builder.create<LLVM::StoreOp>(loc, operand, memLocation);
   246      auto casted =
   247          builder.create<LLVM::BitcastOp>(loc, getPointerType(), memLocation);
   248      auto index = builder.create<LLVM::ConstantOp>(
   249          loc, getInt32Type(), builder.getI32IntegerAttr(idx));
   250      auto gep = builder.create<LLVM::GEPOp>(loc, getPointerPointerType(), array,
   251                                             ArrayRef<Value *>{index});
   252      builder.create<LLVM::StoreOp>(loc, casted, gep);
   253    }
   254    return array;
   255  }
   256  
   257  // Generates an LLVM IR dialect global that contains the name of the given
   258  // kernel function as a C string, and returns a pointer to its beginning.
   259  // The code is essentially:
   260  //
   261  // llvm.global constant @kernel_name("function_name\00")
   262  // func(...) {
   263  //   %0 = llvm.addressof @kernel_name
   264  //   %1 = llvm.constant (0 : index)
   265  //   %2 = llvm.getelementptr %0[%1, %1] : !llvm<"i8*">
   266  // }
   267  Value *GpuLaunchFuncToCudaCallsPass::generateKernelNameConstant(
   268      FuncOp kernelFunction, Location &loc, OpBuilder &builder) {
   269    // Make sure the trailing zero is included in the constant.
   270    std::vector<char> kernelName(kernelFunction.getName().begin(),
   271                                 kernelFunction.getName().end());
   272    kernelName.push_back('\0');
   273  
   274    std::string globalName =
   275        llvm::formatv("{0}_kernel_name", kernelFunction.getName());
   276    return LLVM::createGlobalString(
   277        loc, builder, globalName, StringRef(kernelName.data(), kernelName.size()),
   278        llvmDialect);
   279  }
   280  
   281  // Emits LLVM IR to launch a kernel function. Expects the module that contains
   282  // the compiled kernel function as a cubin in the 'nvvm.cubin' attribute of the
   283  // kernel function in the IR.
   284  // While MLIR has no global constants, also expects a cubin getter function in
   285  // an 'nvvm.cubingetter' attribute. Such function is expected to return a
   286  // pointer to the cubin blob when invoked.
   287  // With these given, the generated code in essence is
   288  //
   289  // %0 = call %cubingetter
   290  // %1 = alloca sizeof(void*)
   291  // call %mcuModuleLoad(%2, %1)
   292  // %2 = alloca sizeof(void*)
   293  // %3 = load %1
   294  // %4 = <see generateKernelNameConstant>
   295  // call %mcuModuleGetFunction(%2, %3, %4)
   296  // %5 = call %mcuGetStreamHelper()
   297  // %6 = load %2
   298  // %7 = <see setupParamsArray>
   299  // call %mcuLaunchKernel(%6, <launchOp operands 0..5>, 0, %5, %7, nullptr)
   300  // call %mcuStreamSynchronize(%5)
   301  void GpuLaunchFuncToCudaCallsPass::translateGpuLaunchCalls(
   302      mlir::gpu::LaunchFuncOp launchOp) {
   303    OpBuilder builder(launchOp);
   304    Location loc = launchOp.getLoc();
   305    declareCudaFunctions(loc);
   306  
   307    auto zero = builder.create<LLVM::ConstantOp>(loc, getInt32Type(),
   308                                                 builder.getI32IntegerAttr(0));
   309    // Emit a call to the cubin getter to retrieve a pointer to the data that
   310    // represents the cubin at runtime.
   311    // TODO(herhut): This should rather be a static global once supported.
   312    auto kernelFunction = getModule().lookupSymbol<FuncOp>(launchOp.kernel());
   313    auto cubinGetter =
   314        kernelFunction.getAttrOfType<SymbolRefAttr>(kCubinGetterAnnotation);
   315    if (!cubinGetter) {
   316      kernelFunction.emitError("Missing ")
   317          << kCubinGetterAnnotation << " attribute.";
   318      return signalPassFailure();
   319    }
   320    auto data = builder.create<LLVM::CallOp>(
   321        loc, ArrayRef<Type>{getPointerType()}, cubinGetter, ArrayRef<Value *>{});
   322    // Emit the load module call to load the module data. Error checking is done
   323    // in the called helper function.
   324    auto cuModule = allocatePointer(builder, loc);
   325    FuncOp cuModuleLoad = getModule().lookupSymbol<FuncOp>(cuModuleLoadName);
   326    builder.create<LLVM::CallOp>(loc, ArrayRef<Type>{getCUResultType()},
   327                                 builder.getSymbolRefAttr(cuModuleLoad),
   328                                 ArrayRef<Value *>{cuModule, data.getResult(0)});
   329    // Get the function from the module. The name corresponds to the name of
   330    // the kernel function.
   331    auto cuOwningModuleRef =
   332        builder.create<LLVM::LoadOp>(loc, getPointerType(), cuModule);
   333    auto kernelName = generateKernelNameConstant(kernelFunction, loc, builder);
   334    auto cuFunction = allocatePointer(builder, loc);
   335    FuncOp cuModuleGetFunction =
   336        getModule().lookupSymbol<FuncOp>(cuModuleGetFunctionName);
   337    builder.create<LLVM::CallOp>(
   338        loc, ArrayRef<Type>{getCUResultType()},
   339        builder.getSymbolRefAttr(cuModuleGetFunction),
   340        ArrayRef<Value *>{cuFunction, cuOwningModuleRef, kernelName});
   341    // Grab the global stream needed for execution.
   342    FuncOp cuGetStreamHelper =
   343        getModule().lookupSymbol<FuncOp>(cuGetStreamHelperName);
   344    auto cuStream = builder.create<LLVM::CallOp>(
   345        loc, ArrayRef<Type>{getPointerType()},
   346        builder.getSymbolRefAttr(cuGetStreamHelper), ArrayRef<Value *>{});
   347    // Invoke the function with required arguments.
   348    auto cuLaunchKernel = getModule().lookupSymbol<FuncOp>(cuLaunchKernelName);
   349    auto cuFunctionRef =
   350        builder.create<LLVM::LoadOp>(loc, getPointerType(), cuFunction);
   351    auto paramsArray = setupParamsArray(launchOp, builder);
   352    auto nullpointer =
   353        builder.create<LLVM::IntToPtrOp>(loc, getPointerPointerType(), zero);
   354    builder.create<LLVM::CallOp>(
   355        loc, ArrayRef<Type>{getCUResultType()},
   356        builder.getSymbolRefAttr(cuLaunchKernel),
   357        ArrayRef<Value *>{cuFunctionRef, launchOp.getOperand(0),
   358                          launchOp.getOperand(1), launchOp.getOperand(2),
   359                          launchOp.getOperand(3), launchOp.getOperand(4),
   360                          launchOp.getOperand(5), zero, /* sharedMemBytes */
   361                          cuStream.getResult(0),        /* stream */
   362                          paramsArray,                  /* kernel params */
   363                          nullpointer /* extra */});
   364    // Sync on the stream to make it synchronous.
   365    auto cuStreamSync = getModule().lookupSymbol<FuncOp>(cuStreamSynchronizeName);
   366    builder.create<LLVM::CallOp>(loc, ArrayRef<Type>{getCUResultType()},
   367                                 builder.getSymbolRefAttr(cuStreamSync),
   368                                 ArrayRef<Value *>(cuStream.getResult(0)));
   369    launchOp.erase();
   370  }
   371  
   372  std::unique_ptr<mlir::ModulePassBase>
   373  mlir::createConvertGpuLaunchFuncToCudaCallsPass() {
   374    return std::make_unique<GpuLaunchFuncToCudaCallsPass>();
   375  }
   376  
   377  static PassRegistration<GpuLaunchFuncToCudaCallsPass>
   378      pass("launch-func-to-cuda",
   379           "Convert all launch_func ops to CUDA runtime calls");