github.com/johnnyeven/libtools@v0.0.0-20191126065708-61829c1adf46/third_party/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp (about)

     1  //===- ConvertKernelFuncToCubin.cpp - MLIR GPU lowering passes ------------===//
     2  //
     3  // Copyright 2019 The MLIR Authors.
     4  //
     5  // Licensed under the Apache License, Version 2.0 (the "License");
     6  // you may not use this file except in compliance with the License.
     7  // You may obtain a copy of the License at
     8  //
     9  //   http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  // =============================================================================
    17  //
    18  // This file implements a pass to convert gpu kernel functions into a
    19  // corresponding binary blob that can be executed on a CUDA GPU. Currently
    20  // only translates the function itself but no dependencies.
    21  //
    22  //===----------------------------------------------------------------------===//
    23  
    24  #include "mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h"
    25  
    26  #include "mlir/Dialect/GPU/GPUDialect.h"
    27  #include "mlir/IR/Attributes.h"
    28  #include "mlir/IR/Builders.h"
    29  #include "mlir/IR/Function.h"
    30  #include "mlir/IR/Module.h"
    31  #include "mlir/Pass/Pass.h"
    32  #include "mlir/Pass/PassRegistry.h"
    33  #include "mlir/Support/LogicalResult.h"
    34  #include "mlir/Target/NVVMIR.h"
    35  
    36  #include "llvm/ADT/Optional.h"
    37  #include "llvm/ADT/Twine.h"
    38  #include "llvm/IR/Constants.h"
    39  #include "llvm/IR/LegacyPassManager.h"
    40  #include "llvm/IR/Module.h"
    41  #include "llvm/Support/Error.h"
    42  #include "llvm/Support/TargetRegistry.h"
    43  #include "llvm/Support/TargetSelect.h"
    44  #include "llvm/Target/TargetMachine.h"
    45  
    46  using namespace mlir;
    47  
    48  namespace {
    49  // TODO(herhut): Move to shared location.
    50  static constexpr const char *kCubinAnnotation = "nvvm.cubin";
    51  
    52  /// A pass converting tagged kernel functions to cubin blobs.
    53  class GpuKernelToCubinPass : public ModulePass<GpuKernelToCubinPass> {
    54  public:
    55    GpuKernelToCubinPass(
    56        CubinGenerator cubinGenerator = compilePtxToCubinForTesting)
    57        : cubinGenerator(cubinGenerator) {}
    58  
    59    // Run the dialect converter on the module.
    60    void runOnModule() override {
    61      // Make sure the NVPTX target is initialized.
    62      LLVMInitializeNVPTXTarget();
    63      LLVMInitializeNVPTXTargetInfo();
    64      LLVMInitializeNVPTXTargetMC();
    65      LLVMInitializeNVPTXAsmPrinter();
    66  
    67      for (auto function : getModule().getOps<FuncOp>()) {
    68        if (!gpu::GPUDialect::isKernel(function) || function.isExternal()) {
    69          continue;
    70        }
    71        if (failed(translateGpuKernelToCubinAnnotation(function)))
    72          signalPassFailure();
    73      }
    74    }
    75  
    76  private:
    77    static OwnedCubin compilePtxToCubinForTesting(const std::string &ptx,
    78                                                  FuncOp &function);
    79  
    80    std::string translateModuleToPtx(llvm::Module &module,
    81                                     llvm::TargetMachine &target_machine);
    82    OwnedCubin convertModuleToCubin(llvm::Module &llvmModule, FuncOp &function);
    83    LogicalResult translateGpuKernelToCubinAnnotation(FuncOp &function);
    84  
    85    CubinGenerator cubinGenerator;
    86  };
    87  
    88  } // anonymous namespace
    89  
    90  std::string GpuKernelToCubinPass::translateModuleToPtx(
    91      llvm::Module &module, llvm::TargetMachine &target_machine) {
    92    std::string ptx;
    93    {
    94      llvm::raw_string_ostream stream(ptx);
    95      llvm::buffer_ostream pstream(stream);
    96      llvm::legacy::PassManager codegen_passes;
    97      target_machine.addPassesToEmitFile(codegen_passes, pstream, nullptr,
    98                                         llvm::TargetMachine::CGFT_AssemblyFile);
    99      codegen_passes.run(module);
   100    }
   101  
   102    return ptx;
   103  }
   104  
   105  OwnedCubin
   106  GpuKernelToCubinPass::compilePtxToCubinForTesting(const std::string &ptx,
   107                                                    FuncOp &function) {
   108    const char data[] = "CUBIN";
   109    return std::make_unique<std::vector<char>>(data, data + sizeof(data) - 1);
   110  }
   111  
   112  OwnedCubin GpuKernelToCubinPass::convertModuleToCubin(llvm::Module &llvmModule,
   113                                                        FuncOp &function) {
   114    std::unique_ptr<llvm::TargetMachine> targetMachine;
   115    {
   116      std::string error;
   117      // TODO(herhut): Make triple configurable.
   118      constexpr const char *cudaTriple = "nvptx64-nvidia-cuda";
   119      llvm::Triple triple(cudaTriple);
   120      const llvm::Target *target =
   121          llvm::TargetRegistry::lookupTarget("", triple, error);
   122      if (target == nullptr) {
   123        function.emitError("Cannot initialize target triple");
   124        return {};
   125      }
   126      targetMachine.reset(
   127          target->createTargetMachine(triple.str(), "sm_35", "+ptx60", {}, {}));
   128    }
   129  
   130    // Set the data layout of the llvm module to match what the ptx target needs.
   131    llvmModule.setDataLayout(targetMachine->createDataLayout());
   132  
   133    auto ptx = translateModuleToPtx(llvmModule, *targetMachine);
   134  
   135    return cubinGenerator(ptx, function);
   136  }
   137  
   138  LogicalResult
   139  GpuKernelToCubinPass::translateGpuKernelToCubinAnnotation(FuncOp &function) {
   140    Builder builder(function.getContext());
   141  
   142    OwningModuleRef module = ModuleOp::create(function.getLoc());
   143  
   144    // TODO(herhut): Also handle called functions.
   145    module->push_back(function.clone());
   146  
   147    auto llvmModule = translateModuleToNVVMIR(*module);
   148    auto cubin = convertModuleToCubin(*llvmModule, function);
   149  
   150    if (!cubin) {
   151      return function.emitError("Translation to CUDA binary failed.");
   152    }
   153  
   154    function.setAttr(kCubinAnnotation,
   155                     builder.getStringAttr({cubin->data(), cubin->size()}));
   156  
   157    // Remove the body of the kernel function now that it has been translated.
   158    // The main reason to do this is so that the resulting module no longer
   159    // contains the NVVM instructions (typically contained in the kernel bodies)
   160    // and hence can be compiled into host code by a separate pass.
   161    function.eraseBody();
   162  
   163    return success();
   164  }
   165  
   166  std::unique_ptr<ModulePassBase>
   167  mlir::createConvertGPUKernelToCubinPass(CubinGenerator cubinGenerator) {
   168    return std::make_unique<GpuKernelToCubinPass>(cubinGenerator);
   169  }
   170  
// Register the pass under a test-only flag so it can be exercised directly
// from mlir-opt.
static PassRegistration<GpuKernelToCubinPass>
    pass("test-kernel-to-cubin",
         "Convert all kernel functions to CUDA cubin blobs");