// Source: github.com/johnnyeven/libtools@v0.0.0-20191126065708-61829c1adf46/third_party/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp
//===- ConvertKernelFuncToCubin.cpp - MLIR GPU lowering passes ------------===//
//
// Copyright 2019 The MLIR Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
//
// This file implements a pass to convert gpu kernel functions into a
// corresponding binary blob that can be executed on a CUDA GPU. Currently
// only translates the function itself but no dependencies.
21 // 22 //===----------------------------------------------------------------------===// 23 24 #include "mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h" 25 26 #include "mlir/Dialect/GPU/GPUDialect.h" 27 #include "mlir/IR/Attributes.h" 28 #include "mlir/IR/Builders.h" 29 #include "mlir/IR/Function.h" 30 #include "mlir/IR/Module.h" 31 #include "mlir/Pass/Pass.h" 32 #include "mlir/Pass/PassRegistry.h" 33 #include "mlir/Support/LogicalResult.h" 34 #include "mlir/Target/NVVMIR.h" 35 36 #include "llvm/ADT/Optional.h" 37 #include "llvm/ADT/Twine.h" 38 #include "llvm/IR/Constants.h" 39 #include "llvm/IR/LegacyPassManager.h" 40 #include "llvm/IR/Module.h" 41 #include "llvm/Support/Error.h" 42 #include "llvm/Support/TargetRegistry.h" 43 #include "llvm/Support/TargetSelect.h" 44 #include "llvm/Target/TargetMachine.h" 45 46 using namespace mlir; 47 48 namespace { 49 // TODO(herhut): Move to shared location. 50 static constexpr const char *kCubinAnnotation = "nvvm.cubin"; 51 52 /// A pass converting tagged kernel functions to cubin blobs. 53 class GpuKernelToCubinPass : public ModulePass<GpuKernelToCubinPass> { 54 public: 55 GpuKernelToCubinPass( 56 CubinGenerator cubinGenerator = compilePtxToCubinForTesting) 57 : cubinGenerator(cubinGenerator) {} 58 59 // Run the dialect converter on the module. 60 void runOnModule() override { 61 // Make sure the NVPTX target is initialized. 
62 LLVMInitializeNVPTXTarget(); 63 LLVMInitializeNVPTXTargetInfo(); 64 LLVMInitializeNVPTXTargetMC(); 65 LLVMInitializeNVPTXAsmPrinter(); 66 67 for (auto function : getModule().getOps<FuncOp>()) { 68 if (!gpu::GPUDialect::isKernel(function) || function.isExternal()) { 69 continue; 70 } 71 if (failed(translateGpuKernelToCubinAnnotation(function))) 72 signalPassFailure(); 73 } 74 } 75 76 private: 77 static OwnedCubin compilePtxToCubinForTesting(const std::string &ptx, 78 FuncOp &function); 79 80 std::string translateModuleToPtx(llvm::Module &module, 81 llvm::TargetMachine &target_machine); 82 OwnedCubin convertModuleToCubin(llvm::Module &llvmModule, FuncOp &function); 83 LogicalResult translateGpuKernelToCubinAnnotation(FuncOp &function); 84 85 CubinGenerator cubinGenerator; 86 }; 87 88 } // anonymous namespace 89 90 std::string GpuKernelToCubinPass::translateModuleToPtx( 91 llvm::Module &module, llvm::TargetMachine &target_machine) { 92 std::string ptx; 93 { 94 llvm::raw_string_ostream stream(ptx); 95 llvm::buffer_ostream pstream(stream); 96 llvm::legacy::PassManager codegen_passes; 97 target_machine.addPassesToEmitFile(codegen_passes, pstream, nullptr, 98 llvm::TargetMachine::CGFT_AssemblyFile); 99 codegen_passes.run(module); 100 } 101 102 return ptx; 103 } 104 105 OwnedCubin 106 GpuKernelToCubinPass::compilePtxToCubinForTesting(const std::string &ptx, 107 FuncOp &function) { 108 const char data[] = "CUBIN"; 109 return std::make_unique<std::vector<char>>(data, data + sizeof(data) - 1); 110 } 111 112 OwnedCubin GpuKernelToCubinPass::convertModuleToCubin(llvm::Module &llvmModule, 113 FuncOp &function) { 114 std::unique_ptr<llvm::TargetMachine> targetMachine; 115 { 116 std::string error; 117 // TODO(herhut): Make triple configurable. 
118 constexpr const char *cudaTriple = "nvptx64-nvidia-cuda"; 119 llvm::Triple triple(cudaTriple); 120 const llvm::Target *target = 121 llvm::TargetRegistry::lookupTarget("", triple, error); 122 if (target == nullptr) { 123 function.emitError("Cannot initialize target triple"); 124 return {}; 125 } 126 targetMachine.reset( 127 target->createTargetMachine(triple.str(), "sm_35", "+ptx60", {}, {})); 128 } 129 130 // Set the data layout of the llvm module to match what the ptx target needs. 131 llvmModule.setDataLayout(targetMachine->createDataLayout()); 132 133 auto ptx = translateModuleToPtx(llvmModule, *targetMachine); 134 135 return cubinGenerator(ptx, function); 136 } 137 138 LogicalResult 139 GpuKernelToCubinPass::translateGpuKernelToCubinAnnotation(FuncOp &function) { 140 Builder builder(function.getContext()); 141 142 OwningModuleRef module = ModuleOp::create(function.getLoc()); 143 144 // TODO(herhut): Also handle called functions. 145 module->push_back(function.clone()); 146 147 auto llvmModule = translateModuleToNVVMIR(*module); 148 auto cubin = convertModuleToCubin(*llvmModule, function); 149 150 if (!cubin) { 151 return function.emitError("Translation to CUDA binary failed."); 152 } 153 154 function.setAttr(kCubinAnnotation, 155 builder.getStringAttr({cubin->data(), cubin->size()})); 156 157 // Remove the body of the kernel function now that it has been translated. 158 // The main reason to do this is so that the resulting module no longer 159 // contains the NVVM instructions (typically contained in the kernel bodies) 160 // and hence can be compiled into host code by a separate pass. 
161 function.eraseBody(); 162 163 return success(); 164 } 165 166 std::unique_ptr<ModulePassBase> 167 mlir::createConvertGPUKernelToCubinPass(CubinGenerator cubinGenerator) { 168 return std::make_unique<GpuKernelToCubinPass>(cubinGenerator); 169 } 170 171 static PassRegistration<GpuKernelToCubinPass> 172 pass("test-kernel-to-cubin", 173 "Convert all kernel functions to CUDA cubin blobs");