github.com/johnnyeven/libtools@v0.0.0-20191126065708-61829c1adf46/third_party/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp (about) 1 //===- ConvertLaunchFuncToCudaCalls.cpp - MLIR CUDA lowering passes -------===// 2 // 3 // Copyright 2019 The MLIR Authors. 4 // 5 // Licensed under the Apache License, Version 2.0 (the "License"); 6 // you may not use this file except in compliance with the License. 7 // You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 // ============================================================================= 17 // 18 // This file implements a pass to convert gpu.launch_func op into a sequence of 19 // CUDA runtime calls. As the CUDA runtime does not have a stable published ABI, 20 // this pass uses a slim runtime layer that builds on top of the public API from 21 // the CUDA headers. 
//
//===----------------------------------------------------------------------===//

#include "mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h"

#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/IR/Attributes.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/Function.h"
#include "mlir/IR/Module.h"
#include "mlir/IR/StandardTypes.h"
#include "mlir/Pass/Pass.h"

#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FormatVariadic.h"

using namespace mlir;

// Symbol names of the runtime wrapper functions that the generated code calls.
// To avoid name mangling, these are defined in the mini-runtime file.
static constexpr const char *cuModuleLoadName = "mcuModuleLoad";
static constexpr const char *cuModuleGetFunctionName = "mcuModuleGetFunction";
static constexpr const char *cuLaunchKernelName = "mcuLaunchKernel";
static constexpr const char *cuGetStreamHelperName = "mcuGetStreamHelper";
static constexpr const char *cuStreamSynchronizeName = "mcuStreamSynchronize";

// Name of the attribute on a kernel function that holds a symbol reference to
// the function returning the kernel's cubin data.
static constexpr const char *kCubinGetterAnnotation = "nvvm.cubingetter";

namespace {

/// A pass to convert gpu.launch_func operations into a sequence of CUDA
/// runtime calls.
///
/// In essence, a gpu.launch_func operation gets compiled into the following
/// sequence of runtime calls:
///
/// * mcuModuleLoad -- loads the module given the cubin data
/// * mcuModuleGetFunction -- gets a handle to the actual kernel function
/// * mcuGetStreamHelper -- initializes a new CUDA stream
/// * mcuLaunchKernel -- launches the kernel on a stream
/// * mcuStreamSynchronize -- waits for operations on the stream to finish
///
/// Intermediate data structures are allocated on the stack.
70 class GpuLaunchFuncToCudaCallsPass 71 : public ModulePass<GpuLaunchFuncToCudaCallsPass> { 72 private: 73 LLVM::LLVMDialect *getLLVMDialect() { return llvmDialect; } 74 75 llvm::LLVMContext &getLLVMContext() { 76 return getLLVMDialect()->getLLVMContext(); 77 } 78 79 void initializeCachedTypes() { 80 const llvm::Module &module = llvmDialect->getLLVMModule(); 81 llvmPointerType = LLVM::LLVMType::getInt8PtrTy(llvmDialect); 82 llvmPointerPointerType = llvmPointerType.getPointerTo(); 83 llvmInt8Type = LLVM::LLVMType::getInt8Ty(llvmDialect); 84 llvmInt32Type = LLVM::LLVMType::getInt32Ty(llvmDialect); 85 llvmInt64Type = LLVM::LLVMType::getInt64Ty(llvmDialect); 86 llvmIntPtrType = LLVM::LLVMType::getIntNTy( 87 llvmDialect, module.getDataLayout().getPointerSizeInBits()); 88 } 89 90 LLVM::LLVMType getPointerType() { return llvmPointerType; } 91 92 LLVM::LLVMType getPointerPointerType() { return llvmPointerPointerType; } 93 94 LLVM::LLVMType getInt8Type() { return llvmInt8Type; } 95 96 LLVM::LLVMType getInt32Type() { return llvmInt32Type; } 97 98 LLVM::LLVMType getInt64Type() { return llvmInt64Type; } 99 100 LLVM::LLVMType getIntPtrType() { 101 const llvm::Module &module = getLLVMDialect()->getLLVMModule(); 102 return LLVM::LLVMType::getIntNTy( 103 getLLVMDialect(), module.getDataLayout().getPointerSizeInBits()); 104 } 105 106 LLVM::LLVMType getCUResultType() { 107 // This is declared as an enum in CUDA but helpers use i32. 108 return getInt32Type(); 109 } 110 111 // Allocate a void pointer on the stack. 
112 Value *allocatePointer(OpBuilder &builder, Location loc) { 113 auto one = builder.create<LLVM::ConstantOp>(loc, getInt32Type(), 114 builder.getI32IntegerAttr(1)); 115 return builder.create<LLVM::AllocaOp>(loc, getPointerPointerType(), one, 116 /*alignment=*/0); 117 } 118 119 void declareCudaFunctions(Location loc); 120 Value *setupParamsArray(gpu::LaunchFuncOp launchOp, OpBuilder &builder); 121 Value *generateKernelNameConstant(FuncOp kernelFunction, Location &loc, 122 OpBuilder &builder); 123 void translateGpuLaunchCalls(mlir::gpu::LaunchFuncOp launchOp); 124 125 public: 126 // Run the dialect converter on the module. 127 void runOnModule() override { 128 // Cache the LLVMDialect for the current module. 129 llvmDialect = getContext().getRegisteredDialect<LLVM::LLVMDialect>(); 130 // Cache the used LLVM types. 131 initializeCachedTypes(); 132 133 for (auto func : getModule().getOps<FuncOp>()) { 134 func.walk( 135 [this](mlir::gpu::LaunchFuncOp op) { translateGpuLaunchCalls(op); }); 136 } 137 } 138 139 private: 140 LLVM::LLVMDialect *llvmDialect; 141 LLVM::LLVMType llvmPointerType; 142 LLVM::LLVMType llvmPointerPointerType; 143 LLVM::LLVMType llvmInt8Type; 144 LLVM::LLVMType llvmInt32Type; 145 LLVM::LLVMType llvmInt64Type; 146 LLVM::LLVMType llvmIntPtrType; 147 }; 148 149 } // anonymous namespace 150 151 // Adds declarations for the needed helper functions from the CUDA wrapper. 152 // The types in comments give the actual types expected/returned but the API 153 // uses void pointers. This is fine as they have the same linkage in C. 
154 void GpuLaunchFuncToCudaCallsPass::declareCudaFunctions(Location loc) { 155 ModuleOp module = getModule(); 156 Builder builder(module); 157 if (!module.lookupSymbol<FuncOp>(cuModuleLoadName)) { 158 module.push_back( 159 FuncOp::create(loc, cuModuleLoadName, 160 builder.getFunctionType( 161 { 162 getPointerPointerType(), /* CUmodule *module */ 163 getPointerType() /* void *cubin */ 164 }, 165 getCUResultType()))); 166 } 167 if (!module.lookupSymbol<FuncOp>(cuModuleGetFunctionName)) { 168 // The helper uses void* instead of CUDA's opaque CUmodule and 169 // CUfunction. 170 module.push_back( 171 FuncOp::create(loc, cuModuleGetFunctionName, 172 builder.getFunctionType( 173 { 174 getPointerPointerType(), /* void **function */ 175 getPointerType(), /* void *module */ 176 getPointerType() /* char *name */ 177 }, 178 getCUResultType()))); 179 } 180 if (!module.lookupSymbol<FuncOp>(cuLaunchKernelName)) { 181 // Other than the CUDA api, the wrappers use uintptr_t to match the 182 // LLVM type if MLIR's index type, which the GPU dialect uses. 183 // Furthermore, they use void* instead of CUDA's opaque CUfunction and 184 // CUstream. 185 module.push_back(FuncOp::create( 186 loc, cuLaunchKernelName, 187 builder.getFunctionType( 188 { 189 getPointerType(), /* void* f */ 190 getIntPtrType(), /* intptr_t gridXDim */ 191 getIntPtrType(), /* intptr_t gridyDim */ 192 getIntPtrType(), /* intptr_t gridZDim */ 193 getIntPtrType(), /* intptr_t blockXDim */ 194 getIntPtrType(), /* intptr_t blockYDim */ 195 getIntPtrType(), /* intptr_t blockZDim */ 196 getInt32Type(), /* unsigned int sharedMemBytes */ 197 getPointerType(), /* void *hstream */ 198 getPointerPointerType(), /* void **kernelParams */ 199 getPointerPointerType() /* void **extra */ 200 }, 201 getCUResultType()))); 202 } 203 if (!module.lookupSymbol<FuncOp>(cuGetStreamHelperName)) { 204 // Helper function to get the current CUDA stream. Uses void* instead of 205 // CUDAs opaque CUstream. 
206 module.push_back(FuncOp::create( 207 loc, cuGetStreamHelperName, 208 builder.getFunctionType({}, getPointerType() /* void *stream */))); 209 } 210 if (!module.lookupSymbol<FuncOp>(cuStreamSynchronizeName)) { 211 module.push_back( 212 FuncOp::create(loc, cuStreamSynchronizeName, 213 builder.getFunctionType( 214 { 215 getPointerType() /* CUstream stream */ 216 }, 217 getCUResultType()))); 218 } 219 } 220 221 // Generates a parameters array to be used with a CUDA kernel launch call. The 222 // arguments are extracted from the launchOp. 223 // The generated code is essentially as follows: 224 // 225 // %array = alloca(numparams * sizeof(void *)) 226 // for (i : [0, NumKernelOperands)) 227 // %array[i] = cast<void*>(KernelOperand[i]) 228 // return %array 229 Value * 230 GpuLaunchFuncToCudaCallsPass::setupParamsArray(gpu::LaunchFuncOp launchOp, 231 OpBuilder &builder) { 232 Location loc = launchOp.getLoc(); 233 auto one = builder.create<LLVM::ConstantOp>(loc, getInt32Type(), 234 builder.getI32IntegerAttr(1)); 235 auto arraySize = builder.create<LLVM::ConstantOp>( 236 loc, getInt32Type(), 237 builder.getI32IntegerAttr(launchOp.getNumKernelOperands())); 238 auto array = builder.create<LLVM::AllocaOp>(loc, getPointerPointerType(), 239 arraySize, /*alignment=*/0); 240 for (int idx = 0, e = launchOp.getNumKernelOperands(); idx < e; ++idx) { 241 auto operand = launchOp.getKernelOperand(idx); 242 auto llvmType = operand->getType().cast<LLVM::LLVMType>(); 243 auto memLocation = builder.create<LLVM::AllocaOp>( 244 loc, llvmType.getPointerTo(), one, /*alignment=*/1); 245 builder.create<LLVM::StoreOp>(loc, operand, memLocation); 246 auto casted = 247 builder.create<LLVM::BitcastOp>(loc, getPointerType(), memLocation); 248 auto index = builder.create<LLVM::ConstantOp>( 249 loc, getInt32Type(), builder.getI32IntegerAttr(idx)); 250 auto gep = builder.create<LLVM::GEPOp>(loc, getPointerPointerType(), array, 251 ArrayRef<Value *>{index}); 252 builder.create<LLVM::StoreOp>(loc, 
casted, gep); 253 } 254 return array; 255 } 256 257 // Generates an LLVM IR dialect global that contains the name of the given 258 // kernel function as a C string, and returns a pointer to its beginning. 259 // The code is essentially: 260 // 261 // llvm.global constant @kernel_name("function_name\00") 262 // func(...) { 263 // %0 = llvm.addressof @kernel_name 264 // %1 = llvm.constant (0 : index) 265 // %2 = llvm.getelementptr %0[%1, %1] : !llvm<"i8*"> 266 // } 267 Value *GpuLaunchFuncToCudaCallsPass::generateKernelNameConstant( 268 FuncOp kernelFunction, Location &loc, OpBuilder &builder) { 269 // Make sure the trailing zero is included in the constant. 270 std::vector<char> kernelName(kernelFunction.getName().begin(), 271 kernelFunction.getName().end()); 272 kernelName.push_back('\0'); 273 274 std::string globalName = 275 llvm::formatv("{0}_kernel_name", kernelFunction.getName()); 276 return LLVM::createGlobalString( 277 loc, builder, globalName, StringRef(kernelName.data(), kernelName.size()), 278 llvmDialect); 279 } 280 281 // Emits LLVM IR to launch a kernel function. Expects the module that contains 282 // the compiled kernel function as a cubin in the 'nvvm.cubin' attribute of the 283 // kernel function in the IR. 284 // While MLIR has no global constants, also expects a cubin getter function in 285 // an 'nvvm.cubingetter' attribute. Such function is expected to return a 286 // pointer to the cubin blob when invoked. 
287 // With these given, the generated code in essence is 288 // 289 // %0 = call %cubingetter 290 // %1 = alloca sizeof(void*) 291 // call %mcuModuleLoad(%2, %1) 292 // %2 = alloca sizeof(void*) 293 // %3 = load %1 294 // %4 = <see generateKernelNameConstant> 295 // call %mcuModuleGetFunction(%2, %3, %4) 296 // %5 = call %mcuGetStreamHelper() 297 // %6 = load %2 298 // %7 = <see setupParamsArray> 299 // call %mcuLaunchKernel(%6, <launchOp operands 0..5>, 0, %5, %7, nullptr) 300 // call %mcuStreamSynchronize(%5) 301 void GpuLaunchFuncToCudaCallsPass::translateGpuLaunchCalls( 302 mlir::gpu::LaunchFuncOp launchOp) { 303 OpBuilder builder(launchOp); 304 Location loc = launchOp.getLoc(); 305 declareCudaFunctions(loc); 306 307 auto zero = builder.create<LLVM::ConstantOp>(loc, getInt32Type(), 308 builder.getI32IntegerAttr(0)); 309 // Emit a call to the cubin getter to retrieve a pointer to the data that 310 // represents the cubin at runtime. 311 // TODO(herhut): This should rather be a static global once supported. 312 auto kernelFunction = getModule().lookupSymbol<FuncOp>(launchOp.kernel()); 313 auto cubinGetter = 314 kernelFunction.getAttrOfType<SymbolRefAttr>(kCubinGetterAnnotation); 315 if (!cubinGetter) { 316 kernelFunction.emitError("Missing ") 317 << kCubinGetterAnnotation << " attribute."; 318 return signalPassFailure(); 319 } 320 auto data = builder.create<LLVM::CallOp>( 321 loc, ArrayRef<Type>{getPointerType()}, cubinGetter, ArrayRef<Value *>{}); 322 // Emit the load module call to load the module data. Error checking is done 323 // in the called helper function. 324 auto cuModule = allocatePointer(builder, loc); 325 FuncOp cuModuleLoad = getModule().lookupSymbol<FuncOp>(cuModuleLoadName); 326 builder.create<LLVM::CallOp>(loc, ArrayRef<Type>{getCUResultType()}, 327 builder.getSymbolRefAttr(cuModuleLoad), 328 ArrayRef<Value *>{cuModule, data.getResult(0)}); 329 // Get the function from the module. The name corresponds to the name of 330 // the kernel function. 
331 auto cuOwningModuleRef = 332 builder.create<LLVM::LoadOp>(loc, getPointerType(), cuModule); 333 auto kernelName = generateKernelNameConstant(kernelFunction, loc, builder); 334 auto cuFunction = allocatePointer(builder, loc); 335 FuncOp cuModuleGetFunction = 336 getModule().lookupSymbol<FuncOp>(cuModuleGetFunctionName); 337 builder.create<LLVM::CallOp>( 338 loc, ArrayRef<Type>{getCUResultType()}, 339 builder.getSymbolRefAttr(cuModuleGetFunction), 340 ArrayRef<Value *>{cuFunction, cuOwningModuleRef, kernelName}); 341 // Grab the global stream needed for execution. 342 FuncOp cuGetStreamHelper = 343 getModule().lookupSymbol<FuncOp>(cuGetStreamHelperName); 344 auto cuStream = builder.create<LLVM::CallOp>( 345 loc, ArrayRef<Type>{getPointerType()}, 346 builder.getSymbolRefAttr(cuGetStreamHelper), ArrayRef<Value *>{}); 347 // Invoke the function with required arguments. 348 auto cuLaunchKernel = getModule().lookupSymbol<FuncOp>(cuLaunchKernelName); 349 auto cuFunctionRef = 350 builder.create<LLVM::LoadOp>(loc, getPointerType(), cuFunction); 351 auto paramsArray = setupParamsArray(launchOp, builder); 352 auto nullpointer = 353 builder.create<LLVM::IntToPtrOp>(loc, getPointerPointerType(), zero); 354 builder.create<LLVM::CallOp>( 355 loc, ArrayRef<Type>{getCUResultType()}, 356 builder.getSymbolRefAttr(cuLaunchKernel), 357 ArrayRef<Value *>{cuFunctionRef, launchOp.getOperand(0), 358 launchOp.getOperand(1), launchOp.getOperand(2), 359 launchOp.getOperand(3), launchOp.getOperand(4), 360 launchOp.getOperand(5), zero, /* sharedMemBytes */ 361 cuStream.getResult(0), /* stream */ 362 paramsArray, /* kernel params */ 363 nullpointer /* extra */}); 364 // Sync on the stream to make it synchronous. 
365 auto cuStreamSync = getModule().lookupSymbol<FuncOp>(cuStreamSynchronizeName); 366 builder.create<LLVM::CallOp>(loc, ArrayRef<Type>{getCUResultType()}, 367 builder.getSymbolRefAttr(cuStreamSync), 368 ArrayRef<Value *>(cuStream.getResult(0))); 369 launchOp.erase(); 370 } 371 372 std::unique_ptr<mlir::ModulePassBase> 373 mlir::createConvertGpuLaunchFuncToCudaCallsPass() { 374 return std::make_unique<GpuLaunchFuncToCudaCallsPass>(); 375 } 376 377 static PassRegistration<GpuLaunchFuncToCudaCallsPass> 378 pass("launch-func-to-cuda", 379 "Convert all launch_func ops to CUDA runtime calls");