github.com/johnnyeven/libtools@v0.0.0-20191126065708-61829c1adf46/third_party/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp

//===- LoopsToGPU.cpp - Convert an affine loop nest to a GPU kernel -------===//
//
// Copyright 2019 The MLIR Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
//
// This implements a straightforward conversion of a loop nest into a GPU
// kernel. The caller is expected to guarantee that the conversion is correct
// or to further transform the kernel to ensure correctness.
//
//===----------------------------------------------------------------------===//

#include "mlir/Conversion/LoopsToGPU/LoopsToGPU.h"
#include "mlir/Dialect/AffineOps/AffineOps.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/LoopOps/LoopOps.h"
#include "mlir/Dialect/StandardOps/Ops.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/Builders.h"
#include "mlir/Transforms/LowerAffine.h"
#include "mlir/Transforms/RegionUtils.h"

#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "loops-to-gpu"

using namespace mlir;
using namespace mlir::loop;

// Extract an indexed value from KernelDim3.
static Value *getDim3Value(const gpu::KernelDim3 &dim3, unsigned pos) {
  switch (pos) {
  case 0:
    return dim3.x;
  case 1:
    return dim3.y;
  case 2:
    return dim3.z;
  default:
    llvm_unreachable("dim3 position out of bounds");
  }
  return nullptr;
}

// Get the lower bound-related operands of a loop operation.
static Operation::operand_range getLowerBoundOperands(AffineForOp forOp) {
  return forOp.getLowerBoundOperands();
}
static SmallVector<Value *, 1> getLowerBoundOperands(ForOp forOp) {
  SmallVector<Value *, 1> bounds(1, forOp.lowerBound());
  return bounds;
}

// Get the upper bound-related operands of a loop operation.
static Operation::operand_range getUpperBoundOperands(AffineForOp forOp) {
  return forOp.getUpperBoundOperands();
}
static SmallVector<Value *, 1> getUpperBoundOperands(ForOp forOp) {
  SmallVector<Value *, 1> bounds(1, forOp.upperBound());
  return bounds;
}

// Get a Value that corresponds to the loop step. If the step is an attribute,
// materialize a corresponding constant using builder.
static Value *getOrCreateStep(AffineForOp forOp, OpBuilder &builder) {
  return builder.create<ConstantIndexOp>(forOp.getLoc(), forOp.getStep());
}
static Value *getOrCreateStep(ForOp forOp, OpBuilder &) { return forOp.step(); }
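// Illustrative sketch (assuming the 2019-era textual IR syntax): these
// per-dialect overloads, together with the getOrEmit* helpers below, let the
// rest of the pass treat both supported loop forms uniformly. The two loop
// kinds look roughly like
//
//   affine.for %i = 0 to 42 step 2 { ... }    // bounds are affine maps,
//                                             // step is an attribute
//   loop.for %i = %lb to %ub step %s { ... }  // bounds and step are SSA values
//
// which is why the AffineForOp overloads may need to materialize constants or
// lower affine maps, while the loop::ForOp overloads simply return existing
// operands.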
// Get a Value for the loop lower bound. If the value requires computation,
// materialize the instructions using builder.
static Value *getOrEmitLowerBound(AffineForOp forOp, OpBuilder &builder) {
  return lowerAffineLowerBound(forOp, builder);
}
static Value *getOrEmitLowerBound(ForOp forOp, OpBuilder &) {
  return forOp.lowerBound();
}

// Get a Value for the loop upper bound. If the value requires computation,
// materialize the instructions using builder.
static Value *getOrEmitUpperBound(AffineForOp forOp, OpBuilder &builder) {
  return lowerAffineUpperBound(forOp, builder);
}
static Value *getOrEmitUpperBound(ForOp forOp, OpBuilder &) {
  return forOp.upperBound();
}

// Check the structure of the loop nest:
//   - there are enough loops to map to numBlockDims + numThreadDims;
//   - the loops are perfectly nested;
//   - the loop bounds can be computed above the outermost loop.
// This roughly corresponds to the "matcher" part of the pattern-based
// rewriting infrastructure.
template <typename OpTy>
LogicalResult checkLoopNestMappable(OpTy forOp, unsigned numBlockDims,
                                    unsigned numThreadDims) {
  if (numBlockDims < 1 || numThreadDims < 1) {
    LLVM_DEBUG(llvm::dbgs() << "nothing to map");
    return success();
  }

  OpBuilder builder(forOp.getOperation());
  if (numBlockDims > 3) {
    return emitError(builder.getUnknownLoc(),
                     "cannot map to more than 3 block dimensions");
  }
  if (numThreadDims > 3) {
    return emitError(builder.getUnknownLoc(),
                     "cannot map to more than 3 thread dimensions");
  }

  OpTy currentLoop = forOp;
  Region &limit = forOp.region();
  for (unsigned i = 0, e = numBlockDims + numThreadDims; i < e; ++i) {
    Operation *nested = &currentLoop.getBody()->front();
    if (!areValuesDefinedAbove(getLowerBoundOperands(currentLoop), limit) ||
        !areValuesDefinedAbove(getUpperBoundOperands(currentLoop), limit))
      return currentLoop.emitError(
          "loops with bounds depending on other mapped loops "
          "are not supported");

    // The innermost loop can have an arbitrary body, skip the perfect nesting
    // check for it.
    if (i == e - 1)
      break;

    auto begin = currentLoop.getBody()->begin(),
         end = currentLoop.getBody()->end();
    if (currentLoop.getBody()->empty() || std::next(begin, 2) != end)
      return currentLoop.emitError(
          "expected perfectly nested loops in the body");

    if (!(currentLoop = dyn_cast<OpTy>(nested)))
      return nested->emitError("expected a nested loop");
  }

  return success();
}
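// Worked example (illustrative, assuming numBlockDims = 1 and
// numThreadDims = 1): the matcher above accepts a nest such as
//
//   loop.for %i = %lb0 to %ub0 step %s0 {
//     loop.for %j = %lb1 to %ub1 step %s1 {
//       "some.payload"(%i, %j) : (index, index) -> ()
//     }
//   }
//
// because each non-innermost body contains exactly the nested loop plus the
// terminator (hence the std::next(begin, 2) != end check) and all bounds are
// defined above the outermost loop. A nest whose inner bound %ub1 were
// computed from %i would be rejected.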
namespace {
// Helper structure that holds common state of the loop to GPU kernel
// conversion.
struct LoopToGpuConverter {
  template <typename OpTy>
  Optional<OpTy> collectBounds(OpTy forOp, unsigned numLoops);

  template <typename OpTy>
  void createLaunch(OpTy rootForOp, OpTy innermostForOp, unsigned numBlockDims,
                    unsigned numThreadDims);

  // Ranges of the loops mapped to blocks or threads.
  SmallVector<Value *, 6> dims;
  // Lower bounds of the loops mapped to blocks or threads.
  SmallVector<Value *, 6> lbs;
  // Induction variables of the loops mapped to blocks or threads.
  SmallVector<Value *, 6> ivs;
  // Steps of the loops mapped to blocks or threads.
  SmallVector<Value *, 6> steps;
};
} // namespace

// Return true if the value is obviously a constant "one".
static bool isConstantOne(Value *value) {
  if (auto def = dyn_cast_or_null<ConstantIndexOp>(value->getDefiningOp()))
    return def.getValue() == 1;
  return false;
}

// Collect ranges, bounds, steps and induction variables in preparation for
// mapping a loop nest of depth "numLoops" rooted at "forOp" to a GPU kernel.
// This may fail if the IR for computing loop bounds cannot be constructed, for
// example if an affine loop uses semi-affine maps. Return the last loop to be
// mapped on success, llvm::None on failure.
template <typename OpTy>
Optional<OpTy> LoopToGpuConverter::collectBounds(OpTy forOp,
                                                 unsigned numLoops) {
  OpBuilder builder(forOp.getOperation());
  dims.reserve(numLoops);
  lbs.reserve(numLoops);
  ivs.reserve(numLoops);
  steps.reserve(numLoops);
  OpTy currentLoop = forOp;
  for (unsigned i = 0; i < numLoops; ++i) {
    Value *lowerBound = getOrEmitLowerBound(currentLoop, builder);
    Value *upperBound = getOrEmitUpperBound(currentLoop, builder);
    if (!lowerBound || !upperBound) {
      return llvm::None;
    }

    Value *range =
        builder.create<SubIOp>(currentLoop.getLoc(), upperBound, lowerBound);
    Value *step = getOrCreateStep(currentLoop, builder);
    if (!isConstantOne(step))
      range = builder.create<DivISOp>(currentLoop.getLoc(), range, step);
    dims.push_back(range);

    lbs.push_back(lowerBound);
    ivs.push_back(currentLoop.getInductionVar());
    steps.push_back(step);

    if (i != numLoops - 1)
      currentLoop = cast<OpTy>(&currentLoop.getBody()->front());
  }
  return currentLoop;
}
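// Worked example (illustrative numbers, not taken from a test): for a loop
//
//   loop.for %i = %c4 to %c20 step %c2 { ... }
//
// collectBounds records range = (20 - 4) / 2 = 8 along with lb = 4 and
// step = 2; the range later becomes one of the grid or block sizes of the
// launch. When the step is the constant 1, the division is skipped entirely
// (see isConstantOne above).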
// Replace the loop nest rooted at "rootForOp" with a GPU launch operation.
// This expects "innermostForOp" to point to the last loop to be transformed
// to the kernel, and to have (numBlockDims + numThreadDims) perfectly nested
// loops between "rootForOp" and "innermostForOp".
template <typename OpTy>
void LoopToGpuConverter::createLaunch(OpTy rootForOp, OpTy innermostForOp,
                                      unsigned numBlockDims,
                                      unsigned numThreadDims) {
  OpBuilder builder(rootForOp.getOperation());
  // Prepare the grid and block sizes for the launch operation. If there is
  // no loop mapped to a specific dimension, use constant "1" as its size.
  Value *constOne = (numBlockDims < 3 || numThreadDims < 3)
                        ? builder.create<ConstantIndexOp>(rootForOp.getLoc(), 1)
                        : nullptr;
  Value *gridSizeX = dims[0];
  Value *gridSizeY = numBlockDims > 1 ? dims[1] : constOne;
  Value *gridSizeZ = numBlockDims > 2 ? dims[2] : constOne;
  Value *blockSizeX = dims[numBlockDims];
  Value *blockSizeY = numThreadDims > 1 ? dims[numBlockDims + 1] : constOne;
  Value *blockSizeZ = numThreadDims > 2 ? dims[numBlockDims + 2] : constOne;

  // Create a launch op and move the body region of the innermost loop to the
  // launch op. Pass the values defined outside the outermost loop and used
  // inside the innermost loop and loop lower bounds as kernel data arguments.
  // Still assuming perfect nesting so there are no values other than induction
  // variables that are defined in one loop and used in deeper loops.
  llvm::SetVector<Value *> valuesToForwardSet;
  getUsedValuesDefinedAbove(innermostForOp.region(), rootForOp.region(),
                            valuesToForwardSet);
  auto valuesToForward = valuesToForwardSet.takeVector();
  auto originallyForwardedValues = valuesToForward.size();
  valuesToForward.insert(valuesToForward.end(), lbs.begin(), lbs.end());
  valuesToForward.insert(valuesToForward.end(), steps.begin(), steps.end());
  auto launchOp = builder.create<gpu::LaunchOp>(
      rootForOp.getLoc(), gridSizeX, gridSizeY, gridSizeZ, blockSizeX,
      blockSizeY, blockSizeZ, valuesToForward);
  valuesToForward.resize(originallyForwardedValues);

  // Replace the loop terminator (loops contain only a single block) with the
  // gpu return and move the operations from the loop body block to the gpu
  // launch body block. Do not move the entire block because of the difference
  // in block arguments.
  Operation &terminator = innermostForOp.getBody()->back();
  Location terminatorLoc = terminator.getLoc();
  terminator.erase();
  builder.setInsertionPointToEnd(innermostForOp.getBody());
  builder.create<gpu::Return>(terminatorLoc);
  launchOp.getBody().front().getOperations().splice(
      launchOp.getBody().front().begin(),
      innermostForOp.getBody()->getOperations());

  // Remap the loop iterators to use block/thread identifiers instead. Loops
  // may iterate from LB with step S whereas GPU thread/block ids always
  // iterate from 0 to N with step 1. Therefore, loop induction variables are
  // replaced with (gpu-thread/block-id * S) + LB.
  builder.setInsertionPointToStart(&launchOp.getBody().front());
  auto lbArgumentIt = std::next(launchOp.getKernelArguments().begin(),
                                originallyForwardedValues);
  auto stepArgumentIt = std::next(lbArgumentIt, lbs.size());
  for (auto en : llvm::enumerate(ivs)) {
    Value *id =
        en.index() < numBlockDims
            ? getDim3Value(launchOp.getBlockIds(), en.index())
            : getDim3Value(launchOp.getThreadIds(), en.index() - numBlockDims);
    Value *step = steps[en.index()];
    if (!isConstantOne(step))
      id = builder.create<MulIOp>(rootForOp.getLoc(), step, id);

    Value *ivReplacement =
        builder.create<AddIOp>(rootForOp.getLoc(), *lbArgumentIt, id);
    en.value()->replaceAllUsesWith(ivReplacement);
    replaceAllUsesInRegionWith(steps[en.index()], *stepArgumentIt,
                               launchOp.getBody());
    std::advance(lbArgumentIt, 1);
    std::advance(stepArgumentIt, 1);
  }

  // Remap the values defined outside the body to use kernel arguments instead.
  // The list of kernel arguments also contains the lower bounds for loops at
  // trailing positions; make sure we don't touch those.
  for (const auto &pair :
       llvm::zip_first(valuesToForward, launchOp.getKernelArguments())) {
    Value *from = std::get<0>(pair);
    Value *to = std::get<1>(pair);
    replaceAllUsesInRegionWith(from, to, launchOp.getBody());
  }

  // We are done and can erase the original outermost loop.
  rootForOp.erase();
}
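// Remapping example (illustrative, assuming a loop mapped to the x block
// dimension with lower bound %lb and step %step forwarded as kernel
// arguments): inside the launch body the original induction variable is
// reconstructed roughly as
//
//   %scaled = muli %step, %blockIdx_x   // omitted when the step is 1
//   %iv     = addi %lb, %scaled
//
// after which all uses of the loop induction variable are rewritten to %iv,
// and uses of the forwarded lower bounds and steps are redirected to the
// corresponding kernel arguments.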
// Generic loop to GPU kernel conversion function.
template <typename OpTy>
static LogicalResult convertLoopNestToGPULaunch(OpTy forOp,
                                                unsigned numBlockDims,
                                                unsigned numThreadDims) {
  if (failed(checkLoopNestMappable(forOp, numBlockDims, numThreadDims)))
    return failure();

  LoopToGpuConverter converter;
  auto maybeInnerLoop =
      converter.collectBounds(forOp, numBlockDims + numThreadDims);
  if (!maybeInnerLoop)
    return failure();
  converter.createLaunch(forOp, *maybeInnerLoop, numBlockDims, numThreadDims);

  return success();
}

LogicalResult mlir::convertAffineLoopNestToGPULaunch(AffineForOp forOp,
                                                     unsigned numBlockDims,
                                                     unsigned numThreadDims) {
  return ::convertLoopNestToGPULaunch(forOp, numBlockDims, numThreadDims);
}

LogicalResult mlir::convertLoopNestToGPULaunch(ForOp forOp,
                                               unsigned numBlockDims,
                                               unsigned numThreadDims) {
  return ::convertLoopNestToGPULaunch(forOp, numBlockDims, numThreadDims);
}
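// Usage sketch (illustrative; the real driver lives in the accompanying pass
// and may differ): a function pass would typically visit each outermost loop
// and call one of the entry points above, e.g.
//
//   if (auto forOp = dyn_cast<AffineForOp>(&op))
//     if (failed(convertAffineLoopNestToGPULaunch(forOp, /*numBlockDims=*/1,
//                                                 /*numThreadDims=*/1)))
//       signalPassFailure();
//
// On success the loop nest is replaced by a gpu.launch operation with the
// computed grid and block sizes; on failure the IR may already have been
// partially rewritten, so callers generally treat it as a pass error.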