//===- LoopUtils.cpp ---- Misc utilities for loop transformation ----------===//
//
// Copyright 2019 The MLIR Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
//
// This file implements miscellaneous loop transformation routines.
//
//===----------------------------------------------------------------------===//

#include "mlir/Transforms/LoopUtils.h"

#include "mlir/Analysis/AffineAnalysis.h"
#include "mlir/Analysis/AffineStructures.h"
#include "mlir/Analysis/LoopAnalysis.h"
#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Dialect/AffineOps/AffineOps.h"
#include "mlir/Dialect/LoopOps/LoopOps.h"
#include "mlir/Dialect/StandardOps/Ops.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/AffineMap.h"
#include "mlir/IR/BlockAndValueMapping.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/Function.h"
#include "mlir/IR/Module.h"
#include "mlir/IR/Operation.h"
#include "mlir/Transforms/RegionUtils.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "LoopUtils"

using namespace mlir;
using llvm::SetVector;

/// Computes the cleanup loop lower bound of the loop being unrolled with
/// the specified unroll factor; this bound will also be the upper bound of the
/// main part of the unrolled loop. Computes the bound as an AffineMap with its
/// operands or a null map when the trip count can't be expressed as an affine
/// expression.
void mlir::getCleanupLoopLowerBound(AffineForOp forOp, unsigned unrollFactor,
                                    AffineMap *map,
                                    SmallVectorImpl<Value *> *operands,
                                    OpBuilder &b) {
  auto lbMap = forOp.getLowerBoundMap();

  // Single result lower bound map only.
  if (lbMap.getNumResults() != 1) {
    *map = AffineMap();
    return;
  }

  AffineMap tripCountMap;
  SmallVector<Value *, 4> tripCountOperands;
  buildTripCountMapAndOperands(forOp, &tripCountMap, &tripCountOperands);

  // Sometimes the trip count cannot be expressed as an affine expression.
  if (!tripCountMap) {
    *map = AffineMap();
    return;
  }

  unsigned step = forOp.getStep();

  SmallVector<Value *, 4> lbOperands(forOp.getLowerBoundOperands());
  auto lb = b.create<AffineApplyOp>(forOp.getLoc(), lbMap, lbOperands);

  // For each upper bound expr, get the range.
  // Eg: affine.for %i = lb to min (ub1, ub2),
  // where tripCountExprs yield (tr1, tr2), we create affine.apply's:
  // lb + tr1 - tr1 % ufactor, lb + tr2 - tr2 % ufactor; the results of all
  // these affine.apply's make up the cleanup loop lower bound.
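  // For instance (an illustrative sketch, not taken from a test): with lb = 0,
  // a single upper bound of 10, step 1 and unrollFactor 4, the trip count is
  // 10 and the cleanup lower bound evaluates to 0 + 10 - 10 % 4 = 8, i.e. the
  // unrolled loop covers [0, 8) and the cleanup loop covers [8, 10).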
  SmallVector<AffineExpr, 4> bumpExprs(tripCountMap.getNumResults());
  SmallVector<Value *, 4> bumpValues(tripCountMap.getNumResults());
  for (unsigned i = 0, e = tripCountMap.getNumResults(); i < e; i++) {
    auto tripCountExpr = tripCountMap.getResult(i);
    bumpExprs[i] = (tripCountExpr - tripCountExpr % unrollFactor) * step;
    auto bumpMap = b.getAffineMap(tripCountMap.getNumDims(),
                                  tripCountMap.getNumSymbols(), bumpExprs[i]);
    bumpValues[i] =
        b.create<AffineApplyOp>(forOp.getLoc(), bumpMap, tripCountOperands);
  }

  SmallVector<AffineExpr, 4> newUbExprs(tripCountMap.getNumResults());
  for (unsigned i = 0, e = bumpExprs.size(); i < e; i++)
    newUbExprs[i] = b.getAffineDimExpr(0) + b.getAffineDimExpr(i + 1);

  operands->clear();
  operands->push_back(lb);
  operands->append(bumpValues.begin(), bumpValues.end());
  *map = b.getAffineMap(1 + tripCountMap.getNumResults(), 0, newUbExprs);
  // Simplify the map + operands.
  fullyComposeAffineMapAndOperands(map, operands);
  *map = simplifyAffineMap(*map);
  canonicalizeMapAndOperands(map, operands);
  // Remove any affine.apply's that became dead from the simplification above.
  for (auto *v : bumpValues) {
    if (v->use_empty()) {
      v->getDefiningOp()->erase();
    }
  }
  if (lb.use_empty())
    lb.erase();
}

/// Promotes the loop body of a forOp to its containing block if the forOp
/// was known to have a single iteration.
// TODO(bondhugula): extend this for arbitrary affine bounds.
LogicalResult mlir::promoteIfSingleIteration(AffineForOp forOp) {
  Optional<uint64_t> tripCount = getConstantTripCount(forOp);
  if (!tripCount.hasValue() || tripCount.getValue() != 1)
    return failure();

  // TODO(mlir-team): there is no builder for a max.
  if (forOp.getLowerBoundMap().getNumResults() != 1)
    return failure();

  // Replaces all IV uses with its single iteration value.
  auto *iv = forOp.getInductionVar();
  Operation *op = forOp.getOperation();
  if (!iv->use_empty()) {
    if (forOp.hasConstantLowerBound()) {
      OpBuilder topBuilder(op->getParentOfType<FuncOp>().getBody());
      auto constOp = topBuilder.create<ConstantIndexOp>(
          forOp.getLoc(), forOp.getConstantLowerBound());
      iv->replaceAllUsesWith(constOp);
    } else {
      AffineBound lb = forOp.getLowerBound();
      SmallVector<Value *, 4> lbOperands(lb.operand_begin(), lb.operand_end());
      OpBuilder builder(op->getBlock(), Block::iterator(op));
      if (lb.getMap() == builder.getDimIdentityMap()) {
        // No need of generating an affine.apply.
        iv->replaceAllUsesWith(lbOperands[0]);
      } else {
        auto affineApplyOp = builder.create<AffineApplyOp>(
            op->getLoc(), lb.getMap(), lbOperands);
        iv->replaceAllUsesWith(affineApplyOp);
      }
    }
  }
  // Move the loop body operations, except for its terminator, to the loop's
  // containing block.
  auto *block = op->getBlock();
  forOp.getBody()->getOperations().back().erase();
  block->getOperations().splice(Block::iterator(op),
                                forOp.getBody()->getOperations());
  forOp.erase();
  return success();
}

/// Promotes all single-iteration 'affine.for' ops in the FuncOp, i.e., moves
/// their body into the containing Block.
void mlir::promoteSingleIterationLoops(FuncOp f) {
  // Walks all 'affine.for' ops in post order and promotes the single-iteration
  // ones.
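  // Post-order visitation promotes inner loops before their enclosing loops,
  // so a nest of single-iteration loops collapses fully in a single walk.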
  f.walk([](AffineForOp forOp) { promoteIfSingleIteration(forOp); });
}

/// Generates an 'affine.for' op with the specified lower and upper bounds
/// while generating the right IV remappings for the shifted operations. The
/// operation blocks that go into the loop are specified in instGroupQueue
/// starting from the specified offset, and in that order; the first element of
/// the pair specifies the shift applied to that group of operations; note
/// that the shift is multiplied by the loop step before being applied. Returns
/// a null AffineForOp if the generated loop simplifies to a single-iteration
/// one.
static AffineForOp
generateLoop(AffineMap lbMap, AffineMap ubMap,
             const std::vector<std::pair<uint64_t, ArrayRef<Operation *>>>
                 &instGroupQueue,
             unsigned offset, AffineForOp srcForInst, OpBuilder b) {
  SmallVector<Value *, 4> lbOperands(srcForInst.getLowerBoundOperands());
  SmallVector<Value *, 4> ubOperands(srcForInst.getUpperBoundOperands());

  assert(lbMap.getNumInputs() == lbOperands.size());
  assert(ubMap.getNumInputs() == ubOperands.size());

  auto loopChunk =
      b.create<AffineForOp>(srcForInst.getLoc(), lbOperands, lbMap, ubOperands,
                            ubMap, srcForInst.getStep());
  auto *loopChunkIV = loopChunk.getInductionVar();
  auto *srcIV = srcForInst.getInductionVar();

  BlockAndValueMapping operandMap;

  OpBuilder bodyBuilder = loopChunk.getBodyBuilder();
  for (auto it = instGroupQueue.begin() + offset, e = instGroupQueue.end();
       it != e; ++it) {
    uint64_t shift = it->first;
    auto insts = it->second;
    // All 'same shift' operations get added with their operands being
    // remapped to results of cloned operations, and their IV uses remapped.
    // Generate the remapping if the shift is not zero: remappedIV = newIV -
    // shift.
    if (!srcIV->use_empty() && shift != 0) {
      auto ivRemap = bodyBuilder.create<AffineApplyOp>(
          srcForInst.getLoc(),
          bodyBuilder.getSingleDimShiftAffineMap(
              -static_cast<int64_t>(srcForInst.getStep() * shift)),
          loopChunkIV);
      operandMap.map(srcIV, ivRemap);
    } else {
      operandMap.map(srcIV, loopChunkIV);
    }
    for (auto *op : insts) {
      if (!isa<AffineTerminatorOp>(op))
        bodyBuilder.clone(*op, operandMap);
    }
  }
  if (succeeded(promoteIfSingleIteration(loopChunk)))
    return AffineForOp();
  return loopChunk;
}

/// Skews the operations in the body of an 'affine.for' operation with the
/// specified operation-wise shifts. The shifts are with respect to the
/// original execution order, and are multiplied by the loop 'step' before
/// being applied. A shift of zero for each operation will lead to no change.
// The skewing of operations with respect to one another can be used for
// example to allow overlap of asynchronous operations (such as DMA
// communication) with computation, or just relative shifting of operations
// for better register reuse, locality or parallelism. As such, the shifts are
// typically expected to be at most of the order of the number of operations.
// This method should not be used as a substitute for loop
// distribution/fission. It uses an algorithm that runs in time linear in the
// number of operations in the body of the for loop (using the 'sweep line'
// paradigm). This method asserts preservation of SSA dominance. A check for
// that, as well as for memory-based dependence preservation, rests with the
// users of this method.
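// For example (an illustrative sketch, not taken from a test): with a
// two-operation body, shifts {0, 1} and step 1, the second operation of
// iteration i is executed alongside the first operation of iteration i + 1,
// producing a prologue loop, a steady-state loop and an epilogue loop.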
LogicalResult mlir::instBodySkew(AffineForOp forOp, ArrayRef<uint64_t> shifts,
                                 bool unrollPrologueEpilogue) {
  if (forOp.getBody()->begin() == std::prev(forOp.getBody()->end()))
    return success();

  // If the trip count isn't constant, we would need versioning and
  // conditional guards (or context information to prevent such versioning).
  // The better way to pipeline for such loops is to first tile them and
  // extract constant trip count "full tiles" before applying this.
  auto mayBeConstTripCount = getConstantTripCount(forOp);
  if (!mayBeConstTripCount.hasValue()) {
    LLVM_DEBUG(forOp.emitRemark("non-constant trip count loop not handled"));
    return success();
  }
  uint64_t tripCount = mayBeConstTripCount.getValue();

  assert(isInstwiseShiftValid(forOp, shifts) &&
         "shifts will lead to an invalid transformation\n");

  int64_t step = forOp.getStep();

  unsigned numChildInsts = forOp.getBody()->getOperations().size();

  // Do a linear time (counting) sort for the shifts.
  uint64_t maxShift = 0;
  for (unsigned i = 0; i < numChildInsts; i++) {
    maxShift = std::max(maxShift, shifts[i]);
  }
  // Such large shifts are not the typical use case.
  if (maxShift >= numChildInsts) {
    forOp.emitWarning("not shifting because shifts are unrealistically large");
    return success();
  }

  // An array of operation groups sorted by shift amount; each group has all
  // operations with the same shift in the order in which they appear in the
  // body of the 'affine.for' op.
  std::vector<std::vector<Operation *>> sortedInstGroups(maxShift + 1);
  unsigned pos = 0;
  for (auto &op : *forOp.getBody()) {
    auto shift = shifts[pos++];
    sortedInstGroups[shift].push_back(&op);
  }

  // Unless the shifts have a specific pattern (which actually would be the
  // common use case), prologue and epilogue are not meaningfully defined.
  // Nevertheless, if 'unrollPrologueEpilogue' is set, we will treat the first
  // loop generated as the prologue and the last as the epilogue and unroll
  // these fully.
  AffineForOp prologue;
  AffineForOp epilogue;

  // Do a sweep over the sorted shifts while storing open groups in a
  // vector, and generating loop portions as necessary during the sweep. A
  // block of operations is paired with its shift.
  std::vector<std::pair<uint64_t, ArrayRef<Operation *>>> instGroupQueue;

  auto origLbMap = forOp.getLowerBoundMap();
  uint64_t lbShift = 0;
  OpBuilder b(forOp.getOperation());
  for (uint64_t d = 0, e = sortedInstGroups.size(); d < e; ++d) {
    // If nothing is shifted by d, continue.
    if (sortedInstGroups[d].empty())
      continue;
    if (!instGroupQueue.empty()) {
      assert(d >= 1 &&
             "Queue expected to be empty when the first block is found");
      // The interval for which the loop needs to be generated here is:
      // [lbShift, min(lbShift + tripCount, d)) and the body of the
      // loop needs to have all operations in instGroupQueue in that order.
      AffineForOp res;
      if (lbShift + tripCount * step < d * step) {
        res = generateLoop(
            b.getShiftedAffineMap(origLbMap, lbShift),
            b.getShiftedAffineMap(origLbMap, lbShift + tripCount * step),
            instGroupQueue, 0, forOp, b);
        // Entire loop for the queued op groups generated; empty the queue.
        instGroupQueue.clear();
        lbShift += tripCount * step;
      } else {
        res = generateLoop(b.getShiftedAffineMap(origLbMap, lbShift),
                           b.getShiftedAffineMap(origLbMap, d), instGroupQueue,
                           0, forOp, b);
        lbShift = d * step;
      }
      if (!prologue && res)
        prologue = res;
      epilogue = res;
    } else {
      // Start of first interval.
      lbShift = d * step;
    }
    // Augment the list of operations that get into the current open interval.
    instGroupQueue.push_back({d, sortedInstGroups[d]});
  }

  // The operation groups left in the queue now need to be processed (FIFO)
  // and their loops completed.
  for (unsigned i = 0, e = instGroupQueue.size(); i < e; ++i) {
    uint64_t ubShift = (instGroupQueue[i].first + tripCount) * step;
    epilogue = generateLoop(b.getShiftedAffineMap(origLbMap, lbShift),
                            b.getShiftedAffineMap(origLbMap, ubShift),
                            instGroupQueue, i, forOp, b);
    lbShift = ubShift;
    if (!prologue)
      prologue = epilogue;
  }

  // Erase the original for op.
  forOp.erase();

  if (unrollPrologueEpilogue && prologue)
    loopUnrollFull(prologue);
  if (unrollPrologueEpilogue && epilogue &&
      epilogue.getOperation() != prologue.getOperation())
    loopUnrollFull(epilogue);

  return success();
}

// Collects perfectly nested loops starting from `rootForOp`. Loops are
// perfectly nested if each loop is the first and only non-terminator operation
// in the parent loop. Collects at most `maxLoops` loops and appends them to
// `forOps`.
template <typename T>
void getPerfectlyNestedLoopsImpl(
    SmallVectorImpl<T> &forOps, T rootForOp,
    unsigned maxLoops = std::numeric_limits<unsigned>::max()) {
  for (unsigned i = 0; i < maxLoops; ++i) {
    forOps.push_back(rootForOp);
    // FIXME: ForOp and AffineForOp currently provide different names to access
    // the region ("region" and "getRegion"). Remove this generic access when
    // AffineForOp moves to ODS and also gets "region".
    Block &body = rootForOp.getOperation()->getRegion(0).front();
    if (body.begin() != std::prev(body.end(), 2))
      return;

    rootForOp = dyn_cast<T>(&body.front());
    if (!rootForOp)
      return;
  }
}

/// Gets the perfectly nested sequence of loops starting at the root of the
/// loop nest. A loop is perfectly nested iff the first op in its body is
/// another for op and the second op is a terminator.
void mlir::getPerfectlyNestedLoops(SmallVectorImpl<AffineForOp> &nestedLoops,
                                   AffineForOp root) {
  getPerfectlyNestedLoopsImpl(nestedLoops, root);
}

void mlir::getPerfectlyNestedLoops(SmallVectorImpl<loop::ForOp> &nestedLoops,
                                   loop::ForOp root) {
  getPerfectlyNestedLoopsImpl(nestedLoops, root);
}

/// Unrolls this loop completely.
LogicalResult mlir::loopUnrollFull(AffineForOp forOp) {
  Optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
  if (mayBeConstantTripCount.hasValue()) {
    uint64_t tripCount = mayBeConstantTripCount.getValue();
    if (tripCount == 1) {
      return promoteIfSingleIteration(forOp);
    }
    return loopUnrollByFactor(forOp, tripCount);
  }
  return failure();
}

/// Unrolls this loop by the specified factor or by the trip count (if
/// constant), whichever is lower.
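/// For example (illustrative): requesting an unroll factor of 8 on a loop with
/// a constant trip count of 5 unrolls the loop by 5.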
LogicalResult mlir::loopUnrollUpToFactor(AffineForOp forOp,
                                         uint64_t unrollFactor) {
  Optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);

  if (mayBeConstantTripCount.hasValue() &&
      mayBeConstantTripCount.getValue() < unrollFactor)
    return loopUnrollByFactor(forOp, mayBeConstantTripCount.getValue());
  return loopUnrollByFactor(forOp, unrollFactor);
}

/// Unrolls this loop by the specified factor. Returns success if the loop
/// is successfully unrolled.
LogicalResult mlir::loopUnrollByFactor(AffineForOp forOp,
                                       uint64_t unrollFactor) {
  assert(unrollFactor >= 1 && "unroll factor should be >= 1");

  if (unrollFactor == 1)
    return promoteIfSingleIteration(forOp);

  if (forOp.getBody()->empty() ||
      forOp.getBody()->begin() == std::prev(forOp.getBody()->end()))
    return failure();

  // Loops where the lower bound is a max expression aren't supported for
  // unrolling since the trip count can't be expressed as an affine function
  // when both the lower bound and the upper bound are multi-result maps.
  // However, one meaningful way to do such unrolling would be to specialize
  // the loop for the 'hotspot' case and unroll that hotspot.
  if (forOp.getLowerBoundMap().getNumResults() != 1)
    return failure();

  // If the trip count is lower than the unroll factor, there is no unrolled
  // body.
  // TODO(bondhugula): option to specify cleanup loop unrolling.
  Optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
  if (mayBeConstantTripCount.hasValue() &&
      mayBeConstantTripCount.getValue() < unrollFactor)
    return failure();

  // Generate the cleanup loop if the trip count isn't a multiple of
  // unrollFactor.
  Operation *op = forOp.getOperation();
  if (getLargestDivisorOfTripCount(forOp) % unrollFactor != 0) {
    OpBuilder builder(op->getBlock(), ++Block::iterator(op));
    auto cleanupForInst = cast<AffineForOp>(builder.clone(*op));
    AffineMap cleanupMap;
    SmallVector<Value *, 4> cleanupOperands;
    getCleanupLoopLowerBound(forOp, unrollFactor, &cleanupMap, &cleanupOperands,
                             builder);
    assert(cleanupMap &&
           "cleanup loop lower bound map for single result lower bound maps "
           "can always be determined");
    cleanupForInst.setLowerBound(cleanupOperands, cleanupMap);
    // Promote the loop body up if this has turned into a single iteration
    // loop.
    promoteIfSingleIteration(cleanupForInst);

    // Adjust the upper bound of the original loop; this is the same as the
    // lower bound of the cleanup loop.
    forOp.setUpperBound(cleanupOperands, cleanupMap);
  }

  // Scale the step of the loop being unrolled by the unroll factor.
  int64_t step = forOp.getStep();
  forOp.setStep(step * unrollFactor);

  // Builder to insert unrolled bodies just before the terminator of the body
  // of 'forOp'.
  OpBuilder builder = forOp.getBodyBuilder();

  // Keep a pointer to the last non-terminator operation in the original block
  // so that we know what to clone (since we are doing this in-place).
  Block::iterator srcBlockEnd = std::prev(forOp.getBody()->end(), 2);

  // Unroll the contents of 'forOp' (append unrollFactor-1 additional copies).
  auto *forOpIV = forOp.getInductionVar();
  for (unsigned i = 1; i < unrollFactor; i++) {
    BlockAndValueMapping operandMap;

    // If the induction variable is used, create a remapping to the value for
    // this unrolled instance.
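    // For the i-th copy the remapped IV is iv + i * step, where 'step' is the
    // original (pre-scaling) step of the loop.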
    if (!forOpIV->use_empty()) {
      // iv' = iv + 1/2/3...unrollFactor-1;
      auto d0 = builder.getAffineDimExpr(0);
      auto bumpMap = builder.getAffineMap(1, 0, {d0 + i * step});
      auto ivUnroll =
          builder.create<AffineApplyOp>(forOp.getLoc(), bumpMap, forOpIV);
      operandMap.map(forOpIV, ivUnroll);
    }

    // Clone the original body of 'forOp'.
    for (auto it = forOp.getBody()->begin(); it != std::next(srcBlockEnd);
         it++) {
      builder.clone(*it, operandMap);
    }
  }

  // Promote the loop body up if this has turned into a single iteration loop.
  promoteIfSingleIteration(forOp);
  return success();
}

/// Performs loop interchange on 'forOpA' and 'forOpB', where 'forOpB' is
/// nested within 'forOpA' as the only non-terminator operation in its block.
void mlir::interchangeLoops(AffineForOp forOpA, AffineForOp forOpB) {
  auto *forOpAInst = forOpA.getOperation();

  assert(&*forOpA.getBody()->begin() == forOpB.getOperation());
  auto &forOpABody = forOpA.getBody()->getOperations();
  auto &forOpBBody = forOpB.getBody()->getOperations();

  // 1) Splice forOpA's non-terminator operations (which is just forOpB) just
  // before forOpA (in forOpA's parent's block); this should leave forOpA's
  // body containing only the terminator.
  forOpAInst->getBlock()->getOperations().splice(Block::iterator(forOpAInst),
                                                 forOpABody, forOpABody.begin(),
                                                 std::prev(forOpABody.end()));
  // 2) Splice forOpB's non-terminator operations into the beginning of
  // forOpA's body (this leaves forOpB's body containing only the terminator).
  forOpABody.splice(forOpABody.begin(), forOpBBody, forOpBBody.begin(),
                    std::prev(forOpBBody.end()));
  // 3) Splice forOpA into the beginning of forOpB's body.
  forOpBBody.splice(forOpBBody.begin(), forOpAInst->getBlock()->getOperations(),
                    Block::iterator(forOpAInst));
}

// Checks each dependence component against the permutation to see if the
// desired loop interchange would violate dependences by making the
// dependence component lexicographically negative.
static bool checkLoopInterchangeDependences(
    const std::vector<llvm::SmallVector<DependenceComponent, 2>> &depCompsVec,
    ArrayRef<AffineForOp> loops, ArrayRef<unsigned> loopPermMap) {
  // Invert permutation map.
  unsigned maxLoopDepth = loops.size();
  llvm::SmallVector<unsigned, 4> loopPermMapInv;
  loopPermMapInv.resize(maxLoopDepth);
  for (unsigned i = 0; i < maxLoopDepth; ++i)
    loopPermMapInv[loopPermMap[i]] = i;

  // Check each dependence component against the permutation to see if the
  // desired loop interchange permutation would make the dependence vectors
  // lexicographically negative.
  // Example 1: [-1, 1][0, 0]
  // Example 2: [0, 0][-1, 1]
  for (unsigned i = 0, e = depCompsVec.size(); i < e; ++i) {
    const llvm::SmallVector<DependenceComponent, 2> &depComps = depCompsVec[i];
    assert(depComps.size() >= maxLoopDepth);
    // Check if the first non-zero dependence component is positive.
    // This iterates through loops in the desired order.
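    // For example (illustrative): a dependence with per-loop components
    // (1, -1) is lexicographically positive in the original order, but a
    // permutation that swaps the two loops makes the leading non-zero
    // component -1, so that interchange is rejected.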
    for (unsigned j = 0; j < maxLoopDepth; ++j) {
      unsigned permIndex = loopPermMapInv[j];
      assert(depComps[permIndex].lb.hasValue());
      int64_t depCompLb = depComps[permIndex].lb.getValue();
      if (depCompLb > 0)
        break;
      if (depCompLb < 0)
        return false;
    }
  }
  return true;
}

/// Checks if the loop interchange permutation 'loopPermMap' of the perfectly
/// nested sequence of loops in 'loops' would violate dependences.
bool mlir::isValidLoopInterchangePermutation(ArrayRef<AffineForOp> loops,
                                             ArrayRef<unsigned> loopPermMap) {
  // Gather dependence components for dependences between all ops in the loop
  // nest rooted at 'loops[0]', at loop depths in range [1, maxLoopDepth].
  assert(loopPermMap.size() == loops.size());
  unsigned maxLoopDepth = loops.size();
  std::vector<llvm::SmallVector<DependenceComponent, 2>> depCompsVec;
  getDependenceComponents(loops[0], maxLoopDepth, &depCompsVec);
  return checkLoopInterchangeDependences(depCompsVec, loops, loopPermMap);
}

/// Performs a sequence of loop interchanges on the perfectly nested sequence
/// of loops in 'loops', as specified by the permutation in 'loopPermMap'.
unsigned mlir::interchangeLoops(ArrayRef<AffineForOp> loops,
                                ArrayRef<unsigned> loopPermMap) {
  Optional<unsigned> loopNestRootIndex;
  for (int i = loops.size() - 1; i >= 0; --i) {
    int permIndex = static_cast<int>(loopPermMap[i]);
    // Store the index of the for loop which will be the new loop nest root.
    if (permIndex == 0)
      loopNestRootIndex = i;
    if (permIndex > i) {
      // Sink loop 'i' by 'permIndex - i' levels deeper into the loop nest.
      sinkLoop(loops[i], permIndex - i);
    }
  }
  assert(loopNestRootIndex.hasValue());
  return loopNestRootIndex.getValue();
}

// Sinks all sequential loops to the innermost levels (while preserving
// relative order among them) and moves all parallel loops to the
// outermost (while again preserving relative order among them).
AffineForOp mlir::sinkSequentialLoops(AffineForOp forOp) {
  SmallVector<AffineForOp, 4> loops;
  getPerfectlyNestedLoops(loops, forOp);
  if (loops.size() < 2)
    return forOp;

  // Gather dependence components for dependences between all ops in the loop
  // nest rooted at 'loops[0]', at loop depths in range [1, maxLoopDepth].
  unsigned maxLoopDepth = loops.size();
  std::vector<llvm::SmallVector<DependenceComponent, 2>> depCompsVec;
  getDependenceComponents(loops[0], maxLoopDepth, &depCompsVec);

  // Mark loops as either parallel or sequential.
  llvm::SmallVector<bool, 8> isParallelLoop(maxLoopDepth, true);
  for (unsigned i = 0, e = depCompsVec.size(); i < e; ++i) {
    llvm::SmallVector<DependenceComponent, 2> &depComps = depCompsVec[i];
    assert(depComps.size() >= maxLoopDepth);
    for (unsigned j = 0; j < maxLoopDepth; ++j) {
      DependenceComponent &depComp = depComps[j];
      assert(depComp.lb.hasValue() && depComp.ub.hasValue());
      if (depComp.lb.getValue() != 0 || depComp.ub.getValue() != 0)
        isParallelLoop[j] = false;
    }
  }

  // Count the number of parallel loops.
  unsigned numParallelLoops = 0;
  for (unsigned i = 0, e = isParallelLoop.size(); i < e; ++i)
    if (isParallelLoop[i])
      ++numParallelLoops;

  // Compute a permutation of the loops that sinks sequential loops (and thus
  // raises parallel loops) while preserving their relative order.
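  // For example (illustrative): for the parallel/sequential pattern
  // (parallel, sequential, parallel), numParallelLoops is 2 and the resulting
  // permutation is {0, 2, 1}: the parallel loops keep positions 0 and 1 while
  // the sequential loop moves to position 2.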
  llvm::SmallVector<unsigned, 4> loopPermMap(maxLoopDepth);
  unsigned nextSequentialLoop = numParallelLoops;
  unsigned nextParallelLoop = 0;
  for (unsigned i = 0; i < maxLoopDepth; ++i) {
    if (isParallelLoop[i]) {
      loopPermMap[i] = nextParallelLoop++;
    } else {
      loopPermMap[i] = nextSequentialLoop++;
    }
  }

  // Check if permutation 'loopPermMap' would violate dependences.
  if (!checkLoopInterchangeDependences(depCompsVec, loops, loopPermMap))
    return forOp;
  // Perform loop interchange according to permutation 'loopPermMap'.
  unsigned loopNestRootIndex = interchangeLoops(loops, loopPermMap);
  return loops[loopNestRootIndex];
}

/// Performs a series of loop interchanges to sink 'forOp' 'loopDepth' levels
/// deeper in the loop nest.
void mlir::sinkLoop(AffineForOp forOp, unsigned loopDepth) {
  for (unsigned i = 0; i < loopDepth; ++i) {
    AffineForOp nextForOp = cast<AffineForOp>(forOp.getBody()->front());
    interchangeLoops(forOp, nextForOp);
  }
}

// Factors out common behavior to add a new `iv` (resp. `iv` + `offset`) to the
// lower (resp. upper) loop bound. When called for both the lower and upper
// bounds, the resulting IR resembles:
//
// ```mlir
//    affine.for %i = max (`iv`, ...) to min (`iv` + `offset`) {
//      ...
//    }
// ```
static void augmentMapAndBounds(OpBuilder &b, Value *iv, AffineMap *map,
                                SmallVector<Value *, 4> *operands,
                                int64_t offset = 0) {
  auto bounds = llvm::to_vector<4>(map->getResults());
  bounds.push_back(b.getAffineDimExpr(map->getNumDims()) + offset);
  operands->insert(operands->begin() + map->getNumDims(), iv);
  *map = b.getAffineMap(map->getNumDims() + 1, map->getNumSymbols(), bounds);
  canonicalizeMapAndOperands(map, operands);
}

// Stripmines `forOp` by `factor` and sinks it under each of the `targets`.
// Stripmine-sink is a primitive building block for generalized tiling of
// imperfectly nested loops.
// This transformation is purely mechanical and does not check legality,
// profitability or even structural correctness. It is the user's
// responsibility to specify `targets` that are dominated by `forOp`.
// Returns the new AffineForOps, one per `targets`, nested immediately under
// each of the `targets`.
static SmallVector<AffineForOp, 8>
stripmineSink(AffineForOp forOp, uint64_t factor,
              ArrayRef<AffineForOp> targets) {
  auto originalStep = forOp.getStep();
  auto scaledStep = originalStep * factor;
  forOp.setStep(scaledStep);

  auto *op = forOp.getOperation();
  OpBuilder b(op->getBlock(), ++Block::iterator(op));

  // Lower-bound map creation.
  auto lbMap = forOp.getLowerBoundMap();
  SmallVector<Value *, 4> lbOperands(forOp.getLowerBoundOperands());
  augmentMapAndBounds(b, forOp.getInductionVar(), &lbMap, &lbOperands);

  // Upper-bound map creation.
  auto ubMap = forOp.getUpperBoundMap();
  SmallVector<Value *, 4> ubOperands(forOp.getUpperBoundOperands());
  augmentMapAndBounds(b, forOp.getInductionVar(), &ubMap, &ubOperands,
                      /*offset=*/scaledStep);

  auto *iv = forOp.getInductionVar();
  SmallVector<AffineForOp, 8> innerLoops;
  for (auto t : targets) {
    // Insert newForOp before the terminator of `t`.
    OpBuilder b = t.getBodyBuilder();
    auto newForOp = b.create<AffineForOp>(t.getLoc(), lbOperands, lbMap,
                                          ubOperands, ubMap, originalStep);
    auto begin = t.getBody()->begin();
    // Skip the terminator and `newForOp`, which is just before the terminator.
    auto nOps = t.getBody()->getOperations().size() - 2;
    newForOp.getBody()->getOperations().splice(
        newForOp.getBody()->getOperations().begin(),
        t.getBody()->getOperations(), begin, std::next(begin, nOps));
    replaceAllUsesInRegionWith(iv, newForOp.getInductionVar(),
                               newForOp.region());
    innerLoops.push_back(newForOp);
  }

  return innerLoops;
}

static Loops stripmineSink(loop::ForOp forOp, Value *factor,
                           ArrayRef<loop::ForOp> targets) {
  auto *originalStep = forOp.step();
  auto *iv = forOp.getInductionVar();

  OpBuilder b(forOp);
  forOp.setStep(b.create<MulIOp>(forOp.getLoc(), originalStep, factor));

  Loops innerLoops;
  for (auto t : targets) {
    // Save information for splicing ops out of t when done.
    auto begin = t.getBody()->begin();
    auto nOps = t.getBody()->getOperations().size();

    // Insert newForOp before the terminator of `t`.
    OpBuilder b(t.getBodyBuilder());
    Value *stepped = b.create<AddIOp>(t.getLoc(), iv, forOp.step());
    Value *less = b.create<CmpIOp>(t.getLoc(), CmpIPredicate::SLT,
                                   forOp.upperBound(), stepped);
    Value *ub =
        b.create<SelectOp>(t.getLoc(), less, forOp.upperBound(), stepped);

    // Splice [begin, begin + nOps - 1) into `newForOp` and replace uses.
    auto newForOp = b.create<loop::ForOp>(t.getLoc(), iv, ub, originalStep);
    newForOp.getBody()->getOperations().splice(
        newForOp.getBody()->getOperations().begin(),
        t.getBody()->getOperations(), begin, std::next(begin, nOps - 1));
    replaceAllUsesInRegionWith(iv, newForOp.getInductionVar(),
                               newForOp.region());

    innerLoops.push_back(newForOp);
  }

  return innerLoops;
}

// Stripmines `forOp` by `factor` and sinks it under a single `target`.
// Returns the new loop, nested immediately under `target`.
template <typename ForType, typename SizeType>
static ForType stripmineSink(ForType forOp, SizeType factor, ForType target) {
  // TODO(ntv): Use cheap structural assertions that targets are nested under
  // forOp and that targets are not nested under each other when DominanceInfo
  // exposes the capability. It seems overkill to construct a whole function
  // dominance tree at this point.
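  // Delegates to the ArrayRef-based overloads above; a single target yields
  // exactly one inner loop.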
  auto res = stripmineSink(forOp, factor, ArrayRef<ForType>{target});
  assert(res.size() == 1 && "Expected 1 inner forOp");
  return res[0];
}

template <typename ForType, typename SizeType>
static SmallVector<SmallVector<ForType, 8>, 8>
tileImpl(ArrayRef<ForType> forOps, ArrayRef<SizeType> sizes,
         ArrayRef<ForType> targets) {
  SmallVector<SmallVector<ForType, 8>, 8> res;
  SmallVector<ForType, 8> currentTargets(targets.begin(), targets.end());
  for (auto it : llvm::zip(forOps, sizes)) {
    auto step = stripmineSink(std::get<0>(it), std::get<1>(it), currentTargets);
    res.push_back(step);
    currentTargets = step;
  }
  return res;
}

SmallVector<SmallVector<AffineForOp, 8>, 8>
mlir::tile(ArrayRef<AffineForOp> forOps, ArrayRef<uint64_t> sizes,
           ArrayRef<AffineForOp> targets) {
  return tileImpl(forOps, sizes, targets);
}

SmallVector<Loops, 8> mlir::tile(ArrayRef<loop::ForOp> forOps,
                                 ArrayRef<Value *> sizes,
                                 ArrayRef<loop::ForOp> targets) {
  return tileImpl(forOps, sizes, targets);
}

template <typename ForType, typename SizeType>
static SmallVector<ForType, 8>
tileImpl(ArrayRef<ForType> forOps, ArrayRef<SizeType> sizes, ForType target) {
  SmallVector<ForType, 8> res;
  for (auto loops : tile(forOps, sizes, ArrayRef<ForType>{target})) {
    assert(loops.size() == 1);
    res.push_back(loops[0]);
  }
  return res;
}

SmallVector<AffineForOp, 8> mlir::tile(ArrayRef<AffineForOp> forOps,
                                       ArrayRef<uint64_t> sizes,
                                       AffineForOp target) {
  return tileImpl(forOps, sizes, target);
}

Loops mlir::tile(ArrayRef<loop::ForOp> forOps, ArrayRef<Value *> sizes,
                 loop::ForOp target) {
  return tileImpl(forOps, sizes, target);
}

Loops mlir::tilePerfectlyNested(loop::ForOp rootForOp,
                                ArrayRef<Value *> sizes) {
  // Collect perfectly nested loops. If more size values are provided than
  // there are nested loops, truncate `sizes`.
  SmallVector<loop::ForOp, 4> forOps;
  forOps.reserve(sizes.size());
  getPerfectlyNestedLoopsImpl(forOps, rootForOp, sizes.size());
  if (forOps.size() < sizes.size())
    sizes = sizes.take_front(forOps.size());

  return ::tile(forOps, sizes, forOps.back());
}

// Builds the IR that performs ceil division of a positive value by a constant:
//    ceildiv(a, B) = divis(a + (B - 1), B)
// where divis is rounding-to-zero division.
static Value *ceilDivPositive(OpBuilder &builder, Location loc, Value *dividend,
                              int64_t divisor) {
  assert(divisor > 0 && "expected positive divisor");
  assert(dividend->getType().isIndex() && "expected index-typed value");

  Value *divisorMinusOneCst = builder.create<ConstantIndexOp>(loc, divisor - 1);
  Value *divisorCst = builder.create<ConstantIndexOp>(loc, divisor);
  Value *sum = builder.create<AddIOp>(loc, dividend, divisorMinusOneCst);
  return builder.create<DivISOp>(loc, sum, divisorCst);
}

// Builds the IR that performs ceil division of a positive value by another
// positive value:
//    ceildiv(a, b) = divis(a + (b - 1), b)
// where divis is rounding-to-zero division.
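// For example (illustrative): ceildiv(10, 3) = divis(10 + 2, 3) = 4.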
static Value *ceilDivPositive(OpBuilder &builder, Location loc, Value *dividend,
                              Value *divisor) {
  assert(dividend->getType().isIndex() && "expected index-typed value");

  Value *cstOne = builder.create<ConstantIndexOp>(loc, 1);
  Value *divisorMinusOne = builder.create<SubIOp>(loc, divisor, cstOne);
  Value *sum = builder.create<AddIOp>(loc, dividend, divisorMinusOne);
  return builder.create<DivISOp>(loc, sum, divisor);
}

// Hoists the ops within `outer` that appear before `inner`.
// Such ops include the ops that have been introduced by parametric tiling.
// Ops that come from triangular loops (i.e. that belong to the program slice
// rooted at `outer`) and ops that have side effects cannot be hoisted.
// Returns failure when any op fails to hoist.
static LogicalResult hoistOpsBetween(loop::ForOp outer, loop::ForOp inner) {
  SetVector<Operation *> forwardSlice;
  getForwardSlice(outer.getOperation(), &forwardSlice, [&inner](Operation *op) {
    return op != inner.getOperation();
  });
  LogicalResult status = success();
  SmallVector<Operation *, 8> toHoist;
  for (auto &op : outer.getBody()->getOperations()) {
    // Stop when encountering the inner loop.
    if (&op == inner.getOperation())
      break;
    // Skip over non-hoistable ops.
    if (forwardSlice.count(&op) > 0) {
      status = failure();
      continue;
    }
    // Skip loop::ForOp, these are not considered a failure.
    if (isa<loop::ForOp>(&op))
      continue;
    // Skip other ops with regions.
    if (op.getNumRegions() > 0) {
      status = failure();
      continue;
    }
    // Skip if op has side effects.
    // TODO(ntv): loads to immutable memory regions are ok.
    if (!op.hasNoSideEffect()) {
      status = failure();
      continue;
    }
    toHoist.push_back(&op);
  }
  auto *outerForOp = outer.getOperation();
  for (auto *op : toHoist)
    op->moveBefore(outerForOp);
  return status;
}

// Traverses the interTile and intraTile loops and tries to hoist ops such that
// bands of perfectly nested loops are isolated.
// Returns failure if either perfect interTile or perfect intraTile bands
// cannot be formed.
static LogicalResult tryIsolateBands(const TileLoops &tileLoops) {
  LogicalResult status = success();
  auto &interTile = tileLoops.first;
  auto &intraTile = tileLoops.second;
  auto size = interTile.size();
  assert(size == intraTile.size());
  if (size <= 1)
    return success();
  for (unsigned s = 1; s < size; ++s)
    status = succeeded(status) ? hoistOpsBetween(intraTile[0], intraTile[s])
                               : failure();
  for (unsigned s = 1; s < size; ++s)
    status = succeeded(status) ? hoistOpsBetween(interTile[0], interTile[s])
                               : failure();
  return status;
}

TileLoops mlir::extractFixedOuterLoops(loop::ForOp rootForOp,
                                       ArrayRef<int64_t> sizes) {
  // Collect perfectly nested loops. If more size values are provided than
  // there are nested loops, truncate `sizes`.
  SmallVector<loop::ForOp, 4> forOps;
  forOps.reserve(sizes.size());
  getPerfectlyNestedLoopsImpl(forOps, rootForOp, sizes.size());
  if (forOps.size() < sizes.size())
    sizes = sizes.take_front(forOps.size());

  // Compute the tile sizes such that the i-th outer loop executes size[i]
  // iterations. Given that the loop currently executes
  //   numIterations = ceildiv((upperBound - lowerBound), step)
  // iterations, we need to tile with size ceildiv(numIterations, size[i]).
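  // For example (illustrative): a loop with 100 iterations and sizes[i] = 8 is
  // strip-mined with a tile size of ceildiv(100, 8) = 13, so the resulting
  // outer loop executes ceildiv(100, 13) = 8 iterations.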
  SmallVector<Value *, 4> tileSizes;
  tileSizes.reserve(sizes.size());
  for (unsigned i = 0, e = sizes.size(); i < e; ++i) {
    assert(sizes[i] > 0 && "expected strictly positive size for strip-mining");

    auto forOp = forOps[i];
    OpBuilder builder(forOp);
    auto loc = forOp.getLoc();
    Value *diff =
        builder.create<SubIOp>(loc, forOp.upperBound(), forOp.lowerBound());
    Value *numIterations = ceilDivPositive(builder, loc, diff, forOp.step());
    Value *iterationsPerBlock =
        ceilDivPositive(builder, loc, numIterations, sizes[i]);
    tileSizes.push_back(iterationsPerBlock);
  }

  // Call parametric tiling with the given sizes.
  auto intraTile = tile(forOps, tileSizes, forOps.back());
  TileLoops tileLoops = std::make_pair(forOps, intraTile);

  // TODO(ntv, zinenko): for now we just ignore the result of band isolation.
  // In the future, mapping decisions may be impacted by the ability to
  // isolate perfectly nested bands.
  tryIsolateBands(tileLoops);

  return tileLoops;
}

// Replaces all uses of `orig` with `replacement` except if the user is listed
// in `exceptions`.
static void
replaceAllUsesExcept(Value *orig, Value *replacement,
                     const SmallPtrSetImpl<Operation *> &exceptions) {
  for (auto &use : orig->getUses()) {
    if (exceptions.count(use.getOwner()) == 0)
      use.set(replacement);
  }
}

// Transforms a loop with a strictly positive step
//   for %i = %lb to %ub step %s
// into a 0-based loop with step 1
//   for %ii = 0 to ceildiv(%ub - %lb, %s) step 1 {
//     %i = %ii * %s + %lb
//   }
// Inserts the induction variable remapping in the body of `inner`, which is
// expected to be either `loop` or another loop perfectly nested under `loop`.
// Inserts the definition of the new bounds immediately before `outer`, which
// is expected to be either `loop` or its parent in the loop nest.
static void normalizeLoop(loop::ForOp loop, loop::ForOp outer,
                          loop::ForOp inner) {
  OpBuilder builder(outer);
  Location loc = loop.getLoc();

  // Check if the loop is already known to have a constant zero lower bound or
  // a constant one step.
  bool isZeroBased = false;
  if (auto lbCst =
          dyn_cast_or_null<ConstantIndexOp>(loop.lowerBound()->getDefiningOp()))
    isZeroBased = lbCst.getValue() == 0;

  bool isStepOne = false;
  if (auto stepCst =
          dyn_cast_or_null<ConstantIndexOp>(loop.step()->getDefiningOp()))
    isStepOne = stepCst.getValue() == 1;

  if (isZeroBased && isStepOne)
    return;

  // Compute the number of iterations the loop executes: ceildiv(ub - lb, step)
  // assuming the step is strictly positive. Update the bounds and the step
  // of the loop to go from 0 to the number of iterations, if necessary.
  // TODO(zinenko): introduce support for negative steps or emit dynamic
  // asserts on step positivity, whatever gets implemented first.
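  // For example (illustrative): "for %i = %c4 to %c20 step %c2" becomes
  // "for %ii = %c0 to %c8 step %c1", with %i recomputed in the body of `inner`
  // as %ii * %c2 + %c4.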
  Value *diff =
      builder.create<SubIOp>(loc, loop.upperBound(), loop.lowerBound());
  Value *numIterations = ceilDivPositive(builder, loc, diff, loop.step());
  loop.setUpperBound(numIterations);

  Value *lb = loop.lowerBound();
  if (!isZeroBased) {
    Value *cst0 = builder.create<ConstantIndexOp>(loc, 0);
    loop.setLowerBound(cst0);
  }

  Value *step = loop.step();
  if (!isStepOne) {
    Value *cst1 = builder.create<ConstantIndexOp>(loc, 1);
    loop.setStep(cst1);
  }

  // Insert code computing the value of the original loop induction variable
  // from the "normalized" one.
  builder.setInsertionPointToStart(inner.getBody());
  Value *scaled =
      isStepOne ? loop.getInductionVar()
                : builder.create<MulIOp>(loc, loop.getInductionVar(), step);
  Value *shifted =
      isZeroBased ? scaled : builder.create<AddIOp>(loc, scaled, lb);

  SmallPtrSet<Operation *, 2> preserve{scaled->getDefiningOp(),
                                       shifted->getDefiningOp()};
  replaceAllUsesExcept(loop.getInductionVar(), shifted, preserve);
}

void mlir::coalesceLoops(MutableArrayRef<loop::ForOp> loops) {
  if (loops.size() < 2)
    return;

  loop::ForOp innermost = loops.back();
  loop::ForOp outermost = loops.front();

  // 1. Make sure all loops iterate from 0 to upperBound with step 1. This
  // allows the following code to assume upperBound is the number of
  // iterations.
  for (auto loop : loops)
    normalizeLoop(loop, outermost, innermost);

  // 2. Emit code computing the upper bound of the coalesced loop as product
  // of the number of iterations of all loops.
  OpBuilder builder(outermost);
  Location loc = outermost.getLoc();
  Value *upperBound = outermost.upperBound();
  for (auto loop : loops.drop_front())
    upperBound = builder.create<MulIOp>(loc, upperBound, loop.upperBound());
  outermost.setUpperBound(upperBound);

  builder.setInsertionPointToStart(outermost.getBody());

  // 3. Remap induction variables. For each original loop, the value of the
  // induction variable can be obtained by dividing the induction variable of
  // the linearized loop by the total number of iterations of the loops nested
  // in it modulo the number of iterations in this loop (remove the values
  // related to the outer loops):
  //   iv_i = floordiv(iv_linear, product-of-loop-ranges-until-i) mod range_i.
  // Compute these iteratively from the innermost loop by creating a "running
  // quotient" of division by the range.
  Value *previous = outermost.getInductionVar();
  for (unsigned i = 0, e = loops.size(); i < e; ++i) {
    unsigned idx = loops.size() - i - 1;
    if (i != 0)
      previous =
          builder.create<DivISOp>(loc, previous, loops[idx + 1].upperBound());

    Value *iv = (i == e - 1) ? previous
                             : builder.create<RemISOp>(loc, previous,
                                                       loops[idx].upperBound());
    replaceAllUsesInRegionWith(loops[idx].getInductionVar(), iv,
                               loops.back().region());
  }

  // 4. Move the operations from the innermost just above the second-outermost
  // loop, delete the extra terminator and the second-outermost loop.
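  // Erasing the second-outermost loop also takes the now-empty intermediate
  // loops, which are still nested inside it, along with it.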
  loop::ForOp second = loops[1];
  innermost.getBody()->back().erase();
  outermost.getBody()->getOperations().splice(
      Block::iterator(second.getOperation()),
      innermost.getBody()->getOperations());
  second.erase();
}

void mlir::mapLoopToProcessorIds(loop::ForOp forOp,
                                 ArrayRef<Value *> processorId,
                                 ArrayRef<Value *> numProcessors) {
  assert(processorId.size() == numProcessors.size());
  if (processorId.empty())
    return;

  OpBuilder b(forOp);
  Location loc(forOp.getLoc());
  Value *mul = processorId.front();
  for (unsigned i = 1, e = processorId.size(); i < e; ++i)
    mul = b.create<AddIOp>(loc, b.create<MulIOp>(loc, mul, numProcessors[i]),
                           processorId[i]);
  Value *lb = b.create<AddIOp>(loc, forOp.lowerBound(), mul);
  forOp.setLowerBound(lb);

  Value *step = numProcessors.front();
  for (auto *numProcs : numProcessors.drop_front())
    step = b.create<MulIOp>(loc, step, numProcs);
  forOp.setStep(step);
}