github.com/johnnyeven/libtools@v0.0.0-20191126065708-61829c1adf46/third_party/mlir/lib/Transforms/Utils/LoopUtils.cpp

     1  //===- LoopUtils.cpp ---- Misc utilities for loop transformation ----------===//
     2  //
     3  // Copyright 2019 The MLIR Authors.
     4  //
     5  // Licensed under the Apache License, Version 2.0 (the "License");
     6  // you may not use this file except in compliance with the License.
     7  // You may obtain a copy of the License at
     8  //
     9  //   http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  // =============================================================================
    17  //
    18  // This file implements miscellaneous loop transformation routines.
    19  //
    20  //===----------------------------------------------------------------------===//
    21  
    22  #include "mlir/Transforms/LoopUtils.h"
    23  
    24  #include "mlir/Analysis/AffineAnalysis.h"
    25  #include "mlir/Analysis/AffineStructures.h"
    26  #include "mlir/Analysis/LoopAnalysis.h"
    27  #include "mlir/Analysis/SliceAnalysis.h"
    28  #include "mlir/Dialect/AffineOps/AffineOps.h"
    29  #include "mlir/Dialect/LoopOps/LoopOps.h"
    30  #include "mlir/Dialect/StandardOps/Ops.h"
    31  #include "mlir/IR/AffineExpr.h"
    32  #include "mlir/IR/AffineMap.h"
    33  #include "mlir/IR/BlockAndValueMapping.h"
    34  #include "mlir/IR/Builders.h"
    35  #include "mlir/IR/Function.h"
    36  #include "mlir/IR/Module.h"
    37  #include "mlir/IR/Operation.h"
    38  #include "mlir/Transforms/RegionUtils.h"
    39  #include "llvm/ADT/DenseMap.h"
    40  #include "llvm/ADT/SetVector.h"
    41  #include "llvm/ADT/SmallPtrSet.h"
    42  #include "llvm/Support/Debug.h"
    43  
    44  #define DEBUG_TYPE "LoopUtils"
    45  
    46  using namespace mlir;
    47  using llvm::SetVector;
    48  
    49  /// Computes the cleanup loop lower bound of the loop being unrolled with
     50  /// the specified unroll factor; this bound is also the upper bound of the main
    51  /// part of the unrolled loop. Computes the bound as an AffineMap with its
    52  /// operands or a null map when the trip count can't be expressed as an affine
    53  /// expression.
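         /// For instance (an illustrative sketch), for `affine.for %i = 0 to %N`
         /// unrolled by a factor of 4, the trip count is %N and the bound computed
         /// here is %N - %N mod 4: the main unrolled loop covers
         /// [0, %N - %N mod 4) and the cleanup loop covers the remaining iterations.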
    54  void mlir::getCleanupLoopLowerBound(AffineForOp forOp, unsigned unrollFactor,
    55                                      AffineMap *map,
    56                                      SmallVectorImpl<Value *> *operands,
    57                                      OpBuilder &b) {
    58    auto lbMap = forOp.getLowerBoundMap();
    59  
    60    // Single result lower bound map only.
    61    if (lbMap.getNumResults() != 1) {
    62      *map = AffineMap();
    63      return;
    64    }
    65  
    66    AffineMap tripCountMap;
    67    SmallVector<Value *, 4> tripCountOperands;
    68    buildTripCountMapAndOperands(forOp, &tripCountMap, &tripCountOperands);
    69  
    70    // Sometimes the trip count cannot be expressed as an affine expression.
    71    if (!tripCountMap) {
    72      *map = AffineMap();
    73      return;
    74    }
    75  
    76    unsigned step = forOp.getStep();
    77  
    78    SmallVector<Value *, 4> lbOperands(forOp.getLowerBoundOperands());
    79    auto lb = b.create<AffineApplyOp>(forOp.getLoc(), lbMap, lbOperands);
    80  
     81    // For each upper bound expr, get the range.
     82    // E.g., for affine.for %i = lb to min (ub1, ub2),
     83    // where the trip count exprs yield (tr1, tr2), we create affine.apply ops:
     84    // lb + (tr1 - tr1 % ufactor) * step, lb + (tr2 - tr2 % ufactor) * step;
     85    // the results of all these affine.apply ops make up the cleanup loop lb.
    86    SmallVector<AffineExpr, 4> bumpExprs(tripCountMap.getNumResults());
    87    SmallVector<Value *, 4> bumpValues(tripCountMap.getNumResults());
    88    for (unsigned i = 0, e = tripCountMap.getNumResults(); i < e; i++) {
    89      auto tripCountExpr = tripCountMap.getResult(i);
    90      bumpExprs[i] = (tripCountExpr - tripCountExpr % unrollFactor) * step;
    91      auto bumpMap = b.getAffineMap(tripCountMap.getNumDims(),
    92                                    tripCountMap.getNumSymbols(), bumpExprs[i]);
    93      bumpValues[i] =
    94          b.create<AffineApplyOp>(forOp.getLoc(), bumpMap, tripCountOperands);
    95    }
    96  
    97    SmallVector<AffineExpr, 4> newUbExprs(tripCountMap.getNumResults());
    98    for (unsigned i = 0, e = bumpExprs.size(); i < e; i++)
    99      newUbExprs[i] = b.getAffineDimExpr(0) + b.getAffineDimExpr(i + 1);
   100  
   101    operands->clear();
   102    operands->push_back(lb);
   103    operands->append(bumpValues.begin(), bumpValues.end());
   104    *map = b.getAffineMap(1 + tripCountMap.getNumResults(), 0, newUbExprs);
   105    // Simplify the map + operands.
   106    fullyComposeAffineMapAndOperands(map, operands);
   107    *map = simplifyAffineMap(*map);
   108    canonicalizeMapAndOperands(map, operands);
   109    // Remove any affine.apply's that became dead from the simplification above.
   110    for (auto *v : bumpValues) {
   111      if (v->use_empty()) {
   112        v->getDefiningOp()->erase();
   113      }
   114    }
   115    if (lb.use_empty())
   116      lb.erase();
   117  }
   118  
   119  /// Promotes the loop body of a forOp to its containing block if the forOp
    120  /// is known to have a single iteration.
   121  // TODO(bondhugula): extend this for arbitrary affine bounds.
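         // For example (illustrative), `affine.for %i = 5 to 6` has a trip count of
         // 1: its body moves into the enclosing block, with all uses of %i replaced
         // by a `constant 5 : index` materialized at the top of the function.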
   122  LogicalResult mlir::promoteIfSingleIteration(AffineForOp forOp) {
   123    Optional<uint64_t> tripCount = getConstantTripCount(forOp);
   124    if (!tripCount.hasValue() || tripCount.getValue() != 1)
   125      return failure();
   126  
   127    // TODO(mlir-team): there is no builder for a max.
   128    if (forOp.getLowerBoundMap().getNumResults() != 1)
   129      return failure();
   130  
    131    // Replace all uses of the IV with its single-iteration value.
   132    auto *iv = forOp.getInductionVar();
   133    Operation *op = forOp.getOperation();
   134    if (!iv->use_empty()) {
   135      if (forOp.hasConstantLowerBound()) {
   136        OpBuilder topBuilder(op->getParentOfType<FuncOp>().getBody());
   137        auto constOp = topBuilder.create<ConstantIndexOp>(
   138            forOp.getLoc(), forOp.getConstantLowerBound());
   139        iv->replaceAllUsesWith(constOp);
   140      } else {
   141        AffineBound lb = forOp.getLowerBound();
   142        SmallVector<Value *, 4> lbOperands(lb.operand_begin(), lb.operand_end());
   143        OpBuilder builder(op->getBlock(), Block::iterator(op));
   144        if (lb.getMap() == builder.getDimIdentityMap()) {
    145        // No need to generate an affine.apply.
   146          iv->replaceAllUsesWith(lbOperands[0]);
   147        } else {
   148          auto affineApplyOp = builder.create<AffineApplyOp>(
   149              op->getLoc(), lb.getMap(), lbOperands);
   150          iv->replaceAllUsesWith(affineApplyOp);
   151        }
   152      }
   153    }
    154    // Move the loop body operations, except for the terminator, to the loop's
   155    // containing block.
   156    auto *block = op->getBlock();
   157    forOp.getBody()->getOperations().back().erase();
   158    block->getOperations().splice(Block::iterator(op),
   159                                  forOp.getBody()->getOperations());
   160    forOp.erase();
   161    return success();
   162  }
   163  
    164  /// Promotes all single-iteration 'affine.for' ops in the FuncOp, i.e., moves
    165  /// their bodies into their containing blocks.
   166  void mlir::promoteSingleIterationLoops(FuncOp f) {
    167    // Walk all 'affine.for' ops and promote any with a single iteration.
   168    f.walk([](AffineForOp forOp) { promoteIfSingleIteration(forOp); });
   169  }
   170  
    171  /// Generates an 'affine.for' op with the specified lower and upper bounds
   172  /// while generating the right IV remappings for the shifted operations. The
   173  /// operation blocks that go into the loop are specified in instGroupQueue
   174  /// starting from the specified offset, and in that order; the first element of
   175  /// the pair specifies the shift applied to that group of operations; note
   176  /// that the shift is multiplied by the loop step before being applied. Returns
    177  // a null AffineForOp if the generated loop simplifies to a single iteration one.
   178  static AffineForOp
   179  generateLoop(AffineMap lbMap, AffineMap ubMap,
   180               const std::vector<std::pair<uint64_t, ArrayRef<Operation *>>>
   181                   &instGroupQueue,
   182               unsigned offset, AffineForOp srcForInst, OpBuilder b) {
   183    SmallVector<Value *, 4> lbOperands(srcForInst.getLowerBoundOperands());
   184    SmallVector<Value *, 4> ubOperands(srcForInst.getUpperBoundOperands());
   185  
   186    assert(lbMap.getNumInputs() == lbOperands.size());
   187    assert(ubMap.getNumInputs() == ubOperands.size());
   188  
   189    auto loopChunk =
   190        b.create<AffineForOp>(srcForInst.getLoc(), lbOperands, lbMap, ubOperands,
   191                              ubMap, srcForInst.getStep());
   192    auto *loopChunkIV = loopChunk.getInductionVar();
   193    auto *srcIV = srcForInst.getInductionVar();
   194  
   195    BlockAndValueMapping operandMap;
   196  
   197    OpBuilder bodyBuilder = loopChunk.getBodyBuilder();
   198    for (auto it = instGroupQueue.begin() + offset, e = instGroupQueue.end();
   199         it != e; ++it) {
   200      uint64_t shift = it->first;
   201      auto insts = it->second;
   202      // All 'same shift' operations get added with their operands being
    203    // remapped to results of cloned operations, and their IV uses remapped.
   204      // Generate the remapping if the shift is not zero: remappedIV = newIV -
   205      // shift.
   206      if (!srcIV->use_empty() && shift != 0) {
   207        auto ivRemap = bodyBuilder.create<AffineApplyOp>(
   208            srcForInst.getLoc(),
   209            bodyBuilder.getSingleDimShiftAffineMap(
   210                -static_cast<int64_t>(srcForInst.getStep() * shift)),
   211            loopChunkIV);
   212        operandMap.map(srcIV, ivRemap);
   213      } else {
   214        operandMap.map(srcIV, loopChunkIV);
   215      }
   216      for (auto *op : insts) {
   217        if (!isa<AffineTerminatorOp>(op))
   218          bodyBuilder.clone(*op, operandMap);
   219      }
    220    }
   221    if (succeeded(promoteIfSingleIteration(loopChunk)))
   222      return AffineForOp();
   223    return loopChunk;
   224  }
   225  
    226  /// Skew the operations in the body of an 'affine.for' operation with the
   227  /// specified operation-wise shifts. The shifts are with respect to the
   228  /// original execution order, and are multiplied by the loop 'step' before being
   229  /// applied. A shift of zero for each operation will lead to no change.
   230  // The skewing of operations with respect to one another can be used for
   231  // example to allow overlap of asynchronous operations (such as DMA
   232  // communication) with computation, or just relative shifting of operations
   233  // for better register reuse, locality or parallelism. As such, the shifts are
   234  // typically expected to be at most of the order of the number of operations.
   235  // This method should not be used as a substitute for loop distribution/fission.
    236  // This method uses an algorithm that runs in time linear in the number of
    237  // operations in the body of the for loop (using the 'sweep line' paradigm).
    238  // This method asserts preservation of SSA dominance; checking that, as well
    239  // as that memory-based dependences are preserved, rests with the users of
    240  // this method.
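         // For instance (an illustrative sketch): with a two-op body, shifts [0, 1],
         // unit step, and constant trip count N, this produces a prologue loop
         // executing only op0 for the first iteration, a steady-state loop executing
         // op0 and op1 (with op1's IV remapped to iv - 1), and an epilogue loop
         // executing only op1 for the last iteration.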
   241  LogicalResult mlir::instBodySkew(AffineForOp forOp, ArrayRef<uint64_t> shifts,
   242                                   bool unrollPrologueEpilogue) {
   243    if (forOp.getBody()->begin() == std::prev(forOp.getBody()->end()))
   244      return success();
   245  
   246    // If the trip counts aren't constant, we would need versioning and
    247    // conditional guards (or context information to prevent such versioning). A
    248    // better way to pipeline such loops is to first tile them and extract
   249    // constant trip count "full tiles" before applying this.
   250    auto mayBeConstTripCount = getConstantTripCount(forOp);
   251    if (!mayBeConstTripCount.hasValue()) {
   252      LLVM_DEBUG(forOp.emitRemark("non-constant trip count loop not handled"));
   253      return success();
   254    }
   255    uint64_t tripCount = mayBeConstTripCount.getValue();
   256  
   257    assert(isInstwiseShiftValid(forOp, shifts) &&
   258           "shifts will lead to an invalid transformation\n");
   259  
   260    int64_t step = forOp.getStep();
   261  
   262    unsigned numChildInsts = forOp.getBody()->getOperations().size();
   263  
   264    // Do a linear time (counting) sort for the shifts.
   265    uint64_t maxShift = 0;
   266    for (unsigned i = 0; i < numChildInsts; i++) {
   267      maxShift = std::max(maxShift, shifts[i]);
   268    }
   269    // Such large shifts are not the typical use case.
   270    if (maxShift >= numChildInsts) {
   271      forOp.emitWarning("not shifting because shifts are unrealistically large");
   272      return success();
   273    }
   274  
   275    // An array of operation groups sorted by shift amount; each group has all
   276    // operations with the same shift in the order in which they appear in the
   277    // body of the 'affine.for' op.
   278    std::vector<std::vector<Operation *>> sortedInstGroups(maxShift + 1);
   279    unsigned pos = 0;
   280    for (auto &op : *forOp.getBody()) {
   281      auto shift = shifts[pos++];
   282      sortedInstGroups[shift].push_back(&op);
   283    }
   284  
   285    // Unless the shifts have a specific pattern (which actually would be the
   286    // common use case), prologue and epilogue are not meaningfully defined.
   287    // Nevertheless, if 'unrollPrologueEpilogue' is set, we will treat the first
   288    // loop generated as the prologue and the last as epilogue and unroll these
   289    // fully.
   290    AffineForOp prologue;
   291    AffineForOp epilogue;
   292  
   293    // Do a sweep over the sorted shifts while storing open groups in a
   294    // vector, and generating loop portions as necessary during the sweep. A block
   295    // of operations is paired with its shift.
   296    std::vector<std::pair<uint64_t, ArrayRef<Operation *>>> instGroupQueue;
   297  
   298    auto origLbMap = forOp.getLowerBoundMap();
   299    uint64_t lbShift = 0;
   300    OpBuilder b(forOp.getOperation());
   301    for (uint64_t d = 0, e = sortedInstGroups.size(); d < e; ++d) {
   302      // If nothing is shifted by d, continue.
   303      if (sortedInstGroups[d].empty())
   304        continue;
   305      if (!instGroupQueue.empty()) {
   306        assert(d >= 1 &&
   307               "Queue expected to be empty when the first block is found");
   308        // The interval for which the loop needs to be generated here is:
   309        // [lbShift, min(lbShift + tripCount, d)) and the body of the
   310        // loop needs to have all operations in instQueue in that order.
   311        AffineForOp res;
   312        if (lbShift + tripCount * step < d * step) {
   313          res = generateLoop(
   314              b.getShiftedAffineMap(origLbMap, lbShift),
   315              b.getShiftedAffineMap(origLbMap, lbShift + tripCount * step),
   316              instGroupQueue, 0, forOp, b);
   317          // Entire loop for the queued op groups generated, empty it.
   318          instGroupQueue.clear();
   319          lbShift += tripCount * step;
   320        } else {
   321          res = generateLoop(b.getShiftedAffineMap(origLbMap, lbShift),
   322                             b.getShiftedAffineMap(origLbMap, d), instGroupQueue,
   323                             0, forOp, b);
   324          lbShift = d * step;
   325        }
   326        if (!prologue && res)
   327          prologue = res;
   328        epilogue = res;
   329      } else {
   330        // Start of first interval.
   331        lbShift = d * step;
   332      }
   333      // Augment the list of operations that get into the current open interval.
   334      instGroupQueue.push_back({d, sortedInstGroups[d]});
   335    }
   336  
    337    // Those operation groups left in the queue now need to be processed (FIFO)
   338    // and their loops completed.
   339    for (unsigned i = 0, e = instGroupQueue.size(); i < e; ++i) {
   340      uint64_t ubShift = (instGroupQueue[i].first + tripCount) * step;
   341      epilogue = generateLoop(b.getShiftedAffineMap(origLbMap, lbShift),
   342                              b.getShiftedAffineMap(origLbMap, ubShift),
   343                              instGroupQueue, i, forOp, b);
   344      lbShift = ubShift;
   345      if (!prologue)
   346        prologue = epilogue;
   347    }
   348  
   349    // Erase the original for op.
   350    forOp.erase();
   351  
   352    if (unrollPrologueEpilogue && prologue)
   353      loopUnrollFull(prologue);
    354    if (unrollPrologueEpilogue && epilogue &&
    355        epilogue.getOperation() != prologue.getOperation())
   356      loopUnrollFull(epilogue);
   357  
   358    return success();
   359  }
   360  
    361  // Collect perfectly nested loops starting from `rootForOp`.  Loops are
   362  // perfectly nested if each loop is the first and only non-terminator operation
   363  // in the parent loop.  Collect at most `maxLoops` loops and append them to
   364  // `forOps`.
   365  template <typename T>
   366  void getPerfectlyNestedLoopsImpl(
   367      SmallVectorImpl<T> &forOps, T rootForOp,
   368      unsigned maxLoops = std::numeric_limits<unsigned>::max()) {
   369    for (unsigned i = 0; i < maxLoops; ++i) {
   370      forOps.push_back(rootForOp);
   371      // FIXME: ForOp and AffineForOp currently provide different names to access
   372      // the region ("region" and "getRegion").  Remove this generic access when
   373      // AffineForOp moves to ODS and also gets "region".
   374      Block &body = rootForOp.getOperation()->getRegion(0).front();
   375      if (body.begin() != std::prev(body.end(), 2))
   376        return;
   377  
   378      rootForOp = dyn_cast<T>(&body.front());
   379      if (!rootForOp)
   380        return;
   381    }
   382  }
   383  
    384  /// Get the perfectly nested sequence of loops starting at the root of the
    385  /// loop nest. A loop is perfectly nested iff the first op in the loop's
    386  /// body is another AffineForOp, and the second op is the terminator
    387  /// (i.e., the body contains exactly those two operations).
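         /// E.g., given
         ///   affine.for %i ... { affine.for %j ... { affine.for %k ... { ops } } }
         /// where each body holds only the next loop and the terminator, this
         /// collects the %i, %j, and %k loops, in that order.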
   388  void mlir::getPerfectlyNestedLoops(SmallVectorImpl<AffineForOp> &nestedLoops,
   389                                     AffineForOp root) {
   390    getPerfectlyNestedLoopsImpl(nestedLoops, root);
   391  }
   392  
   393  void mlir::getPerfectlyNestedLoops(SmallVectorImpl<loop::ForOp> &nestedLoops,
   394                                     loop::ForOp root) {
   395    getPerfectlyNestedLoopsImpl(nestedLoops, root);
   396  }
   397  
   398  /// Unrolls this loop completely.
   399  LogicalResult mlir::loopUnrollFull(AffineForOp forOp) {
   400    Optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
   401    if (mayBeConstantTripCount.hasValue()) {
   402      uint64_t tripCount = mayBeConstantTripCount.getValue();
   403      if (tripCount == 1) {
   404        return promoteIfSingleIteration(forOp);
   405      }
   406      return loopUnrollByFactor(forOp, tripCount);
   407    }
   408    return failure();
   409  }
   410  
    411  /// Unrolls this loop by the specified factor or by the trip count (if
    412  /// constant), whichever is lower.
   413  LogicalResult mlir::loopUnrollUpToFactor(AffineForOp forOp,
   414                                           uint64_t unrollFactor) {
   415    Optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
   416  
   417    if (mayBeConstantTripCount.hasValue() &&
   418        mayBeConstantTripCount.getValue() < unrollFactor)
   419      return loopUnrollByFactor(forOp, mayBeConstantTripCount.getValue());
   420    return loopUnrollByFactor(forOp, unrollFactor);
   421  }
   422  
   423  /// Unrolls this loop by the specified factor. Returns success if the loop
   424  /// is successfully unrolled.
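         /// For instance (an illustrative sketch), unrolling `affine.for %i = 0 to 9`
         /// by 2 yields a main loop `affine.for %i = 0 to 8 step 2` executing the
         /// original body at %i and %i + 1, plus a cleanup loop `affine.for %i = 8 to
         /// 9` for the leftover iteration (promoted away as a single-iteration loop).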
   425  LogicalResult mlir::loopUnrollByFactor(AffineForOp forOp,
   426                                         uint64_t unrollFactor) {
   427    assert(unrollFactor >= 1 && "unroll factor should be >= 1");
   428  
   429    if (unrollFactor == 1)
   430      return promoteIfSingleIteration(forOp);
   431  
   432    if (forOp.getBody()->empty() ||
   433        forOp.getBody()->begin() == std::prev(forOp.getBody()->end()))
   434      return failure();
   435  
    436    // Loops where the lower bound is a max expression aren't supported for
    437    // unrolling since the trip count can't be expressed as an affine function
    438    // when both the lower bound and the upper bound are multi-result maps. However,
   439    // one meaningful way to do such unrolling would be to specialize the loop for
   440    // the 'hotspot' case and unroll that hotspot.
   441    if (forOp.getLowerBoundMap().getNumResults() != 1)
   442      return failure();
   443  
   444    // If the trip count is lower than the unroll factor, no unrolled body.
   445    // TODO(bondhugula): option to specify cleanup loop unrolling.
   446    Optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
   447    if (mayBeConstantTripCount.hasValue() &&
   448        mayBeConstantTripCount.getValue() < unrollFactor)
   449      return failure();
   450  
   451    // Generate the cleanup loop if trip count isn't a multiple of unrollFactor.
   452    Operation *op = forOp.getOperation();
   453    if (getLargestDivisorOfTripCount(forOp) % unrollFactor != 0) {
   454      OpBuilder builder(op->getBlock(), ++Block::iterator(op));
   455      auto cleanupForInst = cast<AffineForOp>(builder.clone(*op));
   456      AffineMap cleanupMap;
   457      SmallVector<Value *, 4> cleanupOperands;
   458      getCleanupLoopLowerBound(forOp, unrollFactor, &cleanupMap, &cleanupOperands,
   459                               builder);
   460      assert(cleanupMap &&
   461             "cleanup loop lower bound map for single result lower bound maps "
   462             "can always be determined");
   463      cleanupForInst.setLowerBound(cleanupOperands, cleanupMap);
   464      // Promote the loop body up if this has turned into a single iteration loop.
   465      promoteIfSingleIteration(cleanupForInst);
   466  
   467      // Adjust upper bound of the original loop; this is the same as the lower
   468      // bound of the cleanup loop.
   469      forOp.setUpperBound(cleanupOperands, cleanupMap);
   470    }
   471  
   472    // Scale the step of loop being unrolled by unroll factor.
   473    int64_t step = forOp.getStep();
   474    forOp.setStep(step * unrollFactor);
   475  
   476    // Builder to insert unrolled bodies just before the terminator of the body of
   477    // 'forOp'.
   478    OpBuilder builder = forOp.getBodyBuilder();
   479  
   480    // Keep a pointer to the last non-terminator operation in the original block
   481    // so that we know what to clone (since we are doing this in-place).
   482    Block::iterator srcBlockEnd = std::prev(forOp.getBody()->end(), 2);
   483  
   484    // Unroll the contents of 'forOp' (append unrollFactor-1 additional copies).
   485    auto *forOpIV = forOp.getInductionVar();
   486    for (unsigned i = 1; i < unrollFactor; i++) {
   487      BlockAndValueMapping operandMap;
   488  
   489      // If the induction variable is used, create a remapping to the value for
   490      // this unrolled instance.
   491      if (!forOpIV->use_empty()) {
    492        // iv' = iv + i * step, for i in [1, unrollFactor).
   493        auto d0 = builder.getAffineDimExpr(0);
   494        auto bumpMap = builder.getAffineMap(1, 0, {d0 + i * step});
   495        auto ivUnroll =
   496            builder.create<AffineApplyOp>(forOp.getLoc(), bumpMap, forOpIV);
   497        operandMap.map(forOpIV, ivUnroll);
   498      }
   499  
   500      // Clone the original body of 'forOp'.
   501      for (auto it = forOp.getBody()->begin(); it != std::next(srcBlockEnd);
   502           it++) {
   503        builder.clone(*it, operandMap);
   504      }
   505    }
   506  
   507    // Promote the loop body up if this has turned into a single iteration loop.
   508    promoteIfSingleIteration(forOp);
   509    return success();
   510  }
   511  
   512  /// Performs loop interchange on 'forOpA' and 'forOpB', where 'forOpB' is
   513  /// nested within 'forOpA' as the only non-terminator operation in its block.
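         /// E.g., `affine.for %i { affine.for %j { ops } }` becomes
         /// `affine.for %j { affine.for %i { ops } }`; the three splices below move
         /// forOpB out of forOpA, forOpB's body into forOpA, and forOpA into forOpB.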
   514  void mlir::interchangeLoops(AffineForOp forOpA, AffineForOp forOpB) {
   515    auto *forOpAInst = forOpA.getOperation();
   516  
   517    assert(&*forOpA.getBody()->begin() == forOpB.getOperation());
   518    auto &forOpABody = forOpA.getBody()->getOperations();
   519    auto &forOpBBody = forOpB.getBody()->getOperations();
   520  
   521    // 1) Splice forOpA's non-terminator operations (which is just forOpB) just
    522    // before forOpA (in forOpA's parent's block); this leaves forOpA's
    523    // body containing only the terminator.
   524    forOpAInst->getBlock()->getOperations().splice(Block::iterator(forOpAInst),
   525                                                   forOpABody, forOpABody.begin(),
   526                                                   std::prev(forOpABody.end()));
   527    // 2) Splice forOpB's non-terminator operations into the beginning of forOpA's
   528    // body (this leaves forOpB's body containing only the terminator).
   529    forOpABody.splice(forOpABody.begin(), forOpBBody, forOpBBody.begin(),
   530                      std::prev(forOpBBody.end()));
   531    // 3) Splice forOpA into the beginning of forOpB's body.
   532    forOpBBody.splice(forOpBBody.begin(), forOpAInst->getBlock()->getOperations(),
   533                      Block::iterator(forOpAInst));
   534  }
   535  
   536  // Checks each dependence component against the permutation to see if the
   537  // desired loop interchange would violate dependences by making the
    538  // dependence component lexicographically negative.
   539  static bool checkLoopInterchangeDependences(
   540      const std::vector<llvm::SmallVector<DependenceComponent, 2>> &depCompsVec,
   541      ArrayRef<AffineForOp> loops, ArrayRef<unsigned> loopPermMap) {
   542    // Invert permutation map.
   543    unsigned maxLoopDepth = loops.size();
   544    llvm::SmallVector<unsigned, 4> loopPermMapInv;
   545    loopPermMapInv.resize(maxLoopDepth);
   546    for (unsigned i = 0; i < maxLoopDepth; ++i)
   547      loopPermMapInv[loopPermMap[i]] = i;
   548  
   549    // Check each dependence component against the permutation to see if the
   550    // desired loop interchange permutation would make the dependence vectors
   551    // lexicographically negative.
   552    // Example 1: [-1, 1][0, 0]
   553    // Example 2: [0, 0][-1, 1]
   554    for (unsigned i = 0, e = depCompsVec.size(); i < e; ++i) {
   555      const llvm::SmallVector<DependenceComponent, 2> &depComps = depCompsVec[i];
   556      assert(depComps.size() >= maxLoopDepth);
   557      // Check if the first non-zero dependence component is positive.
   558      // This iterates through loops in the desired order.
   559      for (unsigned j = 0; j < maxLoopDepth; ++j) {
   560        unsigned permIndex = loopPermMapInv[j];
   561        assert(depComps[permIndex].lb.hasValue());
   562        int64_t depCompLb = depComps[permIndex].lb.getValue();
   563        if (depCompLb > 0)
   564          break;
   565        if (depCompLb < 0)
   566          return false;
   567      }
   568    }
   569    return true;
   570  }
   571  
   572  /// Checks if the loop interchange permutation 'loopPermMap' of the perfectly
   573  /// nested sequence of loops in 'loops' would violate dependences.
   574  bool mlir::isValidLoopInterchangePermutation(ArrayRef<AffineForOp> loops,
   575                                               ArrayRef<unsigned> loopPermMap) {
   576    // Gather dependence components for dependences between all ops in loop nest
   577    // rooted at 'loops[0]', at loop depths in range [1, maxLoopDepth].
   578    assert(loopPermMap.size() == loops.size());
   579    unsigned maxLoopDepth = loops.size();
   580    std::vector<llvm::SmallVector<DependenceComponent, 2>> depCompsVec;
   581    getDependenceComponents(loops[0], maxLoopDepth, &depCompsVec);
   582    return checkLoopInterchangeDependences(depCompsVec, loops, loopPermMap);
   583  }
   584  
    585  /// Performs a sequence of loop interchanges on the perfectly nested sequence
    586  /// of loops in 'loops', as specified by the permutation in 'loopPermMap'.
   587  unsigned mlir::interchangeLoops(ArrayRef<AffineForOp> loops,
   588                                  ArrayRef<unsigned> loopPermMap) {
   589    Optional<unsigned> loopNestRootIndex;
   590    for (int i = loops.size() - 1; i >= 0; --i) {
   591      int permIndex = static_cast<int>(loopPermMap[i]);
   592      // Store the index of the for loop which will be the new loop nest root.
   593      if (permIndex == 0)
   594        loopNestRootIndex = i;
   595      if (permIndex > i) {
   596        // Sink loop 'i' by 'permIndex - i' levels deeper into the loop nest.
   597        sinkLoop(loops[i], permIndex - i);
   598      }
   599    }
   600    assert(loopNestRootIndex.hasValue());
   601    return loopNestRootIndex.getValue();
   602  }
   603  
   604  // Sinks all sequential loops to the innermost levels (while preserving
   605  // relative order among them) and moves all parallel loops to the
    606  // outermost levels (while again preserving relative order among them).
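         // E.g. (illustrative), if dependence analysis marks the loops (parallel,
         // sequential, parallel), the computed permutation is [0, 2, 1]: the
         // sequential middle loop sinks innermost while both parallel loops move
         // outward, preserving their relative order.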
   607  AffineForOp mlir::sinkSequentialLoops(AffineForOp forOp) {
   608    SmallVector<AffineForOp, 4> loops;
   609    getPerfectlyNestedLoops(loops, forOp);
   610    if (loops.size() < 2)
   611      return forOp;
   612  
   613    // Gather dependence components for dependences between all ops in loop nest
   614    // rooted at 'loops[0]', at loop depths in range [1, maxLoopDepth].
   615    unsigned maxLoopDepth = loops.size();
   616    std::vector<llvm::SmallVector<DependenceComponent, 2>> depCompsVec;
   617    getDependenceComponents(loops[0], maxLoopDepth, &depCompsVec);
   618  
   619    // Mark loops as either parallel or sequential.
   620    llvm::SmallVector<bool, 8> isParallelLoop(maxLoopDepth, true);
   621    for (unsigned i = 0, e = depCompsVec.size(); i < e; ++i) {
   622      llvm::SmallVector<DependenceComponent, 2> &depComps = depCompsVec[i];
   623      assert(depComps.size() >= maxLoopDepth);
   624      for (unsigned j = 0; j < maxLoopDepth; ++j) {
   625        DependenceComponent &depComp = depComps[j];
   626        assert(depComp.lb.hasValue() && depComp.ub.hasValue());
   627        if (depComp.lb.getValue() != 0 || depComp.ub.getValue() != 0)
   628          isParallelLoop[j] = false;
   629      }
   630    }
   631  
   632    // Count the number of parallel loops.
   633    unsigned numParallelLoops = 0;
   634    for (unsigned i = 0, e = isParallelLoop.size(); i < e; ++i)
   635      if (isParallelLoop[i])
   636        ++numParallelLoops;
   637  
   638    // Compute permutation of loops that sinks sequential loops (and thus raises
   639    // parallel loops) while preserving relative order.
   640    llvm::SmallVector<unsigned, 4> loopPermMap(maxLoopDepth);
   641    unsigned nextSequentialLoop = numParallelLoops;
   642    unsigned nextParallelLoop = 0;
   643    for (unsigned i = 0; i < maxLoopDepth; ++i) {
   644      if (isParallelLoop[i]) {
   645        loopPermMap[i] = nextParallelLoop++;
   646      } else {
   647        loopPermMap[i] = nextSequentialLoop++;
   648      }
   649    }
   650  
   651    // Check if permutation 'loopPermMap' would violate dependences.
   652    if (!checkLoopInterchangeDependences(depCompsVec, loops, loopPermMap))
   653      return forOp;
   654    // Perform loop interchange according to permutation 'loopPermMap'.
   655    unsigned loopNestRootIndex = interchangeLoops(loops, loopPermMap);
   656    return loops[loopNestRootIndex];
   657  }
   658  
   659  /// Performs a series of loop interchanges to sink 'forOp' 'loopDepth' levels
   660  /// deeper in the loop nest.
   661  void mlir::sinkLoop(AffineForOp forOp, unsigned loopDepth) {
   662    for (unsigned i = 0; i < loopDepth; ++i) {
   663      AffineForOp nextForOp = cast<AffineForOp>(forOp.getBody()->front());
   664      interchangeLoops(forOp, nextForOp);
   665    }
   666  }
   667  
   668  // Factors out common behavior to add a new `iv` (resp. `iv` + `offset`) to the
   669  // lower (resp. upper) loop bound. When called for both the lower and upper
   670  // bounds, the resulting IR resembles:
   671  //
   672  // ```mlir
    673  //    affine.for %i = max (`iv`, ...) to min (`iv` + `offset`, ...) {
   674  //      ...
   675  //    }
   676  // ```
   677  static void augmentMapAndBounds(OpBuilder &b, Value *iv, AffineMap *map,
   678                                  SmallVector<Value *, 4> *operands,
   679                                  int64_t offset = 0) {
   680    auto bounds = llvm::to_vector<4>(map->getResults());
   681    bounds.push_back(b.getAffineDimExpr(map->getNumDims()) + offset);
   682    operands->insert(operands->begin() + map->getNumDims(), iv);
   683    *map = b.getAffineMap(map->getNumDims() + 1, map->getNumSymbols(), bounds);
   684    canonicalizeMapAndOperands(map, operands);
   685  }
   686  
   687  // Stripmines `forOp` by `factor` and sinks it under each of the `targets`.
   688  // Stripmine-sink is a primitive building block for generalized tiling of
   689  // imperfectly nested loops.
   690  // This transformation is purely mechanical and does not check legality,
   691  // profitability or even structural correctness. It is the user's
   692  // responsibility to specify `targets` that are dominated by `forOp`.
   693  // Returns the new AffineForOps, one per `targets`, nested immediately under
   694  // each of the `targets`.
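         // For instance (an illustrative sketch), stripmining `affine.for %i = 0 to
         // %N` by a factor of 4 rescales the outer loop to step 4 and creates, under
         // each target, an inner `affine.for %ii = max(lbs..., %i) to
         // min(ubs..., %i + 4)` with the original step; the target's body moves into
         // the inner loop and its uses of %i are rewritten to %ii.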
   695  static SmallVector<AffineForOp, 8>
   696  stripmineSink(AffineForOp forOp, uint64_t factor,
   697                ArrayRef<AffineForOp> targets) {
   698    auto originalStep = forOp.getStep();
   699    auto scaledStep = originalStep * factor;
   700    forOp.setStep(scaledStep);
   701  
   702    auto *op = forOp.getOperation();
   703    OpBuilder b(op->getBlock(), ++Block::iterator(op));
   704  
   705    // Lower-bound map creation.
   706    auto lbMap = forOp.getLowerBoundMap();
   707    SmallVector<Value *, 4> lbOperands(forOp.getLowerBoundOperands());
   708    augmentMapAndBounds(b, forOp.getInductionVar(), &lbMap, &lbOperands);
   709  
   710    // Upper-bound map creation.
   711    auto ubMap = forOp.getUpperBoundMap();
   712    SmallVector<Value *, 4> ubOperands(forOp.getUpperBoundOperands());
   713    augmentMapAndBounds(b, forOp.getInductionVar(), &ubMap, &ubOperands,
   714                        /*offset=*/scaledStep);
   715  
   716    auto *iv = forOp.getInductionVar();
   717    SmallVector<AffineForOp, 8> innerLoops;
   718    for (auto t : targets) {
   719      // Insert newForOp before the terminator of `t`.
   720      OpBuilder b = t.getBodyBuilder();
   721      auto newForOp = b.create<AffineForOp>(t.getLoc(), lbOperands, lbMap,
   722                                            ubOperands, ubMap, originalStep);
   723      auto begin = t.getBody()->begin();
   724      // Skip terminator and `newForOp` which is just before the terminator.
   725      auto nOps = t.getBody()->getOperations().size() - 2;
   726      newForOp.getBody()->getOperations().splice(
   727          newForOp.getBody()->getOperations().begin(),
   728          t.getBody()->getOperations(), begin, std::next(begin, nOps));
   729      replaceAllUsesInRegionWith(iv, newForOp.getInductionVar(),
   730                                 newForOp.region());
   731      innerLoops.push_back(newForOp);
   732    }
   733  
   734    return innerLoops;
   735  }
   736  
   737  static Loops stripmineSink(loop::ForOp forOp, Value *factor,
   738                             ArrayRef<loop::ForOp> targets) {
   739    auto *originalStep = forOp.step();
   740    auto *iv = forOp.getInductionVar();
   741  
   742    OpBuilder b(forOp);
   743    forOp.setStep(b.create<MulIOp>(forOp.getLoc(), originalStep, factor));
   744  
   745    Loops innerLoops;
   746    for (auto t : targets) {
   747      // Save information for splicing ops out of t when done
   748      auto begin = t.getBody()->begin();
   749      auto nOps = t.getBody()->getOperations().size();
   750  
   751      // Insert newForOp before the terminator of `t`.
   752      OpBuilder b(t.getBodyBuilder());
   753      Value *stepped = b.create<AddIOp>(t.getLoc(), iv, forOp.step());
   754      Value *less = b.create<CmpIOp>(t.getLoc(), CmpIPredicate::SLT,
   755                                     forOp.upperBound(), stepped);
   756      Value *ub =
   757          b.create<SelectOp>(t.getLoc(), less, forOp.upperBound(), stepped);
   758  
   759      // Splice [begin, begin + nOps - 1) into `newForOp` and replace uses.
   760      auto newForOp = b.create<loop::ForOp>(t.getLoc(), iv, ub, originalStep);
   761      newForOp.getBody()->getOperations().splice(
   762          newForOp.getBody()->getOperations().begin(),
   763          t.getBody()->getOperations(), begin, std::next(begin, nOps - 1));
   764      replaceAllUsesInRegionWith(iv, newForOp.getInductionVar(),
   765                                 newForOp.region());
   766  
   767      innerLoops.push_back(newForOp);
   768    }
   769  
   770    return innerLoops;
   771  }
   772  
   773  // Stripmines a `forOp` by `factor` and sinks it under a single `target`.
    774  // Returns the new loop, nested immediately under `target`.
   775  template <typename ForType, typename SizeType>
   776  static ForType stripmineSink(ForType forOp, SizeType factor, ForType target) {
   777    // TODO(ntv): Use cheap structural assertions that targets are nested under
   778    // forOp and that targets are not nested under each other when DominanceInfo
   779    // exposes the capability. It seems overkill to construct a whole function
   780    // dominance tree at this point.
   781    auto res = stripmineSink(forOp, factor, ArrayRef<ForType>{target});
   782    assert(res.size() == 1 && "Expected 1 inner forOp");
   783    return res[0];
   784  }
   785  
   786  template <typename ForType, typename SizeType>
   787  static SmallVector<SmallVector<ForType, 8>, 8>
   788  tileImpl(ArrayRef<ForType> forOps, ArrayRef<SizeType> sizes,
   789           ArrayRef<ForType> targets) {
   790    SmallVector<SmallVector<ForType, 8>, 8> res;
   791    SmallVector<ForType, 8> currentTargets(targets.begin(), targets.end());
   792    for (auto it : llvm::zip(forOps, sizes)) {
   793      auto step = stripmineSink(std::get<0>(it), std::get<1>(it), currentTargets);
   794      res.push_back(step);
   795      currentTargets = step;
   796    }
   797    return res;
   798  }
   799  
   800  SmallVector<SmallVector<AffineForOp, 8>, 8>
   801  mlir::tile(ArrayRef<AffineForOp> forOps, ArrayRef<uint64_t> sizes,
   802             ArrayRef<AffineForOp> targets) {
   803    return tileImpl(forOps, sizes, targets);
   804  }
   805  
   806  SmallVector<Loops, 8> mlir::tile(ArrayRef<loop::ForOp> forOps,
   807                                   ArrayRef<Value *> sizes,
   808                                   ArrayRef<loop::ForOp> targets) {
   809    return tileImpl(forOps, sizes, targets);
   810  }
   811  
   812  template <typename ForType, typename SizeType>
   813  static SmallVector<ForType, 8>
   814  tileImpl(ArrayRef<ForType> forOps, ArrayRef<SizeType> sizes, ForType target) {
   815    SmallVector<ForType, 8> res;
   816    for (auto loops : tile(forOps, sizes, ArrayRef<ForType>{target})) {
   817      assert(loops.size() == 1);
   818      res.push_back(loops[0]);
   819    }
   820    return res;
   821  }
   822  
   823  SmallVector<AffineForOp, 8> mlir::tile(ArrayRef<AffineForOp> forOps,
   824                                         ArrayRef<uint64_t> sizes,
   825                                         AffineForOp target) {
   826    return tileImpl(forOps, sizes, target);
   827  }
   828  
   829  Loops mlir::tile(ArrayRef<loop::ForOp> forOps, ArrayRef<Value *> sizes,
   830                   loop::ForOp target) {
   831    return tileImpl(forOps, sizes, target);
   832  }
   833  
   834  Loops mlir::tilePerfectlyNested(loop::ForOp rootForOp,
   835                                  ArrayRef<Value *> sizes) {
    836    // Collect perfectly nested loops. If more size values are provided than nested
   837    // loops available, truncate `sizes`.
   838    SmallVector<loop::ForOp, 4> forOps;
   839    forOps.reserve(sizes.size());
   840    getPerfectlyNestedLoopsImpl(forOps, rootForOp, sizes.size());
   841    if (forOps.size() < sizes.size())
   842      sizes = sizes.take_front(forOps.size());
   843  
   844    return ::tile(forOps, sizes, forOps.back());
   845  }
   846  
   847  // Build the IR that performs ceil division of a positive value by a constant:
   848  //    ceildiv(a, B) = divis(a + (B-1), B)
    849  // where divis is rounding-to-zero division.
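         //    E.g., ceildiv(10, 4) = divis(10 + 3, 4) = 3.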
   850  static Value *ceilDivPositive(OpBuilder &builder, Location loc, Value *dividend,
   851                                int64_t divisor) {
   852    assert(divisor > 0 && "expected positive divisor");
   853    assert(dividend->getType().isIndex() && "expected index-typed value");
   854  
   855    Value *divisorMinusOneCst = builder.create<ConstantIndexOp>(loc, divisor - 1);
   856    Value *divisorCst = builder.create<ConstantIndexOp>(loc, divisor);
   857    Value *sum = builder.create<AddIOp>(loc, dividend, divisorMinusOneCst);
   858    return builder.create<DivISOp>(loc, sum, divisorCst);
   859  }
   860  
   861  // Build the IR that performs ceil division of a positive value by another
   862  // positive value:
   863  //    ceildiv(a, b) = divis(a + (b - 1), b)
   864  // where divis is rounding-to-zero division.
   865  static Value *ceilDivPositive(OpBuilder &builder, Location loc, Value *dividend,
   866                                Value *divisor) {
   867    assert(dividend->getType().isIndex() && "expected index-typed value");
   868  
   869    Value *cstOne = builder.create<ConstantIndexOp>(loc, 1);
   870    Value *divisorMinusOne = builder.create<SubIOp>(loc, divisor, cstOne);
   871    Value *sum = builder.create<AddIOp>(loc, dividend, divisorMinusOne);
   872    return builder.create<DivISOp>(loc, sum, divisor);
   873  }
   874  
   875  // Hoist the ops within `outer` that appear before `inner`.
   876  // Such ops include the ops that have been introduced by parametric tiling.
   877  // Ops that come from triangular loops (i.e. that belong to the program slice
   878  // rooted at `outer`) and ops that have side effects cannot be hoisted.
   879  // Return failure when any op fails to hoist.
   880  static LogicalResult hoistOpsBetween(loop::ForOp outer, loop::ForOp inner) {
   881    SetVector<Operation *> forwardSlice;
   882    getForwardSlice(outer.getOperation(), &forwardSlice, [&inner](Operation *op) {
   883      return op != inner.getOperation();
   884    });
   885    LogicalResult status = success();
   886    SmallVector<Operation *, 8> toHoist;
   887    for (auto &op : outer.getBody()->getOperations()) {
   888      // Stop when encountering the inner loop.
   889      if (&op == inner.getOperation())
   890        break;
   891      // Skip over non-hoistable ops.
   892      if (forwardSlice.count(&op) > 0) {
   893        status = failure();
   894        continue;
   895      }
   896      // Skip loop::ForOp, these are not considered a failure.
    897      if (isa<loop::ForOp>(op))
   898        continue;
   899      // Skip other ops with regions.
   900      if (op.getNumRegions() > 0) {
   901        status = failure();
   902        continue;
   903      }
   904      // Skip if op has side effects.
   905      // TODO(ntv): loads to immutable memory regions are ok.
   906      if (!op.hasNoSideEffect()) {
   907        status = failure();
   908        continue;
   909      }
   910      toHoist.push_back(&op);
   911    }
   912    auto *outerForOp = outer.getOperation();
   913    for (auto *op : toHoist)
   914      op->moveBefore(outerForOp);
   915    return status;
   916  }
   917  
   918  // Traverse the interTile and intraTile loops and try to hoist ops such that
   919  // bands of perfectly nested loops are isolated.
   920  // Return failure if either perfect interTile or perfect intraTile bands cannot
   921  // be formed.
   922  static LogicalResult tryIsolateBands(const TileLoops &tileLoops) {
   923    LogicalResult status = success();
   924    auto &interTile = tileLoops.first;
   925    auto &intraTile = tileLoops.second;
   926    auto size = interTile.size();
   927    assert(size == intraTile.size());
   928    if (size <= 1)
   929      return success();
   930    for (unsigned s = 1; s < size; ++s)
   931      status = succeeded(status) ? hoistOpsBetween(intraTile[0], intraTile[s])
   932                                 : failure();
   933    for (unsigned s = 1; s < size; ++s)
   934      status = succeeded(status) ? hoistOpsBetween(interTile[0], interTile[s])
   935                                 : failure();
   936    return status;
   937  }
   938  
   939  TileLoops mlir::extractFixedOuterLoops(loop::ForOp rootForOp,
   940                                         ArrayRef<int64_t> sizes) {
    941    // Collect perfectly nested loops. If more size values are provided than nested
   942    // loops available, truncate `sizes`.
   943    SmallVector<loop::ForOp, 4> forOps;
   944    forOps.reserve(sizes.size());
   945    getPerfectlyNestedLoopsImpl(forOps, rootForOp, sizes.size());
   946    if (forOps.size() < sizes.size())
   947      sizes = sizes.take_front(forOps.size());
   948  
    949    // Compute the tile sizes such that the i-th outer loop executes size[i]
    950    // iterations. Given that the loop currently executes
   951    //   numIterations = ceildiv((upperBound - lowerBound), step)
   952    // iterations, we need to tile with size ceildiv(numIterations, size[i]).
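           // E.g., a loop executing 100 iterations with sizes[i] = 4 gets a tile
           // size of ceildiv(100, 4) = 25, so the i-th outer loop runs 4 times.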
   953    SmallVector<Value *, 4> tileSizes;
   954    tileSizes.reserve(sizes.size());
   955    for (unsigned i = 0, e = sizes.size(); i < e; ++i) {
   956      assert(sizes[i] > 0 && "expected strictly positive size for strip-mining");
   957  
   958      auto forOp = forOps[i];
   959      OpBuilder builder(forOp);
   960      auto loc = forOp.getLoc();
   961      Value *diff =
   962          builder.create<SubIOp>(loc, forOp.upperBound(), forOp.lowerBound());
   963      Value *numIterations = ceilDivPositive(builder, loc, diff, forOp.step());
   964      Value *iterationsPerBlock =
   965          ceilDivPositive(builder, loc, numIterations, sizes[i]);
   966      tileSizes.push_back(iterationsPerBlock);
   967    }
   968  
   969    // Call parametric tiling with the given sizes.
   970    auto intraTile = tile(forOps, tileSizes, forOps.back());
   971    TileLoops tileLoops = std::make_pair(forOps, intraTile);
   972  
   973    // TODO(ntv, zinenko) for now we just ignore the result of band isolation.
   974    // In the future, mapping decisions may be impacted by the ability to
   975    // isolate perfectly nested bands.
   976    tryIsolateBands(tileLoops);
   977  
   978    return tileLoops;
   979  }
   980  
   981  // Replaces all uses of `orig` with `replacement` except if the user is listed
   982  // in `exceptions`.
   983  static void
   984  replaceAllUsesExcept(Value *orig, Value *replacement,
   985                       const SmallPtrSetImpl<Operation *> &exceptions) {
   986    for (auto &use : orig->getUses()) {
   987      if (exceptions.count(use.getOwner()) == 0)
   988        use.set(replacement);
   989    }
   990  }
   991  
   992  // Transform a loop with a strictly positive step
   993  //   for %i = %lb to %ub step %s
   994  // into a 0-based loop with step 1
    995  //   for %ii = 0 to ceildiv(%ub - %lb, %s) step 1
   996  //     %i = %ii * %s + %lb
   997  // Insert the induction variable remapping in the body of `inner`, which is
   998  // expected to be either `loop` or another loop perfectly nested under `loop`.
    999  // Insert the definition of new bounds immediately before `outer`, which is
  1000  // expected to be either `loop` or its parent in the loop nest.
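         // For instance (an illustrative sketch), `for %i = 2 to 10 step 3`
         // (iterations 2, 5, 8) becomes `for %ii = 0 to 3 step 1`, with
         // `%i = %ii * 3 + 2` materialized at the top of `inner`'s body.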
  1001  static void normalizeLoop(loop::ForOp loop, loop::ForOp outer,
  1002                            loop::ForOp inner) {
  1003    OpBuilder builder(outer);
  1004    Location loc = loop.getLoc();
  1005  
  1006    // Check if the loop is already known to have a constant zero lower bound or
  1007    // a constant one step.
  1008    bool isZeroBased = false;
   1009    if (auto lbCst =
   1010            dyn_cast_or_null<ConstantIndexOp>(loop.lowerBound()->getDefiningOp()))
   1011      isZeroBased = lbCst.getValue() == 0;
  1012  
  1013    bool isStepOne = false;
  1014    if (auto stepCst =
  1015            dyn_cast_or_null<ConstantIndexOp>(loop.step()->getDefiningOp()))
  1016      isStepOne = stepCst.getValue() == 1;
  1017  
  1018    if (isZeroBased && isStepOne)
  1019      return;
  1020  
  1021    // Compute the number of iterations the loop executes: ceildiv(ub - lb, step)
  1022    // assuming the step is strictly positive.  Update the bounds and the step
  1023    // of the loop to go from 0 to the number of iterations, if necessary.
  1024    // TODO(zinenko): introduce support for negative steps or emit dynamic asserts
  1025    // on step positivity, whatever gets implemented first.
  1026    Value *diff =
  1027        builder.create<SubIOp>(loc, loop.upperBound(), loop.lowerBound());
  1028    Value *numIterations = ceilDivPositive(builder, loc, diff, loop.step());
  1029    loop.setUpperBound(numIterations);
  1030  
  1031    Value *lb = loop.lowerBound();
  1032    if (!isZeroBased) {
  1033      Value *cst0 = builder.create<ConstantIndexOp>(loc, 0);
  1034      loop.setLowerBound(cst0);
  1035    }
  1036  
  1037    Value *step = loop.step();
  1038    if (!isStepOne) {
  1039      Value *cst1 = builder.create<ConstantIndexOp>(loc, 1);
  1040      loop.setStep(cst1);
  1041    }
  1042  
  1043    // Insert code computing the value of the original loop induction variable
  1044    // from the "normalized" one.
  1045    builder.setInsertionPointToStart(inner.getBody());
  1046    Value *scaled =
  1047        isStepOne ? loop.getInductionVar()
  1048                  : builder.create<MulIOp>(loc, loop.getInductionVar(), step);
  1049    Value *shifted =
  1050        isZeroBased ? scaled : builder.create<AddIOp>(loc, scaled, lb);
  1051  
  1052    SmallPtrSet<Operation *, 2> preserve{scaled->getDefiningOp(),
  1053                                         shifted->getDefiningOp()};
  1054    replaceAllUsesExcept(loop.getInductionVar(), shifted, preserve);
  1055  }
  1056  
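         // Coalesces the perfect loop nest `loops` into a single loop: normalizes all
         // loops to iterate from 0 with step 1, folds the product of their upper
         // bounds into the outermost loop's upper bound, and recovers each original
         // induction variable from the linearized one (see the numbered steps below).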
  1057  void mlir::coalesceLoops(MutableArrayRef<loop::ForOp> loops) {
  1058    if (loops.size() < 2)
  1059      return;
  1060  
  1061    loop::ForOp innermost = loops.back();
  1062    loop::ForOp outermost = loops.front();
  1063  
  1064    // 1. Make sure all loops iterate from 0 to upperBound with step 1.  This
  1065    // allows the following code to assume upperBound is the number of iterations.
  1066    for (auto loop : loops)
  1067      normalizeLoop(loop, outermost, innermost);
  1068  
   1069    // 2. Emit code computing the upper bound of the coalesced loop as the product
  1070    // of the number of iterations of all loops.
  1071    OpBuilder builder(outermost);
  1072    Location loc = outermost.getLoc();
  1073    Value *upperBound = outermost.upperBound();
  1074    for (auto loop : loops.drop_front())
  1075      upperBound = builder.create<MulIOp>(loc, upperBound, loop.upperBound());
  1076    outermost.setUpperBound(upperBound);
  1077  
  1078    builder.setInsertionPointToStart(outermost.getBody());
  1079  
  1080    // 3. Remap induction variables.  For each original loop, the value of the
  1081    // induction variable can be obtained by dividing the induction variable of
  1082    // the linearized loop by the total number of iterations of the loops nested
  1083    // in it modulo the number of iterations in this loop (remove the values
  1084    // related to the outer loops):
  1085    //   iv_i = floordiv(iv_linear, product-of-loop-ranges-until-i) mod range_i.
  1086    // Compute these iteratively from the innermost loop by creating a "running
  1087    // quotient" of division by the range.
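           // E.g., for three loops with ranges (r0, r1, r2) and linearized IV %iv:
           //   iv2 = %iv mod r2
           //   iv1 = (%iv floordiv r2) mod r1
           //   iv0 = (%iv floordiv r2) floordiv r1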
  1088    Value *previous = outermost.getInductionVar();
  1089    for (unsigned i = 0, e = loops.size(); i < e; ++i) {
  1090      unsigned idx = loops.size() - i - 1;
  1091      if (i != 0)
  1092        previous =
  1093            builder.create<DivISOp>(loc, previous, loops[idx + 1].upperBound());
  1094  
  1095      Value *iv = (i == e - 1) ? previous
  1096                               : builder.create<RemISOp>(loc, previous,
  1097                                                         loops[idx].upperBound());
  1098      replaceAllUsesInRegionWith(loops[idx].getInductionVar(), iv,
  1099                                 loops.back().region());
  1100    }
  1101  
   1102    // 4. Move the operations from the innermost loop to just above the second-
   1103    // outermost loop, delete the extra terminator and the second-outermost loop.
  1104    loop::ForOp second = loops[1];
  1105    innermost.getBody()->back().erase();
  1106    outermost.getBody()->getOperations().splice(
  1107        Block::iterator(second.getOperation()),
  1108        innermost.getBody()->getOperations());
  1109    second.erase();
  1110  }
  1111  
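         // Maps `forOp` for execution on a grid of processors: linearizes
         // `processorId`, adds the linear id to the lower bound, and sets the step to
         // the total processor count so iterations are distributed cyclically. E.g.
         // (illustrative), with ids (%bx, %tx) and sizes (%nbx, %ntx), the new lower
         // bound is lb + %bx * %ntx + %tx and the new step is %nbx * %ntx.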
  1112  void mlir::mapLoopToProcessorIds(loop::ForOp forOp,
  1113                                   ArrayRef<Value *> processorId,
  1114                                   ArrayRef<Value *> numProcessors) {
  1115    assert(processorId.size() == numProcessors.size());
  1116    if (processorId.empty())
  1117      return;
  1118  
  1119    OpBuilder b(forOp);
  1120    Location loc(forOp.getLoc());
  1121    Value *mul = processorId.front();
  1122    for (unsigned i = 1, e = processorId.size(); i < e; ++i)
  1123      mul = b.create<AddIOp>(loc, b.create<MulIOp>(loc, mul, numProcessors[i]),
  1124                             processorId[i]);
  1125    Value *lb = b.create<AddIOp>(loc, forOp.lowerBound(), mul);
  1126    forOp.setLowerBound(lb);
  1127  
  1128    Value *step = numProcessors.front();
  1129    for (auto *numProcs : numProcessors.drop_front())
  1130      step = b.create<MulIOp>(loc, step, numProcs);
  1131    forOp.setStep(step);
  1132  }