github.com/johnnyeven/libtools@v0.0.0-20191126065708-61829c1adf46/third_party/mlir/lib/Transforms/LoopTiling.cpp (about)

     1  //===- LoopTiling.cpp --- Loop tiling pass ------------------------------*-===//
     2  //
     3  // Copyright 2019 The MLIR Authors.
     4  //
     5  // Licensed under the Apache License, Version 2.0 (the "License");
     6  // you may not use this file except in compliance with the License.
     7  // You may obtain a copy of the License at
     8  //
     9  //   http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  // =============================================================================
    17  //
    18  // This file implements a pass to tile loop nests.
    19  //
    20  //===----------------------------------------------------------------------===//
    21  
    22  #include "mlir/Analysis/AffineAnalysis.h"
    23  #include "mlir/Analysis/AffineStructures.h"
    24  #include "mlir/Analysis/LoopAnalysis.h"
    25  #include "mlir/Analysis/Utils.h"
    26  #include "mlir/Dialect/AffineOps/AffineOps.h"
    27  #include "mlir/IR/Builders.h"
    28  #include "mlir/Pass/Pass.h"
    29  #include "mlir/Transforms/LoopUtils.h"
    30  #include "mlir/Transforms/Passes.h"
    31  #include "mlir/Transforms/Utils.h"
    32  #include "llvm/Support/CommandLine.h"
    33  #include "llvm/Support/Debug.h"
    34  using namespace mlir;
    35  
    36  #define DEBUG_TYPE "affine-loop-tile"
    37  
    38  static llvm::cl::OptionCategory clOptionsCategory(DEBUG_TYPE " options");
    39  
    40  static llvm::cl::opt<unsigned long long>
    41      clCacheSizeKiB("tile-cache-size",
    42                     llvm::cl::desc("Set size of cache to tile for in KiB"),
    43                     llvm::cl::cat(clOptionsCategory));
    44  
    45  // Tile size to use for all loops (overrides -tile-sizes if provided).
    46  static llvm::cl::opt<unsigned>
    47      clTileSize("tile-size", llvm::cl::desc("Use this tile size for all loops"),
    48                 llvm::cl::cat(clOptionsCategory));
    49  
    50  // List of tile sizes. If any of them aren't provided, they are filled with
    51  // clTileSize / kDefaultTileSize.
    52  static llvm::cl::list<unsigned> clTileSizes(
    53      "tile-sizes",
    54      llvm::cl::desc(
    55          "List of tile sizes for each perfect nest (overridden by -tile-size)"),
    56      llvm::cl::ZeroOrMore, llvm::cl::cat(clOptionsCategory));
    57  
    58  namespace {
    59  
    60  /// A pass to perform loop tiling on all suitable loop nests of a Function.
    61  struct LoopTiling : public FunctionPass<LoopTiling> {
    62    explicit LoopTiling(uint64_t cacheSizeBytes = kDefaultCacheMemCapacity,
    63                        bool avoidMaxMinBounds = true)
    64        : cacheSizeBytes(cacheSizeBytes), avoidMaxMinBounds(avoidMaxMinBounds) {}
    65  
    66    void runOnFunction() override;
    67    void getTileSizes(ArrayRef<AffineForOp> band,
    68                      SmallVectorImpl<unsigned> *tileSizes);
    69  
    70    // Default tile size if nothing is provided.
    71    constexpr static unsigned kDefaultTileSize = 4;
    72    constexpr static uint64_t kDefaultCacheMemCapacity = 512 * 1024UL;
    73  
    74    // Capacity of the cache to tile for.
    75    uint64_t cacheSizeBytes;
    76    // If true, tile sizes are set to avoid max/min in bounds if possible.
    77    bool avoidMaxMinBounds;
    78  };
    79  
    80  } // end anonymous namespace
    81  
    82  /// Creates a pass to perform loop tiling on all suitable loop nests of a
    83  /// Function.
    84  std::unique_ptr<FunctionPassBase>
    85  mlir::createLoopTilingPass(uint64_t cacheSizeBytes) {
    86    return std::make_unique<LoopTiling>(cacheSizeBytes);
    87  }
    88  
    89  // Move the loop body of AffineForOp 'src' from 'src' into the specified
    90  // location in destination's body, ignoring the terminator.
    91  static inline void moveLoopBody(AffineForOp src, AffineForOp dest,
    92                                  Block::iterator loc) {
    93    auto &insts = src.getBody()->getOperations();
    94    dest.getBody()->getOperations().splice(loc, insts, insts.begin(),
    95                                           std::prev(insts.end()));
    96  }
    97  
    98  // Move the loop body of AffineForOp 'src' from 'src' to the start of dest's
    99  // body.
   100  static inline void moveLoopBody(AffineForOp src, AffineForOp dest) {
   101    moveLoopBody(src, dest, dest.getBody()->begin());
   102  }
   103  
   104  /// Constructs and sets new loop bounds after tiling for the case of
   105  /// hyper-rectangular index sets, where the bounds of one dimension do not
   106  /// depend on other dimensions. Bounds of each dimension can thus be treated
   107  /// independently, and deriving the new bounds is much simpler and faster
   108  /// than for the case of tiling arbitrary polyhedral shapes.
   109  static void
   110  constructTiledIndexSetHyperRect(MutableArrayRef<AffineForOp> origLoops,
   111                                  MutableArrayRef<AffineForOp> newLoops,
   112                                  ArrayRef<unsigned> tileSizes) {
   113    assert(!origLoops.empty());
   114    assert(origLoops.size() == tileSizes.size());
   115  
   116    OpBuilder b(origLoops[0].getOperation());
   117    unsigned width = origLoops.size();
   118  
   119    // Bounds for tile space loops.
   120    for (unsigned i = 0; i < width; i++) {
   121      auto lbOperands = origLoops[i].getLowerBoundOperands();
   122      auto ubOperands = origLoops[i].getUpperBoundOperands();
   123      SmallVector<Value *, 4> newLbOperands(lbOperands);
   124      SmallVector<Value *, 4> newUbOperands(ubOperands);
   125      newLoops[i].setLowerBound(newLbOperands, origLoops[i].getLowerBoundMap());
   126      newLoops[i].setUpperBound(newUbOperands, origLoops[i].getUpperBoundMap());
   127      newLoops[i].setStep(tileSizes[i]);
   128    }
   129    // Bounds for intra-tile loops.
   130    for (unsigned i = 0; i < width; i++) {
   131      int64_t largestDiv = getLargestDivisorOfTripCount(origLoops[i]);
   132      auto mayBeConstantCount = getConstantTripCount(origLoops[i]);
   133      // The lower bound is just the tile-space loop.
   134      AffineMap lbMap = b.getDimIdentityMap();
   135      newLoops[width + i].setLowerBound(
   136          /*operands=*/newLoops[i].getInductionVar(), lbMap);
   137  
   138      // Set the upper bound.
   139      if (mayBeConstantCount.hasValue() &&
   140          mayBeConstantCount.getValue() < tileSizes[i]) {
   141        // Trip count is less than tile size; upper bound is the trip count.
   142        auto ubMap = b.getConstantAffineMap(mayBeConstantCount.getValue());
   143        newLoops[width + i].setUpperBoundMap(ubMap);
   144      } else if (largestDiv % tileSizes[i] != 0) {
   145        // Intra-tile loop ii goes from i to min(i + tileSize, ub_i).
   146        // Construct the upper bound map; the operands are the original operands
   147        // with 'i' (tile-space loop) appended to it. The new upper bound map is
   148        // the original one with an additional expression i + tileSize appended.
   149        auto ub = origLoops[i].getUpperBound();
   150        SmallVector<Value *, 4> ubOperands;
   151        ubOperands.reserve(ub.getNumOperands() + 1);
   152        auto origUbMap = ub.getMap();
   153        // Add dim operands from original upper bound.
   154        for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j) {
   155          ubOperands.push_back(ub.getOperand(j));
   156        }
   157        // Add dim operand for new loop upper bound.
   158        ubOperands.push_back(newLoops[i].getInductionVar());
   159        // Add symbol operands from original upper bound.
   160        for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j) {
   161          ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));
   162        }
   163        SmallVector<AffineExpr, 4> boundExprs;
   164        boundExprs.reserve(1 + origUbMap.getNumResults());
   165        auto dim = b.getAffineDimExpr(origUbMap.getNumDims());
   166        // The new upper bound map is the original one with an additional
   167        // expression i + tileSize appended.
   168        boundExprs.push_back(dim + tileSizes[i]);
   169        boundExprs.append(origUbMap.getResults().begin(),
   170                          origUbMap.getResults().end());
   171        auto ubMap = b.getAffineMap(origUbMap.getNumDims() + 1,
   172                                    origUbMap.getNumSymbols(), boundExprs);
   173        newLoops[width + i].setUpperBound(/*operands=*/ubOperands, ubMap);
   174      } else {
   175        // No need of the min expression.
   176        auto dim = b.getAffineDimExpr(0);
   177        auto ubMap = b.getAffineMap(1, 0, dim + tileSizes[i]);
   178        newLoops[width + i].setUpperBound(newLoops[i].getInductionVar(), ubMap);
   179      }
   180    }
   181  }
   182  
   183  /// Tiles the specified band of perfectly nested loops creating tile-space loops
   184  /// and intra-tile loops. A band is a contiguous set of loops.
   185  //  TODO(bondhugula): handle non hyper-rectangular spaces.
   186  LogicalResult mlir::tileCodeGen(MutableArrayRef<AffineForOp> band,
   187                                  ArrayRef<unsigned> tileSizes) {
   188    assert(!band.empty());
   189    assert(band.size() == tileSizes.size() && "Incorrect number of tile sizes");
   190  
   191    // Check if the supplied for op's are all successively nested.
   192    for (unsigned i = 1, e = band.size(); i < e; i++) {
   193      assert(band[i].getOperation()->getParentOp() == band[i - 1].getOperation());
   194    }
   195  
   196    auto origLoops = band;
   197  
   198    AffineForOp rootAffineForOp = origLoops[0];
   199    auto loc = rootAffineForOp.getLoc();
   200    // Note that width is at least one since band isn't empty.
   201    unsigned width = band.size();
   202  
   203    SmallVector<AffineForOp, 12> newLoops(2 * width);
   204    AffineForOp innermostPointLoop;
   205  
   206    // The outermost among the loops as we add more..
   207    auto *topLoop = rootAffineForOp.getOperation();
   208  
   209    // Add intra-tile (or point) loops.
   210    for (unsigned i = 0; i < width; i++) {
   211      OpBuilder b(topLoop);
   212      // Loop bounds will be set later.
   213      auto pointLoop = b.create<AffineForOp>(loc, 0, 0);
   214      pointLoop.getBody()->getOperations().splice(
   215          pointLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
   216          topLoop);
   217      newLoops[2 * width - 1 - i] = pointLoop;
   218      topLoop = pointLoop.getOperation();
   219      if (i == 0)
   220        innermostPointLoop = pointLoop;
   221    }
   222  
   223    // Add tile space loops;
   224    for (unsigned i = width; i < 2 * width; i++) {
   225      OpBuilder b(topLoop);
   226      // Loop bounds will be set later.
   227      auto tileSpaceLoop = b.create<AffineForOp>(loc, 0, 0);
   228      tileSpaceLoop.getBody()->getOperations().splice(
   229          tileSpaceLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
   230          topLoop);
   231      newLoops[2 * width - i - 1] = tileSpaceLoop;
   232      topLoop = tileSpaceLoop.getOperation();
   233    }
   234  
   235    // Move the loop body of the original nest to the new one.
   236    moveLoopBody(origLoops[origLoops.size() - 1], innermostPointLoop);
   237  
   238    SmallVector<Value *, 8> origLoopIVs;
   239    extractForInductionVars(band, &origLoopIVs);
   240    SmallVector<Optional<Value *>, 6> ids(origLoopIVs.begin(), origLoopIVs.end());
   241    FlatAffineConstraints cst;
   242    getIndexSet(band, &cst);
   243  
   244    if (!cst.isHyperRectangular(0, width)) {
   245      rootAffineForOp.emitError("tiled code generation unimplemented for the "
   246                                "non-hyperrectangular case");
   247      return failure();
   248    }
   249  
   250    constructTiledIndexSetHyperRect(origLoops, newLoops, tileSizes);
   251    // In this case, the point loop IVs just replace the original ones.
   252    for (unsigned i = 0; i < width; i++) {
   253      origLoopIVs[i]->replaceAllUsesWith(newLoops[i + width].getInductionVar());
   254    }
   255  
   256    // Erase the old loop nest.
   257    rootAffineForOp.erase();
   258  
   259    return success();
   260  }
   261  
   262  // Identify valid and profitable bands of loops to tile. This is currently just
   263  // a temporary placeholder to test the mechanics of tiled code generation.
   264  // Returns all maximal outermost perfect loop nests to tile.
   265  static void getTileableBands(FuncOp f,
   266                               std::vector<SmallVector<AffineForOp, 6>> *bands) {
   267    // Get maximal perfect nest of 'affine.for' insts starting from root
   268    // (inclusive).
   269    auto getMaximalPerfectLoopNest = [&](AffineForOp root) {
   270      SmallVector<AffineForOp, 6> band;
   271      getPerfectlyNestedLoops(band, root);
   272      bands->push_back(band);
   273    };
   274  
   275    for (auto &block : f)
   276      for (auto &op : block)
   277        if (auto forOp = dyn_cast<AffineForOp>(op))
   278          getMaximalPerfectLoopNest(forOp);
   279  }
   280  
   281  // Reduce each tile size to the largest divisor of the corresponding trip count
   282  // (if the trip count is known).
   283  static void adjustToDivisorsOfTripCounts(ArrayRef<AffineForOp> band,
   284                                           SmallVectorImpl<unsigned> *tileSizes) {
   285    assert(band.size() == tileSizes->size() && "invalid tile size count");
   286    for (unsigned i = 0, e = band.size(); i < e; i++) {
   287      unsigned &tSizeAdjusted = (*tileSizes)[i];
   288      auto mayConst = getConstantTripCount(band[i]);
   289      if (!mayConst.hasValue())
   290        continue;
   291      // Adjust the tile size to largest factor of the trip count less than
   292      // tSize.
   293      uint64_t constTripCount = mayConst.getValue();
   294      if (constTripCount > 1 && tSizeAdjusted > constTripCount / 2)
   295        tSizeAdjusted = constTripCount / 2;
   296      while (constTripCount % tSizeAdjusted != 0)
   297        tSizeAdjusted--;
   298    }
   299  }
   300  
   301  // Returns tile sizes to use. Checks CL options; if none are specified, sets it
   302  // based on a simple model that looks at the memory footprint and determines
   303  // tile sizes assuming identity accesses / 1:1 tile size proportional footprint
   304  // along each of the dimensions being tiled.
   305  // TODO(mlir-team): evolve this model. Tile size determination is a large area
   306  // to play with in general.
   307  void LoopTiling::getTileSizes(ArrayRef<AffineForOp> band,
   308                                SmallVectorImpl<unsigned> *tileSizes) {
   309    if (band.empty())
   310      return;
   311  
   312    tileSizes->resize(band.size());
   313  
   314    // Use clTileSize for all loops if specified.
   315    if (clTileSize.getNumOccurrences() > 0) {
   316      std::fill(tileSizes->begin(), tileSizes->end(), clTileSize);
   317      return;
   318    }
   319  
   320    // Use clTileSizes and fill them with default tile size if it's short.
   321    if (!clTileSizes.empty()) {
   322      std::fill(tileSizes->begin(), tileSizes->end(),
   323                LoopTiling::kDefaultTileSize);
   324      std::copy(clTileSizes.begin(),
   325                clTileSizes.begin() + std::min(clTileSizes.size(), band.size()),
   326                tileSizes->begin());
   327      return;
   328    }
   329  
   330    // The first loop in the band.
   331    auto rootForOp = band[0];
   332    (void)rootForOp;
   333  
   334    // Obtain memory footprint and set tile sizes so that a tile fits in
   335    // the cache size. This is an approximation with the assumption that the
   336    // footprint increases with the tile size linearly in that dimension (i.e.,
   337    // assumes one-to-one access function).
   338    auto fp = getMemoryFootprintBytes(band[0], 0);
   339    if (!fp.hasValue()) {
   340      // Fill with default tile sizes if footprint is unknown.
   341      std::fill(tileSizes->begin(), tileSizes->end(),
   342                LoopTiling::kDefaultTileSize);
   343      if (avoidMaxMinBounds)
   344        adjustToDivisorsOfTripCounts(band, tileSizes);
   345      LLVM_DEBUG(
   346          rootForOp.emitWarning("memory footprint unknown: using default tile "
   347                                "sizes adjusted to trip count divisors"));
   348      return;
   349    }
   350  
   351    // Check how many times larger the cache size is when compared to footprint.
   352    uint64_t excessFactor = llvm::divideCeil(fp.getValue(), cacheSizeBytes);
   353    if (excessFactor <= 1) {
   354      // No need of any tiling - set tile size to 1.
   355      std::fill(tileSizes->begin(), tileSizes->end(), 1);
   356      return;
   357    }
   358  
   359    // Divide all loops equally in an attempt to reduce footprint.
   360    // TODO(bondhugula): this is approximate. Ideally, obtain reuse factor /
   361    // profitability along each dimension and weight tile sizes based on that as
   362    // one possible approach. Or compute a polynomial in tile sizes and solve for
   363    // it.
   364  
   365    // For an n-d tilable band, compute n^th root of the excess.
   366    unsigned tSize =
   367        static_cast<unsigned>(floorl(std::pow(excessFactor, 1.0 / band.size())));
   368    // We'll keep a running product to determine the last tile size better.
   369    unsigned cumulProductOfTileSizes = 1;
   370    for (unsigned i = 0, e = band.size(); i < e; i++) {
   371      if (i < e - 1)
   372        (*tileSizes)[i] = tSize;
   373      else
   374        // Set last tile size to cover the balance.
   375        (*tileSizes)[i] = std::max(
   376            1U, static_cast<unsigned>(excessFactor / cumulProductOfTileSizes));
   377      cumulProductOfTileSizes *= (*tileSizes)[i];
   378    }
   379    if (avoidMaxMinBounds)
   380      adjustToDivisorsOfTripCounts(band, tileSizes);
   381  }
   382  
   383  void LoopTiling::runOnFunction() {
   384    // Override cache size if provided on command line.
   385    if (clCacheSizeKiB.getNumOccurrences() > 0)
   386      cacheSizeBytes = clCacheSizeKiB * 1024;
   387  
   388    // Bands of loops to tile.
   389    std::vector<SmallVector<AffineForOp, 6>> bands;
   390    getTileableBands(getFunction(), &bands);
   391  
   392    for (auto &band : bands) {
   393      // Set up tile sizes; fill missing tile sizes at the end with default tile
   394      // size or clTileSize if one was provided.
   395      SmallVector<unsigned, 6> tileSizes;
   396      getTileSizes(band, &tileSizes);
   397      if (llvm::DebugFlag) {
   398        auto diag = band[0].emitRemark("using tile sizes [");
   399        for (auto tSize : tileSizes)
   400          diag << tSize << " ";
   401        diag << "]\n";
   402      }
   403      if (failed(tileCodeGen(band, tileSizes)))
   404        return signalPassFailure();
   405    }
   406  }
   407  
   408  constexpr unsigned LoopTiling::kDefaultTileSize;
   409  constexpr uint64_t LoopTiling::kDefaultCacheMemCapacity;
   410  
   411  static PassRegistration<LoopTiling> pass("affine-loop-tile", "Tile loop nests");