github.com/johnnyeven/libtools@v0.0.0-20191126065708-61829c1adf46/third_party/mlir/lib/Transforms/AffineDataCopyGeneration.cpp

//===- AffineDataCopyGeneration.cpp - Explicit memref copying pass ------*-===//
//
// Copyright 2019 The MLIR Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
//
// This file implements a pass to automatically promote accessed memref regions
// to buffers in a faster memory space that is explicitly managed, with the
// necessary data movement operations performed through either regular
// point-wise load/store's or DMAs. Such explicit copying (also referred to as
// array packing/unpacking in the literature), when done on arrays that exhibit
// reuse, results in near elimination of conflict misses and TLB misses,
// reduced use of hardware prefetch streams, and reduced false sharing. It is
// also necessary for hardware with explicitly managed levels in the memory
// hierarchy, where DMAs may have to be used. This optimization is often
// performed on already tiled code.
//
//===----------------------------------------------------------------------===//

#include "mlir/Analysis/AffineStructures.h"
#include "mlir/Analysis/Utils.h"
#include "mlir/Dialect/AffineOps/AffineOps.h"
#include "mlir/Dialect/StandardOps/Ops.h"
#include "mlir/IR/Builders.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/Passes.h"
#include "mlir/Transforms/Utils.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include <algorithm>
#include <limits>

#define DEBUG_TYPE "affine-data-copy-generate"

using namespace mlir;
using llvm::SmallMapVector;

static llvm::cl::OptionCategory clOptionsCategory(DEBUG_TYPE " options");

static llvm::cl::opt<unsigned long long> clFastMemoryCapacity(
    "affine-data-copy-generate-fast-mem-capacity",
    llvm::cl::desc(
        "Set fast memory space capacity in KiB (default: unlimited)"),
    llvm::cl::cat(clOptionsCategory));

static llvm::cl::opt<bool>
    clDma("affine-data-copy-generate-dma",
          llvm::cl::desc("Generate DMA instead of point-wise copy"),
          llvm::cl::cat(clOptionsCategory), llvm::cl::init(true));

static llvm::cl::opt<unsigned> clFastMemorySpace(
    "affine-data-copy-generate-fast-mem-space", llvm::cl::init(0),
    llvm::cl::desc(
        "Fast memory space identifier for copy generation (default: 0)"),
    llvm::cl::cat(clOptionsCategory));

static llvm::cl::opt<bool> clSkipNonUnitStrideLoop(
    "affine-data-copy-generate-skip-non-unit-stride-loops", llvm::cl::Hidden,
    llvm::cl::init(false),
    llvm::cl::desc("Testing purposes: avoid non-unit stride loop choice depths "
                   "for copy placement"),
    llvm::cl::cat(clOptionsCategory));

namespace {

/// Replaces all loads and stores on memref's living in 'slowMemorySpace' by
/// introducing copy operations to transfer data into `fastMemorySpace` and
/// rewriting the original load's/store's to instead load/store from the
/// allocated fast memory buffers. Additional options specify the identifier
/// corresponding to the fast memory space and the amount of fast memory space
/// available. The pass traverses through the nesting structure, recursing to
/// inner levels if necessary to determine at what depth copies need to be
/// placed so that the allocated buffers fit within the memory capacity
/// provided.
// TODO(bondhugula): We currently can't generate copies correctly when stores
// are strided. Check for strided stores.
struct AffineDataCopyGeneration
    : public FunctionPass<AffineDataCopyGeneration> {
  explicit AffineDataCopyGeneration(
      unsigned slowMemorySpace = 0,
      unsigned fastMemorySpace = clFastMemorySpace, unsigned tagMemorySpace = 0,
      int minDmaTransferSize = 1024,
      uint64_t fastMemCapacityBytes =
          (clFastMemoryCapacity.getNumOccurrences() > 0
               ? clFastMemoryCapacity * 1024 // cl-provided size is in KiB
               : std::numeric_limits<uint64_t>::max()),
      bool generateDma = clDma,
      bool skipNonUnitStrideLoops = clSkipNonUnitStrideLoop)
      : slowMemorySpace(slowMemorySpace), fastMemorySpace(fastMemorySpace),
        tagMemorySpace(tagMemorySpace), minDmaTransferSize(minDmaTransferSize),
        fastMemCapacityBytes(fastMemCapacityBytes), generateDma(generateDma),
        skipNonUnitStrideLoops(skipNonUnitStrideLoops) {}

  explicit AffineDataCopyGeneration(const AffineDataCopyGeneration &other)
      : slowMemorySpace(other.slowMemorySpace),
        fastMemorySpace(other.fastMemorySpace),
        tagMemorySpace(other.tagMemorySpace),
        minDmaTransferSize(other.minDmaTransferSize),
        fastMemCapacityBytes(other.fastMemCapacityBytes),
        generateDma(other.generateDma),
        skipNonUnitStrideLoops(other.skipNonUnitStrideLoops) {}

  void runOnFunction() override;
  LogicalResult runOnBlock(Block *block);
  uint64_t runOnBlock(Block::iterator begin, Block::iterator end);

  LogicalResult generateCopy(const MemRefRegion &region, Block *block,
                             Block::iterator begin, Block::iterator end,
                             uint64_t *sizeInBytes, Block::iterator *nBegin,
                             Block::iterator *nEnd);

  // List of memory regions to copy for. We need a map vector to have a
  // guaranteed iteration order to write test cases. CHECK-DAG doesn't help here
  // since the alloc's for example are identical except for the SSA id.
  SmallMapVector<Value *, std::unique_ptr<MemRefRegion>, 4> readRegions;
  SmallMapVector<Value *, std::unique_ptr<MemRefRegion>, 4> writeRegions;

  // Nests that are copy in's or copy out's; the root AffineForOp of that
  // nest is stored herein.
  DenseSet<Operation *> copyNests;

  // Map from original memref's to the fast buffers that their accesses are
  // replaced with.
  DenseMap<Value *, Value *> fastBufferMap;

  // Slow memory space associated with copies.
  const unsigned slowMemorySpace;
  // Fast memory space associated with copies.
  unsigned fastMemorySpace;
  // Memory space associated with DMA tags.
  unsigned tagMemorySpace;
  // Minimum DMA transfer size supported by the target in bytes.
  const int minDmaTransferSize;
  // Capacity of the faster memory space.
  uint64_t fastMemCapacityBytes;

  // If set, generate DMA operations instead of read/write.
  bool generateDma;

  // If set, ignore loops with steps other than 1.
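  // (When set, runOnBlock below recurses past any loop whose step is not 1,
  // bypassing the footprint-vs-capacity check for that loop.)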
  bool skipNonUnitStrideLoops;

  // Constant zero index to avoid too many duplicates.
  Value *zeroIndex = nullptr;
};

} // end anonymous namespace

/// Generates copies for memref's living in 'slowMemorySpace' into newly created
/// buffers in 'fastMemorySpace', and replaces memory operations to the former
/// by the latter. Only load op's handled for now.
/// TODO(bondhugula): extend this to store op's.
std::unique_ptr<FunctionPassBase> mlir::createAffineDataCopyGenerationPass(
    unsigned slowMemorySpace, unsigned fastMemorySpace, unsigned tagMemorySpace,
    int minDmaTransferSize, uint64_t fastMemCapacityBytes) {
  return std::make_unique<AffineDataCopyGeneration>(
      slowMemorySpace, fastMemorySpace, tagMemorySpace, minDmaTransferSize,
      fastMemCapacityBytes);
}

// Info comprising stride and number of elements transferred every stride.
struct StrideInfo {
  int64_t stride;
  int64_t numEltPerStride;
};

/// Returns striding information for a copy/transfer of this region with
/// potentially multiple striding levels from outermost to innermost. For an
/// n-dimensional region, there can be at most n-1 levels of striding
/// successively nested.
// TODO(bondhugula): make this work with non-identity layout maps.
static void getMultiLevelStrides(const MemRefRegion &region,
                                 ArrayRef<int64_t> bufferShape,
                                 SmallVectorImpl<StrideInfo> *strideInfos) {
  if (bufferShape.size() <= 1)
    return;

  int64_t numEltPerStride = 1;
  int64_t stride = 1;
  for (int d = bufferShape.size() - 1; d >= 1; d--) {
    int64_t dimSize = region.memref->getType().cast<MemRefType>().getDimSize(d);
    stride *= dimSize;
    numEltPerStride *= bufferShape[d];
    // A stride is needed only if the region has a shorter extent than the
    // memref along the dimension *and* has an extent greater than one along the
    // next major dimension.
    if (bufferShape[d] < dimSize && bufferShape[d - 1] > 1) {
      strideInfos->push_back({stride, numEltPerStride});
    }
  }
}

/// Construct the memref region to just include the entire memref. Returns false
/// for dynamic shaped memref's for now. `numParamLoopIVs` is the number of
/// enclosing loop IVs of opInst (starting from the outermost) that the region
/// is parametric on.
static bool getFullMemRefAsRegion(Operation *opInst, unsigned numParamLoopIVs,
                                  MemRefRegion *region) {
  unsigned rank;
  if (auto loadOp = dyn_cast<AffineLoadOp>(opInst)) {
    rank = loadOp.getMemRefType().getRank();
    region->memref = loadOp.getMemRef();
    region->setWrite(false);
  } else if (auto storeOp = dyn_cast<AffineStoreOp>(opInst)) {
    rank = storeOp.getMemRefType().getRank();
    region->memref = storeOp.getMemRef();
    region->setWrite(true);
  } else {
    assert(false && "expected load or store op");
    return false;
  }
  auto memRefType = region->memref->getType().cast<MemRefType>();
  if (!memRefType.hasStaticShape())
    return false;

  auto *regionCst = region->getConstraints();

  // Just get the first numSymbols IVs, which the memref region is parametric
  // on.
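  // For example, for a static 128x256 memref with numParamLoopIVs = 1, the
  // region built below is {0 <= d0 <= 127, 0 <= d1 <= 255}, with the outermost
  // enclosing loop IV bound as the region's symbol (the constant bounds
  // themselves do not depend on it).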
  SmallVector<AffineForOp, 4> ivs;
  getLoopIVs(*opInst, &ivs);
  ivs.resize(numParamLoopIVs);
  SmallVector<Value *, 4> symbols;
  extractForInductionVars(ivs, &symbols);
  regionCst->reset(rank, numParamLoopIVs, 0);
  regionCst->setIdValues(rank, rank + numParamLoopIVs, symbols);

  // Memref dim sizes provide the bounds.
  for (unsigned d = 0; d < rank; d++) {
    auto dimSize = memRefType.getDimSize(d);
    assert(dimSize > 0 && "filtered dynamic shapes above");
    regionCst->addConstantLowerBound(d, 0);
    regionCst->addConstantUpperBound(d, dimSize - 1);
  }
  return true;
}

static InFlightDiagnostic LLVM_ATTRIBUTE_UNUSED
emitRemarkForBlock(Block &block) {
  return block.getParentOp()->emitRemark();
}

/// Generates a point-wise copy from/to `memref' to/from `fastMemRef' and
/// returns the outermost AffineForOp of the copy loop nest. `memIndicesStart'
/// holds the lower coordinates of the region in the original memref to copy
/// in/out. If `copyOut' is true, generates a copy-out; otherwise a copy-in.
static AffineForOp generatePointWiseCopy(Location loc, Value *memref,
                                         Value *fastMemRef,
                                         ArrayRef<Value *> memIndicesStart,
                                         ArrayRef<int64_t> fastBufferShape,
                                         bool isCopyOut, OpBuilder b) {
  assert(!memIndicesStart.empty() && "only 1-d or more memrefs");

  // The copy-in nest is generated as follows as an example for a 2-d region:
  // for x = ...
  //   for y = ...
  //     fast_buf[x][y] = buf[mem_x + x][mem_y + y]

  SmallVector<Value *, 4> fastBufIndices, memIndices;
  AffineForOp copyNestRoot;
  for (unsigned d = 0, e = fastBufferShape.size(); d < e; ++d) {
    auto forOp = b.create<AffineForOp>(loc, 0, fastBufferShape[d]);
    if (d == 0)
      copyNestRoot = forOp;
    b = forOp.getBodyBuilder();
    fastBufIndices.push_back(forOp.getInductionVar());
    // Construct the subscript for the slow memref being copied.
    SmallVector<Value *, 2> operands = {memIndicesStart[d],
                                        forOp.getInductionVar()};
    auto memIndex = b.create<AffineApplyOp>(
        loc,
        b.getAffineMap(2, 0, b.getAffineDimExpr(0) + b.getAffineDimExpr(1)),
        operands);
    memIndices.push_back(memIndex);
  }

  if (!isCopyOut) {
    // Copy in.
    auto load = b.create<AffineLoadOp>(loc, memref, memIndices);
    b.create<AffineStoreOp>(loc, load, fastMemRef, fastBufIndices);
    return copyNestRoot;
  }

  // Copy out.
  auto load = b.create<AffineLoadOp>(loc, fastMemRef, fastBufIndices);
  b.create<AffineStoreOp>(loc, load, memref, memIndices);
  return copyNestRoot;
}

/// Creates a buffer in the faster memory space for the specified region;
/// generates a copy from the lower memory space to this one, and replaces all
/// loads to load from that buffer. Returns failure if copies could not be
/// generated due to yet unimplemented cases. `begin` and `end` specify the
/// insertion points where the incoming copies and outgoing copies,
/// respectively, should be inserted (the insertion happens right before the
/// insertion point). Since `begin` can itself be invalidated due to the memref
/// rewriting done from this method, the output argument `nBegin` is set to its
/// replacement (set to `begin` if no invalidation happens). Since outgoing
/// copies are inserted at `end`, the output argument `nEnd` is set to the one
/// following the original end (since the latter could have been
/// invalidated/replaced). `sizeInBytes` is set to the size of the fast buffer
/// allocated.
LogicalResult AffineDataCopyGeneration::generateCopy(
    const MemRefRegion &region, Block *block, Block::iterator begin,
    Block::iterator end, uint64_t *sizeInBytes, Block::iterator *nBegin,
    Block::iterator *nEnd) {
  *nBegin = begin;
  *nEnd = end;

  if (begin == end)
    return success();

  // Copies for read regions are going to be inserted at 'begin'.
  OpBuilder prologue(block, begin);
  // Copies for write regions are going to be inserted at 'end'.
  OpBuilder epilogue(block, end);
  OpBuilder &b = region.isWrite() ? epilogue : prologue;

  // Builder to create constants at the top level.
  auto func = block->getParent()->getParentOfType<FuncOp>();
  OpBuilder top(func.getBody());

  auto loc = region.loc;
  auto *memref = region.memref;
  auto memRefType = memref->getType().cast<MemRefType>();

  auto layoutMaps = memRefType.getAffineMaps();
  if (layoutMaps.size() > 1 ||
      (layoutMaps.size() == 1 && !layoutMaps[0].isIdentity())) {
    LLVM_DEBUG(llvm::dbgs() << "Non-identity layout map not yet supported\n");
    return failure();
  }

  // Indices to use for the copying.
  // Indices for the original memref being copied from/to.
  SmallVector<Value *, 4> memIndices;
  // Indices for the faster buffer being copied into/from.
  SmallVector<Value *, 4> bufIndices;

  unsigned rank = memRefType.getRank();
  SmallVector<int64_t, 4> fastBufferShape;

  // Compute the extents of the buffer.
  std::vector<SmallVector<int64_t, 4>> lbs;
  SmallVector<int64_t, 8> lbDivisors;
  lbs.reserve(rank);
  Optional<int64_t> numElements = region.getConstantBoundingSizeAndShape(
      &fastBufferShape, &lbs, &lbDivisors);
  if (!numElements.hasValue()) {
    LLVM_DEBUG(llvm::dbgs() << "Non-constant region size not supported\n");
    return failure();
  }

  if (numElements.getValue() == 0) {
    LLVM_DEBUG(llvm::dbgs() << "Nothing to copy\n");
    *sizeInBytes = 0;
    return success();
  }

  const FlatAffineConstraints *cst = region.getConstraints();
  // 'regionSymbols' hold values that this memory region is symbolic/parametric
  // on; these typically include loop IVs surrounding the level at which the
  // copy generation is being done or other valid symbols in MLIR.
  SmallVector<Value *, 8> regionSymbols;
  cst->getIdValues(rank, cst->getNumIds(), &regionSymbols);

  // Construct the index expressions for the fast memory buffer. The index
  // expression for a particular dimension of the fast buffer is obtained by
  // subtracting out the lower bound on the original memref's data region
  // along the corresponding dimension.

  // Index start offsets for faster memory buffer relative to the original.
  SmallVector<AffineExpr, 4> offsets;
  offsets.reserve(rank);
  for (unsigned d = 0; d < rank; d++) {
    assert(lbs[d].size() == cst->getNumCols() - rank && "incorrect bound size");

    AffineExpr offset = top.getAffineConstantExpr(0);
    for (unsigned j = 0, e = cst->getNumCols() - rank - 1; j < e; j++) {
      offset = offset + lbs[d][j] * top.getAffineDimExpr(j);
    }
    assert(lbDivisors[d] > 0);
    offset =
        (offset + lbs[d][cst->getNumCols() - 1 - rank]).floorDiv(lbDivisors[d]);

    // Set copy start location for this dimension in the lower memory space
    // memref.
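    // For instance (illustrative), if the accessed region along this dimension
    // is [32*%i, 32*%i + 31], 'offset' comes out as 32*d0 with d0 bound to %i
    // through 'regionSymbols'; a constant offset (e.g. 0) is instead
    // materialized directly as a constant index below rather than through an
    // affine.apply.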
    if (auto caf = offset.dyn_cast<AffineConstantExpr>()) {
      auto indexVal = caf.getValue();
      if (indexVal == 0) {
        memIndices.push_back(zeroIndex);
      } else {
        memIndices.push_back(
            top.create<ConstantIndexOp>(loc, indexVal).getResult());
      }
    } else {
      // The coordinate for the start location is just the lower bound along the
      // corresponding dimension on the memory region (stored in 'offset').
      auto map = top.getAffineMap(
          cst->getNumDimIds() + cst->getNumSymbolIds() - rank, 0, offset);
      memIndices.push_back(b.create<AffineApplyOp>(loc, map, regionSymbols));
    }
    // The fast buffer is copied into at location zero; addressing is relative.
    bufIndices.push_back(zeroIndex);

    // Record the offsets since they are needed to remap the memory accesses of
    // the original memref further below.
    offsets.push_back(offset);
  }

  // The faster memory space buffer.
  Value *fastMemRef;

  // Check if a buffer was already created.
  bool existingBuf = fastBufferMap.count(memref) > 0;
  if (!existingBuf) {
    auto fastMemRefType = top.getMemRefType(
        fastBufferShape, memRefType.getElementType(), {}, fastMemorySpace);

    // Create the fast memory space buffer just before the 'affine.for'
    // operation.
    fastMemRef = prologue.create<AllocOp>(loc, fastMemRefType).getResult();
    // Record it.
    fastBufferMap[memref] = fastMemRef;
    // fastMemRefType is a constant shaped memref.
    *sizeInBytes = getMemRefSizeInBytes(fastMemRefType).getValue();
    LLVM_DEBUG(emitRemarkForBlock(*block)
               << "Creating fast buffer of type " << fastMemRefType
               << " and size " << llvm::divideCeil(*sizeInBytes, 1024)
               << " KiB\n");
  } else {
    // Reuse the one already created.
    fastMemRef = fastBufferMap[memref];
    *sizeInBytes = 0;
  }

  auto numElementsSSA =
      top.create<ConstantIndexOp>(loc, numElements.getValue());

  SmallVector<StrideInfo, 4> strideInfos;
  getMultiLevelStrides(region, fastBufferShape, &strideInfos);

  // TODO(bondhugula): use all stride levels once DmaStartOp is extended for
  // multi-level strides.
  if (strideInfos.size() > 1) {
    LLVM_DEBUG(llvm::dbgs() << "Only up to one level of stride supported\n");
    return failure();
  }

  Value *stride = nullptr;
  Value *numEltPerStride = nullptr;
  if (!strideInfos.empty()) {
    stride = top.create<ConstantIndexOp>(loc, strideInfos[0].stride);
    numEltPerStride =
        top.create<ConstantIndexOp>(loc, strideInfos[0].numEltPerStride);
  }

  // Record the last operation just before the point where we insert the
  // copy out's. We later do the memref replacement only in [begin,
  // postDomFilter] so that the original memref's in the data movement code
  // themselves don't get replaced.
  auto postDomFilter = std::prev(end);

  // Create fully composed affine maps for each memref.
  auto memAffineMap = b.getMultiDimIdentityMap(memIndices.size());
  fullyComposeAffineMapAndOperands(&memAffineMap, &memIndices);
  auto bufAffineMap = b.getMultiDimIdentityMap(bufIndices.size());
  fullyComposeAffineMapAndOperands(&bufAffineMap, &bufIndices);

  if (!generateDma) {
    auto copyNest = generatePointWiseCopy(loc, memref, fastMemRef, memIndices,
                                          fastBufferShape,
                                          /*isCopyOut=*/region.isWrite(), b);

    // Record this so that we can skip it from yet another copy.
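    // (runOnBlock skips operations recorded in 'copyNests', so the generated
    // copy loop nest is not itself considered again for copy generation.)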
    copyNests.insert(copyNest);

    if (region.isWrite())
      // Since new ops are being appended (for copy out's), adjust the end to
      // mark end of block range being processed.
      *nEnd = Block::iterator(copyNest.getOperation());
  } else {
    // Create a tag (single element 1-d memref) for the DMA.
    auto tagMemRefType =
        top.getMemRefType({1}, top.getIntegerType(32), {}, tagMemorySpace);
    auto tagMemRef = prologue.create<AllocOp>(loc, tagMemRefType);

    SmallVector<Value *, 4> tagIndices({zeroIndex});
    auto tagAffineMap = b.getMultiDimIdentityMap(tagIndices.size());
    fullyComposeAffineMapAndOperands(&tagAffineMap, &tagIndices);
    if (!region.isWrite()) {
      // DMA non-blocking read from original buffer to fast buffer.
      b.create<AffineDmaStartOp>(loc, memref, memAffineMap, memIndices,
                                 fastMemRef, bufAffineMap, bufIndices,
                                 tagMemRef, tagAffineMap, tagIndices,
                                 numElementsSSA, stride, numEltPerStride);
    } else {
      // DMA non-blocking write from fast buffer to the original memref.
      auto op = b.create<AffineDmaStartOp>(
          loc, fastMemRef, bufAffineMap, bufIndices, memref, memAffineMap,
          memIndices, tagMemRef, tagAffineMap, tagIndices, numElementsSSA,
          stride, numEltPerStride);
      // Since new ops are being appended (for outgoing DMAs), adjust the end to
      // mark end of block range being processed.
      *nEnd = Block::iterator(op.getOperation());
    }

    // Matching DMA wait to block on completion; tag always has a 0 index.
    b.create<AffineDmaWaitOp>(loc, tagMemRef, tagAffineMap, zeroIndex,
                              numElementsSSA);

    // Generate dealloc for the tag.
    auto tagDeallocOp = epilogue.create<DeallocOp>(loc, tagMemRef);
    if (*nEnd == end)
      // Since new ops are being appended (for outgoing DMAs), adjust the end to
      // mark end of range of the original.
      *nEnd = Block::iterator(tagDeallocOp.getOperation());
  }

  // Generate dealloc for the buffer.
  if (!existingBuf) {
    auto bufDeallocOp = epilogue.create<DeallocOp>(loc, fastMemRef);
    // When generating pointwise copies, `nEnd' has to be set to deallocOp on
    // the fast buffer (since it marks the new end insertion point).
    if (!generateDma && *nEnd == end)
      *nEnd = Block::iterator(bufDeallocOp.getOperation());
  }

  // Replace all uses of the old memref with the faster one while remapping
  // access indices (subtracting out lower bound offsets for each dimension).
  // Ex: to replace load %A[%i, %j] with load %Abuf[%i - %iT, %j - %jT],
  // index remap will be (%i, %j) -> (%i - %iT, %j - %jT),
  // i.e., affine.apply (d0, d1, d2, d3) -> (d2-d0, d3-d1) (%iT, %jT, %i, %j),
  // and (%iT, %jT) will be the 'extraOperands' for 'rep all memref uses with'.
  // d2, d3 correspond to the original indices (%i, %j).
  SmallVector<AffineExpr, 4> remapExprs;
  remapExprs.reserve(rank);
  for (unsigned i = 0; i < rank; i++) {
    // The starting operands of indexRemap will be regionSymbols (the symbols on
    // which the memref region is parametric); then those corresponding to
    // the memref's original indices follow.
    auto dimExpr = b.getAffineDimExpr(regionSymbols.size() + i);
    remapExprs.push_back(dimExpr - offsets[i]);
  }
  auto indexRemap = b.getAffineMap(regionSymbols.size() + rank, 0, remapExprs);

  // Record the begin since it may be invalidated by memref replacement.
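  // If 'begin' was the first operation of the block, block->begin() can simply
  // be re-taken afterwards; otherwise the operation just before 'begin' is left
  // untouched by the replacement and std::next(prev) yields the updated begin.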
  Block::iterator prev;
  bool wasAtStartOfBlock = (begin == block->begin());
  if (!wasAtStartOfBlock)
    prev = std::prev(begin);

  // *Only* those uses within the range [begin, end) of 'block' are replaced.
  replaceAllMemRefUsesWith(memref, fastMemRef,
                           /*extraIndices=*/{}, indexRemap,
                           /*extraOperands=*/regionSymbols,
                           /*domInstFilter=*/&*begin,
                           /*postDomInstFilter=*/&*postDomFilter);

  *nBegin = wasAtStartOfBlock ? block->begin() : std::next(prev);

  return success();
}

/// Generate copies for this block. The block is partitioned into separate
/// ranges: each range is either a sequence of one or more operations starting
/// and ending with an affine load or store op, or just an affine.for op (which
/// could have other affine for op's nested within).
LogicalResult AffineDataCopyGeneration::runOnBlock(Block *block) {
  if (block->empty())
    return success();

  copyNests.clear();

  // Every affine.for op in the block starts and ends a block range for copying.
  // A contiguous sequence of operations starting and ending with a load/store
  // op is also identified as a copy block range. Straightline code (a
  // contiguous chunk of operations excluding AffineForOp's) is always assumed
  // to not exhaust memory. As a result, this approach is conservative in some
  // cases at the moment; we do a check later and report an error with location
  // info.
  // TODO(bondhugula): An 'affine.if' operation is being treated like any other
  // operation. 'affine.if''s could have 'affine.for's in them;
  // treat them separately.

  // Get to the first load, store, or for op (that is not a copy nest itself).
  auto curBegin =
      std::find_if(block->begin(), block->end(), [&](Operation &op) {
        return (isa<AffineLoadOp>(op) || isa<AffineStoreOp>(op) ||
                isa<AffineForOp>(op)) &&
               copyNests.count(&op) == 0;
      });

  for (auto it = curBegin; it != block->end(); ++it) {
    AffineForOp forOp;
    if ((forOp = dyn_cast<AffineForOp>(&*it)) && copyNests.count(forOp) == 0) {
      // Returns true if the footprint is known to exceed capacity.
      auto exceedsCapacity = [&](AffineForOp forOp) {
        Optional<int64_t> footprint =
            getMemoryFootprintBytes(forOp,
                                    /*memorySpace=*/0);
        return (footprint.hasValue() &&
                static_cast<uint64_t>(footprint.getValue()) >
                    fastMemCapacityBytes);
      };

      // If the memory footprint of the 'affine.for' loop is higher than fast
      // memory capacity (when provided), we recurse to copy at an inner level
      // until we find a depth at which footprint fits in fast mem capacity. If
      // the footprint can't be calculated, we assume for now it fits. Recurse
      // inside if footprint for 'forOp' exceeds capacity, or when
      // skipNonUnitStrideLoops is set and the step size is not one.
      bool recurseInner = skipNonUnitStrideLoops ? forOp.getStep() != 1
                                                 : exceedsCapacity(forOp);
      if (recurseInner) {
        // We'll recurse and do the copies at an inner level for 'forOp'.
        runOnBlock(/*begin=*/curBegin, /*end=*/it);
        // Recurse onto the body of this loop.
        runOnBlock(forOp.getBody());
        // The next block range starts right after the 'affine.for' operation.
        curBegin = std::next(it);
      } else {
        // We have enough capacity, i.e., copies will be computed for the
        // portion of the block until 'it', and for 'it', which is 'forOp'. Note
        // that for the latter, the copies are placed just before this loop (for
        // incoming copies) and right after (for outgoing ones).
        runOnBlock(/*begin=*/curBegin, /*end=*/it);

        // Inner loop copies have their own scope - we thus don't update
        // consumed capacity. The footprint check above guarantees this inner
        // loop's footprint fits.
        runOnBlock(/*begin=*/it, /*end=*/std::next(it));
        curBegin = std::next(it);
      }
    } else if (!isa<AffineLoadOp>(&*it) && !isa<AffineStoreOp>(&*it)) {
      runOnBlock(/*begin=*/curBegin, /*end=*/it);
      curBegin = std::next(it);
    }
  }

  // Generate the copy for the final block range.
  if (curBegin != block->end()) {
    // Can't be a terminator because it would have been skipped above.
    assert(!curBegin->isKnownTerminator() && "can't be a terminator");
    runOnBlock(/*begin=*/curBegin, /*end=*/block->end());
  }

  return success();
}

/// Given a memref region, determine the lowest depth at which transfers can be
/// placed for it, and return the corresponding block, start and end positions
/// in the block for placing incoming (read) and outgoing (write) copies
/// respectively. The lowest depth depends on whether the region being accessed
/// is invariant with respect to one or more immediately surrounding loops.
static void
findHighestBlockForPlacement(const MemRefRegion &region, Block &block,
                             Block::iterator &begin, Block::iterator &end,
                             Block **copyPlacementBlock,
                             Block::iterator *copyPlacementReadStart,
                             Block::iterator *copyPlacementWriteStart) {
  const auto *cst = region.getConstraints();
  SmallVector<Value *, 4> symbols;
  cst->getIdValues(cst->getNumDimIds(), cst->getNumDimAndSymbolIds(), &symbols);

  SmallVector<AffineForOp, 4> enclosingFors;
  getLoopIVs(*block.begin(), &enclosingFors);
  // Walk up loop parents till we find an IV on which this region is
  // symbolic/variant.
  auto it = enclosingFors.rbegin();
  for (auto e = enclosingFors.rend(); it != e; ++it) {
    // TODO(bondhugula): also need to be checking this for region symbols that
    // aren't loop IVs, whether we are within their resp. defs' dominance scope.
    if (llvm::is_contained(symbols, it->getInductionVar()))
      break;
  }

  if (it != enclosingFors.rbegin()) {
    auto lastInvariantIV = *std::prev(it);
    *copyPlacementReadStart = Block::iterator(lastInvariantIV.getOperation());
    *copyPlacementWriteStart = std::next(*copyPlacementReadStart);
    *copyPlacementBlock = lastInvariantIV.getOperation()->getBlock();
  } else {
    *copyPlacementReadStart = begin;
    *copyPlacementWriteStart = end;
    *copyPlacementBlock = &block;
  }
}

/// Generates copies for a contiguous sequence of operations in `block` in the
/// iterator range [begin, end). Returns the total size of the fast buffers
/// used.
// Since we generate alloc's and dealloc's for all fast buffers (before and
// after the range of operations resp.), all of the fast memory capacity is
// assumed to be available for processing this block range.
uint64_t AffineDataCopyGeneration::runOnBlock(Block::iterator begin,
                                              Block::iterator end) {
  if (begin == end)
    return 0;

  assert(begin->getBlock() == std::prev(end)->getBlock() &&
         "Inconsistent args");

  Block *block = begin->getBlock();

  // Copies will be generated for this depth, i.e., symbolic in all loops
  // surrounding this block range.
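  // For example, for a block range nested within loops on %i and %j, copyDepth
  // is 2 and the regions computed below are parametric in (%i, %j).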
  unsigned copyDepth = getNestingDepth(*begin);

  LLVM_DEBUG(llvm::dbgs() << "Generating copies at depth " << copyDepth
                          << "\n");

  readRegions.clear();
  writeRegions.clear();
  fastBufferMap.clear();

  // To check for errors when walking the block.
  bool error = false;

  // Walk this range of operations to gather all memory regions.
  block->walk(begin, end, [&](Operation *opInst) {
    // Gather regions to allocate to buffers in faster memory space.
    if (auto loadOp = dyn_cast<AffineLoadOp>(opInst)) {
      if (loadOp.getMemRefType().getMemorySpace() != slowMemorySpace)
        return;
    } else if (auto storeOp = dyn_cast<AffineStoreOp>(opInst)) {
      if (storeOp.getMemRefType().getMemorySpace() != slowMemorySpace)
        return;
    } else {
      // Neither load nor a store op.
      return;
    }

    // Compute the MemRefRegion accessed.
    auto region = std::make_unique<MemRefRegion>(opInst->getLoc());
    if (failed(region->compute(opInst, copyDepth))) {
      LLVM_DEBUG(llvm::dbgs()
                 << "Error obtaining memory region: semi-affine maps?\n");
      LLVM_DEBUG(llvm::dbgs() << "over-approximating to the entire memref\n");
      if (!getFullMemRefAsRegion(opInst, copyDepth, region.get())) {
        LLVM_DEBUG(
            opInst->emitError("Non-constant memref sizes not yet supported"));
        error = true;
        return;
      }
    }

    // Each memref has a single buffer associated with it irrespective of how
    // many load's and store's happen on it.
    // TODO(bondhugula): in the future, when regions don't intersect and satisfy
    // other properties (based on load/store regions), we could consider
    // multiple buffers per memref.

    // Add to the appropriate region if it's not already in it, or take a
    // bounding box union with the existing one if it's already in there.
    // Note that a memref may have both read and write regions - so update the
    // region in the other list if one exists (write in case of read and vice
    // versa) since there is a single bounding box for a memref across all reads
    // and writes that happen on it.

    // Attempts to update; returns true if 'region' exists in targetRegions.
    auto updateRegion =
        [&](const SmallMapVector<Value *, std::unique_ptr<MemRefRegion>, 4>
                &targetRegions) {
          auto it = targetRegions.find(region->memref);
          if (it == targetRegions.end())
            return false;

          // Perform a union with the existing region.
          if (failed(it->second->unionBoundingBox(*region))) {
            LLVM_DEBUG(llvm::dbgs()
                       << "Memory region bounding box failed; "
                          "over-approximating to the entire memref\n");
            // If the union fails, we will overapproximate.
            if (!getFullMemRefAsRegion(opInst, copyDepth, region.get())) {
              LLVM_DEBUG(opInst->emitError(
                  "Non-constant memref sizes not yet supported"));
              error = true;
              return true;
            }
            it->second->getConstraints()->clearAndCopyFrom(
                *region->getConstraints());
          } else {
            // Union was computed and stored in 'it->second': copy to 'region'.
            region->getConstraints()->clearAndCopyFrom(
                *it->second->getConstraints());
          }
          return true;
        };

    bool existsInRead = updateRegion(readRegions);
    if (error)
      return;
    bool existsInWrite = updateRegion(writeRegions);
    if (error)
      return;

    // Finally add it to the region list.
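    // (A memref that is both read and written ends up with an entry in each of
    // 'readRegions' and 'writeRegions'; updateRegion above keeps the two
    // bounding boxes in sync.)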
    if (region->isWrite() && !existsInWrite) {
      writeRegions[region->memref] = std::move(region);
    } else if (!region->isWrite() && !existsInRead) {
      readRegions[region->memref] = std::move(region);
    }
  });

  if (error) {
    begin->emitError(
        "copy generation failed for one or more memref's in this block\n");
    return 0;
  }

  uint64_t totalCopyBuffersSizeInBytes = 0;
  bool ret = true;
  auto processRegions =
      [&](const SmallMapVector<Value *, std::unique_ptr<MemRefRegion>, 4>
              &regions) {
        for (const auto &regionEntry : regions) {
          // For each region, hoist copy in/out past all invariant
          // 'affine.for's.
          Block::iterator copyPlacementReadStart, copyPlacementWriteStart;
          Block *copyPlacementBlock;
          findHighestBlockForPlacement(
              *regionEntry.second, *block, begin, end, &copyPlacementBlock,
              &copyPlacementReadStart, &copyPlacementWriteStart);

          uint64_t sizeInBytes;
          Block::iterator nBegin, nEnd;
          LogicalResult iRet = generateCopy(
              *regionEntry.second, copyPlacementBlock, copyPlacementReadStart,
              copyPlacementWriteStart, &sizeInBytes, &nBegin, &nEnd);
          if (succeeded(iRet)) {
            // copyPlacementReadStart/WriteStart (or begin/end) may be
            // invalidated; use nBegin, nEnd to reset.
            if (copyPlacementBlock == block) {
              begin = nBegin;
              end = nEnd;
            }
            totalCopyBuffersSizeInBytes += sizeInBytes;
          }
          ret = ret & succeeded(iRet);
        }
      };
  processRegions(readRegions);
  processRegions(writeRegions);

  if (!ret) {
    begin->emitError(
        "copy generation failed for one or more memref's in this block\n");
    return totalCopyBuffersSizeInBytes;
  }

  // For a range of operations, a note will be emitted at the caller.
  AffineForOp forOp;
  uint64_t sizeInKib = llvm::divideCeil(totalCopyBuffersSizeInBytes, 1024);
  if (llvm::DebugFlag && (forOp = dyn_cast<AffineForOp>(&*begin))) {
    forOp.emitRemark()
        << sizeInKib
        << " KiB of copy buffers in fast memory space for this block\n";
  }

  if (totalCopyBuffersSizeInBytes > fastMemCapacityBytes) {
    StringRef str = "Total size of all copy buffers for this block "
                    "exceeds fast memory capacity\n";
    block->getParentOp()->emitError(str);
  }

  return totalCopyBuffersSizeInBytes;
}

void AffineDataCopyGeneration::runOnFunction() {
  FuncOp f = getFunction();
  OpBuilder topBuilder(f.getBody());
  zeroIndex = topBuilder.create<ConstantIndexOp>(f.getLoc(), 0);

  for (auto &block : f)
    runOnBlock(&block);
}

static PassRegistration<AffineDataCopyGeneration>
    pass("affine-data-copy-generate",
         "Generate explicit copying for memory operations");
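// Example usage (a sketch; assumes an mlir-opt-style driver with this pass and
// its llvm::cl options registered, and the option values shown are arbitrary):
//   mlir-opt -affine-data-copy-generate \
//            -affine-data-copy-generate-fast-mem-space=2 \
//            -affine-data-copy-generate-fast-mem-capacity=1024 input.mlir
// The pass can also be constructed programmatically via
// createAffineDataCopyGenerationPass(...) declared above.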