github.com/johnnyeven/libtools@v0.0.0-20191126065708-61829c1adf46/third_party/mlir/lib/Transforms/Vectorize.cpp 1 //===- Vectorize.cpp - Vectorize Pass Impl --------------------------------===// 2 // 3 // Copyright 2019 The MLIR Authors. 4 // 5 // Licensed under the Apache License, Version 2.0 (the "License"); 6 // you may not use this file except in compliance with the License. 7 // You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 // ============================================================================= 17 // 18 // This file implements vectorization of loops, operations and data types to 19 // a target-independent, n-D super-vector abstraction. 20 // 21 //===----------------------------------------------------------------------===// 22 23 #include "mlir/Analysis/LoopAnalysis.h" 24 #include "mlir/Analysis/NestedMatcher.h" 25 #include "mlir/Analysis/SliceAnalysis.h" 26 #include "mlir/Analysis/Utils.h" 27 #include "mlir/Analysis/VectorAnalysis.h" 28 #include "mlir/Dialect/AffineOps/AffineOps.h" 29 #include "mlir/Dialect/StandardOps/Ops.h" 30 #include "mlir/Dialect/VectorOps/VectorOps.h" 31 #include "mlir/IR/AffineExpr.h" 32 #include "mlir/IR/Builders.h" 33 #include "mlir/IR/Location.h" 34 #include "mlir/IR/Types.h" 35 #include "mlir/Pass/Pass.h" 36 #include "mlir/Support/Functional.h" 37 #include "mlir/Support/LLVM.h" 38 #include "mlir/Transforms/Passes.h" 39 40 #include "llvm/ADT/DenseMap.h" 41 #include "llvm/ADT/DenseSet.h" 42 #include "llvm/ADT/SetVector.h" 43 #include "llvm/ADT/SmallString.h" 44 #include "llvm/ADT/SmallVector.h" 45 #include "llvm/Support/CommandLine.h" 46 #include "llvm/Support/Debug.h" 47 48 using namespace mlir; 49 50 /// 51 /// Implements a high-level vectorization strategy on a Function. 52 /// The abstraction used is that of super-vectors, which provide a single, 53 /// compact representation, in the vector types, of the information that is expected 54 /// to reduce the impact of the phase ordering problem. 55 /// 56 /// Vector granularity: 57 /// =================== 58 /// This pass is designed to perform vectorization at a super-vector 59 /// granularity. A super-vector is loosely defined as a vector type that is a 60 /// multiple of a "good" vector size so the HW can efficiently implement a set 61 /// of high-level primitives. Multiple is understood along any dimension; e.g. 62 /// both vector<16xf32> and vector<2x8xf32> are valid super-vectors for a 63 /// vector<8xf32> HW vector. Note that a "good vector size so the HW can 64 /// efficiently implement a set of high-level primitives" is not necessarily an 65 /// integer multiple of actual hardware registers. We leave details of this 66 /// distinction unspecified for now. 67 /// 68 /// Some may prefer the terminology a "tile of HW vectors". In this case, one 69 /// should note that super-vectors implement an "always full tile" abstraction. 70 /// They guarantee no partial-tile separation is necessary by relying on a 71 /// high-level copy-reshape abstraction that we call vector.transfer. This 72 /// copy-reshape operation is also responsible for performing layout 73 /// transposition if necessary.
In the general case this will require a scoped 74 /// allocation in some notional local memory. 75 /// 76 /// Whatever the mental model one prefers to use for this abstraction, the key 77 /// point is that we burn, into a single compact representation in the vector 78 /// types, the information that is expected to reduce the impact of the phase 79 /// ordering problem. Indeed, a vector type conveys information that: 80 /// 1. the associated loops have dependency semantics that do not prevent 81 /// vectorization; 82 /// 2. the associated loops have been sliced in chunks of static sizes that are 83 /// compatible with vector sizes (i.e. similar to unroll-and-jam); 84 /// 3. the inner loops, in the unroll-and-jam analogy of 2, are captured by 85 /// the 86 /// vector type and no vectorization-hampering transformations can be 87 /// applied to them anymore; 88 /// 4. the underlying memrefs are accessed in some notional contiguous way 89 /// that allows loading into vectors with some amount of spatial locality; 90 /// In other words, super-vectorization provides a level of separation of 91 /// concerns by way of opacity to subsequent passes. This has the effect of 92 /// encapsulating and propagating vectorization constraints down the list of 93 /// passes until we are ready to lower further. 94 /// 95 /// For a particular target, a notion of minimal n-d vector size will be 96 /// specified and vectorization targets a multiple of those. In the following 97 /// paragraph, let "k ." represent "a multiple of", to be understood as a 98 /// multiple in the same dimension (e.g. vector<16 x k . 128> summarizes 99 /// vector<16 x 128>, vector<16 x 256>, vector<16 x 1024>, etc). 100 /// 101 /// Some non-exhaustive notable super-vector sizes of interest include: 102 /// - CPU: vector<k . HW_vector_size>, 103 /// vector<k' . core_count x k . HW_vector_size>, 104 /// vector<socket_count x k' . core_count x k . HW_vector_size>; 105 /// - GPU: vector<k . warp_size>, 106 /// vector<k . warp_size x float2>, 107 /// vector<k . warp_size x float4>, 108 /// vector<k . warp_size x 4 x 4 x 4> (for tensor_core sizes). 109 /// 110 /// Loops and operations are emitted that operate on those super-vector shapes. 111 /// Subsequent lowering passes will materialize to actual HW vector sizes. These 112 /// passes are expected to be (gradually) more target-specific. 113 /// 114 /// At a high level, a vectorized load in a loop will resemble: 115 /// ```mlir 116 /// affine.for %i = ? to ? step ? { 117 /// %v_a = vector.transfer_read A[%i] : memref<?xf32>, vector<128xf32> 118 /// } 119 /// ``` 120 /// It is the responsibility of the implementation of vector.transfer_read to 121 /// materialize vector registers from the original scalar memrefs. A later (more 122 /// target-dependent) lowering pass will materialize to actual HW vector sizes. 123 /// This lowering may occur at different times: 124 /// 1. at the MLIR level into a combination of loops, unrolling, DmaStartOp + 125 /// DmaWaitOp + vectorized operations for data transformations and shuffle; 126 /// thus opening opportunities for unrolling and pipelining. This is an 127 /// instance of library call "whiteboxing"; or 128 /// 2. later in a target-specific lowering pass or hand-written library 129 /// call; achieving full separation of concerns. This is an instance of a 130 /// library call; or 131 /// 3. a mix of both, e.g. based on a model. 132 /// In the future, these operations will expose a contract to constrain the 133 /// search on vectorization patterns and sizes.
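///
/// Symmetrically, the store side is handled by vector.transfer_write; a
/// minimal sketch (same notation as the vectorized load above and the worked
/// examples further below):
/// ```mlir
/// affine.for %i = ? to ? step ? {
///   vector.transfer_write %v_a, A[%i] : vector<128xf32>, memref<?xf32>
/// }
/// ```
/// It is likewise the responsibility of vector.transfer_write to write the
/// super-vector back to the original scalar memref, performing any required
/// layout transposition on the way out.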
134 /// 135 /// Occurrence of super-vectorization in the compiler flow: 136 /// ======================================================= 137 /// This is an active area of investigation. We start with 2 remarks to position 138 /// super-vectorization in the context of existing ongoing work: LLVM VPLAN 139 /// and LLVM SLP Vectorizer. 140 /// 141 /// LLVM VPLAN: 142 /// ----------- 143 /// The astute reader may have noticed that in the limit, super-vectorization 144 /// can be applied at a similar time and with similar objectives as VPLAN. 145 /// For instance, this could be done in a traditional, polyhedral compilation flow (for 146 /// instance, the PPCG project uses ISL to provide dependence analysis, 147 /// multi-level scheduling + tiling, lifting of the footprint to fast memory, 148 /// communication synthesis, mapping, register optimizations), after these steps and before 149 /// unrolling. When vectorization is applied at this *late* level in a typical 150 /// polyhedral flow, and is instantiated with actual hardware vector sizes, 151 /// super-vectorization is expected to match (or subsume) the type of patterns 152 /// that LLVM's VPLAN aims at targeting. The main difference here is that MLIR 153 /// is higher level and our implementation should be significantly simpler. Also 154 /// note that in this mode, recursive patterns are probably a bit of an overkill 155 /// although it is reasonable to expect that mixing a bit of outer loop and 156 /// inner loop vectorization + unrolling will provide interesting choices to 157 /// MLIR. 158 /// 159 /// LLVM SLP Vectorizer: 160 /// -------------------- 161 /// Super-vectorization however is not meant to be usable in a similar fashion 162 /// to the SLP vectorizer. The main difference lies in the information that 163 /// both vectorizers use: super-vectorization examines contiguity of memory 164 /// references along fastest varying dimensions and loops with recursive nested 165 /// patterns capturing imperfectly-nested loop nests; the SLP vectorizer, on 166 /// the other hand, performs flat pattern matching inside a single unrolled loop 167 /// body and stitches together pieces of load and store operations into full 168 /// 1-D vectors. We envision that the SLP vectorizer is a good way to capture 169 /// innermost loop, control-flow dependent patterns that super-vectorization may 170 /// not be able to capture easily. In other words, super-vectorization does not 171 /// aim at replacing the SLP vectorizer and the two solutions are complementary. 172 /// 173 /// Ongoing investigations: 174 /// ----------------------- 175 /// We discuss the following *early* places where super-vectorization is 176 /// applicable and touch on the expected benefits and risks. We list the 177 /// opportunities in the context of the traditional polyhedral compiler flow 178 /// described in PPCG. There are essentially 6 places in the MLIR pass pipeline 179 /// we expect to experiment with super-vectorization: 180 /// 1. Right after language lowering to MLIR: this is the earliest time where 181 /// super-vectorization is expected to be applied. At this level, all the 182 /// language/user/library-level annotations are available and can be fully 183 /// exploited.
Examples include loop-type annotations (such as parallel, 184 /// reduction, scan, dependence distance vector, vectorizable) as well as 185 /// memory access annotations (such as non-aliasing writes guaranteed, 186 /// indirect accesses that are permutations by construction), or 187 /// that a particular operation is prescribed atomic by the user. At this 188 /// level, anything that enriches what dependence analysis can do should be 189 /// aggressively exploited. At this level we are close to having explicit 190 /// vector types in the language, except we do not impose that burden on the 191 /// programmer/library: we derive information from scalar code + annotations. 192 /// 2. After dependence analysis and before polyhedral scheduling: the 193 /// information that supports vectorization does not need to be supplied by a 194 /// higher level of abstraction. Traditional dependence analysis is available 195 /// in MLIR and will be used to drive vectorization and cost models. 196 /// 197 /// Let's pause here and remark that applying super-vectorization as described 198 /// in 1. and 2. presents clear opportunities and risks: 199 /// - the opportunity is that vectorization is burned into the type system and 200 /// is protected from the adverse effects of loop scheduling, tiling, loop 201 /// interchange and all passes downstream. Provided that subsequent passes are 202 /// able to operate on vector types, the vector shapes, associated loop 203 /// iterator properties, alignment, and contiguity of fastest varying 204 /// dimensions are preserved until we lower the super-vector types. We expect 205 /// this to significantly rein in the adverse effects of phase ordering. 206 /// - the risks are that a. all passes after super-vectorization have to work 207 /// on elemental vector types (note that this is always true wherever 208 /// vectorization is applied) and b. that imposing vectorization constraints 209 /// too early may be overall detrimental to loop fusion, tiling and other 210 /// transformations because the dependence distances are coarsened when 211 /// operating on elemental vector types. For this reason, the pattern 212 /// profitability analysis should include a component that also captures the 213 /// maximal amount of fusion available under a particular pattern. This is 214 /// still at the stage of rough ideas but in this context, search is our 215 /// friend as the Tensor Comprehensions and auto-TVM contributions 216 /// demonstrated previously. 217 /// The bottom line is that we do not yet have good answers for the above but aim at 218 /// making it easy to answer such questions. 219 /// 220 /// Back to our listing, the last places where early super-vectorization makes 221 /// sense are: 222 /// 3. right after polyhedral-style scheduling: PLUTO-style algorithms are known 223 /// to improve locality, parallelism and be configurable (e.g. max-fuse, 224 /// smart-fuse etc). They can also have adverse effects on contiguity 225 /// properties that are required for vectorization but the vector.transfer 226 /// copy-reshape-pad-transpose abstraction is expected to help recapture 227 /// these properties. 228 /// 4. right after polyhedral-style scheduling+tiling; 229 /// 5. right after scheduling+tiling+rescheduling: points 4 and 5 represent 230 /// probably the most promising places because applying tiling achieves a 231 /// separation of concerns that allows rescheduling to worry less about 232 /// locality and more about parallelism and distribution (e.g. min-fuse).
233 /// 234 /// At these levels the risk-reward looks different: on one hand we probably 235 /// lost a good deal of language/user/library-level annotation; on the other 236 /// hand we gained parallelism and locality through scheduling and tiling. 237 /// However, we probably want to ensure tiling is compatible with the 238 /// full-tile-only abstraction used in super-vectorization or suffer the 239 /// consequences. It is too early to place bets on what will win but we expect 240 /// super-vectorization to be the right abstraction to allow exploring at all 241 /// these levels. And again, search is our friend. 242 /// 243 /// Lastly, we mention it again here: 244 /// 6. as an MLIR-based alternative to VPLAN. 245 /// 246 /// Lowering, unrolling, pipelining: 247 /// ================================ 248 /// TODO(ntv): point to the proper places. 249 /// 250 /// Algorithm: 251 /// ========== 252 /// The algorithm proceeds in a few steps: 253 /// 1. defining super-vectorization patterns and matching them on the tree of 254 /// AffineForOp. A super-vectorization pattern is defined as a recursive 255 /// data structure that matches and captures nested, imperfectly-nested 256 /// loops that have a. conformable loop annotations attached (e.g. parallel, 257 /// reduction, vectorizable, ...) as well as b. all contiguous load/store 258 /// operations along a specified minor dimension (not necessarily the 259 /// fastest varying); 260 /// 2. analyzing those patterns for profitability (TODO(ntv): and 261 /// interference); 262 /// 3. Then, for each pattern in order: 263 /// a. applying iterative rewriting of the loop and the load operations in 264 /// DFS postorder. Rewriting is implemented by coarsening the loops and 265 /// turning load operations into opaque vector.transfer_read ops; 266 /// b. keeping track of the load operations encountered as "roots" and the 267 /// store operations as "terminals"; 268 /// c. traversing the use-def chains starting from the roots and iteratively 269 /// propagating vectorized values. Scalar values that are encountered 270 /// during this process must come from outside the scope of the current 271 /// pattern (TODO(ntv): enforce this and generalize). Such a scalar value 272 /// is vectorized only if it is a constant (into a vector splat). The 273 /// non-constant case is not supported for now and results in the pattern 274 /// failing to vectorize; 275 /// d. performing a second traversal on the terminals (store ops) to 276 /// rewrite the scalar value they write to memory into vector form. 277 /// If the scalar value has been vectorized previously, we simply replace 278 /// it by its vector form. Otherwise, if the scalar value is a constant, 279 /// it is vectorized into a splat. In all other cases, vectorization for 280 /// the pattern currently fails. 281 /// e. if everything under the root AffineForOp in the current pattern 282 /// vectorizes properly, we commit that loop to the IR. Otherwise we 283 /// discard it and restore a previously cloned version of the loop. Thanks 284 /// to the recursive scoping nature of matchers and captured patterns, 285 /// this is transparently achieved by a simple RAII implementation. 286 /// f. vectorization is applied on the next pattern in the list. Because 287 /// pattern interference avoidance is not yet implemented and we do 288 /// not support further vectorizing an already vectorized load, we need to 289 /// re-verify that the pattern is still vectorizable.
This is expected to 290 /// make cost models more difficult to write and is subject to improvement 291 /// in the future. 292 /// 293 /// Points c. and d. above are worth additional comment. In most passes that 294 /// do not change the type of operands, it is usually preferred to eagerly 295 /// `replaceAllUsesWith`. Unfortunately this does not work for vectorization 296 /// because during the use-def chain traversal, all the operands of an operation 297 /// must be available in vector form. Trying to propagate eagerly makes the IR 298 /// temporarily invalid and results in errors such as: 299 /// `vectorize.mlir:308:13: error: 'addf' op requires the same type for all 300 /// operands and results 301 /// %s5 = addf %a5, %b5 : f32` 302 /// 303 /// Lastly, we show a minimal example for which use-def chains rooted in load / 304 /// vector.transfer_read are not enough. This is what motivated splitting 305 /// terminal processing out of the use-def chains starting from loads. In the 306 /// following snippet, there is simply no load:: 307 /// ```mlir 308 /// mlfunc @fill(%A : memref<128xf32>) -> () { 309 /// %f1 = constant 1.0 : f32 310 /// affine.for %i0 = 0 to 32 { 311 /// store %f1, %A[%i0] : memref<128xf32, 0> 312 /// } 313 /// return 314 /// } 315 /// ``` 316 /// 317 /// Choice of loop transformation to support the algorithm: 318 /// ======================================================= 319 /// The choice of loop transformation to apply for coarsening vectorized loops 320 /// is still subject to exploratory tradeoffs. In particular, say we want to 321 /// vectorize by a factor 128, we want to transform the following input: 322 /// ```mlir 323 /// affine.for %i = %M to %N { 324 /// %a = load A[%i] : memref<?xf32> 325 /// } 326 /// ``` 327 /// 328 /// Traditionally, one would vectorize late (after scheduling, tiling, 329 /// memory promotion etc) say after stripmining (and potentially unrolling in 330 /// the case of LLVM's SLP vectorizer): 331 /// ```mlir 332 /// affine.for %i = floor(%M, 128) to ceil(%N, 128) { 333 /// affine.for %ii = max(%M, 128 * %i) to min(%N, 128*%i + 127) { 334 /// %a = load A[%ii] : memref<?xf32> 335 /// } 336 /// } 337 /// ``` 338 /// 339 /// Instead, we seek to vectorize early and freeze vector types before 340 /// scheduling, so we want to generate a pattern that resembles: 341 /// ```mlir 342 /// affine.for %i = ? to ? step ? { 343 /// %v_a = vector.transfer_read A[%i] : memref<?xf32>, vector<128xf32> 344 /// } 345 /// ``` 346 /// 347 /// i. simply dividing the lower / upper bounds by 128 creates issues 348 /// when representing expressions such as ii + 1 because now we only 349 /// have access to original values that have been divided. Additional 350 /// information is needed to specify accesses at below-128 granularity; 351 /// ii. another alternative is to coarsen the loop step but this may have 352 /// consequences on dependence analysis and fusability of loops: fusable 353 /// loops probably need to have the same step (because we don't want to 354 /// stripmine/unroll to enable fusion). 355 /// As a consequence, we choose to represent the coarsening using the loop 356 /// step for now and reevaluate in the future. Note that we can renormalize 357 /// loop steps later if/when we have evidence that they are problematic. 
358 /// 359 /// For the simple strawman example above, vectorizing for a 1-D vector 360 /// abstraction of size 128 returns code similar to: 361 /// ```mlir 362 /// affine.for %i = %M to %N step 128 { 363 /// %v_a = vector.transfer_read A[%i] : memref<?xf32>, vector<128xf32> 364 /// } 365 /// ``` 366 /// 367 /// Unsupported cases, extensions, and work in progress (help welcome :-) ): 368 /// ======================================================================== 369 /// 1. lowering to concrete vector types for various HW; 370 /// 2. reduction support; 371 /// 3. non-effecting padding during vector.transfer_read and filter during 372 /// vector.transfer_write; 373 /// 4. misalignment support vector.transfer_read / vector.transfer_write 374 /// (hopefully without read-modify-writes); 375 /// 5. control-flow support; 376 /// 6. cost-models, heuristics and search; 377 /// 7. Op implementation, extensions and implication on memref views; 378 /// 8. many TODOs left around. 379 /// 380 /// Examples: 381 /// ========= 382 /// Consider the following Function: 383 /// ```mlir 384 /// mlfunc @vector_add_2d(%M : index, %N : index) -> f32 { 385 /// %A = alloc (%M, %N) : memref<?x?xf32, 0> 386 /// %B = alloc (%M, %N) : memref<?x?xf32, 0> 387 /// %C = alloc (%M, %N) : memref<?x?xf32, 0> 388 /// %f1 = constant 1.0 : f32 389 /// %f2 = constant 2.0 : f32 390 /// affine.for %i0 = 0 to %M { 391 /// affine.for %i1 = 0 to %N { 392 /// // non-scoped %f1 393 /// store %f1, %A[%i0, %i1] : memref<?x?xf32, 0> 394 /// } 395 /// } 396 /// affine.for %i2 = 0 to %M { 397 /// affine.for %i3 = 0 to %N { 398 /// // non-scoped %f2 399 /// store %f2, %B[%i2, %i3] : memref<?x?xf32, 0> 400 /// } 401 /// } 402 /// affine.for %i4 = 0 to %M { 403 /// affine.for %i5 = 0 to %N { 404 /// %a5 = load %A[%i4, %i5] : memref<?x?xf32, 0> 405 /// %b5 = load %B[%i4, %i5] : memref<?x?xf32, 0> 406 /// %s5 = addf %a5, %b5 : f32 407 /// // non-scoped %f1 408 /// %s6 = addf %s5, %f1 : f32 409 /// // non-scoped %f2 410 /// %s7 = addf %s5, %f2 : f32 411 /// // diamond dependency. 412 /// %s8 = addf %s7, %s6 : f32 413 /// store %s8, %C[%i4, %i5] : memref<?x?xf32, 0> 414 /// } 415 /// } 416 /// %c7 = constant 7 : index 417 /// %c42 = constant 42 : index 418 /// %res = load %C[%c7, %c42] : memref<?x?xf32, 0> 419 /// return %res : f32 420 /// } 421 /// ``` 422 /// 423 /// TODO(ntv): update post b/119731251. 
424 /// The -vectorize pass with the following arguments: 425 /// ``` 426 /// -vectorize -virtual-vector-size 256 --test-fastest-varying=0 427 /// ``` 428 /// 429 /// produces this standard innermost-loop vectorized code: 430 /// ```mlir 431 /// mlfunc @vector_add_2d(%arg0 : index, %arg1 : index) -> f32 { 432 /// %0 = alloc(%arg0, %arg1) : memref<?x?xf32> 433 /// %1 = alloc(%arg0, %arg1) : memref<?x?xf32> 434 /// %2 = alloc(%arg0, %arg1) : memref<?x?xf32> 435 /// %cst = constant 1.0 : f32 436 /// %cst_0 = constant 2.0 : f32 437 /// affine.for %i0 = 0 to %arg0 { 438 /// affine.for %i1 = 0 to %arg1 step 256 { 439 /// %cst_1 = constant dense<vector<256xf32>, 1.0> : 440 /// vector<256xf32> 441 /// vector.transfer_write %cst_1, %0[%i0, %i1] : 442 /// vector<256xf32>, memref<?x?xf32> 443 /// } 444 /// } 445 /// affine.for %i2 = 0 to %arg0 { 446 /// affine.for %i3 = 0 to %arg1 step 256 { 447 /// %cst_2 = constant dense<vector<256xf32>, 2.0> : 448 /// vector<256xf32> 449 /// vector.transfer_write %cst_2, %1[%i2, %i3] : 450 /// vector<256xf32>, memref<?x?xf32> 451 /// } 452 /// } 453 /// affine.for %i4 = 0 to %arg0 { 454 /// affine.for %i5 = 0 to %arg1 step 256 { 455 /// %3 = vector.transfer_read %0[%i4, %i5] : 456 /// memref<?x?xf32>, vector<256xf32> 457 /// %4 = vector.transfer_read %1[%i4, %i5] : 458 /// memref<?x?xf32>, vector<256xf32> 459 /// %5 = addf %3, %4 : vector<256xf32> 460 /// %cst_3 = constant dense<vector<256xf32>, 1.0> : 461 /// vector<256xf32> 462 /// %6 = addf %5, %cst_3 : vector<256xf32> 463 /// %cst_4 = constant dense<vector<256xf32>, 2.0> : 464 /// vector<256xf32> 465 /// %7 = addf %5, %cst_4 : vector<256xf32> 466 /// %8 = addf %7, %6 : vector<256xf32> 467 /// vector.transfer_write %8, %2[%i4, %i5] : 468 /// vector<256xf32>, memref<?x?xf32> 469 /// } 470 /// } 471 /// %c7 = constant 7 : index 472 /// %c42 = constant 42 : index 473 /// %9 = load %2[%c7, %c42] : memref<?x?xf32> 474 /// return %9 : f32 475 /// } 476 /// ``` 477 /// 478 /// TODO(ntv): update post b/119731251. 
479 /// The -vectorize pass with the following arguments: 480 /// ``` 481 /// -vectorize -virtual-vector-size 32 -virtual-vector-size 256 482 /// --test-fastest-varying=1 --test-fastest-varying=0 483 /// ``` 484 /// 485 /// produces this more interesting mixed outer-innermost-loop vectorized code: 486 /// ```mlir 487 /// mlfunc @vector_add_2d(%arg0 : index, %arg1 : index) -> f32 { 488 /// %0 = alloc(%arg0, %arg1) : memref<?x?xf32> 489 /// %1 = alloc(%arg0, %arg1) : memref<?x?xf32> 490 /// %2 = alloc(%arg0, %arg1) : memref<?x?xf32> 491 /// %cst = constant 1.0 : f32 492 /// %cst_0 = constant 2.0 : f32 493 /// affine.for %i0 = 0 to %arg0 step 32 { 494 /// affine.for %i1 = 0 to %arg1 step 256 { 495 /// %cst_1 = constant dense<vector<32x256xf32>, 1.0> : 496 /// vector<32x256xf32> 497 /// vector.transfer_write %cst_1, %0[%i0, %i1] : 498 /// vector<32x256xf32>, memref<?x?xf32> 499 /// } 500 /// } 501 /// affine.for %i2 = 0 to %arg0 step 32 { 502 /// affine.for %i3 = 0 to %arg1 step 256 { 503 /// %cst_2 = constant dense<vector<32x256xf32>, 2.0> : 504 /// vector<32x256xf32> 505 /// vector.transfer_write %cst_2, %1[%i2, %i3] : 506 /// vector<32x256xf32>, memref<?x?xf32> 507 /// } 508 /// } 509 /// affine.for %i4 = 0 to %arg0 step 32 { 510 /// affine.for %i5 = 0 to %arg1 step 256 { 511 /// %3 = vector.transfer_read %0[%i4, %i5] : 512 /// memref<?x?xf32>, vector<32x256xf32> 513 /// %4 = vector.transfer_read %1[%i4, %i5] : 514 /// memref<?x?xf32>, vector<32x256xf32> 515 /// %5 = addf %3, %4 : vector<32x256xf32> 516 /// %cst_3 = constant dense<vector<32x256xf32>, 1.0> : 517 /// vector<32x256xf32> 518 /// %6 = addf %5, %cst_3 : vector<32x256xf32> 519 /// %cst_4 = constant dense<vector<32x256xf32>, 2.0> : 520 /// vector<32x256xf32> 521 /// %7 = addf %5, %cst_4 : vector<32x256xf32> 522 /// %8 = addf %7, %6 : vector<32x256xf32> 523 /// vector.transfer_write %8, %2[%i4, %i5] : 524 /// vector<32x256xf32>, memref<?x?xf32> 525 /// } 526 /// } 527 /// %c7 = constant 7 : index 528 /// %c42 = constant 42 : index 529 /// %9 = load %2[%c7, %c42] : memref<?x?xf32> 530 /// return %9 : f32 531 /// } 532 /// ``` 533 /// 534 /// Of course, much more intricate n-D imperfectly-nested patterns can be 535 /// vectorized too and specified in a fully declarative fashion. 536 537 #define DEBUG_TYPE "early-vect" 538 539 using functional::makePtrDynCaster; 540 using functional::map; 541 using llvm::dbgs; 542 using llvm::SetVector; 543 544 static llvm::cl::OptionCategory clOptionsCategory("vectorize options"); 545 546 static llvm::cl::list<int> clVirtualVectorSize( 547 "virtual-vector-size", 548 llvm::cl::desc("Specify an n-D virtual vector size for vectorization"), 549 llvm::cl::ZeroOrMore, llvm::cl::cat(clOptionsCategory)); 550 551 static llvm::cl::list<int> clFastestVaryingPattern( 552 "test-fastest-varying", 553 llvm::cl::desc( 554 "Specify a 1-D, 2-D or 3-D pattern of fastest varying memory" 555 " dimensions to match. See defaultPatterns in Vectorize.cpp for a" 556 " description and examples. This is used for testing purposes"), 557 llvm::cl::ZeroOrMore, llvm::cl::cat(clOptionsCategory)); 558 559 /// Forward declaration. 560 static FilterFunctionType 561 isVectorizableLoopPtrFactory(const llvm::DenseSet<Operation *> &parallelLoops, 562 int fastestVaryingMemRefDimension); 563 564 /// Creates a vectorization pattern from the command line arguments. 565 /// Up to 3-D patterns are supported.
566 /// If the command line argument requests a pattern of higher order, returns an 567 /// empty pattern list which will conservatively result in no vectorization. 568 static std::vector<NestedPattern> 569 makePatterns(const llvm::DenseSet<Operation *> &parallelLoops, int vectorRank, 570 ArrayRef<int64_t> fastestVaryingPattern) { 571 using matcher::For; 572 int64_t d0 = fastestVaryingPattern.empty() ? -1 : fastestVaryingPattern[0]; 573 int64_t d1 = fastestVaryingPattern.size() < 2 ? -1 : fastestVaryingPattern[1]; 574 int64_t d2 = fastestVaryingPattern.size() < 3 ? -1 : fastestVaryingPattern[2]; 575 switch (vectorRank) { 576 case 1: 577 return {For(isVectorizableLoopPtrFactory(parallelLoops, d0))}; 578 case 2: 579 return {For(isVectorizableLoopPtrFactory(parallelLoops, d0), 580 For(isVectorizableLoopPtrFactory(parallelLoops, d1)))}; 581 case 3: 582 return {For(isVectorizableLoopPtrFactory(parallelLoops, d0), 583 For(isVectorizableLoopPtrFactory(parallelLoops, d1), 584 For(isVectorizableLoopPtrFactory(parallelLoops, d2))))}; 585 default: { 586 return std::vector<NestedPattern>(); 587 } 588 } 589 } 590 591 namespace { 592 593 /// Base state for the vectorize pass. 594 /// Command line arguments are preempted by non-empty pass arguments. 595 struct Vectorize : public FunctionPass<Vectorize> { 596 Vectorize(); 597 Vectorize(ArrayRef<int64_t> virtualVectorSize); 598 void runOnFunction() override; 599 600 // The virtual vector size that we vectorize to. 601 SmallVector<int64_t, 4> vectorSizes; 602 // Optionally, the fixed mapping from loop to fastest varying MemRef dimension 603 // for all the MemRefs within a loop pattern: 604 // the index represents the loop depth, the value represents the k^th 605 // fastest varying memory dimension. 606 // This is voluntarily restrictive and is meant to precisely target a 607 // particular loop/op pair, for testing purposes. 608 SmallVector<int64_t, 4> fastestVaryingPattern; 609 }; 610 611 } // end anonymous namespace 612 613 Vectorize::Vectorize() 614 : vectorSizes(clVirtualVectorSize.begin(), clVirtualVectorSize.end()), 615 fastestVaryingPattern(clFastestVaryingPattern.begin(), 616 clFastestVaryingPattern.end()) {} 617 618 Vectorize::Vectorize(ArrayRef<int64_t> virtualVectorSize) : Vectorize() { 619 if (!virtualVectorSize.empty()) { 620 this->vectorSizes.assign(virtualVectorSize.begin(), 621 virtualVectorSize.end()); 622 } 623 } 624 625 /////// TODO(ntv): Hoist to a VectorizationStrategy.cpp when appropriate. 626 ///////// 627 namespace { 628 629 struct VectorizationStrategy { 630 SmallVector<int64_t, 8> vectorSizes; 631 DenseMap<Operation *, unsigned> loopToVectorDim; 632 }; 633 634 } // end anonymous namespace 635 636 static void vectorizeLoopIfProfitable(Operation *loop, unsigned depthInPattern, 637 unsigned patternDepth, 638 VectorizationStrategy *strategy) { 639 assert(patternDepth > depthInPattern && 640 "patternDepth must be greater than depthInPattern"); 641 if (patternDepth - depthInPattern > strategy->vectorSizes.size()) { 642 // Don't vectorize this loop 643 return; 644 } 645 strategy->loopToVectorDim[loop] = 646 strategy->vectorSizes.size() - (patternDepth - depthInPattern); 647 } 648 649 /// Implements a simple strawman strategy for vectorization. 650 /// Given a matched pattern `matches` of depth `patternDepth`, this strategy 651 /// greedily assigns the fastest varying dimension ** of the vector ** to the 652 /// innermost loop in the pattern.
653 /// When coupled with a pattern that looks for the fastest varying dimension in 654 /// load/store MemRefs, this creates a generic vectorization strategy that works 655 /// for any loop in a hierarchy (outermost, innermost or intermediate). 656 /// 657 /// TODO(ntv): In the future we should additionally increase the power of the 658 /// profitability analysis along 3 directions: 659 /// 1. account for loop extents (both static and parametric + annotations); 660 /// 2. account for data layout permutations; 661 /// 3. account for impact of vectorization on maximal loop fusion. 662 /// Then we can quantify the above to build a cost model and search over 663 /// strategies. 664 static LogicalResult analyzeProfitability(ArrayRef<NestedMatch> matches, 665 unsigned depthInPattern, 666 unsigned patternDepth, 667 VectorizationStrategy *strategy) { 668 for (auto m : matches) { 669 if (failed(analyzeProfitability(m.getMatchedChildren(), depthInPattern + 1, 670 patternDepth, strategy))) { 671 return failure(); 672 } 673 vectorizeLoopIfProfitable(m.getMatchedOperation(), depthInPattern, 674 patternDepth, strategy); 675 } 676 return success(); 677 } 678 679 ///// end TODO(ntv): Hoist to a VectorizationStrategy.cpp when appropriate ///// 680 681 namespace { 682 683 struct VectorizationState { 684 /// Adds an entry of pre/post vectorization operations in the state. 685 void registerReplacement(Operation *key, Operation *value); 686 /// When the current vectorization pattern is successful, this erases the 687 /// operations that were marked for erasure in the proper order and resets 688 /// the internal state for the next pattern. 689 void finishVectorizationPattern(); 690 691 // In-order tracking of original Operation that have been vectorized. 692 // Erase in reverse order. 693 SmallVector<Operation *, 16> toErase; 694 // Set of Operation that have been vectorized (the values in the 695 // vectorizationMap for hashed access). The vectorizedSet is used in 696 // particular to filter the operations that have already been vectorized by 697 // this pattern, when iterating over nested loops in this pattern. 698 DenseSet<Operation *> vectorizedSet; 699 // Map of old scalar Operation to new vectorized Operation. 700 DenseMap<Operation *, Operation *> vectorizationMap; 701 // Map of old scalar Value to new vectorized Value. 702 DenseMap<Value *, Value *> replacementMap; 703 // The strategy drives which loop to vectorize by which amount. 704 const VectorizationStrategy *strategy; 705 // Use-def roots. These represent the starting points for the worklist in the 706 // vectorizeNonTerminals function. They consist of the subset of load 707 // operations that have been vectorized. They can be retrieved from 708 // `vectorizationMap` but it is convenient to keep track of them in a separate 709 // data structure. 710 DenseSet<Operation *> roots; 711 // Terminal operations for the worklist in the vectorizeNonTerminals 712 // function. They consist of the subset of store operations that have been 713 // vectorized. They can be retrieved from `vectorizationMap` but it is 714 // convenient to keep track of them in a separate data structure. Since they 715 // do not necessarily belong to use-def chains starting from loads (e.g 716 // storing a constant), we need to handle them in a post-pass. 717 DenseSet<Operation *> terminals; 718 // Checks that the type of `op` is AffineStoreOp and adds it to the terminals 719 // set. 
720 void registerTerminal(Operation *op); 721 722 private: 723 void registerReplacement(Value *key, Value *value); 724 }; 725 726 } // end namespace 727 728 void VectorizationState::registerReplacement(Operation *key, Operation *value) { 729 LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ commit vectorized op: "); 730 LLVM_DEBUG(key->print(dbgs())); 731 LLVM_DEBUG(dbgs() << " into "); 732 LLVM_DEBUG(value->print(dbgs())); 733 assert(key->getNumResults() == 1 && "already registered"); 734 assert(value->getNumResults() == 1 && "already registered"); 735 assert(vectorizedSet.count(value) == 0 && "already registered"); 736 assert(vectorizationMap.count(key) == 0 && "already registered"); 737 toErase.push_back(key); 738 vectorizedSet.insert(value); 739 vectorizationMap.insert(std::make_pair(key, value)); 740 registerReplacement(key->getResult(0), value->getResult(0)); 741 if (isa<AffineLoadOp>(key)) { 742 assert(roots.count(key) == 0 && "root was already inserted previously"); 743 roots.insert(key); 744 } 745 } 746 747 void VectorizationState::registerTerminal(Operation *op) { 748 assert(isa<AffineStoreOp>(op) && "terminal must be a AffineStoreOp"); 749 assert(terminals.count(op) == 0 && 750 "terminal was already inserted previously"); 751 terminals.insert(op); 752 } 753 754 void VectorizationState::finishVectorizationPattern() { 755 while (!toErase.empty()) { 756 auto *op = toErase.pop_back_val(); 757 LLVM_DEBUG(dbgs() << "\n[early-vect] finishVectorizationPattern erase: "); 758 LLVM_DEBUG(op->print(dbgs())); 759 op->erase(); 760 } 761 } 762 763 void VectorizationState::registerReplacement(Value *key, Value *value) { 764 assert(replacementMap.count(key) == 0 && "replacement already registered"); 765 replacementMap.insert(std::make_pair(key, value)); 766 } 767 768 // Apply 'map' with 'mapOperands' returning resulting values in 'results'. 769 static void computeMemoryOpIndices(Operation *op, AffineMap map, 770 ArrayRef<Value *> mapOperands, 771 SmallVectorImpl<Value *> &results) { 772 OpBuilder builder(op); 773 for (auto resultExpr : map.getResults()) { 774 auto singleResMap = 775 builder.getAffineMap(map.getNumDims(), map.getNumSymbols(), resultExpr); 776 auto afOp = 777 builder.create<AffineApplyOp>(op->getLoc(), singleResMap, mapOperands); 778 results.push_back(afOp); 779 } 780 } 781 782 ////// TODO(ntv): Hoist to a VectorizationMaterialize.cpp when appropriate. //// 783 784 /// Handles the vectorization of load and store MLIR operations. 785 /// 786 /// AffineLoadOp operations are the roots of the vectorizeNonTerminals call. 787 /// They are vectorized immediately. The resulting vector.transfer_read is 788 /// immediately registered to replace all uses of the AffineLoadOp in this 789 /// pattern's scope. 790 /// 791 /// AffineStoreOp are the terminals of the vectorizeNonTerminals call. They 792 /// need to be vectorized late once all the use-def chains have been traversed. 793 /// Additionally, they may have ssa-values operands which come from outside the 794 /// scope of the current pattern. 795 /// Such special cases force us to delay the vectorization of the stores until 796 /// the last step. Here we merely register the store operation. 
797 template <typename LoadOrStoreOpPointer> 798 static LogicalResult vectorizeRootOrTerminal(Value *iv, 799 LoadOrStoreOpPointer memoryOp, 800 VectorizationState *state) { 801 auto memRefType = memoryOp.getMemRef()->getType().template cast<MemRefType>(); 802 803 auto elementType = memRefType.getElementType(); 804 // TODO(ntv): ponder whether we want to further vectorize a vector value. 805 assert(VectorType::isValidElementType(elementType) && 806 "Not a valid vector element type"); 807 auto vectorType = VectorType::get(state->strategy->vectorSizes, elementType); 808 809 // Materialize a MemRef with 1 vector. 810 auto *opInst = memoryOp.getOperation(); 811 // For now, vector.transfers must be aligned, operate only on indices with an 812 // identity subset of AffineMap and do not change layout. 813 // TODO(ntv): increase the expressiveness power of vector.transfer operations 814 // as needed by various targets. 815 if (auto load = dyn_cast<AffineLoadOp>(opInst)) { 816 OpBuilder b(opInst); 817 SmallVector<Value *, 4> mapOperands(load.getIndices()); 818 SmallVector<Value *, 8> indices; 819 indices.reserve(load.getMemRefType().getRank()); 820 if (load.getAffineMap() != 821 b.getMultiDimIdentityMap(load.getMemRefType().getRank())) { 822 computeMemoryOpIndices(opInst, load.getAffineMap(), mapOperands, indices); 823 } else { 824 indices.append(load.getIndices().begin(), load.getIndices().end()); 825 } 826 auto permutationMap = 827 makePermutationMap(opInst, indices, state->strategy->loopToVectorDim); 828 if (!permutationMap) 829 return LogicalResult::Failure; 830 LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ permutationMap: "); 831 LLVM_DEBUG(permutationMap.print(dbgs())); 832 auto transfer = b.create<vector::VectorTransferReadOp>( 833 opInst->getLoc(), vectorType, memoryOp.getMemRef(), 834 map(makePtrDynCaster<Value>(), indices), permutationMap); 835 state->registerReplacement(opInst, transfer.getOperation()); 836 } else { 837 state->registerTerminal(opInst); 838 } 839 return success(); 840 } 841 /// end TODO(ntv): Hoist to a VectorizationMaterialize.cpp when appropriate. /// 842 843 /// Coarsens the loops bounds and transforms all remaining load and store 844 /// operations into the appropriate vector.transfer. 845 static LogicalResult vectorizeAffineForOp(AffineForOp loop, int64_t step, 846 VectorizationState *state) { 847 using namespace functional; 848 loop.setStep(step); 849 850 FilterFunctionType notVectorizedThisPattern = [state](Operation &op) { 851 if (!matcher::isLoadOrStore(op)) { 852 return false; 853 } 854 return state->vectorizationMap.count(&op) == 0 && 855 state->vectorizedSet.count(&op) == 0 && 856 state->roots.count(&op) == 0 && state->terminals.count(&op) == 0; 857 }; 858 auto loadAndStores = matcher::Op(notVectorizedThisPattern); 859 SmallVector<NestedMatch, 8> loadAndStoresMatches; 860 loadAndStores.match(loop.getOperation(), &loadAndStoresMatches); 861 for (auto ls : loadAndStoresMatches) { 862 auto *opInst = ls.getMatchedOperation(); 863 auto load = dyn_cast<AffineLoadOp>(opInst); 864 auto store = dyn_cast<AffineStoreOp>(opInst); 865 LLVM_DEBUG(opInst->print(dbgs())); 866 LogicalResult result = 867 load ? 
vectorizeRootOrTerminal(loop.getInductionVar(), load, state) 868 : vectorizeRootOrTerminal(loop.getInductionVar(), store, state); 869 if (failed(result)) { 870 return failure(); 871 } 872 } 873 return success(); 874 } 875 876 /// Returns a FilterFunctionType that can be used in NestedPattern to match a 877 /// loop whose underlying load/store accesses are either invariant or all 878 /// varying along the `fastestVaryingMemRefDimension`. 879 static FilterFunctionType 880 isVectorizableLoopPtrFactory(const llvm::DenseSet<Operation *> &parallelLoops, 881 int fastestVaryingMemRefDimension) { 882 return [&parallelLoops, fastestVaryingMemRefDimension](Operation &forOp) { 883 auto loop = cast<AffineForOp>(forOp); 884 auto parallelIt = parallelLoops.find(loop); 885 if (parallelIt == parallelLoops.end()) 886 return false; 887 int memRefDim = -1; 888 auto vectorizableBody = isVectorizableLoopBody(loop, &memRefDim); 889 if (!vectorizableBody) 890 return false; 891 return memRefDim == -1 || fastestVaryingMemRefDimension == -1 || 892 memRefDim == fastestVaryingMemRefDimension; 893 }; 894 } 895 896 /// Apply vectorization of `loop` according to `state`. This is only triggered 897 /// if all vectorizations in `childrenMatches` have already succeeded 898 /// recursively in DFS post-order. 899 static LogicalResult 900 vectorizeLoopsAndLoadsRecursively(NestedMatch oneMatch, 901 VectorizationState *state) { 902 auto *loopInst = oneMatch.getMatchedOperation(); 903 auto loop = cast<AffineForOp>(loopInst); 904 auto childrenMatches = oneMatch.getMatchedChildren(); 905 906 // 1. DFS postorder recursion, if any of my children fails, I fail too. 907 for (auto m : childrenMatches) { 908 if (failed(vectorizeLoopsAndLoadsRecursively(m, state))) { 909 return failure(); 910 } 911 } 912 913 // 2. This loop may have been omitted from vectorization for various reasons 914 // (e.g. due to the performance model or pattern depth > vector size). 915 auto it = state->strategy->loopToVectorDim.find(loopInst); 916 if (it == state->strategy->loopToVectorDim.end()) { 917 return success(); 918 } 919 920 // 3. Actual post-order transformation. 921 auto vectorDim = it->second; 922 assert(vectorDim < state->strategy->vectorSizes.size() && 923 "vector dim overflow"); 924 // a. get actual vector size 925 auto vectorSize = state->strategy->vectorSizes[vectorDim]; 926 // b. loop transformation for early vectorization is still subject to 927 // exploratory tradeoffs (see top of the file). Apply coarsening, i.e.: 928 // | ub -> ub 929 // | step -> step * vectorSize 930 LLVM_DEBUG(dbgs() << "\n[early-vect] vectorizeForOp by " << vectorSize 931 << " : "); 932 LLVM_DEBUG(loopInst->print(dbgs())); 933 return vectorizeAffineForOp(loop, loop.getStep() * vectorSize, state); 934 } 935 936 /// Tries to transform a scalar constant into a vector splat of that constant. 937 /// Returns the vectorized splat operation if the constant is a valid vector 938 /// element type. 939 /// If `type` is not a valid vector type or if the scalar constant is not a 940 /// valid vector element type, returns nullptr.
941 static Value *vectorizeConstant(Operation *op, ConstantOp constant, Type type) { 942 if (!type || !type.isa<VectorType>() || 943 !VectorType::isValidElementType(constant.getType())) { 944 return nullptr; 945 } 946 OpBuilder b(op); 947 Location loc = op->getLoc(); 948 auto vectorType = type.cast<VectorType>(); 949 auto attr = DenseElementsAttr::get(vectorType, constant.getValue()); 950 auto *constantOpInst = constant.getOperation(); 951 952 OperationState state(loc, constantOpInst->getName().getStringRef(), {}, 953 {vectorType}, {b.getNamedAttr("value", attr)}); 954 955 return b.createOperation(state)->getResult(0); 956 } 957 958 /// Tries to vectorize a given `operand` of Operation `op` during 959 /// def-chain propagation or during terminal vectorization, by applying the 960 /// following logic: 961 /// 1. if the defining operation is part of the vectorizedSet (i.e. vectorized 962 /// by use-def propagation), the `operand` is already in the proper vector form; 963 /// 2. otherwise, the `operand` may be in some other vector form that fails to 964 /// vectorize atm (i.e. broadcasting required), returns nullptr to indicate 965 /// failure; 966 /// 3. if the `operand` is a constant, returns the vectorized form of the constant; 967 /// 4. non-constant scalars are currently non-vectorizable, in particular to 968 /// guard against vectorizing an index which may be loop-variant and needs 969 /// special handling. 970 /// 971 /// In particular this logic captures some of the use cases where definitions 972 /// that are not scoped under the current pattern are needed to vectorize. 973 /// One such example is top level function constants that need to be splatted. 974 /// 975 /// Returns an operand that has been vectorized to match `state`'s strategy if 976 /// vectorization is possible with the above logic. Returns nullptr otherwise. 977 /// 978 /// TODO(ntv): handle more complex cases. 979 static Value *vectorizeOperand(Value *operand, Operation *op, 980 VectorizationState *state) { 981 LLVM_DEBUG(dbgs() << "\n[early-vect]vectorize operand: "); 982 LLVM_DEBUG(operand->print(dbgs())); 983 // 1. If this value has already been vectorized this round, we are done. 984 if (state->vectorizedSet.count(operand->getDefiningOp()) > 0) { 985 LLVM_DEBUG(dbgs() << " -> already vector operand"); 986 return operand; 987 } 988 // 1.b. Delayed on-demand replacement of a use. 989 // Note that we cannot just call replaceAllUsesWith because it may result 990 // in ops with mixed types, for ops whose operands have not all yet 991 // been vectorized. This would be invalid IR. 992 auto it = state->replacementMap.find(operand); 993 if (it != state->replacementMap.end()) { 994 auto *res = it->second; 995 LLVM_DEBUG(dbgs() << "-> delayed replacement by: "); 996 LLVM_DEBUG(res->print(dbgs())); 997 return res; 998 } 999 // 2. TODO(ntv): broadcast needed. 1000 if (operand->getType().isa<VectorType>()) { 1001 LLVM_DEBUG(dbgs() << "-> non-vectorizable"); 1002 return nullptr; 1003 } 1004 // 3. vectorize constant. 1005 if (auto constant = dyn_cast<ConstantOp>(operand->getDefiningOp())) { 1006 return vectorizeConstant( 1007 op, constant, 1008 VectorType::get(state->strategy->vectorSizes, operand->getType())); 1009 } 1010 // 4. currently non-vectorizable. 1011 LLVM_DEBUG(dbgs() << "-> non-vectorizable"); 1012 LLVM_DEBUG(operand->print(dbgs())); 1013 return nullptr; 1014 } 1015 1016 /// Encodes Operation-specific behavior for vectorization. In general we assume 1017 /// that all operands of an op must be vectorized but this is not always true.
1018 /// In the future, it would be nice to have a trait that describes how a 1019 /// particular operation vectorizes. For now we implement the case distinction 1020 /// here. 1021 /// Returns a vectorized form of an operation or nullptr if vectorization fails. 1022 // TODO(ntv): consider adding a trait to Op to describe how it gets vectorized. 1023 // Maybe some Ops are not vectorizable or require some tricky logic, we cannot 1024 // do one-off logic here; ideally it would be TableGen'd. 1025 static Operation *vectorizeOneOperation(Operation *opInst, 1026 VectorizationState *state) { 1027 // Sanity checks. 1028 assert(!isa<AffineLoadOp>(opInst) && 1029 "all loads must have already been fully vectorized independently"); 1030 assert(!isa<vector::VectorTransferReadOp>(opInst) && 1031 "vector.transfer_read cannot be further vectorized"); 1032 assert(!isa<vector::VectorTransferWriteOp>(opInst) && 1033 "vector.transfer_write cannot be further vectorized"); 1034 1035 if (auto store = dyn_cast<AffineStoreOp>(opInst)) { 1036 OpBuilder b(opInst); 1037 auto *memRef = store.getMemRef(); 1038 auto *value = store.getValueToStore(); 1039 auto *vectorValue = vectorizeOperand(value, opInst, state); 1040 1041 SmallVector<Value *, 4> mapOperands(store.getIndices()); 1042 SmallVector<Value *, 8> indices; 1043 indices.reserve(store.getMemRefType().getRank()); 1044 if (store.getAffineMap() != 1045 b.getMultiDimIdentityMap(store.getMemRefType().getRank())) { 1046 computeMemoryOpIndices(opInst, store.getAffineMap(), mapOperands, 1047 indices); 1048 } else { 1049 indices.append(store.getIndices().begin(), store.getIndices().end()); 1050 } 1051 1052 auto permutationMap = 1053 makePermutationMap(opInst, indices, state->strategy->loopToVectorDim); 1054 if (!permutationMap) 1055 return nullptr; 1056 LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ permutationMap: "); 1057 LLVM_DEBUG(permutationMap.print(dbgs())); 1058 auto transfer = b.create<vector::VectorTransferWriteOp>( 1059 opInst->getLoc(), vectorValue, memRef, indices, permutationMap); 1060 auto *res = transfer.getOperation(); 1061 LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ vectorized store: " << *res); 1062 // "Terminals" (i.e. AffineStoreOps) are erased on the spot. 1063 opInst->erase(); 1064 return res; 1065 } 1066 if (opInst->getNumRegions() != 0) 1067 return nullptr; 1068 1069 SmallVector<Type, 8> vectorTypes; 1070 for (auto *v : opInst->getResults()) { 1071 vectorTypes.push_back( 1072 VectorType::get(state->strategy->vectorSizes, v->getType())); 1073 } 1074 SmallVector<Value *, 8> vectorOperands; 1075 for (auto *v : opInst->getOperands()) { 1076 vectorOperands.push_back(vectorizeOperand(v, opInst, state)); 1077 } 1078 // Check whether a single operand is null. If so, vectorization failed. 1079 bool success = llvm::all_of(vectorOperands, [](Value *op) { return op; }); 1080 if (!success) { 1081 LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ an operand failed vectorize"); 1082 return nullptr; 1083 } 1084 1085 // Create a clone of the op with the proper operands and return types. 1086 // TODO(ntv): The following assumes there is always an op with a fixed 1087 // name that works both in scalar mode and vector mode. 1088 // TODO(ntv): Is it worth considering an Operation.clone operation which 1089 // changes the type so we can promote an Operation with less boilerplate? 
1090 OpBuilder b(opInst); 1091 OperationState newOp(opInst->getLoc(), opInst->getName().getStringRef(), 1092 vectorOperands, vectorTypes, opInst->getAttrs(), 1093 /*successors=*/{}, 1094 /*regions=*/{}, opInst->hasResizableOperandsList()); 1095 return b.createOperation(newOp); 1096 } 1097 1098 /// Iterates over the forward slice from the loads in the vectorization pattern 1099 /// and rewrites them using their vectorized counterpart by: 1100 /// 1. Create the forward slice starting from the loads in the vectorization 1101 /// pattern. 1102 /// 2. Topologically sort the forward slice. 1103 /// 3. For each operation in the slice, create the vector form of this 1104 /// operation, replacing each operand by the replacement operand retrieved from 1105 /// replacementMap. If any such replacement is missing, vectorization fails. 1106 static LogicalResult vectorizeNonTerminals(VectorizationState *state) { 1107 // 1. create initial worklist with the uses of the roots. 1108 SetVector<Operation *> worklist; 1109 // Note: state->roots have already been vectorized and must not be vectorized 1110 // again. This fits `getForwardSlice` which does not insert `op` in the 1111 // result. 1112 // Note: we have to exclude terminals because some of their defs may not be 1113 // nested under the vectorization pattern (e.g. constants defined in an 1114 // encompassing scope). 1115 // TODO(ntv): Use a backward slice for terminals, avoid special casing and 1116 // merge implementations. 1117 for (auto *op : state->roots) { 1118 getForwardSlice(op, &worklist, [state](Operation *op) { 1119 return state->terminals.count(op) == 0; // propagate if not terminal 1120 }); 1121 } 1122 // We merged multiple slices, topological order may not hold anymore. 1123 worklist = topologicalSort(worklist); 1124 1125 for (unsigned i = 0; i < worklist.size(); ++i) { 1126 auto *op = worklist[i]; 1127 LLVM_DEBUG(dbgs() << "\n[early-vect] vectorize use: "); 1128 LLVM_DEBUG(op->print(dbgs())); 1129 1130 // Create vector form of the operation. 1131 // Insert it just before op, on success register op as replaced. 1132 auto *vectorizedInst = vectorizeOneOperation(op, state); 1133 if (!vectorizedInst) { 1134 return failure(); 1135 } 1136 1137 // 3. Register replacement for future uses in the scope. 1138 // Note that we cannot just call replaceAllUsesWith because it may 1139 // result in ops with mixed types, for ops whose operands have not all 1140 // yet been vectorized. This would be invalid IR. 1141 state->registerReplacement(op, vectorizedInst); 1142 } 1143 return success(); 1144 } 1145 1146 /// Vectorization is a recursive procedure where anything below can fail. 1147 /// The root match thus needs to maintain a clone for handling failure. 1148 /// Each root may succeed independently but will otherwise clean up after itself if 1149 /// anything below it fails. 1150 static LogicalResult vectorizeRootMatch(NestedMatch m, 1151 VectorizationStrategy *strategy) { 1152 auto loop = cast<AffineForOp>(m.getMatchedOperation()); 1153 VectorizationState state; 1154 state.strategy = strategy; 1155 1156 // Since patterns are recursive, they can very well intersect. 1157 // Since we do not want a fully greedy strategy in general, we decouple 1158 // pattern matching, from profitability analysis, from application. 1159 // As a consequence we must check that each root pattern is still 1160 // vectorizable. If a pattern is not vectorizable anymore, we just skip it.
1161 // TODO(ntv): implement a non-greedy profitability analysis that keeps only 1162 // non-intersecting patterns. 1163 if (!isVectorizableLoopBody(loop)) { 1164 LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ loop is not vectorizable"); 1165 return failure(); 1166 } 1167 1168 /// Sets up error handling for this root loop. This is how the root match 1169 /// maintains a clone for handling failure and restores the proper state via 1170 /// RAII. 1171 auto *loopInst = loop.getOperation(); 1172 OpBuilder builder(loopInst); 1173 auto clonedLoop = cast<AffineForOp>(builder.clone(*loopInst)); 1174 struct Guard { 1175 LogicalResult failure() { 1176 loop.getInductionVar()->replaceAllUsesWith(clonedLoop.getInductionVar()); 1177 loop.erase(); 1178 return mlir::failure(); 1179 } 1180 LogicalResult success() { 1181 clonedLoop.erase(); 1182 return mlir::success(); 1183 } 1184 AffineForOp loop; 1185 AffineForOp clonedLoop; 1186 } guard{loop, clonedLoop}; 1187 1188 ////////////////////////////////////////////////////////////////////////////// 1189 // Start vectorizing. 1190 // From now on, any error triggers the scope guard above. 1191 ////////////////////////////////////////////////////////////////////////////// 1192 // 1. Vectorize all the loops matched by the pattern, recursively. 1193 // This also vectorizes the roots (AffineLoadOp) as well as registers the 1194 // terminals (AffineStoreOp) for post-processing vectorization (we need to 1195 // wait for all use-def chains into them to be vectorized first). 1196 if (failed(vectorizeLoopsAndLoadsRecursively(m, &state))) { 1197 LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ failed root vectorizeLoop"); 1198 return guard.failure(); 1199 } 1200 1201 // 2. Vectorize operations reached by use-def chains from root except the 1202 // terminals (store operations) that need to be post-processed separately. 1203 // TODO(ntv): add more as we expand. 1204 if (failed(vectorizeNonTerminals(&state))) { 1205 LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ failed vectorizeNonTerminals"); 1206 return guard.failure(); 1207 } 1208 1209 // 3. Post-process terminals. 1210 // Note: we have to post-process terminals because some of their defs may not 1211 // be nested under the vectorization pattern (e.g. constants defined in an 1212 // encompassing scope). 1213 // TODO(ntv): Use a backward slice for terminals, avoid special casing and 1214 // merge implementations. 1215 for (auto *op : state.terminals) { 1216 if (!vectorizeOneOperation(op, &state)) { // nullptr == failure 1217 LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ failed to vectorize terminals"); 1218 return guard.failure(); 1219 } 1220 } 1221 1222 // 4. Finish this vectorization pattern. 1223 LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ success vectorizing pattern"); 1224 state.finishVectorizationPattern(); 1225 return guard.success(); 1226 } 1227 1228 /// Applies vectorization to the current Function by searching over a bunch of 1229 /// predetermined patterns. 1230 void Vectorize::runOnFunction() { 1231 FuncOp f = getFunction(); 1232 if (!fastestVaryingPattern.empty() && 1233 fastestVaryingPattern.size() != vectorSizes.size()) { 1234 f.emitRemark("Fastest varying pattern specified with different size than " 1235 "the vector size."); 1236 return signalPassFailure(); 1237 } 1238 1239 // Thread-safe RAII local context, BumpPtrAllocator freed on exit. 
1240 NestedPatternContext mlContext; 1241 1242 llvm::DenseSet<Operation *> parallelLoops; 1243 f.walk([&parallelLoops](AffineForOp loop) { 1244 if (isLoopParallel(loop)) 1245 parallelLoops.insert(loop); 1246 }); 1247 1248 for (auto &pat : 1249 makePatterns(parallelLoops, vectorSizes.size(), fastestVaryingPattern)) { 1250 LLVM_DEBUG(dbgs() << "\n******************************************"); 1251 LLVM_DEBUG(dbgs() << "\n******************************************"); 1252 LLVM_DEBUG(dbgs() << "\n[early-vect] new pattern on Function\n"); 1253 LLVM_DEBUG(f.print(dbgs())); 1254 unsigned patternDepth = pat.getDepth(); 1255 1256 SmallVector<NestedMatch, 8> matches; 1257 pat.match(f, &matches); 1258 // Iterate over all the top-level matches and vectorize eagerly. 1259 // This automatically prunes intersecting matches. 1260 for (auto m : matches) { 1261 VectorizationStrategy strategy; 1262 // TODO(ntv): depending on profitability, elect to reduce the vector size. 1263 strategy.vectorSizes.assign(vectorSizes.begin(), vectorSizes.end()); 1264 if (failed(analyzeProfitability(m.getMatchedChildren(), 1, patternDepth, 1265 &strategy))) { 1266 continue; 1267 } 1268 vectorizeLoopIfProfitable(m.getMatchedOperation(), 0, patternDepth, 1269 &strategy); 1270 // TODO(ntv): if pattern does not apply, report it; alter the 1271 // cost/benefit. 1272 vectorizeRootMatch(m, &strategy); 1273 // TODO(ntv): some diagnostics if failure to vectorize occurs. 1274 } 1275 } 1276 LLVM_DEBUG(dbgs() << "\n"); 1277 } 1278 1279 std::unique_ptr<FunctionPassBase> 1280 mlir::createVectorizePass(llvm::ArrayRef<int64_t> virtualVectorSize) { 1281 return std::make_unique<Vectorize>(virtualVectorSize); 1282 } 1283 1284 static PassRegistration<Vectorize> 1285 pass("affine-vectorize", 1286 "Vectorize to a target independent n-D vector abstraction");
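//
// Example invocation (a sketch: it assumes the standard mlir-opt driver and a
// hypothetical input file foo.mlir; the pass name and the flags are the ones
// registered and declared above in this file):
//
//   mlir-opt foo.mlir -affine-vectorize \
//     -virtual-vector-size 32 -virtual-vector-size 256 \
//     --test-fastest-varying=1 --test-fastest-varying=0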