
     1  //===- Vectorize.cpp - Vectorize Pass Impl --------------------------------===//
     2  //
     3  // Copyright 2019 The MLIR Authors.
     4  //
     5  // Licensed under the Apache License, Version 2.0 (the "License");
     6  // you may not use this file except in compliance with the License.
     7  // You may obtain a copy of the License at
     8  //
     9  //   http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  // =============================================================================
    17  //
    18  // This file implements vectorization of loops, operations and data types to
    19  // a target-independent, n-D super-vector abstraction.
    20  //
    21  //===----------------------------------------------------------------------===//
    22  
    23  #include "mlir/Analysis/LoopAnalysis.h"
    24  #include "mlir/Analysis/NestedMatcher.h"
    25  #include "mlir/Analysis/SliceAnalysis.h"
    26  #include "mlir/Analysis/Utils.h"
    27  #include "mlir/Analysis/VectorAnalysis.h"
    28  #include "mlir/Dialect/AffineOps/AffineOps.h"
    29  #include "mlir/Dialect/StandardOps/Ops.h"
    30  #include "mlir/Dialect/VectorOps/VectorOps.h"
    31  #include "mlir/IR/AffineExpr.h"
    32  #include "mlir/IR/Builders.h"
    33  #include "mlir/IR/Location.h"
    34  #include "mlir/IR/Types.h"
    35  #include "mlir/Pass/Pass.h"
    36  #include "mlir/Support/Functional.h"
    37  #include "mlir/Support/LLVM.h"
    38  #include "mlir/Transforms/Passes.h"
    39  
    40  #include "llvm/ADT/DenseMap.h"
    41  #include "llvm/ADT/DenseSet.h"
    42  #include "llvm/ADT/SetVector.h"
    43  #include "llvm/ADT/SmallString.h"
    44  #include "llvm/ADT/SmallVector.h"
    45  #include "llvm/Support/CommandLine.h"
    46  #include "llvm/Support/Debug.h"
    47  
    48  using namespace mlir;
    49  
    50  ///
    51  /// Implements a high-level vectorization strategy on a Function.
    52  /// The abstraction used is that of super-vectors, which provide a single,
    53  /// compact representation in the vector types of information that is
    54  /// expected to reduce the impact of the phase ordering problem.
    55  ///
    56  /// Vector granularity:
    57  /// ===================
    58  /// This pass is designed to perform vectorization at a super-vector
    59  /// granularity. A super-vector is loosely defined as a vector type that is a
    60  /// multiple of a "good" vector size so the HW can efficiently implement a set
    61  /// of high-level primitives. Multiple is understood along any dimension; e.g.
    62  /// both vector<16xf32> and vector<2x8xf32> are valid super-vectors for a
    63  /// vector<8xf32> HW vector. Note that a "good vector size so the HW can
    64  /// efficiently implement a set of high-level primitives" is not necessarily an
    65  /// integer multiple of actual hardware registers. We leave details of this
    66  /// distinction unspecified for now.
    67  ///
    68  /// Some may prefer the terminology a "tile of HW vectors". In this case, one
    69  /// should note that super-vectors implement an "always full tile" abstraction.
    70  /// They guarantee no partial-tile separation is necessary by relying on a
    71  /// high-level copy-reshape abstraction that we call vector.transfer. This
    72  /// copy-reshape operation is also responsible for performing layout
    73  /// transposition if necessary. In the general case this will require a scoped
    74  /// allocation in some notional local memory.
    75  ///
    76  /// Whatever the mental model one prefers to use for this abstraction, the key
    77  /// point is that we burn into a single, compact, representation in the vector
    78  /// types, information that is expected to reduce the impact of the phase
    79  /// ordering problem. Indeed, a vector type conveys information that:
    80  ///   1. the associated loops have dependency semantics that do not prevent
    81  ///      vectorization;
    82  ///   2. the associated loops have been sliced in chunks of static sizes that are
    83  ///      compatible with vector sizes (i.e. similar to unroll-and-jam);
    84  ///   3. the inner loops, in the unroll-and-jam analogy of 2, are captured by
    85  ///      the vector type and no vectorization-hampering transformations can be
    86  ///      applied to them anymore;
    88  ///   4. the underlying memrefs are accessed in some notional contiguous way
    89  ///      that allows loading into vectors with some amount of spatial locality;
    90  /// In other words, super-vectorization provides a level of separation of
    91  /// concerns by way of opacity to subsequent passes. This has the effect of
    92  /// encapsulating and propagating vectorization constraints down the list of
    93  /// passes until we are ready to lower further.
    94  ///
    95  /// For a particular target, a notion of minimal n-d vector size will be
    96  /// specified and vectorization targets a multiple of those. In the following
    97  /// paragraph, let "k ." represent "a multiple of", to be understood as a
    98  /// multiple in the same dimension (e.g. vector<16 x k . 128> summarizes
    99  /// vector<16 x 128>, vector<16 x 256>, vector<16 x 1024>, etc).
   100  ///
   101  /// Some non-exhaustive notable super-vector sizes of interest include:
   102  ///   - CPU: vector<k . HW_vector_size>,
   103  ///          vector<k' . core_count x k . HW_vector_size>,
   104  ///          vector<socket_count x k' . core_count x k . HW_vector_size>;
   105  ///   - GPU: vector<k . warp_size>,
   106  ///          vector<k . warp_size x float2>,
   107  ///          vector<k . warp_size x float4>,
   108  ///          vector<k . warp_size x 4 x 4 x 4> (for tensor_core sizes).
   109  ///
   110  /// Loops and operations are emitted that operate on those super-vector shapes.
   111  /// Subsequent lowering passes will materialize to actual HW vector sizes. These
   112  /// passes are expected to be (gradually) more target-specific.
   113  ///
   114  /// At a high level, a vectorized load in a loop will resemble:
   115  /// ```mlir
   116  ///   affine.for %i = ? to ? step ? {
   117  ///     %v_a = vector.transfer_read A[%i] : memref<?xf32>, vector<128xf32>
   118  ///   }
   119  /// ```
   120  /// It is the responsibility of the implementation of vector.transfer_read to
   121  /// materialize vector registers from the original scalar memrefs. A later (more
   122  /// target-dependent) lowering pass will materialize to actual HW vector sizes.
   123  /// This lowering may occur at different times:
   124  ///   1. at the MLIR level into a combination of loops, unrolling, DmaStartOp +
   125  ///      DmaWaitOp + vectorized operations for data transformations and shuffle;
   126  ///      thus opening opportunities for unrolling and pipelining. This is an
   127  ///      instance of library call "whiteboxing"; or
   128  ///   2. later in a target-specific lowering pass or hand-written library
   129  ///      call, achieving full separation of concerns. This is an instance of
   130  ///      a library call; or
   131  ///   3. a mix of both, e.g. based on a model.
   132  /// In the future, these operations will expose a contract to constrain the
   133  /// search on vectorization patterns and sizes.
   134  ///
   135  /// Occurrence of super-vectorization in the compiler flow:
   136  /// =======================================================
   137  /// This is an active area of investigation. We start with 2 remarks to position
   138  /// super-vectorization in the context of existing ongoing work: LLVM VPLAN
   139  /// and LLVM SLP Vectorizer.
   140  ///
   141  /// LLVM VPLAN:
   142  /// -----------
   143  /// The astute reader may have noticed that in the limit, super-vectorization
   144  /// can be applied at a similar time and with similar objectives to VPLAN;
   145  /// for instance, at the end of a traditional polyhedral compilation flow
   146  /// (e.g. the PPCG project uses ISL to provide dependence analysis,
   147  /// multi-level scheduling + tiling, lifting of footprints to fast memory,
   148  /// communication synthesis, mapping, register optimizations) and before
   149  /// unrolling. When vectorization is applied at this *late* level in a typical
   150  /// polyhedral flow, and is instantiated with actual hardware vector sizes,
   151  /// super-vectorization is expected to match (or subsume) the type of patterns
   152  /// that LLVM's VPLAN aims at targeting. The main difference here is that MLIR
   153  /// is higher level and our implementation should be significantly simpler. Also
   154  /// note that in this mode, recursive patterns are probably a bit of an overkill
   155  /// although it is reasonable to expect that mixing a bit of outer loop and
   156  /// inner loop vectorization + unrolling will provide interesting choices to
   157  /// MLIR.
   158  ///
   159  /// LLVM SLP Vectorizer:
   160  /// --------------------
   161  /// Super-vectorization however is not meant to be usable in a similar fashion
   162  /// to the SLP vectorizer. The main difference lies in the information that
   163  /// both vectorizers use: super-vectorization examines contiguity of memory
   164  /// references along fastest varying dimensions and loops with recursive nested
   165  /// patterns capturing imperfectly-nested loop nests; the SLP vectorizer, on
   166  /// the other hand, performs flat pattern matching inside a single unrolled loop
   167  /// body and stitches together pieces of load and store operations into full
   168  /// 1-D vectors. We envision that the SLP vectorizer is a good way to capture
   169  /// innermost loop, control-flow dependent patterns that super-vectorization may
   170  /// not be able to capture easily. In other words, super-vectorization does not
   171  /// aim at replacing the SLP vectorizer and the two solutions are complementary.
   172  ///
   173  /// Ongoing investigations:
   174  /// -----------------------
   175  /// We discuss the following *early* places where super-vectorization is
   176  /// applicable and touch on the expected benefits and risks. We list the
   177  /// opportunities in the context of the traditional polyhedral compiler flow
   178  /// described in PPCG. There are essentially 6 places in the MLIR pass pipeline
   179  /// we expect to experiment with super-vectorization:
   180  /// 1. Right after language lowering to MLIR: this is the earliest time where
   181  ///    super-vectorization is expected to be applied. At this level, all the
   182  ///    language/user/library-level annotations are available and can be fully
   183  ///    exploited. Examples include loop-type annotations (such as parallel,
   184  ///    reduction, scan, dependence distance vector, vectorizable) as well as
   185  ///    memory access annotations (such as guaranteed non-aliasing writes,
   186  ///    indirect accesses that are permutations by construction, or the fact
   187  ///    that a particular operation is prescribed atomic by the user). At this
   188  ///    level, anything that enriches what dependence analysis can do should be
   189  ///    aggressively exploited. At this level we are close to having explicit
   190  ///    vector types in the language, except we do not impose that burden on the
   191  ///    programmer/library: we derive information from scalar code + annotations.
   192  /// 2. After dependence analysis and before polyhedral scheduling: the
   193  ///    information that supports vectorization does not need to be supplied by a
   194  ///    higher level of abstraction. Traditional dependence analysis is available
   195  ///    in MLIR and will be used to drive vectorization and cost models.
   196  ///
   197  /// Let's pause here and remark that applying super-vectorization as described
   198  /// in 1. and 2. presents clear opportunities and risks:
   199  ///   - the opportunity is that vectorization is burned into the type system
   200  ///   and is protected from the adverse effects of loop scheduling, tiling,
   201  ///   loop interchange and all passes downstream. Provided that subsequent
   202  ///   passes are able to operate on vector types, the vector shapes, associated
   203  ///   loop iterator properties, alignment, and contiguity of fastest varying
   204  ///   dimensions are preserved until we lower the super-vector types. We expect
   205  ///   this to significantly rein in the adverse effects of phase ordering.
   206  ///   - the risks are that a. all passes after super-vectorization have to work
   207  ///   on elemental vector types (note that this is always true, wherever
   208  ///   vectorization is applied) and b. that imposing vectorization constraints
   209  ///   too early may be overall detrimental to loop fusion, tiling and other
   210  ///   transformations because the dependence distances are coarsened when
   211  ///   operating on elemental vector types. For this reason, the pattern
   212  ///   profitability analysis should include a component that also captures the
   213  ///   maximal amount of fusion available under a particular pattern. This is
   214  ///   still at the stage of rough ideas but in this context, search is our
   215  ///   friend as the Tensor Comprehensions and auto-TVM contributions
   216  ///   demonstrated previously.
   217  /// The bottom line is that we do not yet have good answers for the above but
   218  /// we aim at making it easy to answer such questions.
   219  ///
   220  /// Back to our listing, the last places where early super-vectorization makes
   221  /// sense are:
   222  /// 3. right after polyhedral-style scheduling: PLUTO-style algorithms are known
   223  ///    to improve locality, parallelism and be configurable (e.g. max-fuse,
   224  ///    smart-fuse, etc.). They can also have adverse effects on contiguity
   225  ///    properties that are required for vectorization but the vector.transfer
   226  ///    copy-reshape-pad-transpose abstraction is expected to help recapture
   227  ///    these properties.
   228  /// 4. right after polyhedral-style scheduling+tiling;
   229  /// 5. right after scheduling+tiling+rescheduling: points 4 and 5 represent
   230  ///    probably the most promising places because applying tiling achieves a
   231  ///    separation of concerns that allows rescheduling to worry less about
   232  ///    locality and more about parallelism and distribution (e.g. min-fuse).
   233  ///
   234  /// At these levels the risk-reward looks different: on one hand we probably
   235  /// lost a good deal of language/user/library-level annotation; on the other
   236  /// hand we gained parallelism and locality through scheduling and tiling.
   237  /// However we probably want to ensure tiling is compatible with the
   238  /// full-tile-only abstraction used in super-vectorization or suffer the
   239  /// consequences. It is too early to place bets on what will win but we expect
   240  /// super-vectorization to be the right abstraction to allow exploring at all
   241  /// these levels. And again, search is our friend.
   242  ///
   243  /// Lastly, we mention it again here:
   244  /// 6. as an MLIR-based alternative to VPLAN.
   245  ///
   246  /// Lowering, unrolling, pipelining:
   247  /// ================================
   248  /// TODO(ntv): point to the proper places.
   249  ///
   250  /// Algorithm:
   251  /// ==========
   252  /// The algorithm proceeds in a few steps:
   253  ///  1. defining super-vectorization patterns and matching them on the tree of
   254  ///     AffineForOp. A super-vectorization pattern is defined as a recursive
   255  ///     data structure that matches and captures nested, imperfectly-nested
   256  ///     loops that have a. conformable loop annotations attached (e.g. parallel,
   257  ///     reduction, vectorizable, ...) as well as b. all contiguous load/store
   258  ///     operations along a specified minor dimension (not necessarily the
   259  ///     fastest varying);
   260  ///  2. analyzing those patterns for profitability (TODO(ntv): and
   261  ///     interference);
   262  ///  3. Then, for each pattern in order:
   263  ///    a. applying iterative rewriting of the loop and the load operations in
   264  ///       DFS postorder. Rewriting is implemented by coarsening the loops and
   265  ///       turning load operations into opaque vector.transfer_read ops;
   266  ///    b. keeping track of the load operations encountered as "roots" and the
   267  ///       store operations as "terminals";
   268  ///    c. traversing the use-def chains starting from the roots and iteratively
   269  ///       propagating vectorized values. Scalar values that are encountered
   270  ///       during this process must come from outside the scope of the current
   271  ///       pattern (TODO(ntv): enforce this and generalize). Such a scalar value
   272  ///       is vectorized only if it is a constant (into a vector splat). The
   273  ///       non-constant case is not supported for now and results in the pattern
   274  ///       failing to vectorize;
   275  ///    d. performing a second traversal on the terminals (store ops) to
   276  ///       rewrite the scalar value they write to memory into vector form.
   277  ///       If the scalar value has been vectorized previously, we simply replace
   278  ///       it by its vector form. Otherwise, if the scalar value is a constant,
   279  ///       it is vectorized into a splat. In all other cases, vectorization for
   280  ///       the pattern currently fails.
   281  ///    e. if everything under the root AffineForOp in the current pattern
   282  ///       vectorizes properly, we commit that loop to the IR. Otherwise we
   283  ///       discard it and restore a previously cloned version of the loop. Thanks
   284  ///       to the recursive scoping nature of matchers and captured patterns,
   285  ///       this is transparently achieved by a simple RAII implementation.
   286  ///    f. vectorization is applied on the next pattern in the list. Because
   287  ///       pattern interference avoidance is not yet implemented and we do not
   288  ///       support further vectorizing an already vectorized load, we need to
   289  ///       re-verify that the pattern is still vectorizable. This is expected to
   290  ///       make cost models more difficult to write and is subject to improvement
   291  ///       in the future.
   292  ///
   293  /// Points c. and d. above are worth additional comment. In most passes that
   294  /// do not change the type of operands, it is usually preferred to eagerly
   295  /// `replaceAllUsesWith`. Unfortunately this does not work for vectorization
   296  /// because during the use-def chain traversal, all the operands of an operation
   297  /// must be available in vector form. Trying to propagate eagerly makes the IR
   298  /// temporarily invalid and results in errors such as:
   299  ///   `vectorize.mlir:308:13: error: 'addf' op requires the same type for all
   300  ///   operands and results
   301  ///      %s5 = addf %a5, %b5 : f32`
   302  ///
   303  /// Lastly, we show a minimal example for which use-def chains rooted in load /
   304  /// vector.transfer_read are not enough. This is what motivated splitting
   305  /// terminal processing out of the use-def chains starting from loads. In the
   306  /// following snippet, there is simply no load:
   307  /// ```mlir
   308  /// mlfunc @fill(%A : memref<128xf32>) -> () {
   309  ///   %f1 = constant 1.0 : f32
   310  ///   affine.for %i0 = 0 to 32 {
   311  ///     store %f1, %A[%i0] : memref<128xf32, 0>
   312  ///   }
   313  ///   return
   314  /// }
   315  /// ```
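        ///
        /// For illustration, assuming a (hypothetical) 1-D virtual vector size of 32,
        /// the expected outcome is that the constant is splatted and the store is
        /// rewritten into a vector.transfer_write, along the lines of:
        /// ```mlir
        /// mlfunc @fill(%A : memref<128xf32>) -> () {
        ///   %f1 = constant 1.0 : f32
        ///   affine.for %i0 = 0 to 32 step 32 {
        ///     %cst = constant dense<vector<32xf32>, 1.0> : vector<32xf32>
        ///     vector.transfer_write %cst, %A[%i0] : vector<32xf32>, memref<128xf32>
        ///   }
        ///   return
        /// }
        /// ```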
   316  ///
   317  /// Choice of loop transformation to support the algorithm:
   318  /// =======================================================
   319  /// The choice of loop transformation to apply for coarsening vectorized loops
   320  /// is still subject to exploratory tradeoffs. In particular, say we want to
   321  /// vectorize by a factor 128, we want to transform the following input:
   322  /// ```mlir
   323  ///   affine.for %i = %M to %N {
   324  ///     %a = load A[%i] : memref<?xf32>
   325  ///   }
   326  /// ```
   327  ///
   328  /// Traditionally, one would vectorize late (after scheduling, tiling,
   329  /// memory promotion etc) say after stripmining (and potentially unrolling in
   330  /// the case of LLVM's SLP vectorizer):
   331  /// ```mlir
   332  ///   affine.for %i = floor(%M, 128) to ceil(%N, 128) {
   333  ///     affine.for %ii = max(%M, 128 * %i) to min(%N, 128*%i + 127) {
   334  ///       %a = load A[%ii] : memref<?xf32>
   335  ///     }
   336  ///   }
   337  /// ```
   338  ///
   339  /// Instead, we seek to vectorize early and freeze vector types before
   340  /// scheduling, so we want to generate a pattern that resembles:
   341  /// ```mlir
   342  ///   affine.for %i = ? to ? step ? {
   343  ///     %v_a = vector.transfer_read A[%i] : memref<?xf32>, vector<128xf32>
   344  ///   }
   345  /// ```
   346  ///
   347  /// i. simply dividing the lower / upper bounds by 128 creates issues
   348  ///    when representing expressions such as ii + 1 because now we only
   349  ///    have access to original values that have been divided. Additional
   350  ///    information is needed to specify accesses at below-128 granularity;
   351  /// ii. another alternative is to coarsen the loop step but this may have
   352  ///    consequences on dependence analysis and fusability of loops: fusable
   353  ///    loops probably need to have the same step (because we don't want to
   354  ///    stripmine/unroll to enable fusion).
   355  /// As a consequence, we choose to represent the coarsening using the loop
   356  /// step for now and reevaluate in the future. Note that we can renormalize
   357  /// loop steps later if/when we have evidence that they are problematic.
   358  ///
   359  /// For the simple strawman example above, vectorizing for a 1-D vector
   360  /// abstraction of size 128 returns code similar to:
   361  /// ```mlir
   362  ///   affine.for %i = %M to %N step 128 {
   363  ///     %v_a = vector.transfer_read A[%i] : memref<?xf32>, vector<128xf32>
   364  ///   }
   365  /// ```
   366  ///
   367  /// Unsupported cases, extensions, and work in progress (help welcome :-) ):
   368  /// ========================================================================
   369  ///   1. lowering to concrete vector types for various HW;
   370  ///   2. reduction support;
   371  ///   3. non-effecting padding during vector.transfer_read and filter during
   372  ///      vector.transfer_write;
   373  ///   4. misalignment support in vector.transfer_read / vector.transfer_write
   374  ///      (hopefully without read-modify-writes);
   375  ///   5. control-flow support;
   376  ///   6. cost-models, heuristics and search;
   377  ///   7. Op implementation, extensions and implication on memref views;
   378  ///   8. many TODOs left around.
   379  ///
   380  /// Examples:
   381  /// =========
   382  /// Consider the following Function:
   383  /// ```mlir
   384  /// mlfunc @vector_add_2d(%M : index, %N : index) -> f32 {
   385  ///   %A = alloc (%M, %N) : memref<?x?xf32, 0>
   386  ///   %B = alloc (%M, %N) : memref<?x?xf32, 0>
   387  ///   %C = alloc (%M, %N) : memref<?x?xf32, 0>
   388  ///   %f1 = constant 1.0 : f32
   389  ///   %f2 = constant 2.0 : f32
   390  ///   affine.for %i0 = 0 to %M {
   391  ///     affine.for %i1 = 0 to %N {
   392  ///       // non-scoped %f1
   393  ///       store %f1, %A[%i0, %i1] : memref<?x?xf32, 0>
   394  ///     }
   395  ///   }
   396  ///   affine.for %i2 = 0 to %M {
   397  ///     affine.for %i3 = 0 to %N {
   398  ///       // non-scoped %f2
   399  ///       store %f2, %B[%i2, %i3] : memref<?x?xf32, 0>
   400  ///     }
   401  ///   }
   402  ///   affine.for %i4 = 0 to %M {
   403  ///     affine.for %i5 = 0 to %N {
   404  ///       %a5 = load %A[%i4, %i5] : memref<?x?xf32, 0>
   405  ///       %b5 = load %B[%i4, %i5] : memref<?x?xf32, 0>
   406  ///       %s5 = addf %a5, %b5 : f32
   407  ///       // non-scoped %f1
   408  ///       %s6 = addf %s5, %f1 : f32
   409  ///       // non-scoped %f2
   410  ///       %s7 = addf %s5, %f2 : f32
   411  ///       // diamond dependency.
   412  ///       %s8 = addf %s7, %s6 : f32
   413  ///       store %s8, %C[%i4, %i5] : memref<?x?xf32, 0>
   414  ///     }
   415  ///   }
   416  ///   %c7 = constant 7 : index
   417  ///   %c42 = constant 42 : index
   418  ///   %res = load %C[%c7, %c42] : memref<?x?xf32, 0>
   419  ///   return %res : f32
   420  /// }
   421  /// ```
   422  ///
   423  /// TODO(ntv): update post b/119731251.
   424  /// The -vectorize pass with the following arguments:
   425  /// ```
   426  /// -vectorize -virtual-vector-size 256 --test-fastest-varying=0
   427  /// ```
   428  ///
   429  /// produces this standard innermost-loop vectorized code:
   430  /// ```mlir
   431  /// mlfunc @vector_add_2d(%arg0 : index, %arg1 : index) -> f32 {
   432  ///   %0 = alloc(%arg0, %arg1) : memref<?x?xf32>
   433  ///   %1 = alloc(%arg0, %arg1) : memref<?x?xf32>
   434  ///   %2 = alloc(%arg0, %arg1) : memref<?x?xf32>
   435  ///   %cst = constant 1.0 : f32
   436  ///   %cst_0 = constant 2.0 : f32
   437  ///   affine.for %i0 = 0 to %arg0 {
   438  ///     affine.for %i1 = 0 to %arg1 step 256 {
   439  ///       %cst_1 = constant dense<vector<256xf32>, 1.0> :
   440  ///                vector<256xf32>
   441  ///       vector.transfer_write %cst_1, %0[%i0, %i1] :
   442  ///                vector<256xf32>, memref<?x?xf32>
   443  ///     }
   444  ///   }
   445  ///   affine.for %i2 = 0 to %arg0 {
   446  ///     affine.for %i3 = 0 to %arg1 step 256 {
   447  ///       %cst_2 = constant dense<vector<256xf32>, 2.0> :
   448  ///                vector<256xf32>
   449  ///       vector.transfer_write %cst_2, %1[%i2, %i3] :
   450  ///                vector<256xf32>, memref<?x?xf32>
   451  ///     }
   452  ///   }
   453  ///   affine.for %i4 = 0 to %arg0 {
   454  ///     affine.for %i5 = 0 to %arg1 step 256 {
   455  ///       %3 = vector.transfer_read %0[%i4, %i5] :
   456  ///            memref<?x?xf32>, vector<256xf32>
   457  ///       %4 = vector.transfer_read %1[%i4, %i5] :
   458  ///            memref<?x?xf32>, vector<256xf32>
   459  ///       %5 = addf %3, %4 : vector<256xf32>
   460  ///       %cst_3 = constant dense<vector<256xf32>, 1.0> :
   461  ///                vector<256xf32>
   462  ///       %6 = addf %5, %cst_3 : vector<256xf32>
   463  ///       %cst_4 = constant dense<vector<256xf32>, 2.0> :
   464  ///                vector<256xf32>
   465  ///       %7 = addf %5, %cst_4 : vector<256xf32>
   466  ///       %8 = addf %7, %6 : vector<256xf32>
   467  ///       vector.transfer_write %8, %2[%i4, %i5] :
   468  ///                vector<256xf32>, memref<?x?xf32>
   469  ///     }
   470  ///   }
   471  ///   %c7 = constant 7 : index
   472  ///   %c42 = constant 42 : index
   473  ///   %9 = load %2[%c7, %c42] : memref<?x?xf32>
   474  ///   return %9 : f32
   475  /// }
   476  /// ```
   477  ///
   478  /// TODO(ntv): update post b/119731251.
   479  /// The -vectorize pass with the following arguments:
   480  /// ```
   481  /// -vectorize -virtual-vector-size 32 -virtual-vector-size 256
   482  /// --test-fastest-varying=1 --test-fastest-varying=0
   483  /// ```
   484  ///
   485  /// produces this more interesting mixed outer-innermost-loop vectorized code:
   486  /// ```mlir
   487  /// mlfunc @vector_add_2d(%arg0 : index, %arg1 : index) -> f32 {
   488  ///   %0 = alloc(%arg0, %arg1) : memref<?x?xf32>
   489  ///   %1 = alloc(%arg0, %arg1) : memref<?x?xf32>
   490  ///   %2 = alloc(%arg0, %arg1) : memref<?x?xf32>
   491  ///   %cst = constant 1.0 : f32
   492  ///   %cst_0 = constant 2.0 : f32
   493  ///   affine.for %i0 = 0 to %arg0 step 32 {
   494  ///     affine.for %i1 = 0 to %arg1 step 256 {
   495  ///       %cst_1 = constant dense<vector<32x256xf32>, 1.0> :
   496  ///                vector<32x256xf32>
   497  ///       vector.transfer_write %cst_1, %0[%i0, %i1] :
   498  ///                vector<32x256xf32>, memref<?x?xf32>
   499  ///     }
   500  ///   }
   501  ///   affine.for %i2 = 0 to %arg0 step 32 {
   502  ///     affine.for %i3 = 0 to %arg1 step 256 {
   503  ///       %cst_2 = constant dense<vector<32x256xf32>, 2.0> :
   504  ///                vector<32x256xf32>
   505  ///       vector.transfer_write %cst_2, %1[%i2, %i3] :
   506  ///                vector<32x256xf32>, memref<?x?xf32>
   507  ///     }
   508  ///   }
   509  ///   affine.for %i4 = 0 to %arg0 step 32 {
   510  ///     affine.for %i5 = 0 to %arg1 step 256 {
   511  ///       %3 = vector.transfer_read %0[%i4, %i5] :
   512  ///                memref<?x?xf32>, vector<32x256xf32>
   513  ///       %4 = vector.transfer_read %1[%i4, %i5] :
   514  ///                memref<?x?xf32>, vector<32x256xf32>
   515  ///       %5 = addf %3, %4 : vector<32x256xf32>
   516  ///       %cst_3 = constant dense<vector<32x256xf32>, 1.0> :
   517  ///                vector<32x256xf32>
   518  ///       %6 = addf %5, %cst_3 : vector<32x256xf32>
   519  ///       %cst_4 = constant dense<vector<32x256xf32>, 2.0> :
   520  ///                vector<32x256xf32>
   521  ///       %7 = addf %5, %cst_4 : vector<32x256xf32>
   522  ///       %8 = addf %7, %6 : vector<32x256xf32>
   523  ///       vector.transfer_write %8, %2[%i4, %i5] :
   524  ///                vector<32x256xf32>, memref<?x?xf32>
   525  ///     }
   526  ///   }
   527  ///   %c7 = constant 7 : index
   528  ///   %c42 = constant 42 : index
   529  ///   %9 = load %2[%c7, %c42] : memref<?x?xf32>
   530  ///   return %9 : f32
   531  /// }
   532  /// ```
   533  ///
   534  /// Of course, much more intricate n-D imperfectly-nested patterns can be
   535  /// vectorized too and specified in a fully declarative fashion.
   536  
   537  #define DEBUG_TYPE "early-vect"
   538  
   539  using functional::makePtrDynCaster;
   540  using functional::map;
   541  using llvm::dbgs;
   542  using llvm::SetVector;
   543  
   544  static llvm::cl::OptionCategory clOptionsCategory("vectorize options");
   545  
   546  static llvm::cl::list<int> clVirtualVectorSize(
   547      "virtual-vector-size",
   548      llvm::cl::desc("Specify an n-D virtual vector size for vectorization"),
   549      llvm::cl::ZeroOrMore, llvm::cl::cat(clOptionsCategory));
   550  
   551  static llvm::cl::list<int> clFastestVaryingPattern(
   552      "test-fastest-varying",
   553      llvm::cl::desc(
   554          "Specify a 1-D, 2-D or 3-D pattern of fastest varying memory"
   555          " dimensions to match. See defaultPatterns in Vectorize.cpp for a"
   556          " description and examples. This is used for testing purposes"),
   557      llvm::cl::ZeroOrMore, llvm::cl::cat(clOptionsCategory));
   558  
   559  /// Forward declaration.
   560  static FilterFunctionType
   561  isVectorizableLoopPtrFactory(const llvm::DenseSet<Operation *> &parallelLoops,
   562                               int fastestVaryingMemRefDimension);
   563  
   564  /// Creates a vectorization pattern from the command line arguments.
   565  /// Up to 3-D patterns are supported.
   566  /// If the command line argument requests a pattern of higher order, returns an
   567  /// empty pattern list which will conservatively result in no vectorization.
   568  static std::vector<NestedPattern>
   569  makePatterns(const llvm::DenseSet<Operation *> &parallelLoops, int vectorRank,
   570               ArrayRef<int64_t> fastestVaryingPattern) {
   571    using matcher::For;
   572    int64_t d0 = fastestVaryingPattern.empty() ? -1 : fastestVaryingPattern[0];
   573    int64_t d1 = fastestVaryingPattern.size() < 2 ? -1 : fastestVaryingPattern[1];
   574    int64_t d2 = fastestVaryingPattern.size() < 3 ? -1 : fastestVaryingPattern[2];
   575    switch (vectorRank) {
   576    case 1:
   577      return {For(isVectorizableLoopPtrFactory(parallelLoops, d0))};
   578    case 2:
   579      return {For(isVectorizableLoopPtrFactory(parallelLoops, d0),
   580                  For(isVectorizableLoopPtrFactory(parallelLoops, d1)))};
   581    case 3:
   582      return {For(isVectorizableLoopPtrFactory(parallelLoops, d0),
   583                  For(isVectorizableLoopPtrFactory(parallelLoops, d1),
   584                      For(isVectorizableLoopPtrFactory(parallelLoops, d2))))};
   585    default: {
   586      return std::vector<NestedPattern>();
   587    }
   588    }
   589  }
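
        // Illustration (not part of the original code): with the flags used in the
        // second example of the file header, this is expected to be called roughly as
        //   makePatterns(parallelLoops, /*vectorRank=*/2,
        //                /*fastestVaryingPattern=*/{1, 0});
        // which returns a single nested pattern matching 2-deep loop nests whose
        // outer loop accesses are invariant or vary along the second fastest varying
        // memref dimension and whose inner loop accesses are invariant or vary along
        // the fastest varying one.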
   590  
   591  namespace {
   592  
   593  /// Base state for the vectorize pass.
   594  /// Command line arguments are preempted by non-empty pass arguments.
   595  struct Vectorize : public FunctionPass<Vectorize> {
   596    Vectorize();
   597    Vectorize(ArrayRef<int64_t> virtualVectorSize);
   598    void runOnFunction() override;
   599  
   600    // The virtual vector size that we vectorize to.
   601    SmallVector<int64_t, 4> vectorSizes;
   602    // Optionally, the fixed mapping from loop to fastest varying MemRef dimension
   603    // for all the MemRefs within a loop pattern:
   604    //   the index represents the loop depth, the value represents the k^th
   605    //   fastest varying memory dimension.
   606    // This is voluntarily restrictive and is meant to precisely target a
   607    // particular loop/op pair, for testing purposes.
   608    SmallVector<int64_t, 4> fastestVaryingPattern;
   609  };
   610  
   611  } // end anonymous namespace
   612  
   613  Vectorize::Vectorize()
   614      : vectorSizes(clVirtualVectorSize.begin(), clVirtualVectorSize.end()),
   615        fastestVaryingPattern(clFastestVaryingPattern.begin(),
   616                              clFastestVaryingPattern.end()) {}
   617  
   618  Vectorize::Vectorize(ArrayRef<int64_t> virtualVectorSize) : Vectorize() {
   619    if (!virtualVectorSize.empty()) {
   620      this->vectorSizes.assign(virtualVectorSize.begin(),
   621                               virtualVectorSize.end());
   622    }
   623  }
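
        // Illustration (not part of the original code): constructing the pass as
        //   Vectorize({32, 256})
        // requests 2-D super-vectors of shape 32x256 and overrides any
        // -virtual-vector-size flags, while `fastestVaryingPattern` is still taken
        // from the --test-fastest-varying command line options.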
   624  
   625  /////// TODO(ntv): Hoist to a VectorizationStrategy.cpp when appropriate ///////
   627  namespace {
   628  
   629  struct VectorizationStrategy {
   630    SmallVector<int64_t, 8> vectorSizes;
   631    DenseMap<Operation *, unsigned> loopToVectorDim;
   632  };
   633  
   634  } // end anonymous namespace
   635  
   636  static void vectorizeLoopIfProfitable(Operation *loop, unsigned depthInPattern,
   637                                        unsigned patternDepth,
   638                                        VectorizationStrategy *strategy) {
   639    assert(patternDepth > depthInPattern &&
   640           "patternDepth is greater than depthInPattern");
   641    if (patternDepth - depthInPattern > strategy->vectorSizes.size()) {
   642      // Don't vectorize this loop
   643      return;
   644    }
   645    strategy->loopToVectorDim[loop] =
   646        strategy->vectorSizes.size() - (patternDepth - depthInPattern);
   647  }
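
        // Worked example (illustration only): with vectorSizes = {32, 256} and a
        // pattern of depth 2, the innermost loop (depthInPattern = 1) is assigned
        // vector dimension 2 - (2 - 1) = 1 (size 256) and the outermost loop
        // (depthInPattern = 0) is assigned vector dimension 2 - (2 - 0) = 0 (size 32),
        // matching the 2-D example in the file header. A loop that is more than
        // vectorSizes.size() levels away from the bottom of the pattern (counting the
        // innermost loop as level 1) gets no vector dimension (the early return above).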
   648  
   649  /// Implements a simple strawman strategy for vectorization.
   650  /// Given a matched pattern `matches` of depth `patternDepth`, this strategy
   651  /// greedily assigns the fastest varying dimension ** of the vector ** to the
   652  /// innermost loop in the pattern.
   653  /// When coupled with a pattern that looks for the fastest varying dimension in
   654  /// load/store MemRefs, this creates a generic vectorization strategy that works
   655  /// for any loop in a hierarchy (outermost, innermost or intermediate).
   656  ///
   657  /// TODO(ntv): In the future we should additionally increase the power of the
   658  /// profitability analysis along 3 directions:
   659  ///   1. account for loop extents (both static and parametric + annotations);
   660  ///   2. account for data layout permutations;
   661  ///   3. account for impact of vectorization on maximal loop fusion.
   662  /// Then we can quantify the above to build a cost model and search over
   663  /// strategies.
   664  static LogicalResult analyzeProfitability(ArrayRef<NestedMatch> matches,
   665                                            unsigned depthInPattern,
   666                                            unsigned patternDepth,
   667                                            VectorizationStrategy *strategy) {
   668    for (auto m : matches) {
   669      if (failed(analyzeProfitability(m.getMatchedChildren(), depthInPattern + 1,
   670                                      patternDepth, strategy))) {
   671        return failure();
   672      }
   673      vectorizeLoopIfProfitable(m.getMatchedOperation(), depthInPattern,
   674                                patternDepth, strategy);
   675    }
   676    return success();
   677  }
   678  
   679  ///// end TODO(ntv): Hoist to a VectorizationStrategy.cpp when appropriate /////
   680  
   681  namespace {
   682  
   683  struct VectorizationState {
   684    /// Adds an entry of pre/post vectorization operations in the state.
   685    void registerReplacement(Operation *key, Operation *value);
   686    /// When the current vectorization pattern is successful, this erases the
   687    /// operations that were marked for erasure in the proper order and resets
   688    /// the internal state for the next pattern.
   689    void finishVectorizationPattern();
   690  
   691    // In-order tracking of original Operation that have been vectorized.
   692    // Erase in reverse order.
   693    SmallVector<Operation *, 16> toErase;
   694    // Set of Operation that have been vectorized (the values in the
   695    // vectorizationMap for hashed access). The vectorizedSet is used in
   696    // particular to filter the operations that have already been vectorized by
   697    // this pattern, when iterating over nested loops in this pattern.
   698    DenseSet<Operation *> vectorizedSet;
   699    // Map of old scalar Operation to new vectorized Operation.
   700    DenseMap<Operation *, Operation *> vectorizationMap;
   701    // Map of old scalar Value to new vectorized Value.
   702    DenseMap<Value *, Value *> replacementMap;
   703    // The strategy drives which loop to vectorize by which amount.
   704    const VectorizationStrategy *strategy;
   705    // Use-def roots. These represent the starting points for the worklist in the
   706    // vectorizeNonTerminals function. They consist of the subset of load
   707    // operations that have been vectorized. They can be retrieved from
   708    // `vectorizationMap` but it is convenient to keep track of them in a separate
   709    // data structure.
   710    DenseSet<Operation *> roots;
   711    // Terminal operations for the worklist in the vectorizeNonTerminals
   712    // function. They consist of the subset of store operations that have been
   713    // vectorized. They can be retrieved from `vectorizationMap` but it is
   714    // convenient to keep track of them in a separate data structure. Since they
   715    // do not necessarily belong to use-def chains starting from loads (e.g.
   716    // storing a constant), we need to handle them in a post-pass.
   717    DenseSet<Operation *> terminals;
   718    // Checks that the type of `op` is AffineStoreOp and adds it to the terminals
   719    // set.
   720    void registerTerminal(Operation *op);
   721  
   722  private:
   723    void registerReplacement(Value *key, Value *value);
   724  };
   725  
   726  } // end namespace
   727  
   728  void VectorizationState::registerReplacement(Operation *key, Operation *value) {
   729    LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ commit vectorized op: ");
   730    LLVM_DEBUG(key->print(dbgs()));
   731    LLVM_DEBUG(dbgs() << "  into  ");
   732    LLVM_DEBUG(value->print(dbgs()));
   733    assert(key->getNumResults() == 1 && "already registered");
   734    assert(value->getNumResults() == 1 && "already registered");
   735    assert(vectorizedSet.count(value) == 0 && "already registered");
   736    assert(vectorizationMap.count(key) == 0 && "already registered");
   737    toErase.push_back(key);
   738    vectorizedSet.insert(value);
   739    vectorizationMap.insert(std::make_pair(key, value));
   740    registerReplacement(key->getResult(0), value->getResult(0));
   741    if (isa<AffineLoadOp>(key)) {
   742      assert(roots.count(key) == 0 && "root was already inserted previously");
   743      roots.insert(key);
   744    }
   745  }
   746  
   747  void VectorizationState::registerTerminal(Operation *op) {
   748    assert(isa<AffineStoreOp>(op) && "terminal must be an AffineStoreOp");
   749    assert(terminals.count(op) == 0 &&
   750           "terminal was already inserted previously");
   751    terminals.insert(op);
   752  }
   753  
   754  void VectorizationState::finishVectorizationPattern() {
   755    while (!toErase.empty()) {
   756      auto *op = toErase.pop_back_val();
   757      LLVM_DEBUG(dbgs() << "\n[early-vect] finishVectorizationPattern erase: ");
   758      LLVM_DEBUG(op->print(dbgs()));
   759      op->erase();
   760    }
   761  }
   762  
   763  void VectorizationState::registerReplacement(Value *key, Value *value) {
   764    assert(replacementMap.count(key) == 0 && "replacement already registered");
   765    replacementMap.insert(std::make_pair(key, value));
   766  }
   767  
   768  // Apply 'map' with 'mapOperands' returning resulting values in 'results'.
   769  static void computeMemoryOpIndices(Operation *op, AffineMap map,
   770                                     ArrayRef<Value *> mapOperands,
   771                                     SmallVectorImpl<Value *> &results) {
   772    OpBuilder builder(op);
   773    for (auto resultExpr : map.getResults()) {
   774      auto singleResMap =
   775          builder.getAffineMap(map.getNumDims(), map.getNumSymbols(), resultExpr);
   776      auto afOp =
   777          builder.create<AffineApplyOp>(op->getLoc(), singleResMap, mapOperands);
   778      results.push_back(afOp);
   779    }
   780  }
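
        // Illustration (not part of the original code): for an affine map
        //   (d0, d1) -> (d0 + d1, d1)
        // and mapOperands (%i, %j), this emits two single-result affine.apply
        // operations, one for (d0, d1) -> (d0 + d1) and one for (d0, d1) -> (d1),
        // both applied to (%i, %j), and appends their results to `results`.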
   781  
   782  ////// TODO(ntv): Hoist to a VectorizationMaterialize.cpp when appropriate. ////
   783  
   784  /// Handles the vectorization of load and store MLIR operations.
   785  ///
   786  /// AffineLoadOp operations are the roots of the vectorizeNonTerminals call.
   787  /// They are vectorized immediately. The resulting vector.transfer_read is
   788  /// immediately registered to replace all uses of the AffineLoadOp in this
   789  /// pattern's scope.
   790  ///
   791  /// AffineStoreOp operations are the terminals of the vectorizeNonTerminals
   792  /// call. They need to be vectorized late once all the use-def chains have been
   793  /// traversed. Additionally, they may have ssa-value operands from outside the
   794  /// scope of the current pattern.
   795  /// Such special cases force us to delay the vectorization of the stores until
   796  /// the last step. Here we merely register the store operation.
   797  template <typename LoadOrStoreOpPointer>
   798  static LogicalResult vectorizeRootOrTerminal(Value *iv,
   799                                               LoadOrStoreOpPointer memoryOp,
   800                                               VectorizationState *state) {
   801    auto memRefType = memoryOp.getMemRef()->getType().template cast<MemRefType>();
   802  
   803    auto elementType = memRefType.getElementType();
   804    // TODO(ntv): ponder whether we want to further vectorize a vector value.
   805    assert(VectorType::isValidElementType(elementType) &&
   806           "Not a valid vector element type");
   807    auto vectorType = VectorType::get(state->strategy->vectorSizes, elementType);
   808  
   809    // Materialize a MemRef with 1 vector.
   810    auto *opInst = memoryOp.getOperation();
   811    // For now, vector.transfers must be aligned, operate only on indices with an
   812    // identity subset of AffineMap and do not change layout.
   813    // TODO(ntv): increase the expressiveness power of vector.transfer operations
   814    // as needed by various targets.
   815    if (auto load = dyn_cast<AffineLoadOp>(opInst)) {
   816      OpBuilder b(opInst);
   817      SmallVector<Value *, 4> mapOperands(load.getIndices());
   818      SmallVector<Value *, 8> indices;
   819      indices.reserve(load.getMemRefType().getRank());
   820      if (load.getAffineMap() !=
   821          b.getMultiDimIdentityMap(load.getMemRefType().getRank())) {
   822        computeMemoryOpIndices(opInst, load.getAffineMap(), mapOperands, indices);
   823      } else {
   824        indices.append(load.getIndices().begin(), load.getIndices().end());
   825      }
   826      auto permutationMap =
   827          makePermutationMap(opInst, indices, state->strategy->loopToVectorDim);
   828      if (!permutationMap)
   829        return LogicalResult::Failure;
   830      LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ permutationMap: ");
   831      LLVM_DEBUG(permutationMap.print(dbgs()));
   832      auto transfer = b.create<vector::VectorTransferReadOp>(
   833          opInst->getLoc(), vectorType, memoryOp.getMemRef(),
   834          map(makePtrDynCaster<Value>(), indices), permutationMap);
   835      state->registerReplacement(opInst, transfer.getOperation());
   836    } else {
   837      state->registerTerminal(opInst);
   838    }
   839    return success();
   840  }
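
        // Illustration (not part of the original code): for an affine.load
        // %A[%i4, %i5] inside the 2-D pattern of the file header, where %i4 maps to
        // vector dimension 0 and %i5 to vector dimension 1, this builds a
        // vector.transfer_read of %A[%i4, %i5] with result type vector<32x256xf32>,
        // using the permutation map computed by makePermutationMap to relate the
        // memref dimensions to the vector dimensions.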
   841  /// end TODO(ntv): Hoist to a VectorizationMaterialize.cpp when appropriate. ///
   842  
   843  /// Coarsens the loops bounds and transforms all remaining load and store
   844  /// operations into the appropriate vector.transfer.
   845  static LogicalResult vectorizeAffineForOp(AffineForOp loop, int64_t step,
   846                                            VectorizationState *state) {
   847    using namespace functional;
   848    loop.setStep(step);
   849  
   850    FilterFunctionType notVectorizedThisPattern = [state](Operation &op) {
   851      if (!matcher::isLoadOrStore(op)) {
   852        return false;
   853      }
   854      return state->vectorizationMap.count(&op) == 0 &&
   855             state->vectorizedSet.count(&op) == 0 &&
   856             state->roots.count(&op) == 0 && state->terminals.count(&op) == 0;
   857    };
   858    auto loadAndStores = matcher::Op(notVectorizedThisPattern);
   859    SmallVector<NestedMatch, 8> loadAndStoresMatches;
   860    loadAndStores.match(loop.getOperation(), &loadAndStoresMatches);
   861    for (auto ls : loadAndStoresMatches) {
   862      auto *opInst = ls.getMatchedOperation();
   863      auto load = dyn_cast<AffineLoadOp>(opInst);
   864      auto store = dyn_cast<AffineStoreOp>(opInst);
   865      LLVM_DEBUG(opInst->print(dbgs()));
   866      LogicalResult result =
   867          load ? vectorizeRootOrTerminal(loop.getInductionVar(), load, state)
   868               : vectorizeRootOrTerminal(loop.getInductionVar(), store, state);
   869      if (failed(result)) {
   870        return failure();
   871      }
   872    }
   873    return success();
   874  }
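
        // Illustration (not part of the original code): when the innermost %i5 loop
        // of the file header example is vectorized by 256, the caller passes
        // step = 256; the loop becomes `affine.for %i5 = 0 to %arg1 step 256` and the
        // remaining affine.load/affine.store operations under it are handed to
        // vectorizeRootOrTerminal, which rewrites loads into vector.transfer_read
        // ops and registers stores as terminals.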
   875  
   876  /// Returns a FilterFunctionType that can be used in NestedPattern to match a
   877  /// loop whose underlying load/store accesses are either invariant or all
   878  /// varying along the `fastestVaryingMemRefDimension`.
   879  static FilterFunctionType
   880  isVectorizableLoopPtrFactory(const llvm::DenseSet<Operation *> &parallelLoops,
   881                               int fastestVaryingMemRefDimension) {
   882    return [&parallelLoops, fastestVaryingMemRefDimension](Operation &forOp) {
   883      auto loop = cast<AffineForOp>(forOp);
   884      auto parallelIt = parallelLoops.find(loop);
   885      if (parallelIt == parallelLoops.end())
   886        return false;
   887      int memRefDim = -1;
   888      auto vectorizableBody = isVectorizableLoopBody(loop, &memRefDim);
   889      if (!vectorizableBody)
   890        return false;
   891      return memRefDim == -1 || fastestVaryingMemRefDimension == -1 ||
   892             memRefDim == fastestVaryingMemRefDimension;
   893    };
   894  }
   895  
   896  /// Apply vectorization of `loop` according to `state`. This is only triggered
   897  /// if all vectorizations in `childrenMatches` have already succeeded
   898  /// recursively in DFS post-order.
   899  static LogicalResult
   900  vectorizeLoopsAndLoadsRecursively(NestedMatch oneMatch,
   901                                    VectorizationState *state) {
   902    auto *loopInst = oneMatch.getMatchedOperation();
   903    auto loop = cast<AffineForOp>(loopInst);
   904    auto childrenMatches = oneMatch.getMatchedChildren();
   905  
   906    // 1. DFS postorder recursion, if any of my children fails, I fail too.
   907    for (auto m : childrenMatches) {
   908      if (failed(vectorizeLoopsAndLoadsRecursively(m, state))) {
   909        return failure();
   910      }
   911    }
   912  
   913    // 2. This loop may have been omitted from vectorization for various reasons
   914    // (e.g. due to the performance model or pattern depth > vector size).
   915    auto it = state->strategy->loopToVectorDim.find(loopInst);
   916    if (it == state->strategy->loopToVectorDim.end()) {
   917      return success();
   918    }
   919  
   920    // 3. Actual post-order transformation.
   921    auto vectorDim = it->second;
   922    assert(vectorDim < state->strategy->vectorSizes.size() &&
   923           "vector dim overflow");
   924    //   a. get actual vector size
   925    auto vectorSize = state->strategy->vectorSizes[vectorDim];
   926    //   b. loop transformation for early vectorization is still subject to
   927    //     exploratory tradeoffs (see top of the file). Apply coarsening, i.e.:
   928    //        | ub -> ub
   929    //        | step -> step * vectorSize
   930    LLVM_DEBUG(dbgs() << "\n[early-vect] vectorizeForOp by " << vectorSize
   931                      << " : ");
   932    LLVM_DEBUG(loopInst->print(dbgs()));
   933    return vectorizeAffineForOp(loop, loop.getStep() * vectorSize, state);
   934  }
   935  
   936  /// Tries to transform a scalar constant into a vector splat of that constant.
   937  /// Returns the vectorized splat operation if the constant is a valid vector
   938  /// element type.
   939  /// If `type` is not a valid vector type or if the scalar constant is not a
   940  /// valid vector element type, returns nullptr.
   941  static Value *vectorizeConstant(Operation *op, ConstantOp constant, Type type) {
   942    if (!type || !type.isa<VectorType>() ||
   943        !VectorType::isValidElementType(constant.getType())) {
   944      return nullptr;
   945    }
   946    OpBuilder b(op);
   947    Location loc = op->getLoc();
   948    auto vectorType = type.cast<VectorType>();
   949    auto attr = DenseElementsAttr::get(vectorType, constant.getValue());
   950    auto *constantOpInst = constant.getOperation();
   951  
   952    OperationState state(loc, constantOpInst->getName().getStringRef(), {},
   953                         {vectorType}, {b.getNamedAttr("value", attr)});
   954  
   955    return b.createOperation(state)->getResult(0);
   956  }
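
        // Illustration (not part of the original code): splatting the scalar
        //   %f1 = constant 1.0 : f32
        // for type vector<256xf32> yields, as in the first example of the file
        // header,
        //   %cst = constant dense<vector<256xf32>, 1.0> : vector<256xf32>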
   957  
   958  /// Tries to vectorize a given `operand` of Operation `op` during
   959  /// use-def chain propagation or during terminal vectorization, by applying the
   960  /// following logic:
   961  /// 1. if the defining operation is part of the vectorizedSet (i.e. vectorized
   962  ///    by use-def propagation), `operand` is already in the proper vector form;
   963  /// 2. otherwise, `operand` may be in some other vector form that fails to
   964  ///    vectorize at the moment (i.e. broadcasting required), returns nullptr to
   965  ///    indicate failure;
   966  /// 3. if `operand` is a constant, returns the vectorized form of the constant;
   967  /// 4. non-constant scalars are currently non-vectorizable, in particular to
   968  ///    guard against vectorizing an index which may be loop-variant and needs
   969  ///    special handling.
   970  ///
   971  /// In particular this logic captures some of the use cases where definitions
   972  /// that are not scoped under the current pattern are needed to vectorize.
   973  /// One such example is top level function constants that need to be splatted.
   974  ///
   975  /// Returns an operand that has been vectorized to match `state`'s strategy if
   976  /// vectorization is possible with the above logic. Returns nullptr otherwise.
   977  ///
   978  /// TODO(ntv): handle more complex cases.
   979  static Value *vectorizeOperand(Value *operand, Operation *op,
   980                                 VectorizationState *state) {
   981    LLVM_DEBUG(dbgs() << "\n[early-vect]vectorize operand: ");
   982    LLVM_DEBUG(operand->print(dbgs()));
   983    // 1. If this value has already been vectorized this round, we are done.
   984    if (state->vectorizedSet.count(operand->getDefiningOp()) > 0) {
   985      LLVM_DEBUG(dbgs() << " -> already vector operand");
   986      return operand;
   987    }
   988    // 1.b. Delayed on-demand replacement of a use.
   989    //    Note that we cannot just call replaceAllUsesWith because it may result
   990    //    in ops with mixed types, for ops whose operands have not all yet
   991    //    been vectorized. This would be invalid IR.
   992    auto it = state->replacementMap.find(operand);
   993    if (it != state->replacementMap.end()) {
   994      auto *res = it->second;
   995      LLVM_DEBUG(dbgs() << "-> delayed replacement by: ");
   996      LLVM_DEBUG(res->print(dbgs()));
   997      return res;
   998    }
   999    // 2. TODO(ntv): broadcast needed.
  1000    if (operand->getType().isa<VectorType>()) {
  1001      LLVM_DEBUG(dbgs() << "-> non-vectorizable");
  1002      return nullptr;
  1003    }
  1004    // 3. vectorize constant.
  1005    if (auto constant = dyn_cast<ConstantOp>(operand->getDefiningOp())) {
  1006      return vectorizeConstant(
  1007          op, constant,
  1008          VectorType::get(state->strategy->vectorSizes, operand->getType()));
  1009    }
  1010    // 4. currently non-vectorizable.
  1011    LLVM_DEBUG(dbgs() << "-> non-vectorizable");
  1012    LLVM_DEBUG(operand->print(dbgs()));
  1013    return nullptr;
  1014  }
  1015  
  1016  /// Encodes Operation-specific behavior for vectorization. In general we assume
  1017  /// that all operands of an op must be vectorized but this is not always true.
  1018  /// In the future, it would be nice to have a trait that describes how a
  1019  /// particular operation vectorizes. For now we implement the case distinction
  1020  /// here.
  1021  /// Returns a vectorized form of an operation or nullptr if vectorization fails.
  1022  // TODO(ntv): consider adding a trait to Op to describe how it gets vectorized.
  1023  // Maybe some Ops are not vectorizable or require some tricky logic, so we
  1024  // cannot do one-off logic here; ideally it would be TableGen'd.
  1025  static Operation *vectorizeOneOperation(Operation *opInst,
  1026                                          VectorizationState *state) {
  1027    // Sanity checks.
  1028    assert(!isa<AffineLoadOp>(opInst) &&
  1029           "all loads must have already been fully vectorized independently");
  1030    assert(!isa<vector::VectorTransferReadOp>(opInst) &&
  1031           "vector.transfer_read cannot be further vectorized");
  1032    assert(!isa<vector::VectorTransferWriteOp>(opInst) &&
  1033           "vector.transfer_write cannot be further vectorized");
  1034  
  1035    if (auto store = dyn_cast<AffineStoreOp>(opInst)) {
  1036      OpBuilder b(opInst);
  1037      auto *memRef = store.getMemRef();
  1038      auto *value = store.getValueToStore();
  1039      auto *vectorValue = vectorizeOperand(value, opInst, state);
  1040  
  1041      SmallVector<Value *, 4> mapOperands(store.getIndices());
  1042      SmallVector<Value *, 8> indices;
  1043      indices.reserve(store.getMemRefType().getRank());
  1044      if (store.getAffineMap() !=
  1045          b.getMultiDimIdentityMap(store.getMemRefType().getRank())) {
  1046        computeMemoryOpIndices(opInst, store.getAffineMap(), mapOperands,
  1047                               indices);
  1048      } else {
  1049        indices.append(store.getIndices().begin(), store.getIndices().end());
  1050      }
  1051  
  1052      auto permutationMap =
  1053          makePermutationMap(opInst, indices, state->strategy->loopToVectorDim);
  1054      if (!permutationMap)
  1055        return nullptr;
  1056      LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ permutationMap: ");
  1057      LLVM_DEBUG(permutationMap.print(dbgs()));
  1058      auto transfer = b.create<vector::VectorTransferWriteOp>(
  1059          opInst->getLoc(), vectorValue, memRef, indices, permutationMap);
  1060      auto *res = transfer.getOperation();
  1061      LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ vectorized store: " << *res);
  1062      // "Terminals" (i.e. AffineStoreOps) are erased on the spot.
  1063      opInst->erase();
  1064      return res;
  1065    }
  1066    if (opInst->getNumRegions() != 0)
  1067      return nullptr;
  1068  
  1069    SmallVector<Type, 8> vectorTypes;
  1070    for (auto *v : opInst->getResults()) {
  1071      vectorTypes.push_back(
  1072          VectorType::get(state->strategy->vectorSizes, v->getType()));
  1073    }
  1074    SmallVector<Value *, 8> vectorOperands;
  1075    for (auto *v : opInst->getOperands()) {
  1076      vectorOperands.push_back(vectorizeOperand(v, opInst, state));
  1077    }
  1078    // Check whether any operand is null; if so, vectorization failed.
  1079    bool success = llvm::all_of(vectorOperands, [](Value *op) { return op; });
  1080    if (!success) {
  1081      LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ an operand failed vectorize");
  1082      return nullptr;
  1083    }
  1084  
  1085    // Create a clone of the op with the vectorized operands and result types.
  1086    // TODO(ntv): The following assumes there is always an op with a fixed
  1087    // name that works both in scalar mode and vector mode.
  1088    // TODO(ntv): Is it worth considering an Operation.clone operation which
  1089    // changes the type so we can promote an Operation with less boilerplate?
  1090    OpBuilder b(opInst);
  1091    OperationState newOp(opInst->getLoc(), opInst->getName().getStringRef(),
  1092                         vectorOperands, vectorTypes, opInst->getAttrs(),
  1093                         /*successors=*/{},
  1094                         /*regions=*/{}, opInst->hasResizableOperandsList());
  1095    return b.createOperation(newOp);
  1096  }
  1097  
  1098  /// Iterates over the forward slice from the loads in the vectorization pattern
  1099  /// and rewrites its operations using their vectorized counterparts by:
  1100  ///   1. Creating the forward slice starting from the loads in the vectorization
  1101  ///   pattern.
  1102  ///   2. Topologically sorting the forward slice.
  1103  ///   3. For each operation in the slice, creating the vector form of the
  1104  ///   operation, replacing each operand by the replacement retrieved from
  1105  ///   replacementMap. If any such replacement is missing, vectorization fails.
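        ///
        /// As a purely illustrative example (hypothetical IR, 1-D strategy of size
        /// 128), given the chain:
        ///   %0 = affine.load %A[%i] : memref<?xf32>   // root, already vectorized
        ///   %1 = addf %0, %0 : f32                     // non-terminal, handled here
        ///   affine.store %1, %B[%i] : memref<?xf32>    // terminal, post-processed
        /// this function rewrites %1 into an addf on vector<128xf32>, looking up its
        /// operands in replacementMap and registering the result for later uses.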
  1106  static LogicalResult vectorizeNonTerminals(VectorizationState *state) {
  1107    // 1. Create the initial worklist with the uses of the roots.
  1108    SetVector<Operation *> worklist;
  1109    // Note: state->roots have already been vectorized and must not be vectorized
  1110    // again. This fits `getForwardSlice` which does not insert `op` in the
  1111    // result.
  1112    // Note: we have to exclude terminals because some of their defs may not be
  1113    // nested under the vectorization pattern (e.g. constants defined in an
  1114    // encompassing scope).
  1115    // TODO(ntv): Use a backward slice for terminals, avoid special casing and
  1116    // merge implementations.
  1117    for (auto *op : state->roots) {
  1118      getForwardSlice(op, &worklist, [state](Operation *op) {
  1119        return state->terminals.count(op) == 0; // propagate if not terminal
  1120      });
  1121    }
  1122    // We merged multiple slices, topological order may not hold anymore.
  1123    worklist = topologicalSort(worklist);
  1124  
  1125    for (unsigned i = 0; i < worklist.size(); ++i) {
  1126      auto *op = worklist[i];
  1127      LLVM_DEBUG(dbgs() << "\n[early-vect] vectorize use: ");
  1128      LLVM_DEBUG(op->print(dbgs()));
  1129  
  1130      // 2. Create the vector form of the operation and insert it just before
  1131      //    op; on success, op is registered as replaced below.
  1132      auto *vectorizedInst = vectorizeOneOperation(op, state);
  1133      if (!vectorizedInst) {
  1134        return failure();
  1135      }
  1136  
  1137      // 3. Register replacement for future uses in the scope.
  1138      //    Note that we cannot just call replaceAllUsesWith because it may
  1139      //    result in ops with mixed types, for ops whose operands have not all
  1140      //    yet been vectorized. This would be invalid IR.
  1141      state->registerReplacement(op, vectorizedInst);
  1142    }
  1143    return success();
  1144  }
  1145  
  1146  /// Vectorization is a recursive procedure in which anything below can fail.
  1147  /// The root match thus needs to maintain a clone for handling failure.
  1148  /// Each root may succeed independently but will otherwise clean up after itself
  1149  /// if anything below it fails.
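        ///
        /// As a conceptual sketch (hypothetical IR, vector size 128; the exact printed
        /// form of the transfer ops may differ), a successful root match rewrites:
        ///   affine.for %i = 0 to %N {
        ///     %a = affine.load %A[%i] : memref<?xf32>
        ///     affine.store %a, %B[%i] : memref<?xf32>
        ///   }
        /// into roughly:
        ///   affine.for %i = 0 to %N step 128 {
        ///     %va = vector.transfer_read %A[%i] {permutation_map = (d0) -> (d0)}
        ///         : memref<?xf32>, vector<128xf32>
        ///     vector.transfer_write %va, %B[%i] {permutation_map = (d0) -> (d0)}
        ///         : vector<128xf32>, memref<?xf32>
        ///   }
        /// whereas a failed match leaves the original loop untouched, courtesy of the
        /// clone-based guard set up below.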
  1150  static LogicalResult vectorizeRootMatch(NestedMatch m,
  1151                                          VectorizationStrategy *strategy) {
  1152    auto loop = cast<AffineForOp>(m.getMatchedOperation());
  1153    VectorizationState state;
  1154    state.strategy = strategy;
  1155  
  1156    // Since patterns are recursive, they can very well intersect.
  1157    // Since we do not want a fully greedy strategy in general, we decouple
  1158    // pattern matching from profitability analysis and from application.
  1159    // As a consequence we must check that each root pattern is still
  1160    // vectorizable. If a pattern is not vectorizable anymore, we just skip it.
  1161    // TODO(ntv): implement a non-greedy profitability analysis that keeps only
  1162    // non-intersecting patterns.
  1163    if (!isVectorizableLoopBody(loop)) {
  1164      LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ loop is not vectorizable");
  1165      return failure();
  1166    }
  1167  
  1168    /// Sets up error handling for this root loop. The root match maintains a
  1169    /// pristine clone: on failure the partially vectorized loop is erased and the
  1170    /// clone takes its place; on success the clone is erased.
  1171    auto *loopInst = loop.getOperation();
  1172    OpBuilder builder(loopInst);
  1173    auto clonedLoop = cast<AffineForOp>(builder.clone(*loopInst));
  1174    struct Guard {
  1175      LogicalResult failure() {
  1176        loop.getInductionVar()->replaceAllUsesWith(clonedLoop.getInductionVar());
  1177        loop.erase();
  1178        return mlir::failure();
  1179      }
  1180      LogicalResult success() {
  1181        clonedLoop.erase();
  1182        return mlir::success();
  1183      }
  1184      AffineForOp loop;
  1185      AffineForOp clonedLoop;
  1186    } guard{loop, clonedLoop};
  1187  
  1188    //////////////////////////////////////////////////////////////////////////////
  1189    // Start vectorizing.
  1190    // From now on, any error triggers the scope guard above.
  1191    //////////////////////////////////////////////////////////////////////////////
  1192    // 1. Vectorize all the loops matched by the pattern, recursively.
  1193    // This also vectorizes the roots (AffineLoadOp) and registers the
  1194    // terminals (AffineStoreOp) for post-processing vectorization (we need to
  1195    // wait for all use-def chains into them to be vectorized first).
  1196    if (failed(vectorizeLoopsAndLoadsRecursively(m, &state))) {
  1197      LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ failed root vectorizeLoop");
  1198      return guard.failure();
  1199    }
  1200  
  1201    // 2. Vectorize operations reached by use-def chains from root except the
  1202    // terminals (store operations) that need to be post-processed separately.
  1203    // TODO(ntv): add more as we expand.
  1204    if (failed(vectorizeNonTerminals(&state))) {
  1205      LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ failed vectorizeNonTerminals");
  1206      return guard.failure();
  1207    }
  1208  
  1209    // 3. Post-process terminals.
  1210    // Note: we have to post-process terminals because some of their defs may not
  1211    // be nested under the vectorization pattern (e.g. constants defined in an
  1212    // encompassing scope).
  1213    // TODO(ntv): Use a backward slice for terminals, avoid special casing and
  1214    // merge implementations.
  1215    for (auto *op : state.terminals) {
  1216      if (!vectorizeOneOperation(op, &state)) { // nullptr == failure
  1217        LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ failed to vectorize terminals");
  1218        return guard.failure();
  1219      }
  1220    }
  1221  
  1222    // 4. Finish this vectorization pattern.
  1223    LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ success vectorizing pattern");
  1224    state.finishVectorizationPattern();
  1225    return guard.success();
  1226  }
  1227  
  1228  /// Applies vectorization to the current Function by searching over a set of
  1229  /// predetermined patterns.
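        //
        // A hypothetical invocation (the pass name matches the PassRegistration at
        // the bottom of this file; the option name for the vector sizes is shown for
        // illustration only and may differ from the one registered elsewhere):
        //   mlir-opt -affine-vectorize -virtual-vector-size=128 input.mlir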
  1230  void Vectorize::runOnFunction() {
  1231    FuncOp f = getFunction();
  1232    if (!fastestVaryingPattern.empty() &&
  1233        fastestVaryingPattern.size() != vectorSizes.size()) {
  1234      f.emitRemark("Fastest varying pattern specified with a different size "
  1235                   "than the vector size.");
  1236      return signalPassFailure();
  1237    }
  1238  
  1239    // Thread-safe RAII local context; the BumpPtrAllocator is freed on exit.
  1240    NestedPatternContext mlContext;
  1241  
  1242    llvm::DenseSet<Operation *> parallelLoops;
  1243    f.walk([&parallelLoops](AffineForOp loop) {
  1244      if (isLoopParallel(loop))
  1245        parallelLoops.insert(loop);
  1246    });
  1247  
  1248    for (auto &pat :
  1249         makePatterns(parallelLoops, vectorSizes.size(), fastestVaryingPattern)) {
  1250      LLVM_DEBUG(dbgs() << "\n******************************************");
  1251      LLVM_DEBUG(dbgs() << "\n******************************************");
  1252      LLVM_DEBUG(dbgs() << "\n[early-vect] new pattern on Function\n");
  1253      LLVM_DEBUG(f.print(dbgs()));
  1254      unsigned patternDepth = pat.getDepth();
  1255  
  1256      SmallVector<NestedMatch, 8> matches;
  1257      pat.match(f, &matches);
  1258      // Iterate over all the top-level matches and vectorize eagerly.
  1259      // This automatically prunes intersecting matches.
  1260      for (auto m : matches) {
  1261        VectorizationStrategy strategy;
  1262        // TODO(ntv): depending on profitability, elect to reduce the vector size.
  1263        strategy.vectorSizes.assign(vectorSizes.begin(), vectorSizes.end());
  1264        if (failed(analyzeProfitability(m.getMatchedChildren(), 1, patternDepth,
  1265                                        &strategy))) {
  1266          continue;
  1267        }
  1268        vectorizeLoopIfProfitable(m.getMatchedOperation(), 0, patternDepth,
  1269                                  &strategy);
  1270        // TODO(ntv): if pattern does not apply, report it; alter the
  1271        // cost/benefit.
  1272        vectorizeRootMatch(m, &strategy);
  1273        // TODO(ntv): some diagnostics if failure to vectorize occurs.
  1274      }
  1275    }
  1276    LLVM_DEBUG(dbgs() << "\n");
  1277  }
  1278  
  1279  std::unique_ptr<FunctionPassBase>
  1280  mlir::createVectorizePass(llvm::ArrayRef<int64_t> virtualVectorSize) {
  1281    return std::make_unique<Vectorize>(virtualVectorSize);
  1282  }
  1283  
  1284  static PassRegistration<Vectorize>
  1285      pass("affine-vectorize",
  1286           "Vectorize to a target independent n-D vector abstraction");
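
        // A minimal sketch of scheduling this pass programmatically, assuming a
        // PassManager API compatible with this MLIR revision (adjust as needed):
        //
        //   mlir::PassManager pm(module.getContext());
        //   pm.addPass(mlir::createVectorizePass(/*virtualVectorSize=*/{128}));
        //   if (failed(pm.run(module)))
        //     llvm::errs() << "affine vectorization pipeline failed\n";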