github.com/gagliardetto/golang-go@v0.0.0-20201020153340-53909ea70814/cmd/compile/internal/ssa/gen/S390X.rules

     1  // Copyright 2016 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Lowering arithmetic
     6  (Add(64|Ptr)  x y) -> (ADD  x y)
     7  (Add(32|16|8)  x y) -> (ADDW  x y)
     8  (Add32F x y) -> (FADDS x y)
     9  (Add64F x y) -> (FADD x y)
    10  
    11  (Sub(64|Ptr)  x y) -> (SUB  x y)
    12  (Sub(32|16|8)  x y) -> (SUBW  x y)
    13  (Sub32F x y) -> (FSUBS x y)
    14  (Sub64F x y) -> (FSUB x y)
    15  
    16  (Mul64  x y) -> (MULLD  x y)
    17  (Mul(32|16|8)  x y) -> (MULLW  x y)
    18  (Mul32F x y) -> (FMULS x y)
    19  (Mul64F x y) -> (FMUL x y)
    20  (Mul64uhilo x y) -> (MLGR x y)
    21  
    22  (Div32F x y) -> (FDIVS x y)
    23  (Div64F x y) -> (FDIV x y)
    24  
    25  (Div64  x y) -> (DIVD  x y)
    26  (Div64u x y) -> (DIVDU x y)
    27  // DIVW/DIVWU have a 64-bit dividend and a 32-bit divisor,
    28  // so a sign/zero extension of the dividend is required.
    29  (Div32  x y) -> (DIVW  (MOVWreg x) y)
    30  (Div32u x y) -> (DIVWU (MOVWZreg x) y)
    31  (Div16  x y) -> (DIVW  (MOVHreg x) (MOVHreg y))
    32  (Div16u x y) -> (DIVWU (MOVHZreg x) (MOVHZreg y))
    33  (Div8   x y) -> (DIVW  (MOVBreg x) (MOVBreg y))
    34  (Div8u  x y) -> (DIVWU (MOVBZreg x) (MOVBZreg y))
    35  
    36  (Hmul(64|64u)  x y) -> (MULH(D|DU)  x y)
    37  (Hmul32  x y) -> (SRDconst [32] (MULLD (MOVWreg x) (MOVWreg y)))
    38  (Hmul32u x y) -> (SRDconst [32] (MULLD (MOVWZreg x) (MOVWZreg y)))
    39  
    40  (Mod(64|64u)  x y) -> (MOD(D|DU)  x y)
    41  // MODW/MODWU have a 64-bit dividend and a 32-bit divisor,
    42  // so a sign/zero extension of the dividend is required.
    43  (Mod32  x y) -> (MODW  (MOVWreg x) y)
    44  (Mod32u x y) -> (MODWU (MOVWZreg x) y)
    45  (Mod16  x y) -> (MODW  (MOVHreg x) (MOVHreg y))
    46  (Mod16u x y) -> (MODWU (MOVHZreg x) (MOVHZreg y))
    47  (Mod8   x y) -> (MODW  (MOVBreg x) (MOVBreg y))
    48  (Mod8u  x y) -> (MODWU (MOVBZreg x) (MOVBZreg y))
    49  
    50  // (x + y) / 2 with x>=y -> (x - y) / 2 + y
    51  (Avg64u <t> x y) -> (ADD (SRDconst <t> (SUB <t> x y) [1]) y)
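        // For example, with x = 1<<64 - 1 and y = 1 the naive sum x + y wraps to 0,
        // but (x - y)/2 + y = (1<<63 - 1) + 1 = 1<<63, the correct average.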
    52  
    53  (And64 x y) -> (AND x y)
    54  (And(32|16|8) x y) -> (ANDW x y)
    55  
    56  (Or64 x y) -> (OR x y)
    57  (Or(32|16|8) x y) -> (ORW x y)
    58  
    59  (Xor64 x y) -> (XOR x y)
    60  (Xor(32|16|8) x y) -> (XORW x y)
    61  
    62  (Neg64 x) -> (NEG x)
    63  (Neg(32|16|8) x) -> (NEGW x)
    64  (Neg32F x) -> (FNEGS x)
    65  (Neg64F x) -> (FNEG x)
    66  
    67  (Com64 x) -> (NOT x)
    68  (Com(32|16|8) x) -> (NOTW x)
    69  (NOT x) && true -> (XOR (MOVDconst [-1]) x)
    70  (NOTW x) && true -> (XORWconst [-1] x)
    71  
    72  // Lowering boolean ops
    73  (AndB x y) -> (ANDW x y)
    74  (OrB x y) -> (ORW x y)
    75  (Not x) -> (XORWconst [1] x)
    76  
    77  // Lowering pointer arithmetic
    78  (OffPtr [off] ptr:(SP)) -> (MOVDaddr [off] ptr)
    79  (OffPtr [off] ptr) && is32Bit(off) -> (ADDconst [off] ptr)
    80  (OffPtr [off] ptr) -> (ADD (MOVDconst [off]) ptr)
    81  
    82  // TODO: optimize these cases?
    83  (Ctz64NonZero x) -> (Ctz64 x)
    84  (Ctz32NonZero x) -> (Ctz32 x)
    85  
    86  // Ctz(x) = 64 - findLeftmostOne((x-1)&^x)
    87  (Ctz64 <t> x) -> (SUB (MOVDconst [64]) (FLOGR (AND <t> (SUBconst <t> [1] x) (NOT <t> x))))
    88  (Ctz32 <t> x) -> (SUB (MOVDconst [64]) (FLOGR (MOVWZreg (ANDW <t> (SUBWconst <t> [1] x) (NOTW <t> x)))))
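        // In Go terms (math/bits) this is the identity
        //   bits.TrailingZeros64(x) == 64 - bits.LeadingZeros64((x-1) &^ x)
        // For example, with x = 8 (...1000) the mask (x-1)&^x is 7 (...0111), whose
        // leftmost one bit is bit 61 counting from the most significant bit, so
        // FLOGR returns 61 and the result is 64 - 61 = 3. For x = 0 the mask is all
        // ones, FLOGR returns 0 and the result is 64.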
    89  
    90  (BitLen64 x) -> (SUB (MOVDconst [64]) (FLOGR x))
    91  
    92  // POPCNT treats the input register as a vector of 8 bytes, producing
    93  // a population count for each individual byte. For inputs larger than
    94  // a single byte we therefore need to sum the individual bytes produced
    95  // by the POPCNT instruction. For example, the following instruction
    96  // sequence could be used to calculate the population count of a 4-byte
    97  // value:
    98  //
    99  //     MOVD   $0x12345678, R1 // R1=0x12345678 <-- input
   100  //     POPCNT R1, R2          // R2=0x02030404
   101  //     SRW    $16, R2, R3     // R3=0x00000203
   102  //     ADDW   R2, R3, R4      // R4=0x02030607
   103  //     SRW    $8, R4, R5      // R5=0x00020306
   104  //     ADDW   R4, R5, R6      // R6=0x0205090d
   105  //     MOVBZ  R6, R7          // R7=0x0000000d <-- result is 13
   106  //
   107  (PopCount8  x) -> (POPCNT (MOVBZreg x))
   108  (PopCount16 x) -> (MOVBZreg (SumBytes2 (POPCNT <typ.UInt16> x)))
   109  (PopCount32 x) -> (MOVBZreg (SumBytes4 (POPCNT <typ.UInt32> x)))
   110  (PopCount64 x) -> (MOVBZreg (SumBytes8 (POPCNT <typ.UInt64> x)))
   111  
   112  // SumBytes{2,4,8} pseudo operations sum the values of the rightmost
   113  // 2, 4 or 8 bytes respectively. The result is a single byte; however, the
   114  // other bytes might contain junk, so a zero extension is required if
   115  // the desired output type is larger than 1 byte.
   116  (SumBytes2 x) -> (ADDW (SRWconst <typ.UInt8> x [8]) x)
   117  (SumBytes4 x) -> (SumBytes2 (ADDW <typ.UInt16> (SRWconst <typ.UInt16> x [16]) x))
   118  (SumBytes8 x) -> (SumBytes4 (ADDW <typ.UInt32> (SRDconst <typ.UInt32> x [32]) x))
   119  
   120  (Bswap64 x) -> (MOVDBR x)
   121  (Bswap32 x) -> (MOVWBR x)
   122  
   123  // add with carry
   124  (Select0 (Add64carry x y c))
   125    -> (Select0 <typ.UInt64> (ADDE x y (Select1 <types.TypeFlags> (ADDCconst c [-1]))))
   126  (Select1 (Add64carry x y c))
   127    -> (Select0 <typ.UInt64> (ADDE (MOVDconst [0]) (MOVDconst [0]) (Select1 <types.TypeFlags> (ADDE x y (Select1 <types.TypeFlags> (ADDCconst c [-1]))))))
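        // Roughly: ADDCconst c [-1] adds -1 to the 0/1 carry input, producing a
        // machine carry exactly when c is 1. ADDE then computes x + y + carry, and
        // the outer ADDE of two zeros reads the resulting carry back out as 0 or 1.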
   128  
   129  // subtract with borrow
   130  (Select0 (Sub64borrow x y c))
   131    -> (Select0 <typ.UInt64> (SUBE x y (Select1 <types.TypeFlags> (SUBC (MOVDconst [0]) c))))
   132  (Select1 (Sub64borrow x y c))
   133    -> (NEG (Select0 <typ.UInt64> (SUBE (MOVDconst [0]) (MOVDconst [0]) (Select1 <types.TypeFlags> (SUBE x y (Select1 <types.TypeFlags> (SUBC (MOVDconst [0]) c)))))))
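        // Roughly: SUBC computes 0 - c, which signals a borrow exactly when c is 1
        // (for logical subtraction the carry flag means "no borrow"). SUBE then
        // computes x - y - borrow, and the outer SUBE of two zeros yields 0 or -1
        // depending on the final borrow, which NEG converts to 0 or 1.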
   134  
   135  // math package intrinsics
   136  (Sqrt        x) -> (FSQRT x)
   137  (Floor       x) -> (FIDBR [7] x)
   138  (Ceil        x) -> (FIDBR [6] x)
   139  (Trunc       x) -> (FIDBR [5] x)
   140  (RoundToEven x) -> (FIDBR [4] x)
   141  (Round       x) -> (FIDBR [1] x)
   142  (FMA     x y z) -> (FMADD z x y)
   143  
   144  // Atomic loads and stores.
   145  // The SYNC instruction (fast-BCR-serialization) prevents store-load
   146  // reordering. Other sequences of memory operations (load-load,
   147  // store-store and load-store) are already guaranteed not to be reordered.
   148  (AtomicLoad(8|32|Acq32|64|Ptr) ptr mem) -> (MOV(BZ|WZ|WZ|D|D)atomicload ptr mem)
   149  (AtomicStore(8|32|64|PtrNoWB) ptr val mem) -> (SYNC (MOV(B|W|D|D)atomicstore ptr val mem))
   150  
   151  // Store-release doesn't require store-load ordering.
   152  (AtomicStoreRel32 ptr val mem) -> (MOVWatomicstore ptr val mem)
   153  
   154  // Atomic adds.
   155  (AtomicAdd32 ptr val mem) -> (AddTupleFirst32 val (LAA ptr val mem))
   156  (AtomicAdd64 ptr val mem) -> (AddTupleFirst64 val (LAAG ptr val mem))
   157  (Select0 <t> (AddTupleFirst32 val tuple)) -> (ADDW val (Select0 <t> tuple))
   158  (Select1     (AddTupleFirst32   _ tuple)) -> (Select1 tuple)
   159  (Select0 <t> (AddTupleFirst64 val tuple)) -> (ADD val (Select0 <t> tuple))
   160  (Select1     (AddTupleFirst64   _ tuple)) -> (Select1 tuple)
   161  
   162  // Atomic exchanges.
   163  (AtomicExchange32 ptr val mem) -> (LoweredAtomicExchange32 ptr val mem)
   164  (AtomicExchange64 ptr val mem) -> (LoweredAtomicExchange64 ptr val mem)
   165  
   166  // Atomic compare and swap.
   167  (AtomicCompareAndSwap32 ptr old new_ mem) -> (LoweredAtomicCas32 ptr old new_ mem)
   168  (AtomicCompareAndSwap64 ptr old new_ mem) -> (LoweredAtomicCas64 ptr old new_ mem)
   169  
   170  // Atomic and: *(*uint8)(ptr) &= val
   171  //
   172  // Round pointer down to nearest word boundary and pad value with ones before
   173  // applying atomic AND operation to target word.
   174  //
   175  // *(*uint32)(ptr &^ 3) &= rotateleft(uint32(val) | 0xffffff00, ((3 << 3) ^ ((ptr & 3) << 3)))
   176  //
   177  (AtomicAnd8 ptr val mem)
   178    -> (LANfloor
   179         ptr
   180         (RLL <typ.UInt32>
   181           (ORWconst <typ.UInt32> val [-1<<8])
   182           (RXSBG <typ.UInt32> {s390x.NewRotateParams(59, 60, 3)} (MOVDconst [3<<3]) ptr))
   183         mem)
   184  
   185  // Atomic or: *(*uint8)(ptr) |= val
   186  //
   187  // Round pointer down to nearest word boundary and pad value with zeros before
   188  // applying atomic OR operation to target word.
   189  //
   190  // *(*uint32)(ptr &^ 3) |= uint32(val) << ((3 << 3) ^ ((ptr & 3) << 3))
   191  //
   192  (AtomicOr8  ptr val mem)
   193    -> (LAOfloor
   194         ptr
   195         (SLW <typ.UInt32>
   196           (MOVBZreg <typ.UInt32> val)
   197           (RXSBG <typ.UInt32> {s390x.NewRotateParams(59, 60, 3)} (MOVDconst [3<<3]) ptr))
   198         mem)
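        // The RXSBG above computes (3<<3) ^ ((ptr&3)<<3), the bit offset of the
        // addressed byte within its big-endian word. For example, if ptr&3 == 1
        // the shift is 24 ^ 8 = 16, which places the byte value in bits 8-15 of
        // the word counting from the most significant bit.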
   199  
   200  // Lowering extension
   201  // Note: we always extend to 64 bits even though some ops don't need that many result bits.
   202  (SignExt8to(16|32|64)  x) -> (MOVBreg x)
   203  (SignExt16to(32|64) x) -> (MOVHreg x)
   204  (SignExt32to64 x) -> (MOVWreg x)
   205  
   206  (ZeroExt8to(16|32|64)  x) -> (MOVBZreg x)
   207  (ZeroExt16to(32|64) x) -> (MOVHZreg x)
   208  (ZeroExt32to64 x) -> (MOVWZreg x)
   209  
   210  (Slicemask <t> x) -> (SRADconst (NEG <t> x) [63])
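        // Slicemask yields 0 for x == 0 and all ones for x > 0: NEG x has its sign
        // bit set exactly when x is nonzero (lengths are non-negative), and the
        // arithmetic right shift by 63 broadcasts that bit to the whole register.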
   211  
   212  // Lowering truncation
   213  // Because we ignore high parts of registers, truncates are just copies.
   214  (Trunc(16|32|64)to8  x) -> x
   215  (Trunc(32|64)to16 x) -> x
   216  (Trunc64to32 x) -> x
   217  
   218  // Lowering float <-> int
   219  (Cvt32to32F x) -> (CEFBRA x)
   220  (Cvt32to64F x) -> (CDFBRA x)
   221  (Cvt64to32F x) -> (CEGBRA x)
   222  (Cvt64to64F x) -> (CDGBRA x)
   223  
   224  (Cvt32Fto32 x) -> (CFEBRA x)
   225  (Cvt32Fto64 x) -> (CGEBRA x)
   226  (Cvt64Fto32 x) -> (CFDBRA x)
   227  (Cvt64Fto64 x) -> (CGDBRA x)
   228  
   229  (Cvt32Fto64F x) -> (LDEBR x)
   230  (Cvt64Fto32F x) -> (LEDBR x)
   231  
   232  (Round(32|64)F x) -> (LoweredRound(32|64)F x)
   233  
   234  // Lowering shifts
   235  
   236  // Lower bounded shifts first. No need to check shift value.
   237  (Lsh64x(64|32|16|8)  x y) && shiftIsBounded(v) -> (SLD x y)
   238  (Lsh32x(64|32|16|8)  x y) && shiftIsBounded(v) -> (SLW x y)
   239  (Lsh16x(64|32|16|8)  x y) && shiftIsBounded(v) -> (SLW x y)
   240  (Lsh8x(64|32|16|8)   x y) && shiftIsBounded(v) -> (SLW x y)
   241  (Rsh64Ux(64|32|16|8) x y) && shiftIsBounded(v) -> (SRD x y)
   242  (Rsh32Ux(64|32|16|8) x y) && shiftIsBounded(v) -> (SRW x y)
   243  (Rsh16Ux(64|32|16|8) x y) && shiftIsBounded(v) -> (SRW (MOVHZreg x) y)
   244  (Rsh8Ux(64|32|16|8)  x y) && shiftIsBounded(v) -> (SRW (MOVBZreg x) y)
   245  (Rsh64x(64|32|16|8)  x y) && shiftIsBounded(v) -> (SRAD x y)
   246  (Rsh32x(64|32|16|8)  x y) && shiftIsBounded(v) -> (SRAW x y)
   247  (Rsh16x(64|32|16|8)  x y) && shiftIsBounded(v) -> (SRAW (MOVHreg x) y)
   248  (Rsh8x(64|32|16|8)   x y) && shiftIsBounded(v) -> (SRAW (MOVBreg x) y)
   249  
   250  // Unsigned shifts need to return 0 if shift amount is >= width of shifted value.
   251  //   result = shift >= 64 ? 0 : arg << shift
   252  (Lsh(64|32|16|8)x64 <t> x y) -> (LOCGR {s390x.GreaterOrEqual} <t> (SL(D|W|W|W) <t> x y) (MOVDconst [0]) (CMPUconst y [64]))
   253  (Lsh(64|32|16|8)x32 <t> x y) -> (LOCGR {s390x.GreaterOrEqual} <t> (SL(D|W|W|W) <t> x y) (MOVDconst [0]) (CMPWUconst y [64]))
   254  (Lsh(64|32|16|8)x16 <t> x y) -> (LOCGR {s390x.GreaterOrEqual} <t> (SL(D|W|W|W) <t> x y) (MOVDconst [0]) (CMPWUconst (MOVHZreg y) [64]))
   255  (Lsh(64|32|16|8)x8  <t> x y) -> (LOCGR {s390x.GreaterOrEqual} <t> (SL(D|W|W|W) <t> x y) (MOVDconst [0]) (CMPWUconst (MOVBZreg y) [64]))
   256  
   257  (Rsh(64|32)Ux64 <t> x y) -> (LOCGR {s390x.GreaterOrEqual} <t> (SR(D|W) <t> x y) (MOVDconst [0]) (CMPUconst y [64]))
   258  (Rsh(64|32)Ux32 <t> x y) -> (LOCGR {s390x.GreaterOrEqual} <t> (SR(D|W) <t> x y) (MOVDconst [0]) (CMPWUconst y [64]))
   259  (Rsh(64|32)Ux16 <t> x y) -> (LOCGR {s390x.GreaterOrEqual} <t> (SR(D|W) <t> x y) (MOVDconst [0]) (CMPWUconst (MOVHZreg y) [64]))
   260  (Rsh(64|32)Ux8  <t> x y) -> (LOCGR {s390x.GreaterOrEqual} <t> (SR(D|W) <t> x y) (MOVDconst [0]) (CMPWUconst (MOVBZreg y) [64]))
   261  
   262  (Rsh(16|8)Ux64 <t> x y) -> (LOCGR {s390x.GreaterOrEqual} <t> (SRW <t> (MOV(H|B)Zreg x) y) (MOVDconst [0]) (CMPUconst y [64]))
   263  (Rsh(16|8)Ux32 <t> x y) -> (LOCGR {s390x.GreaterOrEqual} <t> (SRW <t> (MOV(H|B)Zreg x) y) (MOVDconst [0]) (CMPWUconst y [64]))
   264  (Rsh(16|8)Ux16 <t> x y) -> (LOCGR {s390x.GreaterOrEqual} <t> (SRW <t> (MOV(H|B)Zreg x) y) (MOVDconst [0]) (CMPWUconst (MOVHZreg y) [64]))
   265  (Rsh(16|8)Ux8  <t> x y) -> (LOCGR {s390x.GreaterOrEqual} <t> (SRW <t> (MOV(H|B)Zreg x) y) (MOVDconst [0]) (CMPWUconst (MOVBZreg y) [64]))
   266  
   267  // Signed right shift needs to return 0/-1 if shift amount is >= width of shifted value.
   268  // We implement this by clamping the shift amount to 63 (all ones) when it is 64 or more.
   269  //   result = arg >> (shift >= 64 ? 63 : shift)
   270  (Rsh(64|32)x64 x y) -> (SRA(D|W) x (LOCGR {s390x.GreaterOrEqual} <y.Type> y (MOVDconst <y.Type> [63]) (CMPUconst  y [64])))
   271  (Rsh(64|32)x32 x y) -> (SRA(D|W) x (LOCGR {s390x.GreaterOrEqual} <y.Type> y (MOVDconst <y.Type> [63]) (CMPWUconst y [64])))
   272  (Rsh(64|32)x16 x y) -> (SRA(D|W) x (LOCGR {s390x.GreaterOrEqual} <y.Type> y (MOVDconst <y.Type> [63]) (CMPWUconst (MOVHZreg y) [64])))
   273  (Rsh(64|32)x8  x y) -> (SRA(D|W) x (LOCGR {s390x.GreaterOrEqual} <y.Type> y (MOVDconst <y.Type> [63]) (CMPWUconst (MOVBZreg y) [64])))
   274  
   275  (Rsh(16|8)x64 x y) -> (SRAW (MOV(H|B)reg x) (LOCGR {s390x.GreaterOrEqual} <y.Type> y (MOVDconst <y.Type> [63]) (CMPUconst  y [64])))
   276  (Rsh(16|8)x32 x y) -> (SRAW (MOV(H|B)reg x) (LOCGR {s390x.GreaterOrEqual} <y.Type> y (MOVDconst <y.Type> [63]) (CMPWUconst y [64])))
   277  (Rsh(16|8)x16 x y) -> (SRAW (MOV(H|B)reg x) (LOCGR {s390x.GreaterOrEqual} <y.Type> y (MOVDconst <y.Type> [63]) (CMPWUconst (MOVHZreg y) [64])))
   278  (Rsh(16|8)x8  x y) -> (SRAW (MOV(H|B)reg x) (LOCGR {s390x.GreaterOrEqual} <y.Type> y (MOVDconst <y.Type> [63]) (CMPWUconst (MOVBZreg y) [64])))
   279  
   280  // Lowering rotates
   281  (RotateLeft8 <t> x (MOVDconst [c])) -> (Or8 (Lsh8x64 <t> x (MOVDconst [c&7])) (Rsh8Ux64 <t> x (MOVDconst [-c&7])))
   282  (RotateLeft16 <t> x (MOVDconst [c])) -> (Or16 (Lsh16x64 <t> x (MOVDconst [c&15])) (Rsh16Ux64 <t> x (MOVDconst [-c&15])))
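        // There is no native 8- or 16-bit rotate, so constant rotates are built from
        // two shifts. For example, RotateLeft8 by 3 becomes (x << 3) | (x >> 5),
        // since -3 & 7 == 5.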
   283  (RotateLeft32 x y) -> (RLL  x y)
   284  (RotateLeft64 x y) -> (RLLG x y)
   285  
   286  // Lowering comparisons
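        // In the rules below LOCGR acts as a conditional select: it yields its second
        // argument when the condition encoded in the flags result holds and its first
        // argument otherwise, so each comparison materializes a boolean as 0 or 1.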
   287  (Less64      x y) -> (LOCGR {s390x.Less} (MOVDconst [0]) (MOVDconst [1]) (CMP x y))
   288  (Less32      x y) -> (LOCGR {s390x.Less} (MOVDconst [0]) (MOVDconst [1]) (CMPW x y))
   289  (Less(16|8)  x y) -> (LOCGR {s390x.Less} (MOVDconst [0]) (MOVDconst [1]) (CMPW (MOV(H|B)reg x) (MOV(H|B)reg y)))
   290  (Less64U     x y) -> (LOCGR {s390x.Less} (MOVDconst [0]) (MOVDconst [1]) (CMPU x y))
   291  (Less32U     x y) -> (LOCGR {s390x.Less} (MOVDconst [0]) (MOVDconst [1]) (CMPWU x y))
   292  (Less(16|8)U x y) -> (LOCGR {s390x.Less} (MOVDconst [0]) (MOVDconst [1]) (CMPWU (MOV(H|B)Zreg x) (MOV(H|B)Zreg y)))
   293  (Less64F     x y) -> (LOCGR {s390x.Less} (MOVDconst [0]) (MOVDconst [1]) (FCMP x y))
   294  (Less32F     x y) -> (LOCGR {s390x.Less} (MOVDconst [0]) (MOVDconst [1]) (FCMPS x y))
   295  
   296  (Leq64      x y) -> (LOCGR {s390x.LessOrEqual} (MOVDconst [0]) (MOVDconst [1]) (CMP x y))
   297  (Leq32      x y) -> (LOCGR {s390x.LessOrEqual} (MOVDconst [0]) (MOVDconst [1]) (CMPW x y))
   298  (Leq(16|8)  x y) -> (LOCGR {s390x.LessOrEqual} (MOVDconst [0]) (MOVDconst [1]) (CMPW (MOV(H|B)reg x) (MOV(H|B)reg y)))
   299  (Leq64U     x y) -> (LOCGR {s390x.LessOrEqual} (MOVDconst [0]) (MOVDconst [1]) (CMPU x y))
   300  (Leq32U     x y) -> (LOCGR {s390x.LessOrEqual} (MOVDconst [0]) (MOVDconst [1]) (CMPWU x y))
   301  (Leq(16|8)U x y) -> (LOCGR {s390x.LessOrEqual} (MOVDconst [0]) (MOVDconst [1]) (CMPWU (MOV(H|B)Zreg x) (MOV(H|B)Zreg y)))
   302  (Leq64F     x y) -> (LOCGR {s390x.LessOrEqual} (MOVDconst [0]) (MOVDconst [1]) (FCMP x y))
   303  (Leq32F     x y) -> (LOCGR {s390x.LessOrEqual} (MOVDconst [0]) (MOVDconst [1]) (FCMPS x y))
   304  
   305  (Greater64      x y) -> (LOCGR {s390x.Greater} (MOVDconst [0]) (MOVDconst [1]) (CMP x y))
   306  (Greater32      x y) -> (LOCGR {s390x.Greater} (MOVDconst [0]) (MOVDconst [1]) (CMPW x y))
   307  (Greater(16|8)  x y) -> (LOCGR {s390x.Greater} (MOVDconst [0]) (MOVDconst [1]) (CMPW (MOV(H|B)reg x) (MOV(H|B)reg y)))
   308  (Greater64U     x y) -> (LOCGR {s390x.Greater} (MOVDconst [0]) (MOVDconst [1]) (CMPU x y))
   309  (Greater32U     x y) -> (LOCGR {s390x.Greater} (MOVDconst [0]) (MOVDconst [1]) (CMPWU x y))
   310  (Greater(16|8)U x y) -> (LOCGR {s390x.Greater} (MOVDconst [0]) (MOVDconst [1]) (CMPWU (MOV(H|B)Zreg x) (MOV(H|B)Zreg y)))
   311  (Greater64F     x y) -> (LOCGR {s390x.Greater} (MOVDconst [0]) (MOVDconst [1]) (FCMP x y))
   312  (Greater32F     x y) -> (LOCGR {s390x.Greater} (MOVDconst [0]) (MOVDconst [1]) (FCMPS x y))
   313  
   314  (Geq64      x y) -> (LOCGR {s390x.GreaterOrEqual} (MOVDconst [0]) (MOVDconst [1]) (CMP x y))
   315  (Geq32      x y) -> (LOCGR {s390x.GreaterOrEqual} (MOVDconst [0]) (MOVDconst [1]) (CMPW x y))
   316  (Geq(16|8)  x y) -> (LOCGR {s390x.GreaterOrEqual} (MOVDconst [0]) (MOVDconst [1]) (CMPW (MOV(H|B)reg x) (MOV(H|B)reg y)))
   317  (Geq64U     x y) -> (LOCGR {s390x.GreaterOrEqual} (MOVDconst [0]) (MOVDconst [1]) (CMPU x y))
   318  (Geq32U     x y) -> (LOCGR {s390x.GreaterOrEqual} (MOVDconst [0]) (MOVDconst [1]) (CMPWU x y))
   319  (Geq(16|8)U x y) -> (LOCGR {s390x.GreaterOrEqual} (MOVDconst [0]) (MOVDconst [1]) (CMPWU (MOV(H|B)Zreg x) (MOV(H|B)Zreg y)))
   320  (Geq64F     x y) -> (LOCGR {s390x.GreaterOrEqual} (MOVDconst [0]) (MOVDconst [1]) (FCMP x y))
   321  (Geq32F     x y) -> (LOCGR {s390x.GreaterOrEqual} (MOVDconst [0]) (MOVDconst [1]) (FCMPS x y))
   322  
   323  (Eq(64|Ptr) x y) -> (LOCGR {s390x.Equal} (MOVDconst [0]) (MOVDconst [1]) (CMP x y))
   324  (Eq32       x y) -> (LOCGR {s390x.Equal} (MOVDconst [0]) (MOVDconst [1]) (CMPW x y))
   325  (Eq(16|8|B) x y) -> (LOCGR {s390x.Equal} (MOVDconst [0]) (MOVDconst [1]) (CMPW (MOV(H|B|B)reg x) (MOV(H|B|B)reg y)))
   326  (Eq64F      x y) -> (LOCGR {s390x.Equal} (MOVDconst [0]) (MOVDconst [1]) (FCMP x y))
   327  (Eq32F      x y) -> (LOCGR {s390x.Equal} (MOVDconst [0]) (MOVDconst [1]) (FCMPS x y))
   328  
   329  (Neq(64|Ptr) x y) -> (LOCGR {s390x.NotEqual} (MOVDconst [0]) (MOVDconst [1]) (CMP x y))
   330  (Neq32       x y) -> (LOCGR {s390x.NotEqual} (MOVDconst [0]) (MOVDconst [1]) (CMPW x y))
   331  (Neq(16|8|B) x y) -> (LOCGR {s390x.NotEqual} (MOVDconst [0]) (MOVDconst [1]) (CMPW (MOV(H|B|B)reg x) (MOV(H|B|B)reg y)))
   332  (Neq64F      x y) -> (LOCGR {s390x.NotEqual} (MOVDconst [0]) (MOVDconst [1]) (FCMP x y))
   333  (Neq32F      x y) -> (LOCGR {s390x.NotEqual} (MOVDconst [0]) (MOVDconst [1]) (FCMPS x y))
   334  
   335  // Lowering loads
   336  (Load <t> ptr mem) && (is64BitInt(t) || isPtr(t)) -> (MOVDload ptr mem)
   337  (Load <t> ptr mem) && is32BitInt(t) && isSigned(t) -> (MOVWload ptr mem)
   338  (Load <t> ptr mem) && is32BitInt(t) && !isSigned(t) -> (MOVWZload ptr mem)
   339  (Load <t> ptr mem) && is16BitInt(t) && isSigned(t) -> (MOVHload ptr mem)
   340  (Load <t> ptr mem) && is16BitInt(t) && !isSigned(t) -> (MOVHZload ptr mem)
   341  (Load <t> ptr mem) && is8BitInt(t) && isSigned(t) -> (MOVBload ptr mem)
   342  (Load <t> ptr mem) && (t.IsBoolean() || (is8BitInt(t) && !isSigned(t))) -> (MOVBZload ptr mem)
   343  (Load <t> ptr mem) && is32BitFloat(t) -> (FMOVSload ptr mem)
   344  (Load <t> ptr mem) && is64BitFloat(t) -> (FMOVDload ptr mem)
   345  
   346  // Lowering stores
   347  // These more-specific FP versions of the Store pattern should come first.
   348  (Store {t} ptr val mem) && t.(*types.Type).Size() == 8 && is64BitFloat(val.Type) -> (FMOVDstore ptr val mem)
   349  (Store {t} ptr val mem) && t.(*types.Type).Size() == 4 && is32BitFloat(val.Type) -> (FMOVSstore ptr val mem)
   350  
   351  (Store {t} ptr val mem) && t.(*types.Type).Size() == 8 -> (MOVDstore ptr val mem)
   352  (Store {t} ptr val mem) && t.(*types.Type).Size() == 4 -> (MOVWstore ptr val mem)
   353  (Store {t} ptr val mem) && t.(*types.Type).Size() == 2 -> (MOVHstore ptr val mem)
   354  (Store {t} ptr val mem) && t.(*types.Type).Size() == 1 -> (MOVBstore ptr val mem)
   355  
   356  // Lowering moves
   357  
   358  // Load and store for small copies.
   359  (Move [0] _ _ mem) -> mem
   360  (Move [1] dst src mem) -> (MOVBstore dst (MOVBZload src mem) mem)
   361  (Move [2] dst src mem) -> (MOVHstore dst (MOVHZload src mem) mem)
   362  (Move [4] dst src mem) -> (MOVWstore dst (MOVWZload src mem) mem)
   363  (Move [8] dst src mem) -> (MOVDstore dst (MOVDload src mem) mem)
   364  (Move [16] dst src mem) ->
   365  	(MOVDstore [8] dst (MOVDload [8] src mem)
   366  		(MOVDstore dst (MOVDload src mem) mem))
   367  (Move [24] dst src mem) ->
   368          (MOVDstore [16] dst (MOVDload [16] src mem)
   369  	        (MOVDstore [8] dst (MOVDload [8] src mem)
   370                  (MOVDstore dst (MOVDload src mem) mem)))
   371  (Move [3] dst src mem) ->
   372  	(MOVBstore [2] dst (MOVBZload [2] src mem)
   373  		(MOVHstore dst (MOVHZload src mem) mem))
   374  (Move [5] dst src mem) ->
   375  	(MOVBstore [4] dst (MOVBZload [4] src mem)
   376  		(MOVWstore dst (MOVWZload src mem) mem))
   377  (Move [6] dst src mem) ->
   378  	(MOVHstore [4] dst (MOVHZload [4] src mem)
   379  		(MOVWstore dst (MOVWZload src mem) mem))
   380  (Move [7] dst src mem) ->
   381  	(MOVBstore [6] dst (MOVBZload [6] src mem)
   382  		(MOVHstore [4] dst (MOVHZload [4] src mem)
   383  			(MOVWstore dst (MOVWZload src mem) mem)))
   384  
   385  // MVC for other moves. Use up to 4 instructions (sizes up to 1024 bytes).
   386  (Move [s] dst src mem) && s > 0 && s <= 256 ->
   387  	(MVC [makeValAndOff(s, 0)] dst src mem)
   388  (Move [s] dst src mem) && s > 256 && s <= 512 ->
   389  	(MVC [makeValAndOff(s-256, 256)] dst src (MVC [makeValAndOff(256, 0)] dst src mem))
   390  (Move [s] dst src mem) && s > 512 && s <= 768 ->
   391  	(MVC [makeValAndOff(s-512, 512)] dst src (MVC [makeValAndOff(256, 256)] dst src (MVC [makeValAndOff(256, 0)] dst src mem)))
   392  (Move [s] dst src mem) && s > 768 && s <= 1024 ->
   393  	(MVC [makeValAndOff(s-768, 768)] dst src (MVC [makeValAndOff(256, 512)] dst src (MVC [makeValAndOff(256, 256)] dst src (MVC [makeValAndOff(256, 0)] dst src mem))))
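        // For example, a 700 byte move becomes three MVC instructions: 256 bytes at
        // offset 0, 256 bytes at offset 256 and the remaining 188 bytes at offset 512.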
   394  
   395  // Move more than 1024 bytes using a loop.
   396  (Move [s] dst src mem) && s > 1024 ->
   397  	(LoweredMove [s%256] dst src (ADD <src.Type> src (MOVDconst [(s/256)*256])) mem)
   398  
   399  // Lowering Zero instructions
   400  (Zero [0] _ mem) -> mem
   401  (Zero [1] destptr mem) -> (MOVBstoreconst [0] destptr mem)
   402  (Zero [2] destptr mem) -> (MOVHstoreconst [0] destptr mem)
   403  (Zero [4] destptr mem) -> (MOVWstoreconst [0] destptr mem)
   404  (Zero [8] destptr mem) -> (MOVDstoreconst [0] destptr mem)
   405  (Zero [3] destptr mem) ->
   406  	(MOVBstoreconst [makeValAndOff(0,2)] destptr
   407  		(MOVHstoreconst [0] destptr mem))
   408  (Zero [5] destptr mem) ->
   409  	(MOVBstoreconst [makeValAndOff(0,4)] destptr
   410  		(MOVWstoreconst [0] destptr mem))
   411  (Zero [6] destptr mem) ->
   412  	(MOVHstoreconst [makeValAndOff(0,4)] destptr
   413  		(MOVWstoreconst [0] destptr mem))
   414  (Zero [7] destptr mem) ->
   415  	(MOVWstoreconst [makeValAndOff(0,3)] destptr
   416  		(MOVWstoreconst [0] destptr mem))
   417  
   418  (Zero [s] destptr mem) && s > 0 && s <= 1024 ->
   419  	(CLEAR [makeValAndOff(s, 0)] destptr mem)
   420  
   421  // Move more than 1024 bytes using a loop.
   422  (Zero [s] destptr mem) && s > 1024 ->
   423  	(LoweredZero [s%256] destptr (ADDconst <destptr.Type> destptr [(s/256)*256]) mem)
   424  
   425  // Lowering constants
   426  (Const(64|32|16|8)  [val]) -> (MOVDconst [val])
   427  (Const(32|64)F [val]) -> (FMOV(S|D)const [val])
   428  (ConstNil) -> (MOVDconst [0])
   429  (ConstBool [b]) -> (MOVDconst [b])
   430  
   431  // Lowering calls
   432  (StaticCall [argwid] {target} mem) -> (CALLstatic [argwid] {target} mem)
   433  (ClosureCall [argwid] entry closure mem) -> (CALLclosure [argwid] entry closure mem)
   434  (InterCall [argwid] entry mem) -> (CALLinter [argwid] entry mem)
   435  
   436  // Miscellaneous
   437  (IsNonNil p) -> (LOCGR {s390x.NotEqual} (MOVDconst [0]) (MOVDconst [1]) (CMPconst p [0]))
   438  (IsInBounds idx len) -> (LOCGR {s390x.Less} (MOVDconst [0]) (MOVDconst [1]) (CMPU idx len))
   439  (IsSliceInBounds idx len) -> (LOCGR {s390x.LessOrEqual} (MOVDconst [0]) (MOVDconst [1]) (CMPU idx len))
   440  (NilCheck ptr mem) -> (LoweredNilCheck ptr mem)
   441  (GetG mem) -> (LoweredGetG mem)
   442  (GetClosurePtr) -> (LoweredGetClosurePtr)
   443  (GetCallerSP) -> (LoweredGetCallerSP)
   444  (GetCallerPC) -> (LoweredGetCallerPC)
   445  (Addr {sym} base) -> (MOVDaddr {sym} base)
   446  (LocalAddr {sym} base _) -> (MOVDaddr {sym} base)
   447  (ITab (Load ptr mem)) -> (MOVDload ptr mem)
   448  
   449  // block rewrites
   450  (If cond yes no) -> (CLIJ {s390x.LessOrGreater} (MOVBZreg <typ.Bool> cond) [0] yes no)
   451  
   452  // Write barrier.
   453  (WB {fn} destptr srcptr mem) -> (LoweredWB {fn} destptr srcptr mem)
   454  
   455  (PanicBounds [kind] x y mem) && boundsABI(kind) == 0 -> (LoweredPanicBoundsA [kind] x y mem)
   456  (PanicBounds [kind] x y mem) && boundsABI(kind) == 1 -> (LoweredPanicBoundsB [kind] x y mem)
   457  (PanicBounds [kind] x y mem) && boundsABI(kind) == 2 -> (LoweredPanicBoundsC [kind] x y mem)
   458  
   459  // ***************************
   460  // Above: lowering rules
   461  // Below: optimizations
   462  // ***************************
   463  // TODO: Should the optimizations be a separate pass?
   464  
   465  // Note on removing unnecessary sign/zero extensions:
   466  //
   467  // After a value is spilled it is restored using a sign- or zero-extension
   468  // to register-width as appropriate for its type. For example, a uint8 will
   469  // be restored using a MOVBZ (llgc) instruction which will zero extend the
   470  // 8-bit value to 64-bits.
   471  //
   472  // This is a hazard when folding sign- and zero-extensions since we need to
   473  // ensure not only that the value in the argument register is correctly
   474  // extended but also that it will still be correctly extended if it is
   475  // spilled and restored.
   476  //
   477  // In general this means we need type checks when the RHS of a rule is an
   478  // OpCopy (i.e. "(... x:(...) ...) -> x").
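        // For example, if x has a signed 8-bit type then (MOVBZreg x) cannot simply
        // be rewritten to x: were x spilled, it would be restored with a sign
        // extension, so the upper bits could differ from the zero-extended result
        // the original value guaranteed.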
   479  
   480  // Merge double extensions.
   481  (MOV(H|HZ)reg e:(MOV(B|BZ)reg x)) && clobberIfDead(e) -> (MOV(B|BZ)reg x)
   482  (MOV(W|WZ)reg e:(MOV(B|BZ)reg x)) && clobberIfDead(e) -> (MOV(B|BZ)reg x)
   483  (MOV(W|WZ)reg e:(MOV(H|HZ)reg x)) && clobberIfDead(e) -> (MOV(H|HZ)reg x)
   484  
   485  // Bypass redundant sign extensions.
   486  (MOV(B|BZ)reg e:(MOVBreg x)) && clobberIfDead(e) -> (MOV(B|BZ)reg x)
   487  (MOV(B|BZ)reg e:(MOVHreg x)) && clobberIfDead(e) -> (MOV(B|BZ)reg x)
   488  (MOV(B|BZ)reg e:(MOVWreg x)) && clobberIfDead(e) -> (MOV(B|BZ)reg x)
   489  (MOV(H|HZ)reg e:(MOVHreg x)) && clobberIfDead(e) -> (MOV(H|HZ)reg x)
   490  (MOV(H|HZ)reg e:(MOVWreg x)) && clobberIfDead(e) -> (MOV(H|HZ)reg x)
   491  (MOV(W|WZ)reg e:(MOVWreg x)) && clobberIfDead(e) -> (MOV(W|WZ)reg x)
   492  
   493  // Bypass redundant zero extensions.
   494  (MOV(B|BZ)reg e:(MOVBZreg x)) && clobberIfDead(e) -> (MOV(B|BZ)reg x)
   495  (MOV(B|BZ)reg e:(MOVHZreg x)) && clobberIfDead(e) -> (MOV(B|BZ)reg x)
   496  (MOV(B|BZ)reg e:(MOVWZreg x)) && clobberIfDead(e) -> (MOV(B|BZ)reg x)
   497  (MOV(H|HZ)reg e:(MOVHZreg x)) && clobberIfDead(e) -> (MOV(H|HZ)reg x)
   498  (MOV(H|HZ)reg e:(MOVWZreg x)) && clobberIfDead(e) -> (MOV(H|HZ)reg x)
   499  (MOV(W|WZ)reg e:(MOVWZreg x)) && clobberIfDead(e) -> (MOV(W|WZ)reg x)
   500  
   501  // Remove zero extensions after zero extending load.
   502  // Note: take care that if x is spilled it is restored correctly.
   503  (MOV(B|H|W)Zreg x:(MOVBZload    _   _)) && (!x.Type.IsSigned() || x.Type.Size() > 1) -> x
   504  (MOV(B|H|W)Zreg x:(MOVBZloadidx _ _ _)) && (!x.Type.IsSigned() || x.Type.Size() > 1) -> x
   505  (MOV(H|W)Zreg   x:(MOVHZload    _   _)) && (!x.Type.IsSigned() || x.Type.Size() > 2) -> x
   506  (MOV(H|W)Zreg   x:(MOVHZloadidx _ _ _)) && (!x.Type.IsSigned() || x.Type.Size() > 2) -> x
   507  (MOVWZreg       x:(MOVWZload    _   _)) && (!x.Type.IsSigned() || x.Type.Size() > 4) -> x
   508  (MOVWZreg       x:(MOVWZloadidx _ _ _)) && (!x.Type.IsSigned() || x.Type.Size() > 4) -> x
   509  
   510  // Remove sign extensions after sign extending load.
   511  // Note: take care that if x is spilled it is restored correctly.
   512  (MOV(B|H|W)reg x:(MOVBload    _   _)) && (x.Type.IsSigned() || x.Type.Size() == 8) -> x
   513  (MOV(B|H|W)reg x:(MOVBloadidx _ _ _)) && (x.Type.IsSigned() || x.Type.Size() == 8) -> x
   514  (MOV(H|W)reg   x:(MOVHload    _   _)) && (x.Type.IsSigned() || x.Type.Size() == 8) -> x
   515  (MOV(H|W)reg   x:(MOVHloadidx _ _ _)) && (x.Type.IsSigned() || x.Type.Size() == 8) -> x
   516  (MOVWreg       x:(MOVWload    _   _)) && (x.Type.IsSigned() || x.Type.Size() == 8) -> x
   517  (MOVWreg       x:(MOVWloadidx _ _ _)) && (x.Type.IsSigned() || x.Type.Size() == 8) -> x
   518  
   519  // Remove sign extensions after zero extending load.
   520  // These type checks are probably unnecessary but do them anyway just in case.
   521  (MOV(H|W)reg x:(MOVBZload    _   _)) && (!x.Type.IsSigned() || x.Type.Size() > 1) -> x
   522  (MOV(H|W)reg x:(MOVBZloadidx _ _ _)) && (!x.Type.IsSigned() || x.Type.Size() > 1) -> x
   523  (MOVWreg     x:(MOVHZload    _   _)) && (!x.Type.IsSigned() || x.Type.Size() > 2) -> x
   524  (MOVWreg     x:(MOVHZloadidx _ _ _)) && (!x.Type.IsSigned() || x.Type.Size() > 2) -> x
   525  
   526  // Fold sign and zero extensions into loads.
   527  //
   528  // Note: The combined instruction must end up in the same block
   529  // as the original load. If not, we end up making a value with
   530  // memory type live in two different blocks, which can lead to
   531  // multiple memory values alive simultaneously.
   532  //
   533  // Make sure we don't combine these ops if the load has another use.
   534  // This prevents a single load from being split into multiple loads
   535  // which then might return different values.  See test/atomicload.go.
   536  (MOV(B|H|W)Zreg <t> x:(MOV(B|H|W)load [o] {s} p mem))
   537    && x.Uses == 1
   538    && clobber(x)
   539    -> @x.Block (MOV(B|H|W)Zload <t> [o] {s} p mem)
   540  (MOV(B|H|W)reg <t> x:(MOV(B|H|W)Zload [o] {s} p mem))
   541    && x.Uses == 1
   542    && clobber(x)
   543    -> @x.Block (MOV(B|H|W)load <t> [o] {s} p mem)
   544  (MOV(B|H|W)Zreg <t> x:(MOV(B|H|W)loadidx [o] {s} p i mem))
   545    && x.Uses == 1
   546    && clobber(x)
   547    -> @x.Block (MOV(B|H|W)Zloadidx <t> [o] {s} p i mem)
   548  (MOV(B|H|W)reg <t> x:(MOV(B|H|W)Zloadidx [o] {s} p i mem))
   549    && x.Uses == 1
   550    && clobber(x)
   551    -> @x.Block (MOV(B|H|W)loadidx <t> [o] {s} p i mem)
   552  
   553  // Remove zero extensions after argument load.
   554  (MOVBZreg x:(Arg <t>)) && !t.IsSigned() && t.Size() == 1 -> x
   555  (MOVHZreg x:(Arg <t>)) && !t.IsSigned() && t.Size() <= 2 -> x
   556  (MOVWZreg x:(Arg <t>)) && !t.IsSigned() && t.Size() <= 4 -> x
   557  
   558  // Remove sign extensions after argument load.
   559  (MOVBreg x:(Arg <t>)) && t.IsSigned() && t.Size() == 1 -> x
   560  (MOVHreg x:(Arg <t>)) && t.IsSigned() && t.Size() <= 2 -> x
   561  (MOVWreg x:(Arg <t>)) && t.IsSigned() && t.Size() <= 4 -> x
   562  
   563  // Fold zero extensions into constants.
   564  (MOVBZreg (MOVDconst [c])) -> (MOVDconst [int64( uint8(c))])
   565  (MOVHZreg (MOVDconst [c])) -> (MOVDconst [int64(uint16(c))])
   566  (MOVWZreg (MOVDconst [c])) -> (MOVDconst [int64(uint32(c))])
   567  
   568  // Fold sign extensions into constants.
   569  (MOVBreg (MOVDconst [c])) -> (MOVDconst [int64( int8(c))])
   570  (MOVHreg (MOVDconst [c])) -> (MOVDconst [int64(int16(c))])
   571  (MOVWreg (MOVDconst [c])) -> (MOVDconst [int64(int32(c))])
   572  
   573  // Remove zero extension of conditional move.
   574  // Note: only for MOVBZreg for now since it is added as part of 'if' statement lowering.
   575  (MOVBZreg x:(LOCGR (MOVDconst [c]) (MOVDconst [d]) _))
   576    && int64(uint8(c)) == c
   577    && int64(uint8(d)) == d
   578    && (!x.Type.IsSigned() || x.Type.Size() > 1)
   579    -> x
   580  
   581  // Fold boolean tests into blocks.
   582  // Note: this must match If statement lowering.
   583  (CLIJ {s390x.LessOrGreater} (LOCGR {d} (MOVDconst [0]) (MOVDconst [x]) cmp) [0] yes no)
   584    && int32(x) != 0
   585    -> (BRC {d} cmp yes no)
   586  
   587  // Compare-and-branch.
   588  // Note: bit 3 (unordered) must not be set so we mask out s390x.Unordered.
   589  (BRC {c} (CMP   x y) yes no) -> (CGRJ  {c.(s390x.CCMask)&^s390x.Unordered} x y yes no)
   590  (BRC {c} (CMPW  x y) yes no) -> (CRJ   {c.(s390x.CCMask)&^s390x.Unordered} x y yes no)
   591  (BRC {c} (CMPU  x y) yes no) -> (CLGRJ {c.(s390x.CCMask)&^s390x.Unordered} x y yes no)
   592  (BRC {c} (CMPWU x y) yes no) -> (CLRJ  {c.(s390x.CCMask)&^s390x.Unordered} x y yes no)
   593  
   594  // Compare-and-branch (immediate).
   595  // Note: bit 3 (unordered) must not be set so we mask out s390x.Unordered.
   596  (BRC {c} (CMPconst   x [y]) yes no) && is8Bit(y)  -> (CGIJ  {c.(s390x.CCMask)&^s390x.Unordered} x [int64(int8(y))] yes no)
   597  (BRC {c} (CMPWconst  x [y]) yes no) && is8Bit(y)  -> (CIJ   {c.(s390x.CCMask)&^s390x.Unordered} x [int64(int8(y))] yes no)
   598  (BRC {c} (CMPUconst  x [y]) yes no) && isU8Bit(y) -> (CLGIJ {c.(s390x.CCMask)&^s390x.Unordered} x [int64(int8(y))] yes no)
   599  (BRC {c} (CMPWUconst x [y]) yes no) && isU8Bit(y) -> (CLIJ  {c.(s390x.CCMask)&^s390x.Unordered} x [int64(int8(y))] yes no)
   600  
   601  // Absorb immediate into compare-and-branch.
   602  (C(R|GR)J  {c} x (MOVDconst [y]) yes no) && is8Bit(y)  -> (C(I|GI)J  {c} x [int64(int8(y))] yes no)
   603  (CL(R|GR)J {c} x (MOVDconst [y]) yes no) && isU8Bit(y) -> (CL(I|GI)J {c} x [int64(int8(y))] yes no)
   604  (C(R|GR)J  {c} (MOVDconst [x]) y yes no) && is8Bit(x)  -> (C(I|GI)J  {c.(s390x.CCMask).ReverseComparison()} y [int64(int8(x))] yes no)
   605  (CL(R|GR)J {c} (MOVDconst [x]) y yes no) && isU8Bit(x) -> (CL(I|GI)J {c.(s390x.CCMask).ReverseComparison()} y [int64(int8(x))] yes no)
   606  
   607  // Prefer comparison with immediate to compare-and-branch.
   608  (CGRJ  {c} x (MOVDconst [y]) yes no) && !is8Bit(y)  && is32Bit(y)  -> (BRC {c} (CMPconst   x [int64(int32(y))]) yes no)
   609  (CRJ   {c} x (MOVDconst [y]) yes no) && !is8Bit(y)  && is32Bit(y)  -> (BRC {c} (CMPWconst  x [int64(int32(y))]) yes no)
   610  (CLGRJ {c} x (MOVDconst [y]) yes no) && !isU8Bit(y) && isU32Bit(y) -> (BRC {c} (CMPUconst  x [int64(int32(y))]) yes no)
   611  (CLRJ  {c} x (MOVDconst [y]) yes no) && !isU8Bit(y) && isU32Bit(y) -> (BRC {c} (CMPWUconst x [int64(int32(y))]) yes no)
   612  (CGRJ  {c} (MOVDconst [x]) y yes no) && !is8Bit(x)  && is32Bit(x)  -> (BRC {c.(s390x.CCMask).ReverseComparison()} (CMPconst   y [int64(int32(x))]) yes no)
   613  (CRJ   {c} (MOVDconst [x]) y yes no) && !is8Bit(x)  && is32Bit(x)  -> (BRC {c.(s390x.CCMask).ReverseComparison()} (CMPWconst  y [int64(int32(x))]) yes no)
   614  (CLGRJ {c} (MOVDconst [x]) y yes no) && !isU8Bit(x) && isU32Bit(x) -> (BRC {c.(s390x.CCMask).ReverseComparison()} (CMPUconst  y [int64(int32(x))]) yes no)
   615  (CLRJ  {c} (MOVDconst [x]) y yes no) && !isU8Bit(x) && isU32Bit(x) -> (BRC {c.(s390x.CCMask).ReverseComparison()} (CMPWUconst y [int64(int32(x))]) yes no)
   616  
   617  // Absorb sign/zero extensions into 32-bit compare-and-branch.
   618  (CIJ  {c} (MOV(W|WZ)reg x) [y] yes no) -> (CIJ  {c} x [y] yes no)
   619  (CLIJ {c} (MOV(W|WZ)reg x) [y] yes no) -> (CLIJ {c} x [y] yes no)
   620  
   621  // Bring out-of-range signed immediates into range by varying branch condition.
   622  (BRC {s390x.Less}           (CMPconst  x [ 128]) yes no) -> (CGIJ {s390x.LessOrEqual}    x [ 127] yes no)
   623  (BRC {s390x.Less}           (CMPWconst x [ 128]) yes no) -> (CIJ  {s390x.LessOrEqual}    x [ 127] yes no)
   624  (BRC {s390x.LessOrEqual}    (CMPconst  x [-129]) yes no) -> (CGIJ {s390x.Less}           x [-128] yes no)
   625  (BRC {s390x.LessOrEqual}    (CMPWconst x [-129]) yes no) -> (CIJ  {s390x.Less}           x [-128] yes no)
   626  (BRC {s390x.Greater}        (CMPconst  x [-129]) yes no) -> (CGIJ {s390x.GreaterOrEqual} x [-128] yes no)
   627  (BRC {s390x.Greater}        (CMPWconst x [-129]) yes no) -> (CIJ  {s390x.GreaterOrEqual} x [-128] yes no)
   628  (BRC {s390x.GreaterOrEqual} (CMPconst  x [ 128]) yes no) -> (CGIJ {s390x.Greater}        x [ 127] yes no)
   629  (BRC {s390x.GreaterOrEqual} (CMPWconst x [ 128]) yes no) -> (CIJ  {s390x.Greater}        x [ 127] yes no)
   630  
   631  // Bring out-of-range unsigned immediates into range by varying branch condition.
   632  // Note: int64(int8(255)) == -1
   633  (BRC {s390x.Less}           (CMP(WU|U)const  x [256]) yes no) -> (C(L|LG)IJ {s390x.LessOrEqual} x [-1] yes no)
   634  (BRC {s390x.GreaterOrEqual} (CMP(WU|U)const  x [256]) yes no) -> (C(L|LG)IJ {s390x.Greater}     x [-1] yes no)
   635  
   636  // Fold constants into instructions.
   637  (ADD x (MOVDconst [c])) && is32Bit(c) -> (ADDconst [c] x)
   638  (ADDW x (MOVDconst [c])) -> (ADDWconst [int64(int32(c))] x)
   639  
   640  (SUB x (MOVDconst [c])) && is32Bit(c) -> (SUBconst x [c])
   641  (SUB (MOVDconst [c]) x) && is32Bit(c) -> (NEG (SUBconst <v.Type> x [c]))
   642  (SUBW x (MOVDconst [c])) -> (SUBWconst x [int64(int32(c))])
   643  (SUBW (MOVDconst [c]) x) -> (NEGW (SUBWconst <v.Type> x [int64(int32(c))]))
   644  
   645  (MULLD x (MOVDconst [c])) && is32Bit(c) -> (MULLDconst [c] x)
   646  (MULLW x (MOVDconst [c])) -> (MULLWconst [int64(int32(c))] x)
   647  
   648  // NILF instructions leave the high 32 bits unchanged, which is
   649  // equivalent to ANDing with a mask whose leftmost 32 bits are all set.
   650  // TODO(mundaym): modify the assembler to accept 64-bit values
   651  // and use isU32Bit(^c).
   652  (AND x (MOVDconst [c])) && is32Bit(c) && c < 0 -> (ANDconst [c] x)
   653  (AND x (MOVDconst [c])) && is32Bit(c) && c >= 0 -> (MOVWZreg (ANDWconst <typ.UInt32> [int64(int32(c))] x))
   654  (ANDW x (MOVDconst [c])) -> (ANDWconst [int64(int32(c))] x)
   655  
   656  (ANDWconst [c] (ANDWconst [d] x)) -> (ANDWconst [c & d] x)
   657  (ANDconst [c] (ANDconst [d] x)) -> (ANDconst [c & d] x)
   658  
   659  (OR x (MOVDconst [c])) && isU32Bit(c) -> (ORconst [c] x)
   660  (ORW x (MOVDconst [c])) -> (ORWconst [int64(int32(c))] x)
   661  
   662  (XOR x (MOVDconst [c])) && isU32Bit(c) -> (XORconst [c] x)
   663  (XORW x (MOVDconst [c])) -> (XORWconst [int64(int32(c))] x)
   664  
   665  // Constant shifts.
   666  (S(LD|RD|RAD|LW|RW|RAW) x (MOVDconst [c]))
   667  	-> (S(LD|RD|RAD|LW|RW|RAW)const x [c&63])
   668  
   669  // Shifts only use the rightmost 6 bits of the shift value.
   670  (S(LD|RD|RAD|LW|RW|RAW) x (AND (MOVDconst [c]) y))
   671  	-> (S(LD|RD|RAD|LW|RW|RAW) x (ANDWconst <typ.UInt32> [c&63] y))
   672  (S(LD|RD|RAD|LW|RW|RAW) x (ANDWconst [c] y)) && c&63 == 63
   673  	-> (S(LD|RD|RAD|LW|RW|RAW) x y)
   674  (SLD  x (MOV(W|H|B|WZ|HZ|BZ)reg y)) -> (SLD  x y)
   675  (SRD  x (MOV(W|H|B|WZ|HZ|BZ)reg y)) -> (SRD  x y)
   676  (SRAD x (MOV(W|H|B|WZ|HZ|BZ)reg y)) -> (SRAD x y)
   677  (SLW  x (MOV(W|H|B|WZ|HZ|BZ)reg y)) -> (SLW  x y)
   678  (SRW  x (MOV(W|H|B|WZ|HZ|BZ)reg y)) -> (SRW  x y)
   679  (SRAW x (MOV(W|H|B|WZ|HZ|BZ)reg y)) -> (SRAW x y)
   680  
   681  // Constant rotate generation
   682  (RLL  x (MOVDconst [c])) -> (RLLconst  x [c&31])
   683  (RLLG x (MOVDconst [c])) -> (RLLGconst x [c&63])
   684  
   685  (ADD (SLDconst x [c]) (SRDconst x [d])) && d == 64-c -> (RLLGconst [c] x)
   686  ( OR (SLDconst x [c]) (SRDconst x [d])) && d == 64-c -> (RLLGconst [c] x)
   687  (XOR (SLDconst x [c]) (SRDconst x [d])) && d == 64-c -> (RLLGconst [c] x)
   688  
   689  (ADDW (SLWconst x [c]) (SRWconst x [d])) && d == 32-c -> (RLLconst [c] x)
   690  ( ORW (SLWconst x [c]) (SRWconst x [d])) && d == 32-c -> (RLLconst [c] x)
   691  (XORW (SLWconst x [c]) (SRWconst x [d])) && d == 32-c -> (RLLconst [c] x)
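        // For example, (x<<10) | (x>>54) on a 64-bit value becomes RLLGconst [10].
        // The ADD and XOR forms are equivalent because the two shifted halves have
        // no overlapping bits.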
   692  
   693  (CMP x (MOVDconst [c])) && is32Bit(c) -> (CMPconst x [c])
   694  (CMP (MOVDconst [c]) x) && is32Bit(c) -> (InvertFlags (CMPconst x [c]))
   695  (CMPW x (MOVDconst [c])) -> (CMPWconst x [int64(int32(c))])
   696  (CMPW (MOVDconst [c]) x) -> (InvertFlags (CMPWconst x [int64(int32(c))]))
   697  (CMPU x (MOVDconst [c])) && isU32Bit(c) -> (CMPUconst x [int64(int32(c))])
   698  (CMPU (MOVDconst [c]) x) && isU32Bit(c) -> (InvertFlags (CMPUconst x [int64(int32(c))]))
   699  (CMPWU x (MOVDconst [c])) -> (CMPWUconst x [int64(int32(c))])
   700  (CMPWU (MOVDconst [c]) x) -> (InvertFlags (CMPWUconst x [int64(int32(c))]))
   701  
   702  // Using MOV{W,H,B}Zreg instead of AND is cheaper.
   703  (AND x (MOVDconst [0xFF])) -> (MOVBZreg x)
   704  (AND x (MOVDconst [0xFFFF])) -> (MOVHZreg x)
   705  (AND x (MOVDconst [0xFFFFFFFF])) -> (MOVWZreg x)
   706  (ANDWconst [0xFF] x) -> (MOVBZreg x)
   707  (ANDWconst [0xFFFF] x) -> (MOVHZreg x)
   708  
   709  // strength reduction
   710  (MULLDconst [-1] x) -> (NEG x)
   711  (MULLDconst [0] _) -> (MOVDconst [0])
   712  (MULLDconst [1] x) -> x
   713  (MULLDconst [c] x) && isPowerOfTwo(c) -> (SLDconst [log2(c)] x)
   714  (MULLDconst [c] x) && isPowerOfTwo(c+1) && c >= 15 -> (SUB (SLDconst <v.Type> [log2(c+1)] x) x)
   715  (MULLDconst [c] x) && isPowerOfTwo(c-1) && c >= 17 -> (ADD (SLDconst <v.Type> [log2(c-1)] x) x)
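        // For example, 16*x becomes x<<4, 15*x becomes (x<<4) - x and 17*x becomes
        // (x<<4) + x.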
   716  
   717  (MULLWconst [-1] x) -> (NEGW x)
   718  (MULLWconst [0] _) -> (MOVDconst [0])
   719  (MULLWconst [1] x) -> x
   720  (MULLWconst [c] x) && isPowerOfTwo(c) -> (SLWconst [log2(c)] x)
   721  (MULLWconst [c] x) && isPowerOfTwo(c+1) && c >= 15 -> (SUBW (SLWconst <v.Type> [log2(c+1)] x) x)
   722  (MULLWconst [c] x) && isPowerOfTwo(c-1) && c >= 17 -> (ADDW (SLWconst <v.Type> [log2(c-1)] x) x)
   723  
   724  // Fold ADD into MOVDaddr. Odd offsets from SB shouldn't be folded (LARL can't handle them).
   725  (ADDconst [c] (MOVDaddr [d] {s} x:(SB))) && ((c+d)&1 == 0) && is32Bit(c+d) -> (MOVDaddr [c+d] {s} x)
   726  (ADDconst [c] (MOVDaddr [d] {s} x)) && x.Op != OpSB && is20Bit(c+d) -> (MOVDaddr [c+d] {s} x)
   727  (ADD idx (MOVDaddr [c] {s} ptr)) && ptr.Op != OpSB && idx.Op != OpSB -> (MOVDaddridx [c] {s} ptr idx)
   728  
   729  // fold ADDconst into MOVDaddrx
   730  (ADDconst [c] (MOVDaddridx [d] {s} x y)) && is20Bit(c+d) -> (MOVDaddridx [c+d] {s} x y)
   731  (MOVDaddridx [c] {s} (ADDconst [d] x) y) && is20Bit(c+d) && x.Op != OpSB -> (MOVDaddridx [c+d] {s} x y)
   732  (MOVDaddridx [c] {s} x (ADDconst [d] y)) && is20Bit(c+d) && y.Op != OpSB -> (MOVDaddridx [c+d] {s} x y)
   733  
   734  // reverse ordering of compare instruction
   735  (LOCGR {c} x y (InvertFlags cmp)) -> (LOCGR {c.(s390x.CCMask).ReverseComparison()} x y cmp)
   736  
   737  // replace load from same location as preceding store with copy
   738  (MOVDload  [off] {sym} ptr1 (MOVDstore  [off] {sym} ptr2 x _)) && isSamePtr(ptr1, ptr2) -> x
   739  (MOVWload  [off] {sym} ptr1 (MOVWstore  [off] {sym} ptr2 x _)) && isSamePtr(ptr1, ptr2) -> (MOVWreg x)
   740  (MOVHload  [off] {sym} ptr1 (MOVHstore  [off] {sym} ptr2 x _)) && isSamePtr(ptr1, ptr2) -> (MOVHreg x)
   741  (MOVBload  [off] {sym} ptr1 (MOVBstore  [off] {sym} ptr2 x _)) && isSamePtr(ptr1, ptr2) -> (MOVBreg x)
   742  (MOVWZload [off] {sym} ptr1 (MOVWstore  [off] {sym} ptr2 x _)) && isSamePtr(ptr1, ptr2) -> (MOVWZreg x)
   743  (MOVHZload [off] {sym} ptr1 (MOVHstore  [off] {sym} ptr2 x _)) && isSamePtr(ptr1, ptr2) -> (MOVHZreg x)
   744  (MOVBZload [off] {sym} ptr1 (MOVBstore  [off] {sym} ptr2 x _)) && isSamePtr(ptr1, ptr2) -> (MOVBZreg x)
   745  (MOVDload  [off] {sym} ptr1 (FMOVDstore [off] {sym} ptr2 x _)) && isSamePtr(ptr1, ptr2) -> (LGDR x)
   746  (FMOVDload [off] {sym} ptr1 (MOVDstore  [off] {sym} ptr2 x _)) && isSamePtr(ptr1, ptr2) -> (LDGR x)
   747  (FMOVDload [off] {sym} ptr1 (FMOVDstore [off] {sym} ptr2 x _)) && isSamePtr(ptr1, ptr2) -> x
   748  (FMOVSload [off] {sym} ptr1 (FMOVSstore [off] {sym} ptr2 x _)) && isSamePtr(ptr1, ptr2) -> x
   749  
   750  // prefer FPR <-> GPR moves over combined load ops
   751  (MULLDload <t> [off] {sym} x ptr1 (FMOVDstore [off] {sym} ptr2 y _)) && isSamePtr(ptr1, ptr2) -> (MULLD x (LGDR <t> y))
   752  (ADDload   <t> [off] {sym} x ptr1 (FMOVDstore [off] {sym} ptr2 y _)) && isSamePtr(ptr1, ptr2) -> (ADD   x (LGDR <t> y))
   753  (SUBload   <t> [off] {sym} x ptr1 (FMOVDstore [off] {sym} ptr2 y _)) && isSamePtr(ptr1, ptr2) -> (SUB   x (LGDR <t> y))
   754  (ORload    <t> [off] {sym} x ptr1 (FMOVDstore [off] {sym} ptr2 y _)) && isSamePtr(ptr1, ptr2) -> (OR    x (LGDR <t> y))
   755  (ANDload   <t> [off] {sym} x ptr1 (FMOVDstore [off] {sym} ptr2 y _)) && isSamePtr(ptr1, ptr2) -> (AND   x (LGDR <t> y))
   756  (XORload   <t> [off] {sym} x ptr1 (FMOVDstore [off] {sym} ptr2 y _)) && isSamePtr(ptr1, ptr2) -> (XOR   x (LGDR <t> y))
   757  
   758  // detect attempts to set/clear the sign bit
   759  // may need to be reworked when NIHH/OIHH are added
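        // LGDR/LDGR move values between floating point and general purpose registers,
        // while LPDFR and LNDFR load a floating point value with the sign bit cleared
        // or set respectively, so these patterns recognize integer manipulation of a
        // float's sign bit.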
   760  (SRDconst [1] (SLDconst [1] (LGDR <t> x))) -> (LGDR <t> (LPDFR <x.Type> x))
   761  (LDGR <t> (SRDconst [1] (SLDconst [1] x))) -> (LPDFR (LDGR <t> x))
   762  (AND (MOVDconst [^(-1<<63)]) (LGDR <t> x)) -> (LGDR <t> (LPDFR <x.Type> x))
   763  (LDGR <t> (AND (MOVDconst [^(-1<<63)]) x)) -> (LPDFR (LDGR <t> x))
   764  (OR (MOVDconst [-1<<63]) (LGDR <t> x))     -> (LGDR <t> (LNDFR <x.Type> x))
   765  (LDGR <t> (OR (MOVDconst [-1<<63]) x))     -> (LNDFR (LDGR <t> x))
   766  
   767  // detect attempts to set the sign bit with load
   768  (LDGR <t> x:(ORload <t1> [off] {sym} (MOVDconst [-1<<63]) ptr mem)) && x.Uses == 1 && clobber(x) -> @x.Block (LNDFR <t> (LDGR <t> (MOVDload <t1> [off] {sym} ptr mem)))
   769  
   770  // detect copysign
   771  (OR (SLDconst [63] (SRDconst [63] (LGDR x))) (LGDR (LPDFR <t> y))) -> (LGDR (CPSDR <t> y x))
   772  (OR (SLDconst [63] (SRDconst [63] (LGDR x))) (MOVDconst [c])) && c & -1<<63 == 0 -> (LGDR (CPSDR <x.Type> (FMOVDconst <x.Type> [c]) x))
   773  (OR (AND (MOVDconst [-1<<63]) (LGDR x)) (LGDR (LPDFR <t> y))) -> (LGDR (CPSDR <t> y x))
   774  (OR (AND (MOVDconst [-1<<63]) (LGDR x)) (MOVDconst [c])) && c & -1<<63 == 0 -> (LGDR (CPSDR <x.Type> (FMOVDconst <x.Type> [c]) x))
   775  (CPSDR y (FMOVDconst [c])) && c & -1<<63 == 0 -> (LPDFR y)
   776  (CPSDR y (FMOVDconst [c])) && c & -1<<63 != 0 -> (LNDFR y)
   777  
   778  // absorb negations into set/clear sign bit
   779  (FNEG  (LPDFR x)) -> (LNDFR x)
   780  (FNEG  (LNDFR x)) -> (LPDFR x)
   781  (FNEGS (LPDFR x)) -> (LNDFR x)
   782  (FNEGS (LNDFR x)) -> (LPDFR x)
   783  
   784  // no need to convert float32 to float64 to set/clear sign bit
   785  (LEDBR (LPDFR (LDEBR x))) -> (LPDFR x)
   786  (LEDBR (LNDFR (LDEBR x))) -> (LNDFR x)
   787  
   788  // remove unnecessary FPR <-> GPR moves
   789  (LDGR (LGDR x)) -> x
   790  (LGDR (LDGR x)) -> x
   791  
   792  // Don't extend before storing
   793  (MOVWstore [off] {sym} ptr (MOVWreg x) mem) -> (MOVWstore [off] {sym} ptr x mem)
   794  (MOVHstore [off] {sym} ptr (MOVHreg x) mem) -> (MOVHstore [off] {sym} ptr x mem)
   795  (MOVBstore [off] {sym} ptr (MOVBreg x) mem) -> (MOVBstore [off] {sym} ptr x mem)
   796  (MOVWstore [off] {sym} ptr (MOVWZreg x) mem) -> (MOVWstore [off] {sym} ptr x mem)
   797  (MOVHstore [off] {sym} ptr (MOVHZreg x) mem) -> (MOVHstore [off] {sym} ptr x mem)
   798  (MOVBstore [off] {sym} ptr (MOVBZreg x) mem) -> (MOVBstore [off] {sym} ptr x mem)
   799  
   800  // Fold constants into memory operations.
   801  // Note that this is not always a good idea because if not all the uses of
   802  // the ADDconst get eliminated, we still have to compute the ADDconst and we now
   803  // have potentially two live values (ptr and (ADDconst [off] ptr)) instead of one.
   804  // Nevertheless, let's do it!
   805  (MOVDload   [off1] {sym} (ADDconst [off2] ptr) mem) && is20Bit(off1+off2) -> (MOVDload  [off1+off2] {sym} ptr mem)
   806  (MOVWload   [off1] {sym} (ADDconst [off2] ptr) mem) && is20Bit(off1+off2) -> (MOVWload  [off1+off2] {sym} ptr mem)
   807  (MOVHload   [off1] {sym} (ADDconst [off2] ptr) mem) && is20Bit(off1+off2) -> (MOVHload  [off1+off2] {sym} ptr mem)
   808  (MOVBload   [off1] {sym} (ADDconst [off2] ptr) mem) && is20Bit(off1+off2) -> (MOVBload  [off1+off2] {sym} ptr mem)
   809  (MOVWZload  [off1] {sym} (ADDconst [off2] ptr) mem) && is20Bit(off1+off2) -> (MOVWZload [off1+off2] {sym} ptr mem)
   810  (MOVHZload  [off1] {sym} (ADDconst [off2] ptr) mem) && is20Bit(off1+off2) -> (MOVHZload [off1+off2] {sym} ptr mem)
   811  (MOVBZload  [off1] {sym} (ADDconst [off2] ptr) mem) && is20Bit(off1+off2) -> (MOVBZload [off1+off2] {sym} ptr mem)
   812  (FMOVSload  [off1] {sym} (ADDconst [off2] ptr) mem) && is20Bit(off1+off2) -> (FMOVSload [off1+off2] {sym} ptr mem)
   813  (FMOVDload  [off1] {sym} (ADDconst [off2] ptr) mem) && is20Bit(off1+off2) -> (FMOVDload [off1+off2] {sym} ptr mem)
   814  
   815  (MOVDstore  [off1] {sym} (ADDconst [off2] ptr) val mem) && is20Bit(off1+off2) -> (MOVDstore  [off1+off2] {sym} ptr val mem)
   816  (MOVWstore  [off1] {sym} (ADDconst [off2] ptr) val mem) && is20Bit(off1+off2) -> (MOVWstore  [off1+off2] {sym} ptr val mem)
   817  (MOVHstore  [off1] {sym} (ADDconst [off2] ptr) val mem) && is20Bit(off1+off2) -> (MOVHstore  [off1+off2] {sym} ptr val mem)
   818  (MOVBstore  [off1] {sym} (ADDconst [off2] ptr) val mem) && is20Bit(off1+off2) -> (MOVBstore  [off1+off2] {sym} ptr val mem)
   819  (FMOVSstore [off1] {sym} (ADDconst [off2] ptr) val mem) && is20Bit(off1+off2) -> (FMOVSstore [off1+off2] {sym} ptr val mem)
   820  (FMOVDstore [off1] {sym} (ADDconst [off2] ptr) val mem) && is20Bit(off1+off2) -> (FMOVDstore [off1+off2] {sym} ptr val mem)
   821  
   822  (ADDload   [off1] {sym} x (ADDconst [off2] ptr) mem) && ptr.Op != OpSB && is20Bit(off1+off2) -> (ADDload   [off1+off2] {sym} x ptr mem)
   823  (ADDWload  [off1] {sym} x (ADDconst [off2] ptr) mem) && ptr.Op != OpSB && is20Bit(off1+off2) -> (ADDWload  [off1+off2] {sym} x ptr mem)
   824  (MULLDload [off1] {sym} x (ADDconst [off2] ptr) mem) && ptr.Op != OpSB && is20Bit(off1+off2) -> (MULLDload [off1+off2] {sym} x ptr mem)
   825  (MULLWload [off1] {sym} x (ADDconst [off2] ptr) mem) && ptr.Op != OpSB && is20Bit(off1+off2) -> (MULLWload [off1+off2] {sym} x ptr mem)
   826  (SUBload   [off1] {sym} x (ADDconst [off2] ptr) mem) && ptr.Op != OpSB && is20Bit(off1+off2) -> (SUBload   [off1+off2] {sym} x ptr mem)
   827  (SUBWload  [off1] {sym} x (ADDconst [off2] ptr) mem) && ptr.Op != OpSB && is20Bit(off1+off2) -> (SUBWload  [off1+off2] {sym} x ptr mem)
   828  
   829  (ANDload   [off1] {sym} x (ADDconst [off2] ptr) mem) && ptr.Op != OpSB && is20Bit(off1+off2) -> (ANDload   [off1+off2] {sym} x ptr mem)
   830  (ANDWload  [off1] {sym} x (ADDconst [off2] ptr) mem) && ptr.Op != OpSB && is20Bit(off1+off2) -> (ANDWload  [off1+off2] {sym} x ptr mem)
   831  (ORload    [off1] {sym} x (ADDconst [off2] ptr) mem) && ptr.Op != OpSB && is20Bit(off1+off2) -> (ORload    [off1+off2] {sym} x ptr mem)
   832  (ORWload   [off1] {sym} x (ADDconst [off2] ptr) mem) && ptr.Op != OpSB && is20Bit(off1+off2) -> (ORWload   [off1+off2] {sym} x ptr mem)
   833  (XORload   [off1] {sym} x (ADDconst [off2] ptr) mem) && ptr.Op != OpSB && is20Bit(off1+off2) -> (XORload   [off1+off2] {sym} x ptr mem)
   834  (XORWload  [off1] {sym} x (ADDconst [off2] ptr) mem) && ptr.Op != OpSB && is20Bit(off1+off2) -> (XORWload  [off1+off2] {sym} x ptr mem)
   835  
   836  // Fold constants into stores.
   837  (MOVDstore [off] {sym} ptr (MOVDconst [c]) mem) && is16Bit(c) && isU12Bit(off) && ptr.Op != OpSB ->
   838  	(MOVDstoreconst [makeValAndOff(c,off)] {sym} ptr mem)
   839  (MOVWstore [off] {sym} ptr (MOVDconst [c]) mem) && is16Bit(c) && isU12Bit(off) && ptr.Op != OpSB ->
   840  	(MOVWstoreconst [makeValAndOff(int64(int32(c)),off)] {sym} ptr mem)
   841  (MOVHstore [off] {sym} ptr (MOVDconst [c]) mem) && isU12Bit(off) && ptr.Op != OpSB ->
   842  	(MOVHstoreconst [makeValAndOff(int64(int16(c)),off)] {sym} ptr mem)
   843  (MOVBstore [off] {sym} ptr (MOVDconst [c]) mem) && is20Bit(off) && ptr.Op != OpSB ->
   844  	(MOVBstoreconst [makeValAndOff(int64(int8(c)),off)] {sym} ptr mem)
   845  
   846  // Fold address offsets into constant stores.
   847  (MOVDstoreconst [sc] {s} (ADDconst [off] ptr) mem) && isU12Bit(ValAndOff(sc).Off()+off) ->
   848  	(MOVDstoreconst [ValAndOff(sc).add(off)] {s} ptr mem)
   849  (MOVWstoreconst [sc] {s} (ADDconst [off] ptr) mem) && isU12Bit(ValAndOff(sc).Off()+off) ->
   850  	(MOVWstoreconst [ValAndOff(sc).add(off)] {s} ptr mem)
   851  (MOVHstoreconst [sc] {s} (ADDconst [off] ptr) mem) && isU12Bit(ValAndOff(sc).Off()+off) ->
   852  	(MOVHstoreconst [ValAndOff(sc).add(off)] {s} ptr mem)
   853  (MOVBstoreconst [sc] {s} (ADDconst [off] ptr) mem) && is20Bit(ValAndOff(sc).Off()+off) ->
   854  	(MOVBstoreconst [ValAndOff(sc).add(off)] {s} ptr mem)
   855  
   856  // Merge address calculations into loads and stores.
   857  // Offsets from SB must not be merged into unaligned memory accesses because
   858  // loads/stores using PC-relative addressing directly must be aligned to the
   859  // size of the target.
   860  (MOVDload   [off1] {sym1} (MOVDaddr <t> [off2] {sym2} base) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) && (base.Op != OpSB || (t.IsPtr() && t.Elem().Alignment()%8 == 0 && (off1+off2)%8 == 0)) ->
   861  	(MOVDload  [off1+off2] {mergeSym(sym1,sym2)} base mem)
   862  (MOVWZload  [off1] {sym1} (MOVDaddr <t> [off2] {sym2} base) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) && (base.Op != OpSB || (t.IsPtr() && t.Elem().Alignment()%4 == 0 && (off1+off2)%4 == 0)) ->
   863  	(MOVWZload  [off1+off2] {mergeSym(sym1,sym2)} base mem)
   864  (MOVHZload  [off1] {sym1} (MOVDaddr <t> [off2] {sym2} base) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) && (base.Op != OpSB || (t.IsPtr() && t.Elem().Alignment()%2 == 0 && (off1+off2)%2 == 0)) ->
   865  	(MOVHZload  [off1+off2] {mergeSym(sym1,sym2)} base mem)
   866  (MOVBZload  [off1] {sym1} (MOVDaddr [off2] {sym2} base) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
   867  	(MOVBZload  [off1+off2] {mergeSym(sym1,sym2)} base mem)
   868  (FMOVSload [off1] {sym1} (MOVDaddr [off2] {sym2} base) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
   869  	(FMOVSload [off1+off2] {mergeSym(sym1,sym2)} base mem)
   870  (FMOVDload [off1] {sym1} (MOVDaddr [off2] {sym2} base) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
   871  	(FMOVDload [off1+off2] {mergeSym(sym1,sym2)} base mem)
   872  
   873  (MOVWload [off1] {sym1} (MOVDaddr <t> [off2] {sym2} base) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) && (base.Op != OpSB || (t.IsPtr() && t.Elem().Alignment()%4 == 0 && (off1+off2)%4 == 0)) ->
   874  	(MOVWload [off1+off2] {mergeSym(sym1,sym2)} base mem)
   875  (MOVHload [off1] {sym1} (MOVDaddr <t> [off2] {sym2} base) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) && (base.Op != OpSB || (t.IsPtr() && t.Elem().Alignment()%2 == 0 && (off1+off2)%2 == 0)) ->
   876  	(MOVHload [off1+off2] {mergeSym(sym1,sym2)} base mem)
   877  (MOVBload [off1] {sym1} (MOVDaddr [off2] {sym2} base) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
   878  	(MOVBload [off1+off2] {mergeSym(sym1,sym2)} base mem)
   879  
   880  (MOVDstore  [off1] {sym1} (MOVDaddr <t> [off2] {sym2} base) val mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) && (base.Op != OpSB || (t.IsPtr() && t.Elem().Alignment()%8 == 0 && (off1+off2)%8 == 0)) ->
   881  	(MOVDstore  [off1+off2] {mergeSym(sym1,sym2)} base val mem)
   882  (MOVWstore  [off1] {sym1} (MOVDaddr <t> [off2] {sym2} base) val mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) && (base.Op != OpSB || (t.IsPtr() && t.Elem().Alignment()%4 == 0 && (off1+off2)%4 == 0)) ->
   883  	(MOVWstore  [off1+off2] {mergeSym(sym1,sym2)} base val mem)
   884  (MOVHstore  [off1] {sym1} (MOVDaddr <t> [off2] {sym2} base) val mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) && (base.Op != OpSB || (t.IsPtr() && t.Elem().Alignment()%2 == 0 && (off1+off2)%2 == 0)) ->
   885  	(MOVHstore  [off1+off2] {mergeSym(sym1,sym2)} base val mem)
   886  (MOVBstore  [off1] {sym1} (MOVDaddr [off2] {sym2} base) val mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
   887  	(MOVBstore  [off1+off2] {mergeSym(sym1,sym2)} base val mem)
   888  (FMOVSstore [off1] {sym1} (MOVDaddr [off2] {sym2} base) val mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
   889  	(FMOVSstore [off1+off2] {mergeSym(sym1,sym2)} base val mem)
   890  (FMOVDstore [off1] {sym1} (MOVDaddr [off2] {sym2} base) val mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
   891  	(FMOVDstore [off1+off2] {mergeSym(sym1,sym2)} base val mem)
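
         // For illustration (a sketch, not part of the original rules): with the merges
         // above, a load from a fixed offset inside a suitably aligned global can fold
         // the symbol and offset straight into the load. Hypothetical Go code:
         //
         //     var g struct{ a, b int64 }
         //
         //     func loadB() int64 {
         //         return g.b // address of g plus offset 8 may fold into a single aligned load
         //     }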
   892  
   893  (ADDload   [o1] {s1} x (MOVDaddr [o2] {s2} ptr) mem) && ptr.Op != OpSB && is20Bit(o1+o2) && canMergeSym(s1, s2) -> (ADDload   [o1+o2] {mergeSym(s1, s2)} x ptr mem)
   894  (ADDWload  [o1] {s1} x (MOVDaddr [o2] {s2} ptr) mem) && ptr.Op != OpSB && is20Bit(o1+o2) && canMergeSym(s1, s2) -> (ADDWload  [o1+o2] {mergeSym(s1, s2)} x ptr mem)
   895  (MULLDload [o1] {s1} x (MOVDaddr [o2] {s2} ptr) mem) && ptr.Op != OpSB && is20Bit(o1+o2) && canMergeSym(s1, s2) -> (MULLDload [o1+o2] {mergeSym(s1, s2)} x ptr mem)
   896  (MULLWload [o1] {s1} x (MOVDaddr [o2] {s2} ptr) mem) && ptr.Op != OpSB && is20Bit(o1+o2) && canMergeSym(s1, s2) -> (MULLWload [o1+o2] {mergeSym(s1, s2)} x ptr mem)
   897  (SUBload   [o1] {s1} x (MOVDaddr [o2] {s2} ptr) mem) && ptr.Op != OpSB && is20Bit(o1+o2) && canMergeSym(s1, s2) -> (SUBload   [o1+o2] {mergeSym(s1, s2)} x ptr mem)
   898  (SUBWload  [o1] {s1} x (MOVDaddr [o2] {s2} ptr) mem) && ptr.Op != OpSB && is20Bit(o1+o2) && canMergeSym(s1, s2) -> (SUBWload  [o1+o2] {mergeSym(s1, s2)} x ptr mem)
   899  
   900  (ANDload   [o1] {s1} x (MOVDaddr [o2] {s2} ptr) mem) && ptr.Op != OpSB && is20Bit(o1+o2) && canMergeSym(s1, s2) -> (ANDload   [o1+o2] {mergeSym(s1, s2)} x ptr mem)
   901  (ANDWload  [o1] {s1} x (MOVDaddr [o2] {s2} ptr) mem) && ptr.Op != OpSB && is20Bit(o1+o2) && canMergeSym(s1, s2) -> (ANDWload  [o1+o2] {mergeSym(s1, s2)} x ptr mem)
   902  (ORload    [o1] {s1} x (MOVDaddr [o2] {s2} ptr) mem) && ptr.Op != OpSB && is20Bit(o1+o2) && canMergeSym(s1, s2) -> (ORload    [o1+o2] {mergeSym(s1, s2)} x ptr mem)
   903  (ORWload   [o1] {s1} x (MOVDaddr [o2] {s2} ptr) mem) && ptr.Op != OpSB && is20Bit(o1+o2) && canMergeSym(s1, s2) -> (ORWload   [o1+o2] {mergeSym(s1, s2)} x ptr mem)
   904  (XORload   [o1] {s1} x (MOVDaddr [o2] {s2} ptr) mem) && ptr.Op != OpSB && is20Bit(o1+o2) && canMergeSym(s1, s2) -> (XORload   [o1+o2] {mergeSym(s1, s2)} x ptr mem)
   905  (XORWload  [o1] {s1} x (MOVDaddr [o2] {s2} ptr) mem) && ptr.Op != OpSB && is20Bit(o1+o2) && canMergeSym(s1, s2) -> (XORWload  [o1+o2] {mergeSym(s1, s2)} x ptr mem)
   906  
   907  // Cannot store constant to SB directly (no 'move relative long immediate' instructions).
   908  (MOVDstoreconst [sc] {sym1} (MOVDaddr [off] {sym2} ptr) mem) && ptr.Op != OpSB && canMergeSym(sym1, sym2) && ValAndOff(sc).canAdd(off) ->
   909  	(MOVDstoreconst [ValAndOff(sc).add(off)] {mergeSym(sym1, sym2)} ptr mem)
   910  (MOVWstoreconst [sc] {sym1} (MOVDaddr [off] {sym2} ptr) mem) && ptr.Op != OpSB && canMergeSym(sym1, sym2) && ValAndOff(sc).canAdd(off) ->
   911  	(MOVWstoreconst [ValAndOff(sc).add(off)] {mergeSym(sym1, sym2)} ptr mem)
   912  (MOVHstoreconst [sc] {sym1} (MOVDaddr [off] {sym2} ptr) mem) && ptr.Op != OpSB && canMergeSym(sym1, sym2) && ValAndOff(sc).canAdd(off) ->
   913  	(MOVHstoreconst [ValAndOff(sc).add(off)] {mergeSym(sym1, sym2)} ptr mem)
   914  (MOVBstoreconst [sc] {sym1} (MOVDaddr [off] {sym2} ptr) mem) && ptr.Op != OpSB && canMergeSym(sym1, sym2) && ValAndOff(sc).canAdd(off) ->
   915  	(MOVBstoreconst [ValAndOff(sc).add(off)] {mergeSym(sym1, sym2)} ptr mem)
   916  
   917  // generating indexed loads and stores
   918  (MOVBZload [off1] {sym1} (MOVDaddridx [off2] {sym2} ptr idx) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
   919  	(MOVBZloadidx [off1+off2] {mergeSym(sym1,sym2)} ptr idx mem)
   920  (MOVBload [off1] {sym1} (MOVDaddridx [off2] {sym2} ptr idx) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
   921  	(MOVBloadidx [off1+off2] {mergeSym(sym1,sym2)} ptr idx mem)
   922  (MOVHZload [off1] {sym1} (MOVDaddridx [off2] {sym2} ptr idx) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
   923  	(MOVHZloadidx [off1+off2] {mergeSym(sym1,sym2)} ptr idx mem)
   924  (MOVHload [off1] {sym1} (MOVDaddridx [off2] {sym2} ptr idx) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
   925  	(MOVHloadidx [off1+off2] {mergeSym(sym1,sym2)} ptr idx mem)
   926  (MOVWZload [off1] {sym1} (MOVDaddridx [off2] {sym2} ptr idx) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
   927  	(MOVWZloadidx [off1+off2] {mergeSym(sym1,sym2)} ptr idx mem)
   928  (MOVWload [off1] {sym1} (MOVDaddridx [off2] {sym2} ptr idx) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
   929  	(MOVWloadidx [off1+off2] {mergeSym(sym1,sym2)} ptr idx mem)
   930  (MOVDload [off1] {sym1} (MOVDaddridx [off2] {sym2} ptr idx) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
   931  	(MOVDloadidx [off1+off2] {mergeSym(sym1,sym2)} ptr idx mem)
   932  (FMOVSload [off1] {sym1} (MOVDaddridx [off2] {sym2} ptr idx) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
   933  	(FMOVSloadidx [off1+off2] {mergeSym(sym1,sym2)} ptr idx mem)
   934  (FMOVDload [off1] {sym1} (MOVDaddridx [off2] {sym2} ptr idx) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
   935  	(FMOVDloadidx [off1+off2] {mergeSym(sym1,sym2)} ptr idx mem)
   936  
   937  (MOVBstore [off1] {sym1} (MOVDaddridx [off2] {sym2} ptr idx) val mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
   938  	(MOVBstoreidx [off1+off2] {mergeSym(sym1,sym2)} ptr idx val mem)
   939  (MOVHstore [off1] {sym1} (MOVDaddridx [off2] {sym2} ptr idx) val mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
   940  	(MOVHstoreidx [off1+off2] {mergeSym(sym1,sym2)} ptr idx val mem)
   941  (MOVWstore [off1] {sym1} (MOVDaddridx [off2] {sym2} ptr idx) val mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
   942  	(MOVWstoreidx [off1+off2] {mergeSym(sym1,sym2)} ptr idx val mem)
   943  (MOVDstore [off1] {sym1} (MOVDaddridx [off2] {sym2} ptr idx) val mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
   944  	(MOVDstoreidx [off1+off2] {mergeSym(sym1,sym2)} ptr idx val mem)
   945  (FMOVSstore [off1] {sym1} (MOVDaddridx [off2] {sym2} ptr idx) val mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
   946  	(FMOVSstoreidx [off1+off2] {mergeSym(sym1,sym2)} ptr idx val mem)
   947  (FMOVDstore [off1] {sym1} (MOVDaddridx [off2] {sym2} ptr idx) val mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
   948  	(FMOVDstoreidx [off1+off2] {mergeSym(sym1,sym2)} ptr idx val mem)
   949  
   950  (MOVBZload [off] {sym} (ADD ptr idx) mem) && ptr.Op != OpSB -> (MOVBZloadidx [off] {sym} ptr idx mem)
   951  (MOVBload  [off] {sym} (ADD ptr idx) mem) && ptr.Op != OpSB -> (MOVBloadidx  [off] {sym} ptr idx mem)
   952  (MOVHZload [off] {sym} (ADD ptr idx) mem) && ptr.Op != OpSB -> (MOVHZloadidx [off] {sym} ptr idx mem)
   953  (MOVHload  [off] {sym} (ADD ptr idx) mem) && ptr.Op != OpSB -> (MOVHloadidx  [off] {sym} ptr idx mem)
   954  (MOVWZload [off] {sym} (ADD ptr idx) mem) && ptr.Op != OpSB -> (MOVWZloadidx [off] {sym} ptr idx mem)
   955  (MOVWload  [off] {sym} (ADD ptr idx) mem) && ptr.Op != OpSB -> (MOVWloadidx  [off] {sym} ptr idx mem)
   956  (MOVDload  [off] {sym} (ADD ptr idx) mem) && ptr.Op != OpSB -> (MOVDloadidx  [off] {sym} ptr idx mem)
   957  (FMOVSload [off] {sym} (ADD ptr idx) mem) && ptr.Op != OpSB -> (FMOVSloadidx [off] {sym} ptr idx mem)
   958  (FMOVDload [off] {sym} (ADD ptr idx) mem) && ptr.Op != OpSB -> (FMOVDloadidx [off] {sym} ptr idx mem)
   959  
   960  (MOVBstore  [off] {sym} (ADD ptr idx) val mem) && ptr.Op != OpSB -> (MOVBstoreidx  [off] {sym} ptr idx val mem)
   961  (MOVHstore  [off] {sym} (ADD ptr idx) val mem) && ptr.Op != OpSB -> (MOVHstoreidx  [off] {sym} ptr idx val mem)
   962  (MOVWstore  [off] {sym} (ADD ptr idx) val mem) && ptr.Op != OpSB -> (MOVWstoreidx  [off] {sym} ptr idx val mem)
   963  (MOVDstore  [off] {sym} (ADD ptr idx) val mem) && ptr.Op != OpSB -> (MOVDstoreidx  [off] {sym} ptr idx val mem)
   964  (FMOVSstore [off] {sym} (ADD ptr idx) val mem) && ptr.Op != OpSB -> (FMOVSstoreidx [off] {sym} ptr idx val mem)
   965  (FMOVDstore [off] {sym} (ADD ptr idx) val mem) && ptr.Op != OpSB -> (FMOVDstoreidx [off] {sym} ptr idx val mem)
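
         // For illustration (a sketch, not part of the original rules): an address formed
         // as pointer+index can use the indexed addressing modes generated above.
         // Hypothetical Go code:
         //
         //     func elem(b []byte, i int) byte {
         //         return b[i] // base pointer plus index may lower to an indexed load
         //     }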
   966  
   967  // combine ADDconst into indexed loads and stores
   968  (MOVBZloadidx [c] {sym} (ADDconst [d] ptr) idx mem) && is20Bit(c+d) -> (MOVBZloadidx [c+d] {sym} ptr idx mem)
   969  (MOVBloadidx  [c] {sym} (ADDconst [d] ptr) idx mem) && is20Bit(c+d) -> (MOVBloadidx  [c+d] {sym} ptr idx mem)
   970  (MOVHZloadidx [c] {sym} (ADDconst [d] ptr) idx mem) && is20Bit(c+d) -> (MOVHZloadidx [c+d] {sym} ptr idx mem)
   971  (MOVHloadidx  [c] {sym} (ADDconst [d] ptr) idx mem) && is20Bit(c+d) -> (MOVHloadidx  [c+d] {sym} ptr idx mem)
   972  (MOVWZloadidx [c] {sym} (ADDconst [d] ptr) idx mem) && is20Bit(c+d) -> (MOVWZloadidx [c+d] {sym} ptr idx mem)
   973  (MOVWloadidx  [c] {sym} (ADDconst [d] ptr) idx mem) && is20Bit(c+d) -> (MOVWloadidx  [c+d] {sym} ptr idx mem)
   974  (MOVDloadidx  [c] {sym} (ADDconst [d] ptr) idx mem) && is20Bit(c+d) -> (MOVDloadidx  [c+d] {sym} ptr idx mem)
   975  (FMOVSloadidx [c] {sym} (ADDconst [d] ptr) idx mem) && is20Bit(c+d) -> (FMOVSloadidx [c+d] {sym} ptr idx mem)
   976  (FMOVDloadidx [c] {sym} (ADDconst [d] ptr) idx mem) && is20Bit(c+d) -> (FMOVDloadidx [c+d] {sym} ptr idx mem)
   977  
   978  (MOVBstoreidx  [c] {sym} (ADDconst [d] ptr) idx val mem) && is20Bit(c+d) -> (MOVBstoreidx  [c+d] {sym} ptr idx val mem)
   979  (MOVHstoreidx  [c] {sym} (ADDconst [d] ptr) idx val mem) && is20Bit(c+d) -> (MOVHstoreidx  [c+d] {sym} ptr idx val mem)
   980  (MOVWstoreidx  [c] {sym} (ADDconst [d] ptr) idx val mem) && is20Bit(c+d) -> (MOVWstoreidx  [c+d] {sym} ptr idx val mem)
   981  (MOVDstoreidx  [c] {sym} (ADDconst [d] ptr) idx val mem) && is20Bit(c+d) -> (MOVDstoreidx  [c+d] {sym} ptr idx val mem)
   982  (FMOVSstoreidx [c] {sym} (ADDconst [d] ptr) idx val mem) && is20Bit(c+d) -> (FMOVSstoreidx [c+d] {sym} ptr idx val mem)
   983  (FMOVDstoreidx [c] {sym} (ADDconst [d] ptr) idx val mem) && is20Bit(c+d) -> (FMOVDstoreidx [c+d] {sym} ptr idx val mem)
   984  
   985  (MOVBZloadidx [c] {sym} ptr (ADDconst [d] idx) mem) && is20Bit(c+d) -> (MOVBZloadidx [c+d] {sym} ptr idx mem)
   986  (MOVBloadidx  [c] {sym} ptr (ADDconst [d] idx) mem) && is20Bit(c+d) -> (MOVBloadidx  [c+d] {sym} ptr idx mem)
   987  (MOVHZloadidx [c] {sym} ptr (ADDconst [d] idx) mem) && is20Bit(c+d) -> (MOVHZloadidx [c+d] {sym} ptr idx mem)
   988  (MOVHloadidx  [c] {sym} ptr (ADDconst [d] idx) mem) && is20Bit(c+d) -> (MOVHloadidx  [c+d] {sym} ptr idx mem)
   989  (MOVWZloadidx [c] {sym} ptr (ADDconst [d] idx) mem) && is20Bit(c+d) -> (MOVWZloadidx [c+d] {sym} ptr idx mem)
   990  (MOVWloadidx  [c] {sym} ptr (ADDconst [d] idx) mem) && is20Bit(c+d) -> (MOVWloadidx  [c+d] {sym} ptr idx mem)
   991  (MOVDloadidx  [c] {sym} ptr (ADDconst [d] idx) mem) && is20Bit(c+d) -> (MOVDloadidx  [c+d] {sym} ptr idx mem)
   992  (FMOVSloadidx [c] {sym} ptr (ADDconst [d] idx) mem) && is20Bit(c+d) -> (FMOVSloadidx [c+d] {sym} ptr idx mem)
   993  (FMOVDloadidx [c] {sym} ptr (ADDconst [d] idx) mem) && is20Bit(c+d) -> (FMOVDloadidx [c+d] {sym} ptr idx mem)
   994  
   995  (MOVBstoreidx  [c] {sym} ptr (ADDconst [d] idx) val mem) && is20Bit(c+d) -> (MOVBstoreidx  [c+d] {sym} ptr idx val mem)
   996  (MOVHstoreidx  [c] {sym} ptr (ADDconst [d] idx) val mem) && is20Bit(c+d) -> (MOVHstoreidx  [c+d] {sym} ptr idx val mem)
   997  (MOVWstoreidx  [c] {sym} ptr (ADDconst [d] idx) val mem) && is20Bit(c+d) -> (MOVWstoreidx  [c+d] {sym} ptr idx val mem)
   998  (MOVDstoreidx  [c] {sym} ptr (ADDconst [d] idx) val mem) && is20Bit(c+d) -> (MOVDstoreidx  [c+d] {sym} ptr idx val mem)
   999  (FMOVSstoreidx [c] {sym} ptr (ADDconst [d] idx) val mem) && is20Bit(c+d) -> (FMOVSstoreidx [c+d] {sym} ptr idx val mem)
  1000  (FMOVDstoreidx [c] {sym} ptr (ADDconst [d] idx) val mem) && is20Bit(c+d) -> (FMOVDstoreidx [c+d] {sym} ptr idx val mem)
  1001  
  1002  // MOVDaddr into MOVDaddridx
  1003  (MOVDaddridx [off1] {sym1} (MOVDaddr [off2] {sym2} x) y) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) && x.Op != OpSB ->
  1004         (MOVDaddridx [off1+off2] {mergeSym(sym1,sym2)} x y)
  1005  (MOVDaddridx [off1] {sym1} x (MOVDaddr [off2] {sym2} y)) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) && y.Op != OpSB ->
  1006         (MOVDaddridx [off1+off2] {mergeSym(sym1,sym2)} x y)
  1007  
  1008  // Absorb InvertFlags into branches.
  1009  (BRC {c} (InvertFlags cmp) yes no) -> (BRC {c.(s390x.CCMask).ReverseComparison()} cmp yes no)
  1010  
  1011  // Constant comparisons.
  1012  (CMPconst (MOVDconst [x]) [y]) && x==y -> (FlagEQ)
  1013  (CMPconst (MOVDconst [x]) [y]) && x<y -> (FlagLT)
  1014  (CMPconst (MOVDconst [x]) [y]) && x>y -> (FlagGT)
  1015  (CMPUconst (MOVDconst [x]) [y]) && uint64(x)==uint64(y) -> (FlagEQ)
  1016  (CMPUconst (MOVDconst [x]) [y]) && uint64(x)<uint64(y) -> (FlagLT)
  1017  (CMPUconst (MOVDconst [x]) [y]) && uint64(x)>uint64(y) -> (FlagGT)
  1018  
  1019  (CMPWconst (MOVDconst [x]) [y]) && int32(x)==int32(y) -> (FlagEQ)
  1020  (CMPWconst (MOVDconst [x]) [y]) && int32(x)<int32(y) -> (FlagLT)
  1021  (CMPWconst (MOVDconst [x]) [y]) && int32(x)>int32(y) -> (FlagGT)
  1022  (CMPWUconst (MOVDconst [x]) [y]) && uint32(x)==uint32(y) -> (FlagEQ)
  1023  (CMPWUconst (MOVDconst [x]) [y]) && uint32(x)<uint32(y) -> (FlagLT)
  1024  (CMPWUconst (MOVDconst [x]) [y]) && uint32(x)>uint32(y) -> (FlagGT)
  1025  
  1026  (CMP(W|WU)const (MOVBZreg _) [c]) &&   0xff < c -> (FlagLT)
  1027  (CMP(W|WU)const (MOVHZreg _) [c]) && 0xffff < c -> (FlagLT)
  1028  
  1029  (CMPconst  (SRDconst _ [c]) [n]) && c > 0 && n < 0 -> (FlagGT)
  1030  (CMPWconst (SRWconst _ [c]) [n]) && c > 0 && n < 0 -> (FlagGT)
  1031  
  1032  (CMPUconst  (SRDconst _ [c]) [n]) && c > 0 && c < 64 && (1<<uint(64-c)) <= uint64(n) -> (FlagLT)
  1033  (CMPWUconst (SRWconst _ [c]) [n]) && c > 0 && c < 32 && (1<<uint(32-c)) <= uint32(n) -> (FlagLT)
  1034  
  1035  (CMPWconst  (ANDWconst _ [m]) [n]) && int32(m) >= 0 &&  int32(m) <  int32(n) -> (FlagLT)
  1036  (CMPWUconst (ANDWconst _ [m]) [n]) && uint32(m) < uint32(n) -> (FlagLT)
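
         // Reasoning sketch (not part of the original rules): a zero-extended byte or
         // halfword, a right-shifted value, and a masked value are all bounded above, so
         // when the comparison constant exceeds that bound the flags are known without
         // inspecting the operand. For example, (CMPWUconst (ANDWconst _ [0xff]) [256])
         // is always FlagLT, since the masked value is at most 255 < 256.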
  1037  
  1038  // Constant compare-and-branch with immediate.
  1039  (CGIJ  {c} (MOVDconst [x]) [y] yes no) && c.(s390x.CCMask)&s390x.Equal   != 0 &&  int64(x) ==  int64( int8(y)) -> (First yes no)
  1040  (CGIJ  {c} (MOVDconst [x]) [y] yes no) && c.(s390x.CCMask)&s390x.Less    != 0 &&  int64(x) <   int64( int8(y)) -> (First yes no)
  1041  (CGIJ  {c} (MOVDconst [x]) [y] yes no) && c.(s390x.CCMask)&s390x.Greater != 0 &&  int64(x) >   int64( int8(y)) -> (First yes no)
  1042  (CIJ   {c} (MOVDconst [x]) [y] yes no) && c.(s390x.CCMask)&s390x.Equal   != 0 &&  int32(x) ==  int32( int8(y)) -> (First yes no)
  1043  (CIJ   {c} (MOVDconst [x]) [y] yes no) && c.(s390x.CCMask)&s390x.Less    != 0 &&  int32(x) <   int32( int8(y)) -> (First yes no)
  1044  (CIJ   {c} (MOVDconst [x]) [y] yes no) && c.(s390x.CCMask)&s390x.Greater != 0 &&  int32(x) >   int32( int8(y)) -> (First yes no)
  1045  (CLGIJ {c} (MOVDconst [x]) [y] yes no) && c.(s390x.CCMask)&s390x.Equal   != 0 && uint64(x) == uint64(uint8(y)) -> (First yes no)
  1046  (CLGIJ {c} (MOVDconst [x]) [y] yes no) && c.(s390x.CCMask)&s390x.Less    != 0 && uint64(x) <  uint64(uint8(y)) -> (First yes no)
  1047  (CLGIJ {c} (MOVDconst [x]) [y] yes no) && c.(s390x.CCMask)&s390x.Greater != 0 && uint64(x) >  uint64(uint8(y)) -> (First yes no)
  1048  (CLIJ  {c} (MOVDconst [x]) [y] yes no) && c.(s390x.CCMask)&s390x.Equal   != 0 && uint32(x) == uint32(uint8(y)) -> (First yes no)
  1049  (CLIJ  {c} (MOVDconst [x]) [y] yes no) && c.(s390x.CCMask)&s390x.Less    != 0 && uint32(x) <  uint32(uint8(y)) -> (First yes no)
  1050  (CLIJ  {c} (MOVDconst [x]) [y] yes no) && c.(s390x.CCMask)&s390x.Greater != 0 && uint32(x) >  uint32(uint8(y)) -> (First yes no)
  1051  (CGIJ  {c} (MOVDconst [x]) [y] yes no) && c.(s390x.CCMask)&s390x.Equal   == 0 &&  int64(x) ==  int64( int8(y)) -> (First no yes)
  1052  (CGIJ  {c} (MOVDconst [x]) [y] yes no) && c.(s390x.CCMask)&s390x.Less    == 0 &&  int64(x) <   int64( int8(y)) -> (First no yes)
  1053  (CGIJ  {c} (MOVDconst [x]) [y] yes no) && c.(s390x.CCMask)&s390x.Greater == 0 &&  int64(x) >   int64( int8(y)) -> (First no yes)
  1054  (CIJ   {c} (MOVDconst [x]) [y] yes no) && c.(s390x.CCMask)&s390x.Equal   == 0 &&  int32(x) ==  int32( int8(y)) -> (First no yes)
  1055  (CIJ   {c} (MOVDconst [x]) [y] yes no) && c.(s390x.CCMask)&s390x.Less    == 0 &&  int32(x) <   int32( int8(y)) -> (First no yes)
  1056  (CIJ   {c} (MOVDconst [x]) [y] yes no) && c.(s390x.CCMask)&s390x.Greater == 0 &&  int32(x) >   int32( int8(y)) -> (First no yes)
  1057  (CLGIJ {c} (MOVDconst [x]) [y] yes no) && c.(s390x.CCMask)&s390x.Equal   == 0 && uint64(x) == uint64(uint8(y)) -> (First no yes)
  1058  (CLGIJ {c} (MOVDconst [x]) [y] yes no) && c.(s390x.CCMask)&s390x.Less    == 0 && uint64(x) <  uint64(uint8(y)) -> (First no yes)
  1059  (CLGIJ {c} (MOVDconst [x]) [y] yes no) && c.(s390x.CCMask)&s390x.Greater == 0 && uint64(x) >  uint64(uint8(y)) -> (First no yes)
  1060  (CLIJ  {c} (MOVDconst [x]) [y] yes no) && c.(s390x.CCMask)&s390x.Equal   == 0 && uint32(x) == uint32(uint8(y)) -> (First no yes)
  1061  (CLIJ  {c} (MOVDconst [x]) [y] yes no) && c.(s390x.CCMask)&s390x.Less    == 0 && uint32(x) <  uint32(uint8(y)) -> (First no yes)
  1062  (CLIJ  {c} (MOVDconst [x]) [y] yes no) && c.(s390x.CCMask)&s390x.Greater == 0 && uint32(x) >  uint32(uint8(y)) -> (First no yes)
  1063  
   1064  // Constant compare-and-branch with immediate for unsigned comparisons against zero.
  1065  (C(L|LG)IJ {s390x.GreaterOrEqual} _ [0] yes no) -> (First yes no)
  1066  (C(L|LG)IJ {s390x.Less}           _ [0] yes no) -> (First no yes)
  1067  
  1068  // Constant compare-and-branch when operands match.
  1069  (C(GR|R|LGR|LR)J {c} x y yes no) && x == y && c.(s390x.CCMask)&s390x.Equal != 0 -> (First yes no)
  1070  (C(GR|R|LGR|LR)J {c} x y yes no) && x == y && c.(s390x.CCMask)&s390x.Equal == 0 -> (First no yes)
  1071  
  1072  // Convert 64-bit comparisons to 32-bit comparisons and signed comparisons
  1073  // to unsigned comparisons.
  1074  // Helps simplify constant comparison detection.
  1075  (CM(P|PU)const (MOV(W|WZ)reg x) [c]) -> (CMP(W|WU)const x [c])
  1076  (CM(P|P|PU|PU)const x:(MOV(H|HZ|H|HZ)reg _) [c]) -> (CMP(W|W|WU|WU)const x [c])
  1077  (CM(P|P|PU|PU)const x:(MOV(B|BZ|B|BZ)reg _) [c]) -> (CMP(W|W|WU|WU)const x [c])
  1078  (CMPconst  (MOV(WZ|W)reg x:(ANDWconst [m] _)) [c]) && int32(m) >= 0 && c >= 0 -> (CMPWUconst x [c])
  1079  (CMPUconst (MOV(WZ|W)reg x:(ANDWconst [m] _)) [c]) && int32(m) >= 0           -> (CMPWUconst x [c])
  1080  (CMPconst  x:(SRDconst _ [c]) [n]) && c > 0 && n >= 0 -> (CMPUconst  x [n])
  1081  (CMPWconst x:(SRWconst _ [c]) [n]) && c > 0 && n >= 0 -> (CMPWUconst x [n])
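
         // Reasoning sketch (not part of the original rules): a value sign- or
         // zero-extended from 32 bits or less is determined by its low 32 bits, so a
         // 64-bit compare of it can become a 32-bit compare; and a value known to be
         // non-negative (e.g. after a logical right shift by c > 0, or an AND with a
         // non-negative mask) compares the same signed or unsigned against a
         // non-negative constant, so the unsigned form is preferred for folding.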
  1082  
  1083  // Absorb sign and zero extensions into 32-bit comparisons.
  1084  (CMP(W|W|WU|WU)      x (MOV(W|WZ|W|WZ)reg y))   -> (CMP(W|W|WU|WU) x y)
  1085  (CMP(W|W|WU|WU)      (MOV(W|WZ|W|WZ)reg x) y)   -> (CMP(W|W|WU|WU) x y)
  1086  (CMP(W|W|WU|WU)const (MOV(W|WZ|W|WZ)reg x) [c]) -> (CMP(W|W|WU|WU)const x [c])
  1087  
  1088  // Absorb flag constants into branches.
  1089  (BRC {c} (FlagEQ) yes no) && c.(s390x.CCMask) & s390x.Equal     != 0 -> (First yes no)
  1090  (BRC {c} (FlagLT) yes no) && c.(s390x.CCMask) & s390x.Less      != 0 -> (First yes no)
  1091  (BRC {c} (FlagGT) yes no) && c.(s390x.CCMask) & s390x.Greater   != 0 -> (First yes no)
  1092  (BRC {c} (FlagOV) yes no) && c.(s390x.CCMask) & s390x.Unordered != 0 -> (First yes no)
  1093  
  1094  (BRC {c} (FlagEQ) yes no) && c.(s390x.CCMask) & s390x.Equal     == 0 -> (First no yes)
  1095  (BRC {c} (FlagLT) yes no) && c.(s390x.CCMask) & s390x.Less      == 0 -> (First no yes)
  1096  (BRC {c} (FlagGT) yes no) && c.(s390x.CCMask) & s390x.Greater   == 0 -> (First no yes)
  1097  (BRC {c} (FlagOV) yes no) && c.(s390x.CCMask) & s390x.Unordered == 0 -> (First no yes)
  1098  
   1099  // Absorb flag constants into conditional move (LOCGR) ops.
  1100  (LOCGR {c} _ x (FlagEQ)) && c.(s390x.CCMask) & s390x.Equal     != 0 -> x
  1101  (LOCGR {c} _ x (FlagLT)) && c.(s390x.CCMask) & s390x.Less      != 0 -> x
  1102  (LOCGR {c} _ x (FlagGT)) && c.(s390x.CCMask) & s390x.Greater   != 0 -> x
  1103  (LOCGR {c} _ x (FlagOV)) && c.(s390x.CCMask) & s390x.Unordered != 0 -> x
  1104  
  1105  (LOCGR {c} x _ (FlagEQ)) && c.(s390x.CCMask) & s390x.Equal     == 0 -> x
  1106  (LOCGR {c} x _ (FlagLT)) && c.(s390x.CCMask) & s390x.Less      == 0 -> x
  1107  (LOCGR {c} x _ (FlagGT)) && c.(s390x.CCMask) & s390x.Greater   == 0 -> x
  1108  (LOCGR {c} x _ (FlagOV)) && c.(s390x.CCMask) & s390x.Unordered == 0 -> x
  1109  
  1110  // Remove redundant *const ops
  1111  (ADDconst [0] x) -> x
  1112  (ADDWconst [c] x) && int32(c)==0 -> x
  1113  (SUBconst [0] x) -> x
  1114  (SUBWconst [c] x) && int32(c) == 0 -> x
  1115  (ANDconst [0] _)                 -> (MOVDconst [0])
  1116  (ANDWconst [c] _) && int32(c)==0  -> (MOVDconst [0])
  1117  (ANDconst [-1] x)                -> x
  1118  (ANDWconst [c] x) && int32(c)==-1 -> x
  1119  (ORconst [0] x)                  -> x
  1120  (ORWconst [c] x) && int32(c)==0   -> x
  1121  (ORconst [-1] _)                 -> (MOVDconst [-1])
  1122  (ORWconst [c] _) && int32(c)==-1  -> (MOVDconst [-1])
  1123  (XORconst [0] x)                  -> x
  1124  (XORWconst [c] x) && int32(c)==0   -> x
  1125  
  1126  // Convert constant subtracts to constant adds.
  1127  (SUBconst [c] x) && c != -(1<<31) -> (ADDconst [-c] x)
  1128  (SUBWconst [c] x) -> (ADDWconst [int64(int32(-c))] x)
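
         // Reasoning sketch (not part of the original rules): SUBconst [5] x becomes
         // ADDconst [-5] x, but the guard c != -(1<<31) keeps SUBconst [-(1<<31)] x as a
         // subtract, because negating that one value would not fit in a 32-bit signed
         // immediate. The 32-bit form simply wraps, so it needs no guard.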
  1129  
  1130  // generic constant folding
  1131  // TODO: more of this
  1132  (ADDconst [c] (MOVDconst [d])) -> (MOVDconst [c+d])
  1133  (ADDWconst [c] (MOVDconst [d])) -> (MOVDconst [int64(int32(c+d))])
  1134  (ADDconst [c] (ADDconst [d] x)) && is32Bit(c+d) -> (ADDconst [c+d] x)
  1135  (ADDWconst [c] (ADDWconst [d] x)) -> (ADDWconst [int64(int32(c+d))] x)
  1136  (SUBconst (MOVDconst [d]) [c]) -> (MOVDconst [d-c])
  1137  (SUBconst (SUBconst x [d]) [c]) && is32Bit(-c-d) -> (ADDconst [-c-d] x)
  1138  (SRADconst [c] (MOVDconst [d])) -> (MOVDconst [d>>uint64(c)])
  1139  (SRAWconst [c] (MOVDconst [d])) -> (MOVDconst [int64(int32(d))>>uint64(c)])
  1140  (NEG (MOVDconst [c])) -> (MOVDconst [-c])
  1141  (NEGW (MOVDconst [c])) -> (MOVDconst [int64(int32(-c))])
  1142  (MULLDconst [c] (MOVDconst [d])) -> (MOVDconst [c*d])
  1143  (MULLWconst [c] (MOVDconst [d])) -> (MOVDconst [int64(int32(c*d))])
  1144  (AND (MOVDconst [c]) (MOVDconst [d])) -> (MOVDconst [c&d])
  1145  (ANDconst [c] (MOVDconst [d])) -> (MOVDconst [c&d])
  1146  (ANDWconst [c] (MOVDconst [d])) -> (MOVDconst [c&d])
  1147  (OR (MOVDconst [c]) (MOVDconst [d])) -> (MOVDconst [c|d])
  1148  (ORconst [c] (MOVDconst [d])) -> (MOVDconst [c|d])
  1149  (ORWconst [c] (MOVDconst [d])) -> (MOVDconst [c|d])
  1150  (XOR (MOVDconst [c]) (MOVDconst [d])) -> (MOVDconst [c^d])
  1151  (XORconst [c] (MOVDconst [d])) -> (MOVDconst [c^d])
  1152  (XORWconst [c] (MOVDconst [d])) -> (MOVDconst [c^d])
  1153  (LoweredRound32F x:(FMOVSconst)) -> x
  1154  (LoweredRound64F x:(FMOVDconst)) -> x
  1155  
  1156  // generic simplifications
  1157  // TODO: more of this
  1158  (ADD x (NEG y)) -> (SUB x y)
  1159  (ADDW x (NEGW y)) -> (SUBW x y)
  1160  (SUB x x) -> (MOVDconst [0])
  1161  (SUBW x x) -> (MOVDconst [0])
  1162  (AND x x) -> x
  1163  (ANDW x x) -> x
  1164  (OR x x) -> x
  1165  (ORW x x) -> x
  1166  (XOR x x) -> (MOVDconst [0])
  1167  (XORW x x) -> (MOVDconst [0])
  1168  (NEG (ADDconst [c] (NEG x))) && c != -(1<<31) -> (ADDconst [-c] x)
  1169  (MOVBZreg (ANDWconst [m] x)) -> (MOVWZreg (ANDWconst <typ.UInt32> [int64( uint8(m))] x))
  1170  (MOVHZreg (ANDWconst [m] x)) -> (MOVWZreg (ANDWconst <typ.UInt32> [int64(uint16(m))] x))
  1171  (MOVBreg  (ANDWconst [m] x)) &&  int8(m) >= 0 -> (MOVWZreg (ANDWconst <typ.UInt32> [int64( uint8(m))] x))
  1172  (MOVHreg  (ANDWconst [m] x)) && int16(m) >= 0 -> (MOVWZreg (ANDWconst <typ.UInt32> [int64(uint16(m))] x))
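
         // Reasoning sketch (not part of the original rules): masking first bounds the
         // value to the low 8 or 16 bits, so the narrow extension can be rewritten as a
         // 32-bit AND with the truncated mask plus a plain zero-extension; the
         // sign-extending forms only qualify when the mask's sign bit is clear, which
         // makes sign- and zero-extension agree.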
  1173  
  1174  // carry flag generation
   1175  // (only constant fold the cases where the carry out is zero)
  1176  (Select1 (ADDCconst (MOVDconst [c]) [d]))
  1177    && uint64(c+d) >= uint64(c) && c+d == 0
  1178    -> (FlagEQ)
  1179  (Select1 (ADDCconst (MOVDconst [c]) [d]))
  1180    && uint64(c+d) >= uint64(c) && c+d != 0
  1181    -> (FlagLT)
  1182  
  1183  // borrow flag generation
   1184  // (only constant fold the cases where there is no borrow)
  1185  (Select1 (SUBC (MOVDconst [c]) (MOVDconst [d])))
  1186    && uint64(d) <= uint64(c) && c-d == 0
  1187    -> (FlagGT)
  1188  (Select1 (SUBC (MOVDconst [c]) (MOVDconst [d])))
  1189    && uint64(d) <= uint64(c) && c-d != 0
  1190    -> (FlagOV)
  1191  
  1192  // add with carry
  1193  (ADDE x y (FlagEQ)) -> (ADDC x y)
  1194  (ADDE x y (FlagLT)) -> (ADDC x y)
  1195  (ADDC x (MOVDconst [c])) && is16Bit(c) -> (ADDCconst x [c])
  1196  (Select0 (ADDCconst (MOVDconst [c]) [d])) -> (MOVDconst [c+d])
  1197  
  1198  // subtract with borrow
  1199  (SUBE x y (FlagGT)) -> (SUBC x y)
  1200  (SUBE x y (FlagOV)) -> (SUBC x y)
  1201  (Select0 (SUBC (MOVDconst [c]) (MOVDconst [d]))) -> (MOVDconst [c-d])
  1202  
  1203  // collapse carry chain
  1204  (ADDE x y (Select1 (ADDCconst [-1] (Select0 (ADDE (MOVDconst [0]) (MOVDconst [0]) c)))))
  1205    -> (ADDE x y c)
  1206  
  1207  // collapse borrow chain
  1208  (SUBE x y (Select1 (SUBC (MOVDconst [0]) (NEG (Select0 (SUBE (MOVDconst [0]) (MOVDconst [0]) c))))))
  1209    -> (SUBE x y c)
  1210  
  1211  // fused multiply-add
  1212  (FADD (FMUL y z) x) -> (FMADD x y z)
  1213  (FADDS (FMULS y z) x) -> (FMADDS x y z)
  1214  (FSUB (FMUL y z) x) -> (FMSUB x y z)
  1215  (FSUBS (FMULS y z) x) -> (FMSUBS x y z)
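
         // For illustration (a sketch, not part of the original rules): a multiply whose
         // result feeds directly into an add or subtract can fuse into one operation.
         // Hypothetical Go code:
         //
         //     func fma(x, y, z float64) float64 {
         //         return x + y*z // candidate for a fused multiply-add (FMADD)
         //     }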
  1216  
  1217  // Fold memory operations into operations.
  1218  // Exclude global data (SB) because these instructions cannot handle relative addresses.
  1219  // TODO(mundaym): use LARL in the assembler to handle SB?
  1220  // TODO(mundaym): indexed versions of these?
  1221  (ADD <t> x g:(MOVDload [off] {sym} ptr mem)) && ptr.Op != OpSB && is20Bit(off) && canMergeLoadClobber(v, g, x) && clobber(g)
  1222  	-> (ADDload <t> [off] {sym} x ptr mem)
  1223  (ADD <t> g:(MOVDload [off] {sym} ptr mem) x) && ptr.Op != OpSB && is20Bit(off) && canMergeLoadClobber(v, g, x) && clobber(g)
  1224  	-> (ADDload <t> [off] {sym} x ptr mem)
  1225  (ADDW <t> x g:(MOVWload [off] {sym} ptr mem)) && ptr.Op != OpSB && is20Bit(off) && canMergeLoadClobber(v, g, x) && clobber(g)
  1226  	-> (ADDWload <t> [off] {sym} x ptr mem)
  1227  (ADDW <t> g:(MOVWload [off] {sym} ptr mem) x) && ptr.Op != OpSB && is20Bit(off) && canMergeLoadClobber(v, g, x) && clobber(g)
  1228  	-> (ADDWload <t> [off] {sym} x ptr mem)
  1229  (ADDW <t> x g:(MOVWZload [off] {sym} ptr mem)) && ptr.Op != OpSB && is20Bit(off) && canMergeLoadClobber(v, g, x) && clobber(g)
  1230  	-> (ADDWload <t> [off] {sym} x ptr mem)
  1231  (ADDW <t> g:(MOVWZload [off] {sym} ptr mem) x) && ptr.Op != OpSB && is20Bit(off) && canMergeLoadClobber(v, g, x) && clobber(g)
  1232  	-> (ADDWload <t> [off] {sym} x ptr mem)
  1233  (MULLD <t> x g:(MOVDload [off] {sym} ptr mem)) && ptr.Op != OpSB && is20Bit(off) && canMergeLoadClobber(v, g, x) && clobber(g)
  1234  	-> (MULLDload <t> [off] {sym} x ptr mem)
  1235  (MULLD <t> g:(MOVDload [off] {sym} ptr mem) x) && ptr.Op != OpSB && is20Bit(off) && canMergeLoadClobber(v, g, x) && clobber(g)
  1236  	-> (MULLDload <t> [off] {sym} x ptr mem)
  1237  (MULLW <t> x g:(MOVWload [off] {sym} ptr mem)) && ptr.Op != OpSB && is20Bit(off) && canMergeLoadClobber(v, g, x) && clobber(g)
  1238  	-> (MULLWload <t> [off] {sym} x ptr mem)
  1239  (MULLW <t> g:(MOVWload [off] {sym} ptr mem) x) && ptr.Op != OpSB && is20Bit(off) && canMergeLoadClobber(v, g, x) && clobber(g)
  1240  	-> (MULLWload <t> [off] {sym} x ptr mem)
  1241  (MULLW <t> x g:(MOVWZload [off] {sym} ptr mem)) && ptr.Op != OpSB && is20Bit(off) && canMergeLoadClobber(v, g, x) && clobber(g)
  1242  	-> (MULLWload <t> [off] {sym} x ptr mem)
  1243  (MULLW <t> g:(MOVWZload [off] {sym} ptr mem) x) && ptr.Op != OpSB && is20Bit(off) && canMergeLoadClobber(v, g, x) && clobber(g)
  1244  	-> (MULLWload <t> [off] {sym} x ptr mem)
  1245  (SUB <t> x g:(MOVDload [off] {sym} ptr mem)) && ptr.Op != OpSB && is20Bit(off) && canMergeLoadClobber(v, g, x) && clobber(g)
  1246  	-> (SUBload <t> [off] {sym} x ptr mem)
  1247  (SUBW <t> x g:(MOVWload [off] {sym} ptr mem)) && ptr.Op != OpSB && is20Bit(off) && canMergeLoadClobber(v, g, x) && clobber(g)
  1248  	-> (SUBWload <t> [off] {sym} x ptr mem)
  1249  (SUBW <t> x g:(MOVWZload [off] {sym} ptr mem)) && ptr.Op != OpSB && is20Bit(off) && canMergeLoadClobber(v, g, x) && clobber(g)
  1250  	-> (SUBWload <t> [off] {sym} x ptr mem)
  1251  (AND <t> x g:(MOVDload [off] {sym} ptr mem)) && ptr.Op != OpSB && is20Bit(off) && canMergeLoadClobber(v, g, x) && clobber(g)
  1252  	-> (ANDload <t> [off] {sym} x ptr mem)
  1253  (AND <t> g:(MOVDload [off] {sym} ptr mem) x) && ptr.Op != OpSB && is20Bit(off) && canMergeLoadClobber(v, g, x) && clobber(g)
  1254  	-> (ANDload <t> [off] {sym} x ptr mem)
  1255  (ANDW <t> x g:(MOVWload [off] {sym} ptr mem)) && ptr.Op != OpSB && is20Bit(off) && canMergeLoadClobber(v, g, x) && clobber(g)
  1256  	-> (ANDWload <t> [off] {sym} x ptr mem)
  1257  (ANDW <t> g:(MOVWload [off] {sym} ptr mem) x) && ptr.Op != OpSB && is20Bit(off) && canMergeLoadClobber(v, g, x) && clobber(g)
  1258  	-> (ANDWload <t> [off] {sym} x ptr mem)
  1259  (ANDW <t> x g:(MOVWZload [off] {sym} ptr mem)) && ptr.Op != OpSB && is20Bit(off) && canMergeLoadClobber(v, g, x) && clobber(g)
  1260  	-> (ANDWload <t> [off] {sym} x ptr mem)
  1261  (ANDW <t> g:(MOVWZload [off] {sym} ptr mem) x) && ptr.Op != OpSB && is20Bit(off) && canMergeLoadClobber(v, g, x) && clobber(g)
  1262  	-> (ANDWload <t> [off] {sym} x ptr mem)
  1263  (OR <t> x g:(MOVDload [off] {sym} ptr mem)) && ptr.Op != OpSB && is20Bit(off) && canMergeLoadClobber(v, g, x) && clobber(g)
  1264  	-> (ORload <t> [off] {sym} x ptr mem)
  1265  (OR <t> g:(MOVDload [off] {sym} ptr mem) x) && ptr.Op != OpSB && is20Bit(off) && canMergeLoadClobber(v, g, x) && clobber(g)
  1266  	-> (ORload <t> [off] {sym} x ptr mem)
  1267  (ORW <t> x g:(MOVWload [off] {sym} ptr mem)) && ptr.Op != OpSB && is20Bit(off) && canMergeLoadClobber(v, g, x) && clobber(g)
  1268  	-> (ORWload <t> [off] {sym} x ptr mem)
  1269  (ORW <t> g:(MOVWload [off] {sym} ptr mem) x) && ptr.Op != OpSB && is20Bit(off) && canMergeLoadClobber(v, g, x) && clobber(g)
  1270  	-> (ORWload <t> [off] {sym} x ptr mem)
  1271  (ORW <t> x g:(MOVWZload [off] {sym} ptr mem)) && ptr.Op != OpSB && is20Bit(off) && canMergeLoadClobber(v, g, x) && clobber(g)
  1272  	-> (ORWload <t> [off] {sym} x ptr mem)
  1273  (ORW <t> g:(MOVWZload [off] {sym} ptr mem) x) && ptr.Op != OpSB && is20Bit(off) && canMergeLoadClobber(v, g, x) && clobber(g)
  1274  	-> (ORWload <t> [off] {sym} x ptr mem)
  1275  (XOR <t> x g:(MOVDload [off] {sym} ptr mem)) && ptr.Op != OpSB && is20Bit(off) && canMergeLoadClobber(v, g, x) && clobber(g)
  1276  	-> (XORload <t> [off] {sym} x ptr mem)
  1277  (XOR <t> g:(MOVDload [off] {sym} ptr mem) x) && ptr.Op != OpSB && is20Bit(off) && canMergeLoadClobber(v, g, x) && clobber(g)
  1278  	-> (XORload <t> [off] {sym} x ptr mem)
  1279  (XORW <t> x g:(MOVWload [off] {sym} ptr mem)) && ptr.Op != OpSB && is20Bit(off) && canMergeLoadClobber(v, g, x) && clobber(g)
  1280  	-> (XORWload <t> [off] {sym} x ptr mem)
  1281  (XORW <t> g:(MOVWload [off] {sym} ptr mem) x) && ptr.Op != OpSB && is20Bit(off) && canMergeLoadClobber(v, g, x) && clobber(g)
  1282  	-> (XORWload <t> [off] {sym} x ptr mem)
  1283  (XORW <t> x g:(MOVWZload [off] {sym} ptr mem)) && ptr.Op != OpSB && is20Bit(off) && canMergeLoadClobber(v, g, x) && clobber(g)
  1284  	-> (XORWload <t> [off] {sym} x ptr mem)
  1285  (XORW <t> g:(MOVWZload [off] {sym} ptr mem) x) && ptr.Op != OpSB && is20Bit(off) && canMergeLoadClobber(v, g, x) && clobber(g)
  1286  	-> (XORWload <t> [off] {sym} x ptr mem)
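
         // For illustration (a sketch, not part of the original rules): a binary op whose
         // second operand is a load with no other uses can become a single
         // op-from-memory instruction. Hypothetical Go code:
         //
         //     func addFrom(acc int64, p *int64) int64 {
         //         return acc + *p // may lower to ADDload instead of a separate load and add
         //     }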
  1287  
  1288  // Combine constant stores into larger (unaligned) stores.
  1289  // Avoid SB because constant stores to relative offsets are
  1290  // emulated by the assembler and also can't handle unaligned offsets.
  1291  (MOVBstoreconst [c] {s} p x:(MOVBstoreconst [a] {s} p mem))
  1292    && p.Op != OpSB
  1293    && x.Uses == 1
  1294    && ValAndOff(a).Off() + 1 == ValAndOff(c).Off()
  1295    && clobber(x)
  1296    -> (MOVHstoreconst [makeValAndOff(ValAndOff(c).Val()&0xff | ValAndOff(a).Val()<<8, ValAndOff(a).Off())] {s} p mem)
  1297  (MOVHstoreconst [c] {s} p x:(MOVHstoreconst [a] {s} p mem))
  1298    && p.Op != OpSB
  1299    && x.Uses == 1
  1300    && ValAndOff(a).Off() + 2 == ValAndOff(c).Off()
  1301    && clobber(x)
  1302    -> (MOVWstore [ValAndOff(a).Off()] {s} p (MOVDconst [int64(int32(ValAndOff(c).Val()&0xffff | ValAndOff(a).Val()<<16))]) mem)
  1303  (MOVWstoreconst [c] {s} p x:(MOVWstoreconst [a] {s} p mem))
  1304    && p.Op != OpSB
  1305    && x.Uses == 1
  1306    && ValAndOff(a).Off() + 4 == ValAndOff(c).Off()
  1307    && clobber(x)
  1308    -> (MOVDstore [ValAndOff(a).Off()] {s} p (MOVDconst [ValAndOff(c).Val()&0xffffffff | ValAndOff(a).Val()<<32]) mem)
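
         // For illustration (a sketch, not part of the original rules): constant stores
         // to adjacent offsets can merge into one wider store of a combined constant,
         // since the wider stores tolerate unaligned addresses. Hypothetical Go code:
         //
         //     func markHeader(b []byte) {
         //         b[0] = 0x01 // the two adjacent byte constants may merge
         //         b[1] = 0x02 // into a single 2-byte constant store
         //     }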
  1309  
  1310  // Combine stores into larger (unaligned) stores.
   1311  // This doesn't work on global data (addressed relative to SB) because stores
   1312  // with relative addressing require that the memory operand be aligned.
  1313  (MOVBstore [i] {s} p w x:(MOVBstore [i-1] {s} p (SRDconst [8] w) mem))
  1314    && p.Op != OpSB
  1315    && x.Uses == 1
  1316    && clobber(x)
  1317    -> (MOVHstore [i-1] {s} p w mem)
  1318  (MOVBstore [i] {s} p w0:(SRDconst [j] w) x:(MOVBstore [i-1] {s} p (SRDconst [j+8] w) mem))
  1319    && p.Op != OpSB
  1320    && x.Uses == 1
  1321    && clobber(x)
  1322    -> (MOVHstore [i-1] {s} p w0 mem)
  1323  (MOVBstore [i] {s} p w x:(MOVBstore [i-1] {s} p (SRWconst [8] w) mem))
  1324    && p.Op != OpSB
  1325    && x.Uses == 1
  1326    && clobber(x)
  1327    -> (MOVHstore [i-1] {s} p w mem)
  1328  (MOVBstore [i] {s} p w0:(SRWconst [j] w) x:(MOVBstore [i-1] {s} p (SRWconst [j+8] w) mem))
  1329    && p.Op != OpSB
  1330    && x.Uses == 1
  1331    && clobber(x)
  1332    -> (MOVHstore [i-1] {s} p w0 mem)
  1333  (MOVHstore [i] {s} p w x:(MOVHstore [i-2] {s} p (SRDconst [16] w) mem))
  1334    && p.Op != OpSB
  1335    && x.Uses == 1
  1336    && clobber(x)
  1337    -> (MOVWstore [i-2] {s} p w mem)
  1338  (MOVHstore [i] {s} p w0:(SRDconst [j] w) x:(MOVHstore [i-2] {s} p (SRDconst [j+16] w) mem))
  1339    && p.Op != OpSB
  1340    && x.Uses == 1
  1341    && clobber(x)
  1342    -> (MOVWstore [i-2] {s} p w0 mem)
  1343  (MOVHstore [i] {s} p w x:(MOVHstore [i-2] {s} p (SRWconst [16] w) mem))
  1344    && p.Op != OpSB
  1345    && x.Uses == 1
  1346    && clobber(x)
  1347    -> (MOVWstore [i-2] {s} p w mem)
  1348  (MOVHstore [i] {s} p w0:(SRWconst [j] w) x:(MOVHstore [i-2] {s} p (SRWconst [j+16] w) mem))
  1349    && p.Op != OpSB
  1350    && x.Uses == 1
  1351    && clobber(x)
  1352    -> (MOVWstore [i-2] {s} p w0 mem)
  1353  (MOVWstore [i] {s} p (SRDconst [32] w) x:(MOVWstore [i-4] {s} p w mem))
  1354    && p.Op != OpSB
  1355    && x.Uses == 1
  1356    && clobber(x)
  1357    -> (MOVDstore [i-4] {s} p w mem)
  1358  (MOVWstore [i] {s} p w0:(SRDconst [j] w) x:(MOVWstore [i-4] {s} p (SRDconst [j+32] w) mem))
  1359    && p.Op != OpSB
  1360    && x.Uses == 1
  1361    && clobber(x)
  1362    -> (MOVDstore [i-4] {s} p w0 mem)
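
         // For illustration (a sketch, not part of the original rules): storing successive
         // bytes of one value, most significant byte first, matches the patterns above and
         // can merge into a single wider store. Hypothetical Go code:
         //
         //     func putBE32(b []byte, v uint32) {
         //         b[0] = byte(v >> 24)
         //         b[1] = byte(v >> 16)
         //         b[2] = byte(v >> 8)
         //         b[3] = byte(v) // the four byte stores may merge into one 4-byte store
         //     }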
  1363  
  1364  (MOVBstoreidx [i] {s} p idx w x:(MOVBstoreidx [i-1] {s} p idx (SRDconst [8] w) mem))
  1365    && x.Uses == 1
  1366    && clobber(x)
  1367    -> (MOVHstoreidx [i-1] {s} p idx w mem)
  1368  (MOVBstoreidx [i] {s} p idx w0:(SRDconst [j] w) x:(MOVBstoreidx [i-1] {s} p idx (SRDconst [j+8] w) mem))
  1369    && x.Uses == 1
  1370    && clobber(x)
  1371    -> (MOVHstoreidx [i-1] {s} p idx w0 mem)
  1372  (MOVBstoreidx [i] {s} p idx w x:(MOVBstoreidx [i-1] {s} p idx (SRWconst [8] w) mem))
  1373    && x.Uses == 1
  1374    && clobber(x)
  1375    -> (MOVHstoreidx [i-1] {s} p idx w mem)
  1376  (MOVBstoreidx [i] {s} p idx w0:(SRWconst [j] w) x:(MOVBstoreidx [i-1] {s} p idx (SRWconst [j+8] w) mem))
  1377    && x.Uses == 1
  1378    && clobber(x)
  1379    -> (MOVHstoreidx [i-1] {s} p idx w0 mem)
  1380  (MOVHstoreidx [i] {s} p idx w x:(MOVHstoreidx [i-2] {s} p idx (SRDconst [16] w) mem))
  1381    && x.Uses == 1
  1382    && clobber(x)
  1383    -> (MOVWstoreidx [i-2] {s} p idx w mem)
  1384  (MOVHstoreidx [i] {s} p idx w0:(SRDconst [j] w) x:(MOVHstoreidx [i-2] {s} p idx (SRDconst [j+16] w) mem))
  1385    && x.Uses == 1
  1386    && clobber(x)
  1387    -> (MOVWstoreidx [i-2] {s} p idx w0 mem)
  1388  (MOVHstoreidx [i] {s} p idx w x:(MOVHstoreidx [i-2] {s} p idx (SRWconst [16] w) mem))
  1389    && x.Uses == 1
  1390    && clobber(x)
  1391    -> (MOVWstoreidx [i-2] {s} p idx w mem)
  1392  (MOVHstoreidx [i] {s} p idx w0:(SRWconst [j] w) x:(MOVHstoreidx [i-2] {s} p idx (SRWconst [j+16] w) mem))
  1393    && x.Uses == 1
  1394    && clobber(x)
  1395    -> (MOVWstoreidx [i-2] {s} p idx w0 mem)
  1396  (MOVWstoreidx [i] {s} p idx w x:(MOVWstoreidx [i-4] {s} p idx (SRDconst [32] w) mem))
  1397    && x.Uses == 1
  1398    && clobber(x)
  1399    -> (MOVDstoreidx [i-4] {s} p idx w mem)
  1400  (MOVWstoreidx [i] {s} p idx w0:(SRDconst [j] w) x:(MOVWstoreidx [i-4] {s} p idx (SRDconst [j+32] w) mem))
  1401    && x.Uses == 1
  1402    && clobber(x)
  1403    -> (MOVDstoreidx [i-4] {s} p idx w0 mem)
  1404  
  1405  // Combine stores into larger (unaligned) stores with the bytes reversed (little endian).
  1406  // Store-with-bytes-reversed instructions do not support relative memory addresses,
  1407  // so these stores can't operate on global data (SB).
  1408  (MOVBstore [i] {s} p (SRDconst [8] w) x:(MOVBstore [i-1] {s} p w mem))
  1409    && p.Op != OpSB
  1410    && x.Uses == 1
  1411    && clobber(x)
  1412    -> (MOVHBRstore [i-1] {s} p w mem)
  1413  (MOVBstore [i] {s} p (SRDconst [j] w) x:(MOVBstore [i-1] {s} p w0:(SRDconst [j-8] w) mem))
  1414    && p.Op != OpSB
  1415    && x.Uses == 1
  1416    && clobber(x)
  1417    -> (MOVHBRstore [i-1] {s} p w0 mem)
  1418  (MOVBstore [i] {s} p (SRWconst [8] w) x:(MOVBstore [i-1] {s} p w mem))
  1419    && p.Op != OpSB
  1420    && x.Uses == 1
  1421    && clobber(x)
  1422    -> (MOVHBRstore [i-1] {s} p w mem)
  1423  (MOVBstore [i] {s} p (SRWconst [j] w) x:(MOVBstore [i-1] {s} p w0:(SRWconst [j-8] w) mem))
  1424    && p.Op != OpSB
  1425    && x.Uses == 1
  1426    && clobber(x)
  1427    -> (MOVHBRstore [i-1] {s} p w0 mem)
  1428  (MOVHBRstore [i] {s} p (SRDconst [16] w) x:(MOVHBRstore [i-2] {s} p w mem))
  1429    && x.Uses == 1
  1430    && clobber(x)
  1431    -> (MOVWBRstore [i-2] {s} p w mem)
  1432  (MOVHBRstore [i] {s} p (SRDconst [j] w) x:(MOVHBRstore [i-2] {s} p w0:(SRDconst [j-16] w) mem))
  1433    && x.Uses == 1
  1434    && clobber(x)
  1435    -> (MOVWBRstore [i-2] {s} p w0 mem)
  1436  (MOVHBRstore [i] {s} p (SRWconst [16] w) x:(MOVHBRstore [i-2] {s} p w mem))
  1437    && x.Uses == 1
  1438    && clobber(x)
  1439    -> (MOVWBRstore [i-2] {s} p w mem)
  1440  (MOVHBRstore [i] {s} p (SRWconst [j] w) x:(MOVHBRstore [i-2] {s} p w0:(SRWconst [j-16] w) mem))
  1441    && x.Uses == 1
  1442    && clobber(x)
  1443    -> (MOVWBRstore [i-2] {s} p w0 mem)
  1444  (MOVWBRstore [i] {s} p (SRDconst [32] w) x:(MOVWBRstore [i-4] {s} p w mem))
  1445    && x.Uses == 1
  1446    && clobber(x)
  1447    -> (MOVDBRstore [i-4] {s} p w mem)
  1448  (MOVWBRstore [i] {s} p (SRDconst [j] w) x:(MOVWBRstore [i-4] {s} p w0:(SRDconst [j-32] w) mem))
  1449    && x.Uses == 1
  1450    && clobber(x)
  1451    -> (MOVDBRstore [i-4] {s} p w0 mem)
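
         // For illustration (a sketch, not part of the original rules): when the bytes are
         // written least significant first, the merged store must byte-swap, so the BR
         // (byte-reversed) store forms are used. Hypothetical Go code:
         //
         //     func putLE32(b []byte, v uint32) {
         //         b[0] = byte(v)
         //         b[1] = byte(v >> 8)
         //         b[2] = byte(v >> 16)
         //         b[3] = byte(v >> 24) // may merge into one byte-reversed 4-byte store
         //     }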
  1452  
  1453  (MOVBstoreidx [i] {s} p idx (SRDconst [8] w) x:(MOVBstoreidx [i-1] {s} p idx w mem))
  1454    && x.Uses == 1
  1455    && clobber(x)
  1456    -> (MOVHBRstoreidx [i-1] {s} p idx w mem)
  1457  (MOVBstoreidx [i] {s} p idx (SRDconst [j] w) x:(MOVBstoreidx [i-1] {s} p idx w0:(SRDconst [j-8] w) mem))
  1458    && x.Uses == 1
  1459    && clobber(x)
  1460    -> (MOVHBRstoreidx [i-1] {s} p idx w0 mem)
  1461  (MOVBstoreidx [i] {s} p idx (SRWconst [8] w) x:(MOVBstoreidx [i-1] {s} p idx w mem))
  1462    && x.Uses == 1
  1463    && clobber(x)
  1464    -> (MOVHBRstoreidx [i-1] {s} p idx w mem)
  1465  (MOVBstoreidx [i] {s} p idx (SRWconst [j] w) x:(MOVBstoreidx [i-1] {s} p idx w0:(SRWconst [j-8] w) mem))
  1466    && x.Uses == 1
  1467    && clobber(x)
  1468    -> (MOVHBRstoreidx [i-1] {s} p idx w0 mem)
  1469  (MOVHBRstoreidx [i] {s} p idx (SRDconst [16] w) x:(MOVHBRstoreidx [i-2] {s} p idx w mem))
  1470    && x.Uses == 1
  1471    && clobber(x)
  1472    -> (MOVWBRstoreidx [i-2] {s} p idx w mem)
  1473  (MOVHBRstoreidx [i] {s} p idx (SRDconst [j] w) x:(MOVHBRstoreidx [i-2] {s} p idx w0:(SRDconst [j-16] w) mem))
  1474    && x.Uses == 1
  1475    && clobber(x)
  1476    -> (MOVWBRstoreidx [i-2] {s} p idx w0 mem)
  1477  (MOVHBRstoreidx [i] {s} p idx (SRWconst [16] w) x:(MOVHBRstoreidx [i-2] {s} p idx w mem))
  1478    && x.Uses == 1
  1479    && clobber(x)
  1480    -> (MOVWBRstoreidx [i-2] {s} p idx w mem)
  1481  (MOVHBRstoreidx [i] {s} p idx (SRWconst [j] w) x:(MOVHBRstoreidx [i-2] {s} p idx w0:(SRWconst [j-16] w) mem))
  1482    && x.Uses == 1
  1483    && clobber(x)
  1484    -> (MOVWBRstoreidx [i-2] {s} p idx w0 mem)
  1485  (MOVWBRstoreidx [i] {s} p idx (SRDconst [32] w) x:(MOVWBRstoreidx [i-4] {s} p idx w mem))
  1486    && x.Uses == 1
  1487    && clobber(x)
  1488    -> (MOVDBRstoreidx [i-4] {s} p idx w mem)
  1489  (MOVWBRstoreidx [i] {s} p idx (SRDconst [j] w) x:(MOVWBRstoreidx [i-4] {s} p idx w0:(SRDconst [j-32] w) mem))
  1490    && x.Uses == 1
  1491    && clobber(x)
  1492    -> (MOVDBRstoreidx [i-4] {s} p idx w0 mem)
  1493  
  1494  // Combining byte loads into larger (unaligned) loads.
  1495  
  1496  // Big-endian loads
  1497  
  1498  (ORW                 x1:(MOVBZload [i1] {s} p mem)
  1499      sh:(SLWconst [8] x0:(MOVBZload [i0] {s} p mem)))
  1500    && i1 == i0+1
  1501    && p.Op != OpSB
  1502    && x0.Uses == 1
  1503    && x1.Uses == 1
  1504    && sh.Uses == 1
  1505    && mergePoint(b,x0,x1) != nil
  1506    && clobber(x0)
  1507    && clobber(x1)
  1508    && clobber(sh)
  1509    -> @mergePoint(b,x0,x1) (MOVHZload [i0] {s} p mem)
  1510  
  1511  (OR                  x1:(MOVBZload [i1] {s} p mem)
  1512      sh:(SLDconst [8] x0:(MOVBZload [i0] {s} p mem)))
  1513    && i1 == i0+1
  1514    && p.Op != OpSB
  1515    && x0.Uses == 1
  1516    && x1.Uses == 1
  1517    && sh.Uses == 1
  1518    && mergePoint(b,x0,x1) != nil
  1519    && clobber(x0)
  1520    && clobber(x1)
  1521    && clobber(sh)
  1522    -> @mergePoint(b,x0,x1) (MOVHZload [i0] {s} p mem)
  1523  
  1524  (ORW                  x1:(MOVHZload [i1] {s} p mem)
  1525      sh:(SLWconst [16] x0:(MOVHZload [i0] {s} p mem)))
  1526    && i1 == i0+2
  1527    && p.Op != OpSB
  1528    && x0.Uses == 1
  1529    && x1.Uses == 1
  1530    && sh.Uses == 1
  1531    && mergePoint(b,x0,x1) != nil
  1532    && clobber(x0)
  1533    && clobber(x1)
  1534    && clobber(sh)
  1535    -> @mergePoint(b,x0,x1) (MOVWZload [i0] {s} p mem)
  1536  
  1537  (OR                   x1:(MOVHZload [i1] {s} p mem)
  1538      sh:(SLDconst [16] x0:(MOVHZload [i0] {s} p mem)))
  1539    && i1 == i0+2
  1540    && p.Op != OpSB
  1541    && x0.Uses == 1
  1542    && x1.Uses == 1
  1543    && sh.Uses == 1
  1544    && mergePoint(b,x0,x1) != nil
  1545    && clobber(x0)
  1546    && clobber(x1)
  1547    && clobber(sh)
  1548    -> @mergePoint(b,x0,x1) (MOVWZload [i0] {s} p mem)
  1549  
  1550  (OR                   x1:(MOVWZload [i1] {s} p mem)
  1551      sh:(SLDconst [32] x0:(MOVWZload [i0] {s} p mem)))
  1552    && i1 == i0+4
  1553    && p.Op != OpSB
  1554    && x0.Uses == 1
  1555    && x1.Uses == 1
  1556    && sh.Uses == 1
  1557    && mergePoint(b,x0,x1) != nil
  1558    && clobber(x0)
  1559    && clobber(x1)
  1560    && clobber(sh)
  1561    -> @mergePoint(b,x0,x1) (MOVDload [i0] {s} p mem)
  1562  
  1563  (ORW
  1564      s0:(SLWconst [j0] x0:(MOVBZload [i0] {s} p mem))
  1565      or:(ORW
  1566          s1:(SLWconst [j1] x1:(MOVBZload [i1] {s} p mem))
  1567  	y))
  1568    && i1 == i0+1
  1569    && j1 == j0-8
  1570    && j1 % 16 == 0
  1571    && x0.Uses == 1
  1572    && x1.Uses == 1
  1573    && s0.Uses == 1
  1574    && s1.Uses == 1
  1575    && or.Uses == 1
  1576    && mergePoint(b,x0,x1,y) != nil
  1577    && clobber(x0)
  1578    && clobber(x1)
  1579    && clobber(s0)
  1580    && clobber(s1)
  1581    && clobber(or)
  1582    -> @mergePoint(b,x0,x1,y) (ORW <v.Type> (SLWconst <v.Type> [j1] (MOVHZload [i0] {s} p mem)) y)
  1583  
  1584  (OR
  1585      s0:(SLDconst [j0] x0:(MOVBZload [i0] {s} p mem))
  1586      or:(OR
  1587          s1:(SLDconst [j1] x1:(MOVBZload [i1] {s} p mem))
  1588  	y))
  1589    && i1 == i0+1
  1590    && j1 == j0-8
  1591    && j1 % 16 == 0
  1592    && x0.Uses == 1
  1593    && x1.Uses == 1
  1594    && s0.Uses == 1
  1595    && s1.Uses == 1
  1596    && or.Uses == 1
  1597    && mergePoint(b,x0,x1,y) != nil
  1598    && clobber(x0)
  1599    && clobber(x1)
  1600    && clobber(s0)
  1601    && clobber(s1)
  1602    && clobber(or)
  1603    -> @mergePoint(b,x0,x1,y) (OR <v.Type> (SLDconst <v.Type> [j1] (MOVHZload [i0] {s} p mem)) y)
  1604  
  1605  (OR
  1606      s0:(SLDconst [j0] x0:(MOVHZload [i0] {s} p mem))
  1607      or:(OR
  1608          s1:(SLDconst [j1] x1:(MOVHZload [i1] {s} p mem))
  1609  	y))
  1610    && i1 == i0+2
  1611    && j1 == j0-16
  1612    && j1 % 32 == 0
  1613    && x0.Uses == 1
  1614    && x1.Uses == 1
  1615    && s0.Uses == 1
  1616    && s1.Uses == 1
  1617    && or.Uses == 1
  1618    && mergePoint(b,x0,x1,y) != nil
  1619    && clobber(x0)
  1620    && clobber(x1)
  1621    && clobber(s0)
  1622    && clobber(s1)
  1623    && clobber(or)
  1624    -> @mergePoint(b,x0,x1,y) (OR <v.Type> (SLDconst <v.Type> [j1] (MOVWZload [i0] {s} p mem)) y)
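
         // For illustration (a sketch, not part of the original rules): the OR-of-shifted-
         // loads patterns above reassemble a big-endian value, so the whole chain can
         // collapse into one wider load. Hypothetical Go code:
         //
         //     func getBE32(b []byte) uint32 {
         //         // may collapse into a single 4-byte load (MOVWZload)
         //         return uint32(b[3]) | uint32(b[2])<<8 | uint32(b[1])<<16 | uint32(b[0])<<24
         //     }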
  1625  
  1626  // Big-endian indexed loads
  1627  
  1628  (ORW                 x1:(MOVBZloadidx [i1] {s} p idx mem)
  1629      sh:(SLWconst [8] x0:(MOVBZloadidx [i0] {s} p idx mem)))
  1630    && i1 == i0+1
  1631    && p.Op != OpSB
  1632    && x0.Uses == 1
  1633    && x1.Uses == 1
  1634    && sh.Uses == 1
  1635    && mergePoint(b,x0,x1) != nil
  1636    && clobber(x0)
  1637    && clobber(x1)
  1638    && clobber(sh)
  1639    -> @mergePoint(b,x0,x1) (MOVHZloadidx [i0] {s} p idx mem)
  1640  
  1641  (OR                  x1:(MOVBZloadidx [i1] {s} p idx mem)
  1642      sh:(SLDconst [8] x0:(MOVBZloadidx [i0] {s} p idx mem)))
  1643    && i1 == i0+1
  1644    && p.Op != OpSB
  1645    && x0.Uses == 1
  1646    && x1.Uses == 1
  1647    && sh.Uses == 1
  1648    && mergePoint(b,x0,x1) != nil
  1649    && clobber(x0)
  1650    && clobber(x1)
  1651    && clobber(sh)
  1652    -> @mergePoint(b,x0,x1) (MOVHZloadidx [i0] {s} p idx mem)
  1653  
  1654  (ORW                  x1:(MOVHZloadidx [i1] {s} p idx mem)
  1655      sh:(SLWconst [16] x0:(MOVHZloadidx [i0] {s} p idx mem)))
  1656    && i1 == i0+2
  1657    && p.Op != OpSB
  1658    && x0.Uses == 1
  1659    && x1.Uses == 1
  1660    && sh.Uses == 1
  1661    && mergePoint(b,x0,x1) != nil
  1662    && clobber(x0)
  1663    && clobber(x1)
  1664    && clobber(sh)
  1665    -> @mergePoint(b,x0,x1) (MOVWZloadidx [i0] {s} p idx mem)
  1666  
  1667  (OR                   x1:(MOVHZloadidx [i1] {s} p idx mem)
  1668      sh:(SLDconst [16] x0:(MOVHZloadidx [i0] {s} p idx mem)))
  1669    && i1 == i0+2
  1670    && p.Op != OpSB
  1671    && x0.Uses == 1
  1672    && x1.Uses == 1
  1673    && sh.Uses == 1
  1674    && mergePoint(b,x0,x1) != nil
  1675    && clobber(x0)
  1676    && clobber(x1)
  1677    && clobber(sh)
  1678    -> @mergePoint(b,x0,x1) (MOVWZloadidx [i0] {s} p idx mem)
  1679  
  1680  (OR                   x1:(MOVWZloadidx [i1] {s} p idx mem)
  1681      sh:(SLDconst [32] x0:(MOVWZloadidx [i0] {s} p idx mem)))
  1682    && i1 == i0+4
  1683    && p.Op != OpSB
  1684    && x0.Uses == 1
  1685    && x1.Uses == 1
  1686    && sh.Uses == 1
  1687    && mergePoint(b,x0,x1) != nil
  1688    && clobber(x0)
  1689    && clobber(x1)
  1690    && clobber(sh)
  1691    -> @mergePoint(b,x0,x1) (MOVDloadidx [i0] {s} p idx mem)
  1692  
  1693  (ORW
  1694      s0:(SLWconst [j0] x0:(MOVBZloadidx [i0] {s} p idx mem))
  1695      or:(ORW
  1696          s1:(SLWconst [j1] x1:(MOVBZloadidx [i1] {s} p idx mem))
  1697  	y))
  1698    && i1 == i0+1
  1699    && j1 == j0-8
  1700    && j1 % 16 == 0
  1701    && x0.Uses == 1
  1702    && x1.Uses == 1
  1703    && s0.Uses == 1
  1704    && s1.Uses == 1
  1705    && or.Uses == 1
  1706    && mergePoint(b,x0,x1,y) != nil
  1707    && clobber(x0)
  1708    && clobber(x1)
  1709    && clobber(s0)
  1710    && clobber(s1)
  1711    && clobber(or)
  1712    -> @mergePoint(b,x0,x1,y) (ORW <v.Type> (SLWconst <v.Type> [j1] (MOVHZloadidx [i0] {s} p idx mem)) y)
  1713  
  1714  (OR
  1715      s0:(SLDconst [j0] x0:(MOVBZloadidx [i0] {s} p idx mem))
  1716      or:(OR
  1717          s1:(SLDconst [j1] x1:(MOVBZloadidx [i1] {s} p idx mem))
  1718  	y))
  1719    && i1 == i0+1
  1720    && j1 == j0-8
  1721    && j1 % 16 == 0
  1722    && x0.Uses == 1
  1723    && x1.Uses == 1
  1724    && s0.Uses == 1
  1725    && s1.Uses == 1
  1726    && or.Uses == 1
  1727    && mergePoint(b,x0,x1,y) != nil
  1728    && clobber(x0)
  1729    && clobber(x1)
  1730    && clobber(s0)
  1731    && clobber(s1)
  1732    && clobber(or)
  1733    -> @mergePoint(b,x0,x1,y) (OR <v.Type> (SLDconst <v.Type> [j1] (MOVHZloadidx [i0] {s} p idx mem)) y)
  1734  
  1735  (OR
  1736      s0:(SLDconst [j0] x0:(MOVHZloadidx [i0] {s} p idx mem))
  1737      or:(OR
  1738          s1:(SLDconst [j1] x1:(MOVHZloadidx [i1] {s} p idx mem))
  1739  	y))
  1740    && i1 == i0+2
  1741    && j1 == j0-16
  1742    && j1 % 32 == 0
  1743    && x0.Uses == 1
  1744    && x1.Uses == 1
  1745    && s0.Uses == 1
  1746    && s1.Uses == 1
  1747    && or.Uses == 1
  1748    && mergePoint(b,x0,x1,y) != nil
  1749    && clobber(x0)
  1750    && clobber(x1)
  1751    && clobber(s0)
  1752    && clobber(s1)
  1753    && clobber(or)
  1754    -> @mergePoint(b,x0,x1,y) (OR <v.Type> (SLDconst <v.Type> [j1] (MOVWZloadidx [i0] {s} p idx mem)) y)
  1755  
  1756  // Little-endian loads
  1757  
  1758  (ORW                 x0:(MOVBZload [i0] {s} p mem)
  1759      sh:(SLWconst [8] x1:(MOVBZload [i1] {s} p mem)))
  1760    && p.Op != OpSB
  1761    && i1 == i0+1
  1762    && x0.Uses == 1
  1763    && x1.Uses == 1
  1764    && sh.Uses == 1
  1765    && mergePoint(b,x0,x1) != nil
  1766    && clobber(x0)
  1767    && clobber(x1)
  1768    && clobber(sh)
  1769    -> @mergePoint(b,x0,x1) (MOVHZreg (MOVHBRload [i0] {s} p mem))
  1770  
  1771  (OR                  x0:(MOVBZload [i0] {s} p mem)
  1772      sh:(SLDconst [8] x1:(MOVBZload [i1] {s} p mem)))
  1773    && p.Op != OpSB
  1774    && i1 == i0+1
  1775    && x0.Uses == 1
  1776    && x1.Uses == 1
  1777    && sh.Uses == 1
  1778    && mergePoint(b,x0,x1) != nil
  1779    && clobber(x0)
  1780    && clobber(x1)
  1781    && clobber(sh)
  1782    -> @mergePoint(b,x0,x1) (MOVHZreg (MOVHBRload [i0] {s} p mem))
  1783  
  1784  (ORW                  r0:(MOVHZreg x0:(MOVHBRload [i0] {s} p mem))
  1785      sh:(SLWconst [16] r1:(MOVHZreg x1:(MOVHBRload [i1] {s} p mem))))
  1786    && i1 == i0+2
  1787    && x0.Uses == 1
  1788    && x1.Uses == 1
  1789    && r0.Uses == 1
  1790    && r1.Uses == 1
  1791    && sh.Uses == 1
  1792    && mergePoint(b,x0,x1) != nil
  1793    && clobber(x0)
  1794    && clobber(x1)
  1795    && clobber(r0)
  1796    && clobber(r1)
  1797    && clobber(sh)
  1798    -> @mergePoint(b,x0,x1) (MOVWBRload [i0] {s} p mem)
  1799  
  1800  (OR                   r0:(MOVHZreg x0:(MOVHBRload [i0] {s} p mem))
  1801      sh:(SLDconst [16] r1:(MOVHZreg x1:(MOVHBRload [i1] {s} p mem))))
  1802    && i1 == i0+2
  1803    && x0.Uses == 1
  1804    && x1.Uses == 1
  1805    && r0.Uses == 1
  1806    && r1.Uses == 1
  1807    && sh.Uses == 1
  1808    && mergePoint(b,x0,x1) != nil
  1809    && clobber(x0)
  1810    && clobber(x1)
  1811    && clobber(r0)
  1812    && clobber(r1)
  1813    && clobber(sh)
  1814    -> @mergePoint(b,x0,x1) (MOVWZreg (MOVWBRload [i0] {s} p mem))
  1815  
  1816  (OR                   r0:(MOVWZreg x0:(MOVWBRload [i0] {s} p mem))
  1817      sh:(SLDconst [32] r1:(MOVWZreg x1:(MOVWBRload [i1] {s} p mem))))
  1818    && i1 == i0+4
  1819    && x0.Uses == 1
  1820    && x1.Uses == 1
  1821    && r0.Uses == 1
  1822    && r1.Uses == 1
  1823    && sh.Uses == 1
  1824    && mergePoint(b,x0,x1) != nil
  1825    && clobber(x0)
  1826    && clobber(x1)
  1827    && clobber(r0)
  1828    && clobber(r1)
  1829    && clobber(sh)
  1830    -> @mergePoint(b,x0,x1) (MOVDBRload [i0] {s} p mem)
  1831  
  1832  (ORW
  1833      s1:(SLWconst [j1] x1:(MOVBZload [i1] {s} p mem))
  1834      or:(ORW
  1835          s0:(SLWconst [j0] x0:(MOVBZload [i0] {s} p mem))
  1836  	y))
  1837    && p.Op != OpSB
  1838    && i1 == i0+1
  1839    && j1 == j0+8
  1840    && j0 % 16 == 0
  1841    && x0.Uses == 1
  1842    && x1.Uses == 1
  1843    && s0.Uses == 1
  1844    && s1.Uses == 1
  1845    && or.Uses == 1
  1846    && mergePoint(b,x0,x1,y) != nil
  1847    && clobber(x0)
  1848    && clobber(x1)
  1849    && clobber(s0)
  1850    && clobber(s1)
  1851    && clobber(or)
  1852    -> @mergePoint(b,x0,x1,y) (ORW <v.Type> (SLWconst <v.Type> [j0] (MOVHZreg (MOVHBRload [i0] {s} p mem))) y)
  1853  
  1854  (OR
  1855      s1:(SLDconst [j1] x1:(MOVBZload [i1] {s} p mem))
  1856      or:(OR
  1857          s0:(SLDconst [j0] x0:(MOVBZload [i0] {s} p mem))
  1858  	y))
  1859    && p.Op != OpSB
  1860    && i1 == i0+1
  1861    && j1 == j0+8
  1862    && j0 % 16 == 0
  1863    && x0.Uses == 1
  1864    && x1.Uses == 1
  1865    && s0.Uses == 1
  1866    && s1.Uses == 1
  1867    && or.Uses == 1
  1868    && mergePoint(b,x0,x1,y) != nil
  1869    && clobber(x0)
  1870    && clobber(x1)
  1871    && clobber(s0)
  1872    && clobber(s1)
  1873    && clobber(or)
  1874    -> @mergePoint(b,x0,x1,y) (OR <v.Type> (SLDconst <v.Type> [j0] (MOVHZreg (MOVHBRload [i0] {s} p mem))) y)
  1875  
  1876  (OR
  1877      s1:(SLDconst [j1] r1:(MOVHZreg x1:(MOVHBRload [i1] {s} p mem)))
  1878      or:(OR
  1879          s0:(SLDconst [j0] r0:(MOVHZreg x0:(MOVHBRload [i0] {s} p mem)))
  1880  	y))
  1881    && i1 == i0+2
  1882    && j1 == j0+16
  1883    && j0 % 32 == 0
  1884    && x0.Uses == 1
  1885    && x1.Uses == 1
  1886    && r0.Uses == 1
  1887    && r1.Uses == 1
  1888    && s0.Uses == 1
  1889    && s1.Uses == 1
  1890    && or.Uses == 1
  1891    && mergePoint(b,x0,x1,y) != nil
  1892    && clobber(x0)
  1893    && clobber(x1)
  1894    && clobber(r0)
  1895    && clobber(r1)
  1896    && clobber(s0)
  1897    && clobber(s1)
  1898    && clobber(or)
  1899    -> @mergePoint(b,x0,x1,y) (OR <v.Type> (SLDconst <v.Type> [j0] (MOVWZreg (MOVWBRload [i0] {s} p mem))) y)
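
         // For illustration (a sketch, not part of the original rules): here the byte at
         // the lowest offset lands in the least-significant position, so the combined
         // load must byte-swap, using the BR (byte-reversed) load forms. Hypothetical Go code:
         //
         //     func getLE32(b []byte) uint32 {
         //         // may collapse into a single byte-reversed 4-byte load (MOVWBRload)
         //         return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
         //     }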
  1900  
  1901  // Little-endian indexed loads
  1902  
  1903  (ORW                 x0:(MOVBZloadidx [i0] {s} p idx mem)
  1904      sh:(SLWconst [8] x1:(MOVBZloadidx [i1] {s} p idx mem)))
  1905    && p.Op != OpSB
  1906    && i1 == i0+1
  1907    && x0.Uses == 1
  1908    && x1.Uses == 1
  1909    && sh.Uses == 1
  1910    && mergePoint(b,x0,x1) != nil
  1911    && clobber(x0)
  1912    && clobber(x1)
  1913    && clobber(sh)
  1914    -> @mergePoint(b,x0,x1) (MOVHZreg (MOVHBRloadidx [i0] {s} p idx mem))
  1915  
  1916  (OR                  x0:(MOVBZloadidx [i0] {s} p idx mem)
  1917      sh:(SLDconst [8] x1:(MOVBZloadidx [i1] {s} p idx mem)))
  1918    && p.Op != OpSB
  1919    && i1 == i0+1
  1920    && x0.Uses == 1
  1921    && x1.Uses == 1
  1922    && sh.Uses == 1
  1923    && mergePoint(b,x0,x1) != nil
  1924    && clobber(x0)
  1925    && clobber(x1)
  1926    && clobber(sh)
  1927    -> @mergePoint(b,x0,x1) (MOVHZreg (MOVHBRloadidx [i0] {s} p idx mem))
  1928  
  1929  (ORW                  r0:(MOVHZreg x0:(MOVHBRloadidx [i0] {s} p idx mem))
  1930      sh:(SLWconst [16] r1:(MOVHZreg x1:(MOVHBRloadidx [i1] {s} p idx mem))))
  1931    && i1 == i0+2
  1932    && x0.Uses == 1
  1933    && x1.Uses == 1
  1934    && r0.Uses == 1
  1935    && r1.Uses == 1
  1936    && sh.Uses == 1
  1937    && mergePoint(b,x0,x1) != nil
  1938    && clobber(x0)
  1939    && clobber(x1)
  1940    && clobber(r0)
  1941    && clobber(r1)
  1942    && clobber(sh)
  1943    -> @mergePoint(b,x0,x1) (MOVWBRloadidx [i0] {s} p idx mem)
  1944  
  1945  (OR                   r0:(MOVHZreg x0:(MOVHBRloadidx [i0] {s} p idx mem))
  1946      sh:(SLDconst [16] r1:(MOVHZreg x1:(MOVHBRloadidx [i1] {s} p idx mem))))
  1947    && i1 == i0+2
  1948    && x0.Uses == 1
  1949    && x1.Uses == 1
  1950    && r0.Uses == 1
  1951    && r1.Uses == 1
  1952    && sh.Uses == 1
  1953    && mergePoint(b,x0,x1) != nil
  1954    && clobber(x0)
  1955    && clobber(x1)
  1956    && clobber(r0)
  1957    && clobber(r1)
  1958    && clobber(sh)
  1959    -> @mergePoint(b,x0,x1) (MOVWZreg (MOVWBRloadidx [i0] {s} p idx mem))
  1960  
  1961  (OR                   r0:(MOVWZreg x0:(MOVWBRloadidx [i0] {s} p idx mem))
  1962      sh:(SLDconst [32] r1:(MOVWZreg x1:(MOVWBRloadidx [i1] {s} p idx mem))))
  1963    && i1 == i0+4
  1964    && x0.Uses == 1
  1965    && x1.Uses == 1
  1966    && r0.Uses == 1
  1967    && r1.Uses == 1
  1968    && sh.Uses == 1
  1969    && mergePoint(b,x0,x1) != nil
  1970    && clobber(x0)
  1971    && clobber(x1)
  1972    && clobber(r0)
  1973    && clobber(r1)
  1974    && clobber(sh)
  1975    -> @mergePoint(b,x0,x1) (MOVDBRloadidx [i0] {s} p idx mem)
  1976  
  1977  (ORW
  1978      s1:(SLWconst [j1] x1:(MOVBZloadidx [i1] {s} p idx mem))
  1979      or:(ORW
  1980          s0:(SLWconst [j0] x0:(MOVBZloadidx [i0] {s} p idx mem))
  1981          y))
  1982    && p.Op != OpSB
  1983    && i1 == i0+1
  1984    && j1 == j0+8
  1985    && j0 % 16 == 0
  1986    && x0.Uses == 1
  1987    && x1.Uses == 1
  1988    && s0.Uses == 1
  1989    && s1.Uses == 1
  1990    && or.Uses == 1
  1991    && mergePoint(b,x0,x1,y) != nil
  1992    && clobber(x0)
  1993    && clobber(x1)
  1994    && clobber(s0)
  1995    && clobber(s1)
  1996    && clobber(or)
  1997    -> @mergePoint(b,x0,x1,y) (ORW <v.Type> (SLWconst <v.Type> [j0] (MOVHZreg (MOVHBRloadidx [i0] {s} p idx mem))) y)
  1998  
  1999  (OR
  2000      s1:(SLDconst [j1] x1:(MOVBZloadidx [i1] {s} p idx mem))
  2001      or:(OR
  2002          s0:(SLDconst [j0] x0:(MOVBZloadidx [i0] {s} p idx mem))
  2003          y))
  2004    && p.Op != OpSB
  2005    && i1 == i0+1
  2006    && j1 == j0+8
  2007    && j0 % 16 == 0
  2008    && x0.Uses == 1
  2009    && x1.Uses == 1
  2010    && s0.Uses == 1
  2011    && s1.Uses == 1
  2012    && or.Uses == 1
  2013    && mergePoint(b,x0,x1,y) != nil
  2014    && clobber(x0)
  2015    && clobber(x1)
  2016    && clobber(s0)
  2017    && clobber(s1)
  2018    && clobber(or)
  2019    -> @mergePoint(b,x0,x1,y) (OR <v.Type> (SLDconst <v.Type> [j0] (MOVHZreg (MOVHBRloadidx [i0] {s} p idx mem))) y)
  2020  
  2021  (OR
  2022      s1:(SLDconst [j1] r1:(MOVHZreg x1:(MOVHBRloadidx [i1] {s} p idx mem)))
  2023      or:(OR
  2024          s0:(SLDconst [j0] r0:(MOVHZreg x0:(MOVHBRloadidx [i0] {s} p idx mem)))
  2025          y))
  2026    && i1 == i0+2
  2027    && j1 == j0+16
  2028    && j0 % 32 == 0
  2029    && x0.Uses == 1
  2030    && x1.Uses == 1
  2031    && r0.Uses == 1
  2032    && r1.Uses == 1
  2033    && s0.Uses == 1
  2034    && s1.Uses == 1
  2035    && or.Uses == 1
  2036    && mergePoint(b,x0,x1,y) != nil
  2037    && clobber(x0)
  2038    && clobber(x1)
  2039    && clobber(r0)
  2040    && clobber(r1)
  2041    && clobber(s0)
  2042    && clobber(s1)
  2043    && clobber(or)
  2044    -> @mergePoint(b,x0,x1,y) (OR <v.Type> (SLDconst <v.Type> [j0] (MOVWZreg (MOVWBRloadidx [i0] {s} p idx mem))) y)
  2045  
  2046  // Combine stores into store multiples.
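        // The z/Architecture store-multiple instructions (STM for 32-bit registers,
        // STMG for 64-bit registers) store a run of consecutive registers to
        // consecutive memory locations, using a 20-bit signed displacement (hence
        // the is20Bit checks below).
        //
        // For example (an illustrative sketch, not code from this repository),
        // storing four 32-bit values to adjacent fields,
        //
        //     type quad struct{ a, b, c, d uint32 }
        //
        //     func fill(q *quad, a, b, c, d uint32) {
        //             q.a, q.b, q.c, q.d = a, b, c, d
        //     }
        //
        // may lower to four MOVWstore ops at offsets 0, 4, 8 and 12, which the rules
        // below fold step by step into a single STM4.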
  2047  // 32-bit
  2048  (MOVWstore [i] {s} p w1 x:(MOVWstore [i-4] {s} p w0 mem))
  2049    && p.Op != OpSB
  2050    && x.Uses == 1
  2051    && is20Bit(i-4)
  2052    && clobber(x)
  2053    -> (STM2 [i-4] {s} p w0 w1 mem)
  2054  (MOVWstore [i] {s} p w2 x:(STM2 [i-8] {s} p w0 w1 mem))
  2055    && x.Uses == 1
  2056    && is20Bit(i-8)
  2057    && clobber(x)
  2058    -> (STM3 [i-8] {s} p w0 w1 w2 mem)
  2059  (MOVWstore [i] {s} p w3 x:(STM3 [i-12] {s} p w0 w1 w2 mem))
  2060    && x.Uses == 1
  2061    && is20Bit(i-12)
  2062    && clobber(x)
  2063    -> (STM4 [i-12] {s} p w0 w1 w2 w3 mem)
  2064  (STM2 [i] {s} p w2 w3 x:(STM2 [i-8] {s} p w0 w1 mem))
  2065    && x.Uses == 1
  2066    && is20Bit(i-8)
  2067    && clobber(x)
  2068    -> (STM4 [i-8] {s} p w0 w1 w2 w3 mem)
  2069  // 64-bit
  2070  (MOVDstore [i] {s} p w1 x:(MOVDstore [i-8] {s} p w0 mem))
  2071    && p.Op != OpSB
  2072    && x.Uses == 1
  2073    && is20Bit(i-8)
  2074    && clobber(x)
  2075    -> (STMG2 [i-8] {s} p w0 w1 mem)
  2076  (MOVDstore [i] {s} p w2 x:(STMG2 [i-16] {s} p w0 w1 mem))
  2077    && x.Uses == 1
  2078    && is20Bit(i-16)
  2079    && clobber(x)
  2080    -> (STMG3 [i-16] {s} p w0 w1 w2 mem)
  2081  (MOVDstore [i] {s} p w3 x:(STMG3 [i-24] {s} p w0 w1 w2 mem))
  2082    && x.Uses == 1
  2083    && is20Bit(i-24)
  2084    && clobber(x)
  2085    -> (STMG4 [i-24] {s} p w0 w1 w2 w3 mem)
  2086  (STMG2 [i] {s} p w2 w3 x:(STMG2 [i-16] {s} p w0 w1 mem))
  2087    && x.Uses == 1
  2088    && is20Bit(i-16)
  2089    && clobber(x)
  2090    -> (STMG4 [i-16] {s} p w0 w1 w2 w3 mem)
  2091  
  2092  // Convert 32-bit store multiples into 64-bit stores.
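        // z/Architecture is big-endian, so an STM2 that stores x>>32 at [i] and the
        // low 32 bits of x at [i+4] writes the same bytes as a single 64-bit store
        // of x.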
  2093  (STM2 [i] {s} p (SRDconst [32] x) x mem) -> (MOVDstore [i] {s} p x mem)