github.com/riscv/riscv-go@v0.0.0-20200123204226-124ebd6fcc8e/src/cmd/compile/internal/ssa/gen/RISCV.rules

// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Optimizations TODO:
// * Somehow track when values are already zero/sign-extended and avoid re-extending.
// * Use SLTI and SLTIU for comparisons to constants, instead of SLT/SLTU with constants in registers.
// * Find a more efficient way to do zero/sign extension than left+right shift.
//   There are many other options (store then load-extend, LUI+ANDI for zero extend, special case 32->64, ...),
//   but left+right shift is simple and uniform, and we don't have real hardware to do perf testing on anyway.
// * Use the zero register instead of moving 0 into a register.
// * Add rules to avoid generating a temp bool value for (If (SLT[U] ...) ...).
// * Optimize left and right shifts by simplifying SLTIU, Neg, and ADD for
//   constants.
// * Arrange for non-trivial Zero and Move lowerings to use aligned loads and stores.
// * Eliminate zero-immediate shifts, adds, etc.
// * Use Duff's device for some moves and zeros.

// Lowering arithmetic
(Add64 x y) -> (ADD x y)
(AddPtr x y) -> (ADD x y)
(Add32 x y) -> (ADD x y)
(Add16 x y) -> (ADD x y)
(Add8 x y) -> (ADD x y)
(Add32F x y) -> (FADDS x y)
(Add64F x y) -> (FADDD x y)

(Sub64 x y) -> (SUB x y)
(SubPtr x y) -> (SUB x y)
(Sub32 x y) -> (SUB x y)
(Sub16 x y) -> (SUB x y)
(Sub8 x y) -> (SUB x y)
(Sub32F x y) -> (FSUBS x y)
(Sub64F x y) -> (FSUBD x y)

(Mul64 x y) -> (MUL  x y)
(Mul32 x y) -> (MULW x y)
(Mul16 x y) -> (MULW (SignExt16to32 x) (SignExt16to32 y))
(Mul8 x y)  -> (MULW (SignExt8to32 x)  (SignExt8to32 y))
(Mul32F x y) -> (FMULS x y)
(Mul64F x y) -> (FMULD x y)

(Div32F x y) -> (FDIVS x y)
(Div64F x y) -> (FDIVD x y)

(Div64 x y)  -> (DIV   x y)
(Div64u x y) -> (DIVU  x y)
(Div32 x y)  -> (DIVW  x y)
(Div32u x y) -> (DIVUW x y)
(Div16 x y)  -> (DIVW  (SignExt16to32 x) (SignExt16to32 y))
(Div16u x y) -> (DIVUW (ZeroExt16to32 x) (ZeroExt16to32 y))
(Div8 x y)   -> (DIVW  (SignExt8to32 x)  (SignExt8to32 y))
(Div8u x y)  -> (DIVUW (ZeroExt8to32 x)  (ZeroExt8to32 y))

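// High multiplies for widths below 64 bits: widen both operands, take the
// full product, then shift right by the operand width to extract the high half.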
(Hmul64 x y)  -> (MULH  x y)
(Hmul64u x y) -> (MULHU x y)
(Hmul32 x y)  -> (SRAI [32] (MUL  (SignExt32to64 x) (SignExt32to64 y)))
(Hmul32u x y) -> (SRLI [32] (MUL  (ZeroExt32to64 x) (ZeroExt32to64 y)))
(Hmul16 x y)  -> (SRAI [16] (MULW (SignExt16to32 x) (SignExt16to32 y)))
(Hmul16u x y) -> (SRLI [16] (MULW (ZeroExt16to32 x) (ZeroExt16to32 y)))
(Hmul8 x y)   -> (SRAI [8]  (MULW (SignExt8to32 x)  (SignExt8to32 y)))
(Hmul8u x y)  -> (SRLI [8]  (MULW (ZeroExt8to32 x)  (ZeroExt8to32 y)))

// (x + y) / 2 -> (x / 2) + (y / 2) + (x & y & 1)
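// For example, with x=7 and y=5: (7+5)/2 = 6, and (7/2)+(5/2)+(7&5&1) = 3+2+1 = 6.
// The (x & y & 1) term is 1 exactly when both inputs are odd, which is when
// halving each input separately would lose a unit.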
(Avg64u <t> x y) -> (ADD (ADD <t> (SRLI <t> [1] x) (SRLI <t> [1] y)) (ANDI <t> [1] (AND <t> x y)))

(Mod64 x y)  -> (REM   x y)
(Mod64u x y) -> (REMU  x y)
(Mod32 x y)  -> (REMW  x y)
(Mod32u x y) -> (REMUW x y)
(Mod16 x y)  -> (REMW  (SignExt16to32 x) (SignExt16to32 y))
(Mod16u x y) -> (REMUW (ZeroExt16to32 x) (ZeroExt16to32 y))
(Mod8 x y)   -> (REMW  (SignExt8to32 x)  (SignExt8to32 y))
(Mod8u x y)  -> (REMUW (ZeroExt8to32 x)  (ZeroExt8to32 y))

(And64 x y) -> (AND x y)
(And32 x y) -> (AND x y)
(And16 x y) -> (AND x y)
(And8  x y) -> (AND x y)

(Or64 x y) -> (OR x y)
(Or32 x y) -> (OR x y)
(Or16 x y) -> (OR x y)
(Or8  x y) -> (OR x y)

(Xor64 x y) -> (XOR x y)
(Xor32 x y) -> (XOR x y)
(Xor16 x y) -> (XOR x y)
(Xor8  x y) -> (XOR x y)

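// Negation is lowered as 0 - x: the bare (MOVxconst) operands, written with no
// immediate, materialize the constant 0 (hence the TODO above about using the
// zero register instead).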
(Neg64 x) -> (SUB (MOVDconst) x)
(Neg32 x) -> (SUB (MOVWconst) x)
(Neg16 x) -> (SUB (MOVHconst) x)
(Neg8  x) -> (SUB (MOVBconst) x)
(Neg32F x) -> (FNEGS x)
(Neg64F x) -> (FNEGD x)

(Com64 x) -> (XORI [int64(-1)] x)
(Com32 x) -> (XORI [int64(-1)] x)
(Com16 x) -> (XORI [int64(-1)] x)
(Com8  x) -> (XORI [int64(-1)] x)

(Sqrt x) -> (FSQRTD x)

// Zero and sign extension
// Shift left until the bits we want are at the top of the register.
// Then logical/arithmetic shift right for zero/sign extend.
// We always extend to 64 bits; there's no reason not to,
// and optimization rules can then collapse some extensions.
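//
// For example, sign-extending the byte 0x80 (-128): 0x80<<56 = 0x8000000000000000,
// and an arithmetic right shift by 56 yields 0xFFFFFFFFFFFFFF80, i.e. -128.
// Zero extension uses a logical right shift instead, yielding 0x0000000000000080.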

(SignExt8to16  <t> x) -> (SRAI [56] (SLLI <t> [56] x))
(SignExt8to32  <t> x) -> (SRAI [56] (SLLI <t> [56] x))
(SignExt8to64  <t> x) -> (SRAI [56] (SLLI <t> [56] x))
(SignExt16to32 <t> x) -> (SRAI [48] (SLLI <t> [48] x))
(SignExt16to64 <t> x) -> (SRAI [48] (SLLI <t> [48] x))
(SignExt32to64 <t> x) -> (SRAI [32] (SLLI <t> [32] x))

(ZeroExt8to16  <t> x) -> (SRLI [56] (SLLI <t> [56] x))
(ZeroExt8to32  <t> x) -> (SRLI [56] (SLLI <t> [56] x))
(ZeroExt8to64  <t> x) -> (SRLI [56] (SLLI <t> [56] x))
(ZeroExt16to32 <t> x) -> (SRLI [48] (SLLI <t> [48] x))
(ZeroExt16to64 <t> x) -> (SRLI [48] (SLLI <t> [48] x))
(ZeroExt32to64 <t> x) -> (SRLI [32] (SLLI <t> [32] x))

(Cvt32to32F x) -> (FCVTSW x)
(Cvt32to64F x) -> (FCVTDW x)
(Cvt64to32F x) -> (FCVTSL x)
(Cvt64to64F x) -> (FCVTDL x)

(Cvt32Fto32 x) -> (FCVTWS x)
(Cvt32Fto64 x) -> (FCVTLS x)
(Cvt64Fto32 x) -> (FCVTWD x)
(Cvt64Fto64 x) -> (FCVTLD x)

(Cvt32Fto64F x) -> (FCVTDS x)
(Cvt64Fto32F x) -> (FCVTSD x)

// From genericOps.go:
// "0 if arg0 == 0, -1 if arg0 > 0, undef if arg0<0"
//
// Like other arches, we compute ~((x-1) >> 63), with arithmetic right shift.
// For positive x, bit 63 of x-1 is always 0, so the result is -1.
// For zero x, bit 63 of x-1 is 1, so the result is 0.
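// For example, x=5: 5-1 = 4, 4>>63 = 0, and ^0 = -1 (the all-ones mask).
// For x=0: 0-1 = -1, (-1)>>63 = -1 arithmetically, and ^(-1) = 0.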
//
// TODO(prattmic): Use XORconst etc instead of XOR (MOVDconst).
(Slicemask <t> x) -> (XOR (MOVDconst [-1]) (SRA <t> (SUB <t> x (MOVDconst [1])) (MOVDconst [63])))

// Truncations
// We ignore the unused high parts of registers, so truncates are just copies.
(Trunc16to8  x) -> x
(Trunc32to8  x) -> x
(Trunc32to16 x) -> x
(Trunc64to8  x) -> x
(Trunc64to16 x) -> x
(Trunc64to32 x) -> x

// Shifts

// SLL only considers the bottom 6 bits of y. If y >= 64, the result should
// always be 0.
//
// Breaking down the operation:
//
// (SLL x y) generates x << (y & 63).
//
// If y < 64, this is the value we want. Otherwise, we want zero.
//
// So, we AND with -1 * uint64(y < 64), which is 0xfffff... if y < 64 and 0 otherwise.
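//
// For example, with y=3: SLTIU [64] gives 1, the Neg gives -1 (all ones), and
// the AND leaves x<<3 unchanged. With y=70: SLL computes x<<(70&63) = x<<6,
// but SLTIU [64] gives 0, the Neg gives 0, and the AND forces the result to 0.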
(Lsh8x8   <t> x y) -> (AND (SLL <t> x y) (Neg8  <t> (SLTIU <t> [64] (ZeroExt8to64  y))))
(Lsh8x16  <t> x y) -> (AND (SLL <t> x y) (Neg8  <t> (SLTIU <t> [64] (ZeroExt16to64 y))))
(Lsh8x32  <t> x y) -> (AND (SLL <t> x y) (Neg8  <t> (SLTIU <t> [64] (ZeroExt32to64 y))))
(Lsh8x64  <t> x y) -> (AND (SLL <t> x y) (Neg8  <t> (SLTIU <t> [64] y)))
(Lsh16x8  <t> x y) -> (AND (SLL <t> x y) (Neg16 <t> (SLTIU <t> [64] (ZeroExt8to64  y))))
(Lsh16x16 <t> x y) -> (AND (SLL <t> x y) (Neg16 <t> (SLTIU <t> [64] (ZeroExt16to64 y))))
(Lsh16x32 <t> x y) -> (AND (SLL <t> x y) (Neg16 <t> (SLTIU <t> [64] (ZeroExt32to64 y))))
(Lsh16x64 <t> x y) -> (AND (SLL <t> x y) (Neg16 <t> (SLTIU <t> [64] y)))
(Lsh32x8  <t> x y) -> (AND (SLL <t> x y) (Neg32 <t> (SLTIU <t> [64] (ZeroExt8to64  y))))
(Lsh32x16 <t> x y) -> (AND (SLL <t> x y) (Neg32 <t> (SLTIU <t> [64] (ZeroExt16to64 y))))
(Lsh32x32 <t> x y) -> (AND (SLL <t> x y) (Neg32 <t> (SLTIU <t> [64] (ZeroExt32to64 y))))
(Lsh32x64 <t> x y) -> (AND (SLL <t> x y) (Neg32 <t> (SLTIU <t> [64] y)))
(Lsh64x8  <t> x y) -> (AND (SLL <t> x y) (Neg64 <t> (SLTIU <t> [64] (ZeroExt8to64  y))))
(Lsh64x16 <t> x y) -> (AND (SLL <t> x y) (Neg64 <t> (SLTIU <t> [64] (ZeroExt16to64 y))))
(Lsh64x32 <t> x y) -> (AND (SLL <t> x y) (Neg64 <t> (SLTIU <t> [64] (ZeroExt32to64 y))))
(Lsh64x64 <t> x y) -> (AND (SLL <t> x y) (Neg64 <t> (SLTIU <t> [64] y)))

// SRL only considers the bottom 6 bits of y. If y >= 64, the result should
// always be 0. See Lsh above for a detailed description.
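// The shifted value x is zero-extended first so that zeros, rather than
// whatever happens to sit in the upper bits of the register, are shifted in
// from above the operand's width.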
(Rsh8Ux8   <t> x y) -> (AND (SRL <t> (ZeroExt8to64  x) y) (Neg8  <t> (SLTIU <t> [64] (ZeroExt8to64  y))))
(Rsh8Ux16  <t> x y) -> (AND (SRL <t> (ZeroExt8to64  x) y) (Neg8  <t> (SLTIU <t> [64] (ZeroExt16to64 y))))
(Rsh8Ux32  <t> x y) -> (AND (SRL <t> (ZeroExt8to64  x) y) (Neg8  <t> (SLTIU <t> [64] (ZeroExt32to64 y))))
(Rsh8Ux64  <t> x y) -> (AND (SRL <t> (ZeroExt8to64  x) y) (Neg8  <t> (SLTIU <t> [64] y)))
(Rsh16Ux8  <t> x y) -> (AND (SRL <t> (ZeroExt16to64 x) y) (Neg16 <t> (SLTIU <t> [64] (ZeroExt8to64  y))))
(Rsh16Ux16 <t> x y) -> (AND (SRL <t> (ZeroExt16to64 x) y) (Neg16 <t> (SLTIU <t> [64] (ZeroExt16to64 y))))
(Rsh16Ux32 <t> x y) -> (AND (SRL <t> (ZeroExt16to64 x) y) (Neg16 <t> (SLTIU <t> [64] (ZeroExt32to64 y))))
(Rsh16Ux64 <t> x y) -> (AND (SRL <t> (ZeroExt16to64 x) y) (Neg16 <t> (SLTIU <t> [64] y)))
(Rsh32Ux8  <t> x y) -> (AND (SRL <t> (ZeroExt32to64 x) y) (Neg32 <t> (SLTIU <t> [64] (ZeroExt8to64  y))))
(Rsh32Ux16 <t> x y) -> (AND (SRL <t> (ZeroExt32to64 x) y) (Neg32 <t> (SLTIU <t> [64] (ZeroExt16to64 y))))
(Rsh32Ux32 <t> x y) -> (AND (SRL <t> (ZeroExt32to64 x) y) (Neg32 <t> (SLTIU <t> [64] (ZeroExt32to64 y))))
(Rsh32Ux64 <t> x y) -> (AND (SRL <t> (ZeroExt32to64 x) y) (Neg32 <t> (SLTIU <t> [64] y)))
(Rsh64Ux8  <t> x y) -> (AND (SRL <t> x                 y) (Neg64 <t> (SLTIU <t> [64] (ZeroExt8to64  y))))
(Rsh64Ux16 <t> x y) -> (AND (SRL <t> x                 y) (Neg64 <t> (SLTIU <t> [64] (ZeroExt16to64 y))))
(Rsh64Ux32 <t> x y) -> (AND (SRL <t> x                 y) (Neg64 <t> (SLTIU <t> [64] (ZeroExt32to64 y))))
(Rsh64Ux64 <t> x y) -> (AND (SRL <t> x                 y) (Neg64 <t> (SLTIU <t> [64] y)))

// SRA only considers the bottom 6 bits of y. If y >= 64, the result should
// be either 0 or -1 based on the sign bit of x.
//
// We implement this by forcing the shift amount to -1 (all ones, which the
// shift masks down to the maximum shift of 63) whenever y >= 64.
//
// We OR (uint64(y < 64) - 1) into y before passing it to SRA. This leaves
// us with -1 (0xffff...) if y >= 64.
//
// We don't need to sign-extend the OR result, as it will be at minimum 8 bits,
// more than the 6 bits SRA cares about.
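//
// For example, with y=70: SLTIU [64] gives 0, ADDI [-1] gives -1, and OR-ing y
// with -1 gives -1, so SRA shifts by 63 (the low 6 bits of -1) and produces
// pure sign fill. With y=3: SLTIU gives 1, ADDI [-1] gives 0, and OR-ing with 0
// leaves y unchanged.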
(Rsh8x8   <t> x y) -> (SRA <t> (SignExt8to64  x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt8to64  y)))))
(Rsh8x16  <t> x y) -> (SRA <t> (SignExt8to64  x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt16to64 y)))))
(Rsh8x32  <t> x y) -> (SRA <t> (SignExt8to64  x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt32to64 y)))))
(Rsh8x64  <t> x y) -> (SRA <t> (SignExt8to64  x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] y))))
(Rsh16x8  <t> x y) -> (SRA <t> (SignExt16to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt8to64  y)))))
(Rsh16x16 <t> x y) -> (SRA <t> (SignExt16to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt16to64 y)))))
(Rsh16x32 <t> x y) -> (SRA <t> (SignExt16to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt32to64 y)))))
(Rsh16x64 <t> x y) -> (SRA <t> (SignExt16to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] y))))
(Rsh32x8  <t> x y) -> (SRA <t> (SignExt32to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt8to64  y)))))
(Rsh32x16 <t> x y) -> (SRA <t> (SignExt32to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt16to64 y)))))
(Rsh32x32 <t> x y) -> (SRA <t> (SignExt32to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt32to64 y)))))
(Rsh32x64 <t> x y) -> (SRA <t> (SignExt32to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] y))))
(Rsh64x8  <t> x y) -> (SRA <t> x                 (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt8to64  y)))))
(Rsh64x16 <t> x y) -> (SRA <t> x                 (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt16to64 y)))))
(Rsh64x32 <t> x y) -> (SRA <t> x                 (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt32to64 y)))))
(Rsh64x64 <t> x y) -> (SRA <t> x                 (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] y))))

(Less64  x y) -> (SLT  x y)
(Less32  x y) -> (SLT  (SignExt32to64 x) (SignExt32to64 y))
(Less16  x y) -> (SLT  (SignExt16to64 x) (SignExt16to64 y))
(Less8   x y) -> (SLT  (SignExt8to64  x) (SignExt8to64  y))
(Less64U x y) -> (SLTU x y)
(Less32U x y) -> (SLTU (ZeroExt32to64 x) (ZeroExt32to64 y))
(Less16U x y) -> (SLTU (ZeroExt16to64 x) (ZeroExt16to64 y))
(Less8U  x y) -> (SLTU (ZeroExt8to64  x) (ZeroExt8to64  y))
(Less64F x y) -> (FLTD x y)
(Less32F x y) -> (FLTS x y)

// Convert x <= y to !(y < x).
(Leq64  x y) -> (Not (Less64  y x))
(Leq32  x y) -> (Not (Less32  y x))
(Leq16  x y) -> (Not (Less16  y x))
(Leq8   x y) -> (Not (Less8   y x))
(Leq64U x y) -> (Not (Less64U y x))
(Leq32U x y) -> (Not (Less32U y x))
(Leq16U x y) -> (Not (Less16U y x))
(Leq8U  x y) -> (Not (Less8U  y x))
(Leq64F x y) -> (FLED x y)
(Leq32F x y) -> (FLES x y)

// Convert x > y to y < x.
(Greater64  x y) -> (Less64  y x)
(Greater32  x y) -> (Less32  y x)
(Greater16  x y) -> (Less16  y x)
(Greater8   x y) -> (Less8   y x)
(Greater64U x y) -> (Less64U y x)
(Greater32U x y) -> (Less32U y x)
(Greater16U x y) -> (Less16U y x)
(Greater8U  x y) -> (Less8U  y x)
(Greater64F x y) -> (FLTD y x)
(Greater32F x y) -> (FLTS y x)

// Convert x >= y to !(x < y).
(Geq64  x y) -> (Not (Less64  x y))
(Geq32  x y) -> (Not (Less32  x y))
(Geq16  x y) -> (Not (Less16  x y))
(Geq8   x y) -> (Not (Less8   x y))
(Geq64U x y) -> (Not (Less64U x y))
(Geq32U x y) -> (Not (Less32U x y))
(Geq16U x y) -> (Not (Less16U x y))
(Geq8U  x y) -> (Not (Less8U  x y))
(Geq64F x y) -> (FLED y x)
(Geq32F x y) -> (FLES y x)

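// Equality is computed by subtracting and testing the difference against zero.
// For widths below 64 bits the difference is zero-extended first, since the
// upper register bits of the inputs (and hence of the SUB) may hold stale values.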
(EqPtr x y) -> (SEQZ (SUB <x.Type> x y))
(Eq64  x y) -> (SEQZ (SUB <x.Type> x y))
(Eq32  x y) -> (SEQZ (ZeroExt32to64 (SUB <x.Type> x y)))
(Eq16  x y) -> (SEQZ (ZeroExt16to64 (SUB <x.Type> x y)))
(Eq8   x y) -> (SEQZ (ZeroExt8to64  (SUB <x.Type> x y)))
(Eq64F x y) -> (FEQD x y)
(Eq32F x y) -> (FEQS x y)

(NeqPtr x y) -> (SNEZ (SUB <x.Type> x y))
(Neq64  x y) -> (SNEZ (SUB <x.Type> x y))
(Neq32  x y) -> (SNEZ (ZeroExt32to64 (SUB <x.Type> x y)))
(Neq16  x y) -> (SNEZ (ZeroExt16to64 (SUB <x.Type> x y)))
(Neq8   x y) -> (SNEZ (ZeroExt8to64  (SUB <x.Type> x y)))
(Neq64F x y) -> (FNED x y)
(Neq32F x y) -> (FNES x y)

// Loads
(Load <t> ptr mem) &&  t.IsBoolean()                  -> (MOVBUload ptr mem)
(Load <t> ptr mem) && ( is8BitInt(t) &&  isSigned(t)) -> (MOVBload  ptr mem)
(Load <t> ptr mem) && ( is8BitInt(t) && !isSigned(t)) -> (MOVBUload ptr mem)
(Load <t> ptr mem) && (is16BitInt(t) &&  isSigned(t)) -> (MOVHload  ptr mem)
(Load <t> ptr mem) && (is16BitInt(t) && !isSigned(t)) -> (MOVHUload ptr mem)
(Load <t> ptr mem) && (is32BitInt(t) &&  isSigned(t)) -> (MOVWload  ptr mem)
(Load <t> ptr mem) && (is32BitInt(t) && !isSigned(t)) -> (MOVWUload ptr mem)
(Load <t> ptr mem) && (is64BitInt(t) || isPtr(t))     -> (MOVDload  ptr mem)
(Load <t> ptr mem) &&  is32BitFloat(t)                -> (FMOVWload ptr mem)
(Load <t> ptr mem) &&  is64BitFloat(t)                -> (FMOVDload ptr mem)

// Stores
(Store [1] ptr val mem)                            -> (MOVBstore ptr val mem)
(Store [2] ptr val mem)                            -> (MOVHstore ptr val mem)
(Store [4] ptr val mem) && !is32BitFloat(val.Type) -> (MOVWstore ptr val mem)
(Store [8] ptr val mem) && !is64BitFloat(val.Type) -> (MOVDstore ptr val mem)
(Store [4] ptr val mem) &&  is32BitFloat(val.Type) -> (FMOVWstore ptr val mem)
(Store [8] ptr val mem) &&  is64BitFloat(val.Type) -> (FMOVDstore ptr val mem)

// We need to fold MOVaddr into the load and store ops so that the live variable
// analysis knows which variables are being read/written by the ops.
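// For example, (MOVDload [8] {sym1} (MOVaddr [16] {sym2} base) mem) becomes
// (MOVDload [24] {mergeSym(sym1,sym2)} base mem), provided the combined offset
// fits in 32 bits and the symbols can be merged.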
(MOVBUload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
	(MOVBUload [off1+off2] {mergeSym(sym1,sym2)} base mem)
(MOVBload  [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
	(MOVBload  [off1+off2] {mergeSym(sym1,sym2)} base mem)
(MOVHUload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
	(MOVHUload [off1+off2] {mergeSym(sym1,sym2)} base mem)
(MOVHload  [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
	(MOVHload  [off1+off2] {mergeSym(sym1,sym2)} base mem)
(MOVWUload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
	(MOVWUload [off1+off2] {mergeSym(sym1,sym2)} base mem)
(MOVWload  [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
	(MOVWload  [off1+off2] {mergeSym(sym1,sym2)} base mem)
(MOVDload  [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
	(MOVDload  [off1+off2] {mergeSym(sym1,sym2)} base mem)

(MOVBstore [off1] {sym1} (MOVaddr [off2] {sym2} base) val mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
	(MOVBstore [off1+off2] {mergeSym(sym1,sym2)} base val mem)
(MOVHstore [off1] {sym1} (MOVaddr [off2] {sym2} base) val mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
	(MOVHstore [off1+off2] {mergeSym(sym1,sym2)} base val mem)
(MOVWstore [off1] {sym1} (MOVaddr [off2] {sym2} base) val mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
	(MOVWstore [off1+off2] {mergeSym(sym1,sym2)} base val mem)
(MOVDstore [off1] {sym1} (MOVaddr [off2] {sym2} base) val mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
	(MOVDstore [off1+off2] {mergeSym(sym1,sym2)} base val mem)

(MOVBUload [off1] {sym} (ADDI [off2] base) mem) && is32Bit(off1+off2) ->
	(MOVBUload [off1+off2] {sym} base mem)
(MOVBload  [off1] {sym} (ADDI [off2] base) mem) && is32Bit(off1+off2) ->
	(MOVBload  [off1+off2] {sym} base mem)
(MOVHUload [off1] {sym} (ADDI [off2] base) mem) && is32Bit(off1+off2) ->
	(MOVHUload [off1+off2] {sym} base mem)
(MOVHload  [off1] {sym} (ADDI [off2] base) mem) && is32Bit(off1+off2) ->
	(MOVHload  [off1+off2] {sym} base mem)
(MOVWUload [off1] {sym} (ADDI [off2] base) mem) && is32Bit(off1+off2) ->
	(MOVWUload [off1+off2] {sym} base mem)
(MOVWload  [off1] {sym} (ADDI [off2] base) mem) && is32Bit(off1+off2) ->
	(MOVWload  [off1+off2] {sym} base mem)
(MOVDload  [off1] {sym} (ADDI [off2] base) mem) && is32Bit(off1+off2) ->
	(MOVDload  [off1+off2] {sym} base mem)

(MOVBstore [off1] {sym} (ADDI [off2] base) val mem) && is32Bit(off1+off2) ->
	(MOVBstore [off1+off2] {sym} base val mem)
(MOVHstore [off1] {sym} (ADDI [off2] base) val mem) && is32Bit(off1+off2) ->
	(MOVHstore [off1+off2] {sym} base val mem)
(MOVWstore [off1] {sym} (ADDI [off2] base) val mem) && is32Bit(off1+off2) ->
	(MOVWstore [off1+off2] {sym} base val mem)
(MOVDstore [off1] {sym} (ADDI [off2] base) val mem) && is32Bit(off1+off2) ->
	(MOVDstore [off1+off2] {sym} base val mem)

// Similarly, fold ADDI into MOVaddr to avoid confusing live variable analysis
// with OffPtr -> ADDI.
(ADDI [c] (MOVaddr [d] {s} x)) && is32Bit(c+d) -> (MOVaddr [c+d] {s} x)

// Zeroing
// TODO: more optimized zeroing, including attempting to use aligned accesses.
(Zero [s]   _ mem) && SizeAndAlign(s).Size() == 0 -> mem
(Zero [s] ptr mem) && SizeAndAlign(s).Size() == 1 -> (MOVBstore ptr (MOVBconst) mem)
(Zero [s] ptr mem) && SizeAndAlign(s).Size() == 2 -> (MOVHstore ptr (MOVHconst) mem)
(Zero [s] ptr mem) && SizeAndAlign(s).Size() == 4 -> (MOVWstore ptr (MOVWconst) mem)
(Zero [s] ptr mem) && SizeAndAlign(s).Size() == 8 -> (MOVDstore ptr (MOVDconst) mem)

// Generic zeroing uses a loop
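// The ADD operand below works out to ptr + size - moveSize(align), i.e. the
// address of the last move-sized unit inside the region; LoweredZero presumably
// uses it as the stopping point for its store loop. LoweredMove further down
// is given the matching address within src.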
(Zero [s] ptr mem) ->
	(LoweredZero [SizeAndAlign(s).Align()]
		ptr
		(ADD <ptr.Type> ptr (MOVDconst [SizeAndAlign(s).Size()-moveSize(SizeAndAlign(s).Align(), config)]))
		mem)

(Convert x mem) -> (MOVconvert x mem)

// Checks
(IsNonNil p) -> (NeqPtr (MOVDconst) p)
(IsInBounds idx len) -> (Less64U idx len)
(IsSliceInBounds idx len) -> (Leq64U idx len)

// Trivial lowering
(NilCheck ptr mem) -> (LoweredNilCheck ptr mem)
(GetClosurePtr) -> (LoweredGetClosurePtr)

// Moves
// TODO: more optimized moves, including attempting to use aligned accesses.
(Move [s]   _   _ mem) && SizeAndAlign(s).Size() == 0 -> mem
(Move [s] dst src mem) && SizeAndAlign(s).Size() == 1 -> (MOVBstore dst (MOVBload src mem) mem)
(Move [s] dst src mem) && SizeAndAlign(s).Size() == 2 -> (MOVHstore dst (MOVHload src mem) mem)
(Move [s] dst src mem) && SizeAndAlign(s).Size() == 4 -> (MOVWstore dst (MOVWload src mem) mem)
(Move [s] dst src mem) && SizeAndAlign(s).Size() == 8 -> (MOVDstore dst (MOVDload src mem) mem)

// Generic move uses a loop
(Move [s] dst src mem) ->
	(LoweredMove [SizeAndAlign(s).Align()]
		dst
		src
		(ADDI <src.Type> [SizeAndAlign(s).Size()-moveSize(SizeAndAlign(s).Align(), config)] src)
		mem)

// Boolean ops; 0=false, 1=true
(AndB x y) -> (AND  x y)
(OrB  x y) -> (OR   x y)
(EqB  x y) -> (Eq8  x y)
(NeqB x y) -> (Neq8 x y)
(Not  x)   -> (XORI [1] x)

// TODO: Special handling for SP offsets, like ARM
(OffPtr [off] ptr:(SP)) -> (MOVaddr [off] ptr)
(OffPtr [off] ptr) -> (ADDI [off] ptr)

(Const8 [val]) -> (MOVBconst [val])
(Const16 [val]) -> (MOVHconst [val])
(Const32 [val]) -> (MOVWconst [val])
(Const64 [val]) -> (MOVDconst [val])
(Const32F [val]) -> (FMVSX (MOVSconst [val]))
(Const64F [val]) -> (FMVDX (MOVDconst [val]))
(ConstNil) -> (MOVDconst [0])
(ConstBool [b]) -> (MOVBconst [b])

// Convert 64 bit immediate to two 32 bit immediates, combine with add and shift.
// The lower 32 bit immediate will be treated as signed,
// so if it is negative, adjust for the borrow by incrementing the top half.
// We don't have to worry about overflow from the increment,
// because if the top half is all 1s, and int32(c) is negative,
// then the overall constant fits in an int32.
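//
// For example, c = 0xFFFFFFFF does not fit in 32 bits and int32(c) = -1 < 0,
// so the first rule applies: (c>>32)+1 = 1, and (1<<32) + (-1) = 0x100000000 - 1
// = 0xFFFFFFFF, as required.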
(MOVDconst <t> [c]) && !is32Bit(c) && int32(c) <  0 -> (ADD (SLLI <t> [32] (MOVDconst [c>>32+1])) (MOVDconst [int64(int32(c))]))
(MOVDconst <t> [c]) && !is32Bit(c) && int32(c) >= 0 -> (ADD (SLLI <t> [32] (MOVDconst [c>>32+0])) (MOVDconst [int64(int32(c))]))

(Addr {sym} base) -> (MOVaddr {sym} base)

// Conditional branches
//
// cond is 1 if true. BNE compares against 0.
//
// TODO(prattmic): RISCV branch instructions take two operands to compare,
// so we could generate more efficient code by computing the condition in the
// branch itself. Unfortunately, the compiler doesn't currently support Blocks
// with two control values. Revisit adding support for two control values.
(If cond yes no) -> (BNE cond yes no)

// Calls
(StaticCall  [argwid] {target}      mem) -> (CALLstatic  [argwid] {target}      mem)
(ClosureCall [argwid] entry closure mem) -> (CALLclosure [argwid] entry closure mem)
(DeferCall   [argwid]               mem) -> (CALLdefer   [argwid]               mem)
(GoCall      [argwid]               mem) -> (CALLgo      [argwid]               mem)
(InterCall   [argwid] entry         mem) -> (CALLinter   [argwid] entry         mem)

// remove redundant *const ops
(ADDI [0]  x) -> x