github.com/tetratelabs/wazero@v1.7.3-0.20240513003603-48f702e154b5/internal/engine/wazevo/backend/isa/amd64/machine_vec.go

     1  package amd64
     2  
     3  import (
     4  	"fmt"
     5  
     6  	"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
     7  	"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
     8  )
     9  
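        // swizzleMask is PADDUSB'ed into the selector vector before PSHUFB: an in-range index (0..15)
        // plus 0x70 stays below 0x80 and keeps its low 4 bits, while any out-of-range index (>= 16)
        // ends up with its top bit set (saturating at 0xff), which makes PSHUFB write zero for that
        // lane as Wasm's i8x16.swizzle requires.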
    10  var swizzleMask = [16]byte{
    11  	0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
    12  	0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
    13  }
    14  
    15  func (m *machine) lowerSwizzle(x, y ssa.Value, ret ssa.Value) {
    16  	masklabel := m.getOrAllocateConstLabel(&m.constSwizzleMaskConstIndex, swizzleMask[:])
    17  
    18  	// Load mask to maskReg.
    19  	maskReg := m.c.AllocateVReg(ssa.TypeV128)
    20  	loadMask := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(masklabel)), maskReg)
    21  	m.insert(loadMask)
    22  
    23  	// Copy x and y to tmp registers.
    24  	xx := m.getOperand_Reg(m.c.ValueDefinition(x))
    25  	tmpDst := m.copyToTmp(xx.reg())
    26  	yy := m.getOperand_Reg(m.c.ValueDefinition(y))
    27  	tmpX := m.copyToTmp(yy.reg())
    28  
    29  	m.insert(m.allocateInstr().asXmmRmR(sseOpcodePaddusb, newOperandReg(maskReg), tmpX))
    30  	m.insert(m.allocateInstr().asXmmRmR(sseOpcodePshufb, newOperandReg(tmpX), tmpDst))
    31  
    32  	// Copy the result to the destination register.
    33  	m.copyTo(tmpDst, m.c.VRegOf(ret))
    34  }
    35  
    36  func (m *machine) lowerInsertLane(x, y ssa.Value, index byte, ret ssa.Value, lane ssa.VecLane) {
    37  	// Copy x to tmp.
    38  	tmpDst := m.c.AllocateVReg(ssa.TypeV128)
    39  	m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, m.getOperand_Mem_Reg(m.c.ValueDefinition(x)), tmpDst))
    40  
    41  	yy := m.getOperand_Reg(m.c.ValueDefinition(y))
    42  	switch lane {
    43  	case ssa.VecLaneI8x16:
    44  		m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrb, index, yy, tmpDst))
    45  	case ssa.VecLaneI16x8:
    46  		m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrw, index, yy, tmpDst))
    47  	case ssa.VecLaneI32x4:
    48  		m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrd, index, yy, tmpDst))
    49  	case ssa.VecLaneI64x2:
    50  		m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, index, yy, tmpDst))
    51  	case ssa.VecLaneF32x4:
    52  		// For INSERTPS, the destination lane index is encoded in bits 4 and 5 of the immediate byte.
    53  		// See https://www.felixcloutier.com/x86/insertps
    54  		m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeInsertps, index<<4, yy, tmpDst))
    55  	case ssa.VecLaneF64x2:
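        		// For f64x2, lane 0 is replaced via MOVSD (which copies only the low 64 bits from the source),
        		// and lane 1 via MOVLHPS (which copies the source's low 64 bits into the destination's high half).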
    56  		if index == 0 {
    57  			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovsd, yy, tmpDst))
    58  		} else {
    59  			m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMovlhps, yy, tmpDst))
    60  		}
    61  	default:
    62  		panic(fmt.Sprintf("invalid lane type: %s", lane))
    63  	}
    64  
    65  	m.copyTo(tmpDst, m.c.VRegOf(ret))
    66  }
    67  
    68  func (m *machine) lowerExtractLane(x ssa.Value, index byte, signed bool, ret ssa.Value, lane ssa.VecLane) {
    69  	// Pextr variants are used to extract a lane from a vector register.
    70  	xx := m.getOperand_Reg(m.c.ValueDefinition(x))
    71  
    72  	tmpDst := m.c.AllocateVReg(ret.Type())
    73  	m.insert(m.allocateInstr().asDefineUninitializedReg(tmpDst))
    74  	switch lane {
    75  	case ssa.VecLaneI8x16:
    76  		m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrb, index, xx, tmpDst))
    77  		if signed {
    78  			m.insert(m.allocateInstr().asMovsxRmR(extModeBL, newOperandReg(tmpDst), tmpDst))
    79  		} else {
    80  			m.insert(m.allocateInstr().asMovzxRmR(extModeBL, newOperandReg(tmpDst), tmpDst))
    81  		}
    82  	case ssa.VecLaneI16x8:
    83  		m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrw, index, xx, tmpDst))
    84  		if signed {
    85  			m.insert(m.allocateInstr().asMovsxRmR(extModeWL, newOperandReg(tmpDst), tmpDst))
    86  		} else {
    87  			m.insert(m.allocateInstr().asMovzxRmR(extModeWL, newOperandReg(tmpDst), tmpDst))
    88  		}
    89  	case ssa.VecLaneI32x4:
    90  		m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrd, index, xx, tmpDst))
    91  	case ssa.VecLaneI64x2:
    92  		m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrq, index, xx, tmpDst))
    93  	case ssa.VecLaneF32x4:
    94  		if index == 0 {
    95  			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovss, xx, tmpDst))
    96  		} else {
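        			// PSHUFD with an immediate equal to the index brings lane `index` into lane 0 of tmpDst;
        			// the other lanes are don't-cares for a scalar f32 result.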
    97  			m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, index, xx, tmpDst))
    98  		}
    99  	case ssa.VecLaneF64x2:
   100  		if index == 0 {
   101  			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovsd, xx, tmpDst))
   102  		} else {
   103  			m.copyTo(xx.reg(), tmpDst)
   104  			m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, 0b00_00_11_10, newOperandReg(tmpDst), tmpDst))
   105  		}
   106  	default:
   107  		panic(fmt.Sprintf("invalid lane type: %s", lane))
   108  	}
   109  
   110  	m.copyTo(tmpDst, m.c.VRegOf(ret))
   111  }
   112  
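        // sqmulRoundSat is 0x8000 in each 16-bit lane. PMULHRSW produces 0x8000 only when both inputs are
        // -32768 (the single overflow case of i16x8.q15mulr_sat_s), so comparing each result word against
        // this constant and XOR-ing the comparison mask back flips that 0x8000 into the saturated 0x7fff.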
   113  var sqmulRoundSat = [16]byte{
   114  	0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
   115  	0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
   116  }
   117  
   118  func (m *machine) lowerSqmulRoundSat(x, y, ret ssa.Value) {
   119  	// See https://github.com/WebAssembly/simd/pull/365 for the following logic.
   120  	maskLabel := m.getOrAllocateConstLabel(&m.constSqmulRoundSatIndex, sqmulRoundSat[:])
   121  
   122  	tmp := m.c.AllocateVReg(ssa.TypeV128)
   123  	loadMask := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskLabel)), tmp)
   124  	m.insert(loadMask)
   125  
   126  	xx, yy := m.getOperand_Reg(m.c.ValueDefinition(x)), m.getOperand_Mem_Reg(m.c.ValueDefinition(y))
   127  	tmpX := m.copyToTmp(xx.reg())
   128  
   129  	m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmulhrsw, yy, tmpX))
   130  	m.insert(m.allocateInstr().asXmmRmR(sseOpcodePcmpeqw, newOperandReg(tmpX), tmp))
   131  	m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp), tmpX))
   132  
   133  	m.copyTo(tmpX, m.c.VRegOf(ret))
   134  }
   135  
   136  func (m *machine) lowerVUshr(x, y, ret ssa.Value, lane ssa.VecLane) {
   137  	switch lane {
   138  	case ssa.VecLaneI8x16:
   139  		m.lowerVUshri8x16(x, y, ret)
   140  	case ssa.VecLaneI16x8, ssa.VecLaneI32x4, ssa.VecLaneI64x2:
   141  		m.lowerShr(x, y, ret, lane, false)
   142  	default:
   143  		panic(fmt.Sprintf("invalid lane type: %s", lane))
   144  	}
   145  }
   146  
   147  // i8x16LogicalSHRMaskTable is necessary for emulating the non-existent packed-byte logical right shift on amd64.
   148  // The mask is applied after performing packed word shifts on the value to clear out the unnecessary bits.
   149  var i8x16LogicalSHRMaskTable = [8 * 16]byte{ // (the number of possible shift amounts: 0, 1, ..., 7) * 16 bytes.
   150  	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // for 0 shift
   151  	0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // for 1 shift
   152  	0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, // for 2 shift
   153  	0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, // for 3 shift
   154  	0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, // for 4 shift
   155  	0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, // for 5 shift
   156  	0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, // for 6 shift
   157  	0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, // for 7 shift
   158  }
   159  
   160  func (m *machine) lowerVUshri8x16(x, y, ret ssa.Value) {
   161  	tmpGpReg := m.c.AllocateVReg(ssa.TypeI32)
   162  	// Load the modulo 8 mask (0x7) into tmpGpReg.
   163  	m.lowerIconst(tmpGpReg, 0x7, false)
   164  	// Take the modulo 8 of the shift amount.
   165  	shiftAmt := m.getOperand_Mem_Imm32_Reg(m.c.ValueDefinition(y))
   166  	m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAnd, shiftAmt, tmpGpReg, false))
   167  
   168  	_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
   169  	xx := m.copyToTmp(_xx.reg())
   170  
   171  	vecTmp := m.c.AllocateVReg(ssa.TypeV128)
   172  	m.insert(m.allocateInstr().asGprToXmm(sseOpcodeMovd, newOperandReg(tmpGpReg), vecTmp, false))
   173  	m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsrlw, newOperandReg(vecTmp), xx))
   174  
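        	// PSRLW shifts 16-bit words, so each byte's high bits now contain bits leaked in from its
        	// neighboring byte; the PAND with the mask-table row loaded below clears those top bits,
        	// which emulates a per-byte logical right shift.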
   175  	maskTableLabel := m.getOrAllocateConstLabel(&m.constI8x16LogicalSHRMaskTableIndex, i8x16LogicalSHRMaskTable[:])
   176  	base := m.c.AllocateVReg(ssa.TypeI64)
   177  	lea := m.allocateInstr().asLEA(newOperandLabel(maskTableLabel), base)
   178  	m.insert(lea)
   179  
   180  	// Shift tmpGpReg by 4 to multiply the shift amount by 16.
   181  	m.insert(m.allocateInstr().asShiftR(shiftROpShiftLeft, newOperandImm32(4), tmpGpReg, false))
   182  
   183  	mem := m.newAmodeRegRegShift(0, base, tmpGpReg, 0)
   184  	loadMask := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(mem), vecTmp)
   185  	m.insert(loadMask)
   186  
   187  	m.insert(m.allocateInstr().asXmmRmR(sseOpcodePand, newOperandReg(vecTmp), xx))
   188  	m.copyTo(xx, m.c.VRegOf(ret))
   189  }
   190  
   191  func (m *machine) lowerVSshr(x, y, ret ssa.Value, lane ssa.VecLane) {
   192  	switch lane {
   193  	case ssa.VecLaneI8x16:
   194  		m.lowerVSshri8x16(x, y, ret)
   195  	case ssa.VecLaneI16x8, ssa.VecLaneI32x4:
   196  		m.lowerShr(x, y, ret, lane, true)
   197  	case ssa.VecLaneI64x2:
   198  		m.lowerVSshri64x2(x, y, ret)
   199  	default:
   200  		panic(fmt.Sprintf("invalid lane type: %s", lane))
   201  	}
   202  }
   203  
   204  func (m *machine) lowerVSshri8x16(x, y, ret ssa.Value) {
   205  	shiftAmtReg := m.c.AllocateVReg(ssa.TypeI32)
   206  	// Load the modulo 8 mask (0x7) into shiftAmtReg.
   207  	m.lowerIconst(shiftAmtReg, 0x7, false)
   208  	// Take the modulo 8 of the shift amount.
   209  	shiftAmt := m.getOperand_Mem_Imm32_Reg(m.c.ValueDefinition(y))
   210  	m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAnd, shiftAmt, shiftAmtReg, false))
   211  
   212  	// Copy the x value to two temporary registers.
   213  	_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
   214  	xx := m.copyToTmp(_xx.reg())
   215  	vecTmp := m.c.AllocateVReg(ssa.TypeV128)
   216  	m.copyTo(xx, vecTmp)
   217  
   218  	// Assuming that we have
   219  	//  xx   = [b1, ..., b16]
   220  	//  vecTmp = [b1, ..., b16]
   221  	// at this point, then we use PUNPCKLBW and PUNPCKHBW to produce:
   222  	//  xx   = [b1, b1, b2, b2, ..., b8, b8]
   223  	//  vecTmp = [b9, b9, b10, b10, ..., b16, b16]
   224  	m.insert(m.allocateInstr().asXmmRmR(sseOpcodePunpcklbw, newOperandReg(xx), xx))
   225  	m.insert(m.allocateInstr().asXmmRmR(sseOpcodePunpckhbw, newOperandReg(vecTmp), vecTmp))
   226  
   227  	// Add 8 to the shift amount (the extra 8 drops the duplicated low byte of each word), then move it into vecTmp2.
   228  	vecTmp2 := m.c.AllocateVReg(ssa.TypeV128)
   229  	m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAdd, newOperandImm32(8), shiftAmtReg, false))
   230  	m.insert(m.allocateInstr().asGprToXmm(sseOpcodeMovd, newOperandReg(shiftAmtReg), vecTmp2, false))
   231  
   232  	// Perform the packed-word arithmetic right shifts on xx and vecTmp.
   233  	// This changes these two registers as:
   234  	//  xx   = [xxx, b1 >> s, xxx, b2 >> s, ..., xxx, b8 >> s]
   235  	//  vecTmp = [xxx, b9 >> s, xxx, b10 >> s, ..., xxx, b16 >> s]
   236  	// where xxx is the sign-extension byte (0x00 or 0xff) of each byte, and ">>" is the arithmetic shift on a byte.
   237  	m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsraw, newOperandReg(vecTmp2), xx))
   238  	m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsraw, newOperandReg(vecTmp2), vecTmp))
   239  
   240  	// Finally, we can get the result by packing these two word vectors.
   241  	m.insert(m.allocateInstr().asXmmRmR(sseOpcodePacksswb, newOperandReg(vecTmp), xx))
   242  
   243  	m.copyTo(xx, m.c.VRegOf(ret))
   244  }
   245  
   246  func (m *machine) lowerVSshri64x2(x, y, ret ssa.Value) {
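        	// SSE/SSE4.1 has no packed 64-bit arithmetic right shift (VPSRAQ requires AVX-512), so each
        	// lane is extracted into a GPR, shifted with SAR, and inserted back.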
   247  	// Load the shift amount to RCX.
   248  	shiftAmt := m.getOperand_Mem_Reg(m.c.ValueDefinition(y))
   249  	m.insert(m.allocateInstr().asMovzxRmR(extModeBQ, shiftAmt, rcxVReg))
   250  
   251  	tmpGp := m.c.AllocateVReg(ssa.TypeI64)
   252  
   253  	_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
   254  	xxReg := m.copyToTmp(_xx.reg())
   255  
   256  	m.insert(m.allocateInstr().asDefineUninitializedReg(tmpGp))
   257  	m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrq, 0, newOperandReg(xxReg), tmpGp))
   258  	m.insert(m.allocateInstr().asShiftR(shiftROpShiftRightArithmetic, newOperandReg(rcxVReg), tmpGp, true))
   259  	m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 0, newOperandReg(tmpGp), xxReg))
   260  	m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrq, 1, newOperandReg(xxReg), tmpGp))
   261  	m.insert(m.allocateInstr().asShiftR(shiftROpShiftRightArithmetic, newOperandReg(rcxVReg), tmpGp, true))
   262  	m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 1, newOperandReg(tmpGp), xxReg))
   263  
   264  	m.copyTo(xxReg, m.c.VRegOf(ret))
   265  }
   266  
   267  func (m *machine) lowerShr(x, y, ret ssa.Value, lane ssa.VecLane, signed bool) {
   268  	var modulo uint64
   269  	var shiftOp sseOpcode
   270  	switch lane {
   271  	case ssa.VecLaneI16x8:
   272  		modulo = 0xf
   273  		if signed {
   274  			shiftOp = sseOpcodePsraw
   275  		} else {
   276  			shiftOp = sseOpcodePsrlw
   277  		}
   278  	case ssa.VecLaneI32x4:
   279  		modulo = 0x1f
   280  		if signed {
   281  			shiftOp = sseOpcodePsrad
   282  		} else {
   283  			shiftOp = sseOpcodePsrld
   284  		}
   285  	case ssa.VecLaneI64x2:
   286  		modulo = 0x3f
   287  		if signed {
   288  			panic("BUG")
   289  		}
   290  		shiftOp = sseOpcodePsrlq
   291  	default:
   292  		panic(fmt.Sprintf("invalid lane type: %s", lane))
   293  	}
   294  
   295  	_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
   296  	xx := m.copyToTmp(_xx.reg())
   297  
   298  	tmpGpReg := m.c.AllocateVReg(ssa.TypeI32)
   299  	// Load the shift-amount mask (lane bit-width - 1) into tmpGpReg.
   300  	m.lowerIconst(tmpGpReg, modulo, false)
   301  	// Take the shift amount modulo the lane bit-width.
   302  	m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAnd,
   303  		m.getOperand_Mem_Imm32_Reg(m.c.ValueDefinition(y)), tmpGpReg, false))
   304  	// And move it to an xmm register.
   305  	tmpVec := m.c.AllocateVReg(ssa.TypeV128)
   306  	m.insert(m.allocateInstr().asGprToXmm(sseOpcodeMovd, newOperandReg(tmpGpReg), tmpVec, false))
   307  
   308  	// Then do the actual shift.
   309  	m.insert(m.allocateInstr().asXmmRmiReg(shiftOp, newOperandReg(tmpVec), xx))
   310  
   311  	m.copyTo(xx, m.c.VRegOf(ret))
   312  }
   313  
   314  func (m *machine) lowerVIshl(x, y, ret ssa.Value, lane ssa.VecLane) {
   315  	var modulo uint64
   316  	var shiftOp sseOpcode
   317  	var isI8x16 bool
   318  	switch lane {
   319  	case ssa.VecLaneI8x16:
   320  		isI8x16 = true
   321  		modulo = 0x7
   322  		shiftOp = sseOpcodePsllw
   323  	case ssa.VecLaneI16x8:
   324  		modulo = 0xf
   325  		shiftOp = sseOpcodePsllw
   326  	case ssa.VecLaneI32x4:
   327  		modulo = 0x1f
   328  		shiftOp = sseOpcodePslld
   329  	case ssa.VecLaneI64x2:
   330  		modulo = 0x3f
   331  		shiftOp = sseOpcodePsllq
   332  	default:
   333  		panic(fmt.Sprintf("invalid lane type: %s", lane))
   334  	}
   335  
   336  	_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
   337  	xx := m.copyToTmp(_xx.reg())
   338  
   339  	tmpGpReg := m.c.AllocateVReg(ssa.TypeI32)
   340  	// Load the shift-amount mask (lane bit-width - 1) into tmpGpReg.
   341  	m.lowerIconst(tmpGpReg, modulo, false)
   342  	// Take the shift amount modulo the lane bit-width.
   343  	m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAnd,
   344  		m.getOperand_Mem_Imm32_Reg(m.c.ValueDefinition(y)), tmpGpReg, false))
   345  	// And move it to an xmm register.
   346  	tmpVec := m.c.AllocateVReg(ssa.TypeV128)
   347  	m.insert(m.allocateInstr().asGprToXmm(sseOpcodeMovd, newOperandReg(tmpGpReg), tmpVec, false))
   348  
   349  	// Then do the actual shift.
   350  	m.insert(m.allocateInstr().asXmmRmiReg(shiftOp, newOperandReg(tmpVec), xx))
   351  
   352  	if isI8x16 {
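        		// For i8x16, the PSLLW above shifted 16-bit words, so the low byte's top bits spilled into
        		// the neighboring high byte; AND-ing with the i8x16SHLMaskTable row for this shift amount
        		// clears the low bits of every byte, removing the spill.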
   353  		maskTableLabel := m.getOrAllocateConstLabel(&m.constI8x16SHLMaskTableIndex, i8x16SHLMaskTable[:])
   354  		base := m.c.AllocateVReg(ssa.TypeI64)
   355  		lea := m.allocateInstr().asLEA(newOperandLabel(maskTableLabel), base)
   356  		m.insert(lea)
   357  
   358  		// Shift tmpGpReg by 4 to multiply the shift amount by 16.
   359  		m.insert(m.allocateInstr().asShiftR(shiftROpShiftLeft, newOperandImm32(4), tmpGpReg, false))
   360  
   361  		mem := m.newAmodeRegRegShift(0, base, tmpGpReg, 0)
   362  		loadMask := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(mem), tmpVec)
   363  		m.insert(loadMask)
   364  
   365  		m.insert(m.allocateInstr().asXmmRmR(sseOpcodePand, newOperandReg(tmpVec), xx))
   366  	}
   367  
   368  	m.copyTo(xx, m.c.VRegOf(ret))
   369  }
   370  
   371  // i8x16SHLMaskTable is necessary for emulating the non-existent packed-byte left shift on amd64.
   372  // The mask is applied after performing packed word shifts on the value to clear out the unnecessary bits.
   373  var i8x16SHLMaskTable = [8 * 16]byte{ // (the number of possible shift amounts: 0, 1, ..., 7) * 16 bytes.
   374  	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // for 0 shift
   375  	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, // for 1 shift
   376  	0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, // for 2 shift
   377  	0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, // for 3 shift
   378  	0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // for 4 shift
   379  	0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, // for 5 shift
   380  	0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, // for 6 shift
   381  	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, // for 7 shift
   382  }
   383  
   384  func (m *machine) lowerVRound(x, ret ssa.Value, imm byte, _64 bool) {
   385  	xx := m.getOperand_Mem_Reg(m.c.ValueDefinition(x))
   386  	var round sseOpcode
   387  	if _64 {
   388  		round = sseOpcodeRoundpd
   389  	} else {
   390  		round = sseOpcodeRoundps
   391  	}
   392  	m.insert(m.allocateInstr().asXmmUnaryRmRImm(round, imm, xx, m.c.VRegOf(ret)))
   393  }
   394  
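        // allOnesI8x16 and allOnesI16x8 are the "all lanes = 1" multiplicands used with PMADDUBSW/PMADDWD
        // to turn a multiply-add into a plain pairwise addition. The two extAddPairwiseI16x8u masks bias
        // each unsigned word by -0x8000 before PMADDWD and add 2*0x8000 = 0x10000 back per i32 lane afterwards.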
   395  var (
   396  	allOnesI8x16              = [16]byte{0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1}
   397  	allOnesI16x8              = [16]byte{0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0}
   398  	extAddPairwiseI16x8uMask1 = [16]byte{0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80}
   399  	extAddPairwiseI16x8uMask2 = [16]byte{0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00}
   400  )
   401  
   402  func (m *machine) lowerExtIaddPairwise(x, ret ssa.Value, srcLane ssa.VecLane, signed bool) {
   403  	_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
   404  	xx := m.copyToTmp(_xx.reg())
   405  	switch srcLane {
   406  	case ssa.VecLaneI8x16:
   407  		allOneReg := m.c.AllocateVReg(ssa.TypeV128)
   408  		mask := m.getOrAllocateConstLabel(&m.constAllOnesI8x16Index, allOnesI8x16[:])
   409  		m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(mask)), allOneReg))
   410  
   411  		var resultReg regalloc.VReg
   412  		if signed {
   413  			resultReg = allOneReg
   414  			m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaddubsw, newOperandReg(xx), resultReg))
   415  		} else {
   416  			// PMADDUBSW interprets the destination's bytes (xx) as unsigned and the source's (the all-ones vector) as signed, so the multiply-add is effectively an unsigned pairwise addition.
   417  			resultReg = xx
   418  			m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaddubsw, newOperandReg(allOneReg), resultReg))
   419  		}
   420  		m.copyTo(resultReg, m.c.VRegOf(ret))
   421  
   422  	case ssa.VecLaneI16x8:
   423  		if signed {
   424  			allOnesReg := m.c.AllocateVReg(ssa.TypeV128)
   425  			mask := m.getOrAllocateConstLabel(&m.constAllOnesI16x8Index, allOnesI16x8[:])
   426  			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(mask)), allOnesReg))
   427  			m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaddwd, newOperandReg(allOnesReg), xx))
   428  			m.copyTo(xx, m.c.VRegOf(ret))
   429  		} else {
   430  			maskReg := m.c.AllocateVReg(ssa.TypeV128)
   431  			mask := m.getOrAllocateConstLabel(&m.constExtAddPairwiseI16x8uMask1Index, extAddPairwiseI16x8uMask1[:])
   432  			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(mask)), maskReg))
   433  
   434  			// Flip the sign bit of each 16-bit lane on xx (i.e. XOR with 0x8000).
   435  			//
   436  			// Assuming that xx = [w1, ..., w8], now we have
   437  			// 	xx[i] = int16(wi - 0x8000) for i = 1...8
   438  			m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(maskReg), xx))
   439  
   440  			mask = m.getOrAllocateConstLabel(&m.constAllOnesI16x8Index, allOnesI16x8[:])
   441  			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(mask)), maskReg))
   442  
   443  			// For i = 0,...,3 (as this results in i32x4 lanes), PMADDWD against the all-ones words gives
   444  			// 	xx[i] = int32(w(2i+1) - 0x8000) + int32(w(2i+2) - 0x8000)
   445  			//        = int32(w(2i+1) + w(2i+2)) - 0x10000
   446  			m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaddwd, newOperandReg(maskReg), xx))
   447  
   448  			mask = m.getOrAllocateConstLabel(&m.constExtAddPairwiseI16x8uMask2Index, extAddPairwiseI16x8uMask2[:])
   449  			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(mask)), maskReg))
   450  
   451  			// Finally, add 0x10000 back to each i32 lane:
   452  			// 	xx[i] = int32(w(2i+1) + w(2i+2)) - 0x10000 + 0x10000 = uint32(w(2i+1) + w(2i+2)).
   453  			m.insert(m.allocateInstr().asXmmRmR(sseOpcodePaddd, newOperandReg(maskReg), xx))
   454  
   455  			m.copyTo(xx, m.c.VRegOf(ret))
   456  		}
   457  	default:
   458  		panic(fmt.Sprintf("invalid lane type: %s", srcLane))
   459  	}
   460  }
   461  
   462  func (m *machine) lowerWidenLow(x, ret ssa.Value, lane ssa.VecLane, signed bool) {
   463  	var sseOp sseOpcode
   464  	switch lane {
   465  	case ssa.VecLaneI8x16:
   466  		if signed {
   467  			sseOp = sseOpcodePmovsxbw
   468  		} else {
   469  			sseOp = sseOpcodePmovzxbw
   470  		}
   471  	case ssa.VecLaneI16x8:
   472  		if signed {
   473  			sseOp = sseOpcodePmovsxwd
   474  		} else {
   475  			sseOp = sseOpcodePmovzxwd
   476  		}
   477  	case ssa.VecLaneI32x4:
   478  		if signed {
   479  			sseOp = sseOpcodePmovsxdq
   480  		} else {
   481  			sseOp = sseOpcodePmovzxdq
   482  		}
   483  	default:
   484  		panic(fmt.Sprintf("invalid lane type: %s", lane))
   485  	}
   486  
   487  	xx := m.getOperand_Mem_Reg(m.c.ValueDefinition(x))
   488  	m.insert(m.allocateInstr().asXmmUnaryRmR(sseOp, xx, m.c.VRegOf(ret)))
   489  }
   490  
   491  func (m *machine) lowerWidenHigh(x, ret ssa.Value, lane ssa.VecLane, signed bool) {
   492  	tmp := m.c.AllocateVReg(ssa.TypeV128)
   493  	xx := m.getOperand_Reg(m.c.ValueDefinition(x))
   494  	m.copyTo(xx.reg(), tmp)
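        	// PALIGNR by 8 rotates the upper 64 bits of tmp into its lower half, so the PMOVSX/PMOVZX below
        	// widens the high lanes of x.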
   495  	m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePalignr, 8, newOperandReg(tmp), tmp))
   496  
   497  	var sseOp sseOpcode
   498  	switch lane {
   499  	case ssa.VecLaneI8x16:
   500  		if signed {
   501  			sseOp = sseOpcodePmovsxbw
   502  		} else {
   503  			sseOp = sseOpcodePmovzxbw
   504  		}
   505  	case ssa.VecLaneI16x8:
   506  		if signed {
   507  			sseOp = sseOpcodePmovsxwd
   508  		} else {
   509  			sseOp = sseOpcodePmovzxwd
   510  		}
   511  	case ssa.VecLaneI32x4:
   512  		if signed {
   513  			sseOp = sseOpcodePmovsxdq
   514  		} else {
   515  			sseOp = sseOpcodePmovzxdq
   516  		}
   517  	default:
   518  		panic(fmt.Sprintf("invalid lane type: %s", lane))
   519  	}
   520  
   521  	m.insert(m.allocateInstr().asXmmUnaryRmR(sseOp, newOperandReg(tmp), m.c.VRegOf(ret)))
   522  }
   523  
   524  func (m *machine) lowerLoadSplat(ptr ssa.Value, offset uint32, ret ssa.Value, lane ssa.VecLane) {
   525  	tmpDst, tmpGp := m.c.AllocateVReg(ssa.TypeV128), m.c.AllocateVReg(ssa.TypeI64)
   526  	am := newOperandMem(m.lowerToAddressMode(ptr, offset))
   527  
   528  	m.insert(m.allocateInstr().asDefineUninitializedReg(tmpDst))
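        	// The scalar is loaded into a GPR, inserted into lane 0 of tmpDst, and then broadcast:
        	// bytes via PSHUFB with an all-zero selector, words by also filling lane 1 and then PSHUFD(0),
        	// dwords via PSHUFD(0), and qwords by inserting the value into both 64-bit lanes.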
   529  	switch lane {
   530  	case ssa.VecLaneI8x16:
   531  		m.insert(m.allocateInstr().asMovzxRmR(extModeBQ, am, tmpGp))
   532  		m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrb, 0, newOperandReg(tmpGp), tmpDst))
   533  		tmpZeroVec := m.c.AllocateVReg(ssa.TypeV128)
   534  		m.insert(m.allocateInstr().asZeros(tmpZeroVec))
   535  		m.insert(m.allocateInstr().asXmmRmR(sseOpcodePshufb, newOperandReg(tmpZeroVec), tmpDst))
   536  	case ssa.VecLaneI16x8:
   537  		m.insert(m.allocateInstr().asMovzxRmR(extModeWQ, am, tmpGp))
   538  		m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrw, 0, newOperandReg(tmpGp), tmpDst))
   539  		m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrw, 1, newOperandReg(tmpGp), tmpDst))
   540  		m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, 0, newOperandReg(tmpDst), tmpDst))
   541  	case ssa.VecLaneI32x4:
   542  		m.insert(m.allocateInstr().asMovzxRmR(extModeLQ, am, tmpGp))
   543  		m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrd, 0, newOperandReg(tmpGp), tmpDst))
   544  		m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, 0, newOperandReg(tmpDst), tmpDst))
   545  	case ssa.VecLaneI64x2:
   546  		m.insert(m.allocateInstr().asMov64MR(am, tmpGp))
   547  		m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 0, newOperandReg(tmpGp), tmpDst))
   548  		m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 1, newOperandReg(tmpGp), tmpDst))
   549  	default:
   550  		panic(fmt.Sprintf("invalid lane type: %s", lane))
   551  	}
   552  
   553  	m.copyTo(tmpDst, m.c.VRegOf(ret))
   554  }
   555  
   556  var f64x2CvtFromIMask = [16]byte{
   557  	0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   558  }
   559  
   560  func (m *machine) lowerVFcvtFromInt(x, ret ssa.Value, lane ssa.VecLane, signed bool) {
   561  	switch lane {
   562  	case ssa.VecLaneF32x4:
   563  		if signed {
   564  			xx := m.getOperand_Reg(m.c.ValueDefinition(x))
   565  			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtdq2ps, xx, m.c.VRegOf(ret)))
   566  		} else {
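        			// CVTDQ2PS interprets its input as signed i32, so each unsigned lane x is split into
        			// 16-bit halves, lo = x & 0xffff and hi = x & 0xffff_0000, and converted as
        			// 	float32(x) = float32(lo) + 2 * float32(hi >> 1)
        			// where every step below is exact except for the final addition.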
   567  			xx := m.getOperand_Reg(m.c.ValueDefinition(x))
   568  			// Copy the value to two temporary registers.
   569  			tmp := m.copyToTmp(xx.reg())
   570  			tmp2 := m.copyToTmp(xx.reg())
   571  
   572  			// Clear the higher 16 bits of each 32-bit element, keeping only the lower 16 bits in tmp.
   573  			m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePslld, newOperandImm32(0x10), tmp))
   574  			m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsrld, newOperandImm32(0x10), tmp))
   575  
   576  			// Subtract the lower 16 bits (tmp) from tmp2, so that tmp2 keeps only the higher 16 bits.
   577  			m.insert(m.allocateInstr().asXmmRmR(sseOpcodePsubd, newOperandReg(tmp), tmp2))
   578  
   579  			// Convert the lower 16-bits in tmp.
   580  			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtdq2ps, newOperandReg(tmp), tmp))
   581  
   582  			// Logically shift tmp2 right by one (halving it into the signed range) and convert it, so that tmp2 holds half of the conversion result of the higher 16 bits.
   583  			m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsrld, newOperandImm32(1), tmp2))
   584  			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtdq2ps, newOperandReg(tmp2), tmp2))
   585  
   586  			// Double it to recover the conversion result of the higher 16 bits.
   587  			m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAddps, newOperandReg(tmp2), tmp2))
   588  
   589  			// Get the final result by adding tmp (holding the lower 16-bit conversion) into tmp2.
   590  			m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAddps, newOperandReg(tmp), tmp2))
   591  
   592  			m.copyTo(tmp2, m.c.VRegOf(ret))
   593  		}
   594  	case ssa.VecLaneF64x2:
   595  		if signed {
   596  			xx := m.getOperand_Mem_Reg(m.c.ValueDefinition(x))
   597  			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtdq2pd, xx, m.c.VRegOf(ret)))
   598  		} else {
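        			// This uses the 2^52 bias trick: placing a uint32 value in the low 32 bits of a float64
        			// whose upper bits encode 0x1.0p52 yields the number 0x1.0p52 + value exactly, so
        			// subtracting 0x1.0p52 afterwards leaves float64(value) with no rounding.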
   599  			maskReg := m.c.AllocateVReg(ssa.TypeV128)
   600  			maskLabel := m.getOrAllocateConstLabel(&m.constF64x2CvtFromIMaskIndex, f64x2CvtFromIMask[:])
   601  			// maskReg = [0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00]
   602  			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskLabel)), maskReg))
   603  
   604  			_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
   605  			xx := m.copyToTmp(_xx.reg())
   606  
   607  			// Given that we have xx = [d1, d2, d3, d4], this results in
   608  			//	xx = [d1, [0x00, 0x00, 0x30, 0x43], d2, [0x00, 0x00, 0x30, 0x43]]
   609  			//     = [float64(uint32(d1)) + 0x1.0p52, float64(uint32(d2)) + 0x1.0p52]
   610  			//     ^See https://stackoverflow.com/questions/13269523/can-all-32-bit-ints-be-exactly-represented-as-a-double
   611  			m.insert(m.allocateInstr().asXmmRmR(sseOpcodeUnpcklps, newOperandReg(maskReg), xx))
   612  
   613  			// maskReg = [float64(0x1.0p52), float64(0x1.0p52)]
   614  			maskLabel = m.getOrAllocateConstLabel(&m.constTwop52Index, twop52[:])
   615  			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskLabel)), maskReg))
   616  
   617  			// Now, we get the result as
   618  			// 	xx = [float64(uint32(d1)), float64(uint32(d2))]
   619  			// because the following equality always holds:
   620  			//  (0x1.0p52 + float64(uint32(x))) - 0x1.0p52 = float64(uint32(x)), exactly, for any uint32 x.
   621  			m.insert(m.allocateInstr().asXmmRmR(sseOpcodeSubpd, newOperandReg(maskReg), xx))
   622  
   623  			m.copyTo(xx, m.c.VRegOf(ret))
   624  		}
   625  	default:
   626  		panic(fmt.Sprintf("invalid lane type: %s", lane))
   627  	}
   628  }
   629  
   630  var (
   631  	// i32sMaxOnF64x2 holds math.MaxInt32(=2147483647.0) on two f64 lanes.
   632  	i32sMaxOnF64x2 = [16]byte{
   633  		0x00, 0x00, 0xc0, 0xff, 0xff, 0xff, 0xdf, 0x41, // float64(2147483647.0)
   634  		0x00, 0x00, 0xc0, 0xff, 0xff, 0xff, 0xdf, 0x41, // float64(2147483647.0)
   635  	}
   636  
   637  	// i32uMaxOnF64x2 holds math.MaxUint32(=4294967295.0) on two f64 lanes.
   638  	i32uMaxOnF64x2 = [16]byte{
   639  		0x00, 0x00, 0xe0, 0xff, 0xff, 0xff, 0xef, 0x41, // float64(4294967295.0)
   640  		0x00, 0x00, 0xe0, 0xff, 0xff, 0xff, 0xef, 0x41, // float64(4294967295.0)
   641  	}
   642  
   643  	// twop52 holds two float64(0x1.0p52) values on two f64 lanes. 0x1.0p52 is special in the sense that,
   644  	// with this exponent, the low bits of the mantissa directly represent a corresponding uint32 number,
   645  	// and after arithmetic such as addition or subtraction, the resulting floating point still holds
   646  	// exactly the same 32-bit integer bit representation in its mantissa.
   647  	//
   648  	// Note: the name twop52 is common across various compiler ecosystems.
   649  	// 	E.g. https://github.com/llvm/llvm-project/blob/92ab024f81e5b64e258b7c3baaf213c7c26fcf40/compiler-rt/lib/builtins/floatdidf.c#L28
   650  	// 	E.g. https://opensource.apple.com/source/clang/clang-425.0.24/src/projects/compiler-rt/lib/floatdidf.c.auto.html
   651  	twop52 = [16]byte{
   652  		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, // float64(0x1.0p52)
   653  		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, // float64(0x1.0p52)
   654  	}
   655  )
   656  
   657  func (m *machine) lowerVFcvtToIntSat(x, ret ssa.Value, lane ssa.VecLane, signed bool) {
   658  	_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
   659  	xx := m.copyToTmp(_xx.reg())
   660  
   661  	switch lane {
   662  	case ssa.VecLaneF32x4:
   663  		if signed {
   664  			tmp := m.copyToTmp(xx)
   665  
   666  			// Assuming we have xx = [v1, v2, v3, v4].
   667  			//
   668  			// Set all bits if lane is not NaN on tmp.
   669  			// tmp[i] = 0xffffffff  if vi != NaN
   670  			//        = 0           if vi == NaN
   671  			m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeCmpps, uint8(cmpPredEQ_OQ), newOperandReg(tmp), tmp))
   672  
   673  			// Clear NaN lanes on xx, meaning that
   674  			// 	xx[i] = vi  if vi != NaN
   675  			//	        0   if vi == NaN
   676  			m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAndps, newOperandReg(tmp), xx))
   677  
   678  			// tmp[i] = ^vi         if vi != NaN
   679  			//        = 0           if vi == NaN
   680  			// which means that tmp[i] & 0x80000000 != 0 if and only if vi is positive (or +0.0).
   681  			m.insert(m.allocateInstr().asXmmRmR(sseOpcodeXorps, newOperandReg(xx), tmp))
   682  
   683  			// xx[i] = int32(vi)   if vi != NaN and xx is not overflowing.
   684  			//       = 0x80000000  if vi != NaN and xx is overflowing (See https://www.felixcloutier.com/x86/cvttps2dq)
   685  			//       = 0           if vi == NaN
   686  			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvttps2dq, newOperandReg(xx), xx))
   687  
   688  			// Below, we have to convert 0x80000000 into 0x7FFFFFFF for positive overflowing lanes.
   689  			//
   690  			// tmp[i] has the sign bit (0x80000000) set   if vi is positive and the conversion overflowed,
   691  			//        has the sign bit cleared            otherwise (negative, zero, in-range, or NaN lanes).
   692  			m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAndps, newOperandReg(xx), tmp))
   693  
   694  			// Arithmetic right shift of tmp by 31 gives
   695  			// tmp[i] = 0xffffffff if vi is positive and overflowed, 0 otherwise.
   696  			m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsrad, newOperandImm32(0x1f), tmp))
   697  
   698  			// Flip 0x80000000 into 0x7FFFFFFF on such lanes; all other lanes are kept intact.
   699  			m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp), xx))
   700  		} else {
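        			// There is no packed unsigned float->int conversion below AVX-512, so this follows the
        			// common two-halves recipe (roughly, per 32-bit lane):
        			//  1. MAXPS with zero turns NaN and negative lanes into 0.
        			//  2. tmp is set to cvtdq2ps(0x7fffffff), which rounds to 2147483648.0 (= 2^31).
        			//  3. CVTTPS2DQ on xx covers lanes below 2^31 (larger lanes become 0x80000000).
        			//  4. tmp2 = CVTTPS2DQ(xx - 2^31) covers lanes at or above 2^31: lanes beyond the uint32
        			//     range are flipped to 0x7fffffff via the CMPPS mask and PXOR, and lanes below 2^31
        			//     are clamped to 0 by PMAXSD.
        			//  5. PADDD adds the two halves, giving the saturated unsigned result per lane.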
   701  			tmp := m.c.AllocateVReg(ssa.TypeV128)
   702  			m.insert(m.allocateInstr().asZeros(tmp))
   703  			m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMaxps, newOperandReg(tmp), xx))
   704  			m.insert(m.allocateInstr().asXmmRmR(sseOpcodePcmpeqd, newOperandReg(tmp), tmp))
   705  			m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsrld, newOperandImm32(0x1), tmp))
   706  			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtdq2ps, newOperandReg(tmp), tmp))
   707  			tmp2 := m.copyToTmp(xx)
   708  			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvttps2dq, newOperandReg(xx), xx))
   709  			m.insert(m.allocateInstr().asXmmRmR(sseOpcodeSubps, newOperandReg(tmp), tmp2))
   710  			m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeCmpps, uint8(cmpPredLE_OS), newOperandReg(tmp2), tmp))
   711  			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvttps2dq, newOperandReg(tmp2), tmp2))
   712  			m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp), tmp2))
   713  			m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp), tmp))
   714  			m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaxsd, newOperandReg(tmp), tmp2))
   715  			m.insert(m.allocateInstr().asXmmRmR(sseOpcodePaddd, newOperandReg(tmp2), xx))
   716  		}
   717  
   718  	case ssa.VecLaneF64x2:
   719  		tmp2 := m.c.AllocateVReg(ssa.TypeV128)
   720  		if signed {
   721  			tmp := m.copyToTmp(xx)
   722  
   723  			// Set all bits for non-NaN lanes, zeros otherwise.
   724  			// I.e. tmp[i] = 0xffffffff_ffffffff if vi != NaN, 0 otherwise.
   725  			m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeCmppd, uint8(cmpPredEQ_OQ), newOperandReg(tmp), tmp))
   726  
   727  			maskLabel := m.getOrAllocateConstLabel(&m.constI32sMaxOnF64x2Index, i32sMaxOnF64x2[:])
   728  			// Load the 2147483647 into tmp2's each lane.
   729  			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskLabel)), tmp2))
   730  
   731  			// tmp[i] = 2147483647 if vi != NaN, 0 otherwise.
   732  			m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAndps, newOperandReg(tmp2), tmp))
   733  
   734  			// MINPD returns the second (source) operand when either input is NaN, so we have
   735  			//  xx[i] = min(vi, 2147483647)   if vi != NaN
   736  			//        = 0                     if vi == NaN
   737  			m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMinpd, newOperandReg(tmp), xx))
   738  
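        			// CVTTPD2DQ truncates the two f64 lanes into the two lower i32 lanes and zeroes the upper
        			// two lanes, which matches the zero-extended result shape that i32x4.trunc_sat_f64x2_s_zero requires.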
   739  			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvttpd2dq, newOperandReg(xx), xx))
   740  		} else {
   741  			tmp := m.c.AllocateVReg(ssa.TypeV128)
   742  			m.insert(m.allocateInstr().asZeros(tmp))
   743  
   744  			//  xx[i] = vi   if vi != NaN && vi > 0
   745  			//        = 0    if vi == NaN || vi <= 0
   746  			m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMaxpd, newOperandReg(tmp), xx))
   747  
   748  			// tmp2[i] = float64(math.MaxUint32) (= 4294967295.0, exactly representable).
   749  			maskIndex := m.getOrAllocateConstLabel(&m.constI32uMaxOnF64x2Index, i32uMaxOnF64x2[:])
   750  			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskIndex)), tmp2))
   751  
   752  			// xx[i] = vi   if vi != NaN && vi > 0 && vi <= math.MaxUint32
   753  			//       = 0    otherwise
   754  			m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMinpd, newOperandReg(tmp2), xx))
   755  
   756  			// Truncate the floating points toward zero (ROUNDPD with immediate 0x3).
   757  			m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeRoundpd, 0x3, newOperandReg(xx), xx))
   758  
   759  			// tmp2[i] = float64(0x1.0p52)
   760  			maskIndex = m.getOrAllocateConstLabel(&m.constTwop52Index, twop52[:])
   761  			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskIndex)), tmp2))
   762  
   763  			// xx[i] = float64(0x1.0p52) + float64(uint32(vi)) if vi != NaN && vi > 0 && vi <= math.MaxUint32
   764  			//       = 0                                       otherwise
   765  			//
   766  			// This means that xx[i] holds exactly the same bit of uint32(vi) in its lower 32-bits.
   767  			m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAddpd, newOperandReg(tmp2), xx))
   768  
   769  			// At this point, we have
   770  			// 	xx  = [uint32(v0), 0x43300000, uint32(v1), 0x43300000]  (0x43300000 is the high dword of 0x1.0p52 + value)
   771  			//  tmp = [0, 0, 0, 0]
   772  			// as 32x4 lanes. Therefore, SHUFPS with 0b00_00_10_00 results in
   773  			//	xx = [xx[00], xx[10], tmp[00], tmp[00]] = [xx[00], xx[10], 0, 0]
   774  			// meaning that for i = 0 and 1, we have
   775  			//  xx[i] = uint32(vi) if vi != NaN && vi > 0 && vi <= math.MaxUint32
   776  			//        = 0          otherwise.
   777  			m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeShufps, 0b00_00_10_00, newOperandReg(tmp), xx))
   778  		}
   779  	default:
   780  		panic(fmt.Sprintf("invalid lane type: %s", lane))
   781  	}
   782  
   783  	m.copyTo(xx, m.c.VRegOf(ret))
   784  }
   785  
   786  func (m *machine) lowerNarrow(x, y, ret ssa.Value, lane ssa.VecLane, signed bool) {
   787  	_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
   788  	xx := m.copyToTmp(_xx.reg())
   789  	yy := m.getOperand_Mem_Reg(m.c.ValueDefinition(y))
   790  
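        	// PACKSSWB/PACKUSWB/PACKSSDW/PACKUSDW narrow the destination (x) into the low half of the result
        	// and the source (y) into the high half, saturating each lane, which matches Wasm's narrow semantics.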
   791  	var sseOp sseOpcode
   792  	switch lane {
   793  	case ssa.VecLaneI16x8:
   794  		if signed {
   795  			sseOp = sseOpcodePacksswb
   796  		} else {
   797  			sseOp = sseOpcodePackuswb
   798  		}
   799  	case ssa.VecLaneI32x4:
   800  		if signed {
   801  			sseOp = sseOpcodePackssdw
   802  		} else {
   803  			sseOp = sseOpcodePackusdw
   804  		}
   805  	default:
   806  		panic(fmt.Sprintf("invalid lane type: %s", lane))
   807  	}
   808  	m.insert(m.allocateInstr().asXmmRmR(sseOp, yy, xx))
   809  	m.copyTo(xx, m.c.VRegOf(ret))
   810  }
   811  
   812  func (m *machine) lowerWideningPairwiseDotProductS(x, y, ret ssa.Value) {
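        	// PMADDWD multiplies pairs of signed 16-bit lanes and adds each pair of 32-bit products,
        	// which is exactly i32x4.dot_i16x8_s.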
   813  	_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
   814  	xx := m.copyToTmp(_xx.reg())
   815  	yy := m.getOperand_Mem_Reg(m.c.ValueDefinition(y))
   816  	m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaddwd, yy, xx))
   817  	m.copyTo(xx, m.c.VRegOf(ret))
   818  }
   819  
   820  func (m *machine) lowerVIabs(instr *ssa.Instruction) {
   821  	x, lane := instr.ArgWithLane()
   822  	rd := m.c.VRegOf(instr.Return())
   823  
   824  	if lane == ssa.VecLaneI64x2 {
   825  		_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
   826  
   827  		blendReg := xmm0VReg
   828  		m.insert(m.allocateInstr().asDefineUninitializedReg(blendReg))
   829  
   830  		tmp := m.copyToTmp(_xx.reg())
   831  		xx := m.copyToTmp(_xx.reg())
   832  
   833  		// Clear all bits on blendReg.
   834  		m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(blendReg), blendReg))
   835  		// Subtract xx from blendReg, i.e. blendReg = -xx.
   836  		m.insert(m.allocateInstr().asXmmRmR(sseOpcodePsubq, newOperandReg(xx), blendReg))
   837  		// Copy the negated value back into xx.
   838  		m.copyTo(blendReg, xx)
   839  
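        		// BLENDVPD picks, per 64-bit lane, the original value (tmp) where the sign bit of xmm0 (= -x)
        		// is set, i.e. where x was positive, and keeps -x otherwise, yielding |x|.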
   840  		m.insert(m.allocateInstr().asBlendvpd(newOperandReg(tmp), xx))
   841  
   842  		m.copyTo(xx, rd)
   843  	} else {
   844  		var vecOp sseOpcode
   845  		switch lane {
   846  		case ssa.VecLaneI8x16:
   847  			vecOp = sseOpcodePabsb
   848  		case ssa.VecLaneI16x8:
   849  			vecOp = sseOpcodePabsw
   850  		case ssa.VecLaneI32x4:
   851  			vecOp = sseOpcodePabsd
   852  		}
   853  		rn := m.getOperand_Reg(m.c.ValueDefinition(x))
   854  
   855  		i := m.allocateInstr()
   856  		i.asXmmUnaryRmR(vecOp, rn, rd)
   857  		m.insert(i)
   858  	}
   859  }
   860  
   861  func (m *machine) lowerVIpopcnt(instr *ssa.Instruction) {
   862  	x := instr.Arg()
   863  	rn := m.getOperand_Reg(m.c.ValueDefinition(x))
   864  	rd := m.c.VRegOf(instr.Return())
   865  
   866  	tmp1 := m.c.AllocateVReg(ssa.TypeV128)
   867  	m.lowerVconst(tmp1, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f)
   868  
   869  	// Copy input into tmp2.
   870  	tmp2 := m.copyToTmp(rn.reg())
   871  
   872  	// Given that we have:
   873  	//  rn = [b1, ..., b16] where bn = hn:ln and hn and ln are the higher and lower 4 bits of bn.
   874  	//
   875  	// Take PAND on tmp1 and tmp2, so that we mask out all the higher bits.
   876  	//  tmp2 = [l1, ..., l16].
   877  	pand := m.allocateInstr()
   878  	pand.asXmmRmR(sseOpcodePand, newOperandReg(tmp1), tmp2)
   879  	m.insert(pand)
   880  
   881  	// Copy rn into tmp3, do a logical (packed word) right shift by 4 on it, and PAND against the mask (tmp1), so that we have
   882  	//  tmp3 = [h1, ..., h16].
   883  	tmp3 := m.copyToTmp(rn.reg())
   884  	psrlw := m.allocateInstr()
   885  	psrlw.asXmmRmiReg(sseOpcodePsrlw, newOperandImm32(4), tmp3)
   886  	m.insert(psrlw)
   887  
   888  	pand2 := m.allocateInstr()
   889  	pand2.asXmmRmR(sseOpcodePand, newOperandReg(tmp1), tmp3)
   890  	m.insert(pand2)
   891  
   892  	// Read the popcntTable into tmp4, and we have
   893  	//  tmp4 = [0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04]
   894  	tmp4 := m.c.AllocateVReg(ssa.TypeV128)
   895  	m.lowerVconst(tmp4, 0x03_02_02_01_02_01_01_00, 0x04_03_03_02_03_02_02_01)
   896  
   897  	// Make a copy for later.
   898  	tmp5 := m.copyToTmp(tmp4)
   899  
   900  	//  tmp4 = [popcnt(l1), ..., popcnt(l16)].
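        	// PSHUFB uses each byte of the selector (tmp2/tmp3, holding nibbles 0..15) as an index into the
        	// 16-byte table, i.e. it acts as a per-byte lookup of the nibble's popcount.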
   901  	pshufb := m.allocateInstr()
   902  	pshufb.asXmmRmR(sseOpcodePshufb, newOperandReg(tmp2), tmp4)
   903  	m.insert(pshufb)
   904  
   905  	pshufb2 := m.allocateInstr()
   906  	pshufb2.asXmmRmR(sseOpcodePshufb, newOperandReg(tmp3), tmp5)
   907  	m.insert(pshufb2)
   908  
   909  	// tmp4 + tmp5 is the result.
   910  	paddb := m.allocateInstr()
   911  	paddb.asXmmRmR(sseOpcodePaddb, newOperandReg(tmp4), tmp5)
   912  	m.insert(paddb)
   913  
   914  	m.copyTo(tmp5, rd)
   915  }
   916  
   917  func (m *machine) lowerVImul(instr *ssa.Instruction) {
   918  	x, y, lane := instr.Arg2WithLane()
   919  	rd := m.c.VRegOf(instr.Return())
   920  	if lane == ssa.VecLaneI64x2 {
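        		// There is no packed 64x64 -> 64-bit multiply below AVX-512 (VPMULLQ), so the product is
        		// assembled from 32-bit partial products with PMULUDQ, as described below.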
   921  		rn := m.getOperand_Reg(m.c.ValueDefinition(x))
   922  		rm := m.getOperand_Reg(m.c.ValueDefinition(y))
   923  		// Assuming that we have
   924  		//	rn = [p1, p2] = [p1_lo, p1_high, p2_lo, p2_high]
   925  		//  rm = [q1, q2] = [q1_lo, q1_high, q2_lo, q2_high]
   926  		// where pN and qN are 64-bit (quad word) lanes, and pN_lo, pN_high, qN_lo and qN_high are 32-bit (double word) lanes.
   927  
   928  		// Copy rn into tmp1.
   929  		tmp1 := m.copyToTmp(rn.reg())
   930  
   931  		// And do the logical right shift by 32 bits on tmp1, which makes tmp1 = [p1_high, 0, p2_high, 0]
   932  		shift := m.allocateInstr()
   933  		shift.asXmmRmiReg(sseOpcodePsrlq, newOperandImm32(32), tmp1)
   934  		m.insert(shift)
   935  
   936  		// Execute "pmuludq rm,tmp1", which makes tmp1 = [p1_high*q1_lo, p2_high*q2_lo] where each lane is 64-bit.
   937  		mul := m.allocateInstr()
   938  		mul.asXmmRmR(sseOpcodePmuludq, rm, tmp1)
   939  		m.insert(mul)
   940  
   941  		// Copy rm value into tmp2.
   942  		tmp2 := m.copyToTmp(rm.reg())
   943  
   944  		// And do the logical right shift by 32 bits on tmp2, which makes tmp2 = [q1_high, 0, q2_high, 0]
   945  		shift2 := m.allocateInstr()
   946  		shift2.asXmmRmiReg(sseOpcodePsrlq, newOperandImm32(32), tmp2)
   947  		m.insert(shift2)
   948  
   949  		// Execute "pmuludq rn,tmp2", which makes tmp2 = [p1_lo*q1_high, p2_lo*q2_high] where each lane is 64-bit.
   950  		mul2 := m.allocateInstr()
   951  		mul2.asXmmRmR(sseOpcodePmuludq, rn, tmp2)
   952  		m.insert(mul2)
   953  
   954  		// Add tmp1 and tmp2, then do the logical left shift by 32 bits,
   955  		// which makes tmp1 = [(p1_lo*q1_high+p1_high*q1_lo)<<32, (p2_lo*q2_high+p2_high*q2_lo)<<32]
   956  		add := m.allocateInstr()
   957  		add.asXmmRmR(sseOpcodePaddq, newOperandReg(tmp2), tmp1)
   958  		m.insert(add)
   959  
   960  		shift3 := m.allocateInstr()
   961  		shift3.asXmmRmiReg(sseOpcodePsllq, newOperandImm32(32), tmp1)
   962  		m.insert(shift3)
   963  
   964  		// Copy rm value into tmp3.
   965  		tmp3 := m.copyToTmp(rm.reg())
   966  
   967  		// "pmuludq rn,tmp3" makes tmp3 = [p1_lo*q1_lo, p2_lo*q2_lo] where each lane is 64-bit.
   968  		mul3 := m.allocateInstr()
   969  		mul3.asXmmRmR(sseOpcodePmuludq, rn, tmp3)
   970  		m.insert(mul3)
   971  
   972  		// Finally, we get the result by computing tmp1 + tmp3,
   973  		// which makes tmp1 = [(p1_lo*q1_high+p1_high*q1_lo)<<32+p1_lo*q1_lo, (p2_lo*q2_high+p2_high*q2_lo)<<32+p2_lo*q2_lo]
   974  		add2 := m.allocateInstr()
   975  		add2.asXmmRmR(sseOpcodePaddq, newOperandReg(tmp3), tmp1)
   976  		m.insert(add2)
   977  
   978  		m.copyTo(tmp1, rd)
   979  
   980  	} else {
   981  		var vecOp sseOpcode
   982  		switch lane {
   983  		case ssa.VecLaneI16x8:
   984  			vecOp = sseOpcodePmullw
   985  		case ssa.VecLaneI32x4:
   986  			vecOp = sseOpcodePmulld
   987  		default:
   988  			panic("unsupported: " + lane.String())
   989  		}
   990  		m.lowerVbBinOp(vecOp, x, y, instr.Return())
   991  	}
   992  }