github.com/tetratelabs/wazero@v1.7.3-0.20240513003603-48f702e154b5/internal/engine/wazevo/backend/isa/amd64/machine_vec.go

package amd64

import (
	"fmt"

	"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
	"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
)

var swizzleMask = [16]byte{
	0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
	0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
}

func (m *machine) lowerSwizzle(x, y ssa.Value, ret ssa.Value) {
	masklabel := m.getOrAllocateConstLabel(&m.constSwizzleMaskConstIndex, swizzleMask[:])

	// Load mask to maskReg.
	maskReg := m.c.AllocateVReg(ssa.TypeV128)
	loadMask := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(masklabel)), maskReg)
	m.insert(loadMask)

	// Copy x and y to tmp registers.
	xx := m.getOperand_Reg(m.c.ValueDefinition(x))
	tmpDst := m.copyToTmp(xx.reg())
	yy := m.getOperand_Reg(m.c.ValueDefinition(y))
	tmpX := m.copyToTmp(yy.reg())

	m.insert(m.allocateInstr().asXmmRmR(sseOpcodePaddusb, newOperandReg(maskReg), tmpX))
	m.insert(m.allocateInstr().asXmmRmR(sseOpcodePshufb, newOperandReg(tmpX), tmpDst))

	// Copy the result to the destination register.
	m.copyTo(tmpDst, m.c.VRegOf(ret))
}

func (m *machine) lowerInsertLane(x, y ssa.Value, index byte, ret ssa.Value, lane ssa.VecLane) {
	// Copy x to tmp.
	tmpDst := m.c.AllocateVReg(ssa.TypeV128)
	m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, m.getOperand_Mem_Reg(m.c.ValueDefinition(x)), tmpDst))

	yy := m.getOperand_Reg(m.c.ValueDefinition(y))
	switch lane {
	case ssa.VecLaneI8x16:
		m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrb, index, yy, tmpDst))
	case ssa.VecLaneI16x8:
		m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrw, index, yy, tmpDst))
	case ssa.VecLaneI32x4:
		m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrd, index, yy, tmpDst))
	case ssa.VecLaneI64x2:
		m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, index, yy, tmpDst))
	case ssa.VecLaneF32x4:
		// In the INSERTPS instruction, the destination index is encoded in bits 4 and 5 of the immediate.
		// See https://www.felixcloutier.com/x86/insertps
		m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeInsertps, index<<4, yy, tmpDst))
	case ssa.VecLaneF64x2:
		if index == 0 {
			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovsd, yy, tmpDst))
		} else {
			m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMovlhps, yy, tmpDst))
		}
	default:
		panic(fmt.Sprintf("invalid lane type: %s", lane))
	}

	m.copyTo(tmpDst, m.c.VRegOf(ret))
}

func (m *machine) lowerExtractLane(x ssa.Value, index byte, signed bool, ret ssa.Value, lane ssa.VecLane) {
	// Pextr variants are used to extract a lane from a vector register.
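	// For the i8x16 and i16x8 lanes, the extracted value is then sign- or zero-extended to 32 bits, depending on the signed flag.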
	xx := m.getOperand_Reg(m.c.ValueDefinition(x))

	tmpDst := m.c.AllocateVReg(ret.Type())
	m.insert(m.allocateInstr().asDefineUninitializedReg(tmpDst))
	switch lane {
	case ssa.VecLaneI8x16:
		m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrb, index, xx, tmpDst))
		if signed {
			m.insert(m.allocateInstr().asMovsxRmR(extModeBL, newOperandReg(tmpDst), tmpDst))
		} else {
			m.insert(m.allocateInstr().asMovzxRmR(extModeBL, newOperandReg(tmpDst), tmpDst))
		}
	case ssa.VecLaneI16x8:
		m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrw, index, xx, tmpDst))
		if signed {
			m.insert(m.allocateInstr().asMovsxRmR(extModeWL, newOperandReg(tmpDst), tmpDst))
		} else {
			m.insert(m.allocateInstr().asMovzxRmR(extModeWL, newOperandReg(tmpDst), tmpDst))
		}
	case ssa.VecLaneI32x4:
		m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrd, index, xx, tmpDst))
	case ssa.VecLaneI64x2:
		m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrq, index, xx, tmpDst))
	case ssa.VecLaneF32x4:
		if index == 0 {
			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovss, xx, tmpDst))
		} else {
			m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, index, xx, tmpDst))
		}
	case ssa.VecLaneF64x2:
		if index == 0 {
			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovsd, xx, tmpDst))
		} else {
			m.copyTo(xx.reg(), tmpDst)
			m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, 0b00_00_11_10, newOperandReg(tmpDst), tmpDst))
		}
	default:
		panic(fmt.Sprintf("invalid lane type: %s", lane))
	}

	m.copyTo(tmpDst, m.c.VRegOf(ret))
}

var sqmulRoundSat = [16]byte{
	0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
	0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
}

func (m *machine) lowerSqmulRoundSat(x, y, ret ssa.Value) {
	// See https://github.com/WebAssembly/simd/pull/365 for the following logic.
	maskLabel := m.getOrAllocateConstLabel(&m.constSqmulRoundSatIndex, sqmulRoundSat[:])

	tmp := m.c.AllocateVReg(ssa.TypeV128)
	loadMask := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskLabel)), tmp)
	m.insert(loadMask)

	xx, yy := m.getOperand_Reg(m.c.ValueDefinition(x)), m.getOperand_Mem_Reg(m.c.ValueDefinition(y))
	tmpX := m.copyToTmp(xx.reg())

	m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmulhrsw, yy, tmpX))
	m.insert(m.allocateInstr().asXmmRmR(sseOpcodePcmpeqd, newOperandReg(tmpX), tmp))
	m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp), tmpX))

	m.copyTo(tmpX, m.c.VRegOf(ret))
}

func (m *machine) lowerVUshr(x, y, ret ssa.Value, lane ssa.VecLane) {
	switch lane {
	case ssa.VecLaneI8x16:
		m.lowerVUshri8x16(x, y, ret)
	case ssa.VecLaneI16x8, ssa.VecLaneI32x4, ssa.VecLaneI64x2:
		m.lowerShr(x, y, ret, lane, false)
	default:
		panic(fmt.Sprintf("invalid lane type: %s", lane))
	}
}

// i8x16LogicalSHRMaskTable is necessary for emulating non-existent packed bytes logical right shifts on amd64.
// The mask is applied after performing packed word shifts on the value to clear out the unnecessary bits.
var i8x16LogicalSHRMaskTable = [8 * 16]byte{ // (the number of possible shift amounts 0, 1, ..., 7) * 16 bytes.
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // for 0 shift
	0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // for 1 shift
	0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, // for 2 shift
	0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, // for 3 shift
	0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, // for 4 shift
	0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, // for 5 shift
	0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, // for 6 shift
	0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, // for 7 shift
}

func (m *machine) lowerVUshri8x16(x, y, ret ssa.Value) {
	tmpGpReg := m.c.AllocateVReg(ssa.TypeI32)
	// Load the modulo-8 mask (0x7) into tmpGpReg.
	m.lowerIconst(tmpGpReg, 0x7, false)
	// Take the shift amount modulo 8.
	shiftAmt := m.getOperand_Mem_Imm32_Reg(m.c.ValueDefinition(y))
	m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAnd, shiftAmt, tmpGpReg, false))

	_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
	xx := m.copyToTmp(_xx.reg())

	vecTmp := m.c.AllocateVReg(ssa.TypeV128)
	m.insert(m.allocateInstr().asGprToXmm(sseOpcodeMovd, newOperandReg(tmpGpReg), vecTmp, false))
	m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsrlw, newOperandReg(vecTmp), xx))

	maskTableLabel := m.getOrAllocateConstLabel(&m.constI8x16LogicalSHRMaskTableIndex, i8x16LogicalSHRMaskTable[:])
	base := m.c.AllocateVReg(ssa.TypeI64)
	lea := m.allocateInstr().asLEA(newOperandLabel(maskTableLabel), base)
	m.insert(lea)

	// Shift tmpGpReg left by 4 to multiply the shift amount by 16.
	m.insert(m.allocateInstr().asShiftR(shiftROpShiftLeft, newOperandImm32(4), tmpGpReg, false))

	mem := m.newAmodeRegRegShift(0, base, tmpGpReg, 0)
	loadMask := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(mem), vecTmp)
	m.insert(loadMask)

	m.insert(m.allocateInstr().asXmmRmR(sseOpcodePand, newOperandReg(vecTmp), xx))
	m.copyTo(xx, m.c.VRegOf(ret))
}

func (m *machine) lowerVSshr(x, y, ret ssa.Value, lane ssa.VecLane) {
	switch lane {
	case ssa.VecLaneI8x16:
		m.lowerVSshri8x16(x, y, ret)
	case ssa.VecLaneI16x8, ssa.VecLaneI32x4:
		m.lowerShr(x, y, ret, lane, true)
	case ssa.VecLaneI64x2:
		m.lowerVSshri64x2(x, y, ret)
	default:
		panic(fmt.Sprintf("invalid lane type: %s", lane))
	}
}

func (m *machine) lowerVSshri8x16(x, y, ret ssa.Value) {
	shiftAmtReg := m.c.AllocateVReg(ssa.TypeI32)
	// Load the modulo-8 mask (0x7) into shiftAmtReg.
	m.lowerIconst(shiftAmtReg, 0x7, false)
	// Take the shift amount modulo 8.
	shiftAmt := m.getOperand_Mem_Imm32_Reg(m.c.ValueDefinition(y))
	m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAnd, shiftAmt, shiftAmtReg, false))

	// Copy the x value to two temporary registers.
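	// amd64 has no packed arithmetic right shift at byte granularity, so below the bytes are
	// widened to words, shifted arithmetically as words, and then re-packed into bytes.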
	_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
	xx := m.copyToTmp(_xx.reg())
	vecTmp := m.c.AllocateVReg(ssa.TypeV128)
	m.copyTo(xx, vecTmp)

	// Assuming that we have
	//	xx     = [b1, ..., b16]
	//	vecTmp = [b1, ..., b16]
	// at this point, we use PUNPCKLBW and PUNPCKHBW to produce:
	//	xx     = [b1, b1, b2, b2, ..., b8, b8]
	//	vecTmp = [b9, b9, b10, b10, ..., b16, b16]
	m.insert(m.allocateInstr().asXmmRmR(sseOpcodePunpcklbw, newOperandReg(xx), xx))
	m.insert(m.allocateInstr().asXmmRmR(sseOpcodePunpckhbw, newOperandReg(vecTmp), vecTmp))

	// Add 8 to the shift amount, then move it to vecTmp2.
	vecTmp2 := m.c.AllocateVReg(ssa.TypeV128)
	m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAdd, newOperandImm32(8), shiftAmtReg, false))
	m.insert(m.allocateInstr().asGprToXmm(sseOpcodeMovd, newOperandReg(shiftAmtReg), vecTmp2, false))

	// Perform the packed word arithmetic right shifts on xx and vecTmp.
	// This changes these two registers as:
	//	xx     = [xxx, b1 >> s, xxx, b2 >> s, ..., xxx, b8 >> s]
	//	vecTmp = [xxx, b9 >> s, xxx, b10 >> s, ..., xxx, b16 >> s]
	// where xxx is 0xff or 0x00 depending on each byte's sign, and ">>" is the arithmetic shift on a byte.
	m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsraw, newOperandReg(vecTmp2), xx))
	m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsraw, newOperandReg(vecTmp2), vecTmp))

	// Finally, we can get the result by packing these two word vectors.
	m.insert(m.allocateInstr().asXmmRmR(sseOpcodePacksswb, newOperandReg(vecTmp), xx))

	m.copyTo(xx, m.c.VRegOf(ret))
}

func (m *machine) lowerVSshri64x2(x, y, ret ssa.Value) {
	// Load the shift amount to RCX.
	shiftAmt := m.getOperand_Mem_Reg(m.c.ValueDefinition(y))
	m.insert(m.allocateInstr().asMovzxRmR(extModeBQ, shiftAmt, rcxVReg))

	tmpGp := m.c.AllocateVReg(ssa.TypeI64)

	_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
	xxReg := m.copyToTmp(_xx.reg())

	m.insert(m.allocateInstr().asDefineUninitializedReg(tmpGp))
	m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrq, 0, newOperandReg(xxReg), tmpGp))
	m.insert(m.allocateInstr().asShiftR(shiftROpShiftRightArithmetic, newOperandReg(rcxVReg), tmpGp, true))
	m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 0, newOperandReg(tmpGp), xxReg))
	m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrq, 1, newOperandReg(xxReg), tmpGp))
	m.insert(m.allocateInstr().asShiftR(shiftROpShiftRightArithmetic, newOperandReg(rcxVReg), tmpGp, true))
	m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 1, newOperandReg(tmpGp), xxReg))

	m.copyTo(xxReg, m.c.VRegOf(ret))
}

func (m *machine) lowerShr(x, y, ret ssa.Value, lane ssa.VecLane, signed bool) {
	var modulo uint64
	var shiftOp sseOpcode
	switch lane {
	case ssa.VecLaneI16x8:
		modulo = 0xf
		if signed {
			shiftOp = sseOpcodePsraw
		} else {
			shiftOp = sseOpcodePsrlw
		}
	case ssa.VecLaneI32x4:
		modulo = 0x1f
		if signed {
			shiftOp = sseOpcodePsrad
		} else {
			shiftOp = sseOpcodePsrld
		}
	case ssa.VecLaneI64x2:
		modulo = 0x3f
		if signed {
			panic("BUG")
		}
		shiftOp = sseOpcodePsrlq
	default:
		panic(fmt.Sprintf("invalid lane type: %s", lane))
	}

	_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
	xx := m.copyToTmp(_xx.reg())

	tmpGpReg := m.c.AllocateVReg(ssa.TypeI32)
	// Load the modulo mask (lane width - 1) into tmpGpReg.
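	// For example, for I32x4 the mask is 0x1f, so a shift amount of 35 is reduced to 35&0x1f = 3,
	// matching Wasm's rule that vector shift amounts are taken modulo the lane width.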
	m.lowerIconst(tmpGpReg, modulo, false)
	// Take the shift amount modulo the lane width.
	m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAnd,
		m.getOperand_Mem_Imm32_Reg(m.c.ValueDefinition(y)), tmpGpReg, false))
	// And move it to an xmm register.
	tmpVec := m.c.AllocateVReg(ssa.TypeV128)
	m.insert(m.allocateInstr().asGprToXmm(sseOpcodeMovd, newOperandReg(tmpGpReg), tmpVec, false))

	// Then do the actual shift.
	m.insert(m.allocateInstr().asXmmRmiReg(shiftOp, newOperandReg(tmpVec), xx))

	m.copyTo(xx, m.c.VRegOf(ret))
}

func (m *machine) lowerVIshl(x, y, ret ssa.Value, lane ssa.VecLane) {
	var modulo uint64
	var shiftOp sseOpcode
	var isI8x16 bool
	switch lane {
	case ssa.VecLaneI8x16:
		isI8x16 = true
		modulo = 0x7
		shiftOp = sseOpcodePsllw
	case ssa.VecLaneI16x8:
		modulo = 0xf
		shiftOp = sseOpcodePsllw
	case ssa.VecLaneI32x4:
		modulo = 0x1f
		shiftOp = sseOpcodePslld
	case ssa.VecLaneI64x2:
		modulo = 0x3f
		shiftOp = sseOpcodePsllq
	default:
		panic(fmt.Sprintf("invalid lane type: %s", lane))
	}

	_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
	xx := m.copyToTmp(_xx.reg())

	tmpGpReg := m.c.AllocateVReg(ssa.TypeI32)
	// Load the modulo mask (lane width - 1) into tmpGpReg.
	m.lowerIconst(tmpGpReg, modulo, false)
	// Take the shift amount modulo the lane width.
	m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAnd,
		m.getOperand_Mem_Imm32_Reg(m.c.ValueDefinition(y)), tmpGpReg, false))
	// And move it to an xmm register.
	tmpVec := m.c.AllocateVReg(ssa.TypeV128)
	m.insert(m.allocateInstr().asGprToXmm(sseOpcodeMovd, newOperandReg(tmpGpReg), tmpVec, false))

	// Then do the actual shift.
	m.insert(m.allocateInstr().asXmmRmiReg(shiftOp, newOperandReg(tmpVec), xx))

	if isI8x16 {
		maskTableLabel := m.getOrAllocateConstLabel(&m.constI8x16SHLMaskTableIndex, i8x16SHLMaskTable[:])
		base := m.c.AllocateVReg(ssa.TypeI64)
		lea := m.allocateInstr().asLEA(newOperandLabel(maskTableLabel), base)
		m.insert(lea)

		// Shift tmpGpReg left by 4 to multiply the shift amount by 16.
		m.insert(m.allocateInstr().asShiftR(shiftROpShiftLeft, newOperandImm32(4), tmpGpReg, false))

		mem := m.newAmodeRegRegShift(0, base, tmpGpReg, 0)
		loadMask := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(mem), tmpVec)
		m.insert(loadMask)

		m.insert(m.allocateInstr().asXmmRmR(sseOpcodePand, newOperandReg(tmpVec), xx))
	}

	m.copyTo(xx, m.c.VRegOf(ret))
}

// i8x16SHLMaskTable is necessary for emulating non-existent packed bytes left shifts on amd64.
// The mask is applied after performing packed word shifts on the value to clear out the unnecessary bits.
var i8x16SHLMaskTable = [8 * 16]byte{ // (the number of possible shift amounts 0, 1, ..., 7) * 16 bytes.
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // for 0 shift
	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, // for 1 shift
	0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, // for 2 shift
	0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, // for 3 shift
	0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // for 4 shift
	0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, // for 5 shift
	0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, // for 6 shift
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, // for 7 shift
}

func (m *machine) lowerVRound(x, ret ssa.Value, imm byte, _64 bool) {
	xx := m.getOperand_Mem_Reg(m.c.ValueDefinition(x))
	var round sseOpcode
	if _64 {
		round = sseOpcodeRoundpd
	} else {
		round = sseOpcodeRoundps
	}
	m.insert(m.allocateInstr().asXmmUnaryRmRImm(round, imm, xx, m.c.VRegOf(ret)))
}

var (
	allOnesI8x16              = [16]byte{0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1}
	allOnesI16x8              = [16]byte{0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0}
	extAddPairwiseI16x8uMask1 = [16]byte{0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80}
	extAddPairwiseI16x8uMask2 = [16]byte{0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00}
)

func (m *machine) lowerExtIaddPairwise(x, ret ssa.Value, srcLane ssa.VecLane, signed bool) {
	_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
	xx := m.copyToTmp(_xx.reg())
	switch srcLane {
	case ssa.VecLaneI8x16:
		allOneReg := m.c.AllocateVReg(ssa.TypeV128)
		mask := m.getOrAllocateConstLabel(&m.constAllOnesI8x16Index, allOnesI8x16[:])
		m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(mask)), allOneReg))

		var resultReg regalloc.VReg
		if signed {
			resultReg = allOneReg
			m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaddubsw, newOperandReg(xx), resultReg))
		} else {
			// Use allOneReg (all ones) as the signed-byte operand of PMADDUBSW, so that the bytes of xx
			// are treated as unsigned and the multiply-add is effectively unsigned.
			resultReg = xx
			m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaddubsw, newOperandReg(allOneReg), resultReg))
		}
		m.copyTo(resultReg, m.c.VRegOf(ret))

	case ssa.VecLaneI16x8:
		if signed {
			allOnesReg := m.c.AllocateVReg(ssa.TypeV128)
			mask := m.getOrAllocateConstLabel(&m.constAllOnesI16x8Index, allOnesI16x8[:])
			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(mask)), allOnesReg))
			m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaddwd, newOperandReg(allOnesReg), xx))
			m.copyTo(xx, m.c.VRegOf(ret))
		} else {
			maskReg := m.c.AllocateVReg(ssa.TypeV128)
			mask := m.getOrAllocateConstLabel(&m.constExtAddPairwiseI16x8uMask1Index, extAddPairwiseI16x8uMask1[:])
			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(mask)), maskReg))

			// Flip the sign bits on xx.
435 // 436 // Assuming that xx = [w1, ..., w8], now we have, 437 // xx[i] = int8(-w1) for i = 0...8 438 m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(maskReg), xx)) 439 440 mask = m.getOrAllocateConstLabel(&m.constAllOnesI16x8Index, allOnesI16x8[:]) 441 m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(mask)), maskReg)) 442 443 // For i = 0,..4 (as this results in i32x4 lanes), now we have 444 // xx[i] = int32(-wn + -w(n+1)) = int32(-(wn + w(n+1))) 445 // c.assembler.CompileRegisterToRegister(amd64.PMADDWD, tmp, vr) 446 m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaddwd, newOperandReg(maskReg), xx)) 447 448 mask = m.getOrAllocateConstLabel(&m.constExtAddPairwiseI16x8uMask2Index, extAddPairwiseI16x8uMask2[:]) 449 m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(mask)), maskReg)) 450 451 // vr[i] = int32(-(wn + w(n+1))) + int32(math.MaxInt16+1) = int32((wn + w(n+1))) = uint32(wn + w(n+1)). 452 // c.assembler.CompileRegisterToRegister(amd64.PADDD, tmp, vr) 453 m.insert(m.allocateInstr().asXmmRmR(sseOpcodePaddd, newOperandReg(maskReg), xx)) 454 455 m.copyTo(xx, m.c.VRegOf(ret)) 456 } 457 default: 458 panic(fmt.Sprintf("invalid lane type: %s", srcLane)) 459 } 460 } 461 462 func (m *machine) lowerWidenLow(x, ret ssa.Value, lane ssa.VecLane, signed bool) { 463 var sseOp sseOpcode 464 switch lane { 465 case ssa.VecLaneI8x16: 466 if signed { 467 sseOp = sseOpcodePmovsxbw 468 } else { 469 sseOp = sseOpcodePmovzxbw 470 } 471 case ssa.VecLaneI16x8: 472 if signed { 473 sseOp = sseOpcodePmovsxwd 474 } else { 475 sseOp = sseOpcodePmovzxwd 476 } 477 case ssa.VecLaneI32x4: 478 if signed { 479 sseOp = sseOpcodePmovsxdq 480 } else { 481 sseOp = sseOpcodePmovzxdq 482 } 483 default: 484 panic(fmt.Sprintf("invalid lane type: %s", lane)) 485 } 486 487 xx := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) 488 m.insert(m.allocateInstr().asXmmUnaryRmR(sseOp, xx, m.c.VRegOf(ret))) 489 } 490 491 func (m *machine) lowerWidenHigh(x, ret ssa.Value, lane ssa.VecLane, signed bool) { 492 tmp := m.c.AllocateVReg(ssa.TypeV128) 493 xx := m.getOperand_Reg(m.c.ValueDefinition(x)) 494 m.copyTo(xx.reg(), tmp) 495 m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePalignr, 8, newOperandReg(tmp), tmp)) 496 497 var sseOp sseOpcode 498 switch lane { 499 case ssa.VecLaneI8x16: 500 if signed { 501 sseOp = sseOpcodePmovsxbw 502 } else { 503 sseOp = sseOpcodePmovzxbw 504 } 505 case ssa.VecLaneI16x8: 506 if signed { 507 sseOp = sseOpcodePmovsxwd 508 } else { 509 sseOp = sseOpcodePmovzxwd 510 } 511 case ssa.VecLaneI32x4: 512 if signed { 513 sseOp = sseOpcodePmovsxdq 514 } else { 515 sseOp = sseOpcodePmovzxdq 516 } 517 default: 518 panic(fmt.Sprintf("invalid lane type: %s", lane)) 519 } 520 521 m.insert(m.allocateInstr().asXmmUnaryRmR(sseOp, newOperandReg(tmp), m.c.VRegOf(ret))) 522 } 523 524 func (m *machine) lowerLoadSplat(ptr ssa.Value, offset uint32, ret ssa.Value, lane ssa.VecLane) { 525 tmpDst, tmpGp := m.c.AllocateVReg(ssa.TypeV128), m.c.AllocateVReg(ssa.TypeI64) 526 am := newOperandMem(m.lowerToAddressMode(ptr, offset)) 527 528 m.insert(m.allocateInstr().asDefineUninitializedReg(tmpDst)) 529 switch lane { 530 case ssa.VecLaneI8x16: 531 m.insert(m.allocateInstr().asMovzxRmR(extModeBQ, am, tmpGp)) 532 m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrb, 0, newOperandReg(tmpGp), tmpDst)) 533 tmpZeroVec := m.c.AllocateVReg(ssa.TypeV128) 534 m.insert(m.allocateInstr().asZeros(tmpZeroVec)) 535 
		m.insert(m.allocateInstr().asXmmRmR(sseOpcodePshufb, newOperandReg(tmpZeroVec), tmpDst))
	case ssa.VecLaneI16x8:
		m.insert(m.allocateInstr().asMovzxRmR(extModeWQ, am, tmpGp))
		m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrw, 0, newOperandReg(tmpGp), tmpDst))
		m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrw, 1, newOperandReg(tmpGp), tmpDst))
		m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, 0, newOperandReg(tmpDst), tmpDst))
	case ssa.VecLaneI32x4:
		m.insert(m.allocateInstr().asMovzxRmR(extModeLQ, am, tmpGp))
		m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrd, 0, newOperandReg(tmpGp), tmpDst))
		m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, 0, newOperandReg(tmpDst), tmpDst))
	case ssa.VecLaneI64x2:
		m.insert(m.allocateInstr().asMov64MR(am, tmpGp))
		m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 0, newOperandReg(tmpGp), tmpDst))
		m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 1, newOperandReg(tmpGp), tmpDst))
	default:
		panic(fmt.Sprintf("invalid lane type: %s", lane))
	}

	m.copyTo(tmpDst, m.c.VRegOf(ret))
}

var f64x2CvtFromIMask = [16]byte{
	0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
}

func (m *machine) lowerVFcvtFromInt(x, ret ssa.Value, lane ssa.VecLane, signed bool) {
	switch lane {
	case ssa.VecLaneF32x4:
		if signed {
			xx := m.getOperand_Reg(m.c.ValueDefinition(x))
			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtdq2ps, xx, m.c.VRegOf(ret)))
		} else {
			xx := m.getOperand_Reg(m.c.ValueDefinition(x))
			// Copy the value to two temporary registers.
			tmp := m.copyToTmp(xx.reg())
			tmp2 := m.copyToTmp(xx.reg())

			// Clear the upper bits of each 32-bit element, so that tmp holds only the lower bits.
			m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePslld, newOperandImm32(0xa), tmp))
			m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsrld, newOperandImm32(0xa), tmp))

			// Subtract the lower bits (tmp) from tmp2, so that tmp2 holds only the upper bits.
			m.insert(m.allocateInstr().asXmmRmR(sseOpcodePsubd, newOperandReg(tmp), tmp2))

			// Convert the lower bits held in tmp.
			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtdq2ps, newOperandReg(tmp), tmp))

			// Shift tmp2 right by one and convert it, so that tmp2 holds half of the conversion result of the upper bits.
			m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsrld, newOperandImm32(1), tmp2))
			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtdq2ps, newOperandReg(tmp2), tmp2))

			// Double it to recover the conversion result of the upper bits.
			m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAddps, newOperandReg(tmp2), tmp2))

			// Get the final result by adding tmp (the conversion of the lower bits) into tmp2.
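			// Both partial values are exactly representable in float32, so this single ADDPS is the
			// only rounding step and yields the correctly rounded conversion of the original uint32.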
			m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAddps, newOperandReg(tmp), tmp2))

			m.copyTo(tmp2, m.c.VRegOf(ret))
		}
	case ssa.VecLaneF64x2:
		if signed {
			xx := m.getOperand_Mem_Reg(m.c.ValueDefinition(x))
			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtdq2pd, xx, m.c.VRegOf(ret)))
		} else {
			maskReg := m.c.AllocateVReg(ssa.TypeV128)
			maskLabel := m.getOrAllocateConstLabel(&m.constF64x2CvtFromIMaskIndex, f64x2CvtFromIMask[:])
			// maskReg = [0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00]
			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskLabel)), maskReg))

			_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
			xx := m.copyToTmp(_xx.reg())

			// Given that we have xx = [d1, d2, d3, d4], this results in
			//	xx = [d1, [0x00, 0x00, 0x30, 0x43], d2, [0x00, 0x00, 0x30, 0x43]]
			//	   = [float64(uint32(d1)) + 0x1.0p52, float64(uint32(d2)) + 0x1.0p52]
			//	     ^See https://stackoverflow.com/questions/13269523/can-all-32-bit-ints-be-exactly-represented-as-a-double
			m.insert(m.allocateInstr().asXmmRmR(sseOpcodeUnpcklps, newOperandReg(maskReg), xx))

			// maskReg = [float64(0x1.0p52), float64(0x1.0p52)]
			maskLabel = m.getOrAllocateConstLabel(&m.constTwop52Index, twop52[:])
			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskLabel)), maskReg))

			// Now we get the result as
			// 	xx = [float64(uint32(d1)), float64(uint32(d2))]
			// because (0x1.0p52 + float64(uint32(d))) - 0x1.0p52 = float64(uint32(d)) holds exactly.
			m.insert(m.allocateInstr().asXmmRmR(sseOpcodeSubpd, newOperandReg(maskReg), xx))

			m.copyTo(xx, m.c.VRegOf(ret))
		}
	default:
		panic(fmt.Sprintf("invalid lane type: %s", lane))
	}
}

var (
	// i32sMaxOnF64x2 holds math.MaxInt32(=2147483647.0) on two f64 lanes.
	i32sMaxOnF64x2 = [16]byte{
		0x00, 0x00, 0xc0, 0xff, 0xff, 0xff, 0xdf, 0x41, // float64(2147483647.0)
		0x00, 0x00, 0xc0, 0xff, 0xff, 0xff, 0xdf, 0x41, // float64(2147483647.0)
	}

	// i32uMaxOnF64x2 holds math.MaxUint32(=4294967295.0) on two f64 lanes.
	i32uMaxOnF64x2 = [16]byte{
		0x00, 0x00, 0xe0, 0xff, 0xff, 0xff, 0xef, 0x41, // float64(4294967295.0)
		0x00, 0x00, 0xe0, 0xff, 0xff, 0xff, 0xef, 0x41, // float64(4294967295.0)
	}

	// twop52 holds two float64(0x1.0p52) values, one per f64 lane. 0x1.0p52 is special in that,
	// with this exponent, the low bits of the mantissa hold a corresponding uint32 number exactly,
	// so arithmetic such as addition or subtraction preserves the 32-bit integer bit pattern in the mantissa.
	//
	// Note: the name twop52 is common across various compiler ecosystems.
	// E.g. https://github.com/llvm/llvm-project/blob/92ab024f81e5b64e258b7c3baaf213c7c26fcf40/compiler-rt/lib/builtins/floatdidf.c#L28
	// E.g. https://opensource.apple.com/source/clang/clang-425.0.24/src/projects/compiler-rt/lib/floatdidf.c.auto.html
	twop52 = [16]byte{
		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, // float64(0x1.0p52)
		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, // float64(0x1.0p52)
	}
)

func (m *machine) lowerVFcvtToIntSat(x, ret ssa.Value, lane ssa.VecLane, signed bool) {
	_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
	xx := m.copyToTmp(_xx.reg())

	switch lane {
	case ssa.VecLaneF32x4:
		if signed {
			tmp := m.copyToTmp(xx)

			// Assuming we have xx = [v1, v2, v3, v4].
			//
			// Set all bits on tmp if the lane is not NaN:
			// 	tmp[i] = 0xffffffff  if vi != NaN
			// 	       = 0           if vi == NaN
			m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeCmpps, uint8(cmpPredEQ_OQ), newOperandReg(tmp), tmp))

			// Clear NaN lanes on xx, meaning that
			// 	xx[i] = vi  if vi != NaN
			// 	      = 0   if vi == NaN
			m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAndps, newOperandReg(tmp), xx))

			// tmp[i] = ^vi  if vi != NaN
			//        = 0    if vi == NaN
			// so tmp[i] & 0x80000000 != 0 if and only if vi is non-NaN and non-negative.
			m.insert(m.allocateInstr().asXmmRmR(sseOpcodeXorps, newOperandReg(xx), tmp))

			// xx[i] = int32(vi)   if vi != NaN and vi fits in int32
			//       = 0x80000000  if vi != NaN and vi overflows int32 (see https://www.felixcloutier.com/x86/cvttps2dq)
			//       = 0           if vi == NaN
			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvttps2dq, newOperandReg(xx), xx))

			// Below, we convert 0x80000000 into 0x7FFFFFFF for the positively overflowing lanes.
			//
			// tmp[i] has its sign bit set if and only if vi is positive and overflows int32
			// (in which case xx[i] = 0x80000000); the sign bit is clear for every other lane.
			m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAndps, newOperandReg(xx), tmp))

			// Arithmetic right shift tmp by 31, so that
			// 	tmp[i] = 0xffffffff if vi positively overflows int32, 0 otherwise.
			m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsrad, newOperandImm32(0x1f), tmp))

			// Flip 0x80000000 into 0x7FFFFFFF for the positively overflowing lanes; other lanes are kept intact.
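			// (0x80000000 ^ 0xFFFFFFFF = 0x7FFFFFFF = math.MaxInt32, the saturated result for positive overflow.)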
			m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp), xx))
		} else {
			tmp := m.c.AllocateVReg(ssa.TypeV128)
			m.insert(m.allocateInstr().asZeros(tmp))
			m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMaxps, newOperandReg(tmp), xx))
			m.insert(m.allocateInstr().asXmmRmR(sseOpcodePcmpeqd, newOperandReg(tmp), tmp))
			m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsrld, newOperandImm32(0x1), tmp))
			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtdq2ps, newOperandReg(tmp), tmp))
			tmp2 := m.copyToTmp(xx)
			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvttps2dq, newOperandReg(xx), xx))
			m.insert(m.allocateInstr().asXmmRmR(sseOpcodeSubps, newOperandReg(tmp), tmp2))
			m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeCmpps, uint8(cmpPredLE_OS), newOperandReg(tmp2), tmp))
			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvttps2dq, newOperandReg(tmp2), tmp2))
			m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp), tmp2))
			m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp), tmp))
			m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaxsd, newOperandReg(tmp), tmp2))
			m.insert(m.allocateInstr().asXmmRmR(sseOpcodePaddd, newOperandReg(tmp2), xx))
		}

	case ssa.VecLaneF64x2:
		tmp2 := m.c.AllocateVReg(ssa.TypeV128)
		if signed {
			tmp := m.copyToTmp(xx)

			// Set all bits for non-NaN lanes, zeros otherwise.
			// I.e. tmp[i] = 0xffffffff_ffffffff if vi != NaN, 0 otherwise.
			m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeCmppd, uint8(cmpPredEQ_OQ), newOperandReg(tmp), tmp))

			maskLabel := m.getOrAllocateConstLabel(&m.constI32sMaxOnF64x2Index, i32sMaxOnF64x2[:])
			// Load 2147483647.0 into each lane of tmp2.
			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskLabel)), tmp2))

			// tmp[i] = 2147483647.0 if vi != NaN, 0 otherwise.
			m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAndps, newOperandReg(tmp2), tmp))

			// MINPD returns the source (second) operand whenever either operand is NaN, so we have
			// 	xx[i] = min(vi, 2147483647.0)  if vi != NaN
			// 	      = 0                      if vi == NaN
			m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMinpd, newOperandReg(tmp), xx))

			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvttpd2dq, newOperandReg(xx), xx))
		} else {
			tmp := m.c.AllocateVReg(ssa.TypeV128)
			m.insert(m.allocateInstr().asZeros(tmp))

			// xx[i] = vi  if vi != NaN && vi > 0
			//       = 0   if vi == NaN || vi <= 0
			m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMaxpd, newOperandReg(tmp), xx))

			// tmp2[i] = float64(math.MaxUint32)
			maskIndex := m.getOrAllocateConstLabel(&m.constI32uMaxOnF64x2Index, i32uMaxOnF64x2[:])
			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskIndex)), tmp2))

			// xx[i] = vi  if vi != NaN && vi > 0 && vi <= math.MaxUint32
			//       = 0   otherwise
			m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMinpd, newOperandReg(tmp2), xx))

			// Round the floating point values to integers.
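			// (The ROUNDPD immediate 0x3 selects round-toward-zero, i.e. truncation.)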
			m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeRoundpd, 0x3, newOperandReg(xx), xx))

			// tmp2[i] = float64(0x1.0p52)
			maskIndex = m.getOrAllocateConstLabel(&m.constTwop52Index, twop52[:])
			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskIndex)), tmp2))

			// xx[i] = float64(0x1.0p52) + float64(uint32(vi))  if vi != NaN && vi > 0 && vi <= math.MaxUint32
			//       = 0                                        otherwise
			//
			// This means that xx[i] holds exactly the same bits as uint32(vi) in its lower 32 bits.
			m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAddpd, newOperandReg(tmp2), xx))

			// At this point, we have
			// 	xx  = [uint32(v0), float64(0x1.0p52), uint32(v1), float64(0x1.0p52)]
			// 	tmp = [0, 0, 0, 0]
			// as 32x4 lanes. Therefore, SHUFPS with 0b00_00_10_00 results in
			// 	xx = [xx[00], xx[10], tmp[00], tmp[00]] = [xx[00], xx[10], 0, 0]
			// meaning that for i = 0 and 1, we have
			// 	xx[i] = uint32(vi)  if vi != NaN && vi > 0 && vi <= math.MaxUint32
			// 	      = 0           otherwise.
			m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeShufps, 0b00_00_10_00, newOperandReg(tmp), xx))
		}
	default:
		panic(fmt.Sprintf("invalid lane type: %s", lane))
	}

	m.copyTo(xx, m.c.VRegOf(ret))
}

func (m *machine) lowerNarrow(x, y, ret ssa.Value, lane ssa.VecLane, signed bool) {
	_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
	xx := m.copyToTmp(_xx.reg())
	yy := m.getOperand_Mem_Reg(m.c.ValueDefinition(y))

	var sseOp sseOpcode
	switch lane {
	case ssa.VecLaneI16x8:
		if signed {
			sseOp = sseOpcodePacksswb
		} else {
			sseOp = sseOpcodePackuswb
		}
	case ssa.VecLaneI32x4:
		if signed {
			sseOp = sseOpcodePackssdw
		} else {
			sseOp = sseOpcodePackusdw
		}
	default:
		panic(fmt.Sprintf("invalid lane type: %s", lane))
	}
	m.insert(m.allocateInstr().asXmmRmR(sseOp, yy, xx))
	m.copyTo(xx, m.c.VRegOf(ret))
}

func (m *machine) lowerWideningPairwiseDotProductS(x, y, ret ssa.Value) {
	_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
	xx := m.copyToTmp(_xx.reg())
	yy := m.getOperand_Mem_Reg(m.c.ValueDefinition(y))
	m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaddwd, yy, xx))
	m.copyTo(xx, m.c.VRegOf(ret))
}

func (m *machine) lowerVIabs(instr *ssa.Instruction) {
	x, lane := instr.ArgWithLane()
	rd := m.c.VRegOf(instr.Return())

	if lane == ssa.VecLaneI64x2 {
		_xx := m.getOperand_Reg(m.c.ValueDefinition(x))

		blendReg := xmm0VReg
		m.insert(m.allocateInstr().asDefineUninitializedReg(blendReg))

		tmp := m.copyToTmp(_xx.reg())
		xx := m.copyToTmp(_xx.reg())

		// Clear all bits on blendReg.
		m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(blendReg), blendReg))
		// Subtract xx from blendReg, so that blendReg = -xx.
		m.insert(m.allocateInstr().asXmmRmR(sseOpcodePsubq, newOperandReg(xx), blendReg))
		// Copy the negated value back into xx.
		m.copyTo(blendReg, xx)

		// BLENDVPD selects, per 64-bit lane, tmp (the original value) where the sign bit of
		// blendReg (xmm0, holding -value) is set, and keeps xx (also holding -value) otherwise,
		// so each lane ends up with the absolute value.
		m.insert(m.allocateInstr().asBlendvpd(newOperandReg(tmp), xx))

		m.copyTo(xx, rd)
	} else {
		var vecOp sseOpcode
		switch lane {
		case ssa.VecLaneI8x16:
			vecOp = sseOpcodePabsb
		case ssa.VecLaneI16x8:
			vecOp = sseOpcodePabsw
		case ssa.VecLaneI32x4:
			vecOp = sseOpcodePabsd
		}
		rn := m.getOperand_Reg(m.c.ValueDefinition(x))

		i := m.allocateInstr()
		i.asXmmUnaryRmR(vecOp, rn, rd)
		m.insert(i)
	}
}

func (m *machine) lowerVIpopcnt(instr *ssa.Instruction) {
	x := instr.Arg()
	rn := m.getOperand_Reg(m.c.ValueDefinition(x))
	rd := m.c.VRegOf(instr.Return())

	tmp1 := m.c.AllocateVReg(ssa.TypeV128)
	m.lowerVconst(tmp1, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f)

	// Copy input into tmp2.
	tmp2 := m.copyToTmp(rn.reg())

	// Given that we have:
	//	rn = [b1, ..., b16] where bn = hn:ln and hn and ln are the higher and lower 4 bits of bn.
	//
	// Take PAND on tmp1 and tmp2, so that we mask out all the higher bits:
	//	tmp2 = [l1, ..., l16].
	pand := m.allocateInstr()
	pand.asXmmRmR(sseOpcodePand, newOperandReg(tmp1), tmp2)
	m.insert(pand)

	// Do a logical (packed word) right shift by 4 on tmp3 (a copy of rn) and PAND against the mask (tmp1),
	// so that we have
	//	tmp3 = [h1, ..., h16].
	tmp3 := m.copyToTmp(rn.reg())
	psrlw := m.allocateInstr()
	psrlw.asXmmRmiReg(sseOpcodePsrlw, newOperandImm32(4), tmp3)
	m.insert(psrlw)

	pand2 := m.allocateInstr()
	pand2.asXmmRmR(sseOpcodePand, newOperandReg(tmp1), tmp3)
	m.insert(pand2)

	// Read the popcntTable into tmp4, and we have
	//	tmp4 = [0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04]
	tmp4 := m.c.AllocateVReg(ssa.TypeV128)
	m.lowerVconst(tmp4, 0x03_02_02_01_02_01_01_00, 0x04_03_03_02_03_02_02_01)

	// Make a copy for later.
	tmp5 := m.copyToTmp(tmp4)

	// tmp4 = [popcnt(l1), ..., popcnt(l16)].
	pshufb := m.allocateInstr()
	pshufb.asXmmRmR(sseOpcodePshufb, newOperandReg(tmp2), tmp4)
	m.insert(pshufb)

	// tmp5 = [popcnt(h1), ..., popcnt(h16)].
	pshufb2 := m.allocateInstr()
	pshufb2.asXmmRmR(sseOpcodePshufb, newOperandReg(tmp3), tmp5)
	m.insert(pshufb2)

	// tmp4 + tmp5 is the result.
	paddb := m.allocateInstr()
	paddb.asXmmRmR(sseOpcodePaddb, newOperandReg(tmp4), tmp5)
	m.insert(paddb)

	m.copyTo(tmp5, rd)
}

func (m *machine) lowerVImul(instr *ssa.Instruction) {
	x, y, lane := instr.Arg2WithLane()
	rd := m.c.VRegOf(instr.Return())
	if lane == ssa.VecLaneI64x2 {
		rn := m.getOperand_Reg(m.c.ValueDefinition(x))
		rm := m.getOperand_Reg(m.c.ValueDefinition(y))
		// Assuming that we have
		//	rm = [p1, p2] = [p1_lo, p1_hi, p2_lo, p2_hi]
		//	rn = [q1, q2] = [q1_lo, q1_hi, q2_lo, q2_hi]
		// where pN and qN are 64-bit (quad word) lanes, and pN_lo, pN_hi, qN_lo and qN_hi are 32-bit (double word) lanes.

		// Copy rn into tmp1.
		tmp1 := m.copyToTmp(rn.reg())

		// And do the logical right shift by 32-bit on tmp1, which makes tmp1 = [q1_hi, 0, q2_hi, 0].
		shift := m.allocateInstr()
		shift.asXmmRmiReg(sseOpcodePsrlq, newOperandImm32(32), tmp1)
		m.insert(shift)

		// Execute "pmuludq rm,tmp1", which makes tmp1 = [q1_hi*p1_lo, q2_hi*p2_lo] where each lane is 64-bit.
		mul := m.allocateInstr()
		mul.asXmmRmR(sseOpcodePmuludq, rm, tmp1)
		m.insert(mul)

		// Copy rm value into tmp2.
		tmp2 := m.copyToTmp(rm.reg())

		// And do the logical right shift by 32-bit on tmp2, which makes tmp2 = [p1_hi, 0, p2_hi, 0].
		shift2 := m.allocateInstr()
		shift2.asXmmRmiReg(sseOpcodePsrlq, newOperandImm32(32), tmp2)
		m.insert(shift2)

		// Execute "pmuludq rn,tmp2", which makes tmp2 = [p1_hi*q1_lo, p2_hi*q2_lo] where each lane is 64-bit.
		mul2 := m.allocateInstr()
		mul2.asXmmRmR(sseOpcodePmuludq, rn, tmp2)
		m.insert(mul2)

		// Add tmp1 and tmp2, then do the logical left shift by 32-bit,
		// which makes tmp1 = [(p1_lo*q1_hi+p1_hi*q1_lo)<<32, (p2_lo*q2_hi+p2_hi*q2_lo)<<32].
		add := m.allocateInstr()
		add.asXmmRmR(sseOpcodePaddq, newOperandReg(tmp2), tmp1)
		m.insert(add)

		shift3 := m.allocateInstr()
		shift3.asXmmRmiReg(sseOpcodePsllq, newOperandImm32(32), tmp1)
		m.insert(shift3)

		// Copy rm value into tmp3.
		tmp3 := m.copyToTmp(rm.reg())

		// "pmuludq rn,tmp3" makes tmp3 = [p1_lo*q1_lo, p2_lo*q2_lo] where each lane is 64-bit.
		mul3 := m.allocateInstr()
		mul3.asXmmRmR(sseOpcodePmuludq, rn, tmp3)
		m.insert(mul3)

		// Finally, we get the result by computing tmp1 + tmp3,
		// which makes tmp1 = [(p1_lo*q1_hi+p1_hi*q1_lo)<<32+p1_lo*q1_lo, (p2_lo*q2_hi+p2_hi*q2_lo)<<32+p2_lo*q2_lo].
		add2 := m.allocateInstr()
		add2.asXmmRmR(sseOpcodePaddq, newOperandReg(tmp3), tmp1)
		m.insert(add2)

		m.copyTo(tmp1, rd)

	} else {
		var vecOp sseOpcode
		switch lane {
		case ssa.VecLaneI16x8:
			vecOp = sseOpcodePmullw
		case ssa.VecLaneI32x4:
			vecOp = sseOpcodePmulld
		default:
			panic("unsupported: " + lane.String())
		}
		m.lowerVbBinOp(vecOp, x, y, instr.Return())
	}
}