wa-lang.org/wazero@v1.0.2/internal/engine/compiler/impl_vec_amd64.go (about) 1 package compiler 2 3 import ( 4 "errors" 5 6 "wa-lang.org/wazero/internal/asm" 7 "wa-lang.org/wazero/internal/asm/amd64" 8 "wa-lang.org/wazero/internal/wazeroir" 9 ) 10 11 // compileV128Const implements compiler.compileV128Const for amd64 architecture. 12 func (c *amd64Compiler) compileV128Const(o *wazeroir.OperationV128Const) error { 13 if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil { 14 return err 15 } 16 17 result, err := c.allocateRegister(registerTypeVector) 18 if err != nil { 19 return err 20 } 21 22 // We cannot directly load the value from memory to float regs, 23 // so we move it to int reg temporarily. 24 tmpReg, err := c.allocateRegister(registerTypeGeneralPurpose) 25 if err != nil { 26 return err 27 } 28 29 // Move the lower 64-bits. 30 if o.Lo == 0 { 31 c.assembler.CompileRegisterToRegister(amd64.XORQ, tmpReg, tmpReg) 32 } else { 33 c.assembler.CompileConstToRegister(amd64.MOVQ, int64(o.Lo), tmpReg) 34 } 35 c.assembler.CompileRegisterToRegister(amd64.MOVQ, tmpReg, result) 36 37 if o.Lo != 0 && o.Hi == 0 { 38 c.assembler.CompileRegisterToRegister(amd64.XORQ, tmpReg, tmpReg) 39 } else if o.Hi != 0 { 40 c.assembler.CompileConstToRegister(amd64.MOVQ, int64(o.Hi), tmpReg) 41 } 42 // Move the higher 64-bits with PINSRQ at the second element of 64x2 vector. 43 c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRQ, tmpReg, result, 1) 44 45 c.pushVectorRuntimeValueLocationOnRegister(result) 46 return nil 47 } 48 49 // compileV128Add implements compiler.compileV128Add for amd64 architecture. 50 func (c *amd64Compiler) compileV128Add(o *wazeroir.OperationV128Add) error { 51 x2 := c.locationStack.popV128() 52 if err := c.compileEnsureOnRegister(x2); err != nil { 53 return err 54 } 55 56 x1 := c.locationStack.popV128() 57 if err := c.compileEnsureOnRegister(x1); err != nil { 58 return err 59 } 60 var inst asm.Instruction 61 switch o.Shape { 62 case wazeroir.ShapeI8x16: 63 inst = amd64.PADDB 64 case wazeroir.ShapeI16x8: 65 inst = amd64.PADDW 66 case wazeroir.ShapeI32x4: 67 inst = amd64.PADDD 68 case wazeroir.ShapeI64x2: 69 inst = amd64.PADDQ 70 case wazeroir.ShapeF32x4: 71 inst = amd64.ADDPS 72 case wazeroir.ShapeF64x2: 73 inst = amd64.ADDPD 74 } 75 c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register) 76 77 c.pushVectorRuntimeValueLocationOnRegister(x1.register) 78 c.locationStack.markRegisterUnused(x2.register) 79 return nil 80 } 81 82 // compileV128Sub implements compiler.compileV128Sub for amd64 architecture. 83 func (c *amd64Compiler) compileV128Sub(o *wazeroir.OperationV128Sub) error { 84 x2 := c.locationStack.popV128() 85 if err := c.compileEnsureOnRegister(x2); err != nil { 86 return err 87 } 88 89 x1 := c.locationStack.popV128() 90 if err := c.compileEnsureOnRegister(x1); err != nil { 91 return err 92 } 93 var inst asm.Instruction 94 switch o.Shape { 95 case wazeroir.ShapeI8x16: 96 inst = amd64.PSUBB 97 case wazeroir.ShapeI16x8: 98 inst = amd64.PSUBW 99 case wazeroir.ShapeI32x4: 100 inst = amd64.PSUBD 101 case wazeroir.ShapeI64x2: 102 inst = amd64.PSUBQ 103 case wazeroir.ShapeF32x4: 104 inst = amd64.SUBPS 105 case wazeroir.ShapeF64x2: 106 inst = amd64.SUBPD 107 } 108 c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register) 109 110 c.pushVectorRuntimeValueLocationOnRegister(x1.register) 111 c.locationStack.markRegisterUnused(x2.register) 112 return nil 113 } 114 115 // compileV128Load implements compiler.compileV128Load for amd64 architecture. 116 func (c *amd64Compiler) compileV128Load(o *wazeroir.OperationV128Load) error { 117 result, err := c.allocateRegister(registerTypeVector) 118 if err != nil { 119 return err 120 } 121 122 switch o.Type { 123 case wazeroir.V128LoadType128: 124 err = c.compileV128LoadImpl(amd64.MOVDQU, o.Arg.Offset, 16, result) 125 case wazeroir.V128LoadType8x8s: 126 err = c.compileV128LoadImpl(amd64.PMOVSXBW, o.Arg.Offset, 8, result) 127 case wazeroir.V128LoadType8x8u: 128 err = c.compileV128LoadImpl(amd64.PMOVZXBW, o.Arg.Offset, 8, result) 129 case wazeroir.V128LoadType16x4s: 130 err = c.compileV128LoadImpl(amd64.PMOVSXWD, o.Arg.Offset, 8, result) 131 case wazeroir.V128LoadType16x4u: 132 err = c.compileV128LoadImpl(amd64.PMOVZXWD, o.Arg.Offset, 8, result) 133 case wazeroir.V128LoadType32x2s: 134 err = c.compileV128LoadImpl(amd64.PMOVSXDQ, o.Arg.Offset, 8, result) 135 case wazeroir.V128LoadType32x2u: 136 err = c.compileV128LoadImpl(amd64.PMOVZXDQ, o.Arg.Offset, 8, result) 137 case wazeroir.V128LoadType8Splat: 138 reg, err := c.compileMemoryAccessCeilSetup(o.Arg.Offset, 1) 139 if err != nil { 140 return err 141 } 142 c.assembler.CompileMemoryWithIndexToRegister(amd64.MOVBQZX, amd64ReservedRegisterForMemory, -1, 143 reg, 1, reg) 144 // pinsrb $0, reg, result 145 // pxor tmpVReg, tmpVReg 146 // pshufb tmpVReg, result 147 c.locationStack.markRegisterUsed(result) 148 tmpVReg, err := c.allocateRegister(registerTypeVector) 149 if err != nil { 150 return err 151 } 152 c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRB, reg, result, 0) 153 c.assembler.CompileRegisterToRegister(amd64.PXOR, tmpVReg, tmpVReg) 154 c.assembler.CompileRegisterToRegister(amd64.PSHUFB, tmpVReg, result) 155 case wazeroir.V128LoadType16Splat: 156 reg, err := c.compileMemoryAccessCeilSetup(o.Arg.Offset, 2) 157 if err != nil { 158 return err 159 } 160 c.assembler.CompileMemoryWithIndexToRegister(amd64.MOVWQZX, amd64ReservedRegisterForMemory, -2, 161 reg, 1, reg) 162 // pinsrw $0, reg, result 163 // pinsrw $1, reg, result 164 // pshufd $0, result, result (result = result[0,0,0,0]) 165 c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRW, reg, result, 0) 166 c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRW, reg, result, 1) 167 c.assembler.CompileRegisterToRegisterWithArg(amd64.PSHUFD, result, result, 0) 168 case wazeroir.V128LoadType32Splat: 169 reg, err := c.compileMemoryAccessCeilSetup(o.Arg.Offset, 4) 170 if err != nil { 171 return err 172 } 173 c.assembler.CompileMemoryWithIndexToRegister(amd64.MOVLQZX, amd64ReservedRegisterForMemory, -4, 174 reg, 1, reg) 175 // pinsrd $0, reg, result 176 // pshufd $0, result, result (result = result[0,0,0,0]) 177 c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRD, reg, result, 0) 178 c.assembler.CompileRegisterToRegisterWithArg(amd64.PSHUFD, result, result, 0) 179 case wazeroir.V128LoadType64Splat: 180 reg, err := c.compileMemoryAccessCeilSetup(o.Arg.Offset, 8) 181 if err != nil { 182 return err 183 } 184 c.assembler.CompileMemoryWithIndexToRegister(amd64.MOVQ, amd64ReservedRegisterForMemory, -8, 185 reg, 1, reg) 186 // pinsrq $0, reg, result 187 // pinsrq $1, reg, result 188 c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRQ, reg, result, 0) 189 c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRQ, reg, result, 1) 190 case wazeroir.V128LoadType32zero: 191 err = c.compileV128LoadImpl(amd64.MOVL, o.Arg.Offset, 4, result) 192 case wazeroir.V128LoadType64zero: 193 err = c.compileV128LoadImpl(amd64.MOVQ, o.Arg.Offset, 8, result) 194 } 195 196 if err != nil { 197 return err 198 } 199 200 c.pushVectorRuntimeValueLocationOnRegister(result) 201 return nil 202 } 203 204 func (c *amd64Compiler) compileV128LoadImpl(inst asm.Instruction, offset uint32, targetSizeInBytes int64, dst asm.Register) error { 205 offsetReg, err := c.compileMemoryAccessCeilSetup(offset, targetSizeInBytes) 206 if err != nil { 207 return err 208 } 209 c.assembler.CompileMemoryWithIndexToRegister(inst, amd64ReservedRegisterForMemory, -targetSizeInBytes, 210 offsetReg, 1, dst) 211 return nil 212 } 213 214 // compileV128LoadLane implements compiler.compileV128LoadLane for amd64. 215 func (c *amd64Compiler) compileV128LoadLane(o *wazeroir.OperationV128LoadLane) error { 216 targetVector := c.locationStack.popV128() 217 if err := c.compileEnsureOnRegister(targetVector); err != nil { 218 return err 219 } 220 221 var insertInst asm.Instruction 222 switch o.LaneSize { 223 case 8: 224 insertInst = amd64.PINSRB 225 case 16: 226 insertInst = amd64.PINSRW 227 case 32: 228 insertInst = amd64.PINSRD 229 case 64: 230 insertInst = amd64.PINSRQ 231 } 232 233 targetSizeInBytes := int64(o.LaneSize / 8) 234 offsetReg, err := c.compileMemoryAccessCeilSetup(o.Arg.Offset, targetSizeInBytes) 235 if err != nil { 236 return err 237 } 238 c.assembler.CompileMemoryWithIndexAndArgToRegister(insertInst, amd64ReservedRegisterForMemory, -targetSizeInBytes, 239 offsetReg, 1, targetVector.register, o.LaneIndex) 240 241 c.pushVectorRuntimeValueLocationOnRegister(targetVector.register) 242 return nil 243 } 244 245 // compileV128Store implements compiler.compileV128Store for amd64. 246 func (c *amd64Compiler) compileV128Store(o *wazeroir.OperationV128Store) error { 247 val := c.locationStack.popV128() 248 if err := c.compileEnsureOnRegister(val); err != nil { 249 return err 250 } 251 252 const targetSizeInBytes = 16 253 offsetReg, err := c.compileMemoryAccessCeilSetup(o.Arg.Offset, targetSizeInBytes) 254 if err != nil { 255 return err 256 } 257 258 c.assembler.CompileRegisterToMemoryWithIndex(amd64.MOVDQU, val.register, 259 amd64ReservedRegisterForMemory, -targetSizeInBytes, offsetReg, 1) 260 261 c.locationStack.markRegisterUnused(val.register, offsetReg) 262 return nil 263 } 264 265 // compileV128StoreLane implements compiler.compileV128StoreLane for amd64. 266 func (c *amd64Compiler) compileV128StoreLane(o *wazeroir.OperationV128StoreLane) error { 267 var storeInst asm.Instruction 268 switch o.LaneSize { 269 case 8: 270 storeInst = amd64.PEXTRB 271 case 16: 272 storeInst = amd64.PEXTRW 273 case 32: 274 storeInst = amd64.PEXTRD 275 case 64: 276 storeInst = amd64.PEXTRQ 277 } 278 279 val := c.locationStack.popV128() 280 if err := c.compileEnsureOnRegister(val); err != nil { 281 return err 282 } 283 284 targetSizeInBytes := int64(o.LaneSize / 8) 285 offsetReg, err := c.compileMemoryAccessCeilSetup(o.Arg.Offset, targetSizeInBytes) 286 if err != nil { 287 return err 288 } 289 290 c.assembler.CompileRegisterToMemoryWithIndexAndArg(storeInst, val.register, 291 amd64ReservedRegisterForMemory, -targetSizeInBytes, offsetReg, 1, o.LaneIndex) 292 293 c.locationStack.markRegisterUnused(val.register, offsetReg) 294 return nil 295 } 296 297 // compileV128ExtractLane implements compiler.compileV128ExtractLane for amd64. 298 func (c *amd64Compiler) compileV128ExtractLane(o *wazeroir.OperationV128ExtractLane) error { 299 val := c.locationStack.popV128() 300 if err := c.compileEnsureOnRegister(val); err != nil { 301 return err 302 } 303 switch o.Shape { 304 case wazeroir.ShapeI8x16: 305 result, err := c.allocateRegister(registerTypeGeneralPurpose) 306 if err != nil { 307 return err 308 } 309 c.assembler.CompileRegisterToRegisterWithArg(amd64.PEXTRB, val.register, result, o.LaneIndex) 310 if o.Signed { 311 c.assembler.CompileRegisterToRegister(amd64.MOVBLSX, result, result) 312 } else { 313 c.assembler.CompileRegisterToRegister(amd64.MOVBLZX, result, result) 314 } 315 c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI32) 316 c.locationStack.markRegisterUnused(val.register) 317 case wazeroir.ShapeI16x8: 318 result, err := c.allocateRegister(registerTypeGeneralPurpose) 319 if err != nil { 320 return err 321 } 322 c.assembler.CompileRegisterToRegisterWithArg(amd64.PEXTRW, val.register, result, o.LaneIndex) 323 if o.Signed { 324 c.assembler.CompileRegisterToRegister(amd64.MOVWLSX, result, result) 325 } else { 326 c.assembler.CompileRegisterToRegister(amd64.MOVWLZX, result, result) 327 } 328 c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI32) 329 c.locationStack.markRegisterUnused(val.register) 330 case wazeroir.ShapeI32x4: 331 result, err := c.allocateRegister(registerTypeGeneralPurpose) 332 if err != nil { 333 return err 334 } 335 c.assembler.CompileRegisterToRegisterWithArg(amd64.PEXTRD, val.register, result, o.LaneIndex) 336 c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI32) 337 c.locationStack.markRegisterUnused(val.register) 338 case wazeroir.ShapeI64x2: 339 result, err := c.allocateRegister(registerTypeGeneralPurpose) 340 if err != nil { 341 return err 342 } 343 c.assembler.CompileRegisterToRegisterWithArg(amd64.PEXTRQ, val.register, result, o.LaneIndex) 344 c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI64) 345 c.locationStack.markRegisterUnused(val.register) 346 case wazeroir.ShapeF32x4: 347 if o.LaneIndex != 0 { 348 c.assembler.CompileRegisterToRegisterWithArg(amd64.PSHUFD, val.register, val.register, o.LaneIndex) 349 } 350 c.pushRuntimeValueLocationOnRegister(val.register, runtimeValueTypeF32) 351 case wazeroir.ShapeF64x2: 352 if o.LaneIndex != 0 { 353 // This case we can assume LaneIndex == 1. 354 // We have to modify the val.register as, for example: 355 // 0b11 0b10 0b01 0b00 356 // | | | | 357 // [x3, x2, x1, x0] -> [x0, x0, x3, x2] 358 // where val.register = [x3, x2, x1, x0] and each xN = 32bits. 359 // Then, we interpret the register as float64, therefore, the float64 value is obtained as [x3, x2]. 360 arg := byte(0b00_00_11_10) 361 c.assembler.CompileRegisterToRegisterWithArg(amd64.PSHUFD, val.register, val.register, arg) 362 } 363 c.pushRuntimeValueLocationOnRegister(val.register, runtimeValueTypeF64) 364 } 365 366 return nil 367 } 368 369 // compileV128ReplaceLane implements compiler.compileV128ReplaceLane for amd64. 370 func (c *amd64Compiler) compileV128ReplaceLane(o *wazeroir.OperationV128ReplaceLane) error { 371 origin := c.locationStack.pop() 372 if err := c.compileEnsureOnRegister(origin); err != nil { 373 return err 374 } 375 376 vector := c.locationStack.popV128() 377 if err := c.compileEnsureOnRegister(vector); err != nil { 378 return err 379 } 380 381 switch o.Shape { 382 case wazeroir.ShapeI8x16: 383 c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRB, origin.register, vector.register, o.LaneIndex) 384 case wazeroir.ShapeI16x8: 385 c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRW, origin.register, vector.register, o.LaneIndex) 386 case wazeroir.ShapeI32x4: 387 c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRD, origin.register, vector.register, o.LaneIndex) 388 case wazeroir.ShapeI64x2: 389 c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRQ, origin.register, vector.register, o.LaneIndex) 390 case wazeroir.ShapeF32x4: 391 c.assembler.CompileRegisterToRegisterWithArg(amd64.INSERTPS, origin.register, vector.register, 392 // In INSERTPS instruction, the destination index is encoded at 4 and 5 bits of the argument. 393 // See https://www.felixcloutier.com/x86/insertps 394 o.LaneIndex<<4, 395 ) 396 case wazeroir.ShapeF64x2: 397 if o.LaneIndex == 0 { 398 c.assembler.CompileRegisterToRegister(amd64.MOVSD, origin.register, vector.register) 399 } else { 400 c.assembler.CompileRegisterToRegister(amd64.MOVLHPS, origin.register, vector.register) 401 } 402 } 403 404 c.pushVectorRuntimeValueLocationOnRegister(vector.register) 405 c.locationStack.markRegisterUnused(origin.register) 406 return nil 407 } 408 409 // compileV128Splat implements compiler.compileV128Splat for amd64. 410 func (c *amd64Compiler) compileV128Splat(o *wazeroir.OperationV128Splat) (err error) { 411 origin := c.locationStack.pop() 412 if err = c.compileEnsureOnRegister(origin); err != nil { 413 return 414 } 415 416 var result asm.Register 417 switch o.Shape { 418 case wazeroir.ShapeI8x16: 419 result, err = c.allocateRegister(registerTypeVector) 420 if err != nil { 421 return err 422 } 423 c.locationStack.markRegisterUsed(result) 424 425 tmp, err := c.allocateRegister(registerTypeVector) 426 if err != nil { 427 return err 428 } 429 c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRB, origin.register, result, 0) 430 c.assembler.CompileRegisterToRegister(amd64.PXOR, tmp, tmp) 431 c.assembler.CompileRegisterToRegister(amd64.PSHUFB, tmp, result) 432 case wazeroir.ShapeI16x8: 433 result, err = c.allocateRegister(registerTypeVector) 434 if err != nil { 435 return err 436 } 437 c.locationStack.markRegisterUsed(result) 438 c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRW, origin.register, result, 0) 439 c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRW, origin.register, result, 1) 440 c.assembler.CompileRegisterToRegisterWithArg(amd64.PSHUFD, result, result, 0) 441 case wazeroir.ShapeI32x4: 442 result, err = c.allocateRegister(registerTypeVector) 443 if err != nil { 444 return err 445 } 446 c.locationStack.markRegisterUsed(result) 447 c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRD, origin.register, result, 0) 448 c.assembler.CompileRegisterToRegisterWithArg(amd64.PSHUFD, result, result, 0) 449 case wazeroir.ShapeI64x2: 450 result, err = c.allocateRegister(registerTypeVector) 451 if err != nil { 452 return err 453 } 454 c.locationStack.markRegisterUsed(result) 455 c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRQ, origin.register, result, 0) 456 c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRQ, origin.register, result, 1) 457 case wazeroir.ShapeF32x4: 458 result = origin.register 459 c.assembler.CompileRegisterToRegisterWithArg(amd64.INSERTPS, origin.register, result, 0) 460 c.assembler.CompileRegisterToRegisterWithArg(amd64.PSHUFD, result, result, 0) 461 case wazeroir.ShapeF64x2: 462 result = origin.register 463 c.assembler.CompileRegisterToRegister(amd64.MOVQ, origin.register, result) 464 c.assembler.CompileRegisterToRegister(amd64.MOVLHPS, origin.register, result) 465 } 466 467 c.locationStack.markRegisterUnused(origin.register) 468 c.pushVectorRuntimeValueLocationOnRegister(result) 469 return nil 470 } 471 472 // compileV128Shuffle implements compiler.compileV128Shuffle for amd64. 473 func (c *amd64Compiler) compileV128Shuffle(o *wazeroir.OperationV128Shuffle) error { 474 w := c.locationStack.popV128() 475 if err := c.compileEnsureOnRegister(w); err != nil { 476 return err 477 } 478 479 v := c.locationStack.popV128() 480 if err := c.compileEnsureOnRegister(v); err != nil { 481 return err 482 } 483 484 tmp, err := c.allocateRegister(registerTypeVector) 485 if err != nil { 486 return err 487 } 488 489 consts := [32]byte{} 490 for i, lane := range o.Lanes { 491 if lane < 16 { 492 consts[i+16] = 0x80 493 consts[i] = lane 494 } else { 495 consts[i+16] = lane - 16 496 consts[i] = 0x80 497 } 498 } 499 500 err = c.assembler.CompileStaticConstToRegister(amd64.MOVDQU, asm.NewStaticConst(consts[:16]), tmp) 501 if err != nil { 502 return err 503 } 504 c.assembler.CompileRegisterToRegister(amd64.PSHUFB, tmp, v.register) 505 err = c.assembler.CompileStaticConstToRegister(amd64.MOVDQU, asm.NewStaticConst(consts[16:]), tmp) 506 if err != nil { 507 return err 508 } 509 c.assembler.CompileRegisterToRegister(amd64.PSHUFB, tmp, w.register) 510 c.assembler.CompileRegisterToRegister(amd64.ORPS, v.register, w.register) 511 512 c.pushVectorRuntimeValueLocationOnRegister(w.register) 513 c.locationStack.markRegisterUnused(v.register) 514 return nil 515 } 516 517 var swizzleConst = [16]byte{ 518 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 519 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 520 } 521 522 // compileV128Swizzle implements compiler.compileV128Swizzle for amd64. 523 func (c *amd64Compiler) compileV128Swizzle(*wazeroir.OperationV128Swizzle) error { 524 indexVec := c.locationStack.popV128() 525 if err := c.compileEnsureOnRegister(indexVec); err != nil { 526 return err 527 } 528 529 baseVec := c.locationStack.popV128() 530 if err := c.compileEnsureOnRegister(baseVec); err != nil { 531 return err 532 } 533 534 tmp, err := c.allocateRegister(registerTypeVector) 535 if err != nil { 536 return err 537 } 538 539 err = c.assembler.CompileStaticConstToRegister(amd64.MOVDQU, asm.NewStaticConst(swizzleConst[:]), tmp) 540 if err != nil { 541 return err 542 } 543 544 c.assembler.CompileRegisterToRegister(amd64.PADDUSB, tmp, indexVec.register) 545 c.assembler.CompileRegisterToRegister(amd64.PSHUFB, indexVec.register, baseVec.register) 546 547 c.pushVectorRuntimeValueLocationOnRegister(baseVec.register) 548 c.locationStack.markRegisterUnused(indexVec.register) 549 return nil 550 } 551 552 // compileV128AnyTrue implements compiler.compileV128AnyTrue for amd64. 553 func (c *amd64Compiler) compileV128AnyTrue(*wazeroir.OperationV128AnyTrue) error { 554 v := c.locationStack.popV128() 555 if err := c.compileEnsureOnRegister(v); err != nil { 556 return err 557 } 558 559 c.assembler.CompileRegisterToRegister(amd64.PTEST, v.register, v.register) 560 561 c.locationStack.pushRuntimeValueLocationOnConditionalRegister(amd64.ConditionalRegisterStateNE) 562 c.locationStack.markRegisterUnused(v.register) 563 return nil 564 } 565 566 // compileV128AllTrue implements compiler.compileV128AllTrue for amd64. 567 func (c *amd64Compiler) compileV128AllTrue(o *wazeroir.OperationV128AllTrue) error { 568 v := c.locationStack.popV128() 569 if err := c.compileEnsureOnRegister(v); err != nil { 570 return err 571 } 572 573 tmp, err := c.allocateRegister(registerTypeVector) 574 if err != nil { 575 return err 576 } 577 578 var cmpInst asm.Instruction 579 switch o.Shape { 580 case wazeroir.ShapeI8x16: 581 cmpInst = amd64.PCMPEQB 582 case wazeroir.ShapeI16x8: 583 cmpInst = amd64.PCMPEQW 584 case wazeroir.ShapeI32x4: 585 cmpInst = amd64.PCMPEQD 586 case wazeroir.ShapeI64x2: 587 cmpInst = amd64.PCMPEQQ 588 } 589 590 c.assembler.CompileRegisterToRegister(amd64.PXOR, tmp, tmp) 591 c.assembler.CompileRegisterToRegister(cmpInst, v.register, tmp) 592 c.assembler.CompileRegisterToRegister(amd64.PTEST, tmp, tmp) 593 c.locationStack.markRegisterUnused(v.register, tmp) 594 c.locationStack.pushRuntimeValueLocationOnConditionalRegister(amd64.ConditionalRegisterStateE) 595 return nil 596 } 597 598 // compileV128BitMask implements compiler.compileV128BitMask for amd64. 599 func (c *amd64Compiler) compileV128BitMask(o *wazeroir.OperationV128BitMask) error { 600 v := c.locationStack.popV128() 601 if err := c.compileEnsureOnRegister(v); err != nil { 602 return err 603 } 604 605 result, err := c.allocateRegister(registerTypeGeneralPurpose) 606 if err != nil { 607 return err 608 } 609 610 switch o.Shape { 611 case wazeroir.ShapeI8x16: 612 c.assembler.CompileRegisterToRegister(amd64.PMOVMSKB, v.register, result) 613 case wazeroir.ShapeI16x8: 614 // When we have: 615 // R1 = [R1(w1), R1(w2), R1(w3), R1(w4), R1(w5), R1(w6), R1(w7), R1(v8)] 616 // R2 = [R2(w1), R2(w2), R2(w3), R2(v4), R2(w5), R2(w6), R2(w7), R2(v8)] 617 // where RX(wn) is n-th signed word (16-bit) of RX register, 618 // 619 // "PACKSSWB R1, R2" produces 620 // R1 = [ 621 // byte_sat(R1(w1)), byte_sat(R1(w2)), byte_sat(R1(w3)), byte_sat(R1(w4)), 622 // byte_sat(R1(w5)), byte_sat(R1(w6)), byte_sat(R1(w7)), byte_sat(R1(w8)), 623 // byte_sat(R2(w1)), byte_sat(R2(w2)), byte_sat(R2(w3)), byte_sat(R2(w4)), 624 // byte_sat(R2(w5)), byte_sat(R2(w6)), byte_sat(R2(w7)), byte_sat(R2(w8)), 625 // ] 626 // where R1 is the destination register, and 627 // byte_sat(w) = int8(w) if w fits as signed 8-bit, 628 // 0x80 if w is less than 0x80 629 // 0x7F if w is greater than 0x7f 630 // 631 // See https://www.felixcloutier.com/x86/packsswb:packssdw for detail. 632 // 633 // Therefore, v.register ends up having i-th and (i+8)-th bit set if i-th lane is negative (for i in 0..8). 634 c.assembler.CompileRegisterToRegister(amd64.PACKSSWB, v.register, v.register) 635 c.assembler.CompileRegisterToRegister(amd64.PMOVMSKB, v.register, result) 636 // Clear the higher bits than 8. 637 c.assembler.CompileConstToRegister(amd64.SHRQ, 8, result) 638 case wazeroir.ShapeI32x4: 639 c.assembler.CompileRegisterToRegister(amd64.MOVMSKPS, v.register, result) 640 case wazeroir.ShapeI64x2: 641 c.assembler.CompileRegisterToRegister(amd64.MOVMSKPD, v.register, result) 642 } 643 644 c.locationStack.markRegisterUnused(v.register) 645 c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI32) 646 return nil 647 } 648 649 // compileV128And implements compiler.compileV128And for amd64. 650 func (c *amd64Compiler) compileV128And(*wazeroir.OperationV128And) error { 651 x2 := c.locationStack.popV128() 652 if err := c.compileEnsureOnRegister(x2); err != nil { 653 return err 654 } 655 656 x1 := c.locationStack.popV128() 657 if err := c.compileEnsureOnRegister(x1); err != nil { 658 return err 659 } 660 661 c.assembler.CompileRegisterToRegister(amd64.PAND, x2.register, x1.register) 662 663 c.locationStack.markRegisterUnused(x2.register) 664 c.pushVectorRuntimeValueLocationOnRegister(x1.register) 665 return nil 666 } 667 668 // compileV128Not implements compiler.compileV128Not for amd64. 669 func (c *amd64Compiler) compileV128Not(*wazeroir.OperationV128Not) error { 670 v := c.locationStack.popV128() 671 if err := c.compileEnsureOnRegister(v); err != nil { 672 return err 673 } 674 675 tmp, err := c.allocateRegister(registerTypeVector) 676 if err != nil { 677 return err 678 } 679 680 // Set all bits on tmp register. 681 c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, tmp, tmp) 682 // Then XOR with tmp to reverse all bits on v.register. 683 c.assembler.CompileRegisterToRegister(amd64.PXOR, tmp, v.register) 684 c.pushVectorRuntimeValueLocationOnRegister(v.register) 685 return nil 686 } 687 688 // compileV128Or implements compiler.compileV128Or for amd64. 689 func (c *amd64Compiler) compileV128Or(*wazeroir.OperationV128Or) error { 690 x2 := c.locationStack.popV128() 691 if err := c.compileEnsureOnRegister(x2); err != nil { 692 return err 693 } 694 695 x1 := c.locationStack.popV128() 696 if err := c.compileEnsureOnRegister(x1); err != nil { 697 return err 698 } 699 700 c.assembler.CompileRegisterToRegister(amd64.POR, x2.register, x1.register) 701 702 c.locationStack.markRegisterUnused(x2.register) 703 c.pushVectorRuntimeValueLocationOnRegister(x1.register) 704 return nil 705 } 706 707 // compileV128Xor implements compiler.compileV128Xor for amd64. 708 func (c *amd64Compiler) compileV128Xor(*wazeroir.OperationV128Xor) error { 709 x2 := c.locationStack.popV128() 710 if err := c.compileEnsureOnRegister(x2); err != nil { 711 return err 712 } 713 714 x1 := c.locationStack.popV128() 715 if err := c.compileEnsureOnRegister(x1); err != nil { 716 return err 717 } 718 719 c.assembler.CompileRegisterToRegister(amd64.PXOR, x2.register, x1.register) 720 721 c.locationStack.markRegisterUnused(x2.register) 722 c.pushVectorRuntimeValueLocationOnRegister(x1.register) 723 return nil 724 } 725 726 // compileV128Bitselect implements compiler.compileV128Bitselect for amd64. 727 func (c *amd64Compiler) compileV128Bitselect(*wazeroir.OperationV128Bitselect) error { 728 selector := c.locationStack.popV128() 729 if err := c.compileEnsureOnRegister(selector); err != nil { 730 return err 731 } 732 733 x2 := c.locationStack.popV128() 734 if err := c.compileEnsureOnRegister(x2); err != nil { 735 return err 736 } 737 738 x1 := c.locationStack.popV128() 739 if err := c.compileEnsureOnRegister(x1); err != nil { 740 return err 741 } 742 743 // The following logic is equivalent to v128.or(v128.and(v1, selector), v128.and(v2, v128.not(selector))) 744 // See https://github.com/WebAssembly/spec/blob/wg-2.0.draft1/proposals/simd/SIMD.md#bitwise-select 745 c.assembler.CompileRegisterToRegister(amd64.PAND, selector.register, x1.register) 746 c.assembler.CompileRegisterToRegister(amd64.PANDN, x2.register, selector.register) 747 c.assembler.CompileRegisterToRegister(amd64.POR, selector.register, x1.register) 748 749 c.locationStack.markRegisterUnused(x2.register, selector.register) 750 c.pushVectorRuntimeValueLocationOnRegister(x1.register) 751 return nil 752 } 753 754 // compileV128AndNot implements compiler.compileV128AndNot for amd64. 755 func (c *amd64Compiler) compileV128AndNot(*wazeroir.OperationV128AndNot) error { 756 x2 := c.locationStack.popV128() 757 if err := c.compileEnsureOnRegister(x2); err != nil { 758 return err 759 } 760 761 x1 := c.locationStack.popV128() 762 if err := c.compileEnsureOnRegister(x1); err != nil { 763 return err 764 } 765 766 c.assembler.CompileRegisterToRegister(amd64.PANDN, x1.register, x2.register) 767 768 c.locationStack.markRegisterUnused(x1.register) 769 c.pushVectorRuntimeValueLocationOnRegister(x2.register) 770 return nil 771 } 772 773 // compileV128Shr implements compiler.compileV128Shr for amd64. 774 func (c *amd64Compiler) compileV128Shr(o *wazeroir.OperationV128Shr) error { 775 // https://stackoverflow.com/questions/35002937/sse-simd-shift-with-one-byte-element-size-granularity 776 if o.Shape == wazeroir.ShapeI8x16 { 777 return c.compileV128ShrI8x16Impl(o.Signed) 778 } else if o.Shape == wazeroir.ShapeI64x2 && o.Signed { 779 return c.compileV128ShrI64x2SignedImpl() 780 } else { 781 return c.compileV128ShrImpl(o) 782 } 783 } 784 785 // compileV128ShrImpl implements shift right instructions except for i8x16 (logical/arithmetic) and i64x2 (arithmetic). 786 func (c *amd64Compiler) compileV128ShrImpl(o *wazeroir.OperationV128Shr) error { 787 s := c.locationStack.pop() 788 if err := c.compileEnsureOnRegister(s); err != nil { 789 return err 790 } 791 792 x1 := c.locationStack.popV128() 793 if err := c.compileEnsureOnRegister(x1); err != nil { 794 return err 795 } 796 797 vecTmp, err := c.allocateRegister(registerTypeVector) 798 if err != nil { 799 return err 800 } 801 802 var moduleConst int64 803 var shift asm.Instruction 804 switch o.Shape { 805 case wazeroir.ShapeI16x8: 806 moduleConst = 0xf // modulo 16. 807 if o.Signed { 808 shift = amd64.PSRAW 809 } else { 810 shift = amd64.PSRLW 811 } 812 case wazeroir.ShapeI32x4: 813 moduleConst = 0x1f // modulo 32. 814 if o.Signed { 815 shift = amd64.PSRAD 816 } else { 817 shift = amd64.PSRLD 818 } 819 case wazeroir.ShapeI64x2: 820 moduleConst = 0x3f // modulo 64. 821 shift = amd64.PSRLQ 822 } 823 824 gpShiftAmount := s.register 825 c.assembler.CompileConstToRegister(amd64.ANDQ, moduleConst, gpShiftAmount) 826 c.assembler.CompileRegisterToRegister(amd64.MOVL, gpShiftAmount, vecTmp) 827 c.assembler.CompileRegisterToRegister(shift, vecTmp, x1.register) 828 829 c.locationStack.markRegisterUnused(gpShiftAmount) 830 c.pushVectorRuntimeValueLocationOnRegister(x1.register) 831 return nil 832 } 833 834 // compileV128ShrI64x2SignedImpl implements compiler.compileV128Shr for i64x2 signed (arithmetic) shift. 835 // PSRAQ instruction requires AVX, so we emulate it without AVX instructions. https://www.felixcloutier.com/x86/psraw:psrad:psraq 836 func (c *amd64Compiler) compileV128ShrI64x2SignedImpl() error { 837 const shiftCountRegister = amd64.RegCX 838 839 s := c.locationStack.pop() 840 if s.register != shiftCountRegister { 841 // If another value lives on the CX register, we release it to the stack. 842 c.onValueReleaseRegisterToStack(shiftCountRegister) 843 if s.onStack() { 844 s.setRegister(shiftCountRegister) 845 c.compileLoadValueOnStackToRegister(s) 846 } else if s.onConditionalRegister() { 847 c.compileMoveConditionalToGeneralPurposeRegister(s, shiftCountRegister) 848 } else { // already on register. 849 old := s.register 850 c.assembler.CompileRegisterToRegister(amd64.MOVL, old, shiftCountRegister) 851 s.setRegister(shiftCountRegister) 852 c.locationStack.markRegisterUnused(old) 853 } 854 } 855 856 c.locationStack.markRegisterUsed(shiftCountRegister) 857 tmp, err := c.allocateRegister(registerTypeGeneralPurpose) 858 if err != nil { 859 return err 860 } 861 862 x1 := c.locationStack.popV128() 863 if err := c.compileEnsureOnRegister(x1); err != nil { 864 return err 865 } 866 867 // Extract each lane into tmp, execute SHR on tmp, and write it back to the lane. 868 c.assembler.CompileRegisterToRegisterWithArg(amd64.PEXTRQ, x1.register, tmp, 0) 869 c.assembler.CompileRegisterToRegister(amd64.SARQ, shiftCountRegister, tmp) 870 c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRQ, tmp, x1.register, 0) 871 c.assembler.CompileRegisterToRegisterWithArg(amd64.PEXTRQ, x1.register, tmp, 1) 872 c.assembler.CompileRegisterToRegister(amd64.SARQ, shiftCountRegister, tmp) 873 c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRQ, tmp, x1.register, 1) 874 875 c.locationStack.markRegisterUnused(shiftCountRegister) 876 c.pushVectorRuntimeValueLocationOnRegister(x1.register) 877 return nil 878 } 879 880 // i8x16LogicalSHRMaskTable is necessary for emulating non-existent packed bytes logical right shifts on amd64. 881 // The mask is applied after performing packed word shifts on the value to clear out the unnecessary bits. 882 var i8x16LogicalSHRMaskTable = [8 * 16]byte{ // (the number of possible shift amount 0, 1, ..., 7.) * 16 bytes. 883 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // for 0 shift 884 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // for 1 shift 885 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, // for 2 shift 886 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, // for 3 shift 887 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, // for 4 shift 888 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, // for 5 shift 889 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, // for 6 shift 890 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, // for 7 shift 891 } 892 893 // compileV128ShrI64x2SignedImpl implements compiler.compileV128Shr for i8x16 signed logical/arithmetic shifts. 894 // amd64 doesn't have packed byte shifts, so we need this special casing. 895 // See https://stackoverflow.com/questions/35002937/sse-simd-shift-with-one-byte-element-size-granularity 896 func (c *amd64Compiler) compileV128ShrI8x16Impl(signed bool) error { 897 s := c.locationStack.pop() 898 if err := c.compileEnsureOnRegister(s); err != nil { 899 return err 900 } 901 902 v := c.locationStack.popV128() 903 if err := c.compileEnsureOnRegister(v); err != nil { 904 return err 905 } 906 907 vecTmp, err := c.allocateRegister(registerTypeVector) 908 if err != nil { 909 return err 910 } 911 912 gpShiftAmount := s.register 913 c.assembler.CompileConstToRegister(amd64.ANDQ, 0x7, gpShiftAmount) // mod 8. 914 915 if signed { 916 c.locationStack.markRegisterUsed(vecTmp) 917 vecTmp2, err := c.allocateRegister(registerTypeVector) 918 if err != nil { 919 return err 920 } 921 922 vreg := v.register 923 924 // Copy the value from v.register to vecTmp. 925 c.assembler.CompileRegisterToRegister(amd64.MOVDQA, vreg, vecTmp) 926 927 // Assuming that we have 928 // vreg = [b1, ..., b16] 929 // vecTmp = [b1, ..., b16] 930 // at this point, then we use PUNPCKLBW and PUNPCKHBW to produce: 931 // vreg = [b1, b1, b2, b2, ..., b8, b8] 932 // vecTmp = [b9, b9, b10, b10, ..., b16, b16] 933 c.assembler.CompileRegisterToRegister(amd64.PUNPCKLBW, vreg, vreg) 934 c.assembler.CompileRegisterToRegister(amd64.PUNPCKHBW, vecTmp, vecTmp) 935 936 // Adding 8 to the shift amount, and then move the amount to vecTmp2. 937 c.assembler.CompileConstToRegister(amd64.ADDQ, 0x8, gpShiftAmount) 938 c.assembler.CompileRegisterToRegister(amd64.MOVL, gpShiftAmount, vecTmp2) 939 940 // Perform the word packed arithmetic right shifts on vreg and vecTmp. 941 // This changes these two registers as: 942 // vreg = [xxx, b1 >> s, xxx, b2 >> s, ..., xxx, b8 >> s] 943 // vecTmp = [xxx, b9 >> s, xxx, b10 >> s, ..., xxx, b16 >> s] 944 // where xxx is 1 or 0 depending on each byte's sign, and ">>" is the arithmetic shift on a byte. 945 c.assembler.CompileRegisterToRegister(amd64.PSRAW, vecTmp2, vreg) 946 c.assembler.CompileRegisterToRegister(amd64.PSRAW, vecTmp2, vecTmp) 947 948 // Finally, we can get the result by packing these two word vectors. 949 c.assembler.CompileRegisterToRegister(amd64.PACKSSWB, vecTmp, vreg) 950 951 c.locationStack.markRegisterUnused(gpShiftAmount, vecTmp) 952 c.pushVectorRuntimeValueLocationOnRegister(vreg) 953 } else { 954 c.assembler.CompileRegisterToRegister(amd64.MOVL, gpShiftAmount, vecTmp) 955 // amd64 doesn't have packed byte shifts, so we packed word shift here, and then mark-out 956 // the unnecessary bits below. 957 c.assembler.CompileRegisterToRegister(amd64.PSRLW, vecTmp, v.register) 958 959 gpTmp, err := c.allocateRegister(registerTypeGeneralPurpose) 960 if err != nil { 961 return err 962 } 963 964 // Read the initial address of the mask table into gpTmp register. 965 err = c.assembler.CompileStaticConstToRegister(amd64.LEAQ, asm.NewStaticConst(i8x16LogicalSHRMaskTable[:]), gpTmp) 966 if err != nil { 967 return err 968 } 969 970 // We have to get the mask according to the shift amount, so we first have to do 971 // gpShiftAmount << 4 = gpShiftAmount*16 to get the initial offset of the mask (16 is the size of each mask in bytes). 972 c.assembler.CompileConstToRegister(amd64.SHLQ, 4, gpShiftAmount) 973 974 // Now ready to read the content of the mask into the vecTmp. 975 c.assembler.CompileMemoryWithIndexToRegister(amd64.MOVDQU, 976 gpTmp, 0, gpShiftAmount, 1, 977 vecTmp, 978 ) 979 980 // Finally, clear out the unnecessary 981 c.assembler.CompileRegisterToRegister(amd64.PAND, vecTmp, v.register) 982 983 c.locationStack.markRegisterUnused(gpShiftAmount) 984 c.pushVectorRuntimeValueLocationOnRegister(v.register) 985 } 986 return nil 987 } 988 989 // i8x16SHLMaskTable is necessary for emulating non-existent packed bytes left shifts on amd64. 990 // The mask is applied after performing packed word shifts on the value to clear out the unnecessary bits. 991 var i8x16SHLMaskTable = [8 * 16]byte{ // (the number of possible shift amount 0, 1, ..., 7.) * 16 bytes. 992 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // for 0 shift 993 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, // for 1 shift 994 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, // for 2 shift 995 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, // for 3 shift 996 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // for 4 shift 997 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, // for 5 shift 998 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, // for 6 shift 999 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, // for 7 shift 1000 } 1001 1002 // compileV128Shl implements compiler.compileV128Shl for amd64. 1003 func (c *amd64Compiler) compileV128Shl(o *wazeroir.OperationV128Shl) error { 1004 s := c.locationStack.pop() 1005 if err := c.compileEnsureOnRegister(s); err != nil { 1006 return err 1007 } 1008 1009 x1 := c.locationStack.popV128() 1010 if err := c.compileEnsureOnRegister(x1); err != nil { 1011 return err 1012 } 1013 1014 vecTmp, err := c.allocateRegister(registerTypeVector) 1015 if err != nil { 1016 return err 1017 } 1018 1019 var modulo int64 1020 var shift asm.Instruction 1021 switch o.Shape { 1022 case wazeroir.ShapeI8x16: 1023 modulo = 0x7 // modulo 8. 1024 // x86 doesn't have packed bytes shift, so we use PSLLW and mask-out the redundant bits. 1025 // See https://stackoverflow.com/questions/35002937/sse-simd-shift-with-one-byte-element-size-granularity 1026 shift = amd64.PSLLW 1027 case wazeroir.ShapeI16x8: 1028 modulo = 0xf // modulo 16. 1029 shift = amd64.PSLLW 1030 case wazeroir.ShapeI32x4: 1031 modulo = 0x1f // modulo 32. 1032 shift = amd64.PSLLD 1033 case wazeroir.ShapeI64x2: 1034 modulo = 0x3f // modulo 64. 1035 shift = amd64.PSLLQ 1036 } 1037 1038 gpShiftAmount := s.register 1039 c.assembler.CompileConstToRegister(amd64.ANDQ, modulo, gpShiftAmount) 1040 c.assembler.CompileRegisterToRegister(amd64.MOVL, gpShiftAmount, vecTmp) 1041 c.assembler.CompileRegisterToRegister(shift, vecTmp, x1.register) 1042 1043 if o.Shape == wazeroir.ShapeI8x16 { 1044 gpTmp, err := c.allocateRegister(registerTypeGeneralPurpose) 1045 if err != nil { 1046 return err 1047 } 1048 1049 // Read the initial address of the mask table into gpTmp register. 1050 err = c.assembler.CompileStaticConstToRegister(amd64.LEAQ, asm.NewStaticConst(i8x16SHLMaskTable[:]), gpTmp) 1051 if err != nil { 1052 return err 1053 } 1054 1055 // We have to get the mask according to the shift amount, so we first have to do 1056 // gpShiftAmount << 4 = gpShiftAmount*16 to get the initial offset of the mask (16 is the size of each mask in bytes). 1057 c.assembler.CompileConstToRegister(amd64.SHLQ, 4, gpShiftAmount) 1058 1059 // Now ready to read the content of the mask into the vecTmp. 1060 c.assembler.CompileMemoryWithIndexToRegister(amd64.MOVDQU, 1061 gpTmp, 0, gpShiftAmount, 1, 1062 vecTmp, 1063 ) 1064 1065 // Finally, clear out the unnecessary 1066 c.assembler.CompileRegisterToRegister(amd64.PAND, vecTmp, x1.register) 1067 } 1068 1069 c.locationStack.markRegisterUnused(gpShiftAmount) 1070 c.pushVectorRuntimeValueLocationOnRegister(x1.register) 1071 return nil 1072 } 1073 1074 // compileV128Cmp implements compiler.compileV128Cmp for amd64. 1075 func (c *amd64Compiler) compileV128Cmp(o *wazeroir.OperationV128Cmp) error { 1076 x2 := c.locationStack.popV128() 1077 if err := c.compileEnsureOnRegister(x2); err != nil { 1078 return err 1079 } 1080 1081 x1 := c.locationStack.popV128() 1082 if err := c.compileEnsureOnRegister(x1); err != nil { 1083 return err 1084 } 1085 1086 const ( 1087 // See https://www.felixcloutier.com/x86/cmppd and https://www.felixcloutier.com/x86/cmpps 1088 floatEqualArg = 0 1089 floatLessThanArg = 1 1090 floatLessThanOrEqualArg = 2 1091 floatNotEqualARg = 4 1092 ) 1093 1094 x1Reg, x2Reg, result := x1.register, x2.register, asm.NilRegister 1095 switch o.Type { 1096 case wazeroir.V128CmpTypeF32x4Eq: 1097 c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPS, x2Reg, x1Reg, floatEqualArg) 1098 result = x1Reg 1099 case wazeroir.V128CmpTypeF32x4Ne: 1100 c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPS, x2Reg, x1Reg, floatNotEqualARg) 1101 result = x1Reg 1102 case wazeroir.V128CmpTypeF32x4Lt: 1103 c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPS, x2Reg, x1Reg, floatLessThanArg) 1104 result = x1Reg 1105 case wazeroir.V128CmpTypeF32x4Gt: 1106 // Without AVX, there's no float Gt instruction, so we swap the register and use Lt instead. 1107 c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPS, x1Reg, x2Reg, floatLessThanArg) 1108 result = x2Reg 1109 case wazeroir.V128CmpTypeF32x4Le: 1110 c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPS, x2Reg, x1Reg, floatLessThanOrEqualArg) 1111 result = x1Reg 1112 case wazeroir.V128CmpTypeF32x4Ge: 1113 // Without AVX, there's no float Ge instruction, so we swap the register and use Le instead. 1114 c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPS, x1Reg, x2Reg, floatLessThanOrEqualArg) 1115 result = x2Reg 1116 case wazeroir.V128CmpTypeF64x2Eq: 1117 c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPD, x2Reg, x1Reg, floatEqualArg) 1118 result = x1Reg 1119 case wazeroir.V128CmpTypeF64x2Ne: 1120 c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPD, x2Reg, x1Reg, floatNotEqualARg) 1121 result = x1Reg 1122 case wazeroir.V128CmpTypeF64x2Lt: 1123 c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPD, x2Reg, x1Reg, floatLessThanArg) 1124 result = x1Reg 1125 case wazeroir.V128CmpTypeF64x2Gt: 1126 // Without AVX, there's no float Gt instruction, so we swap the register and use Lt instead. 1127 c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPD, x1Reg, x2Reg, floatLessThanArg) 1128 result = x2Reg 1129 case wazeroir.V128CmpTypeF64x2Le: 1130 c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPD, x2Reg, x1Reg, floatLessThanOrEqualArg) 1131 result = x1Reg 1132 case wazeroir.V128CmpTypeF64x2Ge: 1133 // Without AVX, there's no float Ge instruction, so we swap the register and use Le instead. 1134 c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPD, x1Reg, x2Reg, floatLessThanOrEqualArg) 1135 result = x2Reg 1136 case wazeroir.V128CmpTypeI8x16Eq: 1137 c.assembler.CompileRegisterToRegister(amd64.PCMPEQB, x2Reg, x1Reg) 1138 result = x1Reg 1139 case wazeroir.V128CmpTypeI8x16Ne: 1140 c.assembler.CompileRegisterToRegister(amd64.PCMPEQB, x2Reg, x1Reg) 1141 // Set all bits on x2Reg register. 1142 c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, x2Reg, x2Reg) 1143 // Swap the bits on x1Reg register. 1144 c.assembler.CompileRegisterToRegister(amd64.PXOR, x2Reg, x1Reg) 1145 result = x1Reg 1146 case wazeroir.V128CmpTypeI8x16LtS: 1147 c.assembler.CompileRegisterToRegister(amd64.PCMPGTB, x1Reg, x2Reg) 1148 result = x2Reg 1149 case wazeroir.V128CmpTypeI8x16LtU, wazeroir.V128CmpTypeI8x16GtU: 1150 // Take the unsigned min/max values on each byte on x1 and x2 onto x1Reg. 1151 if o.Type == wazeroir.V128CmpTypeI8x16LtU { 1152 c.assembler.CompileRegisterToRegister(amd64.PMINUB, x2Reg, x1Reg) 1153 } else { 1154 c.assembler.CompileRegisterToRegister(amd64.PMAXUB, x2Reg, x1Reg) 1155 } 1156 c.assembler.CompileRegisterToRegister(amd64.PCMPEQB, x2Reg, x1Reg) 1157 // Set all bits on x2Reg register. 1158 c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, x2Reg, x2Reg) 1159 // Swap the bits on x2Reg register. 1160 c.assembler.CompileRegisterToRegister(amd64.PXOR, x2Reg, x1Reg) 1161 result = x1Reg 1162 case wazeroir.V128CmpTypeI8x16GtS: 1163 c.assembler.CompileRegisterToRegister(amd64.PCMPGTB, x2Reg, x1Reg) 1164 result = x1Reg 1165 case wazeroir.V128CmpTypeI8x16LeS, wazeroir.V128CmpTypeI8x16LeU: 1166 tmp, err := c.allocateRegister(registerTypeVector) 1167 if err != nil { 1168 return err 1169 } 1170 // Copy the value on the src to tmp. 1171 c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x1Reg, tmp) 1172 if o.Type == wazeroir.V128CmpTypeI8x16LeS { 1173 c.assembler.CompileRegisterToRegister(amd64.PMINSB, x2Reg, tmp) 1174 } else { 1175 c.assembler.CompileRegisterToRegister(amd64.PMINUB, x2Reg, tmp) 1176 } 1177 c.assembler.CompileRegisterToRegister(amd64.PCMPEQB, tmp, x1Reg) 1178 result = x1Reg 1179 case wazeroir.V128CmpTypeI8x16GeS, wazeroir.V128CmpTypeI8x16GeU: 1180 tmp, err := c.allocateRegister(registerTypeVector) 1181 if err != nil { 1182 return err 1183 } 1184 c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x1Reg, tmp) 1185 if o.Type == wazeroir.V128CmpTypeI8x16GeS { 1186 c.assembler.CompileRegisterToRegister(amd64.PMAXSB, x2Reg, tmp) 1187 } else { 1188 c.assembler.CompileRegisterToRegister(amd64.PMAXUB, x2Reg, tmp) 1189 } 1190 c.assembler.CompileRegisterToRegister(amd64.PCMPEQB, tmp, x1Reg) 1191 result = x1Reg 1192 case wazeroir.V128CmpTypeI16x8Eq: 1193 c.assembler.CompileRegisterToRegister(amd64.PCMPEQW, x2Reg, x1Reg) 1194 result = x1Reg 1195 case wazeroir.V128CmpTypeI16x8Ne: 1196 c.assembler.CompileRegisterToRegister(amd64.PCMPEQW, x2Reg, x1Reg) 1197 // Set all bits on x2Reg register. 1198 c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, x2Reg, x2Reg) 1199 // Swap the bits on x1Reg register. 1200 c.assembler.CompileRegisterToRegister(amd64.PXOR, x2Reg, x1Reg) 1201 result = x1Reg 1202 case wazeroir.V128CmpTypeI16x8LtS: 1203 c.assembler.CompileRegisterToRegister(amd64.PCMPGTW, x1Reg, x2Reg) 1204 result = x2Reg 1205 case wazeroir.V128CmpTypeI16x8LtU, wazeroir.V128CmpTypeI16x8GtU: 1206 // Take the unsigned min/max values on each byte on x1 and x2 onto x1Reg. 1207 if o.Type == wazeroir.V128CmpTypeI16x8LtU { 1208 c.assembler.CompileRegisterToRegister(amd64.PMINUW, x2Reg, x1Reg) 1209 } else { 1210 c.assembler.CompileRegisterToRegister(amd64.PMAXUW, x2Reg, x1Reg) 1211 } 1212 c.assembler.CompileRegisterToRegister(amd64.PCMPEQW, x2Reg, x1Reg) 1213 // Set all bits on x2Reg register. 1214 c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, x2Reg, x2Reg) 1215 // Swap the bits on x2Reg register. 1216 c.assembler.CompileRegisterToRegister(amd64.PXOR, x2Reg, x1Reg) 1217 result = x1Reg 1218 case wazeroir.V128CmpTypeI16x8GtS: 1219 c.assembler.CompileRegisterToRegister(amd64.PCMPGTW, x2Reg, x1Reg) 1220 result = x1Reg 1221 case wazeroir.V128CmpTypeI16x8LeS, wazeroir.V128CmpTypeI16x8LeU: 1222 tmp, err := c.allocateRegister(registerTypeVector) 1223 if err != nil { 1224 return err 1225 } 1226 // Copy the value on the src to tmp. 1227 c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x1Reg, tmp) 1228 if o.Type == wazeroir.V128CmpTypeI16x8LeS { 1229 c.assembler.CompileRegisterToRegister(amd64.PMINSW, x2Reg, tmp) 1230 } else { 1231 c.assembler.CompileRegisterToRegister(amd64.PMINUW, x2Reg, tmp) 1232 } 1233 c.assembler.CompileRegisterToRegister(amd64.PCMPEQW, tmp, x1Reg) 1234 result = x1Reg 1235 case wazeroir.V128CmpTypeI16x8GeS, wazeroir.V128CmpTypeI16x8GeU: 1236 tmp, err := c.allocateRegister(registerTypeVector) 1237 if err != nil { 1238 return err 1239 } 1240 c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x1Reg, tmp) 1241 if o.Type == wazeroir.V128CmpTypeI16x8GeS { 1242 c.assembler.CompileRegisterToRegister(amd64.PMAXSW, x2Reg, tmp) 1243 } else { 1244 c.assembler.CompileRegisterToRegister(amd64.PMAXUW, x2Reg, tmp) 1245 } 1246 c.assembler.CompileRegisterToRegister(amd64.PCMPEQW, tmp, x1Reg) 1247 result = x1Reg 1248 case wazeroir.V128CmpTypeI32x4Eq: 1249 c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, x2Reg, x1Reg) 1250 result = x1Reg 1251 case wazeroir.V128CmpTypeI32x4Ne: 1252 c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, x2Reg, x1Reg) 1253 // Set all bits on x2Reg register. 1254 c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, x2Reg, x2Reg) 1255 // Swap the bits on x1Reg register. 1256 c.assembler.CompileRegisterToRegister(amd64.PXOR, x2Reg, x1Reg) 1257 result = x1Reg 1258 case wazeroir.V128CmpTypeI32x4LtS: 1259 c.assembler.CompileRegisterToRegister(amd64.PCMPGTD, x1Reg, x2Reg) 1260 result = x2Reg 1261 case wazeroir.V128CmpTypeI32x4LtU, wazeroir.V128CmpTypeI32x4GtU: 1262 // Take the unsigned min/max values on each byte on x1 and x2 onto x1Reg. 1263 if o.Type == wazeroir.V128CmpTypeI32x4LtU { 1264 c.assembler.CompileRegisterToRegister(amd64.PMINUD, x2Reg, x1Reg) 1265 } else { 1266 c.assembler.CompileRegisterToRegister(amd64.PMAXUD, x2Reg, x1Reg) 1267 } 1268 c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, x2Reg, x1Reg) 1269 // Set all bits on x2Reg register. 1270 c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, x2Reg, x2Reg) 1271 // Swap the bits on x2Reg register. 1272 c.assembler.CompileRegisterToRegister(amd64.PXOR, x2Reg, x1Reg) 1273 result = x1Reg 1274 case wazeroir.V128CmpTypeI32x4GtS: 1275 c.assembler.CompileRegisterToRegister(amd64.PCMPGTD, x2Reg, x1Reg) 1276 result = x1Reg 1277 case wazeroir.V128CmpTypeI32x4LeS, wazeroir.V128CmpTypeI32x4LeU: 1278 tmp, err := c.allocateRegister(registerTypeVector) 1279 if err != nil { 1280 return err 1281 } 1282 // Copy the value on the src to tmp. 1283 c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x1Reg, tmp) 1284 if o.Type == wazeroir.V128CmpTypeI32x4LeS { 1285 c.assembler.CompileRegisterToRegister(amd64.PMINSD, x2Reg, tmp) 1286 } else { 1287 c.assembler.CompileRegisterToRegister(amd64.PMINUD, x2Reg, tmp) 1288 } 1289 c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, tmp, x1Reg) 1290 result = x1Reg 1291 case wazeroir.V128CmpTypeI32x4GeS, wazeroir.V128CmpTypeI32x4GeU: 1292 tmp, err := c.allocateRegister(registerTypeVector) 1293 if err != nil { 1294 return err 1295 } 1296 c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x1Reg, tmp) 1297 if o.Type == wazeroir.V128CmpTypeI32x4GeS { 1298 c.assembler.CompileRegisterToRegister(amd64.PMAXSD, x2Reg, tmp) 1299 } else { 1300 c.assembler.CompileRegisterToRegister(amd64.PMAXUD, x2Reg, tmp) 1301 } 1302 c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, tmp, x1Reg) 1303 result = x1Reg 1304 case wazeroir.V128CmpTypeI64x2Eq: 1305 c.assembler.CompileRegisterToRegister(amd64.PCMPEQQ, x2Reg, x1Reg) 1306 result = x1Reg 1307 case wazeroir.V128CmpTypeI64x2Ne: 1308 c.assembler.CompileRegisterToRegister(amd64.PCMPEQQ, x2Reg, x1Reg) 1309 // Set all bits on x2Reg register. 1310 c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, x2Reg, x2Reg) 1311 // Swap the bits on x1Reg register. 1312 c.assembler.CompileRegisterToRegister(amd64.PXOR, x2Reg, x1Reg) 1313 result = x1Reg 1314 case wazeroir.V128CmpTypeI64x2LtS: 1315 c.assembler.CompileRegisterToRegister(amd64.PCMPGTQ, x1Reg, x2Reg) 1316 result = x2Reg 1317 case wazeroir.V128CmpTypeI64x2GtS: 1318 c.assembler.CompileRegisterToRegister(amd64.PCMPGTQ, x2Reg, x1Reg) 1319 result = x1Reg 1320 case wazeroir.V128CmpTypeI64x2LeS: 1321 c.assembler.CompileRegisterToRegister(amd64.PCMPGTQ, x2Reg, x1Reg) 1322 // Set all bits on x2Reg register. 1323 c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, x2Reg, x2Reg) 1324 // Swap the bits on x1Reg register. 1325 c.assembler.CompileRegisterToRegister(amd64.PXOR, x2Reg, x1Reg) 1326 result = x1Reg 1327 case wazeroir.V128CmpTypeI64x2GeS: 1328 c.assembler.CompileRegisterToRegister(amd64.PCMPGTQ, x1Reg, x2Reg) 1329 // Set all bits on x1Reg register. 1330 c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, x1Reg, x1Reg) 1331 // Swap the bits on x2Reg register. 1332 c.assembler.CompileRegisterToRegister(amd64.PXOR, x1Reg, x2Reg) 1333 result = x2Reg 1334 } 1335 1336 c.locationStack.markRegisterUnused(x1Reg, x2Reg) 1337 c.pushVectorRuntimeValueLocationOnRegister(result) 1338 return nil 1339 } 1340 1341 // compileV128AddSat implements compiler.compileV128AddSat for amd64. 1342 func (c *amd64Compiler) compileV128AddSat(o *wazeroir.OperationV128AddSat) error { 1343 var inst asm.Instruction 1344 switch o.Shape { 1345 case wazeroir.ShapeI8x16: 1346 if o.Signed { 1347 inst = amd64.PADDSB 1348 } else { 1349 inst = amd64.PADDUSB 1350 } 1351 case wazeroir.ShapeI16x8: 1352 if o.Signed { 1353 inst = amd64.PADDSW 1354 } else { 1355 inst = amd64.PADDUSW 1356 } 1357 } 1358 1359 x2 := c.locationStack.popV128() 1360 if err := c.compileEnsureOnRegister(x2); err != nil { 1361 return err 1362 } 1363 1364 x1 := c.locationStack.popV128() 1365 if err := c.compileEnsureOnRegister(x1); err != nil { 1366 return err 1367 } 1368 1369 c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register) 1370 1371 c.locationStack.markRegisterUnused(x2.register) 1372 c.pushVectorRuntimeValueLocationOnRegister(x1.register) 1373 return nil 1374 } 1375 1376 // compileV128SubSat implements compiler.compileV128SubSat for amd64. 1377 func (c *amd64Compiler) compileV128SubSat(o *wazeroir.OperationV128SubSat) error { 1378 var inst asm.Instruction 1379 switch o.Shape { 1380 case wazeroir.ShapeI8x16: 1381 if o.Signed { 1382 inst = amd64.PSUBSB 1383 } else { 1384 inst = amd64.PSUBUSB 1385 } 1386 case wazeroir.ShapeI16x8: 1387 if o.Signed { 1388 inst = amd64.PSUBSW 1389 } else { 1390 inst = amd64.PSUBUSW 1391 } 1392 } 1393 1394 x2 := c.locationStack.popV128() 1395 if err := c.compileEnsureOnRegister(x2); err != nil { 1396 return err 1397 } 1398 1399 x1 := c.locationStack.popV128() 1400 if err := c.compileEnsureOnRegister(x1); err != nil { 1401 return err 1402 } 1403 1404 c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register) 1405 1406 c.locationStack.markRegisterUnused(x2.register) 1407 c.pushVectorRuntimeValueLocationOnRegister(x1.register) 1408 return nil 1409 } 1410 1411 // compileV128Mul implements compiler.compileV128Mul for amd64. 1412 func (c *amd64Compiler) compileV128Mul(o *wazeroir.OperationV128Mul) error { 1413 var inst asm.Instruction 1414 switch o.Shape { 1415 case wazeroir.ShapeI16x8: 1416 inst = amd64.PMULLW 1417 case wazeroir.ShapeI32x4: 1418 inst = amd64.PMULLD 1419 case wazeroir.ShapeI64x2: 1420 return c.compileV128MulI64x2() 1421 case wazeroir.ShapeF32x4: 1422 inst = amd64.MULPS 1423 case wazeroir.ShapeF64x2: 1424 inst = amd64.MULPD 1425 } 1426 1427 x2 := c.locationStack.popV128() 1428 if err := c.compileEnsureOnRegister(x2); err != nil { 1429 return err 1430 } 1431 1432 x1 := c.locationStack.popV128() 1433 if err := c.compileEnsureOnRegister(x1); err != nil { 1434 return err 1435 } 1436 1437 c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register) 1438 1439 c.locationStack.markRegisterUnused(x2.register) 1440 c.pushVectorRuntimeValueLocationOnRegister(x1.register) 1441 return nil 1442 } 1443 1444 // compileV128MulI64x2 implements V128Mul for i64x2. 1445 func (c *amd64Compiler) compileV128MulI64x2() error { 1446 x2 := c.locationStack.popV128() 1447 if err := c.compileEnsureOnRegister(x2); err != nil { 1448 return err 1449 } 1450 1451 x1 := c.locationStack.popV128() 1452 if err := c.compileEnsureOnRegister(x1); err != nil { 1453 return err 1454 } 1455 1456 x1r, x2r := x1.register, x2.register 1457 1458 tmp1, err := c.allocateRegister(registerTypeVector) 1459 if err != nil { 1460 return err 1461 } 1462 1463 c.locationStack.markRegisterUsed(tmp1) 1464 1465 tmp2, err := c.allocateRegister(registerTypeVector) 1466 if err != nil { 1467 return err 1468 } 1469 1470 // Assuming that we have 1471 // x1r = [p1, p2] = [p1_lo, p1_hi, p2_lo, p2_high] 1472 // x2r = [q1, q2] = [q1_lo, q1_hi, q2_lo, q2_high] 1473 // where pN and qN are 64-bit (quad word) lane, and pN_lo, pN_hi, qN_lo and qN_hi are 32-bit (double word) lane. 1474 1475 // Copy x1's value into tmp1. 1476 c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x1r, tmp1) 1477 // And do the logical right shift by 32-bit on tmp1, which makes tmp1 = [0, p1_high, 0, p2_high] 1478 c.assembler.CompileConstToRegister(amd64.PSRLQ, 32, tmp1) 1479 1480 // Execute "pmuludq x2r,tmp1", which makes tmp1 = [p1_high*q1_lo, p2_high*q2_lo] where each lane is 64-bit. 1481 c.assembler.CompileRegisterToRegister(amd64.PMULUDQ, x2r, tmp1) 1482 1483 // Copy x2's value into tmp2. 1484 c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x2r, tmp2) 1485 // And do the logical right shift by 32-bit on tmp2, which makes tmp2 = [0, q1_high, 0, q2_high] 1486 c.assembler.CompileConstToRegister(amd64.PSRLQ, 32, tmp2) 1487 1488 // Execute "pmuludq x1r,tmp2", which makes tmp2 = [p1_lo*q1_high, p2_lo*q2_high] where each lane is 64-bit. 1489 c.assembler.CompileRegisterToRegister(amd64.PMULUDQ, x1r, tmp2) 1490 1491 // Adds tmp1 and tmp2 and do the logical left shift by 32-bit, 1492 // which makes tmp1 = [(p1_lo*q1_high+p1_high*q1_lo)<<32, (p2_lo*q2_high+p2_high*q2_lo)<<32] 1493 c.assembler.CompileRegisterToRegister(amd64.PADDQ, tmp2, tmp1) 1494 c.assembler.CompileConstToRegister(amd64.PSLLQ, 32, tmp1) 1495 1496 // Execute "pmuludq x2r,x1r", which makes x1r = [p1_lo*q1_lo, p2_lo*q2_lo] where each lane is 64-bit. 1497 c.assembler.CompileRegisterToRegister(amd64.PMULUDQ, x2r, x1r) 1498 1499 // Finally, we get the result by adding x1r and tmp1, 1500 // which makes x1r = [(p1_lo*q1_high+p1_high*q1_lo)<<32+p1_lo*q1_lo, (p2_lo*q2_high+p2_high*q2_lo)<<32+p2_lo*q2_lo] 1501 c.assembler.CompileRegisterToRegister(amd64.PADDQ, tmp1, x1r) 1502 1503 c.locationStack.markRegisterUnused(x2r, tmp1) 1504 c.pushVectorRuntimeValueLocationOnRegister(x1r) 1505 return nil 1506 } 1507 1508 // compileV128Div implements compiler.compileV128Div for amd64. 1509 func (c *amd64Compiler) compileV128Div(o *wazeroir.OperationV128Div) error { 1510 x2 := c.locationStack.popV128() 1511 if err := c.compileEnsureOnRegister(x2); err != nil { 1512 return err 1513 } 1514 1515 x1 := c.locationStack.popV128() 1516 if err := c.compileEnsureOnRegister(x1); err != nil { 1517 return err 1518 } 1519 1520 var inst asm.Instruction 1521 switch o.Shape { 1522 case wazeroir.ShapeF32x4: 1523 inst = amd64.DIVPS 1524 case wazeroir.ShapeF64x2: 1525 inst = amd64.DIVPD 1526 } 1527 1528 c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register) 1529 1530 c.locationStack.markRegisterUnused(x2.register) 1531 c.pushVectorRuntimeValueLocationOnRegister(x1.register) 1532 return nil 1533 } 1534 1535 // compileV128Neg implements compiler.compileV128Neg for amd64. 1536 func (c *amd64Compiler) compileV128Neg(o *wazeroir.OperationV128Neg) error { 1537 if o.Shape <= wazeroir.ShapeI64x2 { 1538 return c.compileV128NegInt(o.Shape) 1539 } else { 1540 return c.compileV128NegFloat(o.Shape) 1541 } 1542 } 1543 1544 // compileV128NegInt implements compiler.compileV128Neg for integer lanes. 1545 func (c *amd64Compiler) compileV128NegInt(s wazeroir.Shape) error { 1546 v := c.locationStack.popV128() 1547 if err := c.compileEnsureOnRegister(v); err != nil { 1548 return err 1549 } 1550 1551 result, err := c.allocateRegister(registerTypeVector) 1552 if err != nil { 1553 return err 1554 } 1555 1556 var subInst asm.Instruction 1557 switch s { 1558 case wazeroir.ShapeI8x16: 1559 subInst = amd64.PSUBB 1560 case wazeroir.ShapeI16x8: 1561 subInst = amd64.PSUBW 1562 case wazeroir.ShapeI32x4: 1563 subInst = amd64.PSUBD 1564 case wazeroir.ShapeI64x2: 1565 subInst = amd64.PSUBQ 1566 } 1567 1568 c.assembler.CompileRegisterToRegister(amd64.PXOR, result, result) 1569 c.assembler.CompileRegisterToRegister(subInst, v.register, result) 1570 1571 c.locationStack.markRegisterUnused(v.register) 1572 c.pushVectorRuntimeValueLocationOnRegister(result) 1573 return nil 1574 } 1575 1576 // compileV128NegInt implements compiler.compileV128Neg for float lanes. 1577 func (c *amd64Compiler) compileV128NegFloat(s wazeroir.Shape) error { 1578 v := c.locationStack.popV128() 1579 if err := c.compileEnsureOnRegister(v); err != nil { 1580 return err 1581 } 1582 1583 tmp, err := c.allocateRegister(registerTypeVector) 1584 if err != nil { 1585 return err 1586 } 1587 1588 var leftShiftInst, xorInst asm.Instruction 1589 var leftShiftAmount asm.ConstantValue 1590 if s == wazeroir.ShapeF32x4 { 1591 leftShiftInst, leftShiftAmount, xorInst = amd64.PSLLD, 31, amd64.XORPS 1592 } else { 1593 leftShiftInst, leftShiftAmount, xorInst = amd64.PSLLQ, 63, amd64.XORPD 1594 } 1595 1596 // Clear all bits on tmp. 1597 c.assembler.CompileRegisterToRegister(amd64.XORPS, tmp, tmp) 1598 // Set all bits on tmp by CMPPD with arg=0 (== pseudo CMPEQPD instruction). 1599 // See https://www.felixcloutier.com/x86/cmpps 1600 // 1601 // Note: if we do not clear all the bits ^ with XORPS, this might end up not setting ones on some lane 1602 // if the lane is NaN. 1603 c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPD, tmp, tmp, 0x8) 1604 // Do the left shift on each lane to set only the most significant bit in each. 1605 c.assembler.CompileConstToRegister(leftShiftInst, leftShiftAmount, tmp) 1606 // Get the negated result by XOR on each lane with tmp. 1607 c.assembler.CompileRegisterToRegister(xorInst, tmp, v.register) 1608 1609 c.pushVectorRuntimeValueLocationOnRegister(v.register) 1610 return nil 1611 } 1612 1613 // compileV128Sqrt implements compiler.compileV128Sqrt for amd64. 1614 func (c *amd64Compiler) compileV128Sqrt(o *wazeroir.OperationV128Sqrt) error { 1615 v := c.locationStack.popV128() 1616 if err := c.compileEnsureOnRegister(v); err != nil { 1617 return err 1618 } 1619 1620 var inst asm.Instruction 1621 switch o.Shape { 1622 case wazeroir.ShapeF64x2: 1623 inst = amd64.SQRTPD 1624 case wazeroir.ShapeF32x4: 1625 inst = amd64.SQRTPS 1626 } 1627 1628 c.assembler.CompileRegisterToRegister(inst, v.register, v.register) 1629 c.pushVectorRuntimeValueLocationOnRegister(v.register) 1630 return nil 1631 } 1632 1633 // compileV128Abs implements compiler.compileV128Abs for amd64. 1634 func (c *amd64Compiler) compileV128Abs(o *wazeroir.OperationV128Abs) error { 1635 if o.Shape == wazeroir.ShapeI64x2 { 1636 return c.compileV128AbsI64x2() 1637 } 1638 1639 v := c.locationStack.popV128() 1640 if err := c.compileEnsureOnRegister(v); err != nil { 1641 return err 1642 } 1643 1644 result := v.register 1645 switch o.Shape { 1646 case wazeroir.ShapeI8x16: 1647 c.assembler.CompileRegisterToRegister(amd64.PABSB, result, result) 1648 case wazeroir.ShapeI16x8: 1649 c.assembler.CompileRegisterToRegister(amd64.PABSW, result, result) 1650 case wazeroir.ShapeI32x4: 1651 c.assembler.CompileRegisterToRegister(amd64.PABSD, result, result) 1652 case wazeroir.ShapeF32x4: 1653 tmp, err := c.allocateRegister(registerTypeVector) 1654 if err != nil { 1655 return err 1656 } 1657 // Set all bits on tmp. 1658 c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, tmp, tmp) 1659 // Shift right packed single floats by 1 to clear the sign bits. 1660 c.assembler.CompileConstToRegister(amd64.PSRLD, 1, tmp) 1661 // Clear the sign bit of vr. 1662 c.assembler.CompileRegisterToRegister(amd64.ANDPS, tmp, result) 1663 case wazeroir.ShapeF64x2: 1664 tmp, err := c.allocateRegister(registerTypeVector) 1665 if err != nil { 1666 return err 1667 } 1668 // Set all bits on tmp. 1669 c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, tmp, tmp) 1670 // Shift right packed single floats by 1 to clear the sign bits. 1671 c.assembler.CompileConstToRegister(amd64.PSRLQ, 1, tmp) 1672 // Clear the sign bit of vr. 1673 c.assembler.CompileRegisterToRegister(amd64.ANDPD, tmp, result) 1674 } 1675 1676 c.pushVectorRuntimeValueLocationOnRegister(result) 1677 return nil 1678 } 1679 1680 // compileV128AbsI64x2 implements compileV128Abs for i64x2 lanes. 1681 func (c *amd64Compiler) compileV128AbsI64x2() error { 1682 // See https://www.felixcloutier.com/x86/blendvpd 1683 const blendMaskReg = amd64.RegX0 1684 c.onValueReleaseRegisterToStack(blendMaskReg) 1685 c.locationStack.markRegisterUsed(blendMaskReg) 1686 1687 v := c.locationStack.popV128() 1688 if err := c.compileEnsureOnRegister(v); err != nil { 1689 return err 1690 } 1691 vr := v.register 1692 1693 if vr == blendMaskReg { 1694 return errors.New("BUG: X0 must not be used") 1695 } 1696 1697 tmp, err := c.allocateRegister(registerTypeVector) 1698 if err != nil { 1699 return err 1700 } 1701 c.locationStack.markRegisterUsed(tmp) 1702 1703 // Copy the value to tmp. 1704 c.assembler.CompileRegisterToRegister(amd64.MOVDQA, vr, tmp) 1705 1706 // Clear all bits on blendMaskReg. 1707 c.assembler.CompileRegisterToRegister(amd64.PXOR, blendMaskReg, blendMaskReg) 1708 // Subtract vr from blendMaskReg. 1709 c.assembler.CompileRegisterToRegister(amd64.PSUBQ, vr, blendMaskReg) 1710 // Copy the subtracted value ^^ back into vr. 1711 c.assembler.CompileRegisterToRegister(amd64.MOVDQA, blendMaskReg, vr) 1712 1713 c.assembler.CompileRegisterToRegister(amd64.BLENDVPD, tmp, vr) 1714 1715 c.locationStack.markRegisterUnused(blendMaskReg, tmp) 1716 c.pushVectorRuntimeValueLocationOnRegister(vr) 1717 return nil 1718 } 1719 1720 var ( 1721 popcntMask = [16]byte{ 1722 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 1723 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 1724 } 1725 // popcntTable holds each index's Popcnt, for example popcntTable[5] holds popcnt(0x05). 1726 popcntTable = [16]byte{ 1727 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 1728 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 1729 } 1730 ) 1731 1732 // compileV128Popcnt implements compiler.compileV128Popcnt for amd64. 1733 func (c *amd64Compiler) compileV128Popcnt(*wazeroir.OperationV128Popcnt) error { 1734 v := c.locationStack.popV128() 1735 if err := c.compileEnsureOnRegister(v); err != nil { 1736 return err 1737 } 1738 vr := v.register 1739 1740 tmp1, err := c.allocateRegister(registerTypeVector) 1741 if err != nil { 1742 return err 1743 } 1744 1745 c.locationStack.markRegisterUsed(tmp1) 1746 1747 tmp2, err := c.allocateRegister(registerTypeVector) 1748 if err != nil { 1749 return err 1750 } 1751 1752 c.locationStack.markRegisterUsed(tmp2) 1753 1754 tmp3, err := c.allocateRegister(registerTypeVector) 1755 if err != nil { 1756 return err 1757 } 1758 1759 // Read the popcntMask into tmp1, and we have 1760 // tmp1 = [0xf, ..., 0xf] 1761 if err := c.assembler.CompileStaticConstToRegister(amd64.MOVDQU, asm.NewStaticConst(popcntMask[:]), tmp1); err != nil { 1762 return err 1763 } 1764 1765 // Copy the original value into tmp2. 1766 c.assembler.CompileRegisterToRegister(amd64.MOVDQA, vr, tmp2) 1767 1768 // Given that we have: 1769 // v = [b1, ..., b16] where bn = hn:ln and hn and ln are higher and lower 4-bits of bn. 1770 // 1771 // Take PAND on tmp1 and tmp2, and we have 1772 // tmp2 = [l1, ..., l16]. 1773 c.assembler.CompileRegisterToRegister(amd64.PAND, tmp1, tmp2) 1774 1775 // Do logical (packed word) right shift by 4 on vr and PAND with vr and tmp1, meaning that we have 1776 // vr = [h1, ...., h16]. 1777 c.assembler.CompileConstToRegister(amd64.PSRLW, 4, vr) 1778 c.assembler.CompileRegisterToRegister(amd64.PAND, tmp1, vr) 1779 1780 // Read the popcntTable into tmp1, and we have 1781 // tmp1 = [0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04] 1782 if err := c.assembler.CompileStaticConstToRegister(amd64.MOVDQU, asm.NewStaticConst(popcntTable[:]), tmp1); err != nil { 1783 return err 1784 } 1785 1786 // Copy the tmp1 into tmp3, and we have 1787 // tmp3 = [0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04] 1788 c.assembler.CompileRegisterToRegister(amd64.MOVDQU, tmp1, tmp3) 1789 1790 // tmp3 = [popcnt(l1), ..., popcnt(l16)]. 1791 c.assembler.CompileRegisterToRegister(amd64.PSHUFB, tmp2, tmp3) 1792 1793 // tmp1 = [popcnt(h1), ..., popcnt(h16)]. 1794 c.assembler.CompileRegisterToRegister(amd64.PSHUFB, vr, tmp1) 1795 1796 // vr = tmp1 = [popcnt(h1), ..., popcnt(h16)]. 1797 c.assembler.CompileRegisterToRegister(amd64.MOVDQA, tmp1, vr) 1798 1799 // vr += tmp3 = [popcnt(h1)+popcnt(l1), ..., popcnt(h16)+popcnt(l16)] = [popcnt(b1), ..., popcnt(b16)]. 1800 c.assembler.CompileRegisterToRegister(amd64.PADDB, tmp3, vr) 1801 1802 c.locationStack.markRegisterUnused(tmp1, tmp2) 1803 c.pushVectorRuntimeValueLocationOnRegister(vr) 1804 return nil 1805 } 1806 1807 // compileV128Min implements compiler.compileV128Min for amd64. 1808 func (c *amd64Compiler) compileV128Min(o *wazeroir.OperationV128Min) error { 1809 x2 := c.locationStack.popV128() 1810 if err := c.compileEnsureOnRegister(x2); err != nil { 1811 return err 1812 } 1813 1814 x1 := c.locationStack.popV128() 1815 if err := c.compileEnsureOnRegister(x1); err != nil { 1816 return err 1817 } 1818 1819 if o.Shape >= wazeroir.ShapeF32x4 { 1820 return c.compileV128FloatMinImpl(o.Shape == wazeroir.ShapeF32x4, x1.register, x2.register) 1821 } 1822 1823 var inst asm.Instruction 1824 switch o.Shape { 1825 case wazeroir.ShapeI8x16: 1826 if o.Signed { 1827 inst = amd64.PMINSB 1828 } else { 1829 inst = amd64.PMINUB 1830 } 1831 case wazeroir.ShapeI16x8: 1832 if o.Signed { 1833 inst = amd64.PMINSW 1834 } else { 1835 inst = amd64.PMINUW 1836 } 1837 case wazeroir.ShapeI32x4: 1838 if o.Signed { 1839 inst = amd64.PMINSD 1840 } else { 1841 inst = amd64.PMINUD 1842 } 1843 } 1844 1845 c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register) 1846 1847 c.locationStack.markRegisterUnused(x2.register) 1848 c.pushVectorRuntimeValueLocationOnRegister(x1.register) 1849 return nil 1850 } 1851 1852 // compileV128FloatMinImpl implements compiler.compileV128Min for float lanes. 1853 func (c *amd64Compiler) compileV128FloatMinImpl(is32bit bool, x1r, x2r asm.Register) error { 1854 tmp, err := c.allocateRegister(registerTypeVector) 1855 if err != nil { 1856 return err 1857 } 1858 1859 var min, cmp, andn, or, srl /* shit right logical */ asm.Instruction 1860 var shiftNumToInverseNaN asm.ConstantValue 1861 if is32bit { 1862 min, cmp, andn, or, srl, shiftNumToInverseNaN = amd64.MINPS, amd64.CMPPS, amd64.ANDNPS, amd64.ORPS, amd64.PSRLD, 0xa 1863 } else { 1864 min, cmp, andn, or, srl, shiftNumToInverseNaN = amd64.MINPD, amd64.CMPPD, amd64.ANDNPD, amd64.ORPD, amd64.PSRLQ, 0xd 1865 } 1866 1867 // Let v1 and v2 be the operand values on x1r and x2r at this point. 1868 1869 // Copy the value into tmp: tmp=v1 1870 c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x1r, tmp) 1871 // tmp=min(v1, v2) 1872 c.assembler.CompileRegisterToRegister(min, x2r, tmp) 1873 // x2r=min(v2, v1) 1874 c.assembler.CompileRegisterToRegister(min, x1r, x2r) 1875 // x1r=min(v2, v1) 1876 c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x2r, x1r) 1877 1878 // x2r = -0 if (v1 == -0 || x2 == -0) && v1 != NaN && v2 !=NaN 1879 // NaN if v1 == NaN || v2 == NaN 1880 // min(v1, v2) otherwise 1881 c.assembler.CompileRegisterToRegister(or, tmp, x2r) 1882 // x1r = 0^ (set all bits) if v1 == NaN || v2 == NaN 1883 // 0 otherwise 1884 c.assembler.CompileRegisterToRegisterWithArg(cmp, tmp, x1r, 3) 1885 // x2r = -0 if (v1 == -0 || x2 == -0) && v1 != NaN && v2 !=NaN 1886 // ^0 if v1 == NaN || v2 == NaN 1887 // min(v1, v2) otherwise 1888 c.assembler.CompileRegisterToRegister(or, x1r, x2r) 1889 // x1r = set all bits on the mantissa bits 1890 // 0 otherwise 1891 c.assembler.CompileConstToRegister(srl, shiftNumToInverseNaN, x1r) 1892 // x1r = x2r and !x1r 1893 // = -0 if (v1 == -0 || x2 == -0) && v1 != NaN && v2 !=NaN 1894 // set all bits on exponential and sign bit (== NaN) if v1 == NaN || v2 == NaN 1895 // min(v1, v2) otherwise 1896 c.assembler.CompileRegisterToRegister(andn, x2r, x1r) 1897 1898 c.locationStack.markRegisterUnused(x2r) 1899 c.pushVectorRuntimeValueLocationOnRegister(x1r) 1900 return nil 1901 } 1902 1903 // compileV128Max implements compiler.compileV128Max for amd64. 1904 func (c *amd64Compiler) compileV128Max(o *wazeroir.OperationV128Max) error { 1905 x2 := c.locationStack.popV128() 1906 if err := c.compileEnsureOnRegister(x2); err != nil { 1907 return err 1908 } 1909 1910 x1 := c.locationStack.popV128() 1911 if err := c.compileEnsureOnRegister(x1); err != nil { 1912 return err 1913 } 1914 1915 if o.Shape >= wazeroir.ShapeF32x4 { 1916 return c.compileV128FloatMaxImpl(o.Shape == wazeroir.ShapeF32x4, x1.register, x2.register) 1917 } 1918 1919 var inst asm.Instruction 1920 switch o.Shape { 1921 case wazeroir.ShapeI8x16: 1922 if o.Signed { 1923 inst = amd64.PMAXSB 1924 } else { 1925 inst = amd64.PMAXUB 1926 } 1927 case wazeroir.ShapeI16x8: 1928 if o.Signed { 1929 inst = amd64.PMAXSW 1930 } else { 1931 inst = amd64.PMAXUW 1932 } 1933 case wazeroir.ShapeI32x4: 1934 if o.Signed { 1935 inst = amd64.PMAXSD 1936 } else { 1937 inst = amd64.PMAXUD 1938 } 1939 } 1940 1941 c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register) 1942 1943 c.locationStack.markRegisterUnused(x2.register) 1944 c.pushVectorRuntimeValueLocationOnRegister(x1.register) 1945 return nil 1946 } 1947 1948 // compileV128FloatMaxImpl implements compiler.compileV128Max for float lanes. 1949 func (c *amd64Compiler) compileV128FloatMaxImpl(is32bit bool, x1r, x2r asm.Register) error { 1950 tmp, err := c.allocateRegister(registerTypeVector) 1951 if err != nil { 1952 return err 1953 } 1954 1955 var max, cmp, andn, or, xor, sub, srl /* shit right logical */ asm.Instruction 1956 var shiftNumToInverseNaN asm.ConstantValue 1957 if is32bit { 1958 max, cmp, andn, or, xor, sub, srl, shiftNumToInverseNaN = amd64.MAXPS, amd64.CMPPS, amd64.ANDNPS, amd64.ORPS, amd64.XORPS, amd64.SUBPS, amd64.PSRLD, 0xa 1959 } else { 1960 max, cmp, andn, or, xor, sub, srl, shiftNumToInverseNaN = amd64.MAXPD, amd64.CMPPD, amd64.ANDNPD, amd64.ORPD, amd64.XORPD, amd64.SUBPD, amd64.PSRLQ, 0xd 1961 } 1962 1963 // Let v1 and v2 be the operand values on x1r and x2r at this point. 1964 1965 // Copy the value into tmp: tmp=v2 1966 c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x2r, tmp) 1967 // tmp=max(v2, v1) 1968 c.assembler.CompileRegisterToRegister(max, x1r, tmp) 1969 // x1r=max(v1, v2) 1970 c.assembler.CompileRegisterToRegister(max, x2r, x1r) 1971 // x2r=max(v1, v2) 1972 c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x1r, x2r) 1973 1974 // x2r = -0 if (v1 == -0 && v2 == 0) || (v1 == 0 && v2 == -0) 1975 // 0 if (v1 == 0 && v2 == 0) 1976 // -0 if (v1 == -0 && v2 == -0) 1977 // v1^v2 if v1 == NaN || v2 == NaN 1978 // 0 otherwise 1979 c.assembler.CompileRegisterToRegister(xor, tmp, x2r) 1980 // x1r = -0 if (v1 == -0 && v2 == 0) || (v1 == 0 && v2 == -0) 1981 // 0 if (v1 == 0 && v2 == 0) 1982 // -0 if (v1 == -0 && v2 == -0) 1983 // NaN if v1 == NaN || v2 == NaN 1984 // max(v1, v2) otherwise 1985 c.assembler.CompileRegisterToRegister(or, x2r, x1r) 1986 // Copy x1r into tmp. 1987 c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x1r, tmp) 1988 // tmp = 0 if (v1 == -0 && v2 == 0) || (v1 == 0 && v2 == -0) || (v1 == 0 && v2 == 0) 1989 // -0 if (v1 == -0 && v2 == -0) 1990 // NaN if v1 == NaN || v2 == NaN 1991 // max(v1, v2) otherwise 1992 // 1993 // Note: -0 - (-0) = 0 (!= -0) in floating point operation. 1994 c.assembler.CompileRegisterToRegister(sub, x2r, tmp) 1995 // x1r = 0^ if v1 == NaN || v2 == NaN 1996 c.assembler.CompileRegisterToRegisterWithArg(cmp, x1r, x1r, 3) 1997 // x1r = set all bits on the mantissa bits 1998 // 0 otherwise 1999 c.assembler.CompileConstToRegister(srl, shiftNumToInverseNaN, x1r) 2000 c.assembler.CompileRegisterToRegister(andn, tmp, x1r) 2001 2002 c.locationStack.markRegisterUnused(x2r) 2003 c.pushVectorRuntimeValueLocationOnRegister(x1r) 2004 return nil 2005 } 2006 2007 // compileV128AvgrU implements compiler.compileV128AvgrU for amd64. 2008 func (c *amd64Compiler) compileV128AvgrU(o *wazeroir.OperationV128AvgrU) error { 2009 x2 := c.locationStack.popV128() 2010 if err := c.compileEnsureOnRegister(x2); err != nil { 2011 return err 2012 } 2013 2014 x1 := c.locationStack.popV128() 2015 if err := c.compileEnsureOnRegister(x1); err != nil { 2016 return err 2017 } 2018 2019 var inst asm.Instruction 2020 switch o.Shape { 2021 case wazeroir.ShapeI8x16: 2022 inst = amd64.PAVGB 2023 case wazeroir.ShapeI16x8: 2024 inst = amd64.PAVGW 2025 } 2026 2027 c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register) 2028 2029 c.locationStack.markRegisterUnused(x2.register) 2030 c.pushVectorRuntimeValueLocationOnRegister(x1.register) 2031 return nil 2032 } 2033 2034 // compileV128Pmin implements compiler.compileV128Pmin for amd64. 2035 func (c *amd64Compiler) compileV128Pmin(o *wazeroir.OperationV128Pmin) error { 2036 x2 := c.locationStack.popV128() 2037 if err := c.compileEnsureOnRegister(x2); err != nil { 2038 return err 2039 } 2040 2041 x1 := c.locationStack.popV128() 2042 if err := c.compileEnsureOnRegister(x1); err != nil { 2043 return err 2044 } 2045 2046 var min asm.Instruction 2047 if o.Shape == wazeroir.ShapeF32x4 { 2048 min = amd64.MINPS 2049 } else { 2050 min = amd64.MINPD 2051 } 2052 2053 x1r, v2r := x1.register, x2.register 2054 2055 c.assembler.CompileRegisterToRegister(min, x1r, v2r) 2056 2057 c.locationStack.markRegisterUnused(x1r) 2058 c.pushVectorRuntimeValueLocationOnRegister(v2r) 2059 return nil 2060 } 2061 2062 // compileV128Pmax implements compiler.compileV128Pmax for amd64. 2063 func (c *amd64Compiler) compileV128Pmax(o *wazeroir.OperationV128Pmax) error { 2064 x2 := c.locationStack.popV128() 2065 if err := c.compileEnsureOnRegister(x2); err != nil { 2066 return err 2067 } 2068 2069 x1 := c.locationStack.popV128() 2070 if err := c.compileEnsureOnRegister(x1); err != nil { 2071 return err 2072 } 2073 2074 var min asm.Instruction 2075 if o.Shape == wazeroir.ShapeF32x4 { 2076 min = amd64.MAXPS 2077 } else { 2078 min = amd64.MAXPD 2079 } 2080 2081 x1r, v2r := x1.register, x2.register 2082 2083 c.assembler.CompileRegisterToRegister(min, x1r, v2r) 2084 2085 c.locationStack.markRegisterUnused(x1r) 2086 c.pushVectorRuntimeValueLocationOnRegister(v2r) 2087 return nil 2088 } 2089 2090 // compileV128Ceil implements compiler.compileV128Ceil for amd64. 2091 func (c *amd64Compiler) compileV128Ceil(o *wazeroir.OperationV128Ceil) error { 2092 // See https://www.felixcloutier.com/x86/roundpd 2093 const roundModeCeil = 0x2 2094 return c.compileV128RoundImpl(o.Shape == wazeroir.ShapeF32x4, roundModeCeil) 2095 } 2096 2097 // compileV128Floor implements compiler.compileV128Floor for amd64. 2098 func (c *amd64Compiler) compileV128Floor(o *wazeroir.OperationV128Floor) error { 2099 // See https://www.felixcloutier.com/x86/roundpd 2100 const roundModeFloor = 0x1 2101 return c.compileV128RoundImpl(o.Shape == wazeroir.ShapeF32x4, roundModeFloor) 2102 } 2103 2104 // compileV128Trunc implements compiler.compileV128Trunc for amd64. 2105 func (c *amd64Compiler) compileV128Trunc(o *wazeroir.OperationV128Trunc) error { 2106 // See https://www.felixcloutier.com/x86/roundpd 2107 const roundModeTrunc = 0x3 2108 return c.compileV128RoundImpl(o.Shape == wazeroir.ShapeF32x4, roundModeTrunc) 2109 } 2110 2111 // compileV128Nearest implements compiler.compileV128Nearest for amd64. 2112 func (c *amd64Compiler) compileV128Nearest(o *wazeroir.OperationV128Nearest) error { 2113 // See https://www.felixcloutier.com/x86/roundpd 2114 const roundModeNearest = 0x0 2115 return c.compileV128RoundImpl(o.Shape == wazeroir.ShapeF32x4, roundModeNearest) 2116 } 2117 2118 // compileV128RoundImpl implements compileV128Nearest compileV128Trunc compileV128Floor and compileV128Ceil 2119 // with ROUNDPS (32-bit lane) and ROUNDPD (64-bit lane). 2120 func (c *amd64Compiler) compileV128RoundImpl(is32bit bool, mode byte) error { 2121 v := c.locationStack.popV128() 2122 if err := c.compileEnsureOnRegister(v); err != nil { 2123 return err 2124 } 2125 vr := v.register 2126 2127 var round asm.Instruction 2128 if is32bit { 2129 round = amd64.ROUNDPS 2130 } else { 2131 round = amd64.ROUNDPD 2132 } 2133 2134 c.assembler.CompileRegisterToRegisterWithArg(round, vr, vr, mode) 2135 c.pushVectorRuntimeValueLocationOnRegister(vr) 2136 return nil 2137 } 2138 2139 // compileV128Extend implements compiler.compileV128Extend for amd64. 2140 func (c *amd64Compiler) compileV128Extend(o *wazeroir.OperationV128Extend) error { 2141 v := c.locationStack.popV128() 2142 if err := c.compileEnsureOnRegister(v); err != nil { 2143 return err 2144 } 2145 vr := v.register 2146 2147 if !o.UseLow { 2148 // We have to shift the higher 64-bits into the lower ones before the actual extending instruction. 2149 // Shifting right by 0x8 * 8 = 64bits and concatenate itself. 2150 // See https://www.felixcloutier.com/x86/palignr 2151 c.assembler.CompileRegisterToRegisterWithArg(amd64.PALIGNR, v.register, v.register, 0x8) 2152 } 2153 2154 var extend asm.Instruction 2155 switch o.OriginShape { 2156 case wazeroir.ShapeI8x16: 2157 if o.Signed { 2158 extend = amd64.PMOVSXBW 2159 } else { 2160 extend = amd64.PMOVZXBW 2161 } 2162 case wazeroir.ShapeI16x8: 2163 if o.Signed { 2164 extend = amd64.PMOVSXWD 2165 } else { 2166 extend = amd64.PMOVZXWD 2167 } 2168 case wazeroir.ShapeI32x4: 2169 if o.Signed { 2170 extend = amd64.PMOVSXDQ 2171 } else { 2172 extend = amd64.PMOVZXDQ 2173 } 2174 } 2175 2176 c.assembler.CompileRegisterToRegister(extend, vr, vr) 2177 c.pushVectorRuntimeValueLocationOnRegister(vr) 2178 return nil 2179 } 2180 2181 // compileV128ExtMul implements compiler.compileV128ExtMul for amd64. 2182 func (c *amd64Compiler) compileV128ExtMul(o *wazeroir.OperationV128ExtMul) error { 2183 x2 := c.locationStack.popV128() 2184 if err := c.compileEnsureOnRegister(x2); err != nil { 2185 return err 2186 } 2187 2188 x1 := c.locationStack.popV128() 2189 if err := c.compileEnsureOnRegister(x1); err != nil { 2190 return err 2191 } 2192 2193 x1r, x2r := x1.register, x2.register 2194 2195 switch o.OriginShape { 2196 case wazeroir.ShapeI8x16: 2197 if !o.UseLow { 2198 // We have to shift the higher 64-bits into the lower ones before the actual extending instruction. 2199 // Shifting right by 0x8 * 8 = 64bits and concatenate itself. 2200 // See https://www.felixcloutier.com/x86/palignr 2201 c.assembler.CompileRegisterToRegisterWithArg(amd64.PALIGNR, x1r, x1r, 0x8) 2202 c.assembler.CompileRegisterToRegisterWithArg(amd64.PALIGNR, x2r, x2r, 0x8) 2203 } 2204 2205 var ext asm.Instruction 2206 if o.Signed { 2207 ext = amd64.PMOVSXBW 2208 } else { 2209 ext = amd64.PMOVZXBW 2210 } 2211 2212 // Signed or Zero extend lower half packed bytes to packed words. 2213 c.assembler.CompileRegisterToRegister(ext, x1r, x1r) 2214 c.assembler.CompileRegisterToRegister(ext, x2r, x2r) 2215 2216 c.assembler.CompileRegisterToRegister(amd64.PMULLW, x2r, x1r) 2217 case wazeroir.ShapeI16x8: 2218 tmp, err := c.allocateRegister(registerTypeVector) 2219 if err != nil { 2220 return err 2221 } 2222 2223 // Copy the value on x1r to tmp. 2224 c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x1r, tmp) 2225 2226 // Multiply the values and store the lower 16-bits into x1r. 2227 c.assembler.CompileRegisterToRegister(amd64.PMULLW, x2r, x1r) 2228 if o.Signed { 2229 // Signed multiply the values and store the higher 16-bits into tmp. 2230 c.assembler.CompileRegisterToRegister(amd64.PMULHW, x2r, tmp) 2231 } else { 2232 // Unsigned multiply the values and store the higher 16-bits into tmp. 2233 c.assembler.CompileRegisterToRegister(amd64.PMULHUW, x2r, tmp) 2234 } 2235 2236 // Unpack lower or higher half of vectors (tmp and x1r) and concatenate them. 2237 if o.UseLow { 2238 c.assembler.CompileRegisterToRegister(amd64.PUNPCKLWD, tmp, x1r) 2239 } else { 2240 c.assembler.CompileRegisterToRegister(amd64.PUNPCKHWD, tmp, x1r) 2241 } 2242 case wazeroir.ShapeI32x4: 2243 var shuffleOrder byte 2244 // Given that the original state of the register is as [v1, v2, v3, v4] where vN = a word, 2245 if o.UseLow { 2246 // This makes the register as [v1, v1, v2, v2] 2247 shuffleOrder = 0b01010000 2248 } else { 2249 // This makes the register as [v3, v3, v4, v4] 2250 shuffleOrder = 0b11111010 2251 } 2252 // See https://www.felixcloutier.com/x86/pshufd 2253 c.assembler.CompileRegisterToRegisterWithArg(amd64.PSHUFD, x1r, x1r, shuffleOrder) 2254 c.assembler.CompileRegisterToRegisterWithArg(amd64.PSHUFD, x2r, x2r, shuffleOrder) 2255 2256 var mul asm.Instruction 2257 if o.Signed { 2258 mul = amd64.PMULDQ 2259 } else { 2260 mul = amd64.PMULUDQ 2261 } 2262 c.assembler.CompileRegisterToRegister(mul, x2r, x1r) 2263 } 2264 2265 c.locationStack.markRegisterUnused(x2r) 2266 c.pushVectorRuntimeValueLocationOnRegister(x1r) 2267 return nil 2268 } 2269 2270 var q15mulrSatSMask = [16]byte{ 2271 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 2272 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 2273 } 2274 2275 // compileV128Q15mulrSatS implements compiler.compileV128Q15mulrSatS for amd64. 2276 func (c *amd64Compiler) compileV128Q15mulrSatS(*wazeroir.OperationV128Q15mulrSatS) error { 2277 x2 := c.locationStack.popV128() 2278 if err := c.compileEnsureOnRegister(x2); err != nil { 2279 return err 2280 } 2281 2282 x1 := c.locationStack.popV128() 2283 if err := c.compileEnsureOnRegister(x1); err != nil { 2284 return err 2285 } 2286 2287 tmp, err := c.allocateRegister(registerTypeVector) 2288 if err != nil { 2289 return err 2290 } 2291 2292 x1r, x2r := x1.register, x2.register 2293 2294 // See https://github.com/WebAssembly/simd/pull/365 for the following logic. 2295 if err := c.assembler.CompileStaticConstToRegister(amd64.MOVDQU, asm.NewStaticConst(q15mulrSatSMask[:]), tmp); err != nil { 2296 return err 2297 } 2298 2299 c.assembler.CompileRegisterToRegister(amd64.PMULHRSW, x2r, x1r) 2300 c.assembler.CompileRegisterToRegister(amd64.PCMPEQW, x1r, tmp) 2301 c.assembler.CompileRegisterToRegister(amd64.PXOR, tmp, x1r) 2302 2303 c.locationStack.markRegisterUnused(x2r) 2304 c.pushVectorRuntimeValueLocationOnRegister(x1r) 2305 return nil 2306 } 2307 2308 var ( 2309 allOnesI8x16 = [16]byte{0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1} 2310 allOnesI16x8 = [16]byte{0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0} 2311 2312 extAddPairwiseI16x8uMask = [16 * 2]byte{ 2313 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 2314 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 2315 } 2316 ) 2317 2318 // compileV128ExtAddPairwise implements compiler.compileV128ExtAddPairwise for amd64. 2319 func (c *amd64Compiler) compileV128ExtAddPairwise(o *wazeroir.OperationV128ExtAddPairwise) error { 2320 v := c.locationStack.popV128() 2321 if err := c.compileEnsureOnRegister(v); err != nil { 2322 return err 2323 } 2324 vr := v.register 2325 2326 switch o.OriginShape { 2327 case wazeroir.ShapeI8x16: 2328 allOnesReg, err := c.allocateRegister(registerTypeVector) 2329 if err != nil { 2330 return err 2331 } 2332 2333 if err = c.assembler.CompileStaticConstToRegister(amd64.MOVDQU, 2334 asm.NewStaticConst(allOnesI8x16[:]), allOnesReg); err != nil { 2335 return err 2336 } 2337 2338 var result asm.Register 2339 // See https://www.felixcloutier.com/x86/pmaddubsw for detail. 2340 if o.Signed { 2341 // Interpret vr's value as signed byte and multiply with one and add pairwise, which results in pairwise 2342 // signed extadd. 2343 c.assembler.CompileRegisterToRegister(amd64.PMADDUBSW, vr, allOnesReg) 2344 result = allOnesReg 2345 } else { 2346 // Interpreter tmp (all ones) as signed byte meaning that all the multiply-add is unsigned. 2347 c.assembler.CompileRegisterToRegister(amd64.PMADDUBSW, allOnesReg, vr) 2348 result = vr 2349 } 2350 2351 if result != vr { 2352 c.locationStack.markRegisterUnused(vr) 2353 } 2354 c.pushVectorRuntimeValueLocationOnRegister(result) 2355 case wazeroir.ShapeI16x8: 2356 tmp, err := c.allocateRegister(registerTypeVector) 2357 if err != nil { 2358 return err 2359 } 2360 2361 if o.Signed { 2362 // See https://www.felixcloutier.com/x86/pmaddwd 2363 if err = c.assembler.CompileStaticConstToRegister(amd64.MOVDQU, 2364 asm.NewStaticConst(allOnesI16x8[:]), tmp); err != nil { 2365 return err 2366 } 2367 2368 c.assembler.CompileRegisterToRegister(amd64.PMADDWD, tmp, vr) 2369 c.pushVectorRuntimeValueLocationOnRegister(vr) 2370 } else { 2371 2372 if err = c.assembler.CompileStaticConstToRegister(amd64.MOVDQU, 2373 asm.NewStaticConst(extAddPairwiseI16x8uMask[:16]), tmp); err != nil { 2374 return err 2375 } 2376 2377 // Flip the sign bits on vr. 2378 // 2379 // Assuming that vr = [w1, ..., w8], now we have, 2380 // vr[i] = int8(-w1) for i = 0...8 2381 c.assembler.CompileRegisterToRegister(amd64.PXOR, tmp, vr) 2382 2383 if err = c.assembler.CompileStaticConstToRegister(amd64.MOVDQU, 2384 asm.NewStaticConst(allOnesI16x8[:]), tmp); err != nil { 2385 return err 2386 } 2387 2388 // For i = 0,..4 (as this results in i32x4 lanes), now we have 2389 // vr[i] = int32(-wn + -w(n+1)) = int32(-(wn + w(n+1))) 2390 c.assembler.CompileRegisterToRegister(amd64.PMADDWD, tmp, vr) 2391 2392 // tmp[i] = [0, 0, 1, 0] = int32(math.MaxInt16+1) 2393 if err = c.assembler.CompileStaticConstToRegister(amd64.MOVDQU, 2394 asm.NewStaticConst(extAddPairwiseI16x8uMask[16:]), tmp); err != nil { 2395 return err 2396 } 2397 2398 // vr[i] = int32(-(wn + w(n+1))) + int32(math.MaxInt16+1) = int32((wn + w(n+1))) = uint32(wn + w(n+1)). 2399 c.assembler.CompileRegisterToRegister(amd64.PADDD, tmp, vr) 2400 c.pushVectorRuntimeValueLocationOnRegister(vr) 2401 } 2402 } 2403 return nil 2404 } 2405 2406 // compileV128FloatPromote implements compiler.compileV128FloatPromote for amd64. 2407 func (c *amd64Compiler) compileV128FloatPromote(*wazeroir.OperationV128FloatPromote) error { 2408 v := c.locationStack.popV128() 2409 if err := c.compileEnsureOnRegister(v); err != nil { 2410 return err 2411 } 2412 vr := v.register 2413 2414 c.assembler.CompileRegisterToRegister(amd64.CVTPS2PD, vr, vr) 2415 c.pushVectorRuntimeValueLocationOnRegister(vr) 2416 return nil 2417 } 2418 2419 // compileV128FloatDemote implements compiler.compileV128FloatDemote for amd64. 2420 func (c *amd64Compiler) compileV128FloatDemote(*wazeroir.OperationV128FloatDemote) error { 2421 v := c.locationStack.popV128() 2422 if err := c.compileEnsureOnRegister(v); err != nil { 2423 return err 2424 } 2425 vr := v.register 2426 2427 c.assembler.CompileRegisterToRegister(amd64.CVTPD2PS, vr, vr) 2428 c.pushVectorRuntimeValueLocationOnRegister(vr) 2429 return nil 2430 } 2431 2432 // compileV128Dot implements compiler.compileV128Dot for amd64. 2433 func (c *amd64Compiler) compileV128Dot(*wazeroir.OperationV128Dot) error { 2434 x2 := c.locationStack.popV128() 2435 if err := c.compileEnsureOnRegister(x2); err != nil { 2436 return err 2437 } 2438 2439 x1 := c.locationStack.popV128() 2440 if err := c.compileEnsureOnRegister(x1); err != nil { 2441 return err 2442 } 2443 2444 c.assembler.CompileRegisterToRegister(amd64.PMADDWD, x2.register, x1.register) 2445 2446 c.locationStack.markRegisterUnused(x2.register) 2447 c.pushVectorRuntimeValueLocationOnRegister(x1.register) 2448 return nil 2449 } 2450 2451 var fConvertFromIMask = [16]byte{ 2452 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 2453 } 2454 2455 // compileV128FConvertFromI implements compiler.compileV128FConvertFromI for amd64. 2456 func (c *amd64Compiler) compileV128FConvertFromI(o *wazeroir.OperationV128FConvertFromI) error { 2457 v := c.locationStack.popV128() 2458 if err := c.compileEnsureOnRegister(v); err != nil { 2459 return err 2460 } 2461 vr := v.register 2462 2463 switch o.DestinationShape { 2464 case wazeroir.ShapeF32x4: 2465 if o.Signed { 2466 c.assembler.CompileRegisterToRegister(amd64.CVTDQ2PS, vr, vr) 2467 } else { 2468 tmp, err := c.allocateRegister(registerTypeVector) 2469 if err != nil { 2470 return err 2471 } 2472 2473 // Copy the value into tmp. 2474 c.assembler.CompileRegisterToRegister(amd64.MOVDQA, vr, tmp) 2475 2476 // Clear the higher 16-bits of tmp. 2477 c.assembler.CompileConstToRegister(amd64.PSLLD, 0xa, tmp) 2478 c.assembler.CompileConstToRegister(amd64.PSRLD, 0xa, tmp) 2479 2480 // Subtract the higher 16-bits from vr == clear the lower 16-bits of vr. 2481 c.assembler.CompileRegisterToRegister(amd64.PSUBD, tmp, vr) 2482 2483 // Convert the lower 16-bits in tmp. 2484 c.assembler.CompileRegisterToRegister(amd64.CVTDQ2PS, tmp, tmp) 2485 2486 // Left shift by one and convert vr, meaning that halved conversion result of higher 16-bits in vr. 2487 c.assembler.CompileConstToRegister(amd64.PSRLD, 1, vr) 2488 c.assembler.CompileRegisterToRegister(amd64.CVTDQ2PS, vr, vr) 2489 2490 // Double the converted halved higher 16bits. 2491 c.assembler.CompileRegisterToRegister(amd64.ADDPS, vr, vr) 2492 2493 // Get the conversion result by add tmp (holding lower 16-bit conversion) into vr. 2494 c.assembler.CompileRegisterToRegister(amd64.ADDPS, tmp, vr) 2495 } 2496 case wazeroir.ShapeF64x2: 2497 if o.Signed { 2498 c.assembler.CompileRegisterToRegister(amd64.CVTDQ2PD, vr, vr) 2499 } else { 2500 tmp, err := c.allocateRegister(registerTypeVector) 2501 if err != nil { 2502 return err 2503 } 2504 2505 // tmp = [0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00] 2506 if err = c.assembler.CompileStaticConstToRegister(amd64.MOVDQU, asm.NewStaticConst(fConvertFromIMask[:16]), tmp); err != nil { 2507 return err 2508 } 2509 2510 // Given that we have vr = [d1, d2, d3, d4], this results in 2511 // vr = [d1, [0x00, 0x00, 0x30, 0x43], d2, [0x00, 0x00, 0x30, 0x43]] 2512 // = [float64(uint32(d1)) + 0x1.0p52, float64(uint32(d2)) + 0x1.0p52] 2513 // ^See https://stackoverflow.com/questions/13269523/can-all-32-bit-ints-be-exactly-represented-as-a-double 2514 c.assembler.CompileRegisterToRegister(amd64.UNPCKLPS, tmp, vr) 2515 2516 // tmp = [float64(0x1.0p52), float64(0x1.0p52)] 2517 if err = c.assembler.CompileStaticConstToRegister(amd64.MOVDQU, 2518 asm.NewStaticConst(twop52[:]), tmp); err != nil { 2519 return err 2520 } 2521 2522 // Now, we get the result as 2523 // vr = [float64(uint32(d1)), float64(uint32(d2))] 2524 // because the following equality always satisfies: 2525 // float64(0x1.0p52 + float64(uint32(x))) - float64(0x1.0p52 + float64(uint32(y))) = float64(uint32(x)) - float64(uint32(y)) 2526 c.assembler.CompileRegisterToRegister(amd64.SUBPD, tmp, vr) 2527 } 2528 } 2529 2530 c.pushVectorRuntimeValueLocationOnRegister(vr) 2531 return nil 2532 } 2533 2534 // compileV128Narrow implements compiler.compileV128Narrow for amd64. 2535 func (c *amd64Compiler) compileV128Narrow(o *wazeroir.OperationV128Narrow) error { 2536 x2 := c.locationStack.popV128() 2537 if err := c.compileEnsureOnRegister(x2); err != nil { 2538 return err 2539 } 2540 2541 x1 := c.locationStack.popV128() 2542 if err := c.compileEnsureOnRegister(x1); err != nil { 2543 return err 2544 } 2545 2546 var narrow asm.Instruction 2547 switch o.OriginShape { 2548 case wazeroir.ShapeI16x8: 2549 if o.Signed { 2550 narrow = amd64.PACKSSWB 2551 } else { 2552 narrow = amd64.PACKUSWB 2553 } 2554 case wazeroir.ShapeI32x4: 2555 if o.Signed { 2556 narrow = amd64.PACKSSDW 2557 } else { 2558 narrow = amd64.PACKUSDW 2559 } 2560 } 2561 c.assembler.CompileRegisterToRegister(narrow, x2.register, x1.register) 2562 2563 c.locationStack.markRegisterUnused(x2.register) 2564 c.pushVectorRuntimeValueLocationOnRegister(x1.register) 2565 return nil 2566 } 2567 2568 var ( 2569 // i32sMaxOnF64x2 holds math.MaxInt32(=2147483647.0) on two f64 lanes. 2570 i32sMaxOnF64x2 = [16]byte{ 2571 0x00, 0x00, 0xc0, 0xff, 0xff, 0xff, 0xdf, 0x41, // float64(2147483647.0) 2572 0x00, 0x00, 0xc0, 0xff, 0xff, 0xff, 0xdf, 0x41, // float64(2147483647.0) 2573 } 2574 2575 // i32sMaxOnF64x2 holds math.MaxUint32(=4294967295.0) on two f64 lanes. 2576 i32uMaxOnF64x2 = [16]byte{ 2577 0x00, 0x00, 0xe0, 0xff, 0xff, 0xff, 0xef, 0x41, // float64(4294967295.0) 2578 0x00, 0x00, 0xe0, 0xff, 0xff, 0xff, 0xef, 0x41, // float64(4294967295.0) 2579 } 2580 2581 // twop52 holds two float64(0x1.0p52) on two f64 lanes. 0x1.0p52 is special in the sense that 2582 // with this exponent, the mantissa represents a corresponding uint32 number, and arithmetics, 2583 // like addition or subtraction, the resulted floating point holds exactly the same 2584 // bit representations in 32-bit integer on its mantissa. 2585 // 2586 // Note: the name twop52 is common across various compiler ecosystem. 2587 // E.g. https://github.com/llvm/llvm-project/blob/92ab024f81e5b64e258b7c3baaf213c7c26fcf40/compiler-rt/lib/builtins/floatdidf.c#L28 2588 // E.g. https://opensource.apple.com/source/clang/clang-425.0.24/src/projects/compiler-rt/lib/floatdidf.c.auto.html 2589 twop52 = [16]byte{ 2590 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, // float64(0x1.0p52) 2591 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, // float64(0x1.0p52) 2592 } 2593 ) 2594 2595 // compileV128ITruncSatFromF implements compiler.compileV128ITruncSatFromF for amd64. 2596 func (c *amd64Compiler) compileV128ITruncSatFromF(o *wazeroir.OperationV128ITruncSatFromF) error { 2597 v := c.locationStack.popV128() 2598 if err := c.compileEnsureOnRegister(v); err != nil { 2599 return err 2600 } 2601 vr := v.register 2602 2603 tmp, err := c.allocateRegister(registerTypeVector) 2604 if err != nil { 2605 return err 2606 } 2607 2608 c.locationStack.markRegisterUsed(tmp) 2609 2610 switch o.OriginShape { 2611 case wazeroir.ShapeF32x4: 2612 if o.Signed { 2613 // Copy the value into tmp. 2614 c.assembler.CompileRegisterToRegister(amd64.MOVDQA, vr, tmp) 2615 2616 // Assuming we have vr = [v1, v2, v3, v4]. 2617 // 2618 // Set all bits if lane is not NaN on tmp. 2619 // tmp[i] = 0xffffffff if vi != NaN 2620 // = 0 if vi == NaN 2621 c.assembler.CompileRegisterToRegister(amd64.CMPEQPS, tmp, tmp) 2622 2623 // Clear NaN lanes on vr, meaning that 2624 // vr[i] = vi if vi != NaN 2625 // 0 if vi == NaN 2626 c.assembler.CompileRegisterToRegister(amd64.ANDPS, tmp, vr) 2627 2628 // tmp[i] = ^vi if vi != NaN 2629 // = 0xffffffff if vi == NaN 2630 // which means that tmp[i] & 0x80000000 != 0 if and only if vi is negative. 2631 c.assembler.CompileRegisterToRegister(amd64.PXOR, vr, tmp) 2632 2633 // vr[i] = int32(vi) if vi != NaN and vr is not overflowing. 2634 // = 0x80000000 if vi != NaN and vr is overflowing (See https://www.felixcloutier.com/x86/cvttps2dq) 2635 // = 0 if vi == NaN 2636 c.assembler.CompileRegisterToRegister(amd64.CVTTPS2DQ, vr, vr) 2637 2638 // Below, we have to convert 0x80000000 into 0x7FFFFFFF for positive overflowing lane. 2639 // 2640 // tmp[i] = 0x80000000 if vi is positive 2641 // = any satisfying any&0x80000000 = 0 if vi is negative or zero. 2642 c.assembler.CompileRegisterToRegister(amd64.PAND, vr, tmp) 2643 2644 // Arithmetic right shifting tmp by 31, meaning that we have 2645 // tmp[i] = 0xffffffff if vi is positive, 0 otherwise. 2646 c.assembler.CompileConstToRegister(amd64.PSRAD, 0x1f, tmp) 2647 2648 // Flipping 0x80000000 if vi is positive, otherwise keep intact. 2649 c.assembler.CompileRegisterToRegister(amd64.PXOR, tmp, vr) 2650 } else { 2651 tmp2, err := c.allocateRegister(registerTypeVector) 2652 if err != nil { 2653 return err 2654 } 2655 2656 // See https://github.com/bytecodealliance/wasmtime/pull/2440 2657 // Note: even v8 doesn't seem to have support for this i32x4.tranc_sat_f32x4_u. 2658 c.assembler.CompileRegisterToRegister(amd64.PXOR, tmp, tmp) 2659 c.assembler.CompileRegisterToRegister(amd64.MAXPS, tmp, vr) 2660 c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, tmp, tmp) 2661 c.assembler.CompileConstToRegister(amd64.PSRLD, 0x1, tmp) 2662 c.assembler.CompileRegisterToRegister(amd64.CVTDQ2PS, tmp, tmp) 2663 c.assembler.CompileRegisterToRegister(amd64.MOVDQA, vr, tmp2) 2664 c.assembler.CompileRegisterToRegister(amd64.CVTTPS2DQ, vr, vr) 2665 c.assembler.CompileRegisterToRegister(amd64.SUBPS, tmp, tmp2) 2666 c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPS, tmp2, tmp, 0x2) // == CMPLEPS 2667 c.assembler.CompileRegisterToRegister(amd64.CVTTPS2DQ, tmp2, tmp2) 2668 c.assembler.CompileRegisterToRegister(amd64.PXOR, tmp, tmp2) 2669 c.assembler.CompileRegisterToRegister(amd64.PXOR, tmp, tmp) 2670 c.assembler.CompileRegisterToRegister(amd64.PMAXSD, tmp, tmp2) 2671 c.assembler.CompileRegisterToRegister(amd64.PADDD, tmp2, vr) 2672 } 2673 case wazeroir.ShapeF64x2: 2674 tmp2, err := c.allocateRegister(registerTypeVector) 2675 if err != nil { 2676 return err 2677 } 2678 2679 if o.Signed { 2680 // Copy the value into tmp. 2681 c.assembler.CompileRegisterToRegister(amd64.MOVDQA, vr, tmp) 2682 2683 // Set all bits for non-NaN lanes, zeros otherwise. 2684 // I.e. tmp[i] = 0xffffffff_ffffffff if vi != NaN, 0 otherwise. 2685 c.assembler.CompileRegisterToRegister(amd64.CMPEQPD, tmp, tmp) 2686 2687 // Load the 2147483647 into tmp2's each lane. 2688 if err = c.assembler.CompileStaticConstToRegister(amd64.MOVUPD, asm.NewStaticConst(i32sMaxOnF64x2[:]), tmp2); err != nil { 2689 return err 2690 } 2691 2692 // tmp[i] = 2147483647 if vi != NaN, 0 otherwise. 2693 c.assembler.CompileRegisterToRegister(amd64.ANDPS, tmp2, tmp) 2694 2695 // MINPD returns the source register's value as-is, so we have 2696 // vr[i] = vi if vi != NaN 2697 // = 0 if vi == NaN 2698 c.assembler.CompileRegisterToRegister(amd64.MINPD, tmp, vr) 2699 2700 c.assembler.CompileRegisterToRegister(amd64.CVTTPD2DQ, vr, vr) 2701 } else { 2702 // Clears all bits on tmp. 2703 c.assembler.CompileRegisterToRegister(amd64.PXOR, tmp, tmp) 2704 2705 // vr[i] = vi if vi != NaN && vi > 0 2706 // = 0 if vi == NaN || vi <= 0 2707 c.assembler.CompileRegisterToRegister(amd64.MAXPD, tmp, vr) 2708 2709 // tmp2[i] = float64(math.MaxUint32) = math.MaxUint32 2710 if err = c.assembler.CompileStaticConstToRegister(amd64.MOVUPD, asm.NewStaticConst(i32uMaxOnF64x2[:]), tmp2); err != nil { 2711 return err 2712 } 2713 2714 // vr[i] = vi if vi != NaN && vi > 0 && vi <= math.MaxUint32 2715 // = 0 otherwise 2716 c.assembler.CompileRegisterToRegister(amd64.MINPD, tmp2, vr) 2717 2718 // Round the floating points into integer. 2719 c.assembler.CompileRegisterToRegisterWithArg(amd64.ROUNDPD, vr, vr, 0x3) 2720 2721 // tmp2[i] = float64(0x1.0p52) 2722 if err = c.assembler.CompileStaticConstToRegister(amd64.MOVUPD, asm.NewStaticConst(twop52[:]), tmp2); err != nil { 2723 return err 2724 } 2725 2726 // vr[i] = float64(0x1.0p52) + float64(uint32(vi)) if vi != NaN && vi > 0 && vi <= math.MaxUint32 2727 // = 0 otherwise 2728 // 2729 // This means that vr[i] holds exactly the same bit of uint32(vi) in its lower 32-bits. 2730 c.assembler.CompileRegisterToRegister(amd64.ADDPD, tmp2, vr) 2731 2732 // At this point, we have 2733 // vr = [uint32(v0), float64(0x1.0p52), uint32(v1), float64(0x1.0p52)] 2734 // tmp = [0, 0, 0, 0] 2735 // as 32x4 lanes. Therefore, SHUFPS with 0b00_00_10_00 results in 2736 // vr = [vr[00], vr[10], tmp[00], tmp[00]] = [vr[00], vr[10], 0, 0] 2737 // meaning that for i = 0 and 1, we have 2738 // vr[i] = uint32(vi) if vi != NaN && vi > 0 && vi <= math.MaxUint32 2739 // = 0 otherwise. 2740 c.assembler.CompileRegisterToRegisterWithArg(amd64.SHUFPS, tmp, vr, 0b00_00_10_00) 2741 } 2742 } 2743 2744 c.locationStack.markRegisterUnused(tmp) 2745 c.pushVectorRuntimeValueLocationOnRegister(vr) 2746 return nil 2747 }