github.com/bananabytelabs/wazero@v0.0.0-20240105073314-54b22a776da8/internal/engine/compiler/impl_vec_amd64.go

     1  package compiler
     2  
     3  import (
     4  	"errors"
     5  
     6  	"github.com/bananabytelabs/wazero/internal/asm"
     7  	"github.com/bananabytelabs/wazero/internal/asm/amd64"
     8  	"github.com/bananabytelabs/wazero/internal/wazeroir"
     9  )
    10  
    11  // compileV128Const implements compiler.compileV128Const for amd64 architecture.
    12  func (c *amd64Compiler) compileV128Const(o *wazeroir.UnionOperation) error {
    13  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
    14  		return err
    15  	}
    16  
    17  	lo, hi := o.U1, o.U2
    18  
    19  	result, err := c.allocateRegister(registerTypeVector)
    20  	if err != nil {
    21  		return err
    22  	}
    23  
     24  	// We cannot move a 64-bit constant directly into a vector register,
     25  	// so we stage it in a general-purpose register first.
    26  	tmpReg, err := c.allocateRegister(registerTypeGeneralPurpose)
    27  	if err != nil {
    28  		return err
    29  	}
    30  
    31  	// Move the lower 64-bits.
    32  	if lo == 0 {
    33  		c.assembler.CompileRegisterToRegister(amd64.XORQ, tmpReg, tmpReg)
    34  	} else {
    35  		c.assembler.CompileConstToRegister(amd64.MOVQ, int64(lo), tmpReg)
    36  	}
    37  	c.assembler.CompileRegisterToRegister(amd64.MOVQ, tmpReg, result)
    38  
    39  	if lo != 0 && hi == 0 {
    40  		c.assembler.CompileRegisterToRegister(amd64.XORQ, tmpReg, tmpReg)
    41  	} else if hi != 0 {
    42  		c.assembler.CompileConstToRegister(amd64.MOVQ, int64(hi), tmpReg)
    43  	}
    44  	// Move the higher 64-bits with PINSRQ at the second element of 64x2 vector.
    45  	c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRQ, tmpReg, result, 1)
    46  
    47  	c.pushVectorRuntimeValueLocationOnRegister(result)
    48  	return nil
    49  }
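// NOTE: illustrative reference sketch added for exposition; the name is hypothetical and
// nothing in the compiler references it. It spells out the value the sequence above is
// expected to leave in the vector register: lo in bits [63:0] (via MOVQ) and hi in
// bits [127:64] (via PINSRQ with index 1).
func v128ConstReferenceSketch(lo, hi uint64) (result [2]uint64) {
	result[0] = lo // lower 64 bits.
	result[1] = hi // upper 64 bits.
	return
}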
    50  
    51  // compileV128Add implements compiler.compileV128Add for amd64 architecture.
    52  func (c *amd64Compiler) compileV128Add(o *wazeroir.UnionOperation) error {
    53  	x2 := c.locationStack.popV128()
    54  	if err := c.compileEnsureOnRegister(x2); err != nil {
    55  		return err
    56  	}
    57  
    58  	x1 := c.locationStack.popV128()
    59  	if err := c.compileEnsureOnRegister(x1); err != nil {
    60  		return err
    61  	}
    62  	var inst asm.Instruction
    63  	shape := o.B1
    64  	switch shape {
    65  	case wazeroir.ShapeI8x16:
    66  		inst = amd64.PADDB
    67  	case wazeroir.ShapeI16x8:
    68  		inst = amd64.PADDW
    69  	case wazeroir.ShapeI32x4:
    70  		inst = amd64.PADDD
    71  	case wazeroir.ShapeI64x2:
    72  		inst = amd64.PADDQ
    73  	case wazeroir.ShapeF32x4:
    74  		inst = amd64.ADDPS
    75  	case wazeroir.ShapeF64x2:
    76  		inst = amd64.ADDPD
    77  	}
    78  	c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register)
    79  
    80  	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
    81  	c.locationStack.markRegisterUnused(x2.register)
    82  	return nil
    83  }
    84  
    85  // compileV128Sub implements compiler.compileV128Sub for amd64 architecture.
    86  func (c *amd64Compiler) compileV128Sub(o *wazeroir.UnionOperation) error {
    87  	x2 := c.locationStack.popV128()
    88  	if err := c.compileEnsureOnRegister(x2); err != nil {
    89  		return err
    90  	}
    91  
    92  	x1 := c.locationStack.popV128()
    93  	if err := c.compileEnsureOnRegister(x1); err != nil {
    94  		return err
    95  	}
    96  	var inst asm.Instruction
    97  	shape := o.B1
    98  	switch shape {
    99  	case wazeroir.ShapeI8x16:
   100  		inst = amd64.PSUBB
   101  	case wazeroir.ShapeI16x8:
   102  		inst = amd64.PSUBW
   103  	case wazeroir.ShapeI32x4:
   104  		inst = amd64.PSUBD
   105  	case wazeroir.ShapeI64x2:
   106  		inst = amd64.PSUBQ
   107  	case wazeroir.ShapeF32x4:
   108  		inst = amd64.SUBPS
   109  	case wazeroir.ShapeF64x2:
   110  		inst = amd64.SUBPD
   111  	}
   112  	c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register)
   113  
   114  	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
   115  	c.locationStack.markRegisterUnused(x2.register)
   116  	return nil
   117  }
   118  
   119  // compileV128Load implements compiler.compileV128Load for amd64 architecture.
   120  func (c *amd64Compiler) compileV128Load(o *wazeroir.UnionOperation) error {
   121  	result, err := c.allocateRegister(registerTypeVector)
   122  	if err != nil {
   123  		return err
   124  	}
   125  
   126  	offset := uint32(o.U2)
   127  	loadType := wazeroir.V128LoadType(o.B1)
   128  
   129  	switch loadType {
   130  	case wazeroir.V128LoadType128:
   131  		err = c.compileV128LoadImpl(amd64.MOVDQU, offset, 16, result)
   132  	case wazeroir.V128LoadType8x8s:
   133  		err = c.compileV128LoadImpl(amd64.PMOVSXBW, offset, 8, result)
   134  	case wazeroir.V128LoadType8x8u:
   135  		err = c.compileV128LoadImpl(amd64.PMOVZXBW, offset, 8, result)
   136  	case wazeroir.V128LoadType16x4s:
   137  		err = c.compileV128LoadImpl(amd64.PMOVSXWD, offset, 8, result)
   138  	case wazeroir.V128LoadType16x4u:
   139  		err = c.compileV128LoadImpl(amd64.PMOVZXWD, offset, 8, result)
   140  	case wazeroir.V128LoadType32x2s:
   141  		err = c.compileV128LoadImpl(amd64.PMOVSXDQ, offset, 8, result)
   142  	case wazeroir.V128LoadType32x2u:
   143  		err = c.compileV128LoadImpl(amd64.PMOVZXDQ, offset, 8, result)
   144  	case wazeroir.V128LoadType8Splat:
   145  		reg, err := c.compileMemoryAccessCeilSetup(offset, 1)
   146  		if err != nil {
   147  			return err
   148  		}
   149  		c.assembler.CompileMemoryWithIndexToRegister(amd64.MOVBQZX, amd64ReservedRegisterForMemory, -1,
   150  			reg, 1, reg)
   151  		// pinsrb   $0, reg, result
   152  		// pxor	    tmpVReg, tmpVReg
   153  		// pshufb   tmpVReg, result
   154  		c.locationStack.markRegisterUsed(result)
   155  		tmpVReg, err := c.allocateRegister(registerTypeVector)
   156  		if err != nil {
   157  			return err
   158  		}
   159  		c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRB, reg, result, 0)
   160  		c.assembler.CompileRegisterToRegister(amd64.PXOR, tmpVReg, tmpVReg)
   161  		c.assembler.CompileRegisterToRegister(amd64.PSHUFB, tmpVReg, result)
   162  	case wazeroir.V128LoadType16Splat:
   163  		reg, err := c.compileMemoryAccessCeilSetup(offset, 2)
   164  		if err != nil {
   165  			return err
   166  		}
   167  		c.assembler.CompileMemoryWithIndexToRegister(amd64.MOVWQZX, amd64ReservedRegisterForMemory, -2,
   168  			reg, 1, reg)
   169  		// pinsrw $0, reg, result
   170  		// pinsrw $1, reg, result
   171  		// pshufd $0, result, result (result = result[0,0,0,0])
   172  		c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRW, reg, result, 0)
   173  		c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRW, reg, result, 1)
   174  		c.assembler.CompileRegisterToRegisterWithArg(amd64.PSHUFD, result, result, 0)
   175  	case wazeroir.V128LoadType32Splat:
   176  		reg, err := c.compileMemoryAccessCeilSetup(offset, 4)
   177  		if err != nil {
   178  			return err
   179  		}
   180  		c.assembler.CompileMemoryWithIndexToRegister(amd64.MOVLQZX, amd64ReservedRegisterForMemory, -4,
   181  			reg, 1, reg)
   182  		// pinsrd $0, reg, result
   183  		// pshufd $0, result, result (result = result[0,0,0,0])
   184  		c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRD, reg, result, 0)
   185  		c.assembler.CompileRegisterToRegisterWithArg(amd64.PSHUFD, result, result, 0)
   186  	case wazeroir.V128LoadType64Splat:
   187  		reg, err := c.compileMemoryAccessCeilSetup(offset, 8)
   188  		if err != nil {
   189  			return err
   190  		}
   191  		c.assembler.CompileMemoryWithIndexToRegister(amd64.MOVQ, amd64ReservedRegisterForMemory, -8,
   192  			reg, 1, reg)
   193  		// pinsrq $0, reg, result
   194  		// pinsrq $1, reg, result
   195  		c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRQ, reg, result, 0)
   196  		c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRQ, reg, result, 1)
   197  	case wazeroir.V128LoadType32zero:
   198  		err = c.compileV128LoadImpl(amd64.MOVL, offset, 4, result)
   199  	case wazeroir.V128LoadType64zero:
   200  		err = c.compileV128LoadImpl(amd64.MOVQ, offset, 8, result)
   201  	}
   202  
   203  	if err != nil {
   204  		return err
   205  	}
   206  
   207  	c.pushVectorRuntimeValueLocationOnRegister(result)
   208  	return nil
   209  }
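// NOTE: illustrative reference sketch (hypothetical name, unused by the compiler). It shows
// what the V128LoadType8Splat case above computes: the single loaded byte is broadcast to
// all 16 lanes via PINSRB into lane 0 followed by PSHUFB with an all-zero index vector.
func v128Load8SplatReferenceSketch(b byte) (out [16]byte) {
	for i := range out {
		out[i] = b
	}
	return
}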
   210  
   211  func (c *amd64Compiler) compileV128LoadImpl(inst asm.Instruction, offset uint32, targetSizeInBytes int64, dst asm.Register) error {
   212  	offsetReg, err := c.compileMemoryAccessCeilSetup(offset, targetSizeInBytes)
   213  	if err != nil {
   214  		return err
   215  	}
   216  	c.assembler.CompileMemoryWithIndexToRegister(inst, amd64ReservedRegisterForMemory, -targetSizeInBytes,
   217  		offsetReg, 1, dst)
   218  	return nil
   219  }
   220  
   221  // compileV128LoadLane implements compiler.compileV128LoadLane for amd64.
   222  func (c *amd64Compiler) compileV128LoadLane(o *wazeroir.UnionOperation) error {
   223  	targetVector := c.locationStack.popV128()
   224  	if err := c.compileEnsureOnRegister(targetVector); err != nil {
   225  		return err
   226  	}
   227  
   228  	laneSize, laneIndex := o.B1, o.B2
   229  	offset := uint32(o.U2)
   230  
   231  	var insertInst asm.Instruction
   232  	switch laneSize {
   233  	case 8:
   234  		insertInst = amd64.PINSRB
   235  	case 16:
   236  		insertInst = amd64.PINSRW
   237  	case 32:
   238  		insertInst = amd64.PINSRD
   239  	case 64:
   240  		insertInst = amd64.PINSRQ
   241  	}
   242  
   243  	targetSizeInBytes := int64(laneSize / 8)
   244  	offsetReg, err := c.compileMemoryAccessCeilSetup(offset, targetSizeInBytes)
   245  	if err != nil {
   246  		return err
   247  	}
   248  	c.assembler.CompileMemoryWithIndexAndArgToRegister(insertInst, amd64ReservedRegisterForMemory, -targetSizeInBytes,
   249  		offsetReg, 1, targetVector.register, laneIndex)
   250  
   251  	c.pushVectorRuntimeValueLocationOnRegister(targetVector.register)
   252  	return nil
   253  }
   254  
   255  // compileV128Store implements compiler.compileV128Store for amd64.
   256  func (c *amd64Compiler) compileV128Store(o *wazeroir.UnionOperation) error {
   257  	val := c.locationStack.popV128()
   258  	if err := c.compileEnsureOnRegister(val); err != nil {
   259  		return err
   260  	}
   261  
   262  	const targetSizeInBytes = 16
   263  	offset := uint32(o.U2)
   264  	offsetReg, err := c.compileMemoryAccessCeilSetup(offset, targetSizeInBytes)
   265  	if err != nil {
   266  		return err
   267  	}
   268  
   269  	c.assembler.CompileRegisterToMemoryWithIndex(amd64.MOVDQU, val.register,
   270  		amd64ReservedRegisterForMemory, -targetSizeInBytes, offsetReg, 1)
   271  
   272  	c.locationStack.markRegisterUnused(val.register, offsetReg)
   273  	return nil
   274  }
   275  
   276  // compileV128StoreLane implements compiler.compileV128StoreLane for amd64.
   277  func (c *amd64Compiler) compileV128StoreLane(o *wazeroir.UnionOperation) error {
   278  	var storeInst asm.Instruction
   279  	laneSize := o.B1
   280  	laneIndex := o.B2
   281  	offset := uint32(o.U2)
   282  	switch laneSize {
   283  	case 8:
   284  		storeInst = amd64.PEXTRB
   285  	case 16:
   286  		storeInst = amd64.PEXTRW
   287  	case 32:
   288  		storeInst = amd64.PEXTRD
   289  	case 64:
   290  		storeInst = amd64.PEXTRQ
   291  	}
   292  
   293  	val := c.locationStack.popV128()
   294  	if err := c.compileEnsureOnRegister(val); err != nil {
   295  		return err
   296  	}
   297  
   298  	targetSizeInBytes := int64(laneSize / 8)
   299  	offsetReg, err := c.compileMemoryAccessCeilSetup(offset, targetSizeInBytes)
   300  	if err != nil {
   301  		return err
   302  	}
   303  
   304  	c.assembler.CompileRegisterToMemoryWithIndexAndArg(storeInst, val.register,
   305  		amd64ReservedRegisterForMemory, -targetSizeInBytes, offsetReg, 1, laneIndex)
   306  
   307  	c.locationStack.markRegisterUnused(val.register, offsetReg)
   308  	return nil
   309  }
   310  
   311  // compileV128ExtractLane implements compiler.compileV128ExtractLane for amd64.
   312  func (c *amd64Compiler) compileV128ExtractLane(o *wazeroir.UnionOperation) error {
   313  	v := c.locationStack.popV128()
   314  	if err := c.compileEnsureOnRegister(v); err != nil {
   315  		return err
   316  	}
   317  	vreg := v.register
   318  	shape := o.B1
   319  	laneIndex := o.B2
   320  	signed := o.B3
   321  	switch shape {
   322  	case wazeroir.ShapeI8x16:
   323  		result, err := c.allocateRegister(registerTypeGeneralPurpose)
   324  		if err != nil {
   325  			return err
   326  		}
   327  		c.assembler.CompileRegisterToRegisterWithArg(amd64.PEXTRB, vreg, result, laneIndex)
   328  		if signed {
   329  			c.assembler.CompileRegisterToRegister(amd64.MOVBLSX, result, result)
   330  		} else {
   331  			c.assembler.CompileRegisterToRegister(amd64.MOVBLZX, result, result)
   332  		}
   333  		c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI32)
   334  		c.locationStack.markRegisterUnused(vreg)
   335  	case wazeroir.ShapeI16x8:
   336  		result, err := c.allocateRegister(registerTypeGeneralPurpose)
   337  		if err != nil {
   338  			return err
   339  		}
   340  		c.assembler.CompileRegisterToRegisterWithArg(amd64.PEXTRW, vreg, result, laneIndex)
   341  		if signed {
   342  			c.assembler.CompileRegisterToRegister(amd64.MOVWLSX, result, result)
   343  		} else {
   344  			c.assembler.CompileRegisterToRegister(amd64.MOVWLZX, result, result)
   345  		}
   346  		c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI32)
   347  		c.locationStack.markRegisterUnused(vreg)
   348  	case wazeroir.ShapeI32x4:
   349  		result, err := c.allocateRegister(registerTypeGeneralPurpose)
   350  		if err != nil {
   351  			return err
   352  		}
   353  		c.assembler.CompileRegisterToRegisterWithArg(amd64.PEXTRD, vreg, result, laneIndex)
   354  		c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI32)
   355  		c.locationStack.markRegisterUnused(vreg)
   356  	case wazeroir.ShapeI64x2:
   357  		result, err := c.allocateRegister(registerTypeGeneralPurpose)
   358  		if err != nil {
   359  			return err
   360  		}
   361  		c.assembler.CompileRegisterToRegisterWithArg(amd64.PEXTRQ, vreg, result, laneIndex)
   362  		c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI64)
   363  		c.locationStack.markRegisterUnused(vreg)
   364  	case wazeroir.ShapeF32x4:
   365  		if laneIndex != 0 {
   366  			c.assembler.CompileRegisterToRegisterWithArg(amd64.PSHUFD, vreg, vreg, laneIndex)
   367  		}
   368  		c.pushRuntimeValueLocationOnRegister(vreg, runtimeValueTypeF32)
   369  	case wazeroir.ShapeF64x2:
   370  		if laneIndex != 0 {
    371  			// In this case, we can assume laneIndex == 1.
    372  			// We have to shuffle vreg so that its upper 64 bits move to the lower 64 bits, for example:
   373  			//    0b11 0b10 0b01 0b00
   374  			//     |    |    |    |
   375  			//   [x3,  x2,  x1,  x0] -> [x0,  x0,  x3,  x2]
    376  			// where vreg = [x3, x2, x1, x0] and each xN is 32 bits.
    377  			// Then, interpreting the register as float64, the value is obtained from [x3, x2].
   378  			arg := byte(0b00_00_11_10)
   379  			c.assembler.CompileRegisterToRegisterWithArg(amd64.PSHUFD, vreg, vreg, arg)
   380  		}
   381  		c.pushRuntimeValueLocationOnRegister(vreg, runtimeValueTypeF64)
   382  	}
   383  
   384  	return nil
   385  }
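// NOTE: illustrative reference sketch (hypothetical name, unused by the compiler) of the
// PSHUFD rearrangement used in the ShapeF64x2 case above. Treating the register as four
// 32-bit words [x0, x1, x2, x3] (low to high), each 2-bit field of the immediate selects a
// source word, so 0b00_00_11_10 produces [x2, x3, x0, x0] and the second float64 lane ends
// up in the lower 64 bits where the scalar value is read from.
func pshufdReferenceSketch(x [4]uint32, imm byte) (out [4]uint32) {
	for i := 0; i < 4; i++ {
		sel := (imm >> (2 * i)) & 0b11 // 2-bit source selector for destination word i.
		out[i] = x[sel]
	}
	return
}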
   386  
   387  // compileV128ReplaceLane implements compiler.compileV128ReplaceLane for amd64.
   388  func (c *amd64Compiler) compileV128ReplaceLane(o *wazeroir.UnionOperation) error {
   389  	origin := c.locationStack.pop()
   390  	if err := c.compileEnsureOnRegister(origin); err != nil {
   391  		return err
   392  	}
   393  
   394  	vector := c.locationStack.popV128()
   395  	if err := c.compileEnsureOnRegister(vector); err != nil {
   396  		return err
   397  	}
   398  
   399  	shape := o.B1
   400  	laneIndex := o.B2
   401  	switch shape {
   402  	case wazeroir.ShapeI8x16:
   403  		c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRB, origin.register, vector.register, laneIndex)
   404  	case wazeroir.ShapeI16x8:
   405  		c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRW, origin.register, vector.register, laneIndex)
   406  	case wazeroir.ShapeI32x4:
   407  		c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRD, origin.register, vector.register, laneIndex)
   408  	case wazeroir.ShapeI64x2:
   409  		c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRQ, origin.register, vector.register, laneIndex)
   410  	case wazeroir.ShapeF32x4:
   411  		c.assembler.CompileRegisterToRegisterWithArg(amd64.INSERTPS, origin.register, vector.register,
    412  			// In the INSERTPS instruction, the destination index is encoded in bits 4 and 5 of the immediate argument.
   413  			// See https://www.felixcloutier.com/x86/insertps
   414  			laneIndex<<4,
   415  		)
   416  	case wazeroir.ShapeF64x2:
   417  		if laneIndex == 0 {
   418  			c.assembler.CompileRegisterToRegister(amd64.MOVSD, origin.register, vector.register)
   419  		} else {
   420  			c.assembler.CompileRegisterToRegister(amd64.MOVLHPS, origin.register, vector.register)
   421  		}
   422  	}
   423  
   424  	c.pushVectorRuntimeValueLocationOnRegister(vector.register)
   425  	c.locationStack.markRegisterUnused(origin.register)
   426  	return nil
   427  }
   428  
   429  // compileV128Splat implements compiler.compileV128Splat for amd64.
   430  func (c *amd64Compiler) compileV128Splat(o *wazeroir.UnionOperation) (err error) {
   431  	origin := c.locationStack.pop()
   432  	if err = c.compileEnsureOnRegister(origin); err != nil {
   433  		return
   434  	}
   435  
   436  	var result asm.Register
   437  	shape := o.B1
   438  	switch shape {
   439  	case wazeroir.ShapeI8x16:
   440  		result, err = c.allocateRegister(registerTypeVector)
   441  		if err != nil {
   442  			return err
   443  		}
   444  		c.locationStack.markRegisterUsed(result)
   445  
   446  		tmp, err := c.allocateRegister(registerTypeVector)
   447  		if err != nil {
   448  			return err
   449  		}
   450  		c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRB, origin.register, result, 0)
   451  		c.assembler.CompileRegisterToRegister(amd64.PXOR, tmp, tmp)
   452  		c.assembler.CompileRegisterToRegister(amd64.PSHUFB, tmp, result)
   453  	case wazeroir.ShapeI16x8:
   454  		result, err = c.allocateRegister(registerTypeVector)
   455  		if err != nil {
   456  			return err
   457  		}
   458  		c.locationStack.markRegisterUsed(result)
   459  		c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRW, origin.register, result, 0)
   460  		c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRW, origin.register, result, 1)
   461  		c.assembler.CompileRegisterToRegisterWithArg(amd64.PSHUFD, result, result, 0)
   462  	case wazeroir.ShapeI32x4:
   463  		result, err = c.allocateRegister(registerTypeVector)
   464  		if err != nil {
   465  			return err
   466  		}
   467  		c.locationStack.markRegisterUsed(result)
   468  		c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRD, origin.register, result, 0)
   469  		c.assembler.CompileRegisterToRegisterWithArg(amd64.PSHUFD, result, result, 0)
   470  	case wazeroir.ShapeI64x2:
   471  		result, err = c.allocateRegister(registerTypeVector)
   472  		if err != nil {
   473  			return err
   474  		}
   475  		c.locationStack.markRegisterUsed(result)
   476  		c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRQ, origin.register, result, 0)
   477  		c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRQ, origin.register, result, 1)
   478  	case wazeroir.ShapeF32x4:
   479  		result = origin.register
   480  		c.assembler.CompileRegisterToRegisterWithArg(amd64.INSERTPS, origin.register, result, 0)
   481  		c.assembler.CompileRegisterToRegisterWithArg(amd64.PSHUFD, result, result, 0)
   482  	case wazeroir.ShapeF64x2:
   483  		result = origin.register
   484  		c.assembler.CompileRegisterToRegister(amd64.MOVQ, origin.register, result)
   485  		c.assembler.CompileRegisterToRegister(amd64.MOVLHPS, origin.register, result)
   486  	}
   487  
   488  	c.locationStack.markRegisterUnused(origin.register)
   489  	c.pushVectorRuntimeValueLocationOnRegister(result)
   490  	return nil
   491  }
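// NOTE: illustrative reference sketch (hypothetical name, unused by the compiler) of the
// i32x4 splat handled above: the scalar origin is duplicated into every lane (PINSRD into
// lane 0 followed by PSHUFD with immediate 0, which copies word 0 to all four positions).
func i32x4SplatReferenceSketch(v uint32) (out [4]uint32) {
	for i := range out {
		out[i] = v
	}
	return
}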
   492  
   493  // compileV128Shuffle implements compiler.compileV128Shuffle for amd64.
   494  func (c *amd64Compiler) compileV128Shuffle(o *wazeroir.UnionOperation) error {
   495  	w := c.locationStack.popV128()
   496  	if err := c.compileEnsureOnRegister(w); err != nil {
   497  		return err
   498  	}
   499  
   500  	v := c.locationStack.popV128()
   501  	if err := c.compileEnsureOnRegister(v); err != nil {
   502  		return err
   503  	}
   504  
   505  	wr, vr := w.register, v.register
   506  
   507  	tmp, err := c.allocateRegister(registerTypeVector)
   508  	if err != nil {
   509  		return err
   510  	}
   511  
   512  	consts := [32]byte{}
   513  	lanes := o.Us
   514  	for i, unsignedLane := range lanes {
   515  		lane := byte(unsignedLane)
   516  		if lane < 16 {
   517  			consts[i+16] = 0x80
   518  			consts[i] = lane
   519  		} else {
   520  			consts[i+16] = lane - 16
   521  			consts[i] = 0x80
   522  		}
   523  	}
   524  
   525  	err = c.assembler.CompileStaticConstToRegister(amd64.MOVDQU, asm.NewStaticConst(consts[:16]), tmp)
   526  	if err != nil {
   527  		return err
   528  	}
   529  	c.assembler.CompileRegisterToRegister(amd64.PSHUFB, tmp, vr)
   530  	err = c.assembler.CompileStaticConstToRegister(amd64.MOVDQU, asm.NewStaticConst(consts[16:]), tmp)
   531  	if err != nil {
   532  		return err
   533  	}
   534  	c.assembler.CompileRegisterToRegister(amd64.PSHUFB, tmp, wr)
   535  	c.assembler.CompileRegisterToRegister(amd64.ORPS, vr, wr)
   536  
   537  	c.pushVectorRuntimeValueLocationOnRegister(wr)
   538  	c.locationStack.markRegisterUnused(vr)
   539  	return nil
   540  }
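// NOTE: illustrative reference sketch (hypothetical name, unused by the compiler) of the
// two-mask PSHUFB shuffle above: output lane i takes byte lanes[i] from v when lanes[i] < 16,
// and byte lanes[i]-16 from w otherwise. An index byte with its top bit set (0x80) makes
// PSHUFB produce zero, which is how the non-selected vector is masked out before the final OR.
// Lane indices are assumed to be in 0..31, as the shuffle immediate is validated to be.
func i8x16ShuffleReferenceSketch(v, w, lanes [16]byte) (out [16]byte) {
	for i, lane := range lanes {
		if lane < 16 {
			out[i] = v[lane]
		} else {
			out[i] = w[lane-16]
		}
	}
	return
}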
   541  
   542  var swizzleConst = [16]byte{
   543  	0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
   544  	0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
   545  }
   546  
   547  // compileV128Swizzle implements compiler.compileV128Swizzle for amd64.
   548  func (c *amd64Compiler) compileV128Swizzle(*wazeroir.UnionOperation) error {
   549  	index := c.locationStack.popV128()
   550  	if err := c.compileEnsureOnRegister(index); err != nil {
   551  		return err
   552  	}
   553  
   554  	base := c.locationStack.popV128()
   555  	if err := c.compileEnsureOnRegister(base); err != nil {
   556  		return err
   557  	}
   558  
   559  	idxReg, baseReg := index.register, base.register
   560  
   561  	tmp, err := c.allocateRegister(registerTypeVector)
   562  	if err != nil {
   563  		return err
   564  	}
   565  
   566  	err = c.assembler.CompileStaticConstToRegister(amd64.MOVDQU, asm.NewStaticConst(swizzleConst[:]), tmp)
   567  	if err != nil {
   568  		return err
   569  	}
   570  
   571  	c.assembler.CompileRegisterToRegister(amd64.PADDUSB, tmp, idxReg)
   572  	c.assembler.CompileRegisterToRegister(amd64.PSHUFB, idxReg, baseReg)
   573  
   574  	c.pushVectorRuntimeValueLocationOnRegister(baseReg)
   575  	c.locationStack.markRegisterUnused(idxReg)
   576  	return nil
   577  }
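// NOTE: illustrative reference sketch (hypothetical name, unused by the compiler) of the
// swizzle emulation above. PADDUSB with 0x70 saturates any index >= 16 to a value with the
// top bit set, which PSHUFB turns into a zero byte; indices 0..15 become 0x70..0x7f, whose
// low 4 bits (all PSHUFB uses for selection) still equal the original index.
func i8x16SwizzleReferenceSketch(base, indices [16]byte) (out [16]byte) {
	for i, idx := range indices {
		if idx < 16 {
			out[i] = base[idx]
		} // out[i] stays zero for out-of-range indices.
	}
	return
}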
   578  
   579  // compileV128AnyTrue implements compiler.compileV128AnyTrue for amd64.
   580  func (c *amd64Compiler) compileV128AnyTrue(*wazeroir.UnionOperation) error {
   581  	v := c.locationStack.popV128()
   582  	if err := c.compileEnsureOnRegister(v); err != nil {
   583  		return err
   584  	}
   585  	vreg := v.register
   586  
   587  	c.assembler.CompileRegisterToRegister(amd64.PTEST, vreg, vreg)
   588  
   589  	c.locationStack.pushRuntimeValueLocationOnConditionalRegister(amd64.ConditionalRegisterStateNE)
   590  	c.locationStack.markRegisterUnused(vreg)
   591  	return nil
   592  }
   593  
   594  // compileV128AllTrue implements compiler.compileV128AllTrue for amd64.
   595  func (c *amd64Compiler) compileV128AllTrue(o *wazeroir.UnionOperation) error {
   596  	v := c.locationStack.popV128()
   597  	if err := c.compileEnsureOnRegister(v); err != nil {
   598  		return err
   599  	}
   600  
   601  	tmp, err := c.allocateRegister(registerTypeVector)
   602  	if err != nil {
   603  		return err
   604  	}
   605  
   606  	var cmpInst asm.Instruction
   607  	shape := o.B1
   608  	switch shape {
   609  	case wazeroir.ShapeI8x16:
   610  		cmpInst = amd64.PCMPEQB
   611  	case wazeroir.ShapeI16x8:
   612  		cmpInst = amd64.PCMPEQW
   613  	case wazeroir.ShapeI32x4:
   614  		cmpInst = amd64.PCMPEQD
   615  	case wazeroir.ShapeI64x2:
   616  		cmpInst = amd64.PCMPEQQ
   617  	}
   618  
   619  	c.assembler.CompileRegisterToRegister(amd64.PXOR, tmp, tmp)
   620  	c.assembler.CompileRegisterToRegister(cmpInst, v.register, tmp)
   621  	c.assembler.CompileRegisterToRegister(amd64.PTEST, tmp, tmp)
   622  	c.locationStack.markRegisterUnused(v.register, tmp)
   623  	c.locationStack.pushRuntimeValueLocationOnConditionalRegister(amd64.ConditionalRegisterStateE)
   624  	return nil
   625  }
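// NOTE: illustrative reference sketch (hypothetical name, unused by the compiler) of the
// all_true test above, shown for the i32x4 shape: tmp is zeroed, PCMPEQD sets all-ones in
// tmp for every zero lane of v, and PTEST sets the zero flag only when tmp is entirely
// zero, i.e. when no lane of v was zero.
func i32x4AllTrueReferenceSketch(v [4]uint32) bool {
	for _, lane := range v {
		if lane == 0 {
			return false
		}
	}
	return true
}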
   626  
   627  // compileV128BitMask implements compiler.compileV128BitMask for amd64.
   628  func (c *amd64Compiler) compileV128BitMask(o *wazeroir.UnionOperation) error {
   629  	v := c.locationStack.popV128()
   630  	if err := c.compileEnsureOnRegister(v); err != nil {
   631  		return err
   632  	}
   633  
   634  	result, err := c.allocateRegister(registerTypeGeneralPurpose)
   635  	if err != nil {
   636  		return err
   637  	}
   638  
   639  	shape := o.B1
   640  	switch shape {
   641  	case wazeroir.ShapeI8x16:
   642  		c.assembler.CompileRegisterToRegister(amd64.PMOVMSKB, v.register, result)
   643  	case wazeroir.ShapeI16x8:
   644  		// When we have:
   645  		// 	R1 = [R1(w1), R1(w2), R1(w3), R1(w4), R1(w5), R1(w6), R1(w7), R1(v8)]
    646  		// 	R1 = [R1(w1), R1(w2), R1(w3), R1(w4), R1(w5), R1(w6), R1(w7), R1(w8)]
    647  		// 	R2 = [R2(w1), R2(w2), R2(w3), R2(w4), R2(w5), R2(w6), R2(w7), R2(w8)]
    648  		//	where RX(wn) is the n-th signed word (16-bit) of the RX register,
   649  		// "PACKSSWB R1, R2" produces
   650  		//  R1 = [
   651  		// 		byte_sat(R1(w1)), byte_sat(R1(w2)), byte_sat(R1(w3)), byte_sat(R1(w4)),
   652  		// 		byte_sat(R1(w5)), byte_sat(R1(w6)), byte_sat(R1(w7)), byte_sat(R1(w8)),
   653  		// 		byte_sat(R2(w1)), byte_sat(R2(w2)), byte_sat(R2(w3)), byte_sat(R2(w4)),
   654  		// 		byte_sat(R2(w5)), byte_sat(R2(w6)), byte_sat(R2(w7)), byte_sat(R2(w8)),
   655  		//  ]
   656  		//  where R1 is the destination register, and
   657  		// 	byte_sat(w) = int8(w) if w fits as signed 8-bit,
   658  		//                0x80 if w is less than 0x80
    659  		//                0x80 if w is less than -0x80 (i.e. below the signed 8-bit minimum),
   660  		//
   661  		// See https://www.felixcloutier.com/x86/packsswb:packssdw for detail.
   662  		//
    663  		// Therefore, after PACKSSWB, the i-th and (i+8)-th bytes of v.register have their sign bit set iff the i-th lane is negative (for i in 0..7).
   664  		c.assembler.CompileRegisterToRegister(amd64.PACKSSWB, v.register, v.register)
   665  		c.assembler.CompileRegisterToRegister(amd64.PMOVMSKB, v.register, result)
    666  		// The 8-bit mask is duplicated in bits 0-7 and 8-15, so shift right by 8 to keep a single copy.
   667  		c.assembler.CompileConstToRegister(amd64.SHRQ, 8, result)
   668  	case wazeroir.ShapeI32x4:
   669  		c.assembler.CompileRegisterToRegister(amd64.MOVMSKPS, v.register, result)
   670  	case wazeroir.ShapeI64x2:
   671  		c.assembler.CompileRegisterToRegister(amd64.MOVMSKPD, v.register, result)
   672  	}
   673  
   674  	c.locationStack.markRegisterUnused(v.register)
   675  	c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI32)
   676  	return nil
   677  }
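// NOTE: illustrative reference sketch (hypothetical name, unused by the compiler) of the
// i16x8 bitmask path above: PACKSSWB followed by PMOVMSKB yields the 8-bit sign mask
// duplicated in bits 0-7 and 8-15, and the SHRQ by 8 keeps a single copy, equivalent to
// collecting each lane's sign bit as below.
func i16x8BitMaskReferenceSketch(v [8]int16) (mask uint32) {
	for i, lane := range v {
		if lane < 0 {
			mask |= 1 << i
		}
	}
	return
}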
   678  
   679  // compileV128And implements compiler.compileV128And for amd64.
   680  func (c *amd64Compiler) compileV128And(*wazeroir.UnionOperation) error {
   681  	x2 := c.locationStack.popV128()
   682  	if err := c.compileEnsureOnRegister(x2); err != nil {
   683  		return err
   684  	}
   685  
   686  	x1 := c.locationStack.popV128()
   687  	if err := c.compileEnsureOnRegister(x1); err != nil {
   688  		return err
   689  	}
   690  
   691  	c.assembler.CompileRegisterToRegister(amd64.PAND, x2.register, x1.register)
   692  
   693  	c.locationStack.markRegisterUnused(x2.register)
   694  	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
   695  	return nil
   696  }
   697  
   698  // compileV128Not implements compiler.compileV128Not for amd64.
   699  func (c *amd64Compiler) compileV128Not(*wazeroir.UnionOperation) error {
   700  	v := c.locationStack.popV128()
   701  	if err := c.compileEnsureOnRegister(v); err != nil {
   702  		return err
   703  	}
   704  
   705  	tmp, err := c.allocateRegister(registerTypeVector)
   706  	if err != nil {
   707  		return err
   708  	}
   709  
   710  	// Set all bits on tmp register.
   711  	c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, tmp, tmp)
   712  	// Then XOR with tmp to reverse all bits on v.register.
   713  	c.assembler.CompileRegisterToRegister(amd64.PXOR, tmp, v.register)
   714  	c.pushVectorRuntimeValueLocationOnRegister(v.register)
   715  	return nil
   716  }
   717  
   718  // compileV128Or implements compiler.compileV128Or for amd64.
   719  func (c *amd64Compiler) compileV128Or(*wazeroir.UnionOperation) error {
   720  	x2 := c.locationStack.popV128()
   721  	if err := c.compileEnsureOnRegister(x2); err != nil {
   722  		return err
   723  	}
   724  
   725  	x1 := c.locationStack.popV128()
   726  	if err := c.compileEnsureOnRegister(x1); err != nil {
   727  		return err
   728  	}
   729  
   730  	c.assembler.CompileRegisterToRegister(amd64.POR, x2.register, x1.register)
   731  
   732  	c.locationStack.markRegisterUnused(x2.register)
   733  	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
   734  	return nil
   735  }
   736  
   737  // compileV128Xor implements compiler.compileV128Xor for amd64.
   738  func (c *amd64Compiler) compileV128Xor(*wazeroir.UnionOperation) error {
   739  	x2 := c.locationStack.popV128()
   740  	if err := c.compileEnsureOnRegister(x2); err != nil {
   741  		return err
   742  	}
   743  
   744  	x1 := c.locationStack.popV128()
   745  	if err := c.compileEnsureOnRegister(x1); err != nil {
   746  		return err
   747  	}
   748  
   749  	c.assembler.CompileRegisterToRegister(amd64.PXOR, x2.register, x1.register)
   750  
   751  	c.locationStack.markRegisterUnused(x2.register)
   752  	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
   753  	return nil
   754  }
   755  
   756  // compileV128Bitselect implements compiler.compileV128Bitselect for amd64.
   757  func (c *amd64Compiler) compileV128Bitselect(*wazeroir.UnionOperation) error {
   758  	selector := c.locationStack.popV128()
   759  	if err := c.compileEnsureOnRegister(selector); err != nil {
   760  		return err
   761  	}
   762  
   763  	x2 := c.locationStack.popV128()
   764  	if err := c.compileEnsureOnRegister(x2); err != nil {
   765  		return err
   766  	}
   767  
   768  	x1 := c.locationStack.popV128()
   769  	if err := c.compileEnsureOnRegister(x1); err != nil {
   770  		return err
   771  	}
   772  
   773  	// The following logic is equivalent to v128.or(v128.and(v1, selector), v128.and(v2, v128.not(selector)))
   774  	// See https://github.com/WebAssembly/spec/blob/wg-2.0.draft1/proposals/simd/SIMD.md#bitwise-select
   775  	c.assembler.CompileRegisterToRegister(amd64.PAND, selector.register, x1.register)
   776  	c.assembler.CompileRegisterToRegister(amd64.PANDN, x2.register, selector.register)
   777  	c.assembler.CompileRegisterToRegister(amd64.POR, selector.register, x1.register)
   778  
   779  	c.locationStack.markRegisterUnused(x2.register, selector.register)
   780  	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
   781  	return nil
   782  }
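// NOTE: illustrative reference sketch (hypothetical name, unused by the compiler) of the
// bitselect lowering above, matching
// v128.or(v128.and(x1, selector), v128.and(x2, v128.not(selector))) bit by bit.
func v128BitselectReferenceSketch(x1, x2, selector [2]uint64) (out [2]uint64) {
	for i := range out {
		out[i] = (x1[i] & selector[i]) | (x2[i] &^ selector[i])
	}
	return
}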
   783  
   784  // compileV128AndNot implements compiler.compileV128AndNot for amd64.
   785  func (c *amd64Compiler) compileV128AndNot(*wazeroir.UnionOperation) error {
   786  	x2 := c.locationStack.popV128()
   787  	if err := c.compileEnsureOnRegister(x2); err != nil {
   788  		return err
   789  	}
   790  
   791  	x1 := c.locationStack.popV128()
   792  	if err := c.compileEnsureOnRegister(x1); err != nil {
   793  		return err
   794  	}
   795  
   796  	c.assembler.CompileRegisterToRegister(amd64.PANDN, x1.register, x2.register)
   797  
   798  	c.locationStack.markRegisterUnused(x1.register)
   799  	c.pushVectorRuntimeValueLocationOnRegister(x2.register)
   800  	return nil
   801  }
   802  
   803  // compileV128Shr implements compiler.compileV128Shr for amd64.
   804  func (c *amd64Compiler) compileV128Shr(o *wazeroir.UnionOperation) error {
   805  	// https://stackoverflow.com/questions/35002937/sse-simd-shift-with-one-byte-element-size-granularity
   806  	shape := o.B1
   807  	signed := o.B3
   808  	if shape == wazeroir.ShapeI8x16 {
   809  		return c.compileV128ShrI8x16Impl(signed)
   810  	} else if shape == wazeroir.ShapeI64x2 && signed {
   811  		return c.compileV128ShrI64x2SignedImpl()
   812  	} else {
   813  		return c.compileV128ShrImpl(o)
   814  	}
   815  }
   816  
   817  // compileV128ShrImpl implements shift right instructions except for i8x16 (logical/arithmetic) and i64x2 (arithmetic).
   818  func (c *amd64Compiler) compileV128ShrImpl(o *wazeroir.UnionOperation) error {
   819  	s := c.locationStack.pop()
   820  	if err := c.compileEnsureOnRegister(s); err != nil {
   821  		return err
   822  	}
   823  
   824  	x1 := c.locationStack.popV128()
   825  	if err := c.compileEnsureOnRegister(x1); err != nil {
   826  		return err
   827  	}
   828  
   829  	vecTmp, err := c.allocateRegister(registerTypeVector)
   830  	if err != nil {
   831  		return err
   832  	}
   833  
    834  	var moduloConst int64
   835  	var shift asm.Instruction
   836  	shape := o.B1
   837  	signed := o.B3
   838  	switch shape {
   839  	case wazeroir.ShapeI16x8:
    840  		moduloConst = 0xf // modulo 16.
   841  		if signed {
   842  			shift = amd64.PSRAW
   843  		} else {
   844  			shift = amd64.PSRLW
   845  		}
   846  	case wazeroir.ShapeI32x4:
    847  		moduloConst = 0x1f // modulo 32.
   848  		if signed {
   849  			shift = amd64.PSRAD
   850  		} else {
   851  			shift = amd64.PSRLD
   852  		}
   853  	case wazeroir.ShapeI64x2:
    854  		moduloConst = 0x3f // modulo 64.
   855  		shift = amd64.PSRLQ
   856  	}
   857  
   858  	gpShiftAmount := s.register
    859  	c.assembler.CompileConstToRegister(amd64.ANDQ, moduloConst, gpShiftAmount)
   860  	c.assembler.CompileRegisterToRegister(amd64.MOVL, gpShiftAmount, vecTmp)
   861  	c.assembler.CompileRegisterToRegister(shift, vecTmp, x1.register)
   862  
   863  	c.locationStack.markRegisterUnused(gpShiftAmount)
   864  	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
   865  	return nil
   866  }
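// NOTE: illustrative reference sketch (hypothetical name, unused by the compiler) of the
// generic shift-right lowering above, shown for the i16x8 unsigned case: the shift amount is
// reduced modulo the lane width and applied uniformly to every lane (the packed shift reads
// its count from the low 64 bits of vecTmp).
func i16x8ShrUReferenceSketch(v [8]uint16, amount uint32) (out [8]uint16) {
	s := amount & 0xf // modulo 16, as done with ANDQ above.
	for i, lane := range v {
		out[i] = lane >> s
	}
	return
}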
   867  
   868  // compileV128ShrI64x2SignedImpl implements compiler.compileV128Shr for i64x2 signed (arithmetic) shift.
    869  // The PSRAQ instruction requires AVX-512, so we emulate it with general-purpose shifts instead. https://www.felixcloutier.com/x86/psraw:psrad:psraq
   870  func (c *amd64Compiler) compileV128ShrI64x2SignedImpl() error {
   871  	const shiftCountRegister = amd64.RegCX
   872  
   873  	s := c.locationStack.pop()
   874  	if s.register != shiftCountRegister {
   875  		// If another value lives on the CX register, we release it to the stack.
   876  		c.onValueReleaseRegisterToStack(shiftCountRegister)
   877  		if s.onStack() {
   878  			s.setRegister(shiftCountRegister)
   879  			c.compileLoadValueOnStackToRegister(s)
   880  		} else if s.onConditionalRegister() {
   881  			c.compileMoveConditionalToGeneralPurposeRegister(s, shiftCountRegister)
   882  		} else { // already on register.
   883  			old := s.register
   884  			c.assembler.CompileRegisterToRegister(amd64.MOVL, old, shiftCountRegister)
   885  			s.setRegister(shiftCountRegister)
   886  			c.locationStack.markRegisterUnused(old)
   887  		}
   888  	}
   889  
   890  	c.locationStack.markRegisterUsed(shiftCountRegister)
   891  	tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
   892  	if err != nil {
   893  		return err
   894  	}
   895  
   896  	x1 := c.locationStack.popV128()
   897  	if err := c.compileEnsureOnRegister(x1); err != nil {
   898  		return err
   899  	}
   900  
    901  	// Extract each lane into tmp, perform the arithmetic right shift (SARQ) on tmp, and write it back to the lane.
   902  	c.assembler.CompileRegisterToRegisterWithArg(amd64.PEXTRQ, x1.register, tmp, 0)
   903  	c.assembler.CompileRegisterToRegister(amd64.SARQ, shiftCountRegister, tmp)
   904  	c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRQ, tmp, x1.register, 0)
   905  	c.assembler.CompileRegisterToRegisterWithArg(amd64.PEXTRQ, x1.register, tmp, 1)
   906  	c.assembler.CompileRegisterToRegister(amd64.SARQ, shiftCountRegister, tmp)
   907  	c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRQ, tmp, x1.register, 1)
   908  
   909  	c.locationStack.markRegisterUnused(shiftCountRegister)
   910  	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
   911  	return nil
   912  }
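// NOTE: illustrative reference sketch (hypothetical name, unused by the compiler) of the
// emulated i64x2 arithmetic shift above: each lane is extracted into a general-purpose
// register, shifted with SARQ by the count in CL (which the hardware masks to 6 bits), and
// written back with PINSRQ.
func i64x2ShrSReferenceSketch(v [2]int64, amount uint64) (out [2]int64) {
	s := amount & 63 // modulo 64, matching SARQ's masking of the CL count.
	for i, lane := range v {
		out[i] = lane >> s
	}
	return
}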
   913  
    914  // i8x16LogicalSHRMaskTable is necessary for emulating the non-existent packed byte logical right shifts on amd64.
   915  // The mask is applied after performing packed word shifts on the value to clear out the unnecessary bits.
   916  var i8x16LogicalSHRMaskTable = [8 * 16]byte{ // (the number of possible shift amount 0, 1, ..., 7.) * 16 bytes.
   917  	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // for 0 shift
   918  	0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // for 1 shift
   919  	0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, // for 2 shift
   920  	0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, // for 3 shift
   921  	0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, // for 4 shift
   922  	0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, // for 5 shift
   923  	0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, // for 6 shift
   924  	0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, // for 7 shift
   925  }
   926  
    927  // compileV128ShrI8x16Impl implements compiler.compileV128Shr for i8x16 signed (arithmetic) and unsigned (logical) shifts.
   928  // amd64 doesn't have packed byte shifts, so we need this special casing.
   929  // See https://stackoverflow.com/questions/35002937/sse-simd-shift-with-one-byte-element-size-granularity
   930  func (c *amd64Compiler) compileV128ShrI8x16Impl(signed bool) error {
   931  	s := c.locationStack.pop()
   932  	if err := c.compileEnsureOnRegister(s); err != nil {
   933  		return err
   934  	}
   935  
   936  	v := c.locationStack.popV128()
   937  	if err := c.compileEnsureOnRegister(v); err != nil {
   938  		return err
   939  	}
   940  
   941  	vecTmp, err := c.allocateRegister(registerTypeVector)
   942  	if err != nil {
   943  		return err
   944  	}
   945  
   946  	gpShiftAmount := s.register
   947  	c.assembler.CompileConstToRegister(amd64.ANDQ, 0x7, gpShiftAmount) // mod 8.
   948  
   949  	if signed {
   950  		c.locationStack.markRegisterUsed(vecTmp)
   951  		vecTmp2, err := c.allocateRegister(registerTypeVector)
   952  		if err != nil {
   953  			return err
   954  		}
   955  
   956  		vreg := v.register
   957  
   958  		// Copy the value from v.register to vecTmp.
   959  		c.assembler.CompileRegisterToRegister(amd64.MOVDQA, vreg, vecTmp)
   960  
   961  		// Assuming that we have
   962  		//  vreg   = [b1, ..., b16]
   963  		//  vecTmp = [b1, ..., b16]
   964  		// at this point, then we use PUNPCKLBW and PUNPCKHBW to produce:
   965  		//  vreg   = [b1, b1, b2, b2, ..., b8, b8]
   966  		//  vecTmp = [b9, b9, b10, b10, ..., b16, b16]
   967  		c.assembler.CompileRegisterToRegister(amd64.PUNPCKLBW, vreg, vreg)
   968  		c.assembler.CompileRegisterToRegister(amd64.PUNPCKHBW, vecTmp, vecTmp)
   969  
    970  		// Add 8 to the shift amount so the word-wise arithmetic shift below also discards the duplicated low byte, then move the amount to vecTmp2.
   971  		c.assembler.CompileConstToRegister(amd64.ADDQ, 0x8, gpShiftAmount)
   972  		c.assembler.CompileRegisterToRegister(amd64.MOVL, gpShiftAmount, vecTmp2)
   973  
   974  		// Perform the word packed arithmetic right shifts on vreg and vecTmp.
   975  		// This changes these two registers as:
   976  		//  vreg   = [xxx, b1 >> s, xxx, b2 >> s, ..., xxx, b8 >> s]
   977  		//  vecTmp = [xxx, b9 >> s, xxx, b10 >> s, ..., xxx, b16 >> s]
    978  		//  where xxx is 0x00 or 0xff depending on each byte's sign, and ">>" is the arithmetic shift on a byte.
   979  		c.assembler.CompileRegisterToRegister(amd64.PSRAW, vecTmp2, vreg)
   980  		c.assembler.CompileRegisterToRegister(amd64.PSRAW, vecTmp2, vecTmp)
   981  
   982  		// Finally, we can get the result by packing these two word vectors.
   983  		c.assembler.CompileRegisterToRegister(amd64.PACKSSWB, vecTmp, vreg)
   984  
   985  		c.locationStack.markRegisterUnused(gpShiftAmount, vecTmp)
   986  		c.pushVectorRuntimeValueLocationOnRegister(vreg)
   987  	} else {
   988  		c.assembler.CompileRegisterToRegister(amd64.MOVL, gpShiftAmount, vecTmp)
    989  		// amd64 doesn't have packed byte shifts, so we use a packed word shift here, and then mask out
    990  		// the unnecessary bits below.
   991  		c.assembler.CompileRegisterToRegister(amd64.PSRLW, vecTmp, v.register)
   992  
   993  		gpTmp, err := c.allocateRegister(registerTypeGeneralPurpose)
   994  		if err != nil {
   995  			return err
   996  		}
   997  
   998  		// Read the initial address of the mask table into gpTmp register.
   999  		err = c.assembler.CompileStaticConstToRegister(amd64.LEAQ, asm.NewStaticConst(i8x16LogicalSHRMaskTable[:]), gpTmp)
  1000  		if err != nil {
  1001  			return err
  1002  		}
  1003  
  1004  		// We have to get the mask according to the shift amount, so we first have to do
  1005  		// gpShiftAmount << 4 = gpShiftAmount*16 to get the initial offset of the mask (16 is the size of each mask in bytes).
  1006  		c.assembler.CompileConstToRegister(amd64.SHLQ, 4, gpShiftAmount)
  1007  
  1008  		// Now ready to read the content of the mask into the vecTmp.
  1009  		c.assembler.CompileMemoryWithIndexToRegister(amd64.MOVDQU,
  1010  			gpTmp, 0, gpShiftAmount, 1,
  1011  			vecTmp,
  1012  		)
  1013  
   1014  		// Finally, clear out the unnecessary bits with the mask.
  1015  		c.assembler.CompileRegisterToRegister(amd64.PAND, vecTmp, v.register)
  1016  
  1017  		c.locationStack.markRegisterUnused(gpShiftAmount)
  1018  		c.pushVectorRuntimeValueLocationOnRegister(v.register)
  1019  	}
  1020  	return nil
  1021  }
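// NOTE: illustrative reference sketch (hypothetical name, unused by the compiler) of the
// unsigned i8x16 right-shift emulation above: the packed word shift (PSRLW) lets bits from
// the neighboring byte leak into each byte's high bits, and ANDing with the mask for the
// shift amount (0xff >> s replicated 16 times) clears exactly those leaked bits.
func i8x16ShrUReferenceSketch(v [16]byte, s uint8) (out [16]byte) {
	s &= 7 // modulo 8, as done with ANDQ above.
	for i, b := range v {
		out[i] = b >> s
	}
	return
}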
  1022  
   1023  // i8x16SHLMaskTable is necessary for emulating the non-existent packed byte left shifts on amd64.
  1024  // The mask is applied after performing packed word shifts on the value to clear out the unnecessary bits.
  1025  var i8x16SHLMaskTable = [8 * 16]byte{ // (the number of possible shift amount 0, 1, ..., 7.) * 16 bytes.
  1026  	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // for 0 shift
  1027  	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, // for 1 shift
  1028  	0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, // for 2 shift
  1029  	0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, // for 3 shift
  1030  	0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // for 4 shift
  1031  	0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, // for 5 shift
  1032  	0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, // for 6 shift
  1033  	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, // for 7 shift
  1034  }
  1035  
  1036  // compileV128Shl implements compiler.compileV128Shl for amd64.
  1037  func (c *amd64Compiler) compileV128Shl(o *wazeroir.UnionOperation) error {
  1038  	s := c.locationStack.pop()
  1039  	if err := c.compileEnsureOnRegister(s); err != nil {
  1040  		return err
  1041  	}
  1042  
  1043  	x1 := c.locationStack.popV128()
  1044  	if err := c.compileEnsureOnRegister(x1); err != nil {
  1045  		return err
  1046  	}
  1047  
  1048  	vecTmp, err := c.allocateRegister(registerTypeVector)
  1049  	if err != nil {
  1050  		return err
  1051  	}
  1052  
  1053  	var modulo int64
  1054  	var shift asm.Instruction
  1055  	shape := o.B1
  1056  	switch shape {
  1057  	case wazeroir.ShapeI8x16:
  1058  		modulo = 0x7 // modulo 8.
   1059  		// x86 doesn't have a packed byte shift, so we use PSLLW and mask out the redundant bits.
  1060  		// See https://stackoverflow.com/questions/35002937/sse-simd-shift-with-one-byte-element-size-granularity
  1061  		shift = amd64.PSLLW
  1062  	case wazeroir.ShapeI16x8:
  1063  		modulo = 0xf // modulo 16.
  1064  		shift = amd64.PSLLW
  1065  	case wazeroir.ShapeI32x4:
  1066  		modulo = 0x1f // modulo 32.
  1067  		shift = amd64.PSLLD
  1068  	case wazeroir.ShapeI64x2:
  1069  		modulo = 0x3f // modulo 64.
  1070  		shift = amd64.PSLLQ
  1071  	}
  1072  
  1073  	gpShiftAmount := s.register
  1074  	c.assembler.CompileConstToRegister(amd64.ANDQ, modulo, gpShiftAmount)
  1075  	c.assembler.CompileRegisterToRegister(amd64.MOVL, gpShiftAmount, vecTmp)
  1076  	c.assembler.CompileRegisterToRegister(shift, vecTmp, x1.register)
  1077  
  1078  	if shape == wazeroir.ShapeI8x16 {
  1079  		gpTmp, err := c.allocateRegister(registerTypeGeneralPurpose)
  1080  		if err != nil {
  1081  			return err
  1082  		}
  1083  
  1084  		// Read the initial address of the mask table into gpTmp register.
  1085  		err = c.assembler.CompileStaticConstToRegister(amd64.LEAQ, asm.NewStaticConst(i8x16SHLMaskTable[:]), gpTmp)
  1086  		if err != nil {
  1087  			return err
  1088  		}
  1089  
  1090  		// We have to get the mask according to the shift amount, so we first have to do
  1091  		// gpShiftAmount << 4 = gpShiftAmount*16 to get the initial offset of the mask (16 is the size of each mask in bytes).
  1092  		c.assembler.CompileConstToRegister(amd64.SHLQ, 4, gpShiftAmount)
  1093  
  1094  		// Now ready to read the content of the mask into the vecTmp.
  1095  		c.assembler.CompileMemoryWithIndexToRegister(amd64.MOVDQU,
  1096  			gpTmp, 0, gpShiftAmount, 1,
  1097  			vecTmp,
  1098  		)
  1099  
   1100  		// Finally, clear out the unnecessary bits with the mask.
  1101  		c.assembler.CompileRegisterToRegister(amd64.PAND, vecTmp, x1.register)
  1102  	}
  1103  
  1104  	c.locationStack.markRegisterUnused(gpShiftAmount)
  1105  	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
  1106  	return nil
  1107  }
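// NOTE: illustrative reference sketch (hypothetical name, unused by the compiler) of the
// i8x16 left-shift emulation above: after the packed word shift (PSLLW), each byte's low
// bits may contain bits that crossed over from the byte below, and ANDing with the mask for
// the shift amount ((0xff << s) & 0xff replicated 16 times) clears them.
func i8x16ShlReferenceSketch(v [16]byte, s uint8) (out [16]byte) {
	s &= 7 // modulo 8, as done with ANDQ above.
	for i, b := range v {
		out[i] = b << s
	}
	return
}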
  1108  
  1109  // compileV128Cmp implements compiler.compileV128Cmp for amd64.
  1110  func (c *amd64Compiler) compileV128Cmp(o *wazeroir.UnionOperation) error {
  1111  	x2 := c.locationStack.popV128()
  1112  	if err := c.compileEnsureOnRegister(x2); err != nil {
  1113  		return err
  1114  	}
  1115  
  1116  	x1 := c.locationStack.popV128()
  1117  	if err := c.compileEnsureOnRegister(x1); err != nil {
  1118  		return err
  1119  	}
  1120  
  1121  	const (
  1122  		// See https://www.felixcloutier.com/x86/cmppd and https://www.felixcloutier.com/x86/cmpps
  1123  		floatEqualArg           = 0
  1124  		floatLessThanArg        = 1
  1125  		floatLessThanOrEqualArg = 2
   1126  		floatNotEqualArg        = 4
  1127  	)
  1128  
  1129  	x1Reg, x2Reg, result := x1.register, x2.register, asm.NilRegister
  1130  	v128CmpType := o.B1
  1131  	switch v128CmpType {
  1132  	case wazeroir.V128CmpTypeF32x4Eq:
  1133  		c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPS, x2Reg, x1Reg, floatEqualArg)
  1134  		result = x1Reg
  1135  	case wazeroir.V128CmpTypeF32x4Ne:
   1136  		c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPS, x2Reg, x1Reg, floatNotEqualArg)
  1137  		result = x1Reg
  1138  	case wazeroir.V128CmpTypeF32x4Lt:
  1139  		c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPS, x2Reg, x1Reg, floatLessThanArg)
  1140  		result = x1Reg
  1141  	case wazeroir.V128CmpTypeF32x4Gt:
   1142  		// Without AVX, there's no float Gt instruction, so we swap the registers and use Lt instead.
  1143  		c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPS, x1Reg, x2Reg, floatLessThanArg)
  1144  		result = x2Reg
  1145  	case wazeroir.V128CmpTypeF32x4Le:
  1146  		c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPS, x2Reg, x1Reg, floatLessThanOrEqualArg)
  1147  		result = x1Reg
  1148  	case wazeroir.V128CmpTypeF32x4Ge:
   1149  		// Without AVX, there's no float Ge instruction, so we swap the registers and use Le instead.
  1150  		c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPS, x1Reg, x2Reg, floatLessThanOrEqualArg)
  1151  		result = x2Reg
  1152  	case wazeroir.V128CmpTypeF64x2Eq:
  1153  		c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPD, x2Reg, x1Reg, floatEqualArg)
  1154  		result = x1Reg
  1155  	case wazeroir.V128CmpTypeF64x2Ne:
   1156  		c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPD, x2Reg, x1Reg, floatNotEqualArg)
  1157  		result = x1Reg
  1158  	case wazeroir.V128CmpTypeF64x2Lt:
  1159  		c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPD, x2Reg, x1Reg, floatLessThanArg)
  1160  		result = x1Reg
  1161  	case wazeroir.V128CmpTypeF64x2Gt:
   1162  		// Without AVX, there's no float Gt instruction, so we swap the registers and use Lt instead.
  1163  		c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPD, x1Reg, x2Reg, floatLessThanArg)
  1164  		result = x2Reg
  1165  	case wazeroir.V128CmpTypeF64x2Le:
  1166  		c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPD, x2Reg, x1Reg, floatLessThanOrEqualArg)
  1167  		result = x1Reg
  1168  	case wazeroir.V128CmpTypeF64x2Ge:
   1169  		// Without AVX, there's no float Ge instruction, so we swap the registers and use Le instead.
  1170  		c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPD, x1Reg, x2Reg, floatLessThanOrEqualArg)
  1171  		result = x2Reg
  1172  	case wazeroir.V128CmpTypeI8x16Eq:
  1173  		c.assembler.CompileRegisterToRegister(amd64.PCMPEQB, x2Reg, x1Reg)
  1174  		result = x1Reg
  1175  	case wazeroir.V128CmpTypeI8x16Ne:
  1176  		c.assembler.CompileRegisterToRegister(amd64.PCMPEQB, x2Reg, x1Reg)
  1177  		// Set all bits on x2Reg register.
  1178  		c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, x2Reg, x2Reg)
  1179  		// Swap the bits on x1Reg register.
  1180  		c.assembler.CompileRegisterToRegister(amd64.PXOR, x2Reg, x1Reg)
  1181  		result = x1Reg
  1182  	case wazeroir.V128CmpTypeI8x16LtS:
  1183  		c.assembler.CompileRegisterToRegister(amd64.PCMPGTB, x1Reg, x2Reg)
  1184  		result = x2Reg
  1185  	case wazeroir.V128CmpTypeI8x16LtU, wazeroir.V128CmpTypeI8x16GtU:
   1186  		// Take the unsigned min/max values of each byte of x1 and x2 into x1Reg.
  1187  		if v128CmpType == wazeroir.V128CmpTypeI8x16LtU {
  1188  			c.assembler.CompileRegisterToRegister(amd64.PMINUB, x2Reg, x1Reg)
  1189  		} else {
  1190  			c.assembler.CompileRegisterToRegister(amd64.PMAXUB, x2Reg, x1Reg)
  1191  		}
  1192  		c.assembler.CompileRegisterToRegister(amd64.PCMPEQB, x2Reg, x1Reg)
  1193  		// Set all bits on x2Reg register.
  1194  		c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, x2Reg, x2Reg)
   1195  		// Swap the bits on x1Reg register.
  1196  		c.assembler.CompileRegisterToRegister(amd64.PXOR, x2Reg, x1Reg)
  1197  		result = x1Reg
  1198  	case wazeroir.V128CmpTypeI8x16GtS:
  1199  		c.assembler.CompileRegisterToRegister(amd64.PCMPGTB, x2Reg, x1Reg)
  1200  		result = x1Reg
  1201  	case wazeroir.V128CmpTypeI8x16LeS, wazeroir.V128CmpTypeI8x16LeU:
  1202  		tmp, err := c.allocateRegister(registerTypeVector)
  1203  		if err != nil {
  1204  			return err
  1205  		}
  1206  		// Copy the value on the src to tmp.
  1207  		c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x1Reg, tmp)
  1208  		if v128CmpType == wazeroir.V128CmpTypeI8x16LeS {
  1209  			c.assembler.CompileRegisterToRegister(amd64.PMINSB, x2Reg, tmp)
  1210  		} else {
  1211  			c.assembler.CompileRegisterToRegister(amd64.PMINUB, x2Reg, tmp)
  1212  		}
  1213  		c.assembler.CompileRegisterToRegister(amd64.PCMPEQB, tmp, x1Reg)
  1214  		result = x1Reg
  1215  	case wazeroir.V128CmpTypeI8x16GeS, wazeroir.V128CmpTypeI8x16GeU:
  1216  		tmp, err := c.allocateRegister(registerTypeVector)
  1217  		if err != nil {
  1218  			return err
  1219  		}
  1220  		c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x1Reg, tmp)
  1221  		if v128CmpType == wazeroir.V128CmpTypeI8x16GeS {
  1222  			c.assembler.CompileRegisterToRegister(amd64.PMAXSB, x2Reg, tmp)
  1223  		} else {
  1224  			c.assembler.CompileRegisterToRegister(amd64.PMAXUB, x2Reg, tmp)
  1225  		}
  1226  		c.assembler.CompileRegisterToRegister(amd64.PCMPEQB, tmp, x1Reg)
  1227  		result = x1Reg
  1228  	case wazeroir.V128CmpTypeI16x8Eq:
  1229  		c.assembler.CompileRegisterToRegister(amd64.PCMPEQW, x2Reg, x1Reg)
  1230  		result = x1Reg
  1231  	case wazeroir.V128CmpTypeI16x8Ne:
  1232  		c.assembler.CompileRegisterToRegister(amd64.PCMPEQW, x2Reg, x1Reg)
  1233  		// Set all bits on x2Reg register.
  1234  		c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, x2Reg, x2Reg)
  1235  		// Swap the bits on x1Reg register.
  1236  		c.assembler.CompileRegisterToRegister(amd64.PXOR, x2Reg, x1Reg)
  1237  		result = x1Reg
  1238  	case wazeroir.V128CmpTypeI16x8LtS:
  1239  		c.assembler.CompileRegisterToRegister(amd64.PCMPGTW, x1Reg, x2Reg)
  1240  		result = x2Reg
  1241  	case wazeroir.V128CmpTypeI16x8LtU, wazeroir.V128CmpTypeI16x8GtU:
   1242  		// Take the unsigned min/max values of each 16-bit lane of x1 and x2 into x1Reg.
  1243  		if v128CmpType == wazeroir.V128CmpTypeI16x8LtU {
  1244  			c.assembler.CompileRegisterToRegister(amd64.PMINUW, x2Reg, x1Reg)
  1245  		} else {
  1246  			c.assembler.CompileRegisterToRegister(amd64.PMAXUW, x2Reg, x1Reg)
  1247  		}
  1248  		c.assembler.CompileRegisterToRegister(amd64.PCMPEQW, x2Reg, x1Reg)
  1249  		// Set all bits on x2Reg register.
  1250  		c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, x2Reg, x2Reg)
   1251  		// Swap the bits on x1Reg register.
  1252  		c.assembler.CompileRegisterToRegister(amd64.PXOR, x2Reg, x1Reg)
  1253  		result = x1Reg
  1254  	case wazeroir.V128CmpTypeI16x8GtS:
  1255  		c.assembler.CompileRegisterToRegister(amd64.PCMPGTW, x2Reg, x1Reg)
  1256  		result = x1Reg
  1257  	case wazeroir.V128CmpTypeI16x8LeS, wazeroir.V128CmpTypeI16x8LeU:
  1258  		tmp, err := c.allocateRegister(registerTypeVector)
  1259  		if err != nil {
  1260  			return err
  1261  		}
  1262  		// Copy the value on the src to tmp.
  1263  		c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x1Reg, tmp)
  1264  		if v128CmpType == wazeroir.V128CmpTypeI16x8LeS {
  1265  			c.assembler.CompileRegisterToRegister(amd64.PMINSW, x2Reg, tmp)
  1266  		} else {
  1267  			c.assembler.CompileRegisterToRegister(amd64.PMINUW, x2Reg, tmp)
  1268  		}
  1269  		c.assembler.CompileRegisterToRegister(amd64.PCMPEQW, tmp, x1Reg)
  1270  		result = x1Reg
  1271  	case wazeroir.V128CmpTypeI16x8GeS, wazeroir.V128CmpTypeI16x8GeU:
  1272  		tmp, err := c.allocateRegister(registerTypeVector)
  1273  		if err != nil {
  1274  			return err
  1275  		}
  1276  		c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x1Reg, tmp)
  1277  		if v128CmpType == wazeroir.V128CmpTypeI16x8GeS {
  1278  			c.assembler.CompileRegisterToRegister(amd64.PMAXSW, x2Reg, tmp)
  1279  		} else {
  1280  			c.assembler.CompileRegisterToRegister(amd64.PMAXUW, x2Reg, tmp)
  1281  		}
  1282  		c.assembler.CompileRegisterToRegister(amd64.PCMPEQW, tmp, x1Reg)
  1283  		result = x1Reg
  1284  	case wazeroir.V128CmpTypeI32x4Eq:
  1285  		c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, x2Reg, x1Reg)
  1286  		result = x1Reg
  1287  	case wazeroir.V128CmpTypeI32x4Ne:
  1288  		c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, x2Reg, x1Reg)
  1289  		// Set all bits on x2Reg register.
  1290  		c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, x2Reg, x2Reg)
  1291  		// Swap the bits on x1Reg register.
  1292  		c.assembler.CompileRegisterToRegister(amd64.PXOR, x2Reg, x1Reg)
  1293  		result = x1Reg
  1294  	case wazeroir.V128CmpTypeI32x4LtS:
  1295  		c.assembler.CompileRegisterToRegister(amd64.PCMPGTD, x1Reg, x2Reg)
  1296  		result = x2Reg
  1297  	case wazeroir.V128CmpTypeI32x4LtU, wazeroir.V128CmpTypeI32x4GtU:
  1298  		// Take the unsigned min/max values on each 32-bit lane of x1 and x2 onto x1Reg.
  1299  		if v128CmpType == wazeroir.V128CmpTypeI32x4LtU {
  1300  			c.assembler.CompileRegisterToRegister(amd64.PMINUD, x2Reg, x1Reg)
  1301  		} else {
  1302  			c.assembler.CompileRegisterToRegister(amd64.PMAXUD, x2Reg, x1Reg)
  1303  		}
  1304  		c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, x2Reg, x1Reg)
  1305  		// Set all bits on x2Reg register.
  1306  		c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, x2Reg, x2Reg)
  1307  		// Swap the bits on x1Reg register.
  1308  		c.assembler.CompileRegisterToRegister(amd64.PXOR, x2Reg, x1Reg)
  1309  		result = x1Reg
  1310  	case wazeroir.V128CmpTypeI32x4GtS:
  1311  		c.assembler.CompileRegisterToRegister(amd64.PCMPGTD, x2Reg, x1Reg)
  1312  		result = x1Reg
  1313  	case wazeroir.V128CmpTypeI32x4LeS, wazeroir.V128CmpTypeI32x4LeU:
  1314  		tmp, err := c.allocateRegister(registerTypeVector)
  1315  		if err != nil {
  1316  			return err
  1317  		}
  1318  		// Copy the value on the src to tmp.
  1319  		c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x1Reg, tmp)
  1320  		if v128CmpType == wazeroir.V128CmpTypeI32x4LeS {
  1321  			c.assembler.CompileRegisterToRegister(amd64.PMINSD, x2Reg, tmp)
  1322  		} else {
  1323  			c.assembler.CompileRegisterToRegister(amd64.PMINUD, x2Reg, tmp)
  1324  		}
  1325  		c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, tmp, x1Reg)
  1326  		result = x1Reg
  1327  	case wazeroir.V128CmpTypeI32x4GeS, wazeroir.V128CmpTypeI32x4GeU:
  1328  		tmp, err := c.allocateRegister(registerTypeVector)
  1329  		if err != nil {
  1330  			return err
  1331  		}
  1332  		c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x1Reg, tmp)
  1333  		if v128CmpType == wazeroir.V128CmpTypeI32x4GeS {
  1334  			c.assembler.CompileRegisterToRegister(amd64.PMAXSD, x2Reg, tmp)
  1335  		} else {
  1336  			c.assembler.CompileRegisterToRegister(amd64.PMAXUD, x2Reg, tmp)
  1337  		}
  1338  		c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, tmp, x1Reg)
  1339  		result = x1Reg
  1340  	case wazeroir.V128CmpTypeI64x2Eq:
  1341  		c.assembler.CompileRegisterToRegister(amd64.PCMPEQQ, x2Reg, x1Reg)
  1342  		result = x1Reg
  1343  	case wazeroir.V128CmpTypeI64x2Ne:
  1344  		c.assembler.CompileRegisterToRegister(amd64.PCMPEQQ, x2Reg, x1Reg)
  1345  		// Set all bits on x2Reg register.
  1346  		c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, x2Reg, x2Reg)
  1347  		// Swap the bits on x1Reg register.
  1348  		c.assembler.CompileRegisterToRegister(amd64.PXOR, x2Reg, x1Reg)
  1349  		result = x1Reg
  1350  	case wazeroir.V128CmpTypeI64x2LtS:
  1351  		c.assembler.CompileRegisterToRegister(amd64.PCMPGTQ, x1Reg, x2Reg)
  1352  		result = x2Reg
  1353  	case wazeroir.V128CmpTypeI64x2GtS:
  1354  		c.assembler.CompileRegisterToRegister(amd64.PCMPGTQ, x2Reg, x1Reg)
  1355  		result = x1Reg
  1356  	case wazeroir.V128CmpTypeI64x2LeS:
  1357  		c.assembler.CompileRegisterToRegister(amd64.PCMPGTQ, x2Reg, x1Reg)
  1358  		// Set all bits on x2Reg register.
  1359  		c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, x2Reg, x2Reg)
  1360  		// Swap the bits on x1Reg register.
  1361  		c.assembler.CompileRegisterToRegister(amd64.PXOR, x2Reg, x1Reg)
  1362  		result = x1Reg
  1363  	case wazeroir.V128CmpTypeI64x2GeS:
  1364  		c.assembler.CompileRegisterToRegister(amd64.PCMPGTQ, x1Reg, x2Reg)
  1365  		// Set all bits on x1Reg register.
  1366  		c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, x1Reg, x1Reg)
  1367  		// Swap the bits on x2Reg register.
  1368  		c.assembler.CompileRegisterToRegister(amd64.PXOR, x1Reg, x2Reg)
  1369  		result = x2Reg
  1370  	}
  1371  
  1372  	c.locationStack.markRegisterUnused(x1Reg, x2Reg)
  1373  	c.pushVectorRuntimeValueLocationOnRegister(result)
  1374  	return nil
  1375  }
  1376  
  1377  // compileV128AddSat implements compiler.compileV128AddSat for amd64.
  1378  func (c *amd64Compiler) compileV128AddSat(o *wazeroir.UnionOperation) error {
  1379  	var inst asm.Instruction
  1380  	shape := o.B1
  1381  	signed := o.B3
  1382  	switch shape {
  1383  	case wazeroir.ShapeI8x16:
  1384  		if signed {
  1385  			inst = amd64.PADDSB
  1386  		} else {
  1387  			inst = amd64.PADDUSB
  1388  		}
  1389  	case wazeroir.ShapeI16x8:
  1390  		if signed {
  1391  			inst = amd64.PADDSW
  1392  		} else {
  1393  			inst = amd64.PADDUSW
  1394  		}
  1395  	}
  1396  
  1397  	x2 := c.locationStack.popV128()
  1398  	if err := c.compileEnsureOnRegister(x2); err != nil {
  1399  		return err
  1400  	}
  1401  
  1402  	x1 := c.locationStack.popV128()
  1403  	if err := c.compileEnsureOnRegister(x1); err != nil {
  1404  		return err
  1405  	}
  1406  
  1407  	c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register)
  1408  
  1409  	c.locationStack.markRegisterUnused(x2.register)
  1410  	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
  1411  	return nil
  1412  }
  1413  
  1414  // compileV128SubSat implements compiler.compileV128SubSat for amd64.
  1415  func (c *amd64Compiler) compileV128SubSat(o *wazeroir.UnionOperation) error {
  1416  	var inst asm.Instruction
  1417  	shape := o.B1
  1418  	signed := o.B3
  1419  	switch shape {
  1420  	case wazeroir.ShapeI8x16:
  1421  		if signed {
  1422  			inst = amd64.PSUBSB
  1423  		} else {
  1424  			inst = amd64.PSUBUSB
  1425  		}
  1426  	case wazeroir.ShapeI16x8:
  1427  		if signed {
  1428  			inst = amd64.PSUBSW
  1429  		} else {
  1430  			inst = amd64.PSUBUSW
  1431  		}
  1432  	}
  1433  
  1434  	x2 := c.locationStack.popV128()
  1435  	if err := c.compileEnsureOnRegister(x2); err != nil {
  1436  		return err
  1437  	}
  1438  
  1439  	x1 := c.locationStack.popV128()
  1440  	if err := c.compileEnsureOnRegister(x1); err != nil {
  1441  		return err
  1442  	}
  1443  
  1444  	c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register)
  1445  
  1446  	c.locationStack.markRegisterUnused(x2.register)
  1447  	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
  1448  	return nil
  1449  }
  1450  
  1451  // compileV128Mul implements compiler.compileV128Mul for amd64.
  1452  func (c *amd64Compiler) compileV128Mul(o *wazeroir.UnionOperation) error {
  1453  	var inst asm.Instruction
  1454  	shape := o.B1
  1455  	switch shape {
  1456  	case wazeroir.ShapeI16x8:
  1457  		inst = amd64.PMULLW
  1458  	case wazeroir.ShapeI32x4:
  1459  		inst = amd64.PMULLD
  1460  	case wazeroir.ShapeI64x2:
  1461  		return c.compileV128MulI64x2()
  1462  	case wazeroir.ShapeF32x4:
  1463  		inst = amd64.MULPS
  1464  	case wazeroir.ShapeF64x2:
  1465  		inst = amd64.MULPD
  1466  	}
  1467  
  1468  	x2 := c.locationStack.popV128()
  1469  	if err := c.compileEnsureOnRegister(x2); err != nil {
  1470  		return err
  1471  	}
  1472  
  1473  	x1 := c.locationStack.popV128()
  1474  	if err := c.compileEnsureOnRegister(x1); err != nil {
  1475  		return err
  1476  	}
  1477  
  1478  	c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register)
  1479  
  1480  	c.locationStack.markRegisterUnused(x2.register)
  1481  	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
  1482  	return nil
  1483  }
  1484  
  1485  // compileV128MulI64x2 implements V128Mul for i64x2.
  1486  func (c *amd64Compiler) compileV128MulI64x2() error {
  1487  	x2 := c.locationStack.popV128()
  1488  	if err := c.compileEnsureOnRegister(x2); err != nil {
  1489  		return err
  1490  	}
  1491  
  1492  	x1 := c.locationStack.popV128()
  1493  	if err := c.compileEnsureOnRegister(x1); err != nil {
  1494  		return err
  1495  	}
  1496  
  1497  	x1r, x2r := x1.register, x2.register
  1498  
  1499  	tmp1, err := c.allocateRegister(registerTypeVector)
  1500  	if err != nil {
  1501  		return err
  1502  	}
  1503  
  1504  	c.locationStack.markRegisterUsed(tmp1)
  1505  
  1506  	tmp2, err := c.allocateRegister(registerTypeVector)
  1507  	if err != nil {
  1508  		return err
  1509  	}
  1510  
  1511  	// Assuming that we have
  1512  	//	x1r = [p1, p2] = [p1_lo, p1_high, p2_lo, p2_high]
  1513  	//  x2r = [q1, q2] = [q1_lo, q1_high, q2_lo, q2_high]
  1514  	// where pN and qN are 64-bit (quad word) lanes, and pN_lo, pN_high, qN_lo and qN_high are 32-bit (double word) lanes.
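        	// Since (p_high*2^32 + p_lo) * (q_high*2^32 + q_lo) mod 2^64
        	//   = ((p_high*q_lo + p_lo*q_high) << 32) + p_lo*q_lo,
        	// the 64x2 multiplication can be assembled from the three PMULUDQ (32x32 -> 64-bit) multiplications below.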
  1515  
  1516  	// Copy x1's value into tmp1.
  1517  	c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x1r, tmp1)
  1518  	// And do the logical right shift by 32-bit on tmp1, which makes tmp1 = [p1_high, 0, p2_high, 0]
  1519  	c.assembler.CompileConstToRegister(amd64.PSRLQ, 32, tmp1)
  1520  
  1521  	// Execute "pmuludq x2r,tmp1", which makes tmp1 = [p1_high*q1_lo, p2_high*q2_lo] where each lane is 64-bit.
  1522  	c.assembler.CompileRegisterToRegister(amd64.PMULUDQ, x2r, tmp1)
  1523  
  1524  	// Copy x2's value into tmp2.
  1525  	c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x2r, tmp2)
  1526  	// And do the logical right shift by 32-bit on tmp2, which makes tmp2 = [q1_high, 0, q2_high, 0]
  1527  	c.assembler.CompileConstToRegister(amd64.PSRLQ, 32, tmp2)
  1528  
  1529  	// Execute "pmuludq x1r,tmp2", which makes tmp2 = [p1_lo*q1_high, p2_lo*q2_high] where each lane is 64-bit.
  1530  	c.assembler.CompileRegisterToRegister(amd64.PMULUDQ, x1r, tmp2)
  1531  
  1532  	// Add tmp1 and tmp2, and do the logical left shift by 32-bit,
  1533  	// which makes tmp1 = [(p1_lo*q1_high+p1_high*q1_lo)<<32, (p2_lo*q2_high+p2_high*q2_lo)<<32]
  1534  	c.assembler.CompileRegisterToRegister(amd64.PADDQ, tmp2, tmp1)
  1535  	c.assembler.CompileConstToRegister(amd64.PSLLQ, 32, tmp1)
  1536  
  1537  	// Execute "pmuludq x2r,x1r", which makes x1r = [p1_lo*q1_lo, p2_lo*q2_lo] where each lane is 64-bit.
  1538  	c.assembler.CompileRegisterToRegister(amd64.PMULUDQ, x2r, x1r)
  1539  
  1540  	// Finally, we get the result by adding x1r and tmp1,
  1541  	// which makes x1r = [(p1_lo*q1_high+p1_high*q1_lo)<<32+p1_lo*q1_lo, (p2_lo*q2_high+p2_high*q2_lo)<<32+p2_lo*q2_lo]
  1542  	c.assembler.CompileRegisterToRegister(amd64.PADDQ, tmp1, x1r)
  1543  
  1544  	c.locationStack.markRegisterUnused(x2r, tmp1)
  1545  	c.pushVectorRuntimeValueLocationOnRegister(x1r)
  1546  	return nil
  1547  }
  1548  
  1549  // compileV128Div implements compiler.compileV128Div for amd64.
  1550  func (c *amd64Compiler) compileV128Div(o *wazeroir.UnionOperation) error {
  1551  	x2 := c.locationStack.popV128()
  1552  	if err := c.compileEnsureOnRegister(x2); err != nil {
  1553  		return err
  1554  	}
  1555  
  1556  	x1 := c.locationStack.popV128()
  1557  	if err := c.compileEnsureOnRegister(x1); err != nil {
  1558  		return err
  1559  	}
  1560  
  1561  	var inst asm.Instruction
  1562  	shape := o.B1
  1563  	switch shape {
  1564  	case wazeroir.ShapeF32x4:
  1565  		inst = amd64.DIVPS
  1566  	case wazeroir.ShapeF64x2:
  1567  		inst = amd64.DIVPD
  1568  	}
  1569  
  1570  	c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register)
  1571  
  1572  	c.locationStack.markRegisterUnused(x2.register)
  1573  	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
  1574  	return nil
  1575  }
  1576  
  1577  // compileV128Neg implements compiler.compileV128Neg for amd64.
  1578  func (c *amd64Compiler) compileV128Neg(o *wazeroir.UnionOperation) error {
  1579  	shape := o.B1
  1580  	if shape <= wazeroir.ShapeI64x2 {
  1581  		return c.compileV128NegInt(shape)
  1582  	} else {
  1583  		return c.compileV128NegFloat(shape)
  1584  	}
  1585  }
  1586  
  1587  // compileV128NegInt implements compiler.compileV128Neg for integer lanes.
  1588  func (c *amd64Compiler) compileV128NegInt(s wazeroir.Shape) error {
  1589  	v := c.locationStack.popV128()
  1590  	if err := c.compileEnsureOnRegister(v); err != nil {
  1591  		return err
  1592  	}
  1593  
  1594  	result, err := c.allocateRegister(registerTypeVector)
  1595  	if err != nil {
  1596  		return err
  1597  	}
  1598  
  1599  	var subInst asm.Instruction
  1600  	switch s {
  1601  	case wazeroir.ShapeI8x16:
  1602  		subInst = amd64.PSUBB
  1603  	case wazeroir.ShapeI16x8:
  1604  		subInst = amd64.PSUBW
  1605  	case wazeroir.ShapeI32x4:
  1606  		subInst = amd64.PSUBD
  1607  	case wazeroir.ShapeI64x2:
  1608  		subInst = amd64.PSUBQ
  1609  	}
  1610  
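        	// Integer negation is computed as 0 - v: zero the result register and subtract the operand from it.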
  1611  	c.assembler.CompileRegisterToRegister(amd64.PXOR, result, result)
  1612  	c.assembler.CompileRegisterToRegister(subInst, v.register, result)
  1613  
  1614  	c.locationStack.markRegisterUnused(v.register)
  1615  	c.pushVectorRuntimeValueLocationOnRegister(result)
  1616  	return nil
  1617  }
  1618  
  1619  // compileV128NegFloat implements compiler.compileV128Neg for float lanes.
  1620  func (c *amd64Compiler) compileV128NegFloat(s wazeroir.Shape) error {
  1621  	v := c.locationStack.popV128()
  1622  	if err := c.compileEnsureOnRegister(v); err != nil {
  1623  		return err
  1624  	}
  1625  
  1626  	tmp, err := c.allocateRegister(registerTypeVector)
  1627  	if err != nil {
  1628  		return err
  1629  	}
  1630  
  1631  	var leftShiftInst, xorInst asm.Instruction
  1632  	var leftShiftAmount asm.ConstantValue
  1633  	if s == wazeroir.ShapeF32x4 {
  1634  		leftShiftInst, leftShiftAmount, xorInst = amd64.PSLLD, 31, amd64.XORPS
  1635  	} else {
  1636  		leftShiftInst, leftShiftAmount, xorInst = amd64.PSLLQ, 63, amd64.XORPD
  1637  	}
  1638  
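        	// Float negation only flips the sign bit of each lane, so build a mask that has just the
        	// sign bit set per lane and XOR it into the operand.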
  1639  	// Clear all bits on tmp.
  1640  	c.assembler.CompileRegisterToRegister(amd64.XORPS, tmp, tmp)
  1641  	// Set all bits on tmp by CMPPD of tmp against itself with arg=0x8 (effectively the CMPEQPD predicate).
  1642  	// See https://www.felixcloutier.com/x86/cmpps
  1643  	//
  1644  	// Note: if we do not clear all the bits ^ with XORPS, this might end up not setting ones on some lane
  1645  	// if the lane is NaN.
  1646  	c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPD, tmp, tmp, 0x8)
  1647  	// Do the left shift on each lane to set only the most significant bit in each.
  1648  	c.assembler.CompileConstToRegister(leftShiftInst, leftShiftAmount, tmp)
  1649  	// Get the negated result by XOR on each lane with tmp.
  1650  	c.assembler.CompileRegisterToRegister(xorInst, tmp, v.register)
  1651  
  1652  	c.pushVectorRuntimeValueLocationOnRegister(v.register)
  1653  	return nil
  1654  }
  1655  
  1656  // compileV128Sqrt implements compiler.compileV128Sqrt for amd64.
  1657  func (c *amd64Compiler) compileV128Sqrt(o *wazeroir.UnionOperation) error {
  1658  	v := c.locationStack.popV128()
  1659  	if err := c.compileEnsureOnRegister(v); err != nil {
  1660  		return err
  1661  	}
  1662  
  1663  	var inst asm.Instruction
  1664  	shape := o.B1
  1665  	switch shape {
  1666  	case wazeroir.ShapeF64x2:
  1667  		inst = amd64.SQRTPD
  1668  	case wazeroir.ShapeF32x4:
  1669  		inst = amd64.SQRTPS
  1670  	}
  1671  
  1672  	c.assembler.CompileRegisterToRegister(inst, v.register, v.register)
  1673  	c.pushVectorRuntimeValueLocationOnRegister(v.register)
  1674  	return nil
  1675  }
  1676  
  1677  // compileV128Abs implements compiler.compileV128Abs for amd64.
  1678  func (c *amd64Compiler) compileV128Abs(o *wazeroir.UnionOperation) error {
  1679  	shape := o.B1
  1680  	if shape == wazeroir.ShapeI64x2 {
  1681  		return c.compileV128AbsI64x2()
  1682  	}
  1683  
  1684  	v := c.locationStack.popV128()
  1685  	if err := c.compileEnsureOnRegister(v); err != nil {
  1686  		return err
  1687  	}
  1688  
  1689  	result := v.register
  1690  	switch shape {
  1691  	case wazeroir.ShapeI8x16:
  1692  		c.assembler.CompileRegisterToRegister(amd64.PABSB, result, result)
  1693  	case wazeroir.ShapeI16x8:
  1694  		c.assembler.CompileRegisterToRegister(amd64.PABSW, result, result)
  1695  	case wazeroir.ShapeI32x4:
  1696  		c.assembler.CompileRegisterToRegister(amd64.PABSD, result, result)
  1697  	case wazeroir.ShapeF32x4:
  1698  		tmp, err := c.allocateRegister(registerTypeVector)
  1699  		if err != nil {
  1700  			return err
  1701  		}
  1702  		// Set all bits on tmp.
  1703  		c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, tmp, tmp)
  1704  		// Shift right packed single floats by 1 to clear the sign bits.
  1705  		c.assembler.CompileConstToRegister(amd64.PSRLD, 1, tmp)
  1706  		// Clear the sign bit of vr.
  1707  		c.assembler.CompileRegisterToRegister(amd64.ANDPS, tmp, result)
  1708  	case wazeroir.ShapeF64x2:
  1709  		tmp, err := c.allocateRegister(registerTypeVector)
  1710  		if err != nil {
  1711  			return err
  1712  		}
  1713  		// Set all bits on tmp.
  1714  		c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, tmp, tmp)
  1715  		// Shift right packed double floats by 1 to clear the sign bits.
  1716  		c.assembler.CompileConstToRegister(amd64.PSRLQ, 1, tmp)
  1717  		// Clear the sign bit of vr.
  1718  		c.assembler.CompileRegisterToRegister(amd64.ANDPD, tmp, result)
  1719  	}
  1720  
  1721  	c.pushVectorRuntimeValueLocationOnRegister(result)
  1722  	return nil
  1723  }
  1724  
  1725  // compileV128AbsI64x2 implements compileV128Abs for i64x2 lanes.
  1726  func (c *amd64Compiler) compileV128AbsI64x2() error {
  1727  	// See https://www.felixcloutier.com/x86/blendvpd
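        	// The legacy (non-VEX) BLENDVPD encoding implicitly uses XMM0 as its blend mask,
        	// which is why X0 must be reserved here.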
  1728  	const blendMaskReg = amd64.RegX0
  1729  	c.onValueReleaseRegisterToStack(blendMaskReg)
  1730  	c.locationStack.markRegisterUsed(blendMaskReg)
  1731  
  1732  	v := c.locationStack.popV128()
  1733  	if err := c.compileEnsureOnRegister(v); err != nil {
  1734  		return err
  1735  	}
  1736  	vr := v.register
  1737  
  1738  	if vr == blendMaskReg {
  1739  		return errors.New("BUG: X0 must not be used")
  1740  	}
  1741  
  1742  	tmp, err := c.allocateRegister(registerTypeVector)
  1743  	if err != nil {
  1744  		return err
  1745  	}
  1746  	c.locationStack.markRegisterUsed(tmp)
  1747  
  1748  	// Copy the value to tmp.
  1749  	c.assembler.CompileRegisterToRegister(amd64.MOVDQA, vr, tmp)
  1750  
  1751  	// Clear all bits on blendMaskReg.
  1752  	c.assembler.CompileRegisterToRegister(amd64.PXOR, blendMaskReg, blendMaskReg)
  1753  	// Subtract vr from blendMaskReg.
  1754  	c.assembler.CompileRegisterToRegister(amd64.PSUBQ, vr, blendMaskReg)
  1755  	// Copy the subtracted value ^^ back into vr.
  1756  	c.assembler.CompileRegisterToRegister(amd64.MOVDQA, blendMaskReg, vr)
  1757  
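        	// Blend: lanes where the mask in X0 (= -v) is negative take the original value from tmp,
        	// and the remaining lanes keep -v, which yields the absolute value in vr.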
  1758  	c.assembler.CompileRegisterToRegister(amd64.BLENDVPD, tmp, vr)
  1759  
  1760  	c.locationStack.markRegisterUnused(blendMaskReg, tmp)
  1761  	c.pushVectorRuntimeValueLocationOnRegister(vr)
  1762  	return nil
  1763  }
  1764  
  1765  var (
  1766  	popcntMask = [16]byte{
  1767  		0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
  1768  		0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
  1769  	}
  1770  	// popcntTable holds each index's Popcnt, for example popcntTable[5] holds popcnt(0x05).
  1771  	popcntTable = [16]byte{
  1772  		0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03,
  1773  		0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04,
  1774  	}
  1775  )
  1776  
  1777  // compileV128Popcnt implements compiler.compileV128Popcnt for amd64.
  1778  func (c *amd64Compiler) compileV128Popcnt(operation *wazeroir.UnionOperation) error {
  1779  	v := c.locationStack.popV128()
  1780  	if err := c.compileEnsureOnRegister(v); err != nil {
  1781  		return err
  1782  	}
  1783  	vr := v.register
  1784  
  1785  	tmp1, err := c.allocateRegister(registerTypeVector)
  1786  	if err != nil {
  1787  		return err
  1788  	}
  1789  
  1790  	c.locationStack.markRegisterUsed(tmp1)
  1791  
  1792  	tmp2, err := c.allocateRegister(registerTypeVector)
  1793  	if err != nil {
  1794  		return err
  1795  	}
  1796  
  1797  	c.locationStack.markRegisterUsed(tmp2)
  1798  
  1799  	tmp3, err := c.allocateRegister(registerTypeVector)
  1800  	if err != nil {
  1801  		return err
  1802  	}
  1803  
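        	// The per-byte population count below uses the classic SSSE3 nibble-table approach:
        	// popcnt(b) = popcnt(b & 0xf) + popcnt(b >> 4), where each 4-bit popcount is looked up
        	// with PSHUFB on popcntTable.
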
  1804  	// Read the popcntMask into tmp1, and we have
  1805  	//  tmp1 = [0xf, ..., 0xf]
  1806  	if err := c.assembler.CompileStaticConstToRegister(amd64.MOVDQU, asm.NewStaticConst(popcntMask[:]), tmp1); err != nil {
  1807  		return err
  1808  	}
  1809  
  1810  	// Copy the original value into tmp2.
  1811  	c.assembler.CompileRegisterToRegister(amd64.MOVDQA, vr, tmp2)
  1812  
  1813  	// Given that we have:
  1814  	//  v = [b1, ..., b16] where bn = hn:ln and hn and ln are higher and lower 4-bits of bn.
  1815  	//
  1816  	// Take PAND on tmp1 and tmp2, and we have
  1817  	//  tmp2 = [l1, ..., l16].
  1818  	c.assembler.CompileRegisterToRegister(amd64.PAND, tmp1, tmp2)
  1819  
  1820  	// Do logical (packed word) right shift by 4 on vr and PAND with vr and tmp1, meaning that we have
  1821  	//  vr = [h1, ..., h16].
  1822  	c.assembler.CompileConstToRegister(amd64.PSRLW, 4, vr)
  1823  	c.assembler.CompileRegisterToRegister(amd64.PAND, tmp1, vr)
  1824  
  1825  	// Read the popcntTable into tmp1, and we have
  1826  	//  tmp1 = [0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04]
  1827  	if err := c.assembler.CompileStaticConstToRegister(amd64.MOVDQU, asm.NewStaticConst(popcntTable[:]), tmp1); err != nil {
  1828  		return err
  1829  	}
  1830  
  1831  	// Copy the tmp1 into tmp3, and we have
  1832  	//  tmp3 = [0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04]
  1833  	c.assembler.CompileRegisterToRegister(amd64.MOVDQU, tmp1, tmp3)
  1834  
  1835  	//  tmp3 = [popcnt(l1), ..., popcnt(l16)].
  1836  	c.assembler.CompileRegisterToRegister(amd64.PSHUFB, tmp2, tmp3)
  1837  
  1838  	//  tmp1 = [popcnt(h1), ..., popcnt(h16)].
  1839  	c.assembler.CompileRegisterToRegister(amd64.PSHUFB, vr, tmp1)
  1840  
  1841  	// vr = tmp1 = [popcnt(h1), ..., popcnt(h16)].
  1842  	c.assembler.CompileRegisterToRegister(amd64.MOVDQA, tmp1, vr)
  1843  
  1844  	// vr += tmp3 = [popcnt(h1)+popcnt(l1), ..., popcnt(h16)+popcnt(l16)] = [popcnt(b1), ..., popcnt(b16)].
  1845  	c.assembler.CompileRegisterToRegister(amd64.PADDB, tmp3, vr)
  1846  
  1847  	c.locationStack.markRegisterUnused(tmp1, tmp2)
  1848  	c.pushVectorRuntimeValueLocationOnRegister(vr)
  1849  	return nil
  1850  }
  1851  
  1852  // compileV128Min implements compiler.compileV128Min for amd64.
  1853  func (c *amd64Compiler) compileV128Min(o *wazeroir.UnionOperation) error {
  1854  	x2 := c.locationStack.popV128()
  1855  	if err := c.compileEnsureOnRegister(x2); err != nil {
  1856  		return err
  1857  	}
  1858  
  1859  	x1 := c.locationStack.popV128()
  1860  	if err := c.compileEnsureOnRegister(x1); err != nil {
  1861  		return err
  1862  	}
  1863  
  1864  	shape := o.B1
  1865  	if shape >= wazeroir.ShapeF32x4 {
  1866  		return c.compileV128FloatMinImpl(shape == wazeroir.ShapeF32x4, x1.register, x2.register)
  1867  	}
  1868  
  1869  	signed := o.B3
  1870  	var inst asm.Instruction
  1871  	switch shape {
  1872  	case wazeroir.ShapeI8x16:
  1873  		if signed {
  1874  			inst = amd64.PMINSB
  1875  		} else {
  1876  			inst = amd64.PMINUB
  1877  		}
  1878  	case wazeroir.ShapeI16x8:
  1879  		if signed {
  1880  			inst = amd64.PMINSW
  1881  		} else {
  1882  			inst = amd64.PMINUW
  1883  		}
  1884  	case wazeroir.ShapeI32x4:
  1885  		if signed {
  1886  			inst = amd64.PMINSD
  1887  		} else {
  1888  			inst = amd64.PMINUD
  1889  		}
  1890  	}
  1891  
  1892  	c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register)
  1893  
  1894  	c.locationStack.markRegisterUnused(x2.register)
  1895  	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
  1896  	return nil
  1897  }
  1898  
  1899  // compileV128FloatMinImpl implements compiler.compileV128Min for float lanes.
  1900  func (c *amd64Compiler) compileV128FloatMinImpl(is32bit bool, x1r, x2r asm.Register) error {
  1901  	tmp, err := c.allocateRegister(registerTypeVector)
  1902  	if err != nil {
  1903  		return err
  1904  	}
  1905  
  1906  	var min, cmp, andn, or, srl /* shift right logical */ asm.Instruction
  1907  	var shiftNumToInverseNaN asm.ConstantValue
  1908  	if is32bit {
  1909  		min, cmp, andn, or, srl, shiftNumToInverseNaN = amd64.MINPS, amd64.CMPPS, amd64.ANDNPS, amd64.ORPS, amd64.PSRLD, 0xa
  1910  	} else {
  1911  		min, cmp, andn, or, srl, shiftNumToInverseNaN = amd64.MINPD, amd64.CMPPD, amd64.ANDNPD, amd64.ORPD, amd64.PSRLQ, 0xd
  1912  	}
  1913  
  1914  	// Let v1 and v2 be the operand values on x1r and x2r at this point.
  1915  
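        	// Wasm's min must return NaN if either lane is NaN and must treat -0 as smaller than +0,
        	// whereas MINPS/MINPD simply return the second operand for NaN or equal (+0/-0) inputs.
        	// So we take the min in both operand orders and then fix up the NaN and -0 lanes.
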
  1916  	// Copy the value into tmp: tmp=v1
  1917  	c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x1r, tmp)
  1918  	// tmp=min(v1, v2)
  1919  	c.assembler.CompileRegisterToRegister(min, x2r, tmp)
  1920  	// x2r=min(v2, v1)
  1921  	c.assembler.CompileRegisterToRegister(min, x1r, x2r)
  1922  	// x1r=min(v2, v1)
  1923  	c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x2r, x1r)
  1924  
  1925  	// x2r = -0          if (v1 == -0 || v2 == -0) && v1 != NaN && v2 != NaN
  1926  	//       NaN         if v1 == NaN || v2 == NaN
  1927  	//       min(v1, v2) otherwise
  1928  	c.assembler.CompileRegisterToRegister(or, tmp, x2r)
  1929  	// x1r = ^0 (all bits set) if v1 == NaN || v2 == NaN
  1930  	//       0 otherwise
  1931  	c.assembler.CompileRegisterToRegisterWithArg(cmp, tmp, x1r, 3)
  1932  	// x2r = -0          if (v1 == -0 || v2 == -0) && v1 != NaN && v2 != NaN
  1933  	//       ^0          if v1 == NaN || v2 == NaN
  1934  	//       min(v1, v2) otherwise
  1935  	c.assembler.CompileRegisterToRegister(or, x1r, x2r)
  1936  	// x1r = set all bits on the mantissa bits
  1937  	//       0 otherwise
  1938  	c.assembler.CompileConstToRegister(srl, shiftNumToInverseNaN, x1r)
  1939  	// x1r = x2r and !x1r
  1940  	//     = -0                                                   if (v1 == -0 || v2 == -0) && v1 != NaN && v2 != NaN
  1941  	//       set all bits on exponential and sign bit (== NaN)    if v1 == NaN || v2 == NaN
  1942  	//       min(v1, v2)                                          otherwise
  1943  	c.assembler.CompileRegisterToRegister(andn, x2r, x1r)
  1944  
  1945  	c.locationStack.markRegisterUnused(x2r)
  1946  	c.pushVectorRuntimeValueLocationOnRegister(x1r)
  1947  	return nil
  1948  }
  1949  
  1950  // compileV128Max implements compiler.compileV128Max for amd64.
  1951  func (c *amd64Compiler) compileV128Max(o *wazeroir.UnionOperation) error {
  1952  	x2 := c.locationStack.popV128()
  1953  	if err := c.compileEnsureOnRegister(x2); err != nil {
  1954  		return err
  1955  	}
  1956  
  1957  	x1 := c.locationStack.popV128()
  1958  	if err := c.compileEnsureOnRegister(x1); err != nil {
  1959  		return err
  1960  	}
  1961  
  1962  	shape := o.B1
  1963  	if shape >= wazeroir.ShapeF32x4 {
  1964  		return c.compileV128FloatMaxImpl(shape == wazeroir.ShapeF32x4, x1.register, x2.register)
  1965  	}
  1966  
  1967  	signed := o.B3
  1968  	var inst asm.Instruction
  1969  	switch shape {
  1970  	case wazeroir.ShapeI8x16:
  1971  		if signed {
  1972  			inst = amd64.PMAXSB
  1973  		} else {
  1974  			inst = amd64.PMAXUB
  1975  		}
  1976  	case wazeroir.ShapeI16x8:
  1977  		if signed {
  1978  			inst = amd64.PMAXSW
  1979  		} else {
  1980  			inst = amd64.PMAXUW
  1981  		}
  1982  	case wazeroir.ShapeI32x4:
  1983  		if signed {
  1984  			inst = amd64.PMAXSD
  1985  		} else {
  1986  			inst = amd64.PMAXUD
  1987  		}
  1988  	}
  1989  
  1990  	c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register)
  1991  
  1992  	c.locationStack.markRegisterUnused(x2.register)
  1993  	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
  1994  	return nil
  1995  }
  1996  
  1997  // compileV128FloatMaxImpl implements compiler.compileV128Max for float lanes.
  1998  func (c *amd64Compiler) compileV128FloatMaxImpl(is32bit bool, x1r, x2r asm.Register) error {
  1999  	tmp, err := c.allocateRegister(registerTypeVector)
  2000  	if err != nil {
  2001  		return err
  2002  	}
  2003  
  2004  	var max, cmp, andn, or, xor, sub, srl /* shift right logical */ asm.Instruction
  2005  	var shiftNumToInverseNaN asm.ConstantValue
  2006  	if is32bit {
  2007  		max, cmp, andn, or, xor, sub, srl, shiftNumToInverseNaN = amd64.MAXPS, amd64.CMPPS, amd64.ANDNPS, amd64.ORPS, amd64.XORPS, amd64.SUBPS, amd64.PSRLD, 0xa
  2008  	} else {
  2009  		max, cmp, andn, or, xor, sub, srl, shiftNumToInverseNaN = amd64.MAXPD, amd64.CMPPD, amd64.ANDNPD, amd64.ORPD, amd64.XORPD, amd64.SUBPD, amd64.PSRLQ, 0xd
  2010  	}
  2011  
  2012  	// Let v1 and v2 be the operand values on x1r and x2r at this point.
  2013  
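        	// Wasm's max must return NaN if either lane is NaN and must treat +0 as larger than -0,
        	// whereas MAXPS/MAXPD simply return the second operand for NaN or equal (+0/-0) inputs.
        	// So we take the max in both operand orders and then fix up the NaN and ±0 lanes.
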
  2014  	// Copy the value into tmp: tmp=v2
  2015  	c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x2r, tmp)
  2016  	// tmp=max(v2, v1)
  2017  	c.assembler.CompileRegisterToRegister(max, x1r, tmp)
  2018  	// x1r=max(v1, v2)
  2019  	c.assembler.CompileRegisterToRegister(max, x2r, x1r)
  2020  	// x2r=max(v1, v2)
  2021  	c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x1r, x2r)
  2022  
  2023  	// x2r = -0      if (v1 == -0 && v2 == 0) || (v1 == 0 && v2 == -0)
  2024  	//       0       if (v1 == 0 && v2 == 0)
  2025  	//       0       if (v1 == -0 && v2 == -0)
  2026  	//       v1^v2   if v1 == NaN || v2 == NaN
  2027  	//       0       otherwise
  2028  	c.assembler.CompileRegisterToRegister(xor, tmp, x2r)
  2029  	// x1r = -0           if (v1 == -0 && v2 == 0) || (v1 == 0 && v2 == -0)
  2030  	//       0            if (v1 == 0 && v2 ==  0)
  2031  	//       -0           if (v1 == -0 && v2 == -0)
  2032  	//       NaN          if v1 == NaN || v2 == NaN
  2033  	//       max(v1, v2)  otherwise
  2034  	c.assembler.CompileRegisterToRegister(or, x2r, x1r)
  2035  	// Copy x1r into tmp.
  2036  	c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x1r, tmp)
  2037  	// tmp = 0            if (v1 == -0 && v2 == 0) || (v1 == 0 && v2 == -0) || (v1 == 0 && v2 ==  0)
  2038  	//       -0           if (v1 == -0 && v2 == -0)
  2039  	//       NaN          if v1 == NaN || v2 == NaN
  2040  	//       max(v1, v2)  otherwise
  2041  	//
  2042  	// Note: -0 - (-0) = 0 (!= -0) in floating point operation.
  2043  	c.assembler.CompileRegisterToRegister(sub, x2r, tmp)
  2044  	// x1r = ^0 (all bits set) if v1 == NaN || v2 == NaN
  2045  	c.assembler.CompileRegisterToRegisterWithArg(cmp, x1r, x1r, 3)
  2046  	// x1r = set all bits on the mantissa bits
  2047  	//       0 otherwise
  2048  	c.assembler.CompileConstToRegister(srl, shiftNumToInverseNaN, x1r)
  2049  	c.assembler.CompileRegisterToRegister(andn, tmp, x1r)
  2050  
  2051  	c.locationStack.markRegisterUnused(x2r)
  2052  	c.pushVectorRuntimeValueLocationOnRegister(x1r)
  2053  	return nil
  2054  }
  2055  
  2056  // compileV128AvgrU implements compiler.compileV128AvgrU for amd64.
  2057  func (c *amd64Compiler) compileV128AvgrU(o *wazeroir.UnionOperation) error {
  2058  	x2 := c.locationStack.popV128()
  2059  	if err := c.compileEnsureOnRegister(x2); err != nil {
  2060  		return err
  2061  	}
  2062  
  2063  	x1 := c.locationStack.popV128()
  2064  	if err := c.compileEnsureOnRegister(x1); err != nil {
  2065  		return err
  2066  	}
  2067  
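        	// PAVGB/PAVGW compute (x + y + 1) >> 1 on each unsigned lane, which is exactly
        	// Wasm's avgr_u (rounding average) semantics.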
  2068  	var inst asm.Instruction
  2069  	shape := o.B1
  2070  	switch shape {
  2071  	case wazeroir.ShapeI8x16:
  2072  		inst = amd64.PAVGB
  2073  	case wazeroir.ShapeI16x8:
  2074  		inst = amd64.PAVGW
  2075  	}
  2076  
  2077  	c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register)
  2078  
  2079  	c.locationStack.markRegisterUnused(x2.register)
  2080  	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
  2081  	return nil
  2082  }
  2083  
  2084  // compileV128Pmin implements compiler.compileV128Pmin for amd64.
  2085  func (c *amd64Compiler) compileV128Pmin(o *wazeroir.UnionOperation) error {
  2086  	x2 := c.locationStack.popV128()
  2087  	if err := c.compileEnsureOnRegister(x2); err != nil {
  2088  		return err
  2089  	}
  2090  
  2091  	x1 := c.locationStack.popV128()
  2092  	if err := c.compileEnsureOnRegister(x1); err != nil {
  2093  		return err
  2094  	}
  2095  
  2096  	var min asm.Instruction
  2097  	if o.B1 == wazeroir.ShapeF32x4 {
  2098  		min = amd64.MINPS
  2099  	} else {
  2100  		min = amd64.MINPD
  2101  	}
  2102  
  2103  	x1r, v2r := x1.register, x2.register
  2104  
  2105  	c.assembler.CompileRegisterToRegister(min, x1r, v2r)
  2106  
  2107  	c.locationStack.markRegisterUnused(x1r)
  2108  	c.pushVectorRuntimeValueLocationOnRegister(v2r)
  2109  	return nil
  2110  }
  2111  
  2112  // compileV128Pmax implements compiler.compileV128Pmax for amd64.
  2113  func (c *amd64Compiler) compileV128Pmax(o *wazeroir.UnionOperation) error {
  2114  	x2 := c.locationStack.popV128()
  2115  	if err := c.compileEnsureOnRegister(x2); err != nil {
  2116  		return err
  2117  	}
  2118  
  2119  	x1 := c.locationStack.popV128()
  2120  	if err := c.compileEnsureOnRegister(x1); err != nil {
  2121  		return err
  2122  	}
  2123  
  2124  	var max asm.Instruction
  2125  	if o.B1 == wazeroir.ShapeF32x4 {
  2126  		max = amd64.MAXPS
  2127  	} else {
  2128  		max = amd64.MAXPD
  2129  	}
  2130  
  2131  	x1r, v2r := x1.register, x2.register
  2132  
  2133  	c.assembler.CompileRegisterToRegister(max, x1r, v2r)
  2134  
  2135  	c.locationStack.markRegisterUnused(x1r)
  2136  	c.pushVectorRuntimeValueLocationOnRegister(v2r)
  2137  	return nil
  2138  }
  2139  
  2140  // compileV128Ceil implements compiler.compileV128Ceil for amd64.
  2141  func (c *amd64Compiler) compileV128Ceil(o *wazeroir.UnionOperation) error {
  2142  	// See https://www.felixcloutier.com/x86/roundpd
  2143  	const roundModeCeil = 0x2
  2144  	return c.compileV128RoundImpl(o.B1 == wazeroir.ShapeF32x4, roundModeCeil)
  2145  }
  2146  
  2147  // compileV128Floor implements compiler.compileV128Floor for amd64.
  2148  func (c *amd64Compiler) compileV128Floor(o *wazeroir.UnionOperation) error {
  2149  	// See https://www.felixcloutier.com/x86/roundpd
  2150  	const roundModeFloor = 0x1
  2151  	return c.compileV128RoundImpl(o.B1 == wazeroir.ShapeF32x4, roundModeFloor)
  2152  }
  2153  
  2154  // compileV128Trunc implements compiler.compileV128Trunc for amd64.
  2155  func (c *amd64Compiler) compileV128Trunc(o *wazeroir.UnionOperation) error {
  2156  	// See https://www.felixcloutier.com/x86/roundpd
  2157  	const roundModeTrunc = 0x3
  2158  	return c.compileV128RoundImpl(o.B1 == wazeroir.ShapeF32x4, roundModeTrunc)
  2159  }
  2160  
  2161  // compileV128Nearest implements compiler.compileV128Nearest for amd64.
  2162  func (c *amd64Compiler) compileV128Nearest(o *wazeroir.UnionOperation) error {
  2163  	// See https://www.felixcloutier.com/x86/roundpd
  2164  	const roundModeNearest = 0x0
  2165  	return c.compileV128RoundImpl(o.B1 == wazeroir.ShapeF32x4, roundModeNearest)
  2166  }
  2167  
  2168  // compileV128RoundImpl implements compileV128Nearest compileV128Trunc compileV128Floor and compileV128Ceil
  2169  // with ROUNDPS (32-bit lane) and ROUNDPD (64-bit lane).
  2170  func (c *amd64Compiler) compileV128RoundImpl(is32bit bool, mode byte) error {
  2171  	v := c.locationStack.popV128()
  2172  	if err := c.compileEnsureOnRegister(v); err != nil {
  2173  		return err
  2174  	}
  2175  	vr := v.register
  2176  
  2177  	var round asm.Instruction
  2178  	if is32bit {
  2179  		round = amd64.ROUNDPS
  2180  	} else {
  2181  		round = amd64.ROUNDPD
  2182  	}
  2183  
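        	// The immediate selects the rounding mode: 0x0 = nearest (ties to even), 0x1 = toward -inf (floor),
        	// 0x2 = toward +inf (ceil), 0x3 = toward zero (trunc).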
  2184  	c.assembler.CompileRegisterToRegisterWithArg(round, vr, vr, mode)
  2185  	c.pushVectorRuntimeValueLocationOnRegister(vr)
  2186  	return nil
  2187  }
  2188  
  2189  // compileV128Extend implements compiler.compileV128Extend for amd64.
  2190  func (c *amd64Compiler) compileV128Extend(o *wazeroir.UnionOperation) error {
  2191  	v := c.locationStack.popV128()
  2192  	if err := c.compileEnsureOnRegister(v); err != nil {
  2193  		return err
  2194  	}
  2195  	vr := v.register
  2196  
  2197  	originShape := o.B1
  2198  	signed := o.B2 == 1
  2199  	useLow := o.B3
  2200  	if !useLow {
  2201  		// We have to shift the higher 64-bits into the lower ones before the actual extending instruction.
  2202  		// PALIGNR concatenates the register with itself and shifts the result right by 0x8 bytes (= 64 bits).
  2203  		// See https://www.felixcloutier.com/x86/palignr
  2204  		c.assembler.CompileRegisterToRegisterWithArg(amd64.PALIGNR, v.register, v.register, 0x8)
  2205  	}
  2206  
  2207  	var extend asm.Instruction
  2208  	switch originShape {
  2209  	case wazeroir.ShapeI8x16:
  2210  		if signed {
  2211  			extend = amd64.PMOVSXBW
  2212  		} else {
  2213  			extend = amd64.PMOVZXBW
  2214  		}
  2215  	case wazeroir.ShapeI16x8:
  2216  		if signed {
  2217  			extend = amd64.PMOVSXWD
  2218  		} else {
  2219  			extend = amd64.PMOVZXWD
  2220  		}
  2221  	case wazeroir.ShapeI32x4:
  2222  		if signed {
  2223  			extend = amd64.PMOVSXDQ
  2224  		} else {
  2225  			extend = amd64.PMOVZXDQ
  2226  		}
  2227  	}
  2228  
  2229  	c.assembler.CompileRegisterToRegister(extend, vr, vr)
  2230  	c.pushVectorRuntimeValueLocationOnRegister(vr)
  2231  	return nil
  2232  }
  2233  
  2234  // compileV128ExtMul implements compiler.compileV128ExtMul for amd64.
  2235  func (c *amd64Compiler) compileV128ExtMul(o *wazeroir.UnionOperation) error {
  2236  	x2 := c.locationStack.popV128()
  2237  	if err := c.compileEnsureOnRegister(x2); err != nil {
  2238  		return err
  2239  	}
  2240  
  2241  	x1 := c.locationStack.popV128()
  2242  	if err := c.compileEnsureOnRegister(x1); err != nil {
  2243  		return err
  2244  	}
  2245  
  2246  	x1r, x2r := x1.register, x2.register
  2247  
  2248  	originShape := o.B1
  2249  	signed := o.B2 == 1
  2250  	useLow := o.B3
  2251  	switch originShape {
  2252  	case wazeroir.ShapeI8x16:
  2253  		if !useLow {
  2254  			// We have to shift the higher 64-bits into the lower ones before the actual extending instruction.
  2255  			// PALIGNR concatenates the register with itself and shifts the result right by 0x8 bytes (= 64 bits).
  2256  			// See https://www.felixcloutier.com/x86/palignr
  2257  			c.assembler.CompileRegisterToRegisterWithArg(amd64.PALIGNR, x1r, x1r, 0x8)
  2258  			c.assembler.CompileRegisterToRegisterWithArg(amd64.PALIGNR, x2r, x2r, 0x8)
  2259  		}
  2260  
  2261  		var ext asm.Instruction
  2262  		if signed {
  2263  			ext = amd64.PMOVSXBW
  2264  		} else {
  2265  			ext = amd64.PMOVZXBW
  2266  		}
  2267  
  2268  		// Signed or Zero extend lower half packed bytes to packed words.
  2269  		c.assembler.CompileRegisterToRegister(ext, x1r, x1r)
  2270  		c.assembler.CompileRegisterToRegister(ext, x2r, x2r)
  2271  
  2272  		c.assembler.CompileRegisterToRegister(amd64.PMULLW, x2r, x1r)
  2273  	case wazeroir.ShapeI16x8:
  2274  		tmp, err := c.allocateRegister(registerTypeVector)
  2275  		if err != nil {
  2276  			return err
  2277  		}
  2278  
  2279  		// Copy the value on x1r to tmp.
  2280  		c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x1r, tmp)
  2281  
  2282  		// Multiply the values and store the lower 16-bits into x1r.
  2283  		c.assembler.CompileRegisterToRegister(amd64.PMULLW, x2r, x1r)
  2284  		if signed {
  2285  			// Signed multiply the values and store the higher 16-bits into tmp.
  2286  			c.assembler.CompileRegisterToRegister(amd64.PMULHW, x2r, tmp)
  2287  		} else {
  2288  			// Unsigned multiply the values and store the higher 16-bits into tmp.
  2289  			c.assembler.CompileRegisterToRegister(amd64.PMULHUW, x2r, tmp)
  2290  		}
  2291  
  2292  		// Unpack lower or higher half of vectors (tmp and x1r) and concatenate them.
  2293  		if useLow {
  2294  			c.assembler.CompileRegisterToRegister(amd64.PUNPCKLWD, tmp, x1r)
  2295  		} else {
  2296  			c.assembler.CompileRegisterToRegister(amd64.PUNPCKHWD, tmp, x1r)
  2297  		}
  2298  	case wazeroir.ShapeI32x4:
  2299  		var shuffleOrder byte
  2300  		// Given that the original state of the register is [v1, v2, v3, v4] where each vN is a 32-bit lane,
  2301  		if useLow {
  2302  			// This makes the register as [v1, v1, v2, v2]
  2303  			shuffleOrder = 0b01010000
  2304  		} else {
  2305  			// This makes the register as [v3, v3, v4, v4]
  2306  			shuffleOrder = 0b11111010
  2307  		}
  2308  		// See https://www.felixcloutier.com/x86/pshufd
  2309  		c.assembler.CompileRegisterToRegisterWithArg(amd64.PSHUFD, x1r, x1r, shuffleOrder)
  2310  		c.assembler.CompileRegisterToRegisterWithArg(amd64.PSHUFD, x2r, x2r, shuffleOrder)
  2311  
  2312  		var mul asm.Instruction
  2313  		if signed {
  2314  			mul = amd64.PMULDQ
  2315  		} else {
  2316  			mul = amd64.PMULUDQ
  2317  		}
  2318  		c.assembler.CompileRegisterToRegister(mul, x2r, x1r)
  2319  	}
  2320  
  2321  	c.locationStack.markRegisterUnused(x2r)
  2322  	c.pushVectorRuntimeValueLocationOnRegister(x1r)
  2323  	return nil
  2324  }
  2325  
  2326  var q15mulrSatSMask = [16]byte{
  2327  	0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
  2328  	0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
  2329  }
  2330  
  2331  // compileV128Q15mulrSatS implements compiler.compileV128Q15mulrSatS for amd64.
  2332  func (c *amd64Compiler) compileV128Q15mulrSatS(*wazeroir.UnionOperation) error {
  2333  	x2 := c.locationStack.popV128()
  2334  	if err := c.compileEnsureOnRegister(x2); err != nil {
  2335  		return err
  2336  	}
  2337  
  2338  	x1 := c.locationStack.popV128()
  2339  	if err := c.compileEnsureOnRegister(x1); err != nil {
  2340  		return err
  2341  	}
  2342  
  2343  	tmp, err := c.allocateRegister(registerTypeVector)
  2344  	if err != nil {
  2345  		return err
  2346  	}
  2347  
  2348  	x1r, x2r := x1.register, x2.register
  2349  
  2350  	// See https://github.com/WebAssembly/simd/pull/365 for the following logic.
  2351  	if err := c.assembler.CompileStaticConstToRegister(amd64.MOVDQU, asm.NewStaticConst(q15mulrSatSMask[:]), tmp); err != nil {
  2352  		return err
  2353  	}
  2354  
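        	// PMULHRSW computes round((x*y) / 2^15) on each signed 16-bit lane, which matches q15mulr_sat_s
        	// except when both inputs are -0x8000: it then yields -0x8000 instead of the saturated 0x7fff.
        	// The PCMPEQW/PXOR below detect exactly those lanes (result == 0x8000) and flip them to 0x7fff.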
  2355  	c.assembler.CompileRegisterToRegister(amd64.PMULHRSW, x2r, x1r)
  2356  	c.assembler.CompileRegisterToRegister(amd64.PCMPEQW, x1r, tmp)
  2357  	c.assembler.CompileRegisterToRegister(amd64.PXOR, tmp, x1r)
  2358  
  2359  	c.locationStack.markRegisterUnused(x2r)
  2360  	c.pushVectorRuntimeValueLocationOnRegister(x1r)
  2361  	return nil
  2362  }
  2363  
  2364  var (
  2365  	allOnesI8x16 = [16]byte{0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1}
  2366  	allOnesI16x8 = [16]byte{0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0}
  2367  
  2368  	extAddPairwiseI16x8uMask = [16 * 2]byte{
  2369  		0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
  2370  		0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00,
  2371  	}
  2372  )
  2373  
  2374  // compileV128ExtAddPairwise implements compiler.compileV128ExtAddPairwise for amd64.
  2375  func (c *amd64Compiler) compileV128ExtAddPairwise(o *wazeroir.UnionOperation) error {
  2376  	v := c.locationStack.popV128()
  2377  	if err := c.compileEnsureOnRegister(v); err != nil {
  2378  		return err
  2379  	}
  2380  	vr := v.register
  2381  
  2382  	originShape := o.B1
  2383  	signed := o.B3
  2384  	switch originShape {
  2385  	case wazeroir.ShapeI8x16:
  2386  		allOnesReg, err := c.allocateRegister(registerTypeVector)
  2387  		if err != nil {
  2388  			return err
  2389  		}
  2390  
  2391  		if err = c.assembler.CompileStaticConstToRegister(amd64.MOVDQU,
  2392  			asm.NewStaticConst(allOnesI8x16[:]), allOnesReg); err != nil {
  2393  			return err
  2394  		}
  2395  
  2396  		var result asm.Register
  2397  		// See https://www.felixcloutier.com/x86/pmaddubsw for detail.
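        		// PMADDUBSW treats its destination's bytes as unsigned and its source's bytes as signed,
        		// so which operand holds the all-ones vector determines whether the pairwise add behaves
        		// as the signed or the unsigned extadd.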
  2398  		if signed {
  2399  			// Interpret vr's value as signed byte and multiply with one and add pairwise, which results in pairwise
  2400  			// signed extadd.
  2401  			c.assembler.CompileRegisterToRegister(amd64.PMADDUBSW, vr, allOnesReg)
  2402  			result = allOnesReg
  2403  		} else {
  2404  			// Interpret allOnesReg (all ones) as the signed-byte operand, meaning that the multiply-add is effectively unsigned.
  2405  			c.assembler.CompileRegisterToRegister(amd64.PMADDUBSW, allOnesReg, vr)
  2406  			result = vr
  2407  		}
  2408  
  2409  		if result != vr {
  2410  			c.locationStack.markRegisterUnused(vr)
  2411  		}
  2412  		c.pushVectorRuntimeValueLocationOnRegister(result)
  2413  	case wazeroir.ShapeI16x8:
  2414  		tmp, err := c.allocateRegister(registerTypeVector)
  2415  		if err != nil {
  2416  			return err
  2417  		}
  2418  
  2419  		if signed {
  2420  			// See https://www.felixcloutier.com/x86/pmaddwd
  2421  			if err = c.assembler.CompileStaticConstToRegister(amd64.MOVDQU,
  2422  				asm.NewStaticConst(allOnesI16x8[:]), tmp); err != nil {
  2423  				return err
  2424  			}
  2425  
  2426  			c.assembler.CompileRegisterToRegister(amd64.PMADDWD, tmp, vr)
  2427  			c.pushVectorRuntimeValueLocationOnRegister(vr)
  2428  		} else {
  2429  
  2430  			if err = c.assembler.CompileStaticConstToRegister(amd64.MOVDQU,
  2431  				asm.NewStaticConst(extAddPairwiseI16x8uMask[:16]), tmp); err != nil {
  2432  				return err
  2433  			}
  2434  
  2435  			// Flip the sign bits on vr.
  2436  			//
  2437  			// Assuming that vr = [w1, ..., w8] (unsigned 16-bit lanes), now we have
  2438  			// 	vr[i] = int16(wi - 0x8000) for i = 1...8
  2439  			c.assembler.CompileRegisterToRegister(amd64.PXOR, tmp, vr)
  2440  
  2441  			if err = c.assembler.CompileStaticConstToRegister(amd64.MOVDQU,
  2442  				asm.NewStaticConst(allOnesI16x8[:]), tmp); err != nil {
  2443  				return err
  2444  			}
  2445  
  2446  			// For i = 1...4 (as this results in i32x4 lanes), now we have
  2447  			// vr[i] = int32(wn - 0x8000) + int32(w(n+1) - 0x8000) = int32(wn + w(n+1)) - 0x10000
  2448  			c.assembler.CompileRegisterToRegister(amd64.PMADDWD, tmp, vr)
  2449  
  2450  			// tmp[i] = [0x00, 0x00, 0x01, 0x00] = int32(math.MaxUint16+1) = 0x10000
  2451  			if err = c.assembler.CompileStaticConstToRegister(amd64.MOVDQU,
  2452  				asm.NewStaticConst(extAddPairwiseI16x8uMask[16:]), tmp); err != nil {
  2453  				return err
  2454  			}
  2455  
  2456  			// vr[i] = int32(wn + w(n+1)) - 0x10000 + 0x10000 = int32(wn + w(n+1)) = uint32(wn + w(n+1)).
  2457  			c.assembler.CompileRegisterToRegister(amd64.PADDD, tmp, vr)
  2458  			c.pushVectorRuntimeValueLocationOnRegister(vr)
  2459  		}
  2460  	}
  2461  	return nil
  2462  }
  2463  
  2464  // compileV128FloatPromote implements compiler.compileV128FloatPromote for amd64.
  2465  func (c *amd64Compiler) compileV128FloatPromote(*wazeroir.UnionOperation) error {
  2466  	v := c.locationStack.popV128()
  2467  	if err := c.compileEnsureOnRegister(v); err != nil {
  2468  		return err
  2469  	}
  2470  	vr := v.register
  2471  
  2472  	c.assembler.CompileRegisterToRegister(amd64.CVTPS2PD, vr, vr)
  2473  	c.pushVectorRuntimeValueLocationOnRegister(vr)
  2474  	return nil
  2475  }
  2476  
  2477  // compileV128FloatDemote implements compiler.compileV128FloatDemote for amd64.
  2478  func (c *amd64Compiler) compileV128FloatDemote(*wazeroir.UnionOperation) error {
  2479  	v := c.locationStack.popV128()
  2480  	if err := c.compileEnsureOnRegister(v); err != nil {
  2481  		return err
  2482  	}
  2483  	vr := v.register
  2484  
  2485  	c.assembler.CompileRegisterToRegister(amd64.CVTPD2PS, vr, vr)
  2486  	c.pushVectorRuntimeValueLocationOnRegister(vr)
  2487  	return nil
  2488  }
  2489  
  2490  // compileV128Dot implements compiler.compileV128Dot for amd64.
  2491  func (c *amd64Compiler) compileV128Dot(*wazeroir.UnionOperation) error {
  2492  	x2 := c.locationStack.popV128()
  2493  	if err := c.compileEnsureOnRegister(x2); err != nil {
  2494  		return err
  2495  	}
  2496  
  2497  	x1 := c.locationStack.popV128()
  2498  	if err := c.compileEnsureOnRegister(x1); err != nil {
  2499  		return err
  2500  	}
  2501  
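        	// i32x4.dot_i16x8_s maps directly onto PMADDWD: multiply the signed 16-bit lanes pairwise
        	// and add adjacent products into 32-bit lanes.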
  2502  	c.assembler.CompileRegisterToRegister(amd64.PMADDWD, x2.register, x1.register)
  2503  
  2504  	c.locationStack.markRegisterUnused(x2.register)
  2505  	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
  2506  	return nil
  2507  }
  2508  
  2509  var fConvertFromIMask = [16]byte{
  2510  	0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  2511  }
  2512  
  2513  // compileV128FConvertFromI implements compiler.compileV128FConvertFromI for amd64.
  2514  func (c *amd64Compiler) compileV128FConvertFromI(o *wazeroir.UnionOperation) error {
  2515  	v := c.locationStack.popV128()
  2516  	if err := c.compileEnsureOnRegister(v); err != nil {
  2517  		return err
  2518  	}
  2519  	vr := v.register
  2520  
  2521  	destinationShape := o.B1
  2522  	signed := o.B3
  2523  
  2524  	switch destinationShape {
  2525  	case wazeroir.ShapeF32x4:
  2526  		if signed {
  2527  			c.assembler.CompileRegisterToRegister(amd64.CVTDQ2PS, vr, vr)
  2528  		} else {
  2529  			tmp, err := c.allocateRegister(registerTypeVector)
  2530  			if err != nil {
  2531  				return err
  2532  			}
  2533  
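        			// CVTDQ2PS converts signed 32-bit integers only, so each unsigned lane is split into a low
        			// and a high part which are converted separately (the high part halved so that it stays in
        			// signed range, then doubled) and finally added together.
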
  2534  			// Copy the value into tmp.
  2535  			c.assembler.CompileRegisterToRegister(amd64.MOVDQA, vr, tmp)
  2536  
  2537  			// Clear the higher 10 bits of tmp, keeping only the lower 22 bits of each lane.
  2538  			c.assembler.CompileConstToRegister(amd64.PSLLD, 0xa, tmp)
  2539  			c.assembler.CompileConstToRegister(amd64.PSRLD, 0xa, tmp)
  2540  
  2541  			// Subtract the lower bits (tmp) from vr == clear the lower 22 bits of vr.
  2542  			c.assembler.CompileRegisterToRegister(amd64.PSUBD, tmp, vr)
  2543  
  2544  			// Convert the lower bits in tmp (each below 2^22, so the conversion is exact).
  2545  			c.assembler.CompileRegisterToRegister(amd64.CVTDQ2PS, tmp, tmp)
  2546  
  2547  			// Logical right shift by one and convert vr, which gives the halved conversion result of the higher bits in vr.
  2548  			c.assembler.CompileConstToRegister(amd64.PSRLD, 1, vr)
  2549  			c.assembler.CompileRegisterToRegister(amd64.CVTDQ2PS, vr, vr)
  2550  
  2551  			// Double the converted halved higher bits.
  2552  			c.assembler.CompileRegisterToRegister(amd64.ADDPS, vr, vr)
  2553  
  2554  			// Get the conversion result by adding tmp (holding the lower-bit conversion) into vr.
  2555  			c.assembler.CompileRegisterToRegister(amd64.ADDPS, tmp, vr)
  2556  		}
  2557  	case wazeroir.ShapeF64x2:
  2558  		if signed {
  2559  			c.assembler.CompileRegisterToRegister(amd64.CVTDQ2PD, vr, vr)
  2560  		} else {
  2561  			tmp, err := c.allocateRegister(registerTypeVector)
  2562  			if err != nil {
  2563  				return err
  2564  			}
  2565  
  2566  			// tmp = [0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00]
  2567  			if err = c.assembler.CompileStaticConstToRegister(amd64.MOVDQU, asm.NewStaticConst(fConvertFromIMask[:16]), tmp); err != nil {
  2568  				return err
  2569  			}
  2570  
  2571  			// Given that we have vr = [d1, d2, d3, d4], this results in
  2572  			//	vr = [d1, [0x00, 0x00, 0x30, 0x43], d2, [0x00, 0x00, 0x30, 0x43]]
  2573  			//     = [float64(uint32(d1)) + 0x1.0p52, float64(uint32(d2)) + 0x1.0p52]
  2574  			//     ^See https://stackoverflow.com/questions/13269523/can-all-32-bit-ints-be-exactly-represented-as-a-double
  2575  			c.assembler.CompileRegisterToRegister(amd64.UNPCKLPS, tmp, vr)
  2576  
  2577  			// tmp = [float64(0x1.0p52), float64(0x1.0p52)]
  2578  			if err = c.assembler.CompileStaticConstToRegister(amd64.MOVDQU,
  2579  				asm.NewStaticConst(twop52[:]), tmp); err != nil {
  2580  				return err
  2581  			}
  2582  
  2583  			// Now, we get the result as
  2584  			// 	vr = [float64(uint32(d1)), float64(uint32(d2))]
  2585  			// because the following equality always satisfies:
  2586  			//  float64(0x1.0p52 + float64(uint32(x))) - float64(0x1.0p52 + float64(uint32(y))) = float64(uint32(x)) - float64(uint32(y))
  2587  			c.assembler.CompileRegisterToRegister(amd64.SUBPD, tmp, vr)
  2588  		}
  2589  	}
  2590  
  2591  	c.pushVectorRuntimeValueLocationOnRegister(vr)
  2592  	return nil
  2593  }
  2594  
  2595  // compileV128Narrow implements compiler.compileV128Narrow for amd64.
  2596  func (c *amd64Compiler) compileV128Narrow(o *wazeroir.UnionOperation) error {
  2597  	x2 := c.locationStack.popV128()
  2598  	if err := c.compileEnsureOnRegister(x2); err != nil {
  2599  		return err
  2600  	}
  2601  
  2602  	x1 := c.locationStack.popV128()
  2603  	if err := c.compileEnsureOnRegister(x1); err != nil {
  2604  		return err
  2605  	}
  2606  
  2607  	var narrow asm.Instruction
  2608  	originShape := o.B1
  2609  	signed := o.B3
  2610  	switch originShape {
  2611  	case wazeroir.ShapeI16x8:
  2612  		if signed {
  2613  			narrow = amd64.PACKSSWB
  2614  		} else {
  2615  			narrow = amd64.PACKUSWB
  2616  		}
  2617  	case wazeroir.ShapeI32x4:
  2618  		if signed {
  2619  			narrow = amd64.PACKSSDW
  2620  		} else {
  2621  			narrow = amd64.PACKUSDW
  2622  		}
  2623  	}
  2624  	c.assembler.CompileRegisterToRegister(narrow, x2.register, x1.register)
  2625  
  2626  	c.locationStack.markRegisterUnused(x2.register)
  2627  	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
  2628  	return nil
  2629  }
  2630  
  2631  var (
  2632  	// i32sMaxOnF64x2 holds math.MaxInt32(=2147483647.0) on two f64 lanes.
  2633  	i32sMaxOnF64x2 = [16]byte{
  2634  		0x00, 0x00, 0xc0, 0xff, 0xff, 0xff, 0xdf, 0x41, // float64(2147483647.0)
  2635  		0x00, 0x00, 0xc0, 0xff, 0xff, 0xff, 0xdf, 0x41, // float64(2147483647.0)
  2636  	}
  2637  
  2638  	// i32uMaxOnF64x2 holds math.MaxUint32(=4294967295.0) on two f64 lanes.
  2639  	i32uMaxOnF64x2 = [16]byte{
  2640  		0x00, 0x00, 0xe0, 0xff, 0xff, 0xff, 0xef, 0x41, // float64(4294967295.0)
  2641  		0x00, 0x00, 0xe0, 0xff, 0xff, 0xff, 0xef, 0x41, // float64(4294967295.0)
  2642  	}
  2643  
  2644  	// twop52 holds float64(0x1.0p52) on two f64 lanes. 0x1.0p52 is special in the sense that,
  2645  	// with this exponent, the mantissa directly represents a corresponding uint32 number, and after
  2646  	// arithmetic like addition or subtraction, the resulting floating point still holds exactly
  2647  	// the same 32-bit integer bit representation in its mantissa.
  2648  	//
  2649  	// Note: the name twop52 is common across various compiler ecosystems.
  2650  	// 	E.g. https://github.com/llvm/llvm-project/blob/92ab024f81e5b64e258b7c3baaf213c7c26fcf40/compiler-rt/lib/builtins/floatdidf.c#L28
  2651  	// 	E.g. https://opensource.apple.com/source/clang/clang-425.0.24/src/projects/compiler-rt/lib/floatdidf.c.auto.html
  2652  	twop52 = [16]byte{
  2653  		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, // float64(0x1.0p52)
  2654  		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, // float64(0x1.0p52)
  2655  	}
  2656  )
  2657  
  2658  // compileV128ITruncSatFromF implements compiler.compileV128ITruncSatFromF for amd64.
  2659  func (c *amd64Compiler) compileV128ITruncSatFromF(o *wazeroir.UnionOperation) error {
  2660  	v := c.locationStack.popV128()
  2661  	if err := c.compileEnsureOnRegister(v); err != nil {
  2662  		return err
  2663  	}
  2664  	vr := v.register
  2665  
  2666  	tmp, err := c.allocateRegister(registerTypeVector)
  2667  	if err != nil {
  2668  		return err
  2669  	}
  2670  
  2671  	c.locationStack.markRegisterUsed(tmp)
  2672  
  2673  	originShape := o.B1
  2674  	signed := o.B3
  2675  	switch originShape {
  2676  	case wazeroir.ShapeF32x4:
  2677  		if signed {
  2678  			// Copy the value into tmp.
  2679  			c.assembler.CompileRegisterToRegister(amd64.MOVDQA, vr, tmp)
  2680  
  2681  			// Assuming we have vr = [v1, v2, v3, v4].
  2682  			//
  2683  			// Set all bits if lane is not NaN on tmp.
  2684  			// tmp[i] = 0xffffffff  if vi != NaN
  2685  			//        = 0           if vi == NaN
  2686  			c.assembler.CompileRegisterToRegister(amd64.CMPEQPS, tmp, tmp)
  2687  
  2688  			// Clear NaN lanes on vr, meaning that
  2689  			// 	vr[i] = vi  if vi != NaN
  2690  			//	        0   if vi == NaN
  2691  			c.assembler.CompileRegisterToRegister(amd64.ANDPS, tmp, vr)
  2692  
  2693  			// tmp[i] = ^vi         if vi != NaN
  2694  			//        = 0xffffffff  if vi == NaN
  2695  			// which means that tmp[i] & 0x80000000 != 0 if and only if vi is negative.
  2696  			c.assembler.CompileRegisterToRegister(amd64.PXOR, vr, tmp)
  2697  
  2698  			// vr[i] = int32(vi)   if vi != NaN and vr is not overflowing.
  2699  			//       = 0x80000000  if vi != NaN and vr is overflowing (See https://www.felixcloutier.com/x86/cvttps2dq)
  2700  			//       = 0           if vi == NaN
  2701  			c.assembler.CompileRegisterToRegister(amd64.CVTTPS2DQ, vr, vr)
  2702  
  2703  			// Below, we have to convert 0x80000000 into 0x7FFFFFFF for positive overflowing lanes.
  2704  			//
  2705  			// tmp[i] & 0x80000000 != 0  if vi is positive and overflows int32
  2706  			//                     == 0  otherwise (vi is negative, zero, NaN, or fits in int32).
  2707  			c.assembler.CompileRegisterToRegister(amd64.PAND, vr, tmp)
  2708  
  2709  			// Arithmetically shift tmp right by 31, so that we have
  2710  			// tmp[i] = 0xffffffff if vi is positive and overflowing, 0 otherwise.
  2711  			c.assembler.CompileConstToRegister(amd64.PSRAD, 0x1f, tmp)
  2712  
  2713  			// Flip 0x80000000 into 0x7FFFFFFF on positive overflowing lanes; all other lanes are kept intact.
  2714  			c.assembler.CompileRegisterToRegister(amd64.PXOR, tmp, vr)
  2715  		} else {
  2716  			tmp2, err := c.allocateRegister(registerTypeVector)
  2717  			if err != nil {
  2718  				return err
  2719  			}
  2720  
  2721  			// See https://github.com/bytecodealliance/wasmtime/pull/2440
  2722  			// Note: even v8 doesn't seem to have support for this i32x4.trunc_sat_f32x4_u.
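        			// Zero tmp, then clamp vr with MAXPS: vr[i] = vi if vi > 0, and 0 for negative, zero and
        			// NaN lanes (when either operand is NaN, MAXPS returns the source operand, i.e. zero here).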
  2723  			c.assembler.CompileRegisterToRegister(amd64.PXOR, tmp, tmp)
  2724  			c.assembler.CompileRegisterToRegister(amd64.MAXPS, tmp, vr)
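        			// Build 2^31 on each f32 lane of tmp: all ones, logically shifted right by one, gives
        			// 0x7fffffff, and float32(0x7fffffff) rounds to 2147483648.0.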
  2725  			c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, tmp, tmp)
  2726  			c.assembler.CompileConstToRegister(amd64.PSRLD, 0x1, tmp)
  2727  			c.assembler.CompileRegisterToRegister(amd64.CVTDQ2PS, tmp, tmp)
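        			// Keep the clamped values in tmp2, then truncate vr as signed:
        			// vr[i] = int32(vi), or 0x80000000 for lanes with vi >= 2^31.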
  2728  			c.assembler.CompileRegisterToRegister(amd64.MOVDQA, vr, tmp2)
  2729  			c.assembler.CompileRegisterToRegister(amd64.CVTTPS2DQ, vr, vr)
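        			// tmp2[i] = vi - 2^31: the amount by which vi exceeds the int32 range (negative if vi < 2^31).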
  2730  			c.assembler.CompileRegisterToRegister(amd64.SUBPS, tmp, tmp2)
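        			// tmp[i] = 0xffffffff if 2^31 <= vi - 2^31 (i.e. vi >= 2^32, out of the uint32 range), 0 otherwise.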
  2731  			c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPS, tmp2, tmp, 0x2) // == CMPLEPS
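        			// tmp2[i] = int32(vi - 2^31): the "high" part of the result (0x80000000 when vi - 2^31 overflows int32).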
  2732  			c.assembler.CompileRegisterToRegister(amd64.CVTTPS2DQ, tmp2, tmp2)
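        			// Saturate the high part: lanes with vi >= 2^32 flip 0x80000000 into 0x7fffffff.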
  2733  			c.assembler.CompileRegisterToRegister(amd64.PXOR, tmp, tmp2)
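        			// Zero tmp again, then clear the (negative) high part on lanes where vi < 2^31.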
  2734  			c.assembler.CompileRegisterToRegister(amd64.PXOR, tmp, tmp)
  2735  			c.assembler.CompileRegisterToRegister(amd64.PMAXSD, tmp, tmp2)
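        			// vr[i] = low part + high part = uint32(vi), saturating to 0xffffffff for vi >= 2^32
        			// and to 0 for negative or NaN lanes.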
  2736  			c.assembler.CompileRegisterToRegister(amd64.PADDD, tmp2, vr)
  2737  		}
  2738  	case wazeroir.ShapeF64x2:
  2739  		tmp2, err := c.allocateRegister(registerTypeVector)
  2740  		if err != nil {
  2741  			return err
  2742  		}
  2743  
  2744  		if signed {
  2745  			// Copy the value into tmp.
  2746  			c.assembler.CompileRegisterToRegister(amd64.MOVDQA, vr, tmp)
  2747  
  2748  			// Set all bits for non-NaN lanes, zeros otherwise.
  2749  			// I.e. tmp[i] = 0xffffffff_ffffffff if vi != NaN, 0 otherwise.
  2750  			c.assembler.CompileRegisterToRegister(amd64.CMPEQPD, tmp, tmp)
  2751  
  2752  			// Load 2147483647.0 (math.MaxInt32 as float64) into each lane of tmp2.
  2753  			if err = c.assembler.CompileStaticConstToRegister(amd64.MOVUPD, asm.NewStaticConst(i32sMaxOnF64x2[:]), tmp2); err != nil {
  2754  				return err
  2755  			}
  2756  
  2757  			// tmp[i] = 2147483647 if vi != NaN, 0 otherwise.
  2758  			c.assembler.CompileRegisterToRegister(amd64.ANDPS, tmp2, tmp)
  2759  
  2760  			// When either operand is NaN, MINPD returns the second operand (tmp here), so we have
  2761  			//  vr[i] = min(vi, 2147483647.0)  if vi != NaN
  2762  			//        = 0                      if vi == NaN
  2763  			c.assembler.CompileRegisterToRegister(amd64.MINPD, tmp, vr)
  2764  
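        			// Convert the two f64 lanes into two i32s on the lower 64 bits of vr (the upper half is zeroed);
        			// lanes below math.MinInt32 become 0x80000000 (= math.MinInt32), completing the saturation.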
  2765  			c.assembler.CompileRegisterToRegister(amd64.CVTTPD2DQ, vr, vr)
  2766  		} else {
  2767  			// Clears all bits on tmp.
  2768  			c.assembler.CompileRegisterToRegister(amd64.PXOR, tmp, tmp)
  2769  
  2770  			//  vr[i] = vi   if vi != NaN && vi > 0
  2771  			//        = 0    if vi == NaN || vi <= 0
  2772  			c.assembler.CompileRegisterToRegister(amd64.MAXPD, tmp, vr)
  2773  
  2774  			// tmp2[i] = float64(math.MaxUint32) = 4294967295.0
  2775  			if err = c.assembler.CompileStaticConstToRegister(amd64.MOVUPD, asm.NewStaticConst(i32uMaxOnF64x2[:]), tmp2); err != nil {
  2776  				return err
  2777  			}
  2778  
  2779  			// vr[i] = min(vi, 4294967295.0)  if vi != NaN && vi > 0
  2780  			//       = 0                      if vi == NaN || vi <= 0
  2781  			c.assembler.CompileRegisterToRegister(amd64.MINPD, tmp2, vr)
  2782  
  2783  			// Truncate each lane to an integral value (ROUNDPD with imm8 0x3 = round toward zero).
  2784  			c.assembler.CompileRegisterToRegisterWithArg(amd64.ROUNDPD, vr, vr, 0x3)
  2785  
  2786  			// tmp2[i] = float64(0x1.0p52)
  2787  			if err = c.assembler.CompileStaticConstToRegister(amd64.MOVUPD, asm.NewStaticConst(twop52[:]), tmp2); err != nil {
  2788  				return err
  2789  			}
  2790  
  2791  			// vr[i] = float64(0x1.0p52) + float64(uint32(vi))  if vi != NaN && 0 < vi <= math.MaxUint32
  2792  			//       = float64(0x1.0p52) + (4294967295.0 or 0)  for overflowing / NaN / non-positive lanes
  2793  			//
  2794  			// In every case, vr[i] holds exactly the bit pattern of the saturated uint32 result in its lower 32 bits.
  2795  			c.assembler.CompileRegisterToRegister(amd64.ADDPD, tmp2, vr)
  2796  
  2797  			// At this point, we have
  2798  			// 	vr  = [uint32(v0), float64(0x1.0p52), uint32(v1), float64(0x1.0p52)]
  2799  			//  tmp = [0, 0, 0, 0]
  2800  			// as 32x4 lanes. Therefore, SHUFPS with 0b00_00_10_00 results in
  2801  			//	vr = [vr[00], vr[10], tmp[00], tmp[00]] = [vr[00], vr[10], 0, 0]
  2802  			// meaning that for i = 0 and 1, we have
  2803  			//  vr[i] = uint32(vi) saturated into [0, math.MaxUint32] (0xffffffff for vi > math.MaxUint32)
  2804  			//        = 0          if vi == NaN || vi <= 0.
  2805  			c.assembler.CompileRegisterToRegisterWithArg(amd64.SHUFPS, tmp, vr, 0b00_00_10_00)
  2806  		}
  2807  	}
  2808  
  2809  	c.locationStack.markRegisterUnused(tmp)
  2810  	c.pushVectorRuntimeValueLocationOnRegister(vr)
  2811  	return nil
  2812  }
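
        // truncSatF32ToI32 is an illustrative scalar sketch (a hypothetical helper for exposition, not
        // used anywhere by the compiler) of the per-lane semantics that the signed f32x4 case above
        // implements branch-free with SSE: NaN lanes become 0 and out-of-range lanes saturate to the
        // int32 bounds. The unsigned and f64x2 variants follow the same pattern with uint32/f64 bounds.
        func truncSatF32ToI32(v float32) int32 {
        	switch {
        	case v != v: // NaN lanes are mapped to zero.
        		return 0
        	case v >= 2147483648.0: // Positive overflow saturates to math.MaxInt32.
        		return 2147483647
        	case v <= -2147483648.0: // Negative overflow saturates to math.MinInt32.
        		return -2147483648
        	default:
        		return int32(v) // In-range values truncate toward zero.
        	}
        }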