wa-lang.org/wazero@v1.0.2/internal/engine/compiler/impl_vec_amd64.go

     1  package compiler
     2  
     3  import (
     4  	"errors"
     5  
     6  	"wa-lang.org/wazero/internal/asm"
     7  	"wa-lang.org/wazero/internal/asm/amd64"
     8  	"wa-lang.org/wazero/internal/wazeroir"
     9  )
    10  
    11  // compileV128Const implements compiler.compileV128Const for amd64 architecture.
    12  func (c *amd64Compiler) compileV128Const(o *wazeroir.OperationV128Const) error {
    13  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
    14  		return err
    15  	}
    16  
    17  	result, err := c.allocateRegister(registerTypeVector)
    18  	if err != nil {
    19  		return err
    20  	}
    21  
     22  	// We cannot load a 64-bit immediate directly into a vector register,
     23  	// so we move it via a general purpose (integer) register temporarily.
    24  	tmpReg, err := c.allocateRegister(registerTypeGeneralPurpose)
    25  	if err != nil {
    26  		return err
    27  	}
    28  
    29  	// Move the lower 64-bits.
    30  	if o.Lo == 0 {
    31  		c.assembler.CompileRegisterToRegister(amd64.XORQ, tmpReg, tmpReg)
    32  	} else {
    33  		c.assembler.CompileConstToRegister(amd64.MOVQ, int64(o.Lo), tmpReg)
    34  	}
    35  	c.assembler.CompileRegisterToRegister(amd64.MOVQ, tmpReg, result)
    36  
    37  	if o.Lo != 0 && o.Hi == 0 {
    38  		c.assembler.CompileRegisterToRegister(amd64.XORQ, tmpReg, tmpReg)
    39  	} else if o.Hi != 0 {
    40  		c.assembler.CompileConstToRegister(amd64.MOVQ, int64(o.Hi), tmpReg)
    41  	}
     42  	// Move the higher 64 bits into the second element of the 64x2 vector with PINSRQ.
    43  	c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRQ, tmpReg, result, 1)
    44  
    45  	c.pushVectorRuntimeValueLocationOnRegister(result)
    46  	return nil
    47  }
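// For reference, when both halves of the constant are non-zero, the code above emits
// roughly the following sequence (a sketch; the concrete registers are whatever the
// allocator picks, shown here only for illustration):
//
//	MOVQ   $lo, tmpReg        // load the lower 64 bits into the temporary GP register
//	MOVQ   tmpReg, result     // write them into the low 64 bits of the vector register
//	MOVQ   $hi, tmpReg        // reuse the temporary for the upper 64 bits
//	PINSRQ $1, tmpReg, result // insert them as the second 64-bit lane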
    48  
    49  // compileV128Add implements compiler.compileV128Add for amd64 architecture.
    50  func (c *amd64Compiler) compileV128Add(o *wazeroir.OperationV128Add) error {
    51  	x2 := c.locationStack.popV128()
    52  	if err := c.compileEnsureOnRegister(x2); err != nil {
    53  		return err
    54  	}
    55  
    56  	x1 := c.locationStack.popV128()
    57  	if err := c.compileEnsureOnRegister(x1); err != nil {
    58  		return err
    59  	}
    60  	var inst asm.Instruction
    61  	switch o.Shape {
    62  	case wazeroir.ShapeI8x16:
    63  		inst = amd64.PADDB
    64  	case wazeroir.ShapeI16x8:
    65  		inst = amd64.PADDW
    66  	case wazeroir.ShapeI32x4:
    67  		inst = amd64.PADDD
    68  	case wazeroir.ShapeI64x2:
    69  		inst = amd64.PADDQ
    70  	case wazeroir.ShapeF32x4:
    71  		inst = amd64.ADDPS
    72  	case wazeroir.ShapeF64x2:
    73  		inst = amd64.ADDPD
    74  	}
    75  	c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register)
    76  
    77  	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
    78  	c.locationStack.markRegisterUnused(x2.register)
    79  	return nil
    80  }
    81  
    82  // compileV128Sub implements compiler.compileV128Sub for amd64 architecture.
    83  func (c *amd64Compiler) compileV128Sub(o *wazeroir.OperationV128Sub) error {
    84  	x2 := c.locationStack.popV128()
    85  	if err := c.compileEnsureOnRegister(x2); err != nil {
    86  		return err
    87  	}
    88  
    89  	x1 := c.locationStack.popV128()
    90  	if err := c.compileEnsureOnRegister(x1); err != nil {
    91  		return err
    92  	}
    93  	var inst asm.Instruction
    94  	switch o.Shape {
    95  	case wazeroir.ShapeI8x16:
    96  		inst = amd64.PSUBB
    97  	case wazeroir.ShapeI16x8:
    98  		inst = amd64.PSUBW
    99  	case wazeroir.ShapeI32x4:
   100  		inst = amd64.PSUBD
   101  	case wazeroir.ShapeI64x2:
   102  		inst = amd64.PSUBQ
   103  	case wazeroir.ShapeF32x4:
   104  		inst = amd64.SUBPS
   105  	case wazeroir.ShapeF64x2:
   106  		inst = amd64.SUBPD
   107  	}
   108  	c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register)
   109  
   110  	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
   111  	c.locationStack.markRegisterUnused(x2.register)
   112  	return nil
   113  }
   114  
   115  // compileV128Load implements compiler.compileV128Load for amd64 architecture.
   116  func (c *amd64Compiler) compileV128Load(o *wazeroir.OperationV128Load) error {
   117  	result, err := c.allocateRegister(registerTypeVector)
   118  	if err != nil {
   119  		return err
   120  	}
   121  
   122  	switch o.Type {
   123  	case wazeroir.V128LoadType128:
   124  		err = c.compileV128LoadImpl(amd64.MOVDQU, o.Arg.Offset, 16, result)
   125  	case wazeroir.V128LoadType8x8s:
   126  		err = c.compileV128LoadImpl(amd64.PMOVSXBW, o.Arg.Offset, 8, result)
   127  	case wazeroir.V128LoadType8x8u:
   128  		err = c.compileV128LoadImpl(amd64.PMOVZXBW, o.Arg.Offset, 8, result)
   129  	case wazeroir.V128LoadType16x4s:
   130  		err = c.compileV128LoadImpl(amd64.PMOVSXWD, o.Arg.Offset, 8, result)
   131  	case wazeroir.V128LoadType16x4u:
   132  		err = c.compileV128LoadImpl(amd64.PMOVZXWD, o.Arg.Offset, 8, result)
   133  	case wazeroir.V128LoadType32x2s:
   134  		err = c.compileV128LoadImpl(amd64.PMOVSXDQ, o.Arg.Offset, 8, result)
   135  	case wazeroir.V128LoadType32x2u:
   136  		err = c.compileV128LoadImpl(amd64.PMOVZXDQ, o.Arg.Offset, 8, result)
   137  	case wazeroir.V128LoadType8Splat:
   138  		reg, err := c.compileMemoryAccessCeilSetup(o.Arg.Offset, 1)
   139  		if err != nil {
   140  			return err
   141  		}
   142  		c.assembler.CompileMemoryWithIndexToRegister(amd64.MOVBQZX, amd64ReservedRegisterForMemory, -1,
   143  			reg, 1, reg)
   144  		// pinsrb   $0, reg, result
   145  		// pxor	    tmpVReg, tmpVReg
   146  		// pshufb   tmpVReg, result
   147  		c.locationStack.markRegisterUsed(result)
   148  		tmpVReg, err := c.allocateRegister(registerTypeVector)
   149  		if err != nil {
   150  			return err
   151  		}
   152  		c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRB, reg, result, 0)
   153  		c.assembler.CompileRegisterToRegister(amd64.PXOR, tmpVReg, tmpVReg)
   154  		c.assembler.CompileRegisterToRegister(amd64.PSHUFB, tmpVReg, result)
   155  	case wazeroir.V128LoadType16Splat:
   156  		reg, err := c.compileMemoryAccessCeilSetup(o.Arg.Offset, 2)
   157  		if err != nil {
   158  			return err
   159  		}
   160  		c.assembler.CompileMemoryWithIndexToRegister(amd64.MOVWQZX, amd64ReservedRegisterForMemory, -2,
   161  			reg, 1, reg)
   162  		// pinsrw $0, reg, result
   163  		// pinsrw $1, reg, result
   164  		// pshufd $0, result, result (result = result[0,0,0,0])
   165  		c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRW, reg, result, 0)
   166  		c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRW, reg, result, 1)
   167  		c.assembler.CompileRegisterToRegisterWithArg(amd64.PSHUFD, result, result, 0)
   168  	case wazeroir.V128LoadType32Splat:
   169  		reg, err := c.compileMemoryAccessCeilSetup(o.Arg.Offset, 4)
   170  		if err != nil {
   171  			return err
   172  		}
   173  		c.assembler.CompileMemoryWithIndexToRegister(amd64.MOVLQZX, amd64ReservedRegisterForMemory, -4,
   174  			reg, 1, reg)
   175  		// pinsrd $0, reg, result
   176  		// pshufd $0, result, result (result = result[0,0,0,0])
   177  		c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRD, reg, result, 0)
   178  		c.assembler.CompileRegisterToRegisterWithArg(amd64.PSHUFD, result, result, 0)
   179  	case wazeroir.V128LoadType64Splat:
   180  		reg, err := c.compileMemoryAccessCeilSetup(o.Arg.Offset, 8)
   181  		if err != nil {
   182  			return err
   183  		}
   184  		c.assembler.CompileMemoryWithIndexToRegister(amd64.MOVQ, amd64ReservedRegisterForMemory, -8,
   185  			reg, 1, reg)
   186  		// pinsrq $0, reg, result
   187  		// pinsrq $1, reg, result
   188  		c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRQ, reg, result, 0)
   189  		c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRQ, reg, result, 1)
   190  	case wazeroir.V128LoadType32zero:
   191  		err = c.compileV128LoadImpl(amd64.MOVL, o.Arg.Offset, 4, result)
   192  	case wazeroir.V128LoadType64zero:
   193  		err = c.compileV128LoadImpl(amd64.MOVQ, o.Arg.Offset, 8, result)
   194  	}
   195  
   196  	if err != nil {
   197  		return err
   198  	}
   199  
   200  	c.pushVectorRuntimeValueLocationOnRegister(result)
   201  	return nil
   202  }
   203  
   204  func (c *amd64Compiler) compileV128LoadImpl(inst asm.Instruction, offset uint32, targetSizeInBytes int64, dst asm.Register) error {
   205  	offsetReg, err := c.compileMemoryAccessCeilSetup(offset, targetSizeInBytes)
   206  	if err != nil {
   207  		return err
   208  	}
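	// compileMemoryAccessCeilSetup bounds-checks the access and (as used throughout this
	// file) leaves the access ceil, i.e. offset+targetSizeInBytes, in offsetReg. The
	// negative displacement below therefore points the load back at the first byte of
	// the accessed region: memoryBase + offsetReg - targetSizeInBytes.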
   209  	c.assembler.CompileMemoryWithIndexToRegister(inst, amd64ReservedRegisterForMemory, -targetSizeInBytes,
   210  		offsetReg, 1, dst)
   211  	return nil
   212  }
   213  
   214  // compileV128LoadLane implements compiler.compileV128LoadLane for amd64.
   215  func (c *amd64Compiler) compileV128LoadLane(o *wazeroir.OperationV128LoadLane) error {
   216  	targetVector := c.locationStack.popV128()
   217  	if err := c.compileEnsureOnRegister(targetVector); err != nil {
   218  		return err
   219  	}
   220  
   221  	var insertInst asm.Instruction
   222  	switch o.LaneSize {
   223  	case 8:
   224  		insertInst = amd64.PINSRB
   225  	case 16:
   226  		insertInst = amd64.PINSRW
   227  	case 32:
   228  		insertInst = amd64.PINSRD
   229  	case 64:
   230  		insertInst = amd64.PINSRQ
   231  	}
   232  
   233  	targetSizeInBytes := int64(o.LaneSize / 8)
   234  	offsetReg, err := c.compileMemoryAccessCeilSetup(o.Arg.Offset, targetSizeInBytes)
   235  	if err != nil {
   236  		return err
   237  	}
   238  	c.assembler.CompileMemoryWithIndexAndArgToRegister(insertInst, amd64ReservedRegisterForMemory, -targetSizeInBytes,
   239  		offsetReg, 1, targetVector.register, o.LaneIndex)
   240  
   241  	c.pushVectorRuntimeValueLocationOnRegister(targetVector.register)
   242  	return nil
   243  }
   244  
   245  // compileV128Store implements compiler.compileV128Store for amd64.
   246  func (c *amd64Compiler) compileV128Store(o *wazeroir.OperationV128Store) error {
   247  	val := c.locationStack.popV128()
   248  	if err := c.compileEnsureOnRegister(val); err != nil {
   249  		return err
   250  	}
   251  
   252  	const targetSizeInBytes = 16
   253  	offsetReg, err := c.compileMemoryAccessCeilSetup(o.Arg.Offset, targetSizeInBytes)
   254  	if err != nil {
   255  		return err
   256  	}
   257  
   258  	c.assembler.CompileRegisterToMemoryWithIndex(amd64.MOVDQU, val.register,
   259  		amd64ReservedRegisterForMemory, -targetSizeInBytes, offsetReg, 1)
   260  
   261  	c.locationStack.markRegisterUnused(val.register, offsetReg)
   262  	return nil
   263  }
   264  
   265  // compileV128StoreLane implements compiler.compileV128StoreLane for amd64.
   266  func (c *amd64Compiler) compileV128StoreLane(o *wazeroir.OperationV128StoreLane) error {
   267  	var storeInst asm.Instruction
   268  	switch o.LaneSize {
   269  	case 8:
   270  		storeInst = amd64.PEXTRB
   271  	case 16:
   272  		storeInst = amd64.PEXTRW
   273  	case 32:
   274  		storeInst = amd64.PEXTRD
   275  	case 64:
   276  		storeInst = amd64.PEXTRQ
   277  	}
   278  
   279  	val := c.locationStack.popV128()
   280  	if err := c.compileEnsureOnRegister(val); err != nil {
   281  		return err
   282  	}
   283  
   284  	targetSizeInBytes := int64(o.LaneSize / 8)
   285  	offsetReg, err := c.compileMemoryAccessCeilSetup(o.Arg.Offset, targetSizeInBytes)
   286  	if err != nil {
   287  		return err
   288  	}
   289  
   290  	c.assembler.CompileRegisterToMemoryWithIndexAndArg(storeInst, val.register,
   291  		amd64ReservedRegisterForMemory, -targetSizeInBytes, offsetReg, 1, o.LaneIndex)
   292  
   293  	c.locationStack.markRegisterUnused(val.register, offsetReg)
   294  	return nil
   295  }
   296  
   297  // compileV128ExtractLane implements compiler.compileV128ExtractLane for amd64.
   298  func (c *amd64Compiler) compileV128ExtractLane(o *wazeroir.OperationV128ExtractLane) error {
   299  	val := c.locationStack.popV128()
   300  	if err := c.compileEnsureOnRegister(val); err != nil {
   301  		return err
   302  	}
   303  	switch o.Shape {
   304  	case wazeroir.ShapeI8x16:
   305  		result, err := c.allocateRegister(registerTypeGeneralPurpose)
   306  		if err != nil {
   307  			return err
   308  		}
   309  		c.assembler.CompileRegisterToRegisterWithArg(amd64.PEXTRB, val.register, result, o.LaneIndex)
   310  		if o.Signed {
   311  			c.assembler.CompileRegisterToRegister(amd64.MOVBLSX, result, result)
   312  		} else {
   313  			c.assembler.CompileRegisterToRegister(amd64.MOVBLZX, result, result)
   314  		}
   315  		c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI32)
   316  		c.locationStack.markRegisterUnused(val.register)
   317  	case wazeroir.ShapeI16x8:
   318  		result, err := c.allocateRegister(registerTypeGeneralPurpose)
   319  		if err != nil {
   320  			return err
   321  		}
   322  		c.assembler.CompileRegisterToRegisterWithArg(amd64.PEXTRW, val.register, result, o.LaneIndex)
   323  		if o.Signed {
   324  			c.assembler.CompileRegisterToRegister(amd64.MOVWLSX, result, result)
   325  		} else {
   326  			c.assembler.CompileRegisterToRegister(amd64.MOVWLZX, result, result)
   327  		}
   328  		c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI32)
   329  		c.locationStack.markRegisterUnused(val.register)
   330  	case wazeroir.ShapeI32x4:
   331  		result, err := c.allocateRegister(registerTypeGeneralPurpose)
   332  		if err != nil {
   333  			return err
   334  		}
   335  		c.assembler.CompileRegisterToRegisterWithArg(amd64.PEXTRD, val.register, result, o.LaneIndex)
   336  		c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI32)
   337  		c.locationStack.markRegisterUnused(val.register)
   338  	case wazeroir.ShapeI64x2:
   339  		result, err := c.allocateRegister(registerTypeGeneralPurpose)
   340  		if err != nil {
   341  			return err
   342  		}
   343  		c.assembler.CompileRegisterToRegisterWithArg(amd64.PEXTRQ, val.register, result, o.LaneIndex)
   344  		c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI64)
   345  		c.locationStack.markRegisterUnused(val.register)
   346  	case wazeroir.ShapeF32x4:
   347  		if o.LaneIndex != 0 {
   348  			c.assembler.CompileRegisterToRegisterWithArg(amd64.PSHUFD, val.register, val.register, o.LaneIndex)
   349  		}
   350  		c.pushRuntimeValueLocationOnRegister(val.register, runtimeValueTypeF32)
   351  	case wazeroir.ShapeF64x2:
   352  		if o.LaneIndex != 0 {
    353  			// In this case, we can assume LaneIndex == 1.
   354  			// We have to modify the val.register as, for example:
   355  			//    0b11 0b10 0b01 0b00
   356  			//     |    |    |    |
   357  			//   [x3,  x2,  x1,  x0] -> [x0,  x0,  x3,  x2]
    358  			// where val.register = [x3, x2, x1, x0] and each xN is 32 bits.
    359  			// The register is then interpreted as float64, so the result is obtained from [x3, x2].
   360  			arg := byte(0b00_00_11_10)
   361  			c.assembler.CompileRegisterToRegisterWithArg(amd64.PSHUFD, val.register, val.register, arg)
   362  		}
   363  		c.pushRuntimeValueLocationOnRegister(val.register, runtimeValueTypeF64)
   364  	}
   365  
   366  	return nil
   367  }
   368  
   369  // compileV128ReplaceLane implements compiler.compileV128ReplaceLane for amd64.
   370  func (c *amd64Compiler) compileV128ReplaceLane(o *wazeroir.OperationV128ReplaceLane) error {
   371  	origin := c.locationStack.pop()
   372  	if err := c.compileEnsureOnRegister(origin); err != nil {
   373  		return err
   374  	}
   375  
   376  	vector := c.locationStack.popV128()
   377  	if err := c.compileEnsureOnRegister(vector); err != nil {
   378  		return err
   379  	}
   380  
   381  	switch o.Shape {
   382  	case wazeroir.ShapeI8x16:
   383  		c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRB, origin.register, vector.register, o.LaneIndex)
   384  	case wazeroir.ShapeI16x8:
   385  		c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRW, origin.register, vector.register, o.LaneIndex)
   386  	case wazeroir.ShapeI32x4:
   387  		c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRD, origin.register, vector.register, o.LaneIndex)
   388  	case wazeroir.ShapeI64x2:
   389  		c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRQ, origin.register, vector.register, o.LaneIndex)
   390  	case wazeroir.ShapeF32x4:
   391  		c.assembler.CompileRegisterToRegisterWithArg(amd64.INSERTPS, origin.register, vector.register,
    392  			// In the INSERTPS instruction, the destination index is encoded in bits 4-5 of the immediate argument.
   393  			// See https://www.felixcloutier.com/x86/insertps
   394  			o.LaneIndex<<4,
   395  		)
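		// For example, LaneIndex == 2 yields the immediate 0b0010_0000, which writes
		// source element 0 of origin.register (the scalar f32) into destination
		// element 2 of vector.register.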
   396  	case wazeroir.ShapeF64x2:
   397  		if o.LaneIndex == 0 {
   398  			c.assembler.CompileRegisterToRegister(amd64.MOVSD, origin.register, vector.register)
   399  		} else {
   400  			c.assembler.CompileRegisterToRegister(amd64.MOVLHPS, origin.register, vector.register)
   401  		}
   402  	}
   403  
   404  	c.pushVectorRuntimeValueLocationOnRegister(vector.register)
   405  	c.locationStack.markRegisterUnused(origin.register)
   406  	return nil
   407  }
   408  
   409  // compileV128Splat implements compiler.compileV128Splat for amd64.
   410  func (c *amd64Compiler) compileV128Splat(o *wazeroir.OperationV128Splat) (err error) {
   411  	origin := c.locationStack.pop()
   412  	if err = c.compileEnsureOnRegister(origin); err != nil {
   413  		return
   414  	}
   415  
   416  	var result asm.Register
   417  	switch o.Shape {
   418  	case wazeroir.ShapeI8x16:
   419  		result, err = c.allocateRegister(registerTypeVector)
   420  		if err != nil {
   421  			return err
   422  		}
   423  		c.locationStack.markRegisterUsed(result)
   424  
   425  		tmp, err := c.allocateRegister(registerTypeVector)
   426  		if err != nil {
   427  			return err
   428  		}
   429  		c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRB, origin.register, result, 0)
   430  		c.assembler.CompileRegisterToRegister(amd64.PXOR, tmp, tmp)
   431  		c.assembler.CompileRegisterToRegister(amd64.PSHUFB, tmp, result)
   432  	case wazeroir.ShapeI16x8:
   433  		result, err = c.allocateRegister(registerTypeVector)
   434  		if err != nil {
   435  			return err
   436  		}
   437  		c.locationStack.markRegisterUsed(result)
   438  		c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRW, origin.register, result, 0)
   439  		c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRW, origin.register, result, 1)
   440  		c.assembler.CompileRegisterToRegisterWithArg(amd64.PSHUFD, result, result, 0)
   441  	case wazeroir.ShapeI32x4:
   442  		result, err = c.allocateRegister(registerTypeVector)
   443  		if err != nil {
   444  			return err
   445  		}
   446  		c.locationStack.markRegisterUsed(result)
   447  		c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRD, origin.register, result, 0)
   448  		c.assembler.CompileRegisterToRegisterWithArg(amd64.PSHUFD, result, result, 0)
   449  	case wazeroir.ShapeI64x2:
   450  		result, err = c.allocateRegister(registerTypeVector)
   451  		if err != nil {
   452  			return err
   453  		}
   454  		c.locationStack.markRegisterUsed(result)
   455  		c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRQ, origin.register, result, 0)
   456  		c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRQ, origin.register, result, 1)
   457  	case wazeroir.ShapeF32x4:
   458  		result = origin.register
   459  		c.assembler.CompileRegisterToRegisterWithArg(amd64.INSERTPS, origin.register, result, 0)
   460  		c.assembler.CompileRegisterToRegisterWithArg(amd64.PSHUFD, result, result, 0)
   461  	case wazeroir.ShapeF64x2:
   462  		result = origin.register
   463  		c.assembler.CompileRegisterToRegister(amd64.MOVQ, origin.register, result)
   464  		c.assembler.CompileRegisterToRegister(amd64.MOVLHPS, origin.register, result)
   465  	}
   466  
   467  	c.locationStack.markRegisterUnused(origin.register)
   468  	c.pushVectorRuntimeValueLocationOnRegister(result)
   469  	return nil
   470  }
   471  
   472  // compileV128Shuffle implements compiler.compileV128Shuffle for amd64.
   473  func (c *amd64Compiler) compileV128Shuffle(o *wazeroir.OperationV128Shuffle) error {
   474  	w := c.locationStack.popV128()
   475  	if err := c.compileEnsureOnRegister(w); err != nil {
   476  		return err
   477  	}
   478  
   479  	v := c.locationStack.popV128()
   480  	if err := c.compileEnsureOnRegister(v); err != nil {
   481  		return err
   482  	}
   483  
   484  	tmp, err := c.allocateRegister(registerTypeVector)
   485  	if err != nil {
   486  		return err
   487  	}
   488  
   489  	consts := [32]byte{}
   490  	for i, lane := range o.Lanes {
   491  		if lane < 16 {
   492  			consts[i+16] = 0x80
   493  			consts[i] = lane
   494  		} else {
   495  			consts[i+16] = lane - 16
   496  			consts[i] = 0x80
   497  		}
   498  	}
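	// As a sketch, with o.Lanes = [0, 16, 2, 18, ...] the two masks become
	//   consts[0:16]  = [0, 0x80, 2, 0x80, ...]  // picks the lanes taken from v, zeroing the rest
	//   consts[16:32] = [0x80, 0, 0x80, 2, ...]  // picks the lanes taken from w, zeroing the rest
	// since PSHUFB writes zero to any destination byte whose index has its MSB set.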
   499  
   500  	err = c.assembler.CompileStaticConstToRegister(amd64.MOVDQU, asm.NewStaticConst(consts[:16]), tmp)
   501  	if err != nil {
   502  		return err
   503  	}
   504  	c.assembler.CompileRegisterToRegister(amd64.PSHUFB, tmp, v.register)
   505  	err = c.assembler.CompileStaticConstToRegister(amd64.MOVDQU, asm.NewStaticConst(consts[16:]), tmp)
   506  	if err != nil {
   507  		return err
   508  	}
   509  	c.assembler.CompileRegisterToRegister(amd64.PSHUFB, tmp, w.register)
   510  	c.assembler.CompileRegisterToRegister(amd64.ORPS, v.register, w.register)
   511  
   512  	c.pushVectorRuntimeValueLocationOnRegister(w.register)
   513  	c.locationStack.markRegisterUnused(v.register)
   514  	return nil
   515  }
   516  
   517  var swizzleConst = [16]byte{
   518  	0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
   519  	0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
   520  }
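// A sketch of why 0x70 works with the PADDUSB+PSHUFB pair below: for an index i in
// [0, 15], i+0x70 stays below 0x80 and keeps the same low four bits, so PSHUFB still
// selects byte i; for i >= 16, the saturating sum is at least 0x80, so the MSB is set
// and PSHUFB writes zero to that lane, matching WebAssembly's swizzle semantics.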
   521  
   522  // compileV128Swizzle implements compiler.compileV128Swizzle for amd64.
   523  func (c *amd64Compiler) compileV128Swizzle(*wazeroir.OperationV128Swizzle) error {
   524  	indexVec := c.locationStack.popV128()
   525  	if err := c.compileEnsureOnRegister(indexVec); err != nil {
   526  		return err
   527  	}
   528  
   529  	baseVec := c.locationStack.popV128()
   530  	if err := c.compileEnsureOnRegister(baseVec); err != nil {
   531  		return err
   532  	}
   533  
   534  	tmp, err := c.allocateRegister(registerTypeVector)
   535  	if err != nil {
   536  		return err
   537  	}
   538  
   539  	err = c.assembler.CompileStaticConstToRegister(amd64.MOVDQU, asm.NewStaticConst(swizzleConst[:]), tmp)
   540  	if err != nil {
   541  		return err
   542  	}
   543  
   544  	c.assembler.CompileRegisterToRegister(amd64.PADDUSB, tmp, indexVec.register)
   545  	c.assembler.CompileRegisterToRegister(amd64.PSHUFB, indexVec.register, baseVec.register)
   546  
   547  	c.pushVectorRuntimeValueLocationOnRegister(baseVec.register)
   548  	c.locationStack.markRegisterUnused(indexVec.register)
   549  	return nil
   550  }
   551  
   552  // compileV128AnyTrue implements compiler.compileV128AnyTrue for amd64.
   553  func (c *amd64Compiler) compileV128AnyTrue(*wazeroir.OperationV128AnyTrue) error {
   554  	v := c.locationStack.popV128()
   555  	if err := c.compileEnsureOnRegister(v); err != nil {
   556  		return err
   557  	}
   558  
   559  	c.assembler.CompileRegisterToRegister(amd64.PTEST, v.register, v.register)
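	// PTEST sets ZF iff (v AND v) == 0, so the NE (ZF clear) condition pushed below is
	// true exactly when some bit of v is set.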
   560  
   561  	c.locationStack.pushRuntimeValueLocationOnConditionalRegister(amd64.ConditionalRegisterStateNE)
   562  	c.locationStack.markRegisterUnused(v.register)
   563  	return nil
   564  }
   565  
   566  // compileV128AllTrue implements compiler.compileV128AllTrue for amd64.
   567  func (c *amd64Compiler) compileV128AllTrue(o *wazeroir.OperationV128AllTrue) error {
   568  	v := c.locationStack.popV128()
   569  	if err := c.compileEnsureOnRegister(v); err != nil {
   570  		return err
   571  	}
   572  
   573  	tmp, err := c.allocateRegister(registerTypeVector)
   574  	if err != nil {
   575  		return err
   576  	}
   577  
   578  	var cmpInst asm.Instruction
   579  	switch o.Shape {
   580  	case wazeroir.ShapeI8x16:
   581  		cmpInst = amd64.PCMPEQB
   582  	case wazeroir.ShapeI16x8:
   583  		cmpInst = amd64.PCMPEQW
   584  	case wazeroir.ShapeI32x4:
   585  		cmpInst = amd64.PCMPEQD
   586  	case wazeroir.ShapeI64x2:
   587  		cmpInst = amd64.PCMPEQQ
   588  	}
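	// tmp is zeroed and then compared lane-wise against v: a lane of tmp becomes all
	// ones exactly when the corresponding lane of v is zero. PTEST then sets ZF iff tmp
	// is entirely zero, i.e. iff every lane of v was non-zero, which is the all_true result.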
   589  
   590  	c.assembler.CompileRegisterToRegister(amd64.PXOR, tmp, tmp)
   591  	c.assembler.CompileRegisterToRegister(cmpInst, v.register, tmp)
   592  	c.assembler.CompileRegisterToRegister(amd64.PTEST, tmp, tmp)
   593  	c.locationStack.markRegisterUnused(v.register, tmp)
   594  	c.locationStack.pushRuntimeValueLocationOnConditionalRegister(amd64.ConditionalRegisterStateE)
   595  	return nil
   596  }
   597  
   598  // compileV128BitMask implements compiler.compileV128BitMask for amd64.
   599  func (c *amd64Compiler) compileV128BitMask(o *wazeroir.OperationV128BitMask) error {
   600  	v := c.locationStack.popV128()
   601  	if err := c.compileEnsureOnRegister(v); err != nil {
   602  		return err
   603  	}
   604  
   605  	result, err := c.allocateRegister(registerTypeGeneralPurpose)
   606  	if err != nil {
   607  		return err
   608  	}
   609  
   610  	switch o.Shape {
   611  	case wazeroir.ShapeI8x16:
   612  		c.assembler.CompileRegisterToRegister(amd64.PMOVMSKB, v.register, result)
   613  	case wazeroir.ShapeI16x8:
   614  		// When we have:
    615  		// 	R1 = [R1(w1), R1(w2), R1(w3), R1(w4), R1(w5), R1(w6), R1(w7), R1(w8)]
    616  		// 	R2 = [R2(w1), R2(w2), R2(w3), R2(w4), R2(w5), R2(w6), R2(w7), R2(w8)]
   617  		//	where RX(wn) is n-th signed word (16-bit) of RX register,
   618  		//
   619  		// "PACKSSWB R1, R2" produces
   620  		//  R1 = [
   621  		// 		byte_sat(R1(w1)), byte_sat(R1(w2)), byte_sat(R1(w3)), byte_sat(R1(w4)),
   622  		// 		byte_sat(R1(w5)), byte_sat(R1(w6)), byte_sat(R1(w7)), byte_sat(R1(w8)),
   623  		// 		byte_sat(R2(w1)), byte_sat(R2(w2)), byte_sat(R2(w3)), byte_sat(R2(w4)),
   624  		// 		byte_sat(R2(w5)), byte_sat(R2(w6)), byte_sat(R2(w7)), byte_sat(R2(w8)),
   625  		//  ]
   626  		//  where R1 is the destination register, and
    627  		// 	byte_sat(w) = int8(w) if w fits in signed 8-bit,
    628  		//                0x80 (-128) if w is less than -128,
    629  		//                0x7F (127) if w is greater than 127.
   630  		//
   631  		// See https://www.felixcloutier.com/x86/packsswb:packssdw for detail.
   632  		//
    633  		// Therefore, after PMOVMSKB below, the result register has both the i-th and (i+8)-th bits set iff the i-th lane is negative (for i in 0..7).
   634  		c.assembler.CompileRegisterToRegister(amd64.PACKSSWB, v.register, v.register)
   635  		c.assembler.CompileRegisterToRegister(amd64.PMOVMSKB, v.register, result)
    636  		// The mask is duplicated in the upper and lower 8 bits, so shift right by 8 to keep a single 8-bit mask.
   637  		c.assembler.CompileConstToRegister(amd64.SHRQ, 8, result)
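		// As a sketch: if only lane 3 is negative, PMOVMSKB yields 0b00001000_00001000
		// and the shift above leaves the single 8-bit mask 0b00001000.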
   638  	case wazeroir.ShapeI32x4:
   639  		c.assembler.CompileRegisterToRegister(amd64.MOVMSKPS, v.register, result)
   640  	case wazeroir.ShapeI64x2:
   641  		c.assembler.CompileRegisterToRegister(amd64.MOVMSKPD, v.register, result)
   642  	}
   643  
   644  	c.locationStack.markRegisterUnused(v.register)
   645  	c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI32)
   646  	return nil
   647  }
   648  
   649  // compileV128And implements compiler.compileV128And for amd64.
   650  func (c *amd64Compiler) compileV128And(*wazeroir.OperationV128And) error {
   651  	x2 := c.locationStack.popV128()
   652  	if err := c.compileEnsureOnRegister(x2); err != nil {
   653  		return err
   654  	}
   655  
   656  	x1 := c.locationStack.popV128()
   657  	if err := c.compileEnsureOnRegister(x1); err != nil {
   658  		return err
   659  	}
   660  
   661  	c.assembler.CompileRegisterToRegister(amd64.PAND, x2.register, x1.register)
   662  
   663  	c.locationStack.markRegisterUnused(x2.register)
   664  	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
   665  	return nil
   666  }
   667  
   668  // compileV128Not implements compiler.compileV128Not for amd64.
   669  func (c *amd64Compiler) compileV128Not(*wazeroir.OperationV128Not) error {
   670  	v := c.locationStack.popV128()
   671  	if err := c.compileEnsureOnRegister(v); err != nil {
   672  		return err
   673  	}
   674  
   675  	tmp, err := c.allocateRegister(registerTypeVector)
   676  	if err != nil {
   677  		return err
   678  	}
   679  
   680  	// Set all bits on tmp register.
   681  	c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, tmp, tmp)
    682  	// Then XOR with tmp to flip all bits of v.register.
   683  	c.assembler.CompileRegisterToRegister(amd64.PXOR, tmp, v.register)
   684  	c.pushVectorRuntimeValueLocationOnRegister(v.register)
   685  	return nil
   686  }
   687  
   688  // compileV128Or implements compiler.compileV128Or for amd64.
   689  func (c *amd64Compiler) compileV128Or(*wazeroir.OperationV128Or) error {
   690  	x2 := c.locationStack.popV128()
   691  	if err := c.compileEnsureOnRegister(x2); err != nil {
   692  		return err
   693  	}
   694  
   695  	x1 := c.locationStack.popV128()
   696  	if err := c.compileEnsureOnRegister(x1); err != nil {
   697  		return err
   698  	}
   699  
   700  	c.assembler.CompileRegisterToRegister(amd64.POR, x2.register, x1.register)
   701  
   702  	c.locationStack.markRegisterUnused(x2.register)
   703  	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
   704  	return nil
   705  }
   706  
   707  // compileV128Xor implements compiler.compileV128Xor for amd64.
   708  func (c *amd64Compiler) compileV128Xor(*wazeroir.OperationV128Xor) error {
   709  	x2 := c.locationStack.popV128()
   710  	if err := c.compileEnsureOnRegister(x2); err != nil {
   711  		return err
   712  	}
   713  
   714  	x1 := c.locationStack.popV128()
   715  	if err := c.compileEnsureOnRegister(x1); err != nil {
   716  		return err
   717  	}
   718  
   719  	c.assembler.CompileRegisterToRegister(amd64.PXOR, x2.register, x1.register)
   720  
   721  	c.locationStack.markRegisterUnused(x2.register)
   722  	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
   723  	return nil
   724  }
   725  
   726  // compileV128Bitselect implements compiler.compileV128Bitselect for amd64.
   727  func (c *amd64Compiler) compileV128Bitselect(*wazeroir.OperationV128Bitselect) error {
   728  	selector := c.locationStack.popV128()
   729  	if err := c.compileEnsureOnRegister(selector); err != nil {
   730  		return err
   731  	}
   732  
   733  	x2 := c.locationStack.popV128()
   734  	if err := c.compileEnsureOnRegister(x2); err != nil {
   735  		return err
   736  	}
   737  
   738  	x1 := c.locationStack.popV128()
   739  	if err := c.compileEnsureOnRegister(x1); err != nil {
   740  		return err
   741  	}
   742  
   743  	// The following logic is equivalent to v128.or(v128.and(v1, selector), v128.and(v2, v128.not(selector)))
   744  	// See https://github.com/WebAssembly/spec/blob/wg-2.0.draft1/proposals/simd/SIMD.md#bitwise-select
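	// In terms of the registers below (note that PANDN computes dst = ^dst & src):
	//   x1       = x1 & selector   // bits taken from x1 where the selector is 1
	//   selector = ^selector & x2  // bits taken from x2 where the selector is 0
	//   x1       = x1 | selector   // combined result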
   745  	c.assembler.CompileRegisterToRegister(amd64.PAND, selector.register, x1.register)
   746  	c.assembler.CompileRegisterToRegister(amd64.PANDN, x2.register, selector.register)
   747  	c.assembler.CompileRegisterToRegister(amd64.POR, selector.register, x1.register)
   748  
   749  	c.locationStack.markRegisterUnused(x2.register, selector.register)
   750  	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
   751  	return nil
   752  }
   753  
   754  // compileV128AndNot implements compiler.compileV128AndNot for amd64.
   755  func (c *amd64Compiler) compileV128AndNot(*wazeroir.OperationV128AndNot) error {
   756  	x2 := c.locationStack.popV128()
   757  	if err := c.compileEnsureOnRegister(x2); err != nil {
   758  		return err
   759  	}
   760  
   761  	x1 := c.locationStack.popV128()
   762  	if err := c.compileEnsureOnRegister(x1); err != nil {
   763  		return err
   764  	}
   765  
   766  	c.assembler.CompileRegisterToRegister(amd64.PANDN, x1.register, x2.register)
   767  
   768  	c.locationStack.markRegisterUnused(x1.register)
   769  	c.pushVectorRuntimeValueLocationOnRegister(x2.register)
   770  	return nil
   771  }
   772  
   773  // compileV128Shr implements compiler.compileV128Shr for amd64.
   774  func (c *amd64Compiler) compileV128Shr(o *wazeroir.OperationV128Shr) error {
   775  	// https://stackoverflow.com/questions/35002937/sse-simd-shift-with-one-byte-element-size-granularity
   776  	if o.Shape == wazeroir.ShapeI8x16 {
   777  		return c.compileV128ShrI8x16Impl(o.Signed)
   778  	} else if o.Shape == wazeroir.ShapeI64x2 && o.Signed {
   779  		return c.compileV128ShrI64x2SignedImpl()
   780  	} else {
   781  		return c.compileV128ShrImpl(o)
   782  	}
   783  }
   784  
   785  // compileV128ShrImpl implements shift right instructions except for i8x16 (logical/arithmetic) and i64x2 (arithmetic).
   786  func (c *amd64Compiler) compileV128ShrImpl(o *wazeroir.OperationV128Shr) error {
   787  	s := c.locationStack.pop()
   788  	if err := c.compileEnsureOnRegister(s); err != nil {
   789  		return err
   790  	}
   791  
   792  	x1 := c.locationStack.popV128()
   793  	if err := c.compileEnsureOnRegister(x1); err != nil {
   794  		return err
   795  	}
   796  
   797  	vecTmp, err := c.allocateRegister(registerTypeVector)
   798  	if err != nil {
   799  		return err
   800  	}
   801  
    802  	var moduloConst int64
   803  	var shift asm.Instruction
   804  	switch o.Shape {
   805  	case wazeroir.ShapeI16x8:
    806  		moduloConst = 0xf // modulo 16.
   807  		if o.Signed {
   808  			shift = amd64.PSRAW
   809  		} else {
   810  			shift = amd64.PSRLW
   811  		}
   812  	case wazeroir.ShapeI32x4:
    813  		moduloConst = 0x1f // modulo 32.
   814  		if o.Signed {
   815  			shift = amd64.PSRAD
   816  		} else {
   817  			shift = amd64.PSRLD
   818  		}
   819  	case wazeroir.ShapeI64x2:
    820  		moduloConst = 0x3f // modulo 64.
   821  		shift = amd64.PSRLQ
   822  	}
   823  
   824  	gpShiftAmount := s.register
    825  	c.assembler.CompileConstToRegister(amd64.ANDQ, moduloConst, gpShiftAmount)
   826  	c.assembler.CompileRegisterToRegister(amd64.MOVL, gpShiftAmount, vecTmp)
   827  	c.assembler.CompileRegisterToRegister(shift, vecTmp, x1.register)
   828  
   829  	c.locationStack.markRegisterUnused(gpShiftAmount)
   830  	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
   831  	return nil
   832  }
   833  
   834  // compileV128ShrI64x2SignedImpl implements compiler.compileV128Shr for i64x2 signed (arithmetic) shift.
    835  // A packed 64-bit arithmetic right shift (VPSRAQ) requires AVX-512, so we emulate it with general purpose instructions. https://www.felixcloutier.com/x86/psraw:psrad:psraq
   836  func (c *amd64Compiler) compileV128ShrI64x2SignedImpl() error {
   837  	const shiftCountRegister = amd64.RegCX
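	// The shift amount must live in CX because the variable-count SAR instruction used
	// below only accepts CL as its count operand.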
   838  
   839  	s := c.locationStack.pop()
   840  	if s.register != shiftCountRegister {
   841  		// If another value lives on the CX register, we release it to the stack.
   842  		c.onValueReleaseRegisterToStack(shiftCountRegister)
   843  		if s.onStack() {
   844  			s.setRegister(shiftCountRegister)
   845  			c.compileLoadValueOnStackToRegister(s)
   846  		} else if s.onConditionalRegister() {
   847  			c.compileMoveConditionalToGeneralPurposeRegister(s, shiftCountRegister)
   848  		} else { // already on register.
   849  			old := s.register
   850  			c.assembler.CompileRegisterToRegister(amd64.MOVL, old, shiftCountRegister)
   851  			s.setRegister(shiftCountRegister)
   852  			c.locationStack.markRegisterUnused(old)
   853  		}
   854  	}
   855  
   856  	c.locationStack.markRegisterUsed(shiftCountRegister)
   857  	tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
   858  	if err != nil {
   859  		return err
   860  	}
   861  
   862  	x1 := c.locationStack.popV128()
   863  	if err := c.compileEnsureOnRegister(x1); err != nil {
   864  		return err
   865  	}
   866  
    867  	// Extract each lane into tmp, execute SAR (arithmetic right shift) on tmp, and write it back to the lane.
   868  	c.assembler.CompileRegisterToRegisterWithArg(amd64.PEXTRQ, x1.register, tmp, 0)
   869  	c.assembler.CompileRegisterToRegister(amd64.SARQ, shiftCountRegister, tmp)
   870  	c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRQ, tmp, x1.register, 0)
   871  	c.assembler.CompileRegisterToRegisterWithArg(amd64.PEXTRQ, x1.register, tmp, 1)
   872  	c.assembler.CompileRegisterToRegister(amd64.SARQ, shiftCountRegister, tmp)
   873  	c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRQ, tmp, x1.register, 1)
   874  
   875  	c.locationStack.markRegisterUnused(shiftCountRegister)
   876  	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
   877  	return nil
   878  }
   879  
    880  // i8x16LogicalSHRMaskTable is necessary for emulating the non-existent packed byte logical right shift on amd64.
    881  // The mask is applied after performing a packed word shift on the value to clear out the unnecessary bits.
    882  var i8x16LogicalSHRMaskTable = [8 * 16]byte{ // (the number of possible shift amounts 0, 1, ..., 7) * 16 bytes.
   883  	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // for 0 shift
   884  	0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // for 1 shift
   885  	0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, // for 2 shift
   886  	0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, // for 3 shift
   887  	0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, // for 4 shift
   888  	0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, // for 5 shift
   889  	0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, // for 6 shift
   890  	0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, // for 7 shift
   891  }
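// As a sketch: to logically shift every byte right by 3, compileV128ShrI8x16Impl below
// performs a 16-bit PSRLW by 3 (letting bits of each word's high byte leak into the top
// of its low byte) and then ANDs with the "for 3 shift" row above (0x1f per byte) to
// clear the leaked bits.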
   892  
    893  // compileV128ShrI8x16Impl implements compiler.compileV128Shr for i8x16 logical (unsigned) and arithmetic (signed) shifts.
   894  // amd64 doesn't have packed byte shifts, so we need this special casing.
   895  // See https://stackoverflow.com/questions/35002937/sse-simd-shift-with-one-byte-element-size-granularity
   896  func (c *amd64Compiler) compileV128ShrI8x16Impl(signed bool) error {
   897  	s := c.locationStack.pop()
   898  	if err := c.compileEnsureOnRegister(s); err != nil {
   899  		return err
   900  	}
   901  
   902  	v := c.locationStack.popV128()
   903  	if err := c.compileEnsureOnRegister(v); err != nil {
   904  		return err
   905  	}
   906  
   907  	vecTmp, err := c.allocateRegister(registerTypeVector)
   908  	if err != nil {
   909  		return err
   910  	}
   911  
   912  	gpShiftAmount := s.register
   913  	c.assembler.CompileConstToRegister(amd64.ANDQ, 0x7, gpShiftAmount) // mod 8.
   914  
   915  	if signed {
   916  		c.locationStack.markRegisterUsed(vecTmp)
   917  		vecTmp2, err := c.allocateRegister(registerTypeVector)
   918  		if err != nil {
   919  			return err
   920  		}
   921  
   922  		vreg := v.register
   923  
   924  		// Copy the value from v.register to vecTmp.
   925  		c.assembler.CompileRegisterToRegister(amd64.MOVDQA, vreg, vecTmp)
   926  
   927  		// Assuming that we have
   928  		//  vreg   = [b1, ..., b16]
   929  		//  vecTmp = [b1, ..., b16]
   930  		// at this point, then we use PUNPCKLBW and PUNPCKHBW to produce:
   931  		//  vreg   = [b1, b1, b2, b2, ..., b8, b8]
   932  		//  vecTmp = [b9, b9, b10, b10, ..., b16, b16]
   933  		c.assembler.CompileRegisterToRegister(amd64.PUNPCKLBW, vreg, vreg)
   934  		c.assembler.CompileRegisterToRegister(amd64.PUNPCKHBW, vecTmp, vecTmp)
   935  
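		// Each byte now occupies both halves of a 16-bit word ([b, b]), so arithmetically
		// shifting that word right by s+8 leaves sign-extended (b >> s) in its low byte.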
    936  		// Add 8 to the shift amount, and then move it to vecTmp2.
   937  		c.assembler.CompileConstToRegister(amd64.ADDQ, 0x8, gpShiftAmount)
   938  		c.assembler.CompileRegisterToRegister(amd64.MOVL, gpShiftAmount, vecTmp2)
   939  
   940  		// Perform the word packed arithmetic right shifts on vreg and vecTmp.
   941  		// This changes these two registers as:
   942  		//  vreg   = [xxx, b1 >> s, xxx, b2 >> s, ..., xxx, b8 >> s]
   943  		//  vecTmp = [xxx, b9 >> s, xxx, b10 >> s, ..., xxx, b16 >> s]
    944  		// where xxx is 0xff or 0x00 depending on each byte's sign, and ">>" is the arithmetic shift on a byte.
   945  		c.assembler.CompileRegisterToRegister(amd64.PSRAW, vecTmp2, vreg)
   946  		c.assembler.CompileRegisterToRegister(amd64.PSRAW, vecTmp2, vecTmp)
   947  
   948  		// Finally, we can get the result by packing these two word vectors.
   949  		c.assembler.CompileRegisterToRegister(amd64.PACKSSWB, vecTmp, vreg)
   950  
   951  		c.locationStack.markRegisterUnused(gpShiftAmount, vecTmp)
   952  		c.pushVectorRuntimeValueLocationOnRegister(vreg)
   953  	} else {
   954  		c.assembler.CompileRegisterToRegister(amd64.MOVL, gpShiftAmount, vecTmp)
    955  		// amd64 doesn't have packed byte shifts, so we use a packed word shift here, and then mask out
    956  		// the unnecessary bits below.
   957  		c.assembler.CompileRegisterToRegister(amd64.PSRLW, vecTmp, v.register)
   958  
   959  		gpTmp, err := c.allocateRegister(registerTypeGeneralPurpose)
   960  		if err != nil {
   961  			return err
   962  		}
   963  
   964  		// Read the initial address of the mask table into gpTmp register.
   965  		err = c.assembler.CompileStaticConstToRegister(amd64.LEAQ, asm.NewStaticConst(i8x16LogicalSHRMaskTable[:]), gpTmp)
   966  		if err != nil {
   967  			return err
   968  		}
   969  
   970  		// We have to get the mask according to the shift amount, so we first have to do
   971  		// gpShiftAmount << 4 = gpShiftAmount*16 to get the initial offset of the mask (16 is the size of each mask in bytes).
   972  		c.assembler.CompileConstToRegister(amd64.SHLQ, 4, gpShiftAmount)
   973  
   974  		// Now ready to read the content of the mask into the vecTmp.
   975  		c.assembler.CompileMemoryWithIndexToRegister(amd64.MOVDQU,
   976  			gpTmp, 0, gpShiftAmount, 1,
   977  			vecTmp,
   978  		)
   979  
    980  		// Finally, clear out the unnecessary bits.
   981  		c.assembler.CompileRegisterToRegister(amd64.PAND, vecTmp, v.register)
   982  
   983  		c.locationStack.markRegisterUnused(gpShiftAmount)
   984  		c.pushVectorRuntimeValueLocationOnRegister(v.register)
   985  	}
   986  	return nil
   987  }
   988  
    989  // i8x16SHLMaskTable is necessary for emulating the non-existent packed byte left shift on amd64.
    990  // The mask is applied after performing a packed word shift on the value to clear out the unnecessary bits.
    991  var i8x16SHLMaskTable = [8 * 16]byte{ // (the number of possible shift amounts 0, 1, ..., 7) * 16 bytes.
   992  	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // for 0 shift
   993  	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, // for 1 shift
   994  	0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, // for 2 shift
   995  	0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, // for 3 shift
   996  	0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // for 4 shift
   997  	0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, // for 5 shift
   998  	0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, // for 6 shift
   999  	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, // for 7 shift
  1000  }
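// As a sketch: to shift every byte left by 2, compileV128Shl below performs a 16-bit
// PSLLW by 2 (letting the top bits of each word's low byte spill into its high byte)
// and then ANDs with the "for 2 shift" row above (0xfc per byte) to drop the bits that
// should have been shifted out of each byte.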
  1001  
  1002  // compileV128Shl implements compiler.compileV128Shl for amd64.
  1003  func (c *amd64Compiler) compileV128Shl(o *wazeroir.OperationV128Shl) error {
  1004  	s := c.locationStack.pop()
  1005  	if err := c.compileEnsureOnRegister(s); err != nil {
  1006  		return err
  1007  	}
  1008  
  1009  	x1 := c.locationStack.popV128()
  1010  	if err := c.compileEnsureOnRegister(x1); err != nil {
  1011  		return err
  1012  	}
  1013  
  1014  	vecTmp, err := c.allocateRegister(registerTypeVector)
  1015  	if err != nil {
  1016  		return err
  1017  	}
  1018  
  1019  	var modulo int64
  1020  	var shift asm.Instruction
  1021  	switch o.Shape {
  1022  	case wazeroir.ShapeI8x16:
  1023  		modulo = 0x7 // modulo 8.
   1024  		// x86 doesn't have packed byte shifts, so we use PSLLW and mask out the redundant bits.
  1025  		// See https://stackoverflow.com/questions/35002937/sse-simd-shift-with-one-byte-element-size-granularity
  1026  		shift = amd64.PSLLW
  1027  	case wazeroir.ShapeI16x8:
  1028  		modulo = 0xf // modulo 16.
  1029  		shift = amd64.PSLLW
  1030  	case wazeroir.ShapeI32x4:
  1031  		modulo = 0x1f // modulo 32.
  1032  		shift = amd64.PSLLD
  1033  	case wazeroir.ShapeI64x2:
  1034  		modulo = 0x3f // modulo 64.
  1035  		shift = amd64.PSLLQ
  1036  	}
  1037  
  1038  	gpShiftAmount := s.register
  1039  	c.assembler.CompileConstToRegister(amd64.ANDQ, modulo, gpShiftAmount)
  1040  	c.assembler.CompileRegisterToRegister(amd64.MOVL, gpShiftAmount, vecTmp)
  1041  	c.assembler.CompileRegisterToRegister(shift, vecTmp, x1.register)
  1042  
  1043  	if o.Shape == wazeroir.ShapeI8x16 {
  1044  		gpTmp, err := c.allocateRegister(registerTypeGeneralPurpose)
  1045  		if err != nil {
  1046  			return err
  1047  		}
  1048  
  1049  		// Read the initial address of the mask table into gpTmp register.
  1050  		err = c.assembler.CompileStaticConstToRegister(amd64.LEAQ, asm.NewStaticConst(i8x16SHLMaskTable[:]), gpTmp)
  1051  		if err != nil {
  1052  			return err
  1053  		}
  1054  
  1055  		// We have to get the mask according to the shift amount, so we first have to do
  1056  		// gpShiftAmount << 4 = gpShiftAmount*16 to get the initial offset of the mask (16 is the size of each mask in bytes).
  1057  		c.assembler.CompileConstToRegister(amd64.SHLQ, 4, gpShiftAmount)
  1058  
  1059  		// Now ready to read the content of the mask into the vecTmp.
  1060  		c.assembler.CompileMemoryWithIndexToRegister(amd64.MOVDQU,
  1061  			gpTmp, 0, gpShiftAmount, 1,
  1062  			vecTmp,
  1063  		)
  1064  
   1065  		// Finally, clear out the unnecessary bits.
  1066  		c.assembler.CompileRegisterToRegister(amd64.PAND, vecTmp, x1.register)
  1067  	}
  1068  
  1069  	c.locationStack.markRegisterUnused(gpShiftAmount)
  1070  	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
  1071  	return nil
  1072  }
  1073  
  1074  // compileV128Cmp implements compiler.compileV128Cmp for amd64.
  1075  func (c *amd64Compiler) compileV128Cmp(o *wazeroir.OperationV128Cmp) error {
  1076  	x2 := c.locationStack.popV128()
  1077  	if err := c.compileEnsureOnRegister(x2); err != nil {
  1078  		return err
  1079  	}
  1080  
  1081  	x1 := c.locationStack.popV128()
  1082  	if err := c.compileEnsureOnRegister(x1); err != nil {
  1083  		return err
  1084  	}
  1085  
  1086  	const (
  1087  		// See https://www.felixcloutier.com/x86/cmppd and https://www.felixcloutier.com/x86/cmpps
  1088  		floatEqualArg           = 0
  1089  		floatLessThanArg        = 1
  1090  		floatLessThanOrEqualArg = 2
   1091  		floatNotEqualArg        = 4
  1092  	)
  1093  
  1094  	x1Reg, x2Reg, result := x1.register, x2.register, asm.NilRegister
  1095  	switch o.Type {
  1096  	case wazeroir.V128CmpTypeF32x4Eq:
  1097  		c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPS, x2Reg, x1Reg, floatEqualArg)
  1098  		result = x1Reg
  1099  	case wazeroir.V128CmpTypeF32x4Ne:
   1100  		c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPS, x2Reg, x1Reg, floatNotEqualArg)
  1101  		result = x1Reg
  1102  	case wazeroir.V128CmpTypeF32x4Lt:
  1103  		c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPS, x2Reg, x1Reg, floatLessThanArg)
  1104  		result = x1Reg
  1105  	case wazeroir.V128CmpTypeF32x4Gt:
   1106  		// Without AVX, there's no float Gt instruction, so we swap the operands and use Lt instead.
  1107  		c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPS, x1Reg, x2Reg, floatLessThanArg)
  1108  		result = x2Reg
  1109  	case wazeroir.V128CmpTypeF32x4Le:
  1110  		c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPS, x2Reg, x1Reg, floatLessThanOrEqualArg)
  1111  		result = x1Reg
  1112  	case wazeroir.V128CmpTypeF32x4Ge:
   1113  		// Without AVX, there's no float Ge instruction, so we swap the operands and use Le instead.
  1114  		c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPS, x1Reg, x2Reg, floatLessThanOrEqualArg)
  1115  		result = x2Reg
  1116  	case wazeroir.V128CmpTypeF64x2Eq:
  1117  		c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPD, x2Reg, x1Reg, floatEqualArg)
  1118  		result = x1Reg
  1119  	case wazeroir.V128CmpTypeF64x2Ne:
   1120  		c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPD, x2Reg, x1Reg, floatNotEqualArg)
  1121  		result = x1Reg
  1122  	case wazeroir.V128CmpTypeF64x2Lt:
  1123  		c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPD, x2Reg, x1Reg, floatLessThanArg)
  1124  		result = x1Reg
  1125  	case wazeroir.V128CmpTypeF64x2Gt:
   1126  		// Without AVX, there's no float Gt instruction, so we swap the operands and use Lt instead.
  1127  		c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPD, x1Reg, x2Reg, floatLessThanArg)
  1128  		result = x2Reg
  1129  	case wazeroir.V128CmpTypeF64x2Le:
  1130  		c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPD, x2Reg, x1Reg, floatLessThanOrEqualArg)
  1131  		result = x1Reg
  1132  	case wazeroir.V128CmpTypeF64x2Ge:
   1133  		// Without AVX, there's no float Ge instruction, so we swap the operands and use Le instead.
  1134  		c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPD, x1Reg, x2Reg, floatLessThanOrEqualArg)
  1135  		result = x2Reg
  1136  	case wazeroir.V128CmpTypeI8x16Eq:
  1137  		c.assembler.CompileRegisterToRegister(amd64.PCMPEQB, x2Reg, x1Reg)
  1138  		result = x1Reg
  1139  	case wazeroir.V128CmpTypeI8x16Ne:
  1140  		c.assembler.CompileRegisterToRegister(amd64.PCMPEQB, x2Reg, x1Reg)
  1141  		// Set all bits on x2Reg register.
  1142  		c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, x2Reg, x2Reg)
   1143  		// Flip the bits on x1Reg register.
  1144  		c.assembler.CompileRegisterToRegister(amd64.PXOR, x2Reg, x1Reg)
  1145  		result = x1Reg
  1146  	case wazeroir.V128CmpTypeI8x16LtS:
  1147  		c.assembler.CompileRegisterToRegister(amd64.PCMPGTB, x1Reg, x2Reg)
  1148  		result = x2Reg
  1149  	case wazeroir.V128CmpTypeI8x16LtU, wazeroir.V128CmpTypeI8x16GtU:
   1150  		// Take the unsigned min/max of each byte of x1 and x2 into x1Reg.
  1151  		if o.Type == wazeroir.V128CmpTypeI8x16LtU {
  1152  			c.assembler.CompileRegisterToRegister(amd64.PMINUB, x2Reg, x1Reg)
  1153  		} else {
  1154  			c.assembler.CompileRegisterToRegister(amd64.PMAXUB, x2Reg, x1Reg)
  1155  		}
  1156  		c.assembler.CompileRegisterToRegister(amd64.PCMPEQB, x2Reg, x1Reg)
  1157  		// Set all bits on x2Reg register.
  1158  		c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, x2Reg, x2Reg)
   1159  		// Flip the bits on x1Reg register.
  1160  		c.assembler.CompileRegisterToRegister(amd64.PXOR, x2Reg, x1Reg)
  1161  		result = x1Reg
  1162  	case wazeroir.V128CmpTypeI8x16GtS:
  1163  		c.assembler.CompileRegisterToRegister(amd64.PCMPGTB, x2Reg, x1Reg)
  1164  		result = x1Reg
  1165  	case wazeroir.V128CmpTypeI8x16LeS, wazeroir.V128CmpTypeI8x16LeU:
  1166  		tmp, err := c.allocateRegister(registerTypeVector)
  1167  		if err != nil {
  1168  			return err
  1169  		}
   1170  		// Copy the value of x1Reg to tmp.
  1171  		c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x1Reg, tmp)
  1172  		if o.Type == wazeroir.V128CmpTypeI8x16LeS {
  1173  			c.assembler.CompileRegisterToRegister(amd64.PMINSB, x2Reg, tmp)
  1174  		} else {
  1175  			c.assembler.CompileRegisterToRegister(amd64.PMINUB, x2Reg, tmp)
  1176  		}
  1177  		c.assembler.CompileRegisterToRegister(amd64.PCMPEQB, tmp, x1Reg)
  1178  		result = x1Reg
  1179  	case wazeroir.V128CmpTypeI8x16GeS, wazeroir.V128CmpTypeI8x16GeU:
  1180  		tmp, err := c.allocateRegister(registerTypeVector)
  1181  		if err != nil {
  1182  			return err
  1183  		}
  1184  		c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x1Reg, tmp)
  1185  		if o.Type == wazeroir.V128CmpTypeI8x16GeS {
  1186  			c.assembler.CompileRegisterToRegister(amd64.PMAXSB, x2Reg, tmp)
  1187  		} else {
  1188  			c.assembler.CompileRegisterToRegister(amd64.PMAXUB, x2Reg, tmp)
  1189  		}
  1190  		c.assembler.CompileRegisterToRegister(amd64.PCMPEQB, tmp, x1Reg)
  1191  		result = x1Reg
  1192  	case wazeroir.V128CmpTypeI16x8Eq:
  1193  		c.assembler.CompileRegisterToRegister(amd64.PCMPEQW, x2Reg, x1Reg)
  1194  		result = x1Reg
  1195  	case wazeroir.V128CmpTypeI16x8Ne:
  1196  		c.assembler.CompileRegisterToRegister(amd64.PCMPEQW, x2Reg, x1Reg)
  1197  		// Set all bits on x2Reg register.
  1198  		c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, x2Reg, x2Reg)
   1199  		// Flip the bits on x1Reg register.
  1200  		c.assembler.CompileRegisterToRegister(amd64.PXOR, x2Reg, x1Reg)
  1201  		result = x1Reg
  1202  	case wazeroir.V128CmpTypeI16x8LtS:
  1203  		c.assembler.CompileRegisterToRegister(amd64.PCMPGTW, x1Reg, x2Reg)
  1204  		result = x2Reg
  1205  	case wazeroir.V128CmpTypeI16x8LtU, wazeroir.V128CmpTypeI16x8GtU:
   1206  		// Take the unsigned min/max of each 16-bit lane of x1 and x2 into x1Reg.
  1207  		if o.Type == wazeroir.V128CmpTypeI16x8LtU {
  1208  			c.assembler.CompileRegisterToRegister(amd64.PMINUW, x2Reg, x1Reg)
  1209  		} else {
  1210  			c.assembler.CompileRegisterToRegister(amd64.PMAXUW, x2Reg, x1Reg)
  1211  		}
  1212  		c.assembler.CompileRegisterToRegister(amd64.PCMPEQW, x2Reg, x1Reg)
  1213  		// Set all bits on x2Reg register.
  1214  		c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, x2Reg, x2Reg)
   1215  		// Flip the bits on x1Reg register.
  1216  		c.assembler.CompileRegisterToRegister(amd64.PXOR, x2Reg, x1Reg)
  1217  		result = x1Reg
  1218  	case wazeroir.V128CmpTypeI16x8GtS:
  1219  		c.assembler.CompileRegisterToRegister(amd64.PCMPGTW, x2Reg, x1Reg)
  1220  		result = x1Reg
  1221  	case wazeroir.V128CmpTypeI16x8LeS, wazeroir.V128CmpTypeI16x8LeU:
  1222  		tmp, err := c.allocateRegister(registerTypeVector)
  1223  		if err != nil {
  1224  			return err
  1225  		}
   1226  		// Copy the value of x1Reg to tmp.
  1227  		c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x1Reg, tmp)
  1228  		if o.Type == wazeroir.V128CmpTypeI16x8LeS {
  1229  			c.assembler.CompileRegisterToRegister(amd64.PMINSW, x2Reg, tmp)
  1230  		} else {
  1231  			c.assembler.CompileRegisterToRegister(amd64.PMINUW, x2Reg, tmp)
  1232  		}
  1233  		c.assembler.CompileRegisterToRegister(amd64.PCMPEQW, tmp, x1Reg)
  1234  		result = x1Reg
  1235  	case wazeroir.V128CmpTypeI16x8GeS, wazeroir.V128CmpTypeI16x8GeU:
  1236  		tmp, err := c.allocateRegister(registerTypeVector)
  1237  		if err != nil {
  1238  			return err
  1239  		}
  1240  		c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x1Reg, tmp)
  1241  		if o.Type == wazeroir.V128CmpTypeI16x8GeS {
  1242  			c.assembler.CompileRegisterToRegister(amd64.PMAXSW, x2Reg, tmp)
  1243  		} else {
  1244  			c.assembler.CompileRegisterToRegister(amd64.PMAXUW, x2Reg, tmp)
  1245  		}
  1246  		c.assembler.CompileRegisterToRegister(amd64.PCMPEQW, tmp, x1Reg)
  1247  		result = x1Reg
  1248  	case wazeroir.V128CmpTypeI32x4Eq:
  1249  		c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, x2Reg, x1Reg)
  1250  		result = x1Reg
  1251  	case wazeroir.V128CmpTypeI32x4Ne:
  1252  		c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, x2Reg, x1Reg)
  1253  		// Set all bits on x2Reg register.
  1254  		c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, x2Reg, x2Reg)
   1255  		// Flip the bits on x1Reg register.
  1256  		c.assembler.CompileRegisterToRegister(amd64.PXOR, x2Reg, x1Reg)
  1257  		result = x1Reg
  1258  	case wazeroir.V128CmpTypeI32x4LtS:
  1259  		c.assembler.CompileRegisterToRegister(amd64.PCMPGTD, x1Reg, x2Reg)
  1260  		result = x2Reg
  1261  	case wazeroir.V128CmpTypeI32x4LtU, wazeroir.V128CmpTypeI32x4GtU:
   1262  		// Take the unsigned min/max of each 32-bit lane of x1 and x2 into x1Reg.
  1263  		if o.Type == wazeroir.V128CmpTypeI32x4LtU {
  1264  			c.assembler.CompileRegisterToRegister(amd64.PMINUD, x2Reg, x1Reg)
  1265  		} else {
  1266  			c.assembler.CompileRegisterToRegister(amd64.PMAXUD, x2Reg, x1Reg)
  1267  		}
  1268  		c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, x2Reg, x1Reg)
  1269  		// Set all bits on x2Reg register.
  1270  		c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, x2Reg, x2Reg)
  1271  		// Invert the bits on x1Reg register.
  1272  		c.assembler.CompileRegisterToRegister(amd64.PXOR, x2Reg, x1Reg)
  1273  		result = x1Reg
  1274  	case wazeroir.V128CmpTypeI32x4GtS:
  1275  		c.assembler.CompileRegisterToRegister(amd64.PCMPGTD, x2Reg, x1Reg)
  1276  		result = x1Reg
  1277  	case wazeroir.V128CmpTypeI32x4LeS, wazeroir.V128CmpTypeI32x4LeU:
  1278  		tmp, err := c.allocateRegister(registerTypeVector)
  1279  		if err != nil {
  1280  			return err
  1281  		}
  1282  		// Copy the value on the src to tmp.
  1283  		c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x1Reg, tmp)
  1284  		if o.Type == wazeroir.V128CmpTypeI32x4LeS {
  1285  			c.assembler.CompileRegisterToRegister(amd64.PMINSD, x2Reg, tmp)
  1286  		} else {
  1287  			c.assembler.CompileRegisterToRegister(amd64.PMINUD, x2Reg, tmp)
  1288  		}
  1289  		c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, tmp, x1Reg)
  1290  		result = x1Reg
  1291  	case wazeroir.V128CmpTypeI32x4GeS, wazeroir.V128CmpTypeI32x4GeU:
  1292  		tmp, err := c.allocateRegister(registerTypeVector)
  1293  		if err != nil {
  1294  			return err
  1295  		}
  1296  		c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x1Reg, tmp)
  1297  		if o.Type == wazeroir.V128CmpTypeI32x4GeS {
  1298  			c.assembler.CompileRegisterToRegister(amd64.PMAXSD, x2Reg, tmp)
  1299  		} else {
  1300  			c.assembler.CompileRegisterToRegister(amd64.PMAXUD, x2Reg, tmp)
  1301  		}
  1302  		c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, tmp, x1Reg)
  1303  		result = x1Reg
  1304  	case wazeroir.V128CmpTypeI64x2Eq:
  1305  		c.assembler.CompileRegisterToRegister(amd64.PCMPEQQ, x2Reg, x1Reg)
  1306  		result = x1Reg
  1307  	case wazeroir.V128CmpTypeI64x2Ne:
  1308  		c.assembler.CompileRegisterToRegister(amd64.PCMPEQQ, x2Reg, x1Reg)
  1309  		// Set all bits on x2Reg register.
  1310  		c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, x2Reg, x2Reg)
  1311  		// Invert the bits on x1Reg register.
  1312  		c.assembler.CompileRegisterToRegister(amd64.PXOR, x2Reg, x1Reg)
  1313  		result = x1Reg
  1314  	case wazeroir.V128CmpTypeI64x2LtS:
  1315  		c.assembler.CompileRegisterToRegister(amd64.PCMPGTQ, x1Reg, x2Reg)
  1316  		result = x2Reg
  1317  	case wazeroir.V128CmpTypeI64x2GtS:
  1318  		c.assembler.CompileRegisterToRegister(amd64.PCMPGTQ, x2Reg, x1Reg)
  1319  		result = x1Reg
  1320  	case wazeroir.V128CmpTypeI64x2LeS:
  1321  		c.assembler.CompileRegisterToRegister(amd64.PCMPGTQ, x2Reg, x1Reg)
  1322  		// Set all bits on x2Reg register.
  1323  		c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, x2Reg, x2Reg)
  1324  		// Invert the bits on x1Reg register.
  1325  		c.assembler.CompileRegisterToRegister(amd64.PXOR, x2Reg, x1Reg)
  1326  		result = x1Reg
  1327  	case wazeroir.V128CmpTypeI64x2GeS:
  1328  		c.assembler.CompileRegisterToRegister(amd64.PCMPGTQ, x1Reg, x2Reg)
  1329  		// Set all bits on x1Reg register.
  1330  		c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, x1Reg, x1Reg)
  1331  		// Invert the bits on x2Reg register.
  1332  		c.assembler.CompileRegisterToRegister(amd64.PXOR, x1Reg, x2Reg)
  1333  		result = x2Reg
  1334  	}
  1335  
  1336  	c.locationStack.markRegisterUnused(x1Reg, x2Reg)
  1337  	c.pushVectorRuntimeValueLocationOnRegister(result)
  1338  	return nil
  1339  }
  1340  
  1341  // compileV128AddSat implements compiler.compileV128AddSat for amd64.
  1342  func (c *amd64Compiler) compileV128AddSat(o *wazeroir.OperationV128AddSat) error {
  1343  	var inst asm.Instruction
  1344  	switch o.Shape {
  1345  	case wazeroir.ShapeI8x16:
  1346  		if o.Signed {
  1347  			inst = amd64.PADDSB
  1348  		} else {
  1349  			inst = amd64.PADDUSB
  1350  		}
  1351  	case wazeroir.ShapeI16x8:
  1352  		if o.Signed {
  1353  			inst = amd64.PADDSW
  1354  		} else {
  1355  			inst = amd64.PADDUSW
  1356  		}
  1357  	}
  1358  
  1359  	x2 := c.locationStack.popV128()
  1360  	if err := c.compileEnsureOnRegister(x2); err != nil {
  1361  		return err
  1362  	}
  1363  
  1364  	x1 := c.locationStack.popV128()
  1365  	if err := c.compileEnsureOnRegister(x1); err != nil {
  1366  		return err
  1367  	}
  1368  
  1369  	c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register)
  1370  
  1371  	c.locationStack.markRegisterUnused(x2.register)
  1372  	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
  1373  	return nil
  1374  }
  1375  
  1376  // compileV128SubSat implements compiler.compileV128SubSat for amd64.
  1377  func (c *amd64Compiler) compileV128SubSat(o *wazeroir.OperationV128SubSat) error {
  1378  	var inst asm.Instruction
  1379  	switch o.Shape {
  1380  	case wazeroir.ShapeI8x16:
  1381  		if o.Signed {
  1382  			inst = amd64.PSUBSB
  1383  		} else {
  1384  			inst = amd64.PSUBUSB
  1385  		}
  1386  	case wazeroir.ShapeI16x8:
  1387  		if o.Signed {
  1388  			inst = amd64.PSUBSW
  1389  		} else {
  1390  			inst = amd64.PSUBUSW
  1391  		}
  1392  	}
  1393  
  1394  	x2 := c.locationStack.popV128()
  1395  	if err := c.compileEnsureOnRegister(x2); err != nil {
  1396  		return err
  1397  	}
  1398  
  1399  	x1 := c.locationStack.popV128()
  1400  	if err := c.compileEnsureOnRegister(x1); err != nil {
  1401  		return err
  1402  	}
  1403  
  1404  	c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register)
  1405  
  1406  	c.locationStack.markRegisterUnused(x2.register)
  1407  	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
  1408  	return nil
  1409  }
  1410  
  1411  // compileV128Mul implements compiler.compileV128Mul for amd64.
  1412  func (c *amd64Compiler) compileV128Mul(o *wazeroir.OperationV128Mul) error {
  1413  	var inst asm.Instruction
  1414  	switch o.Shape {
  1415  	case wazeroir.ShapeI16x8:
  1416  		inst = amd64.PMULLW
  1417  	case wazeroir.ShapeI32x4:
  1418  		inst = amd64.PMULLD
  1419  	case wazeroir.ShapeI64x2:
  1420  		return c.compileV128MulI64x2()
  1421  	case wazeroir.ShapeF32x4:
  1422  		inst = amd64.MULPS
  1423  	case wazeroir.ShapeF64x2:
  1424  		inst = amd64.MULPD
  1425  	}
  1426  
  1427  	x2 := c.locationStack.popV128()
  1428  	if err := c.compileEnsureOnRegister(x2); err != nil {
  1429  		return err
  1430  	}
  1431  
  1432  	x1 := c.locationStack.popV128()
  1433  	if err := c.compileEnsureOnRegister(x1); err != nil {
  1434  		return err
  1435  	}
  1436  
  1437  	c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register)
  1438  
  1439  	c.locationStack.markRegisterUnused(x2.register)
  1440  	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
  1441  	return nil
  1442  }
  1443  
  1444  // compileV128MulI64x2 implements V128Mul for i64x2.
  1445  func (c *amd64Compiler) compileV128MulI64x2() error {
  1446  	x2 := c.locationStack.popV128()
  1447  	if err := c.compileEnsureOnRegister(x2); err != nil {
  1448  		return err
  1449  	}
  1450  
  1451  	x1 := c.locationStack.popV128()
  1452  	if err := c.compileEnsureOnRegister(x1); err != nil {
  1453  		return err
  1454  	}
  1455  
  1456  	x1r, x2r := x1.register, x2.register
  1457  
  1458  	tmp1, err := c.allocateRegister(registerTypeVector)
  1459  	if err != nil {
  1460  		return err
  1461  	}
  1462  
  1463  	c.locationStack.markRegisterUsed(tmp1)
  1464  
  1465  	tmp2, err := c.allocateRegister(registerTypeVector)
  1466  	if err != nil {
  1467  		return err
  1468  	}
  1469  
  1470  	// Assuming that we have
  1471  	//	x1r = [p1, p2] = [p1_lo, p1_high, p2_lo, p2_high]
  1472  	//	x2r = [q1, q2] = [q1_lo, q1_high, q2_lo, q2_high]
  1473  	// where pN and qN are 64-bit (quad word) lanes, and pN_lo, pN_high, qN_lo and qN_high are 32-bit (double word) lanes.
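        	// Since each 64-bit product is taken modulo 2^64, it decomposes as
        	//	pN*qN = ((pN_high*qN_lo + pN_lo*qN_high) << 32) + pN_lo*qN_lo  (mod 2^64)
        	// (the pN_high*qN_high term overflows out of the 64-bit lane), which is what the sequence below computes.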
  1474  
  1475  	// Copy x1's value into tmp1.
  1476  	c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x1r, tmp1)
  1477  	// And do the logical right shift by 32-bit on each 64-bit lane of tmp1, which makes tmp1 = [p1_high, 0, p2_high, 0]
  1478  	c.assembler.CompileConstToRegister(amd64.PSRLQ, 32, tmp1)
  1479  
  1480  	// Execute "pmuludq x2r,tmp1", which makes tmp1 = [p1_high*q1_lo, p2_high*q2_lo] where each lane is 64-bit.
  1481  	c.assembler.CompileRegisterToRegister(amd64.PMULUDQ, x2r, tmp1)
  1482  
  1483  	// Copy x2's value into tmp2.
  1484  	c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x2r, tmp2)
  1485  	// And do the logical right shift by 32-bit on each 64-bit lane of tmp2, which makes tmp2 = [q1_high, 0, q2_high, 0]
  1486  	c.assembler.CompileConstToRegister(amd64.PSRLQ, 32, tmp2)
  1487  
  1488  	// Execute "pmuludq x1r,tmp2", which makes tmp2 = [p1_lo*q1_high, p2_lo*q2_high] where each lane is 64-bit.
  1489  	c.assembler.CompileRegisterToRegister(amd64.PMULUDQ, x1r, tmp2)
  1490  
  1491  	// Add tmp1 and tmp2, then do the logical left shift by 32-bit,
  1492  	// which makes tmp1 = [(p1_lo*q1_high+p1_high*q1_lo)<<32, (p2_lo*q2_high+p2_high*q2_lo)<<32]
  1493  	c.assembler.CompileRegisterToRegister(amd64.PADDQ, tmp2, tmp1)
  1494  	c.assembler.CompileConstToRegister(amd64.PSLLQ, 32, tmp1)
  1495  
  1496  	// Execute "pmuludq x2r,x1r", which makes x1r = [p1_lo*q1_lo, p2_lo*q2_lo] where each lane is 64-bit.
  1497  	c.assembler.CompileRegisterToRegister(amd64.PMULUDQ, x2r, x1r)
  1498  
  1499  	// Finally, we get the result by adding x1r and tmp1,
  1500  	// which makes x1r = [(p1_lo*q1_high+p1_high*q1_lo)<<32+p1_lo*q1_lo, (p2_lo*q2_high+p2_high*q2_lo)<<32+p2_lo*q2_lo]
  1501  	c.assembler.CompileRegisterToRegister(amd64.PADDQ, tmp1, x1r)
  1502  
  1503  	c.locationStack.markRegisterUnused(x2r, tmp1)
  1504  	c.pushVectorRuntimeValueLocationOnRegister(x1r)
  1505  	return nil
  1506  }
  1507  
  1508  // compileV128Div implements compiler.compileV128Div for amd64.
  1509  func (c *amd64Compiler) compileV128Div(o *wazeroir.OperationV128Div) error {
  1510  	x2 := c.locationStack.popV128()
  1511  	if err := c.compileEnsureOnRegister(x2); err != nil {
  1512  		return err
  1513  	}
  1514  
  1515  	x1 := c.locationStack.popV128()
  1516  	if err := c.compileEnsureOnRegister(x1); err != nil {
  1517  		return err
  1518  	}
  1519  
  1520  	var inst asm.Instruction
  1521  	switch o.Shape {
  1522  	case wazeroir.ShapeF32x4:
  1523  		inst = amd64.DIVPS
  1524  	case wazeroir.ShapeF64x2:
  1525  		inst = amd64.DIVPD
  1526  	}
  1527  
  1528  	c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register)
  1529  
  1530  	c.locationStack.markRegisterUnused(x2.register)
  1531  	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
  1532  	return nil
  1533  }
  1534  
  1535  // compileV128Neg implements compiler.compileV128Neg for amd64.
  1536  func (c *amd64Compiler) compileV128Neg(o *wazeroir.OperationV128Neg) error {
  1537  	if o.Shape <= wazeroir.ShapeI64x2 {
  1538  		return c.compileV128NegInt(o.Shape)
  1539  	} else {
  1540  		return c.compileV128NegFloat(o.Shape)
  1541  	}
  1542  }
  1543  
  1544  // compileV128NegInt implements compiler.compileV128Neg for integer lanes.
  1545  func (c *amd64Compiler) compileV128NegInt(s wazeroir.Shape) error {
  1546  	v := c.locationStack.popV128()
  1547  	if err := c.compileEnsureOnRegister(v); err != nil {
  1548  		return err
  1549  	}
  1550  
  1551  	result, err := c.allocateRegister(registerTypeVector)
  1552  	if err != nil {
  1553  		return err
  1554  	}
  1555  
  1556  	var subInst asm.Instruction
  1557  	switch s {
  1558  	case wazeroir.ShapeI8x16:
  1559  		subInst = amd64.PSUBB
  1560  	case wazeroir.ShapeI16x8:
  1561  		subInst = amd64.PSUBW
  1562  	case wazeroir.ShapeI32x4:
  1563  		subInst = amd64.PSUBD
  1564  	case wazeroir.ShapeI64x2:
  1565  		subInst = amd64.PSUBQ
  1566  	}
  1567  
  1568  	c.assembler.CompileRegisterToRegister(amd64.PXOR, result, result)
  1569  	c.assembler.CompileRegisterToRegister(subInst, v.register, result)
  1570  
  1571  	c.locationStack.markRegisterUnused(v.register)
  1572  	c.pushVectorRuntimeValueLocationOnRegister(result)
  1573  	return nil
  1574  }
  1575  
  1576  // compileV128NegFloat implements compiler.compileV128Neg for float lanes.
  1577  func (c *amd64Compiler) compileV128NegFloat(s wazeroir.Shape) error {
  1578  	v := c.locationStack.popV128()
  1579  	if err := c.compileEnsureOnRegister(v); err != nil {
  1580  		return err
  1581  	}
  1582  
  1583  	tmp, err := c.allocateRegister(registerTypeVector)
  1584  	if err != nil {
  1585  		return err
  1586  	}
  1587  
  1588  	var leftShiftInst, xorInst asm.Instruction
  1589  	var leftShiftAmount asm.ConstantValue
  1590  	if s == wazeroir.ShapeF32x4 {
  1591  		leftShiftInst, leftShiftAmount, xorInst = amd64.PSLLD, 31, amd64.XORPS
  1592  	} else {
  1593  		leftShiftInst, leftShiftAmount, xorInst = amd64.PSLLQ, 63, amd64.XORPD
  1594  	}
  1595  
  1596  	// Clear all bits on tmp.
  1597  	c.assembler.CompileRegisterToRegister(amd64.XORPS, tmp, tmp)
  1598  	// Set all bits on tmp by CMPPD with arg=0 (== pseudo CMPEQPD instruction).
  1599  	// See https://www.felixcloutier.com/x86/cmpps
  1600  	//
  1601  	// Note: if we do not clear all the bits ^ with XORPS, this might end up not setting ones on some lane
  1602  	// if the lane is NaN.
  1603  	c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPD, tmp, tmp, 0x0)
  1604  	// Do the left shift on each lane to set only the most significant bit in each.
  1605  	c.assembler.CompileConstToRegister(leftShiftInst, leftShiftAmount, tmp)
  1606  	// Get the negated result by XOR on each lane with tmp.
  1607  	c.assembler.CompileRegisterToRegister(xorInst, tmp, v.register)
  1608  
  1609  	c.pushVectorRuntimeValueLocationOnRegister(v.register)
  1610  	return nil
  1611  }
  1612  
  1613  // compileV128Sqrt implements compiler.compileV128Sqrt for amd64.
  1614  func (c *amd64Compiler) compileV128Sqrt(o *wazeroir.OperationV128Sqrt) error {
  1615  	v := c.locationStack.popV128()
  1616  	if err := c.compileEnsureOnRegister(v); err != nil {
  1617  		return err
  1618  	}
  1619  
  1620  	var inst asm.Instruction
  1621  	switch o.Shape {
  1622  	case wazeroir.ShapeF64x2:
  1623  		inst = amd64.SQRTPD
  1624  	case wazeroir.ShapeF32x4:
  1625  		inst = amd64.SQRTPS
  1626  	}
  1627  
  1628  	c.assembler.CompileRegisterToRegister(inst, v.register, v.register)
  1629  	c.pushVectorRuntimeValueLocationOnRegister(v.register)
  1630  	return nil
  1631  }
  1632  
  1633  // compileV128Abs implements compiler.compileV128Abs for amd64.
  1634  func (c *amd64Compiler) compileV128Abs(o *wazeroir.OperationV128Abs) error {
  1635  	if o.Shape == wazeroir.ShapeI64x2 {
  1636  		return c.compileV128AbsI64x2()
  1637  	}
  1638  
  1639  	v := c.locationStack.popV128()
  1640  	if err := c.compileEnsureOnRegister(v); err != nil {
  1641  		return err
  1642  	}
  1643  
  1644  	result := v.register
  1645  	switch o.Shape {
  1646  	case wazeroir.ShapeI8x16:
  1647  		c.assembler.CompileRegisterToRegister(amd64.PABSB, result, result)
  1648  	case wazeroir.ShapeI16x8:
  1649  		c.assembler.CompileRegisterToRegister(amd64.PABSW, result, result)
  1650  	case wazeroir.ShapeI32x4:
  1651  		c.assembler.CompileRegisterToRegister(amd64.PABSD, result, result)
  1652  	case wazeroir.ShapeF32x4:
  1653  		tmp, err := c.allocateRegister(registerTypeVector)
  1654  		if err != nil {
  1655  			return err
  1656  		}
  1657  		// Set all bits on tmp.
  1658  		c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, tmp, tmp)
  1659  		// Shift each 32-bit lane of tmp right by 1, so tmp holds 0x7fffffff on every lane.
  1660  		c.assembler.CompileConstToRegister(amd64.PSRLD, 1, tmp)
  1661  		// Clear the sign bit of each lane in result.
  1662  		c.assembler.CompileRegisterToRegister(amd64.ANDPS, tmp, result)
  1663  	case wazeroir.ShapeF64x2:
  1664  		tmp, err := c.allocateRegister(registerTypeVector)
  1665  		if err != nil {
  1666  			return err
  1667  		}
  1668  		// Set all bits on tmp.
  1669  		c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, tmp, tmp)
  1670  		// Shift each 64-bit lane of tmp right by 1, so tmp holds 0x7fffffffffffffff on every lane.
  1671  		c.assembler.CompileConstToRegister(amd64.PSRLQ, 1, tmp)
  1672  		// Clear the sign bit of each lane in result.
  1673  		c.assembler.CompileRegisterToRegister(amd64.ANDPD, tmp, result)
  1674  	}
  1675  
  1676  	c.pushVectorRuntimeValueLocationOnRegister(result)
  1677  	return nil
  1678  }
  1679  
  1680  // compileV128AbsI64x2 implements compileV128Abs for i64x2 lanes.
  1681  func (c *amd64Compiler) compileV128AbsI64x2() error {
  1682  	// See https://www.felixcloutier.com/x86/blendvpd
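        	// Note: the legacy (non-VEX) encoding of BLENDVPD implicitly uses XMM0 as its blend mask,
        	// which is why X0 is reserved as blendMaskReg here.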
  1683  	const blendMaskReg = amd64.RegX0
  1684  	c.onValueReleaseRegisterToStack(blendMaskReg)
  1685  	c.locationStack.markRegisterUsed(blendMaskReg)
  1686  
  1687  	v := c.locationStack.popV128()
  1688  	if err := c.compileEnsureOnRegister(v); err != nil {
  1689  		return err
  1690  	}
  1691  	vr := v.register
  1692  
  1693  	if vr == blendMaskReg {
  1694  		return errors.New("BUG: X0 must not be used")
  1695  	}
  1696  
  1697  	tmp, err := c.allocateRegister(registerTypeVector)
  1698  	if err != nil {
  1699  		return err
  1700  	}
  1701  	c.locationStack.markRegisterUsed(tmp)
  1702  
  1703  	// Copy the value to tmp.
  1704  	c.assembler.CompileRegisterToRegister(amd64.MOVDQA, vr, tmp)
  1705  
  1706  	// Clear all bits on blendMaskReg.
  1707  	c.assembler.CompileRegisterToRegister(amd64.PXOR, blendMaskReg, blendMaskReg)
  1708  	// Subtract vr from blendMaskReg.
  1709  	c.assembler.CompileRegisterToRegister(amd64.PSUBQ, vr, blendMaskReg)
  1710  	// Copy the subtracted value ^^ back into vr.
  1711  	c.assembler.CompileRegisterToRegister(amd64.MOVDQA, blendMaskReg, vr)
  1712  
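        	// BLENDVPD selects tmp (the original value) for each 64-bit lane whose mask lane in
        	// blendMaskReg (= 0 - v) has its sign bit set, i.e. for lanes that were originally positive;
        	// the remaining lanes keep the negated value, so vr ends up holding the absolute value.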
  1713  	c.assembler.CompileRegisterToRegister(amd64.BLENDVPD, tmp, vr)
  1714  
  1715  	c.locationStack.markRegisterUnused(blendMaskReg, tmp)
  1716  	c.pushVectorRuntimeValueLocationOnRegister(vr)
  1717  	return nil
  1718  }
  1719  
  1720  var (
  1721  	popcntMask = [16]byte{
  1722  		0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
  1723  		0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
  1724  	}
  1725  	// popcntTable holds each index's Popcnt, for example popcntTable[5] holds popcnt(0x05).
  1726  	popcntTable = [16]byte{
  1727  		0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03,
  1728  		0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04,
  1729  	}
  1730  )
  1731  
  1732  // compileV128Popcnt implements compiler.compileV128Popcnt for amd64.
  1733  func (c *amd64Compiler) compileV128Popcnt(*wazeroir.OperationV128Popcnt) error {
  1734  	v := c.locationStack.popV128()
  1735  	if err := c.compileEnsureOnRegister(v); err != nil {
  1736  		return err
  1737  	}
  1738  	vr := v.register
  1739  
  1740  	tmp1, err := c.allocateRegister(registerTypeVector)
  1741  	if err != nil {
  1742  		return err
  1743  	}
  1744  
  1745  	c.locationStack.markRegisterUsed(tmp1)
  1746  
  1747  	tmp2, err := c.allocateRegister(registerTypeVector)
  1748  	if err != nil {
  1749  		return err
  1750  	}
  1751  
  1752  	c.locationStack.markRegisterUsed(tmp2)
  1753  
  1754  	tmp3, err := c.allocateRegister(registerTypeVector)
  1755  	if err != nil {
  1756  		return err
  1757  	}
  1758  
  1759  	// Read the popcntMask into tmp1, and we have
  1760  	//  tmp1 = [0xf, ..., 0xf]
  1761  	if err := c.assembler.CompileStaticConstToRegister(amd64.MOVDQU, asm.NewStaticConst(popcntMask[:]), tmp1); err != nil {
  1762  		return err
  1763  	}
  1764  
  1765  	// Copy the original value into tmp2.
  1766  	c.assembler.CompileRegisterToRegister(amd64.MOVDQA, vr, tmp2)
  1767  
  1768  	// Given that we have:
  1769  	//  v = [b1, ..., b16] where bn = hn:ln and hn and ln are higher and lower 4-bits of bn.
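        	//  For example, b = 0xb5 = 0b1011_0101 has h = 0xb and l = 0x5, so
        	//  popcnt(b) = popcntTable[0xb] + popcntTable[0x5] = 3 + 2 = 5.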
  1770  	//
  1771  	// Take PAND on tmp1 and tmp2, and we have
  1772  	//  tmp2 = [l1, ..., l16].
  1773  	c.assembler.CompileRegisterToRegister(amd64.PAND, tmp1, tmp2)
  1774  
  1775  	// Do logical (packed word) right shift by 4 on vr and PAND with vr and tmp1, meaning that we have
  1776  	//  vr = [h1, ..., h16].
  1777  	c.assembler.CompileConstToRegister(amd64.PSRLW, 4, vr)
  1778  	c.assembler.CompileRegisterToRegister(amd64.PAND, tmp1, vr)
  1779  
  1780  	// Read the popcntTable into tmp1, and we have
  1781  	//  tmp1 = [0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04]
  1782  	if err := c.assembler.CompileStaticConstToRegister(amd64.MOVDQU, asm.NewStaticConst(popcntTable[:]), tmp1); err != nil {
  1783  		return err
  1784  	}
  1785  
  1786  	// Copy the tmp1 into tmp3, and we have
  1787  	//  tmp3 = [0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04]
  1788  	c.assembler.CompileRegisterToRegister(amd64.MOVDQU, tmp1, tmp3)
  1789  
  1790  	//  tmp3 = [popcnt(l1), ..., popcnt(l16)].
  1791  	c.assembler.CompileRegisterToRegister(amd64.PSHUFB, tmp2, tmp3)
  1792  
  1793  	//  tmp1 = [popcnt(h1), ..., popcnt(h16)].
  1794  	c.assembler.CompileRegisterToRegister(amd64.PSHUFB, vr, tmp1)
  1795  
  1796  	// vr = tmp1 = [popcnt(h1), ..., popcnt(h16)].
  1797  	c.assembler.CompileRegisterToRegister(amd64.MOVDQA, tmp1, vr)
  1798  
  1799  	// vr += tmp3 = [popcnt(h1)+popcnt(l1), ..., popcnt(h16)+popcnt(l16)] = [popcnt(b1), ..., popcnt(b16)].
  1800  	c.assembler.CompileRegisterToRegister(amd64.PADDB, tmp3, vr)
  1801  
  1802  	c.locationStack.markRegisterUnused(tmp1, tmp2)
  1803  	c.pushVectorRuntimeValueLocationOnRegister(vr)
  1804  	return nil
  1805  }
  1806  
  1807  // compileV128Min implements compiler.compileV128Min for amd64.
  1808  func (c *amd64Compiler) compileV128Min(o *wazeroir.OperationV128Min) error {
  1809  	x2 := c.locationStack.popV128()
  1810  	if err := c.compileEnsureOnRegister(x2); err != nil {
  1811  		return err
  1812  	}
  1813  
  1814  	x1 := c.locationStack.popV128()
  1815  	if err := c.compileEnsureOnRegister(x1); err != nil {
  1816  		return err
  1817  	}
  1818  
  1819  	if o.Shape >= wazeroir.ShapeF32x4 {
  1820  		return c.compileV128FloatMinImpl(o.Shape == wazeroir.ShapeF32x4, x1.register, x2.register)
  1821  	}
  1822  
  1823  	var inst asm.Instruction
  1824  	switch o.Shape {
  1825  	case wazeroir.ShapeI8x16:
  1826  		if o.Signed {
  1827  			inst = amd64.PMINSB
  1828  		} else {
  1829  			inst = amd64.PMINUB
  1830  		}
  1831  	case wazeroir.ShapeI16x8:
  1832  		if o.Signed {
  1833  			inst = amd64.PMINSW
  1834  		} else {
  1835  			inst = amd64.PMINUW
  1836  		}
  1837  	case wazeroir.ShapeI32x4:
  1838  		if o.Signed {
  1839  			inst = amd64.PMINSD
  1840  		} else {
  1841  			inst = amd64.PMINUD
  1842  		}
  1843  	}
  1844  
  1845  	c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register)
  1846  
  1847  	c.locationStack.markRegisterUnused(x2.register)
  1848  	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
  1849  	return nil
  1850  }
  1851  
  1852  // compileV128FloatMinImpl implements compiler.compileV128Min for float lanes.
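        //
        // Note: Wasm's min must propagate NaN and treat -0 as smaller than +0, while x86 MINPS/MINPD
        // simply return the second (source) operand when the operands are a NaN or a ±0 pair,
        // hence the patch-up sequence below.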
  1853  func (c *amd64Compiler) compileV128FloatMinImpl(is32bit bool, x1r, x2r asm.Register) error {
  1854  	tmp, err := c.allocateRegister(registerTypeVector)
  1855  	if err != nil {
  1856  		return err
  1857  	}
  1858  
  1859  	var min, cmp, andn, or, srl /* shift right logical */ asm.Instruction
  1860  	var shiftNumToInverseNaN asm.ConstantValue
  1861  	if is32bit {
  1862  		min, cmp, andn, or, srl, shiftNumToInverseNaN = amd64.MINPS, amd64.CMPPS, amd64.ANDNPS, amd64.ORPS, amd64.PSRLD, 0xa
  1863  	} else {
  1864  		min, cmp, andn, or, srl, shiftNumToInverseNaN = amd64.MINPD, amd64.CMPPD, amd64.ANDNPD, amd64.ORPD, amd64.PSRLQ, 0xd
  1865  	}
  1866  
  1867  	// Let v1 and v2 be the operand values on x1r and x2r at this point.
  1868  
  1869  	// Copy the value into tmp: tmp=v1
  1870  	c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x1r, tmp)
  1871  	// tmp=min(v1, v2)
  1872  	c.assembler.CompileRegisterToRegister(min, x2r, tmp)
  1873  	// x2r=min(v2, v1)
  1874  	c.assembler.CompileRegisterToRegister(min, x1r, x2r)
  1875  	// x1r=min(v2, v1)
  1876  	c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x2r, x1r)
  1877  
  1878  	// x2r = -0          if (v1 == -0 || v2 == -0) && v1 != NaN && v2 != NaN
  1879  	//       NaN         if v1 == NaN || v2 == NaN
  1880  	//       min(v1, v2) otherwise
  1881  	c.assembler.CompileRegisterToRegister(or, tmp, x2r)
  1882  	// x1r = ^0 (set all bits) if v1 == NaN || v2 == NaN
  1883  	//       0 otherwise
  1884  	c.assembler.CompileRegisterToRegisterWithArg(cmp, tmp, x1r, 3)
  1885  	// x2r = -0          if (v1 == -0 || v2 == -0) && v1 != NaN && v2 != NaN
  1886  	//       ^0          if v1 == NaN || v2 == NaN
  1887  	//       min(v1, v2) otherwise
  1888  	c.assembler.CompileRegisterToRegister(or, x1r, x2r)
  1889  	// x1r = set all bits on the mantissa bits if v1 == NaN || v2 == NaN
  1890  	//       0 otherwise
  1891  	c.assembler.CompileConstToRegister(srl, shiftNumToInverseNaN, x1r)
  1892  	// x1r = x2r and !x1r
  1893  	//     = -0                                                   if (v1 == -0 || x2 == -0) && v1 != NaN && v2 !=NaN
  1894  	//       set all bits on exponential and sign bit (== NaN)    if v1 == NaN || v2 == NaN
  1895  	//       min(v1, v2)                                          otherwise
  1896  	c.assembler.CompileRegisterToRegister(andn, x2r, x1r)
  1897  
  1898  	c.locationStack.markRegisterUnused(x2r)
  1899  	c.pushVectorRuntimeValueLocationOnRegister(x1r)
  1900  	return nil
  1901  }
  1902  
  1903  // compileV128Max implements compiler.compileV128Max for amd64.
  1904  func (c *amd64Compiler) compileV128Max(o *wazeroir.OperationV128Max) error {
  1905  	x2 := c.locationStack.popV128()
  1906  	if err := c.compileEnsureOnRegister(x2); err != nil {
  1907  		return err
  1908  	}
  1909  
  1910  	x1 := c.locationStack.popV128()
  1911  	if err := c.compileEnsureOnRegister(x1); err != nil {
  1912  		return err
  1913  	}
  1914  
  1915  	if o.Shape >= wazeroir.ShapeF32x4 {
  1916  		return c.compileV128FloatMaxImpl(o.Shape == wazeroir.ShapeF32x4, x1.register, x2.register)
  1917  	}
  1918  
  1919  	var inst asm.Instruction
  1920  	switch o.Shape {
  1921  	case wazeroir.ShapeI8x16:
  1922  		if o.Signed {
  1923  			inst = amd64.PMAXSB
  1924  		} else {
  1925  			inst = amd64.PMAXUB
  1926  		}
  1927  	case wazeroir.ShapeI16x8:
  1928  		if o.Signed {
  1929  			inst = amd64.PMAXSW
  1930  		} else {
  1931  			inst = amd64.PMAXUW
  1932  		}
  1933  	case wazeroir.ShapeI32x4:
  1934  		if o.Signed {
  1935  			inst = amd64.PMAXSD
  1936  		} else {
  1937  			inst = amd64.PMAXUD
  1938  		}
  1939  	}
  1940  
  1941  	c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register)
  1942  
  1943  	c.locationStack.markRegisterUnused(x2.register)
  1944  	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
  1945  	return nil
  1946  }
  1947  
  1948  // compileV128FloatMaxImpl implements compiler.compileV128Max for float lanes.
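        //
        // Note: as with compileV128FloatMinImpl, Wasm's max must propagate NaN and treat +0 as larger
        // than -0, which MAXPS/MAXPD alone do not guarantee, hence the patch-up sequence below.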
  1949  func (c *amd64Compiler) compileV128FloatMaxImpl(is32bit bool, x1r, x2r asm.Register) error {
  1950  	tmp, err := c.allocateRegister(registerTypeVector)
  1951  	if err != nil {
  1952  		return err
  1953  	}
  1954  
  1955  	var max, cmp, andn, or, xor, sub, srl /* shift right logical */ asm.Instruction
  1956  	var shiftNumToInverseNaN asm.ConstantValue
  1957  	if is32bit {
  1958  		max, cmp, andn, or, xor, sub, srl, shiftNumToInverseNaN = amd64.MAXPS, amd64.CMPPS, amd64.ANDNPS, amd64.ORPS, amd64.XORPS, amd64.SUBPS, amd64.PSRLD, 0xa
  1959  	} else {
  1960  		max, cmp, andn, or, xor, sub, srl, shiftNumToInverseNaN = amd64.MAXPD, amd64.CMPPD, amd64.ANDNPD, amd64.ORPD, amd64.XORPD, amd64.SUBPD, amd64.PSRLQ, 0xd
  1961  	}
  1962  
  1963  	// Let v1 and v2 be the operand values on x1r and x2r at this point.
  1964  
  1965  	// Copy the value into tmp: tmp=v2
  1966  	c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x2r, tmp)
  1967  	// tmp=max(v2, v1)
  1968  	c.assembler.CompileRegisterToRegister(max, x1r, tmp)
  1969  	// x1r=max(v1, v2)
  1970  	c.assembler.CompileRegisterToRegister(max, x2r, x1r)
  1971  	// x2r=max(v1, v2)
  1972  	c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x1r, x2r)
  1973  
  1974  	// x2r = -0      if (v1 == -0 && v2 == 0) || (v1 == 0 && v2 == -0)
  1975  	//       0       if (v1 == 0 && v2 == 0)
  1976  	//       0       if (v1 == -0 && v2 == -0)
  1977  	//       v1^v2   if v1 == NaN || v2 == NaN
  1978  	//       0       otherwise
  1979  	c.assembler.CompileRegisterToRegister(xor, tmp, x2r)
  1980  	// x1r = -0           if (v1 == -0 && v2 == 0) || (v1 == 0 && v2 == -0)
  1981  	//       0            if (v1 == 0 && v2 ==  0)
  1982  	//       -0           if (v1 == -0 && v2 == -0)
  1983  	//       NaN          if v1 == NaN || v2 == NaN
  1984  	//       max(v1, v2)  otherwise
  1985  	c.assembler.CompileRegisterToRegister(or, x2r, x1r)
  1986  	// Copy x1r into tmp.
  1987  	c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x1r, tmp)
  1988  	// tmp = 0            if (v1 == -0 && v2 == 0) || (v1 == 0 && v2 == -0) || (v1 == 0 && v2 ==  0)
  1989  	//       -0           if (v1 == -0 && v2 == -0)
  1990  	//       NaN          if v1 == NaN || v2 == NaN
  1991  	//       max(v1, v2)  otherwise
  1992  	//
  1993  	// Note: -0 - (-0) = 0 (!= -0) in floating point operation.
  1994  	c.assembler.CompileRegisterToRegister(sub, x2r, tmp)
  1995  	// x1r = ^0 (set all bits) if v1 == NaN || v2 == NaN
  1996  	c.assembler.CompileRegisterToRegisterWithArg(cmp, x1r, x1r, 3)
  1997  	// x1r = set all bits on the mantissa bits if v1 == NaN || v2 == NaN
  1998  	//       0 otherwise
  1999  	c.assembler.CompileConstToRegister(srl, shiftNumToInverseNaN, x1r)
  2000  	c.assembler.CompileRegisterToRegister(andn, tmp, x1r)
  2001  
  2002  	c.locationStack.markRegisterUnused(x2r)
  2003  	c.pushVectorRuntimeValueLocationOnRegister(x1r)
  2004  	return nil
  2005  }
  2006  
  2007  // compileV128AvgrU implements compiler.compileV128AvgrU for amd64.
  2008  func (c *amd64Compiler) compileV128AvgrU(o *wazeroir.OperationV128AvgrU) error {
  2009  	x2 := c.locationStack.popV128()
  2010  	if err := c.compileEnsureOnRegister(x2); err != nil {
  2011  		return err
  2012  	}
  2013  
  2014  	x1 := c.locationStack.popV128()
  2015  	if err := c.compileEnsureOnRegister(x1); err != nil {
  2016  		return err
  2017  	}
  2018  
  2019  	var inst asm.Instruction
  2020  	switch o.Shape {
  2021  	case wazeroir.ShapeI8x16:
  2022  		inst = amd64.PAVGB
  2023  	case wazeroir.ShapeI16x8:
  2024  		inst = amd64.PAVGW
  2025  	}
  2026  
  2027  	c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register)
  2028  
  2029  	c.locationStack.markRegisterUnused(x2.register)
  2030  	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
  2031  	return nil
  2032  }
  2033  
  2034  // compileV128Pmin implements compiler.compileV128Pmin for amd64.
  2035  func (c *amd64Compiler) compileV128Pmin(o *wazeroir.OperationV128Pmin) error {
  2036  	x2 := c.locationStack.popV128()
  2037  	if err := c.compileEnsureOnRegister(x2); err != nil {
  2038  		return err
  2039  	}
  2040  
  2041  	x1 := c.locationStack.popV128()
  2042  	if err := c.compileEnsureOnRegister(x1); err != nil {
  2043  		return err
  2044  	}
  2045  
  2046  	var min asm.Instruction
  2047  	if o.Shape == wazeroir.ShapeF32x4 {
  2048  		min = amd64.MINPS
  2049  	} else {
  2050  		min = amd64.MINPD
  2051  	}
  2052  
  2053  	x1r, x2r := x1.register, x2.register
  2054  
  2055  	c.assembler.CompileRegisterToRegister(min, x1r, x2r)
  2056  
  2057  	c.locationStack.markRegisterUnused(x1r)
  2058  	c.pushVectorRuntimeValueLocationOnRegister(x2r)
  2059  	return nil
  2060  }
  2061  
  2062  // compileV128Pmax implements compiler.compileV128Pmax for amd64.
  2063  func (c *amd64Compiler) compileV128Pmax(o *wazeroir.OperationV128Pmax) error {
  2064  	x2 := c.locationStack.popV128()
  2065  	if err := c.compileEnsureOnRegister(x2); err != nil {
  2066  		return err
  2067  	}
  2068  
  2069  	x1 := c.locationStack.popV128()
  2070  	if err := c.compileEnsureOnRegister(x1); err != nil {
  2071  		return err
  2072  	}
  2073  
  2074  	var max asm.Instruction
  2075  	if o.Shape == wazeroir.ShapeF32x4 {
  2076  		max = amd64.MAXPS
  2077  	} else {
  2078  		max = amd64.MAXPD
  2079  	}
  2080  
  2081  	x1r, x2r := x1.register, x2.register
  2082  
  2083  	c.assembler.CompileRegisterToRegister(max, x1r, x2r)
  2084  
  2085  	c.locationStack.markRegisterUnused(x1r)
  2086  	c.pushVectorRuntimeValueLocationOnRegister(x2r)
  2087  	return nil
  2088  }
  2089  
  2090  // compileV128Ceil implements compiler.compileV128Ceil for amd64.
  2091  func (c *amd64Compiler) compileV128Ceil(o *wazeroir.OperationV128Ceil) error {
  2092  	// See https://www.felixcloutier.com/x86/roundpd
  2093  	const roundModeCeil = 0x2
  2094  	return c.compileV128RoundImpl(o.Shape == wazeroir.ShapeF32x4, roundModeCeil)
  2095  }
  2096  
  2097  // compileV128Floor implements compiler.compileV128Floor for amd64.
  2098  func (c *amd64Compiler) compileV128Floor(o *wazeroir.OperationV128Floor) error {
  2099  	// See https://www.felixcloutier.com/x86/roundpd
  2100  	const roundModeFloor = 0x1
  2101  	return c.compileV128RoundImpl(o.Shape == wazeroir.ShapeF32x4, roundModeFloor)
  2102  }
  2103  
  2104  // compileV128Trunc implements compiler.compileV128Trunc for amd64.
  2105  func (c *amd64Compiler) compileV128Trunc(o *wazeroir.OperationV128Trunc) error {
  2106  	// See https://www.felixcloutier.com/x86/roundpd
  2107  	const roundModeTrunc = 0x3
  2108  	return c.compileV128RoundImpl(o.Shape == wazeroir.ShapeF32x4, roundModeTrunc)
  2109  }
  2110  
  2111  // compileV128Nearest implements compiler.compileV128Nearest for amd64.
  2112  func (c *amd64Compiler) compileV128Nearest(o *wazeroir.OperationV128Nearest) error {
  2113  	// See https://www.felixcloutier.com/x86/roundpd
  2114  	const roundModeNearest = 0x0
  2115  	return c.compileV128RoundImpl(o.Shape == wazeroir.ShapeF32x4, roundModeNearest)
  2116  }
  2117  
  2118  // compileV128RoundImpl implements compileV128Nearest, compileV128Trunc, compileV128Floor and compileV128Ceil
  2119  // with ROUNDPS (32-bit lane) and ROUNDPD (64-bit lane).
  2120  func (c *amd64Compiler) compileV128RoundImpl(is32bit bool, mode byte) error {
  2121  	v := c.locationStack.popV128()
  2122  	if err := c.compileEnsureOnRegister(v); err != nil {
  2123  		return err
  2124  	}
  2125  	vr := v.register
  2126  
  2127  	var round asm.Instruction
  2128  	if is32bit {
  2129  		round = amd64.ROUNDPS
  2130  	} else {
  2131  		round = amd64.ROUNDPD
  2132  	}
  2133  
  2134  	c.assembler.CompileRegisterToRegisterWithArg(round, vr, vr, mode)
  2135  	c.pushVectorRuntimeValueLocationOnRegister(vr)
  2136  	return nil
  2137  }
  2138  
  2139  // compileV128Extend implements compiler.compileV128Extend for amd64.
  2140  func (c *amd64Compiler) compileV128Extend(o *wazeroir.OperationV128Extend) error {
  2141  	v := c.locationStack.popV128()
  2142  	if err := c.compileEnsureOnRegister(v); err != nil {
  2143  		return err
  2144  	}
  2145  	vr := v.register
  2146  
  2147  	if !o.UseLow {
  2148  		// We have to shift the higher 64-bits into the lower ones before the actual extending instruction.
  2149  		// PALIGNR with 0x8 concatenates the register with itself and shifts right by 0x8 bytes (= 64 bits).
  2150  		// See https://www.felixcloutier.com/x86/palignr
  2151  		c.assembler.CompileRegisterToRegisterWithArg(amd64.PALIGNR, v.register, v.register, 0x8)
  2152  	}
  2153  
  2154  	var extend asm.Instruction
  2155  	switch o.OriginShape {
  2156  	case wazeroir.ShapeI8x16:
  2157  		if o.Signed {
  2158  			extend = amd64.PMOVSXBW
  2159  		} else {
  2160  			extend = amd64.PMOVZXBW
  2161  		}
  2162  	case wazeroir.ShapeI16x8:
  2163  		if o.Signed {
  2164  			extend = amd64.PMOVSXWD
  2165  		} else {
  2166  			extend = amd64.PMOVZXWD
  2167  		}
  2168  	case wazeroir.ShapeI32x4:
  2169  		if o.Signed {
  2170  			extend = amd64.PMOVSXDQ
  2171  		} else {
  2172  			extend = amd64.PMOVZXDQ
  2173  		}
  2174  	}
  2175  
  2176  	c.assembler.CompileRegisterToRegister(extend, vr, vr)
  2177  	c.pushVectorRuntimeValueLocationOnRegister(vr)
  2178  	return nil
  2179  }
  2180  
  2181  // compileV128ExtMul implements compiler.compileV128ExtMul for amd64.
  2182  func (c *amd64Compiler) compileV128ExtMul(o *wazeroir.OperationV128ExtMul) error {
  2183  	x2 := c.locationStack.popV128()
  2184  	if err := c.compileEnsureOnRegister(x2); err != nil {
  2185  		return err
  2186  	}
  2187  
  2188  	x1 := c.locationStack.popV128()
  2189  	if err := c.compileEnsureOnRegister(x1); err != nil {
  2190  		return err
  2191  	}
  2192  
  2193  	x1r, x2r := x1.register, x2.register
  2194  
  2195  	switch o.OriginShape {
  2196  	case wazeroir.ShapeI8x16:
  2197  		if !o.UseLow {
  2198  			// We have to shift the higher 64-bits into the lower ones before the actual extending instruction.
  2199  			// PALIGNR with 0x8 concatenates the register with itself and shifts right by 0x8 bytes (= 64 bits).
  2200  			// See https://www.felixcloutier.com/x86/palignr
  2201  			c.assembler.CompileRegisterToRegisterWithArg(amd64.PALIGNR, x1r, x1r, 0x8)
  2202  			c.assembler.CompileRegisterToRegisterWithArg(amd64.PALIGNR, x2r, x2r, 0x8)
  2203  		}
  2204  
  2205  		var ext asm.Instruction
  2206  		if o.Signed {
  2207  			ext = amd64.PMOVSXBW
  2208  		} else {
  2209  			ext = amd64.PMOVZXBW
  2210  		}
  2211  
  2212  		// Sign- or zero-extend the lower half packed bytes to packed words.
  2213  		c.assembler.CompileRegisterToRegister(ext, x1r, x1r)
  2214  		c.assembler.CompileRegisterToRegister(ext, x2r, x2r)
  2215  
  2216  		c.assembler.CompileRegisterToRegister(amd64.PMULLW, x2r, x1r)
  2217  	case wazeroir.ShapeI16x8:
  2218  		tmp, err := c.allocateRegister(registerTypeVector)
  2219  		if err != nil {
  2220  			return err
  2221  		}
  2222  
  2223  		// Copy the value on x1r to tmp.
  2224  		c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x1r, tmp)
  2225  
  2226  		// Multiply the values and store the lower 16-bits into x1r.
  2227  		c.assembler.CompileRegisterToRegister(amd64.PMULLW, x2r, x1r)
  2228  		if o.Signed {
  2229  			// Signed multiply the values and store the higher 16-bits into tmp.
  2230  			c.assembler.CompileRegisterToRegister(amd64.PMULHW, x2r, tmp)
  2231  		} else {
  2232  			// Unsigned multiply the values and store the higher 16-bits into tmp.
  2233  			c.assembler.CompileRegisterToRegister(amd64.PMULHUW, x2r, tmp)
  2234  		}
  2235  
  2236  		// Unpack lower or higher half of vectors (tmp and x1r) and concatenate them.
  2237  		if o.UseLow {
  2238  			c.assembler.CompileRegisterToRegister(amd64.PUNPCKLWD, tmp, x1r)
  2239  		} else {
  2240  			c.assembler.CompileRegisterToRegister(amd64.PUNPCKHWD, tmp, x1r)
  2241  		}
  2242  	case wazeroir.ShapeI32x4:
  2243  		var shuffleOrder byte
  2244  		// Given that the original state of the register is [v1, v2, v3, v4] where vN is a 32-bit (doubleword) lane,
  2245  		if o.UseLow {
  2246  			// This makes the register as [v1, v1, v2, v2]
  2247  			shuffleOrder = 0b01010000
  2248  		} else {
  2249  			// This makes the register as [v3, v3, v4, v4]
  2250  			shuffleOrder = 0b11111010
  2251  		}
  2252  		// See https://www.felixcloutier.com/x86/pshufd
  2253  		c.assembler.CompileRegisterToRegisterWithArg(amd64.PSHUFD, x1r, x1r, shuffleOrder)
  2254  		c.assembler.CompileRegisterToRegisterWithArg(amd64.PSHUFD, x2r, x2r, shuffleOrder)
  2255  
  2256  		var mul asm.Instruction
  2257  		if o.Signed {
  2258  			mul = amd64.PMULDQ
  2259  		} else {
  2260  			mul = amd64.PMULUDQ
  2261  		}
  2262  		c.assembler.CompileRegisterToRegister(mul, x2r, x1r)
  2263  	}
  2264  
  2265  	c.locationStack.markRegisterUnused(x2r)
  2266  	c.pushVectorRuntimeValueLocationOnRegister(x1r)
  2267  	return nil
  2268  }
  2269  
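        // q15mulrSatSMask holds 0x8000 on each 16-bit lane. PMULHRSW computes (x*y + 0x4000) >> 15,
        // which matches q15mulr_sat_s for all inputs except x == y == -0x8000, where it yields
        // -0x8000 instead of the saturated 0x7fff; the mask is used below to detect and flip
        // exactly that lane.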
  2270  var q15mulrSatSMask = [16]byte{
  2271  	0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
  2272  	0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
  2273  }
  2274  
  2275  // compileV128Q15mulrSatS implements compiler.compileV128Q15mulrSatS for amd64.
  2276  func (c *amd64Compiler) compileV128Q15mulrSatS(*wazeroir.OperationV128Q15mulrSatS) error {
  2277  	x2 := c.locationStack.popV128()
  2278  	if err := c.compileEnsureOnRegister(x2); err != nil {
  2279  		return err
  2280  	}
  2281  
  2282  	x1 := c.locationStack.popV128()
  2283  	if err := c.compileEnsureOnRegister(x1); err != nil {
  2284  		return err
  2285  	}
  2286  
  2287  	tmp, err := c.allocateRegister(registerTypeVector)
  2288  	if err != nil {
  2289  		return err
  2290  	}
  2291  
  2292  	x1r, x2r := x1.register, x2.register
  2293  
  2294  	// See https://github.com/WebAssembly/simd/pull/365 for the following logic.
  2295  	if err := c.assembler.CompileStaticConstToRegister(amd64.MOVDQU, asm.NewStaticConst(q15mulrSatSMask[:]), tmp); err != nil {
  2296  		return err
  2297  	}
  2298  
  2299  	c.assembler.CompileRegisterToRegister(amd64.PMULHRSW, x2r, x1r)
  2300  	c.assembler.CompileRegisterToRegister(amd64.PCMPEQW, x1r, tmp)
  2301  	c.assembler.CompileRegisterToRegister(amd64.PXOR, tmp, x1r)
  2302  
  2303  	c.locationStack.markRegisterUnused(x2r)
  2304  	c.pushVectorRuntimeValueLocationOnRegister(x1r)
  2305  	return nil
  2306  }
  2307  
  2308  var (
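        	// allOnesI8x16 holds 0x1 on all sixteen byte lanes, and allOnesI16x8 holds 0x1 on all
        	// eight 16-bit lanes. They serve as the "one" operand of PMADDUBSW/PMADDWD so that the
        	// multiply-add degenerates into a pairwise extended addition.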
  2309  	allOnesI8x16 = [16]byte{0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1}
  2310  	allOnesI16x8 = [16]byte{0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0}
  2311  
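        	// extAddPairwiseI16x8uMask's first 16 bytes hold 0x8000 on each 16-bit lane (used to flip
        	// the sign bit of the unsigned inputs), and its last 16 bytes hold 0x00010000 on each
        	// 32-bit lane (the bias added back after two flipped lanes have been summed).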
  2312  	extAddPairwiseI16x8uMask = [16 * 2]byte{
  2313  		0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
  2314  		0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00,
  2315  	}
  2316  )
  2317  
  2318  // compileV128ExtAddPairwise implements compiler.compileV128ExtAddPairwise for amd64.
  2319  func (c *amd64Compiler) compileV128ExtAddPairwise(o *wazeroir.OperationV128ExtAddPairwise) error {
  2320  	v := c.locationStack.popV128()
  2321  	if err := c.compileEnsureOnRegister(v); err != nil {
  2322  		return err
  2323  	}
  2324  	vr := v.register
  2325  
  2326  	switch o.OriginShape {
  2327  	case wazeroir.ShapeI8x16:
  2328  		allOnesReg, err := c.allocateRegister(registerTypeVector)
  2329  		if err != nil {
  2330  			return err
  2331  		}
  2332  
  2333  		if err = c.assembler.CompileStaticConstToRegister(amd64.MOVDQU,
  2334  			asm.NewStaticConst(allOnesI8x16[:]), allOnesReg); err != nil {
  2335  			return err
  2336  		}
  2337  
  2338  		var result asm.Register
  2339  		// See https://www.felixcloutier.com/x86/pmaddubsw for detail.
  2340  		if o.Signed {
  2341  			// Interpret vr's value as signed byte and multiply with one and add pairwise, which results in pairwise
  2342  			// signed extadd.
  2343  			c.assembler.CompileRegisterToRegister(amd64.PMADDUBSW, vr, allOnesReg)
  2344  			result = allOnesReg
  2345  		} else {
  2346  			// Interpret allOnesReg (all ones) as signed bytes, so each product is the unsigned byte of vr times one, making the multiply-add effectively an unsigned pairwise add.
  2347  			c.assembler.CompileRegisterToRegister(amd64.PMADDUBSW, allOnesReg, vr)
  2348  			result = vr
  2349  		}
  2350  
  2351  		if result != vr {
  2352  			c.locationStack.markRegisterUnused(vr)
  2353  		}
  2354  		c.pushVectorRuntimeValueLocationOnRegister(result)
  2355  	case wazeroir.ShapeI16x8:
  2356  		tmp, err := c.allocateRegister(registerTypeVector)
  2357  		if err != nil {
  2358  			return err
  2359  		}
  2360  
  2361  		if o.Signed {
  2362  			// See https://www.felixcloutier.com/x86/pmaddwd
  2363  			if err = c.assembler.CompileStaticConstToRegister(amd64.MOVDQU,
  2364  				asm.NewStaticConst(allOnesI16x8[:]), tmp); err != nil {
  2365  				return err
  2366  			}
  2367  
  2368  			c.assembler.CompileRegisterToRegister(amd64.PMADDWD, tmp, vr)
  2369  			c.pushVectorRuntimeValueLocationOnRegister(vr)
  2370  		} else {
  2371  
  2372  			if err = c.assembler.CompileStaticConstToRegister(amd64.MOVDQU,
  2373  				asm.NewStaticConst(extAddPairwiseI16x8uMask[:16]), tmp); err != nil {
  2374  				return err
  2375  			}
  2376  
  2377  			// Flip the sign bit of each 16-bit lane on vr by XOR with 0x8000.
  2378  			//
  2379  			// Assuming that vr = [w1, ..., w8], now we have,
  2380  			// 	vr[i] = int16(wi - 0x8000) for i = 1...8
  2381  			c.assembler.CompileRegisterToRegister(amd64.PXOR, tmp, vr)
  2382  
  2383  			if err = c.assembler.CompileStaticConstToRegister(amd64.MOVDQU,
  2384  				asm.NewStaticConst(allOnesI16x8[:]), tmp); err != nil {
  2385  				return err
  2386  			}
  2387  
  2388  			// For i = 1,...,4 (as this results in i32x4 lanes), now we have
  2389  			// vr[i] = int32((wn - 0x8000) + (w(n+1) - 0x8000)) = int32(wn + w(n+1) - 0x10000)
  2390  			c.assembler.CompileRegisterToRegister(amd64.PMADDWD, tmp, vr)
  2391  
  2392  			// tmp[i] = [0x00, 0x00, 0x01, 0x00] = int32(0x10000) = 65536 on each 32-bit lane.
  2393  			if err = c.assembler.CompileStaticConstToRegister(amd64.MOVDQU,
  2394  				asm.NewStaticConst(extAddPairwiseI16x8uMask[16:]), tmp); err != nil {
  2395  				return err
  2396  			}
  2397  
  2398  			// vr[i] = int32(wn + w(n+1) - 0x10000) + 0x10000 = int32(wn + w(n+1)) = uint32(wn + w(n+1)).
  2399  			c.assembler.CompileRegisterToRegister(amd64.PADDD, tmp, vr)
  2400  			c.pushVectorRuntimeValueLocationOnRegister(vr)
  2401  		}
  2402  	}
  2403  	return nil
  2404  }
  2405  
  2406  // compileV128FloatPromote implements compiler.compileV128FloatPromote for amd64.
  2407  func (c *amd64Compiler) compileV128FloatPromote(*wazeroir.OperationV128FloatPromote) error {
  2408  	v := c.locationStack.popV128()
  2409  	if err := c.compileEnsureOnRegister(v); err != nil {
  2410  		return err
  2411  	}
  2412  	vr := v.register
  2413  
  2414  	c.assembler.CompileRegisterToRegister(amd64.CVTPS2PD, vr, vr)
  2415  	c.pushVectorRuntimeValueLocationOnRegister(vr)
  2416  	return nil
  2417  }
  2418  
  2419  // compileV128FloatDemote implements compiler.compileV128FloatDemote for amd64.
  2420  func (c *amd64Compiler) compileV128FloatDemote(*wazeroir.OperationV128FloatDemote) error {
  2421  	v := c.locationStack.popV128()
  2422  	if err := c.compileEnsureOnRegister(v); err != nil {
  2423  		return err
  2424  	}
  2425  	vr := v.register
  2426  
  2427  	c.assembler.CompileRegisterToRegister(amd64.CVTPD2PS, vr, vr)
  2428  	c.pushVectorRuntimeValueLocationOnRegister(vr)
  2429  	return nil
  2430  }
  2431  
  2432  // compileV128Dot implements compiler.compileV128Dot for amd64.
  2433  func (c *amd64Compiler) compileV128Dot(*wazeroir.OperationV128Dot) error {
  2434  	x2 := c.locationStack.popV128()
  2435  	if err := c.compileEnsureOnRegister(x2); err != nil {
  2436  		return err
  2437  	}
  2438  
  2439  	x1 := c.locationStack.popV128()
  2440  	if err := c.compileEnsureOnRegister(x1); err != nil {
  2441  		return err
  2442  	}
  2443  
  2444  	c.assembler.CompileRegisterToRegister(amd64.PMADDWD, x2.register, x1.register)
  2445  
  2446  	c.locationStack.markRegisterUnused(x2.register)
  2447  	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
  2448  	return nil
  2449  }
  2450  
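        // fConvertFromIMask holds 0x43300000, the upper 32 bits of float64(0x1.0p52), in its first two
        // 32-bit lanes. Interleaving it with two uint32 lanes via UNPCKLPS produces the float64 values
        // 0x1.0p52 + uint32(dN), from which 0x1.0p52 is subtracted below to obtain float64(uint32(dN)).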
  2451  var fConvertFromIMask = [16]byte{
  2452  	0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  2453  }
  2454  
  2455  // compileV128FConvertFromI implements compiler.compileV128FConvertFromI for amd64.
  2456  func (c *amd64Compiler) compileV128FConvertFromI(o *wazeroir.OperationV128FConvertFromI) error {
  2457  	v := c.locationStack.popV128()
  2458  	if err := c.compileEnsureOnRegister(v); err != nil {
  2459  		return err
  2460  	}
  2461  	vr := v.register
  2462  
  2463  	switch o.DestinationShape {
  2464  	case wazeroir.ShapeF32x4:
  2465  		if o.Signed {
  2466  			c.assembler.CompileRegisterToRegister(amd64.CVTDQ2PS, vr, vr)
  2467  		} else {
  2468  			tmp, err := c.allocateRegister(registerTypeVector)
  2469  			if err != nil {
  2470  				return err
  2471  			}
  2472  
  2473  			// Copy the value into tmp.
  2474  			c.assembler.CompileRegisterToRegister(amd64.MOVDQA, vr, tmp)
  2475  
  2476  			// Clear the higher 16-bits of tmp.
  2477  			c.assembler.CompileConstToRegister(amd64.PSLLD, 0x10, tmp)
  2478  			c.assembler.CompileConstToRegister(amd64.PSRLD, 0x10, tmp)
  2479  
  2480  			// Subtract tmp (the lower 16-bits) from vr == clear the lower 16-bits of vr.
  2481  			c.assembler.CompileRegisterToRegister(amd64.PSUBD, tmp, vr)
  2482  
  2483  			// Convert the lower 16-bits in tmp.
  2484  			c.assembler.CompileRegisterToRegister(amd64.CVTDQ2PS, tmp, tmp)
  2485  
  2486  			// Logically right shift vr by one and convert it, so vr holds the halved conversion result of the higher 16-bits.
  2487  			c.assembler.CompileConstToRegister(amd64.PSRLD, 1, vr)
  2488  			c.assembler.CompileRegisterToRegister(amd64.CVTDQ2PS, vr, vr)
  2489  
  2490  			// Double it to recover the conversion result of the higher 16-bits.
  2491  			c.assembler.CompileRegisterToRegister(amd64.ADDPS, vr, vr)
  2492  
  2493  			// Get the conversion result by adding tmp (holding the lower 16-bit conversion) to vr.
  2494  			c.assembler.CompileRegisterToRegister(amd64.ADDPS, tmp, vr)
  2495  		}
  2496  	case wazeroir.ShapeF64x2:
  2497  		if o.Signed {
  2498  			c.assembler.CompileRegisterToRegister(amd64.CVTDQ2PD, vr, vr)
  2499  		} else {
  2500  			tmp, err := c.allocateRegister(registerTypeVector)
  2501  			if err != nil {
  2502  				return err
  2503  			}
  2504  
  2505  			// tmp = [0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00]
  2506  			if err = c.assembler.CompileStaticConstToRegister(amd64.MOVDQU, asm.NewStaticConst(fConvertFromIMask[:16]), tmp); err != nil {
  2507  				return err
  2508  			}
  2509  
  2510  			// Given that we have vr = [d1, d2, d3, d4], this results in
  2511  			//	vr = [d1, [0x00, 0x00, 0x30, 0x43], d2, [0x00, 0x00, 0x30, 0x43]]
  2512  			//     = [float64(uint32(d1)) + 0x1.0p52, float64(uint32(d2)) + 0x1.0p52]
  2513  			//     ^See https://stackoverflow.com/questions/13269523/can-all-32-bit-ints-be-exactly-represented-as-a-double
  2514  			c.assembler.CompileRegisterToRegister(amd64.UNPCKLPS, tmp, vr)
  2515  
  2516  			// tmp = [float64(0x1.0p52), float64(0x1.0p52)]
  2517  			if err = c.assembler.CompileStaticConstToRegister(amd64.MOVDQU,
  2518  				asm.NewStaticConst(twop52[:]), tmp); err != nil {
  2519  				return err
  2520  			}
  2521  
  2522  			// Now, we get the result as
  2523  			// 	vr = [float64(uint32(d1)), float64(uint32(d2))]
  2524  			// because the following equality always holds:
  2525  			//  (0x1.0p52 + float64(uint32(x))) - 0x1.0p52 = float64(uint32(x))
  2526  			c.assembler.CompileRegisterToRegister(amd64.SUBPD, tmp, vr)
  2527  		}
  2528  	}
  2529  
  2530  	c.pushVectorRuntimeValueLocationOnRegister(vr)
  2531  	return nil
  2532  }
  2533  
  2534  // compileV128Narrow implements compiler.compileV128Narrow for amd64.
  2535  func (c *amd64Compiler) compileV128Narrow(o *wazeroir.OperationV128Narrow) error {
  2536  	x2 := c.locationStack.popV128()
  2537  	if err := c.compileEnsureOnRegister(x2); err != nil {
  2538  		return err
  2539  	}
  2540  
  2541  	x1 := c.locationStack.popV128()
  2542  	if err := c.compileEnsureOnRegister(x1); err != nil {
  2543  		return err
  2544  	}
  2545  
  2546  	var narrow asm.Instruction
  2547  	switch o.OriginShape {
  2548  	case wazeroir.ShapeI16x8:
  2549  		if o.Signed {
  2550  			narrow = amd64.PACKSSWB
  2551  		} else {
  2552  			narrow = amd64.PACKUSWB
  2553  		}
  2554  	case wazeroir.ShapeI32x4:
  2555  		if o.Signed {
  2556  			narrow = amd64.PACKSSDW
  2557  		} else {
  2558  			narrow = amd64.PACKUSDW
  2559  		}
  2560  	}
  2561  	c.assembler.CompileRegisterToRegister(narrow, x2.register, x1.register)
  2562  
  2563  	c.locationStack.markRegisterUnused(x2.register)
  2564  	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
  2565  	return nil
  2566  }
  2567  
  2568  var (
  2569  	// i32sMaxOnF64x2 holds math.MaxInt32(=2147483647.0) on two f64 lanes.
  2570  	i32sMaxOnF64x2 = [16]byte{
  2571  		0x00, 0x00, 0xc0, 0xff, 0xff, 0xff, 0xdf, 0x41, // float64(2147483647.0)
  2572  		0x00, 0x00, 0xc0, 0xff, 0xff, 0xff, 0xdf, 0x41, // float64(2147483647.0)
  2573  	}
  2574  
  2575  	// i32uMaxOnF64x2 holds math.MaxUint32(=4294967295.0) on two f64 lanes.
  2576  	i32uMaxOnF64x2 = [16]byte{
  2577  		0x00, 0x00, 0xe0, 0xff, 0xff, 0xff, 0xef, 0x41, // float64(4294967295.0)
  2578  		0x00, 0x00, 0xe0, 0xff, 0xff, 0xff, 0xef, 0x41, // float64(4294967295.0)
  2579  	}
  2580  
  2581  	// twop52 holds float64(0x1.0p52) on both f64 lanes. 0x1.0p52 is special in the sense that
  2582  	// with this exponent, the mantissa can represent an arbitrary uint32 value exactly, and
  2583  	// under arithmetic such as addition or subtraction, the resulting floating point keeps
  2584  	// exactly the same 32-bit integer bit pattern in its mantissa.
  2585  	//
  2586  	// Note: the name twop52 is common across various compiler ecosystems.
  2587  	// 	E.g. https://github.com/llvm/llvm-project/blob/92ab024f81e5b64e258b7c3baaf213c7c26fcf40/compiler-rt/lib/builtins/floatdidf.c#L28
  2588  	// 	E.g. https://opensource.apple.com/source/clang/clang-425.0.24/src/projects/compiler-rt/lib/floatdidf.c.auto.html
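        	//
        	// For example, the float64 with bit pattern 0x4330000000000007 has the value 0x1.0p52 + 7,
        	// so subtracting 0x1.0p52 from it yields exactly 7.0 = float64(uint32(7)).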
  2589  	twop52 = [16]byte{
  2590  		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, // float64(0x1.0p52)
  2591  		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, // float64(0x1.0p52)
  2592  	}
  2593  )
  2594  
  2595  // compileV128ITruncSatFromF implements compiler.compileV128ITruncSatFromF for amd64.
  2596  func (c *amd64Compiler) compileV128ITruncSatFromF(o *wazeroir.OperationV128ITruncSatFromF) error {
  2597  	v := c.locationStack.popV128()
  2598  	if err := c.compileEnsureOnRegister(v); err != nil {
  2599  		return err
  2600  	}
  2601  	vr := v.register
  2602  
  2603  	tmp, err := c.allocateRegister(registerTypeVector)
  2604  	if err != nil {
  2605  		return err
  2606  	}
  2607  
  2608  	c.locationStack.markRegisterUsed(tmp)
  2609  
  2610  	switch o.OriginShape {
  2611  	case wazeroir.ShapeF32x4:
  2612  		if o.Signed {
  2613  			// Copy the value into tmp.
  2614  			c.assembler.CompileRegisterToRegister(amd64.MOVDQA, vr, tmp)
  2615  
  2616  			// Assuming we have vr = [v1, v2, v3, v4].
  2617  			//
  2618  			// Set all bits if lane is not NaN on tmp.
  2619  			// tmp[i] = 0xffffffff  if vi != NaN
  2620  			//        = 0           if vi == NaN
  2621  			c.assembler.CompileRegisterToRegister(amd64.CMPEQPS, tmp, tmp)
  2622  
  2623  			// Clear NaN lanes on vr, meaning that
  2624  			// 	vr[i] = vi  if vi != NaN
  2625  			//	        0   if vi == NaN
  2626  			c.assembler.CompileRegisterToRegister(amd64.ANDPS, tmp, vr)
  2627  
  2628  			// tmp[i] = ^vi         if vi != NaN
  2629  			//        = 0xffffffff  if vi == NaN
  2630  			// which means that tmp[i] & 0x80000000 != 0 if and only if vi is negative.
  2631  			c.assembler.CompileRegisterToRegister(amd64.PXOR, vr, tmp)
  2632  
  2633  			// vr[i] = int32(vi)   if vi != NaN and vr is not overflowing.
  2634  			//       = 0x80000000  if vi != NaN and vr is overflowing (See https://www.felixcloutier.com/x86/cvttps2dq)
  2635  			//       = 0           if vi == NaN
  2636  			c.assembler.CompileRegisterToRegister(amd64.CVTTPS2DQ, vr, vr)
  2637  
  2638  			// Below, we have to convert 0x80000000 into 0x7FFFFFFF for positive overflowing lane.
  2639  			//
  2640  			// tmp[i] = 0x80000000                         if vi is positive
  2641  			//        = any satisfying any&0x80000000 = 0  if vi is negative or zero.
  2642  			c.assembler.CompileRegisterToRegister(amd64.PAND, vr, tmp)
  2643  
  2644  			// Arithmetic right shifting tmp by 31, meaning that we have
  2645  			// tmp[i] = 0xffffffff if vi is positive, 0 otherwise.
  2646  			c.assembler.CompileConstToRegister(amd64.PSRAD, 0x1f, tmp)
  2647  
  2648  			// Flipping 0x80000000 if vi is positive, otherwise keep intact.
  2649  			c.assembler.CompileRegisterToRegister(amd64.PXOR, tmp, vr)
  2650  		} else {
  2651  			tmp2, err := c.allocateRegister(registerTypeVector)
  2652  			if err != nil {
  2653  				return err
  2654  			}
  2655  
  2656  			// See https://github.com/bytecodealliance/wasmtime/pull/2440
  2657  			// Note: even v8 doesn't seem to have support for this i32x4.trunc_sat_f32x4_u.
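        			//
        			// The sequence below clamps negative (and NaN) lanes to zero, converts via the signed
        			// path, separately converts (v - 2147483648.0) for lanes beyond the signed range, and
        			// adds the (zero-clamped) correction back, so that lanes in [2^31, 2^32) convert
        			// correctly and lanes >= 2^32 saturate to 0xffffffff.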
  2658  			c.assembler.CompileRegisterToRegister(amd64.PXOR, tmp, tmp)
  2659  			c.assembler.CompileRegisterToRegister(amd64.MAXPS, tmp, vr)
  2660  			c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, tmp, tmp)
  2661  			c.assembler.CompileConstToRegister(amd64.PSRLD, 0x1, tmp)
  2662  			c.assembler.CompileRegisterToRegister(amd64.CVTDQ2PS, tmp, tmp)
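        			// Keep a float copy of the lanes in tmp2, then do a signed truncation on vr: lanes with
        			// ci >= 2^31 overflow int32 and become 0x80000000, the rest become int32(ci).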
  2663  			c.assembler.CompileRegisterToRegister(amd64.MOVDQA, vr, tmp2)
  2664  			c.assembler.CompileRegisterToRegister(amd64.CVTTPS2DQ, vr, vr)
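        			// tmp2[i] = ci - float32(0x1.0p31), and CMPLEPS turns tmp into an all-ones mask exactly
        			// for the lanes with ci >= 2^32, i.e. the lanes that must saturate to math.MaxUint32.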
  2665  			c.assembler.CompileRegisterToRegister(amd64.SUBPS, tmp, tmp2)
  2666  			c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPS, tmp2, tmp, 0x2) // == CMPLEPS
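        			// Truncate the bias-subtracted copy: lanes with ci in [2^31, 2^32) become int32(ci - 2^31),
        			// lanes with ci < 2^31 become a negative int32, and lanes with ci >= 2^32 overflow to 0x80000000.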
  2667  			c.assembler.CompileRegisterToRegister(amd64.CVTTPS2DQ, tmp2, tmp2)
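        			// For the saturating lanes the previous truncation produced 0x80000000; XOR with the all-ones
        			// mask turns that into 0x7FFFFFFF. Non-saturating lanes (mask = 0) are left untouched.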
  2668  			c.assembler.CompileRegisterToRegister(amd64.PXOR, tmp, tmp2)
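        			// Zero tmp again and clamp the negative lanes of tmp2 to zero via signed max, so tmp2 now holds
        			// the correction: int32(ci - 2^31) for ci in [2^31, 2^32), 0x7FFFFFFF for saturating lanes, 0 otherwise.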
  2669  			c.assembler.CompileRegisterToRegister(amd64.PXOR, tmp, tmp)
  2670  			c.assembler.CompileRegisterToRegister(amd64.PMAXSD, tmp, tmp2)
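        			// Add the correction to the signed truncation in vr:
        			//  vr[i] = int32(ci)                       if ci < 2^31
        			//        = 0x80000000 + int32(ci - 2^31)   if 2^31 <= ci < 2^32  (= uint32(ci))
        			//        = 0x80000000 + 0x7FFFFFFF         = math.MaxUint32 for saturating lanes.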
  2671  			c.assembler.CompileRegisterToRegister(amd64.PADDD, tmp2, vr)
  2672  		}
  2673  	case wazeroir.ShapeF64x2:
  2674  		tmp2, err := c.allocateRegister(registerTypeVector)
  2675  		if err != nil {
  2676  			return err
  2677  		}
  2678  
  2679  		if o.Signed {
  2680  			// Copy the value into tmp.
  2681  			c.assembler.CompileRegisterToRegister(amd64.MOVDQA, vr, tmp)
  2682  
  2683  			// Set all bits for non-NaN lanes, zeros otherwise.
  2684  			// I.e. tmp[i] = 0xffffffff_ffffffff if vi != NaN, 0 otherwise.
  2685  			c.assembler.CompileRegisterToRegister(amd64.CMPEQPD, tmp, tmp)
  2686  
  2687  			// Load float64(2147483647) into each 64-bit lane of tmp2.
  2688  			if err = c.assembler.CompileStaticConstToRegister(amd64.MOVUPD, asm.NewStaticConst(i32sMaxOnF64x2[:]), tmp2); err != nil {
  2689  				return err
  2690  			}
  2691  
  2692  			// tmp[i] = float64(2147483647) if vi != NaN, 0 otherwise.
  2693  			c.assembler.CompileRegisterToRegister(amd64.ANDPS, tmp2, tmp)
  2694  
  2695  			// MINPD returns the second (source) operand when either input is NaN, so we have
  2696  			//  vr[i] = min(vi, 2147483647)   if vi != NaN
  2697  			//        = 0                     if vi == NaN
  2698  			c.assembler.CompileRegisterToRegister(amd64.MINPD, tmp, vr)
  2699  
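        			// CVTTPD2DQ truncates the two doubles into int32s in the lower 64 bits of vr and zeroes the
        			// upper 64 bits. Lanes below math.MinInt32 become 0x80000000 (the saturated value), and the
        			// MINPD above already capped the positive side at 2147483647.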
  2700  			c.assembler.CompileRegisterToRegister(amd64.CVTTPD2DQ, vr, vr)
  2701  		} else {
  2702  			// Clear all bits on tmp.
  2703  			c.assembler.CompileRegisterToRegister(amd64.PXOR, tmp, tmp)
  2704  
  2705  			//  vr[i] = vi   if vi != NaN && vi > 0
  2706  			//        = 0    if vi == NaN || vi <= 0
  2707  			c.assembler.CompileRegisterToRegister(amd64.MAXPD, tmp, vr)
  2708  
  2709  			// tmp2[i] = float64(math.MaxUint32)
  2710  			if err = c.assembler.CompileStaticConstToRegister(amd64.MOVUPD, asm.NewStaticConst(i32uMaxOnF64x2[:]), tmp2); err != nil {
  2711  				return err
  2712  			}
  2713  
  2714  			// vr[i] = min(vi, float64(math.MaxUint32))   if vi != NaN && vi > 0
  2715  			//       = 0                                  otherwise (vi == NaN || vi <= 0)
  2716  			c.assembler.CompileRegisterToRegister(amd64.MINPD, tmp2, vr)
  2717  
  2718  			// Truncate the floating-point values toward zero (ROUNDPD with rounding mode 0b11).
  2719  			c.assembler.CompileRegisterToRegisterWithArg(amd64.ROUNDPD, vr, vr, 0x3)
  2720  
  2721  			// tmp2[i] = float64(0x1.0p52)
  2722  			if err = c.assembler.CompileStaticConstToRegister(amd64.MOVUPD, asm.NewStaticConst(twop52[:]), tmp2); err != nil {
  2723  				return err
  2724  			}
  2725  
  2726  			// vr[i] = float64(0x1.0p52) + wi, where wi is vi saturated into the uint32 range:
  2727  			// 	wi = uint32(vi) if vi != NaN && 0 < vi <= math.MaxUint32, math.MaxUint32 if vi > math.MaxUint32, 0 otherwise.
  2728  			//
  2729  			// Since 0 <= wi < 2^32, this addition is exact and the lower 32 bits of vr[i] are exactly the bits of wi.
  2730  			c.assembler.CompileRegisterToRegister(amd64.ADDPD, tmp2, vr)
  2731  
  2732  			// At this point, viewed as 32x4 lanes, we have
  2733  			// 	vr  = [w0, 0x43300000, w1, 0x43300000]   (0x43300000 is the upper half of float64(0x1.0p52)+wi)
  2734  			//  tmp = [0, 0, 0, 0]
  2735  			// Therefore, SHUFPS with 0b00_00_10_00 results in
  2736  			//	vr = [vr[00], vr[10], tmp[00], tmp[00]] = [w0, w1, 0, 0]
  2737  			// meaning that for i = 0 and 1 we have
  2738  			//  vr[i] = wi, the saturated uint32 value of vi,
  2739  			// while the upper two lanes are zeroed as required.
  2740  			c.assembler.CompileRegisterToRegisterWithArg(amd64.SHUFPS, tmp, vr, 0b00_00_10_00)
  2741  		}
  2742  	}
  2743  
  2744  	c.locationStack.markRegisterUnused(tmp)
  2745  	c.pushVectorRuntimeValueLocationOnRegister(vr)
  2746  	return nil
  2747  }