github.com/gagliardetto/golang-go@v0.0.0-20201020153340-53909ea70814/cmd/compile/internal/amd64/ssa.go

     1  // Copyright 2016 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package amd64
     6  
     7  import (
     8  	"fmt"
     9  	"math"
    10  
    11  	"github.com/gagliardetto/golang-go/cmd/compile/internal/gc"
    12  	"github.com/gagliardetto/golang-go/cmd/compile/internal/logopt"
    13  	"github.com/gagliardetto/golang-go/cmd/compile/internal/ssa"
    14  	"github.com/gagliardetto/golang-go/cmd/compile/internal/types"
    15  	"github.com/gagliardetto/golang-go/cmd/internal/obj"
    16  	"github.com/gagliardetto/golang-go/cmd/internal/obj/x86"
    17  )
    18  
    19  // ssaMarkMoves marks any MOVXconst ops that need to avoid clobbering flags.
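        // The mark is consumed in ssaGenValue below: a zero constant is normally
        // lowered to a flag-clobbering XOR, so a MOVLconst/MOVQconst whose Aux is
        // non-nil keeps the plain MOV encoding instead.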
    20  func ssaMarkMoves(s *gc.SSAGenState, b *ssa.Block) {
    21  	flive := b.FlagsLiveAtEnd
    22  	for _, c := range b.ControlValues() {
    23  		flive = c.Type.IsFlags() || flive
    24  	}
    25  	for i := len(b.Values) - 1; i >= 0; i-- {
    26  		v := b.Values[i]
    27  		if flive && (v.Op == ssa.OpAMD64MOVLconst || v.Op == ssa.OpAMD64MOVQconst) {
    28  			// The "mark" is any non-nil Aux value.
    29  			v.Aux = v
    30  		}
    31  		if v.Type.IsFlags() {
    32  			flive = false
    33  		}
    34  		for _, a := range v.Args {
    35  			if a.Type.IsFlags() {
    36  				flive = true
    37  			}
    38  		}
    39  	}
    40  }
    41  
    42  // loadByType returns the load instruction of the given type.
    43  func loadByType(t *types.Type) obj.As {
    44  	// Avoid partial register write
    45  	if !t.IsFloat() && t.Size() <= 2 {
    46  		if t.Size() == 1 {
    47  			return x86.AMOVBLZX
    48  		} else {
    49  			return x86.AMOVWLZX
    50  		}
    51  	}
    52  	// Otherwise, there's no difference between load and store opcodes.
    53  	return storeByType(t)
    54  }
    55  
    56  // storeByType returns the store instruction of the given type.
    57  func storeByType(t *types.Type) obj.As {
    58  	width := t.Size()
    59  	if t.IsFloat() {
    60  		switch width {
    61  		case 4:
    62  			return x86.AMOVSS
    63  		case 8:
    64  			return x86.AMOVSD
    65  		}
    66  	} else {
    67  		switch width {
    68  		case 1:
    69  			return x86.AMOVB
    70  		case 2:
    71  			return x86.AMOVW
    72  		case 4:
    73  			return x86.AMOVL
    74  		case 8:
    75  			return x86.AMOVQ
    76  		}
    77  	}
    78  	panic("bad store type")
    79  }
    80  
    81  // moveByType returns the reg->reg move instruction of the given type.
    82  func moveByType(t *types.Type) obj.As {
    83  	if t.IsFloat() {
    84  		// Moving the whole sse2 register is faster
    85  		// than moving just the correct low portion of it.
    86  		// There is no xmm->xmm move with 1 byte opcode,
    87  		// so use movups, which has 2 byte opcode.
    88  		return x86.AMOVUPS
    89  	} else {
    90  		switch t.Size() {
    91  		case 1:
    92  			// Avoids partial register write
    93  			return x86.AMOVL
    94  		case 2:
    95  			return x86.AMOVL
    96  		case 4:
    97  			return x86.AMOVL
    98  		case 8:
    99  			return x86.AMOVQ
   100  		case 16:
   101  			return x86.AMOVUPS // int128s are in SSE registers
   102  		default:
   103  			panic(fmt.Sprintf("bad int register width %d:%s", t.Size(), t))
   104  		}
   105  	}
   106  }
   107  
   108  // opregreg emits instructions for
   109  //     dest := dest(To) op src(From)
   110  // and also returns the created obj.Prog so it
   111  // may be further adjusted (offset, scale, etc).
   112  func opregreg(s *gc.SSAGenState, op obj.As, dest, src int16) *obj.Prog {
   113  	p := s.Prog(op)
   114  	p.From.Type = obj.TYPE_REG
   115  	p.To.Type = obj.TYPE_REG
   116  	p.To.Reg = dest
   117  	p.From.Reg = src
   118  	return p
   119  }
   120  
   121  // memIdx fills out a as an indexed memory reference for v.
   122  // It assumes that the base register and the index register
   123  // are v.Args[0].Reg() and v.Args[1].Reg(), respectively.
   124  // The caller must still use gc.AddAux/gc.AddAux2 to handle v.Aux as necessary.
   125  func memIdx(a *obj.Addr, v *ssa.Value) {
   126  	r, i := v.Args[0].Reg(), v.Args[1].Reg()
   127  	a.Type = obj.TYPE_MEM
   128  	a.Scale = v.Op.Scale()
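        	// SP cannot be used as an index register; with scale 1 the base and
        	// index are interchangeable, so swap them to keep SP as the base.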
   129  	if a.Scale == 1 && i == x86.REG_SP {
   130  		r, i = i, r
   131  	}
   132  	a.Reg = r
   133  	a.Index = i
   134  }
   135  
   136  // DUFFZERO consists of repeated blocks of 4 MOVUPSs + LEAQ,
   137  // See runtime/mkduff.go.
   138  func duffStart(size int64) int64 {
   139  	x, _ := duff(size)
   140  	return x
   141  }
   142  func duffAdj(size int64) int64 {
   143  	_, x := duff(size)
   144  	return x
   145  }
   146  
   147  // duff returns the offset (from duffzero, in bytes) and pointer adjust (in bytes)
   148  // required to use the duffzero mechanism for a block of the given size.
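        //
        // Worked example (assuming the dz* constants from ggen.go in this package:
        // dzClearStep=16, dzBlockLen=4, dzBlocks=16): duff(1024) gives steps=64,
        // blocks=16, so off=0 and adj=0 -- the call jumps to the very start of
        // Duffzero and needs no pointer adjustment. duff(48) needs 3 MOVUPSs from
        // the tail of one block, so off backs up into that block and adj is -16,
        // pre-adjusting DI so the three stores cover exactly the requested 48 bytes.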
   149  func duff(size int64) (int64, int64) {
   150  	if size < 32 || size > 1024 || size%dzClearStep != 0 {
   151  		panic("bad duffzero size")
   152  	}
   153  	steps := size / dzClearStep
   154  	blocks := steps / dzBlockLen
   155  	steps %= dzBlockLen
   156  	off := dzBlockSize * (dzBlocks - blocks)
   157  	var adj int64
   158  	if steps != 0 {
   159  		off -= dzLeaqSize
   160  		off -= dzMovSize * steps
   161  		adj -= dzClearStep * (dzBlockLen - steps)
   162  	}
   163  	return off, adj
   164  }
   165  
   166  func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
   167  	switch v.Op {
   168  	case ssa.OpAMD64VFMADD231SD:
   169  		p := s.Prog(v.Op.Asm())
   170  		p.From = obj.Addr{Type: obj.TYPE_REG, Reg: v.Args[2].Reg()}
   171  		p.To = obj.Addr{Type: obj.TYPE_REG, Reg: v.Reg()}
   172  		p.SetFrom3(obj.Addr{Type: obj.TYPE_REG, Reg: v.Args[1].Reg()})
   173  		if v.Reg() != v.Args[0].Reg() {
   174  			v.Fatalf("input[0] and output not in same register %s", v.LongString())
   175  		}
   176  	case ssa.OpAMD64ADDQ, ssa.OpAMD64ADDL:
   177  		r := v.Reg()
   178  		r1 := v.Args[0].Reg()
   179  		r2 := v.Args[1].Reg()
   180  		switch {
   181  		case r == r1:
   182  			p := s.Prog(v.Op.Asm())
   183  			p.From.Type = obj.TYPE_REG
   184  			p.From.Reg = r2
   185  			p.To.Type = obj.TYPE_REG
   186  			p.To.Reg = r
   187  		case r == r2:
   188  			p := s.Prog(v.Op.Asm())
   189  			p.From.Type = obj.TYPE_REG
   190  			p.From.Reg = r1
   191  			p.To.Type = obj.TYPE_REG
   192  			p.To.Reg = r
   193  		default:
   194  			var asm obj.As
   195  			if v.Op == ssa.OpAMD64ADDQ {
   196  				asm = x86.ALEAQ
   197  			} else {
   198  				asm = x86.ALEAL
   199  			}
   200  			p := s.Prog(asm)
   201  			p.From.Type = obj.TYPE_MEM
   202  			p.From.Reg = r1
   203  			p.From.Scale = 1
   204  			p.From.Index = r2
   205  			p.To.Type = obj.TYPE_REG
   206  			p.To.Reg = r
   207  		}
   208  	// 2-address opcode arithmetic
   209  	case ssa.OpAMD64SUBQ, ssa.OpAMD64SUBL,
   210  		ssa.OpAMD64MULQ, ssa.OpAMD64MULL,
   211  		ssa.OpAMD64ANDQ, ssa.OpAMD64ANDL,
   212  		ssa.OpAMD64ORQ, ssa.OpAMD64ORL,
   213  		ssa.OpAMD64XORQ, ssa.OpAMD64XORL,
   214  		ssa.OpAMD64SHLQ, ssa.OpAMD64SHLL,
   215  		ssa.OpAMD64SHRQ, ssa.OpAMD64SHRL, ssa.OpAMD64SHRW, ssa.OpAMD64SHRB,
   216  		ssa.OpAMD64SARQ, ssa.OpAMD64SARL, ssa.OpAMD64SARW, ssa.OpAMD64SARB,
   217  		ssa.OpAMD64ROLQ, ssa.OpAMD64ROLL, ssa.OpAMD64ROLW, ssa.OpAMD64ROLB,
   218  		ssa.OpAMD64RORQ, ssa.OpAMD64RORL, ssa.OpAMD64RORW, ssa.OpAMD64RORB,
   219  		ssa.OpAMD64ADDSS, ssa.OpAMD64ADDSD, ssa.OpAMD64SUBSS, ssa.OpAMD64SUBSD,
   220  		ssa.OpAMD64MULSS, ssa.OpAMD64MULSD, ssa.OpAMD64DIVSS, ssa.OpAMD64DIVSD,
   221  		ssa.OpAMD64PXOR,
   222  		ssa.OpAMD64BTSL, ssa.OpAMD64BTSQ,
   223  		ssa.OpAMD64BTCL, ssa.OpAMD64BTCQ,
   224  		ssa.OpAMD64BTRL, ssa.OpAMD64BTRQ:
   225  		r := v.Reg()
   226  		if r != v.Args[0].Reg() {
   227  			v.Fatalf("input[0] and output not in same register %s", v.LongString())
   228  		}
   229  		opregreg(s, v.Op.Asm(), r, v.Args[1].Reg())
   230  
   231  	case ssa.OpAMD64DIVQU, ssa.OpAMD64DIVLU, ssa.OpAMD64DIVWU:
   232  		// Arg[0] (the dividend) is in AX.
   233  		// Arg[1] (the divisor) can be in any other register.
   234  		// Result[0] (the quotient) is in AX.
   235  		// Result[1] (the remainder) is in DX.
   236  		r := v.Args[1].Reg()
   237  
   238  		// Zero extend dividend.
   239  		c := s.Prog(x86.AXORL)
   240  		c.From.Type = obj.TYPE_REG
   241  		c.From.Reg = x86.REG_DX
   242  		c.To.Type = obj.TYPE_REG
   243  		c.To.Reg = x86.REG_DX
   244  
   245  		// Issue divide.
   246  		p := s.Prog(v.Op.Asm())
   247  		p.From.Type = obj.TYPE_REG
   248  		p.From.Reg = r
   249  
   250  	case ssa.OpAMD64DIVQ, ssa.OpAMD64DIVL, ssa.OpAMD64DIVW:
   251  		// Arg[0] (the dividend) is in AX.
   252  		// Arg[1] (the divisor) can be in any other register.
   253  		// Result[0] (the quotient) is in AX.
   254  		// Result[1] (the remainder) is in DX.
   255  		r := v.Args[1].Reg()
   256  		var j1 *obj.Prog
   257  
   258  		// CPU faults upon signed overflow, which occurs when the most
   259  		// negative int is divided by -1. Handle divide by -1 as a special case.
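        		// For example, math.MinInt64 / -1 overflows int64: the Go spec defines
        		// the quotient as math.MinInt64 with remainder 0, which the NEG/XOR
        		// fixup below produces, whereas the hardware IDIV would fault.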
   260  		if ssa.NeedsFixUp(v) {
   261  			var c *obj.Prog
   262  			switch v.Op {
   263  			case ssa.OpAMD64DIVQ:
   264  				c = s.Prog(x86.ACMPQ)
   265  			case ssa.OpAMD64DIVL:
   266  				c = s.Prog(x86.ACMPL)
   267  			case ssa.OpAMD64DIVW:
   268  				c = s.Prog(x86.ACMPW)
   269  			}
   270  			c.From.Type = obj.TYPE_REG
   271  			c.From.Reg = r
   272  			c.To.Type = obj.TYPE_CONST
   273  			c.To.Offset = -1
   274  			j1 = s.Prog(x86.AJEQ)
   275  			j1.To.Type = obj.TYPE_BRANCH
   276  		}
   277  
   278  		// Sign extend dividend.
   279  		switch v.Op {
   280  		case ssa.OpAMD64DIVQ:
   281  			s.Prog(x86.ACQO)
   282  		case ssa.OpAMD64DIVL:
   283  			s.Prog(x86.ACDQ)
   284  		case ssa.OpAMD64DIVW:
   285  			s.Prog(x86.ACWD)
   286  		}
   287  
   288  		// Issue divide.
   289  		p := s.Prog(v.Op.Asm())
   290  		p.From.Type = obj.TYPE_REG
   291  		p.From.Reg = r
   292  
   293  		if j1 != nil {
   294  			// Skip over -1 fixup code.
   295  			j2 := s.Prog(obj.AJMP)
   296  			j2.To.Type = obj.TYPE_BRANCH
   297  
   298  			// Issue -1 fixup code.
   299  			// n / -1 = -n
   300  			var n1 *obj.Prog
   301  			switch v.Op {
   302  			case ssa.OpAMD64DIVQ:
   303  				n1 = s.Prog(x86.ANEGQ)
   304  			case ssa.OpAMD64DIVL:
   305  				n1 = s.Prog(x86.ANEGL)
   306  			case ssa.OpAMD64DIVW:
   307  				n1 = s.Prog(x86.ANEGW)
   308  			}
   309  			n1.To.Type = obj.TYPE_REG
   310  			n1.To.Reg = x86.REG_AX
   311  
   312  			// n % -1 == 0
   313  			n2 := s.Prog(x86.AXORL)
   314  			n2.From.Type = obj.TYPE_REG
   315  			n2.From.Reg = x86.REG_DX
   316  			n2.To.Type = obj.TYPE_REG
   317  			n2.To.Reg = x86.REG_DX
   318  
   319  			// TODO(khr): issue only the -1 fixup code we need.
   320  			// For instance, if only the quotient is used, no point in zeroing the remainder.
   321  
   322  			j1.To.Val = n1
   323  			j2.To.Val = s.Pc()
   324  		}
   325  
   326  	case ssa.OpAMD64HMULQ, ssa.OpAMD64HMULL, ssa.OpAMD64HMULQU, ssa.OpAMD64HMULLU:
   327  		// The frontend rewrites constant division by 8/16/32-bit integers into
   328  		// HMUL by a constant.
   329  		// SSA rewrites generate the 64-bit versions.
   330  
   331  		// Arg[0] is already in AX as it's the only register we allow
   332  		// and DX is the only output we care about (the high bits)
   333  		p := s.Prog(v.Op.Asm())
   334  		p.From.Type = obj.TYPE_REG
   335  		p.From.Reg = v.Args[1].Reg()
   336  
   337  		// IMULB puts the high portion in AH instead of DL,
   338  		// so move it to DL for consistency
   339  		if v.Type.Size() == 1 {
   340  			m := s.Prog(x86.AMOVB)
   341  			m.From.Type = obj.TYPE_REG
   342  			m.From.Reg = x86.REG_AH
   343  			m.To.Type = obj.TYPE_REG
   344  			m.To.Reg = x86.REG_DX
   345  		}
   346  
   347  	case ssa.OpAMD64MULQU, ssa.OpAMD64MULLU:
   348  		// Arg[0] is already in AX as it's the only register we allow
   349  		// results lo in AX
   350  		p := s.Prog(v.Op.Asm())
   351  		p.From.Type = obj.TYPE_REG
   352  		p.From.Reg = v.Args[1].Reg()
   353  
   354  	case ssa.OpAMD64MULQU2:
   355  		// Arg[0] is already in AX as it's the only register we allow
   356  		// results hi in DX, lo in AX
   357  		p := s.Prog(v.Op.Asm())
   358  		p.From.Type = obj.TYPE_REG
   359  		p.From.Reg = v.Args[1].Reg()
   360  
   361  	case ssa.OpAMD64DIVQU2:
   362  		// Arg[0], Arg[1] are already in DX, AX, as they're the only registers we allow
   363  		// results q in AX, r in DX
   364  		p := s.Prog(v.Op.Asm())
   365  		p.From.Type = obj.TYPE_REG
   366  		p.From.Reg = v.Args[2].Reg()
   367  
   368  	case ssa.OpAMD64AVGQU:
   369  		// compute (x+y)/2 unsigned.
   370  		// Do a 64-bit add, the overflow goes into the carry.
   371  		// Shift right once and pull the carry back into the 63rd bit.
   372  		r := v.Reg()
   373  		if r != v.Args[0].Reg() {
   374  			v.Fatalf("input[0] and output not in same register %s", v.LongString())
   375  		}
   376  		p := s.Prog(x86.AADDQ)
   377  		p.From.Type = obj.TYPE_REG
   378  		p.To.Type = obj.TYPE_REG
   379  		p.To.Reg = r
   380  		p.From.Reg = v.Args[1].Reg()
   381  		p = s.Prog(x86.ARCRQ)
   382  		p.From.Type = obj.TYPE_CONST
   383  		p.From.Offset = 1
   384  		p.To.Type = obj.TYPE_REG
   385  		p.To.Reg = r
   386  
   387  	case ssa.OpAMD64ADDQcarry, ssa.OpAMD64ADCQ:
   388  		r := v.Reg0()
   389  		r0 := v.Args[0].Reg()
   390  		r1 := v.Args[1].Reg()
   391  		switch r {
   392  		case r0:
   393  			p := s.Prog(v.Op.Asm())
   394  			p.From.Type = obj.TYPE_REG
   395  			p.From.Reg = r1
   396  			p.To.Type = obj.TYPE_REG
   397  			p.To.Reg = r
   398  		case r1:
   399  			p := s.Prog(v.Op.Asm())
   400  			p.From.Type = obj.TYPE_REG
   401  			p.From.Reg = r0
   402  			p.To.Type = obj.TYPE_REG
   403  			p.To.Reg = r
   404  		default:
   405  			v.Fatalf("output not in same register as an input %s", v.LongString())
   406  		}
   407  
   408  	case ssa.OpAMD64SUBQborrow, ssa.OpAMD64SBBQ:
   409  		p := s.Prog(v.Op.Asm())
   410  		p.From.Type = obj.TYPE_REG
   411  		p.From.Reg = v.Args[1].Reg()
   412  		p.To.Type = obj.TYPE_REG
   413  		p.To.Reg = v.Reg0()
   414  
   415  	case ssa.OpAMD64ADDQconstcarry, ssa.OpAMD64ADCQconst, ssa.OpAMD64SUBQconstborrow, ssa.OpAMD64SBBQconst:
   416  		p := s.Prog(v.Op.Asm())
   417  		p.From.Type = obj.TYPE_CONST
   418  		p.From.Offset = v.AuxInt
   419  		p.To.Type = obj.TYPE_REG
   420  		p.To.Reg = v.Reg0()
   421  
   422  	case ssa.OpAMD64ADDQconst, ssa.OpAMD64ADDLconst:
   423  		r := v.Reg()
   424  		a := v.Args[0].Reg()
   425  		if r == a {
   426  			switch v.AuxInt {
   427  			case 1:
   428  				var asm obj.As
   429  				// Software optimization manual recommends add $1,reg.
   430  				// But inc/dec is 1 byte smaller. ICC always uses inc;
   431  				// Clang/GCC choose depending on flags, but prefer add.
   432  				// Experiments show that inc/dec is both a little faster
   433  				// and makes the binary a little smaller.
   434  				if v.Op == ssa.OpAMD64ADDQconst {
   435  					asm = x86.AINCQ
   436  				} else {
   437  					asm = x86.AINCL
   438  				}
   439  				p := s.Prog(asm)
   440  				p.To.Type = obj.TYPE_REG
   441  				p.To.Reg = r
   442  				return
   443  			case -1:
   444  				var asm obj.As
   445  				if v.Op == ssa.OpAMD64ADDQconst {
   446  					asm = x86.ADECQ
   447  				} else {
   448  					asm = x86.ADECL
   449  				}
   450  				p := s.Prog(asm)
   451  				p.To.Type = obj.TYPE_REG
   452  				p.To.Reg = r
   453  				return
   454  			case 0x80:
   455  				// 'SUBQ $-0x80, r' is shorter to encode than, and functionally
   456  				// equivalent to, 'ADDQ $0x80, r'.
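        				// ($-0x80 fits in the sign-extended 8-bit immediate form, which
        				// covers -128..127, while $0x80 would need a full 32-bit immediate.)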
   457  				asm := x86.ASUBL
   458  				if v.Op == ssa.OpAMD64ADDQconst {
   459  					asm = x86.ASUBQ
   460  				}
   461  				p := s.Prog(asm)
   462  				p.From.Type = obj.TYPE_CONST
   463  				p.From.Offset = -0x80
   464  				p.To.Type = obj.TYPE_REG
   465  				p.To.Reg = r
   466  				return
   467  
   468  			}
   469  			p := s.Prog(v.Op.Asm())
   470  			p.From.Type = obj.TYPE_CONST
   471  			p.From.Offset = v.AuxInt
   472  			p.To.Type = obj.TYPE_REG
   473  			p.To.Reg = r
   474  			return
   475  		}
   476  		var asm obj.As
   477  		if v.Op == ssa.OpAMD64ADDQconst {
   478  			asm = x86.ALEAQ
   479  		} else {
   480  			asm = x86.ALEAL
   481  		}
   482  		p := s.Prog(asm)
   483  		p.From.Type = obj.TYPE_MEM
   484  		p.From.Reg = a
   485  		p.From.Offset = v.AuxInt
   486  		p.To.Type = obj.TYPE_REG
   487  		p.To.Reg = r
   488  
   489  	case ssa.OpAMD64CMOVQEQ, ssa.OpAMD64CMOVLEQ, ssa.OpAMD64CMOVWEQ,
   490  		ssa.OpAMD64CMOVQLT, ssa.OpAMD64CMOVLLT, ssa.OpAMD64CMOVWLT,
   491  		ssa.OpAMD64CMOVQNE, ssa.OpAMD64CMOVLNE, ssa.OpAMD64CMOVWNE,
   492  		ssa.OpAMD64CMOVQGT, ssa.OpAMD64CMOVLGT, ssa.OpAMD64CMOVWGT,
   493  		ssa.OpAMD64CMOVQLE, ssa.OpAMD64CMOVLLE, ssa.OpAMD64CMOVWLE,
   494  		ssa.OpAMD64CMOVQGE, ssa.OpAMD64CMOVLGE, ssa.OpAMD64CMOVWGE,
   495  		ssa.OpAMD64CMOVQHI, ssa.OpAMD64CMOVLHI, ssa.OpAMD64CMOVWHI,
   496  		ssa.OpAMD64CMOVQLS, ssa.OpAMD64CMOVLLS, ssa.OpAMD64CMOVWLS,
   497  		ssa.OpAMD64CMOVQCC, ssa.OpAMD64CMOVLCC, ssa.OpAMD64CMOVWCC,
   498  		ssa.OpAMD64CMOVQCS, ssa.OpAMD64CMOVLCS, ssa.OpAMD64CMOVWCS,
   499  		ssa.OpAMD64CMOVQGTF, ssa.OpAMD64CMOVLGTF, ssa.OpAMD64CMOVWGTF,
   500  		ssa.OpAMD64CMOVQGEF, ssa.OpAMD64CMOVLGEF, ssa.OpAMD64CMOVWGEF:
   501  		r := v.Reg()
   502  		if r != v.Args[0].Reg() {
   503  			v.Fatalf("input[0] and output not in same register %s", v.LongString())
   504  		}
   505  		p := s.Prog(v.Op.Asm())
   506  		p.From.Type = obj.TYPE_REG
   507  		p.From.Reg = v.Args[1].Reg()
   508  		p.To.Type = obj.TYPE_REG
   509  		p.To.Reg = r
   510  
   511  	case ssa.OpAMD64CMOVQNEF, ssa.OpAMD64CMOVLNEF, ssa.OpAMD64CMOVWNEF:
   512  		r := v.Reg()
   513  		if r != v.Args[0].Reg() {
   514  			v.Fatalf("input[0] and output not in same register %s", v.LongString())
   515  		}
   516  		// Flag condition: ^ZERO || PARITY
   517  		// Generate:
   518  		//   CMOV*NE  SRC,DST
   519  		//   CMOV*PS  SRC,DST
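        		// (UCOMIS* sets PARITY when the operands are unordered, i.e. a NaN is
        		// involved, and unordered operands compare as not-equal, so a second
        		// CMOV on the parity flag is required.)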
   520  		p := s.Prog(v.Op.Asm())
   521  		p.From.Type = obj.TYPE_REG
   522  		p.From.Reg = v.Args[1].Reg()
   523  		p.To.Type = obj.TYPE_REG
   524  		p.To.Reg = r
   525  		var q *obj.Prog
   526  		if v.Op == ssa.OpAMD64CMOVQNEF {
   527  			q = s.Prog(x86.ACMOVQPS)
   528  		} else if v.Op == ssa.OpAMD64CMOVLNEF {
   529  			q = s.Prog(x86.ACMOVLPS)
   530  		} else {
   531  			q = s.Prog(x86.ACMOVWPS)
   532  		}
   533  		q.From.Type = obj.TYPE_REG
   534  		q.From.Reg = v.Args[1].Reg()
   535  		q.To.Type = obj.TYPE_REG
   536  		q.To.Reg = r
   537  
   538  	case ssa.OpAMD64CMOVQEQF, ssa.OpAMD64CMOVLEQF, ssa.OpAMD64CMOVWEQF:
   539  		r := v.Reg()
   540  		if r != v.Args[0].Reg() {
   541  			v.Fatalf("input[0] and output not in same register %s", v.LongString())
   542  		}
   543  
   544  		// Flag condition: ZERO && !PARITY
   545  		// Generate:
   546  		//   MOV      SRC,AX
   547  		//   CMOV*NE  DST,AX
   548  		//   CMOV*PC  AX,DST
   549  		//
   550  		// TODO(rasky): we could generate:
   551  		//   CMOV*NE  DST,SRC
   552  		//   CMOV*PC  SRC,DST
   553  		// But this requires a way for regalloc to know that SRC might be
   554  		// clobbered by this instruction.
   555  		if v.Args[1].Reg() != x86.REG_AX {
   556  			opregreg(s, moveByType(v.Type), x86.REG_AX, v.Args[1].Reg())
   557  		}
   558  		p := s.Prog(v.Op.Asm())
   559  		p.From.Type = obj.TYPE_REG
   560  		p.From.Reg = r
   561  		p.To.Type = obj.TYPE_REG
   562  		p.To.Reg = x86.REG_AX
   563  		var q *obj.Prog
   564  		if v.Op == ssa.OpAMD64CMOVQEQF {
   565  			q = s.Prog(x86.ACMOVQPC)
   566  		} else if v.Op == ssa.OpAMD64CMOVLEQF {
   567  			q = s.Prog(x86.ACMOVLPC)
   568  		} else {
   569  			q = s.Prog(x86.ACMOVWPC)
   570  		}
   571  		q.From.Type = obj.TYPE_REG
   572  		q.From.Reg = x86.REG_AX
   573  		q.To.Type = obj.TYPE_REG
   574  		q.To.Reg = r
   575  
   576  	case ssa.OpAMD64MULQconst, ssa.OpAMD64MULLconst:
   577  		r := v.Reg()
   578  		p := s.Prog(v.Op.Asm())
   579  		p.From.Type = obj.TYPE_CONST
   580  		p.From.Offset = v.AuxInt
   581  		p.To.Type = obj.TYPE_REG
   582  		p.To.Reg = r
   583  		p.SetFrom3(obj.Addr{Type: obj.TYPE_REG, Reg: v.Args[0].Reg()})
   584  
   585  	case ssa.OpAMD64SUBQconst, ssa.OpAMD64SUBLconst,
   586  		ssa.OpAMD64ANDQconst, ssa.OpAMD64ANDLconst,
   587  		ssa.OpAMD64ORQconst, ssa.OpAMD64ORLconst,
   588  		ssa.OpAMD64XORQconst, ssa.OpAMD64XORLconst,
   589  		ssa.OpAMD64SHLQconst, ssa.OpAMD64SHLLconst,
   590  		ssa.OpAMD64SHRQconst, ssa.OpAMD64SHRLconst, ssa.OpAMD64SHRWconst, ssa.OpAMD64SHRBconst,
   591  		ssa.OpAMD64SARQconst, ssa.OpAMD64SARLconst, ssa.OpAMD64SARWconst, ssa.OpAMD64SARBconst,
   592  		ssa.OpAMD64ROLQconst, ssa.OpAMD64ROLLconst, ssa.OpAMD64ROLWconst, ssa.OpAMD64ROLBconst:
   593  		r := v.Reg()
   594  		if r != v.Args[0].Reg() {
   595  			v.Fatalf("input[0] and output not in same register %s", v.LongString())
   596  		}
   597  		p := s.Prog(v.Op.Asm())
   598  		p.From.Type = obj.TYPE_CONST
   599  		p.From.Offset = v.AuxInt
   600  		p.To.Type = obj.TYPE_REG
   601  		p.To.Reg = r
   602  	case ssa.OpAMD64SBBQcarrymask, ssa.OpAMD64SBBLcarrymask:
   603  		r := v.Reg()
   604  		p := s.Prog(v.Op.Asm())
   605  		p.From.Type = obj.TYPE_REG
   606  		p.From.Reg = r
   607  		p.To.Type = obj.TYPE_REG
   608  		p.To.Reg = r
   609  	case ssa.OpAMD64LEAQ1, ssa.OpAMD64LEAQ2, ssa.OpAMD64LEAQ4, ssa.OpAMD64LEAQ8,
   610  		ssa.OpAMD64LEAL1, ssa.OpAMD64LEAL2, ssa.OpAMD64LEAL4, ssa.OpAMD64LEAL8,
   611  		ssa.OpAMD64LEAW1, ssa.OpAMD64LEAW2, ssa.OpAMD64LEAW4, ssa.OpAMD64LEAW8:
   612  		p := s.Prog(v.Op.Asm())
   613  		memIdx(&p.From, v)
   614  		o := v.Reg()
   615  		p.To.Type = obj.TYPE_REG
   616  		p.To.Reg = o
   617  		if v.AuxInt != 0 && v.Aux == nil {
   618  			// Emit an additional LEA to add the displacement instead of creating a slow 3 operand LEA.
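        			// (An LEA that combines base, index, and a non-zero displacement has
        			// higher latency than a simple LEA on several x86 microarchitectures.)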
   619  			switch v.Op {
   620  			case ssa.OpAMD64LEAQ1, ssa.OpAMD64LEAQ2, ssa.OpAMD64LEAQ4, ssa.OpAMD64LEAQ8:
   621  				p = s.Prog(x86.ALEAQ)
   622  			case ssa.OpAMD64LEAL1, ssa.OpAMD64LEAL2, ssa.OpAMD64LEAL4, ssa.OpAMD64LEAL8:
   623  				p = s.Prog(x86.ALEAL)
   624  			case ssa.OpAMD64LEAW1, ssa.OpAMD64LEAW2, ssa.OpAMD64LEAW4, ssa.OpAMD64LEAW8:
   625  				p = s.Prog(x86.ALEAW)
   626  			}
   627  			p.From.Type = obj.TYPE_MEM
   628  			p.From.Reg = o
   629  			p.To.Type = obj.TYPE_REG
   630  			p.To.Reg = o
   631  		}
   632  		gc.AddAux(&p.From, v)
   633  	case ssa.OpAMD64LEAQ, ssa.OpAMD64LEAL, ssa.OpAMD64LEAW:
   634  		p := s.Prog(v.Op.Asm())
   635  		p.From.Type = obj.TYPE_MEM
   636  		p.From.Reg = v.Args[0].Reg()
   637  		gc.AddAux(&p.From, v)
   638  		p.To.Type = obj.TYPE_REG
   639  		p.To.Reg = v.Reg()
   640  	case ssa.OpAMD64CMPQ, ssa.OpAMD64CMPL, ssa.OpAMD64CMPW, ssa.OpAMD64CMPB,
   641  		ssa.OpAMD64TESTQ, ssa.OpAMD64TESTL, ssa.OpAMD64TESTW, ssa.OpAMD64TESTB,
   642  		ssa.OpAMD64BTL, ssa.OpAMD64BTQ:
   643  		opregreg(s, v.Op.Asm(), v.Args[1].Reg(), v.Args[0].Reg())
   644  	case ssa.OpAMD64UCOMISS, ssa.OpAMD64UCOMISD:
   645  		// Go assembler has swapped operands for UCOMISx relative to CMP,
   646  		// must account for that right here.
   647  		opregreg(s, v.Op.Asm(), v.Args[0].Reg(), v.Args[1].Reg())
   648  	case ssa.OpAMD64CMPQconst, ssa.OpAMD64CMPLconst, ssa.OpAMD64CMPWconst, ssa.OpAMD64CMPBconst:
   649  		p := s.Prog(v.Op.Asm())
   650  		p.From.Type = obj.TYPE_REG
   651  		p.From.Reg = v.Args[0].Reg()
   652  		p.To.Type = obj.TYPE_CONST
   653  		p.To.Offset = v.AuxInt
   654  	case ssa.OpAMD64BTLconst, ssa.OpAMD64BTQconst,
   655  		ssa.OpAMD64TESTQconst, ssa.OpAMD64TESTLconst, ssa.OpAMD64TESTWconst, ssa.OpAMD64TESTBconst,
   656  		ssa.OpAMD64BTSLconst, ssa.OpAMD64BTSQconst,
   657  		ssa.OpAMD64BTCLconst, ssa.OpAMD64BTCQconst,
   658  		ssa.OpAMD64BTRLconst, ssa.OpAMD64BTRQconst:
   659  		op := v.Op
   660  		if op == ssa.OpAMD64BTQconst && v.AuxInt < 32 {
   661  			// Emit 32-bit version because it's shorter
   662  			op = ssa.OpAMD64BTLconst
   663  		}
   664  		p := s.Prog(op.Asm())
   665  		p.From.Type = obj.TYPE_CONST
   666  		p.From.Offset = v.AuxInt
   667  		p.To.Type = obj.TYPE_REG
   668  		p.To.Reg = v.Args[0].Reg()
   669  	case ssa.OpAMD64CMPQload, ssa.OpAMD64CMPLload, ssa.OpAMD64CMPWload, ssa.OpAMD64CMPBload:
   670  		p := s.Prog(v.Op.Asm())
   671  		p.From.Type = obj.TYPE_MEM
   672  		p.From.Reg = v.Args[0].Reg()
   673  		gc.AddAux(&p.From, v)
   674  		p.To.Type = obj.TYPE_REG
   675  		p.To.Reg = v.Args[1].Reg()
   676  	case ssa.OpAMD64CMPQconstload, ssa.OpAMD64CMPLconstload, ssa.OpAMD64CMPWconstload, ssa.OpAMD64CMPBconstload:
   677  		sc := v.AuxValAndOff()
   678  		p := s.Prog(v.Op.Asm())
   679  		p.From.Type = obj.TYPE_MEM
   680  		p.From.Reg = v.Args[0].Reg()
   681  		gc.AddAux2(&p.From, v, sc.Off())
   682  		p.To.Type = obj.TYPE_CONST
   683  		p.To.Offset = sc.Val()
   684  	case ssa.OpAMD64MOVLconst, ssa.OpAMD64MOVQconst:
   685  		x := v.Reg()
   686  
   687  		// If flags aren't live (indicated by v.Aux == nil),
   688  		// then we can rewrite MOV $0, AX into XOR AX, AX.
   689  		if v.AuxInt == 0 && v.Aux == nil {
   690  			p := s.Prog(x86.AXORL)
   691  			p.From.Type = obj.TYPE_REG
   692  			p.From.Reg = x
   693  			p.To.Type = obj.TYPE_REG
   694  			p.To.Reg = x
   695  			break
   696  		}
   697  
   698  		asm := v.Op.Asm()
   699  		// Use MOVL to move a small constant into a register
   700  		// when the constant is positive and fits into 32 bits.
   701  		if 0 <= v.AuxInt && v.AuxInt <= (1<<32-1) {
   702  			// The upper 32 bits are zeroed automatically when using MOVL.
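        			// MOVL also gives a shorter encoding: it avoids the REX.W prefix, and
        			// for constants with bit 31 set it avoids MOVQ's full 64-bit immediate.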
   703  			asm = x86.AMOVL
   704  		}
   705  		p := s.Prog(asm)
   706  		p.From.Type = obj.TYPE_CONST
   707  		p.From.Offset = v.AuxInt
   708  		p.To.Type = obj.TYPE_REG
   709  		p.To.Reg = x
   710  	case ssa.OpAMD64MOVSSconst, ssa.OpAMD64MOVSDconst:
   711  		x := v.Reg()
   712  		p := s.Prog(v.Op.Asm())
   713  		p.From.Type = obj.TYPE_FCONST
   714  		p.From.Val = math.Float64frombits(uint64(v.AuxInt))
   715  		p.To.Type = obj.TYPE_REG
   716  		p.To.Reg = x
   717  	case ssa.OpAMD64MOVQload, ssa.OpAMD64MOVSSload, ssa.OpAMD64MOVSDload, ssa.OpAMD64MOVLload, ssa.OpAMD64MOVWload, ssa.OpAMD64MOVBload, ssa.OpAMD64MOVBQSXload, ssa.OpAMD64MOVWQSXload, ssa.OpAMD64MOVLQSXload, ssa.OpAMD64MOVOload:
   718  		p := s.Prog(v.Op.Asm())
   719  		p.From.Type = obj.TYPE_MEM
   720  		p.From.Reg = v.Args[0].Reg()
   721  		gc.AddAux(&p.From, v)
   722  		p.To.Type = obj.TYPE_REG
   723  		p.To.Reg = v.Reg()
   724  	case ssa.OpAMD64MOVBloadidx1, ssa.OpAMD64MOVWloadidx1, ssa.OpAMD64MOVLloadidx1, ssa.OpAMD64MOVQloadidx1, ssa.OpAMD64MOVSSloadidx1, ssa.OpAMD64MOVSDloadidx1,
   725  		ssa.OpAMD64MOVQloadidx8, ssa.OpAMD64MOVSDloadidx8, ssa.OpAMD64MOVLloadidx8, ssa.OpAMD64MOVLloadidx4, ssa.OpAMD64MOVSSloadidx4, ssa.OpAMD64MOVWloadidx2:
   726  		p := s.Prog(v.Op.Asm())
   727  		memIdx(&p.From, v)
   728  		gc.AddAux(&p.From, v)
   729  		p.To.Type = obj.TYPE_REG
   730  		p.To.Reg = v.Reg()
   731  	case ssa.OpAMD64MOVQstore, ssa.OpAMD64MOVSSstore, ssa.OpAMD64MOVSDstore, ssa.OpAMD64MOVLstore, ssa.OpAMD64MOVWstore, ssa.OpAMD64MOVBstore, ssa.OpAMD64MOVOstore,
   732  		ssa.OpAMD64BTCQmodify, ssa.OpAMD64BTCLmodify, ssa.OpAMD64BTRQmodify, ssa.OpAMD64BTRLmodify, ssa.OpAMD64BTSQmodify, ssa.OpAMD64BTSLmodify,
   733  		ssa.OpAMD64ADDQmodify, ssa.OpAMD64SUBQmodify, ssa.OpAMD64ANDQmodify, ssa.OpAMD64ORQmodify, ssa.OpAMD64XORQmodify,
   734  		ssa.OpAMD64ADDLmodify, ssa.OpAMD64SUBLmodify, ssa.OpAMD64ANDLmodify, ssa.OpAMD64ORLmodify, ssa.OpAMD64XORLmodify:
   735  		p := s.Prog(v.Op.Asm())
   736  		p.From.Type = obj.TYPE_REG
   737  		p.From.Reg = v.Args[1].Reg()
   738  		p.To.Type = obj.TYPE_MEM
   739  		p.To.Reg = v.Args[0].Reg()
   740  		gc.AddAux(&p.To, v)
   741  	case ssa.OpAMD64MOVBstoreidx1, ssa.OpAMD64MOVWstoreidx1, ssa.OpAMD64MOVLstoreidx1, ssa.OpAMD64MOVQstoreidx1, ssa.OpAMD64MOVSSstoreidx1, ssa.OpAMD64MOVSDstoreidx1,
   742  		ssa.OpAMD64MOVQstoreidx8, ssa.OpAMD64MOVSDstoreidx8, ssa.OpAMD64MOVLstoreidx8, ssa.OpAMD64MOVSSstoreidx4, ssa.OpAMD64MOVLstoreidx4, ssa.OpAMD64MOVWstoreidx2:
   743  		p := s.Prog(v.Op.Asm())
   744  		p.From.Type = obj.TYPE_REG
   745  		p.From.Reg = v.Args[2].Reg()
   746  		memIdx(&p.To, v)
   747  		gc.AddAux(&p.To, v)
   748  	case ssa.OpAMD64ADDQconstmodify, ssa.OpAMD64ADDLconstmodify:
   749  		sc := v.AuxValAndOff()
   750  		off := sc.Off()
   751  		val := sc.Val()
   752  		if val == 1 || val == -1 {
   753  			var asm obj.As
   754  			if v.Op == ssa.OpAMD64ADDQconstmodify {
   755  				if val == 1 {
   756  					asm = x86.AINCQ
   757  				} else {
   758  					asm = x86.ADECQ
   759  				}
   760  			} else {
   761  				if val == 1 {
   762  					asm = x86.AINCL
   763  				} else {
   764  					asm = x86.ADECL
   765  				}
   766  			}
   767  			p := s.Prog(asm)
   768  			p.To.Type = obj.TYPE_MEM
   769  			p.To.Reg = v.Args[0].Reg()
   770  			gc.AddAux2(&p.To, v, off)
   771  			break
   772  		}
   773  		fallthrough
   774  	case ssa.OpAMD64ANDQconstmodify, ssa.OpAMD64ANDLconstmodify, ssa.OpAMD64ORQconstmodify, ssa.OpAMD64ORLconstmodify,
   775  		ssa.OpAMD64BTCQconstmodify, ssa.OpAMD64BTCLconstmodify, ssa.OpAMD64BTSQconstmodify, ssa.OpAMD64BTSLconstmodify,
   776  		ssa.OpAMD64BTRQconstmodify, ssa.OpAMD64BTRLconstmodify, ssa.OpAMD64XORQconstmodify, ssa.OpAMD64XORLconstmodify:
   777  		sc := v.AuxValAndOff()
   778  		off := sc.Off()
   779  		val := sc.Val()
   780  		p := s.Prog(v.Op.Asm())
   781  		p.From.Type = obj.TYPE_CONST
   782  		p.From.Offset = val
   783  		p.To.Type = obj.TYPE_MEM
   784  		p.To.Reg = v.Args[0].Reg()
   785  		gc.AddAux2(&p.To, v, off)
   786  	case ssa.OpAMD64MOVQstoreconst, ssa.OpAMD64MOVLstoreconst, ssa.OpAMD64MOVWstoreconst, ssa.OpAMD64MOVBstoreconst:
   787  		p := s.Prog(v.Op.Asm())
   788  		p.From.Type = obj.TYPE_CONST
   789  		sc := v.AuxValAndOff()
   790  		p.From.Offset = sc.Val()
   791  		p.To.Type = obj.TYPE_MEM
   792  		p.To.Reg = v.Args[0].Reg()
   793  		gc.AddAux2(&p.To, v, sc.Off())
   794  	case ssa.OpAMD64MOVQstoreconstidx1, ssa.OpAMD64MOVQstoreconstidx8, ssa.OpAMD64MOVLstoreconstidx1, ssa.OpAMD64MOVLstoreconstidx4, ssa.OpAMD64MOVWstoreconstidx1, ssa.OpAMD64MOVWstoreconstidx2, ssa.OpAMD64MOVBstoreconstidx1:
   795  		p := s.Prog(v.Op.Asm())
   796  		p.From.Type = obj.TYPE_CONST
   797  		sc := v.AuxValAndOff()
   798  		p.From.Offset = sc.Val()
   799  		memIdx(&p.To, v)
   800  		gc.AddAux2(&p.To, v, sc.Off())
   801  	case ssa.OpAMD64MOVLQSX, ssa.OpAMD64MOVWQSX, ssa.OpAMD64MOVBQSX, ssa.OpAMD64MOVLQZX, ssa.OpAMD64MOVWQZX, ssa.OpAMD64MOVBQZX,
   802  		ssa.OpAMD64CVTTSS2SL, ssa.OpAMD64CVTTSD2SL, ssa.OpAMD64CVTTSS2SQ, ssa.OpAMD64CVTTSD2SQ,
   803  		ssa.OpAMD64CVTSS2SD, ssa.OpAMD64CVTSD2SS:
   804  		opregreg(s, v.Op.Asm(), v.Reg(), v.Args[0].Reg())
   805  	case ssa.OpAMD64CVTSL2SD, ssa.OpAMD64CVTSQ2SD, ssa.OpAMD64CVTSQ2SS, ssa.OpAMD64CVTSL2SS:
   806  		r := v.Reg()
   807  		// Break false dependency on destination register.
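        		// (CVTSI2Sx writes only the low lanes of the XMM destination, so without
        		// the XORPS the conversion would depend on the register's previous value.)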
   808  		opregreg(s, x86.AXORPS, r, r)
   809  		opregreg(s, v.Op.Asm(), r, v.Args[0].Reg())
   810  	case ssa.OpAMD64MOVQi2f, ssa.OpAMD64MOVQf2i, ssa.OpAMD64MOVLi2f, ssa.OpAMD64MOVLf2i:
   811  		var p *obj.Prog
   812  		switch v.Op {
   813  		case ssa.OpAMD64MOVQi2f, ssa.OpAMD64MOVQf2i:
   814  			p = s.Prog(x86.AMOVQ)
   815  		case ssa.OpAMD64MOVLi2f, ssa.OpAMD64MOVLf2i:
   816  			p = s.Prog(x86.AMOVL)
   817  		}
   818  		p.From.Type = obj.TYPE_REG
   819  		p.From.Reg = v.Args[0].Reg()
   820  		p.To.Type = obj.TYPE_REG
   821  		p.To.Reg = v.Reg()
   822  	case ssa.OpAMD64ADDQload, ssa.OpAMD64ADDLload, ssa.OpAMD64SUBQload, ssa.OpAMD64SUBLload,
   823  		ssa.OpAMD64ANDQload, ssa.OpAMD64ANDLload, ssa.OpAMD64ORQload, ssa.OpAMD64ORLload,
   824  		ssa.OpAMD64XORQload, ssa.OpAMD64XORLload, ssa.OpAMD64ADDSDload, ssa.OpAMD64ADDSSload,
   825  		ssa.OpAMD64SUBSDload, ssa.OpAMD64SUBSSload, ssa.OpAMD64MULSDload, ssa.OpAMD64MULSSload,
   826  		ssa.OpAMD64DIVSDload, ssa.OpAMD64DIVSSload:
   827  		p := s.Prog(v.Op.Asm())
   828  		p.From.Type = obj.TYPE_MEM
   829  		p.From.Reg = v.Args[1].Reg()
   830  		gc.AddAux(&p.From, v)
   831  		p.To.Type = obj.TYPE_REG
   832  		p.To.Reg = v.Reg()
   833  		if v.Reg() != v.Args[0].Reg() {
   834  			v.Fatalf("input[0] and output not in same register %s", v.LongString())
   835  		}
   836  	case ssa.OpAMD64DUFFZERO:
   837  		off := duffStart(v.AuxInt)
   838  		adj := duffAdj(v.AuxInt)
   839  		var p *obj.Prog
   840  		if adj != 0 {
   841  			p = s.Prog(x86.ALEAQ)
   842  			p.From.Type = obj.TYPE_MEM
   843  			p.From.Offset = adj
   844  			p.From.Reg = x86.REG_DI
   845  			p.To.Type = obj.TYPE_REG
   846  			p.To.Reg = x86.REG_DI
   847  		}
   848  		p = s.Prog(obj.ADUFFZERO)
   849  		p.To.Type = obj.TYPE_ADDR
   850  		p.To.Sym = gc.Duffzero
   851  		p.To.Offset = off
   852  	case ssa.OpAMD64MOVOconst:
   853  		if v.AuxInt != 0 {
   854  			v.Fatalf("MOVOconst can only do constant=0")
   855  		}
   856  		r := v.Reg()
   857  		opregreg(s, x86.AXORPS, r, r)
   858  	case ssa.OpAMD64DUFFCOPY:
   859  		p := s.Prog(obj.ADUFFCOPY)
   860  		p.To.Type = obj.TYPE_ADDR
   861  		p.To.Sym = gc.Duffcopy
   862  		p.To.Offset = v.AuxInt
   863  
   864  	case ssa.OpCopy: // TODO: use MOVQreg for reg->reg copies instead of OpCopy?
   865  		if v.Type.IsMemory() {
   866  			return
   867  		}
   868  		x := v.Args[0].Reg()
   869  		y := v.Reg()
   870  		if x != y {
   871  			opregreg(s, moveByType(v.Type), y, x)
   872  		}
   873  	case ssa.OpLoadReg:
   874  		if v.Type.IsFlags() {
   875  			v.Fatalf("load flags not implemented: %v", v.LongString())
   876  			return
   877  		}
   878  		p := s.Prog(loadByType(v.Type))
   879  		gc.AddrAuto(&p.From, v.Args[0])
   880  		p.To.Type = obj.TYPE_REG
   881  		p.To.Reg = v.Reg()
   882  
   883  	case ssa.OpStoreReg:
   884  		if v.Type.IsFlags() {
   885  			v.Fatalf("store flags not implemented: %v", v.LongString())
   886  			return
   887  		}
   888  		p := s.Prog(storeByType(v.Type))
   889  		p.From.Type = obj.TYPE_REG
   890  		p.From.Reg = v.Args[0].Reg()
   891  		gc.AddrAuto(&p.To, v)
   892  	case ssa.OpAMD64LoweredGetClosurePtr:
   893  		// Closure pointer is DX.
   894  		gc.CheckLoweredGetClosurePtr(v)
   895  	case ssa.OpAMD64LoweredGetG:
   896  		r := v.Reg()
   897  		// See the comments in cmd/internal/obj/x86/obj6.go
   898  		// near CanUse1InsnTLS for a detailed explanation of these instructions.
   899  		if x86.CanUse1InsnTLS(gc.Ctxt) {
   900  			// MOVQ (TLS), r
   901  			p := s.Prog(x86.AMOVQ)
   902  			p.From.Type = obj.TYPE_MEM
   903  			p.From.Reg = x86.REG_TLS
   904  			p.To.Type = obj.TYPE_REG
   905  			p.To.Reg = r
   906  		} else {
   907  			// MOVQ TLS, r
   908  			// MOVQ (r)(TLS*1), r
   909  			p := s.Prog(x86.AMOVQ)
   910  			p.From.Type = obj.TYPE_REG
   911  			p.From.Reg = x86.REG_TLS
   912  			p.To.Type = obj.TYPE_REG
   913  			p.To.Reg = r
   914  			q := s.Prog(x86.AMOVQ)
   915  			q.From.Type = obj.TYPE_MEM
   916  			q.From.Reg = r
   917  			q.From.Index = x86.REG_TLS
   918  			q.From.Scale = 1
   919  			q.To.Type = obj.TYPE_REG
   920  			q.To.Reg = r
   921  		}
   922  	case ssa.OpAMD64CALLstatic, ssa.OpAMD64CALLclosure, ssa.OpAMD64CALLinter:
   923  		s.Call(v)
   924  
   925  	case ssa.OpAMD64LoweredGetCallerPC:
   926  		p := s.Prog(x86.AMOVQ)
   927  		p.From.Type = obj.TYPE_MEM
   928  		p.From.Offset = -8 // PC is stored 8 bytes below first parameter.
   929  		p.From.Name = obj.NAME_PARAM
   930  		p.To.Type = obj.TYPE_REG
   931  		p.To.Reg = v.Reg()
   932  
   933  	case ssa.OpAMD64LoweredGetCallerSP:
   934  		// caller's SP is the address of the first arg
   935  		mov := x86.AMOVQ
   936  		if gc.Widthptr == 4 {
   937  			mov = x86.AMOVL
   938  		}
   939  		p := s.Prog(mov)
   940  		p.From.Type = obj.TYPE_ADDR
   941  		p.From.Offset = -gc.Ctxt.FixedFrameSize() // 0 on amd64, just to be consistent with other architectures
   942  		p.From.Name = obj.NAME_PARAM
   943  		p.To.Type = obj.TYPE_REG
   944  		p.To.Reg = v.Reg()
   945  
   946  	case ssa.OpAMD64LoweredWB:
   947  		p := s.Prog(obj.ACALL)
   948  		p.To.Type = obj.TYPE_MEM
   949  		p.To.Name = obj.NAME_EXTERN
   950  		p.To.Sym = v.Aux.(*obj.LSym)
   951  
   952  	case ssa.OpAMD64LoweredPanicBoundsA, ssa.OpAMD64LoweredPanicBoundsB, ssa.OpAMD64LoweredPanicBoundsC:
   953  		p := s.Prog(obj.ACALL)
   954  		p.To.Type = obj.TYPE_MEM
   955  		p.To.Name = obj.NAME_EXTERN
   956  		p.To.Sym = gc.BoundsCheckFunc[v.AuxInt]
   957  		s.UseArgs(int64(2 * gc.Widthptr)) // space used in callee args area by assembly stubs
   958  
   959  	case ssa.OpAMD64NEGQ, ssa.OpAMD64NEGL,
   960  		ssa.OpAMD64BSWAPQ, ssa.OpAMD64BSWAPL,
   961  		ssa.OpAMD64NOTQ, ssa.OpAMD64NOTL:
   962  		r := v.Reg()
   963  		if r != v.Args[0].Reg() {
   964  			v.Fatalf("input[0] and output not in same register %s", v.LongString())
   965  		}
   966  		p := s.Prog(v.Op.Asm())
   967  		p.To.Type = obj.TYPE_REG
   968  		p.To.Reg = r
   969  
   970  	case ssa.OpAMD64NEGLflags:
   971  		r := v.Reg0()
   972  		if r != v.Args[0].Reg() {
   973  			v.Fatalf("input[0] and output not in same register %s", v.LongString())
   974  		}
   975  		p := s.Prog(v.Op.Asm())
   976  		p.To.Type = obj.TYPE_REG
   977  		p.To.Reg = r
   978  
   979  	case ssa.OpAMD64BSFQ, ssa.OpAMD64BSRQ, ssa.OpAMD64BSFL, ssa.OpAMD64BSRL, ssa.OpAMD64SQRTSD:
   980  		p := s.Prog(v.Op.Asm())
   981  		p.From.Type = obj.TYPE_REG
   982  		p.From.Reg = v.Args[0].Reg()
   983  		p.To.Type = obj.TYPE_REG
   984  		switch v.Op {
   985  		case ssa.OpAMD64BSFQ, ssa.OpAMD64BSRQ:
   986  			p.To.Reg = v.Reg0()
   987  		case ssa.OpAMD64BSFL, ssa.OpAMD64BSRL, ssa.OpAMD64SQRTSD:
   988  			p.To.Reg = v.Reg()
   989  		}
   990  	case ssa.OpAMD64ROUNDSD:
   991  		p := s.Prog(v.Op.Asm())
   992  		val := v.AuxInt
   993  		// 0 means math.RoundToEven, 1 Floor, 2 Ceil, 3 Trunc
   994  		if val != 0 && val != 1 && val != 2 && val != 3 {
   995  			v.Fatalf("Invalid rounding mode")
   996  		}
   997  		p.From.Offset = val
   998  		p.From.Type = obj.TYPE_CONST
   999  		p.SetFrom3(obj.Addr{Type: obj.TYPE_REG, Reg: v.Args[0].Reg()})
  1000  		p.To.Type = obj.TYPE_REG
  1001  		p.To.Reg = v.Reg()
  1002  	case ssa.OpAMD64POPCNTQ, ssa.OpAMD64POPCNTL:
  1003  		if v.Args[0].Reg() != v.Reg() {
  1004  			// POPCNT on Intel has a false dependency on the destination register.
  1005  			// Xor register with itself to break the dependency.
  1006  			p := s.Prog(x86.AXORQ)
  1007  			p.From.Type = obj.TYPE_REG
  1008  			p.From.Reg = v.Reg()
  1009  			p.To.Type = obj.TYPE_REG
  1010  			p.To.Reg = v.Reg()
  1011  		}
  1012  		p := s.Prog(v.Op.Asm())
  1013  		p.From.Type = obj.TYPE_REG
  1014  		p.From.Reg = v.Args[0].Reg()
  1015  		p.To.Type = obj.TYPE_REG
  1016  		p.To.Reg = v.Reg()
  1017  
  1018  	case ssa.OpAMD64SETEQ, ssa.OpAMD64SETNE,
  1019  		ssa.OpAMD64SETL, ssa.OpAMD64SETLE,
  1020  		ssa.OpAMD64SETG, ssa.OpAMD64SETGE,
  1021  		ssa.OpAMD64SETGF, ssa.OpAMD64SETGEF,
  1022  		ssa.OpAMD64SETB, ssa.OpAMD64SETBE,
  1023  		ssa.OpAMD64SETORD, ssa.OpAMD64SETNAN,
  1024  		ssa.OpAMD64SETA, ssa.OpAMD64SETAE,
  1025  		ssa.OpAMD64SETO:
  1026  		p := s.Prog(v.Op.Asm())
  1027  		p.To.Type = obj.TYPE_REG
  1028  		p.To.Reg = v.Reg()
  1029  
  1030  	case ssa.OpAMD64SETEQstore, ssa.OpAMD64SETNEstore,
  1031  		ssa.OpAMD64SETLstore, ssa.OpAMD64SETLEstore,
  1032  		ssa.OpAMD64SETGstore, ssa.OpAMD64SETGEstore,
  1033  		ssa.OpAMD64SETBstore, ssa.OpAMD64SETBEstore,
  1034  		ssa.OpAMD64SETAstore, ssa.OpAMD64SETAEstore:
  1035  		p := s.Prog(v.Op.Asm())
  1036  		p.To.Type = obj.TYPE_MEM
  1037  		p.To.Reg = v.Args[0].Reg()
  1038  		gc.AddAux(&p.To, v)
  1039  
  1040  	case ssa.OpAMD64SETNEF:
  1041  		p := s.Prog(v.Op.Asm())
  1042  		p.To.Type = obj.TYPE_REG
  1043  		p.To.Reg = v.Reg()
  1044  		q := s.Prog(x86.ASETPS)
  1045  		q.To.Type = obj.TYPE_REG
  1046  		q.To.Reg = x86.REG_AX
  1047  		// ORL avoids partial register write and is smaller than ORQ, used by old compiler
  1048  		opregreg(s, x86.AORL, v.Reg(), x86.REG_AX)
  1049  
  1050  	case ssa.OpAMD64SETEQF:
  1051  		p := s.Prog(v.Op.Asm())
  1052  		p.To.Type = obj.TYPE_REG
  1053  		p.To.Reg = v.Reg()
  1054  		q := s.Prog(x86.ASETPC)
  1055  		q.To.Type = obj.TYPE_REG
  1056  		q.To.Reg = x86.REG_AX
  1057  		// ANDL avoids partial register write and is smaller than ANDQ, used by old compiler
  1058  		opregreg(s, x86.AANDL, v.Reg(), x86.REG_AX)
  1059  
  1060  	case ssa.OpAMD64InvertFlags:
  1061  		v.Fatalf("InvertFlags should never make it to codegen %v", v.LongString())
  1062  	case ssa.OpAMD64FlagEQ, ssa.OpAMD64FlagLT_ULT, ssa.OpAMD64FlagLT_UGT, ssa.OpAMD64FlagGT_ULT, ssa.OpAMD64FlagGT_UGT:
  1063  		v.Fatalf("Flag* ops should never make it to codegen %v", v.LongString())
  1064  	case ssa.OpAMD64AddTupleFirst32, ssa.OpAMD64AddTupleFirst64:
  1065  		v.Fatalf("AddTupleFirst* should never make it to codegen %v", v.LongString())
  1066  	case ssa.OpAMD64REPSTOSQ:
  1067  		s.Prog(x86.AREP)
  1068  		s.Prog(x86.ASTOSQ)
  1069  	case ssa.OpAMD64REPMOVSQ:
  1070  		s.Prog(x86.AREP)
  1071  		s.Prog(x86.AMOVSQ)
  1072  	case ssa.OpAMD64LoweredNilCheck:
  1073  		// Issue a load which will fault if the input is nil.
  1074  		// TODO: We currently use the 2-byte instruction TESTB AX, (reg).
  1075  		// Should we use the 3-byte TESTB $0, (reg) instead? It is larger
  1076  		// but it doesn't have false dependency on AX.
  1077  		// Or maybe allocate an output register and use MOVL (reg),reg2 ?
  1078  		// That trades clobbering flags for clobbering a register.
  1079  		p := s.Prog(x86.ATESTB)
  1080  		p.From.Type = obj.TYPE_REG
  1081  		p.From.Reg = x86.REG_AX
  1082  		p.To.Type = obj.TYPE_MEM
  1083  		p.To.Reg = v.Args[0].Reg()
  1084  		gc.AddAux(&p.To, v)
  1085  		if logopt.Enabled() {
  1086  			logopt.LogOpt(v.Pos, "nilcheck", "genssa", v.Block.Func.Name)
  1087  		}
  1088  		if gc.Debug_checknil != 0 && v.Pos.Line() > 1 { // v.Pos.Line()==1 in generated wrappers
  1089  			gc.Warnl(v.Pos, "generated nil check")
  1090  		}
  1091  	case ssa.OpAMD64MOVBatomicload, ssa.OpAMD64MOVLatomicload, ssa.OpAMD64MOVQatomicload:
  1092  		p := s.Prog(v.Op.Asm())
  1093  		p.From.Type = obj.TYPE_MEM
  1094  		p.From.Reg = v.Args[0].Reg()
  1095  		gc.AddAux(&p.From, v)
  1096  		p.To.Type = obj.TYPE_REG
  1097  		p.To.Reg = v.Reg0()
  1098  	case ssa.OpAMD64XCHGB, ssa.OpAMD64XCHGL, ssa.OpAMD64XCHGQ:
  1099  		r := v.Reg0()
  1100  		if r != v.Args[0].Reg() {
  1101  			v.Fatalf("input[0] and output[0] not in same register %s", v.LongString())
  1102  		}
  1103  		p := s.Prog(v.Op.Asm())
  1104  		p.From.Type = obj.TYPE_REG
  1105  		p.From.Reg = r
  1106  		p.To.Type = obj.TYPE_MEM
  1107  		p.To.Reg = v.Args[1].Reg()
  1108  		gc.AddAux(&p.To, v)
  1109  	case ssa.OpAMD64XADDLlock, ssa.OpAMD64XADDQlock:
  1110  		r := v.Reg0()
  1111  		if r != v.Args[0].Reg() {
  1112  			v.Fatalf("input[0] and output[0] not in same register %s", v.LongString())
  1113  		}
  1114  		s.Prog(x86.ALOCK)
  1115  		p := s.Prog(v.Op.Asm())
  1116  		p.From.Type = obj.TYPE_REG
  1117  		p.From.Reg = r
  1118  		p.To.Type = obj.TYPE_MEM
  1119  		p.To.Reg = v.Args[1].Reg()
  1120  		gc.AddAux(&p.To, v)
  1121  	case ssa.OpAMD64CMPXCHGLlock, ssa.OpAMD64CMPXCHGQlock:
  1122  		if v.Args[1].Reg() != x86.REG_AX {
  1123  			v.Fatalf("input[1] not in AX %s", v.LongString())
  1124  		}
  1125  		s.Prog(x86.ALOCK)
  1126  		p := s.Prog(v.Op.Asm())
  1127  		p.From.Type = obj.TYPE_REG
  1128  		p.From.Reg = v.Args[2].Reg()
  1129  		p.To.Type = obj.TYPE_MEM
  1130  		p.To.Reg = v.Args[0].Reg()
  1131  		gc.AddAux(&p.To, v)
  1132  		p = s.Prog(x86.ASETEQ)
  1133  		p.To.Type = obj.TYPE_REG
  1134  		p.To.Reg = v.Reg0()
  1135  	case ssa.OpAMD64ANDBlock, ssa.OpAMD64ORBlock:
  1136  		s.Prog(x86.ALOCK)
  1137  		p := s.Prog(v.Op.Asm())
  1138  		p.From.Type = obj.TYPE_REG
  1139  		p.From.Reg = v.Args[1].Reg()
  1140  		p.To.Type = obj.TYPE_MEM
  1141  		p.To.Reg = v.Args[0].Reg()
  1142  		gc.AddAux(&p.To, v)
  1143  	case ssa.OpClobber:
  1144  		p := s.Prog(x86.AMOVL)
  1145  		p.From.Type = obj.TYPE_CONST
  1146  		p.From.Offset = 0xdeaddead
  1147  		p.To.Type = obj.TYPE_MEM
  1148  		p.To.Reg = x86.REG_SP
  1149  		gc.AddAux(&p.To, v)
  1150  		p = s.Prog(x86.AMOVL)
  1151  		p.From.Type = obj.TYPE_CONST
  1152  		p.From.Offset = 0xdeaddead
  1153  		p.To.Type = obj.TYPE_MEM
  1154  		p.To.Reg = x86.REG_SP
  1155  		gc.AddAux(&p.To, v)
  1156  		p.To.Offset += 4
  1157  	default:
  1158  		v.Fatalf("genValue not implemented: %s", v.LongString())
  1159  	}
  1160  }
  1161  
  1162  var blockJump = [...]struct {
  1163  	asm, invasm obj.As
  1164  }{
  1165  	ssa.BlockAMD64EQ:  {x86.AJEQ, x86.AJNE},
  1166  	ssa.BlockAMD64NE:  {x86.AJNE, x86.AJEQ},
  1167  	ssa.BlockAMD64LT:  {x86.AJLT, x86.AJGE},
  1168  	ssa.BlockAMD64GE:  {x86.AJGE, x86.AJLT},
  1169  	ssa.BlockAMD64LE:  {x86.AJLE, x86.AJGT},
  1170  	ssa.BlockAMD64GT:  {x86.AJGT, x86.AJLE},
  1171  	ssa.BlockAMD64OS:  {x86.AJOS, x86.AJOC},
  1172  	ssa.BlockAMD64OC:  {x86.AJOC, x86.AJOS},
  1173  	ssa.BlockAMD64ULT: {x86.AJCS, x86.AJCC},
  1174  	ssa.BlockAMD64UGE: {x86.AJCC, x86.AJCS},
  1175  	ssa.BlockAMD64UGT: {x86.AJHI, x86.AJLS},
  1176  	ssa.BlockAMD64ULE: {x86.AJLS, x86.AJHI},
  1177  	ssa.BlockAMD64ORD: {x86.AJPC, x86.AJPS},
  1178  	ssa.BlockAMD64NAN: {x86.AJPS, x86.AJPC},
  1179  }
  1180  
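         // eqfJumps and nefJumps describe the branch pairs used for floating-point
         // equality blocks: x == y requires ZERO set and PARITY clear, while x != y
         // is ZERO clear or PARITY set (unordered compares set PARITY), so a block
         // may need two conditional jumps depending on which successor follows.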
  1181  var eqfJumps = [2][2]gc.FloatingEQNEJump{
  1182  	{{Jump: x86.AJNE, Index: 1}, {Jump: x86.AJPS, Index: 1}}, // next == b.Succs[0]
  1183  	{{Jump: x86.AJNE, Index: 1}, {Jump: x86.AJPC, Index: 0}}, // next == b.Succs[1]
  1184  }
  1185  var nefJumps = [2][2]gc.FloatingEQNEJump{
  1186  	{{Jump: x86.AJNE, Index: 0}, {Jump: x86.AJPC, Index: 1}}, // next == b.Succs[0]
  1187  	{{Jump: x86.AJNE, Index: 0}, {Jump: x86.AJPS, Index: 0}}, // next == b.Succs[1]
  1188  }
  1189  
  1190  func ssaGenBlock(s *gc.SSAGenState, b, next *ssa.Block) {
  1191  	switch b.Kind {
  1192  	case ssa.BlockPlain:
  1193  		if b.Succs[0].Block() != next {
  1194  			p := s.Prog(obj.AJMP)
  1195  			p.To.Type = obj.TYPE_BRANCH
  1196  			s.Branches = append(s.Branches, gc.Branch{P: p, B: b.Succs[0].Block()})
  1197  		}
  1198  	case ssa.BlockDefer:
  1199  		// defer returns in rax:
  1200  		// 0 if we should continue executing
  1201  		// 1 if we should jump to deferreturn call
  1202  		p := s.Prog(x86.ATESTL)
  1203  		p.From.Type = obj.TYPE_REG
  1204  		p.From.Reg = x86.REG_AX
  1205  		p.To.Type = obj.TYPE_REG
  1206  		p.To.Reg = x86.REG_AX
  1207  		p = s.Prog(x86.AJNE)
  1208  		p.To.Type = obj.TYPE_BRANCH
  1209  		s.Branches = append(s.Branches, gc.Branch{P: p, B: b.Succs[1].Block()})
  1210  		if b.Succs[0].Block() != next {
  1211  			p := s.Prog(obj.AJMP)
  1212  			p.To.Type = obj.TYPE_BRANCH
  1213  			s.Branches = append(s.Branches, gc.Branch{P: p, B: b.Succs[0].Block()})
  1214  		}
  1215  	case ssa.BlockExit:
  1216  	case ssa.BlockRet:
  1217  		s.Prog(obj.ARET)
  1218  	case ssa.BlockRetJmp:
  1219  		p := s.Prog(obj.ARET)
  1220  		p.To.Type = obj.TYPE_MEM
  1221  		p.To.Name = obj.NAME_EXTERN
  1222  		p.To.Sym = b.Aux.(*obj.LSym)
  1223  
  1224  	case ssa.BlockAMD64EQF:
  1225  		s.FPJump(b, next, &eqfJumps)
  1226  
  1227  	case ssa.BlockAMD64NEF:
  1228  		s.FPJump(b, next, &nefJumps)
  1229  
  1230  	case ssa.BlockAMD64EQ, ssa.BlockAMD64NE,
  1231  		ssa.BlockAMD64LT, ssa.BlockAMD64GE,
  1232  		ssa.BlockAMD64LE, ssa.BlockAMD64GT,
  1233  		ssa.BlockAMD64OS, ssa.BlockAMD64OC,
  1234  		ssa.BlockAMD64ULT, ssa.BlockAMD64UGT,
  1235  		ssa.BlockAMD64ULE, ssa.BlockAMD64UGE:
  1236  		jmp := blockJump[b.Kind]
  1237  		switch next {
  1238  		case b.Succs[0].Block():
  1239  			s.Br(jmp.invasm, b.Succs[1].Block())
  1240  		case b.Succs[1].Block():
  1241  			s.Br(jmp.asm, b.Succs[0].Block())
  1242  		default:
  1243  			if b.Likely != ssa.BranchUnlikely {
  1244  				s.Br(jmp.asm, b.Succs[0].Block())
  1245  				s.Br(obj.AJMP, b.Succs[1].Block())
  1246  			} else {
  1247  				s.Br(jmp.invasm, b.Succs[1].Block())
  1248  				s.Br(obj.AJMP, b.Succs[0].Block())
  1249  			}
  1250  		}
  1251  
  1252  	default:
  1253  		b.Fatalf("branch not implemented: %s", b.LongString())
  1254  	}
  1255  }