github.com/bir3/gocompiler@v0.9.2202/src/cmd/compile/internal/amd64/ssa.go

     1  // Copyright 2016 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package amd64
     6  
     7  import (
     8  	"fmt"
     9  	"github.com/bir3/gocompiler/src/internal/buildcfg"
    10  	"math"
    11  
    12  	"github.com/bir3/gocompiler/src/cmd/compile/internal/base"
    13  	"github.com/bir3/gocompiler/src/cmd/compile/internal/ir"
    14  	"github.com/bir3/gocompiler/src/cmd/compile/internal/logopt"
    15  	"github.com/bir3/gocompiler/src/cmd/compile/internal/objw"
    16  	"github.com/bir3/gocompiler/src/cmd/compile/internal/ssa"
    17  	"github.com/bir3/gocompiler/src/cmd/compile/internal/ssagen"
    18  	"github.com/bir3/gocompiler/src/cmd/compile/internal/types"
    19  	"github.com/bir3/gocompiler/src/cmd/internal/obj"
    20  	"github.com/bir3/gocompiler/src/cmd/internal/obj/x86"
    21  )
    22  
    23  // ssaMarkMoves marks any MOVXconst ops that need to avoid clobbering flags.
    24  func ssaMarkMoves(s *ssagen.State, b *ssa.Block) {
    25  	flive := b.FlagsLiveAtEnd
    26  	for _, c := range b.ControlValues() {
    27  		flive = c.Type.IsFlags() || flive
    28  	}
    29  	for i := len(b.Values) - 1; i >= 0; i-- {
    30  		v := b.Values[i]
    31  		if flive && (v.Op == ssa.OpAMD64MOVLconst || v.Op == ssa.OpAMD64MOVQconst) {
    32  			// The "mark" is any non-nil Aux value.
    33  			v.Aux = ssa.AuxMark
    34  		}
    35  		if v.Type.IsFlags() {
    36  			flive = false
    37  		}
    38  		for _, a := range v.Args {
    39  			if a.Type.IsFlags() {
    40  				flive = true
    41  			}
    42  		}
    43  	}
    44  }
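
        // The mark is consumed when lowering MOVLconst/MOVQconst in ssaGenValue
        // below: a zero constant is normally emitted as XORL reg, reg, which
        // clobbers flags, so a marked value (non-nil Aux) must use a real MOV.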
    45  
    46  // loadByType returns the load instruction of the given type.
    47  func loadByType(t *types.Type) obj.As {
    48  	// Avoid partial register write
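        	// MOVBLZX/MOVWLZX zero-extend into the full 32-bit register,
        	// so the load does not merge with the register's previous contents.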
    49  	if !t.IsFloat() {
    50  		switch t.Size() {
    51  		case 1:
    52  			return x86.AMOVBLZX
    53  		case 2:
    54  			return x86.AMOVWLZX
    55  		}
    56  	}
    57  	// Otherwise, there's no difference between load and store opcodes.
    58  	return storeByType(t)
    59  }
    60  
    61  // storeByType returns the store instruction of the given type.
    62  func storeByType(t *types.Type) obj.As {
    63  	width := t.Size()
    64  	if t.IsFloat() {
    65  		switch width {
    66  		case 4:
    67  			return x86.AMOVSS
    68  		case 8:
    69  			return x86.AMOVSD
    70  		}
    71  	} else {
    72  		switch width {
    73  		case 1:
    74  			return x86.AMOVB
    75  		case 2:
    76  			return x86.AMOVW
    77  		case 4:
    78  			return x86.AMOVL
    79  		case 8:
    80  			return x86.AMOVQ
    81  		case 16:
    82  			return x86.AMOVUPS
    83  		}
    84  	}
    85  	panic(fmt.Sprintf("bad store type %v", t))
    86  }
    87  
    88  // moveByType returns the reg->reg move instruction of the given type.
    89  func moveByType(t *types.Type) obj.As {
    90  	if t.IsFloat() {
    91  		// Moving the whole SSE2 register is faster
    92  		// than moving just the correct low portion of it.
    93  		// There is no xmm->xmm move with a 1-byte opcode,
    94  		// so use movups, which has a 2-byte opcode.
    95  		return x86.AMOVUPS
    96  	} else {
    97  		switch t.Size() {
    98  		case 1:
    99  			// Avoids partial register write
   100  			return x86.AMOVL
   101  		case 2:
   102  			return x86.AMOVL
   103  		case 4:
   104  			return x86.AMOVL
   105  		case 8:
   106  			return x86.AMOVQ
   107  		case 16:
   108  			return x86.AMOVUPS	// int128s are in SSE registers
   109  		default:
   110  			panic(fmt.Sprintf("bad int register width %d:%v", t.Size(), t))
   111  		}
   112  	}
   113  }
   114  
   115  // opregreg emits instructions for
   116  //
   117  //	dest := dest(To) op src(From)
   118  //
   119  // and also returns the created obj.Prog so it
   120  // may be further adjusted (offset, scale, etc).
   121  func opregreg(s *ssagen.State, op obj.As, dest, src int16) *obj.Prog {
   122  	p := s.Prog(op)
   123  	p.From.Type = obj.TYPE_REG
   124  	p.To.Type = obj.TYPE_REG
   125  	p.To.Reg = dest
   126  	p.From.Reg = src
   127  	return p
   128  }
   129  
   130  // memIdx fills out a as an indexed memory reference for v.
   131  // It assumes that the base register and the index register
   132  // are v.Args[0].Reg() and v.Args[1].Reg(), respectively.
   133  // The caller must still use gc.AddAux/gc.AddAux2 to handle v.Aux as necessary.
   134  func memIdx(a *obj.Addr, v *ssa.Value) {
   135  	r, i := v.Args[0].Reg(), v.Args[1].Reg()
   136  	a.Type = obj.TYPE_MEM
   137  	a.Scale = v.Op.Scale()
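        	// SP cannot be encoded as an index register; with scale 1 the base
        	// and index are interchangeable, so swap SP into the base slot.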
   138  	if a.Scale == 1 && i == x86.REG_SP {
   139  		r, i = i, r
   140  	}
   141  	a.Reg = r
   142  	a.Index = i
   143  }
   144  
   145  // DUFFZERO consists of repeated blocks of 4 MOVUPSs + LEAQ.
   146  // See runtime/mkduff.go.
   147  func duffStart(size int64) int64 {
   148  	x, _ := duff(size)
   149  	return x
   150  }
   151  func duffAdj(size int64) int64 {
   152  	_, x := duff(size)
   153  	return x
   154  }
   155  
   156  // duff returns the offset (from duffzero, in bytes) and pointer adjust (in bytes)
   157  // required to use the duffzero mechanism for a block of the given size.
   158  func duff(size int64) (int64, int64) {
   159  	if size < 32 || size > 1024 || size%dzClearStep != 0 {
   160  		panic("bad duffzero size")
   161  	}
   162  	steps := size / dzClearStep
   163  	blocks := steps / dzBlockLen
   164  	steps %= dzBlockLen
   165  	off := dzBlockSize * (dzBlocks - blocks)
   166  	var adj int64
   167  	if steps != 0 {
   168  		off -= dzLeaqSize
   169  		off -= dzMovSize * steps
   170  		adj -= dzClearStep * (dzBlockLen - steps)
   171  	}
   172  	return off, adj
   173  }
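
        // off selects an entry point that executes only the last `blocks` full
        // blocks of duffzero. A leftover partial block of `steps` clears is
        // handled by also entering the tail of the preceding block (its last
        // `steps` MOVUPS plus the trailing LEAQ); since those MOVUPS address the
        // high end of a 64-byte block, DI is pre-adjusted by
        // -dzClearStep*(dzBlockLen-steps) so that the first executed MOVUPS lands
        // at the intended start, and the LEAQ then leaves DI just past the
        // cleared region. This is the layout runtime/mkduff.go generates.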
   174  
   175  func getgFromTLS(s *ssagen.State, r int16) {
   176  	// See the comments in cmd/internal/obj/x86/obj6.go
   177  	// near CanUse1InsnTLS for a detailed explanation of these instructions.
   178  	if x86.CanUse1InsnTLS(base.Ctxt) {
   179  		// MOVQ (TLS), r
   180  		p := s.Prog(x86.AMOVQ)
   181  		p.From.Type = obj.TYPE_MEM
   182  		p.From.Reg = x86.REG_TLS
   183  		p.To.Type = obj.TYPE_REG
   184  		p.To.Reg = r
   185  	} else {
   186  		// MOVQ TLS, r
   187  		// MOVQ (r)(TLS*1), r
   188  		p := s.Prog(x86.AMOVQ)
   189  		p.From.Type = obj.TYPE_REG
   190  		p.From.Reg = x86.REG_TLS
   191  		p.To.Type = obj.TYPE_REG
   192  		p.To.Reg = r
   193  		q := s.Prog(x86.AMOVQ)
   194  		q.From.Type = obj.TYPE_MEM
   195  		q.From.Reg = r
   196  		q.From.Index = x86.REG_TLS
   197  		q.From.Scale = 1
   198  		q.To.Type = obj.TYPE_REG
   199  		q.To.Reg = r
   200  	}
   201  }
   202  
   203  func ssaGenValue(s *ssagen.State, v *ssa.Value) {
   204  	switch v.Op {
   205  	case ssa.OpAMD64VFMADD231SD:
   206  		p := s.Prog(v.Op.Asm())
   207  		p.From = obj.Addr{Type: obj.TYPE_REG, Reg: v.Args[2].Reg()}
   208  		p.To = obj.Addr{Type: obj.TYPE_REG, Reg: v.Reg()}
   209  		p.AddRestSourceReg(v.Args[1].Reg())
   210  	case ssa.OpAMD64ADDQ, ssa.OpAMD64ADDL:
   211  		r := v.Reg()
   212  		r1 := v.Args[0].Reg()
   213  		r2 := v.Args[1].Reg()
   214  		switch {
   215  		case r == r1:
   216  			p := s.Prog(v.Op.Asm())
   217  			p.From.Type = obj.TYPE_REG
   218  			p.From.Reg = r2
   219  			p.To.Type = obj.TYPE_REG
   220  			p.To.Reg = r
   221  		case r == r2:
   222  			p := s.Prog(v.Op.Asm())
   223  			p.From.Type = obj.TYPE_REG
   224  			p.From.Reg = r1
   225  			p.To.Type = obj.TYPE_REG
   226  			p.To.Reg = r
   227  		default:
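        			// ADD is a two-operand instruction, so when the destination
        			// differs from both inputs use LEA, which can write a third
        			// register: ADDQ x, y -> z becomes LEAQ (x)(y*1), z.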
   228  			var asm obj.As
   229  			if v.Op == ssa.OpAMD64ADDQ {
   230  				asm = x86.ALEAQ
   231  			} else {
   232  				asm = x86.ALEAL
   233  			}
   234  			p := s.Prog(asm)
   235  			p.From.Type = obj.TYPE_MEM
   236  			p.From.Reg = r1
   237  			p.From.Scale = 1
   238  			p.From.Index = r2
   239  			p.To.Type = obj.TYPE_REG
   240  			p.To.Reg = r
   241  		}
   242  	// 2-address opcode arithmetic
   243  	case ssa.OpAMD64SUBQ, ssa.OpAMD64SUBL,
   244  		ssa.OpAMD64MULQ, ssa.OpAMD64MULL,
   245  		ssa.OpAMD64ANDQ, ssa.OpAMD64ANDL,
   246  		ssa.OpAMD64ORQ, ssa.OpAMD64ORL,
   247  		ssa.OpAMD64XORQ, ssa.OpAMD64XORL,
   248  		ssa.OpAMD64SHLQ, ssa.OpAMD64SHLL,
   249  		ssa.OpAMD64SHRQ, ssa.OpAMD64SHRL, ssa.OpAMD64SHRW, ssa.OpAMD64SHRB,
   250  		ssa.OpAMD64SARQ, ssa.OpAMD64SARL, ssa.OpAMD64SARW, ssa.OpAMD64SARB,
   251  		ssa.OpAMD64ROLQ, ssa.OpAMD64ROLL, ssa.OpAMD64ROLW, ssa.OpAMD64ROLB,
   252  		ssa.OpAMD64RORQ, ssa.OpAMD64RORL, ssa.OpAMD64RORW, ssa.OpAMD64RORB,
   253  		ssa.OpAMD64ADDSS, ssa.OpAMD64ADDSD, ssa.OpAMD64SUBSS, ssa.OpAMD64SUBSD,
   254  		ssa.OpAMD64MULSS, ssa.OpAMD64MULSD, ssa.OpAMD64DIVSS, ssa.OpAMD64DIVSD,
   255  		ssa.OpAMD64MINSS, ssa.OpAMD64MINSD,
   256  		ssa.OpAMD64POR, ssa.OpAMD64PXOR,
   257  		ssa.OpAMD64BTSL, ssa.OpAMD64BTSQ,
   258  		ssa.OpAMD64BTCL, ssa.OpAMD64BTCQ,
   259  		ssa.OpAMD64BTRL, ssa.OpAMD64BTRQ:
   260  		opregreg(s, v.Op.Asm(), v.Reg(), v.Args[1].Reg())
   261  
   262  	case ssa.OpAMD64SHRDQ, ssa.OpAMD64SHLDQ:
   263  		p := s.Prog(v.Op.Asm())
   264  		lo, hi, bits := v.Args[0].Reg(), v.Args[1].Reg(), v.Args[2].Reg()
   265  		p.From.Type = obj.TYPE_REG
   266  		p.From.Reg = bits
   267  		p.To.Type = obj.TYPE_REG
   268  		p.To.Reg = lo
   269  		p.AddRestSourceReg(hi)
   270  
   271  	case ssa.OpAMD64BLSIQ, ssa.OpAMD64BLSIL,
   272  		ssa.OpAMD64BLSMSKQ, ssa.OpAMD64BLSMSKL,
   273  		ssa.OpAMD64BLSRQ, ssa.OpAMD64BLSRL:
   274  		p := s.Prog(v.Op.Asm())
   275  		p.From.Type = obj.TYPE_REG
   276  		p.From.Reg = v.Args[0].Reg()
   277  		p.To.Type = obj.TYPE_REG
   278  		switch v.Op {
   279  		case ssa.OpAMD64BLSRQ, ssa.OpAMD64BLSRL:
   280  			p.To.Reg = v.Reg0()
   281  		default:
   282  			p.To.Reg = v.Reg()
   283  		}
   284  
   285  	case ssa.OpAMD64ANDNQ, ssa.OpAMD64ANDNL:
   286  		p := s.Prog(v.Op.Asm())
   287  		p.From.Type = obj.TYPE_REG
   288  		p.From.Reg = v.Args[0].Reg()
   289  		p.To.Type = obj.TYPE_REG
   290  		p.To.Reg = v.Reg()
   291  		p.AddRestSourceReg(v.Args[1].Reg())
   292  
   293  	case ssa.OpAMD64SARXL, ssa.OpAMD64SARXQ,
   294  		ssa.OpAMD64SHLXL, ssa.OpAMD64SHLXQ,
   295  		ssa.OpAMD64SHRXL, ssa.OpAMD64SHRXQ:
   296  		p := opregreg(s, v.Op.Asm(), v.Reg(), v.Args[1].Reg())
   297  		p.AddRestSourceReg(v.Args[0].Reg())
   298  
   299  	case ssa.OpAMD64SHLXLload, ssa.OpAMD64SHLXQload,
   300  		ssa.OpAMD64SHRXLload, ssa.OpAMD64SHRXQload,
   301  		ssa.OpAMD64SARXLload, ssa.OpAMD64SARXQload:
   302  		p := opregreg(s, v.Op.Asm(), v.Reg(), v.Args[1].Reg())
   303  		m := obj.Addr{Type: obj.TYPE_MEM, Reg: v.Args[0].Reg()}
   304  		ssagen.AddAux(&m, v)
   305  		p.AddRestSource(m)
   306  
   307  	case ssa.OpAMD64SHLXLloadidx1, ssa.OpAMD64SHLXLloadidx4, ssa.OpAMD64SHLXLloadidx8,
   308  		ssa.OpAMD64SHRXLloadidx1, ssa.OpAMD64SHRXLloadidx4, ssa.OpAMD64SHRXLloadidx8,
   309  		ssa.OpAMD64SARXLloadidx1, ssa.OpAMD64SARXLloadidx4, ssa.OpAMD64SARXLloadidx8,
   310  		ssa.OpAMD64SHLXQloadidx1, ssa.OpAMD64SHLXQloadidx8,
   311  		ssa.OpAMD64SHRXQloadidx1, ssa.OpAMD64SHRXQloadidx8,
   312  		ssa.OpAMD64SARXQloadidx1, ssa.OpAMD64SARXQloadidx8:
   313  		p := opregreg(s, v.Op.Asm(), v.Reg(), v.Args[2].Reg())
   314  		m := obj.Addr{Type: obj.TYPE_MEM}
   315  		memIdx(&m, v)
   316  		ssagen.AddAux(&m, v)
   317  		p.AddRestSource(m)
   318  
   319  	case ssa.OpAMD64DIVQU, ssa.OpAMD64DIVLU, ssa.OpAMD64DIVWU:
   320  		// Arg[0] (the dividend) is in AX.
   321  		// Arg[1] (the divisor) can be in any other register.
   322  		// Result[0] (the quotient) is in AX.
   323  		// Result[1] (the remainder) is in DX.
   324  		r := v.Args[1].Reg()
   325  
   326  		// Zero extend dividend.
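        		// (Unsigned DIV divides the double-width value in DX:AX by the
        		// operand, so clearing DX zero-extends the dividend in AX.)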
   327  		opregreg(s, x86.AXORL, x86.REG_DX, x86.REG_DX)
   328  
   329  		// Issue divide.
   330  		p := s.Prog(v.Op.Asm())
   331  		p.From.Type = obj.TYPE_REG
   332  		p.From.Reg = r
   333  
   334  	case ssa.OpAMD64DIVQ, ssa.OpAMD64DIVL, ssa.OpAMD64DIVW:
   335  		// Arg[0] (the dividend) is in AX.
   336  		// Arg[1] (the divisor) can be in any other register.
   337  		// Result[0] (the quotient) is in AX.
   338  		// Result[1] (the remainder) is in DX.
   339  		r := v.Args[1].Reg()
   340  
   341  		var opCMP, opNEG, opSXD obj.As
   342  		switch v.Op {
   343  		case ssa.OpAMD64DIVQ:
   344  			opCMP, opNEG, opSXD = x86.ACMPQ, x86.ANEGQ, x86.ACQO
   345  		case ssa.OpAMD64DIVL:
   346  			opCMP, opNEG, opSXD = x86.ACMPL, x86.ANEGL, x86.ACDQ
   347  		case ssa.OpAMD64DIVW:
   348  			opCMP, opNEG, opSXD = x86.ACMPW, x86.ANEGW, x86.ACWD
   349  		}
   350  
   351  		// CPU faults upon signed overflow, which occurs when the most
   352  		// negative int is divided by -1. Handle divide by -1 as a special case.
   353  		var j1, j2 *obj.Prog
   354  		if ssa.DivisionNeedsFixUp(v) {
   355  			c := s.Prog(opCMP)
   356  			c.From.Type = obj.TYPE_REG
   357  			c.From.Reg = r
   358  			c.To.Type = obj.TYPE_CONST
   359  			c.To.Offset = -1
   360  
   361  			// Divisor is not -1, proceed with normal division.
   362  			j1 = s.Prog(x86.AJNE)
   363  			j1.To.Type = obj.TYPE_BRANCH
   364  
   365  			// Divisor is -1, manually compute quotient and remainder via fixup code.
   366  			// n / -1 = -n
   367  			n1 := s.Prog(opNEG)
   368  			n1.To.Type = obj.TYPE_REG
   369  			n1.To.Reg = x86.REG_AX
   370  
   371  			// n % -1 == 0
   372  			opregreg(s, x86.AXORL, x86.REG_DX, x86.REG_DX)
   373  
   374  			// TODO(khr): issue only the -1 fixup code we need.
   375  			// For instance, if only the quotient is used, no point in zeroing the remainder.
   376  
   377  			// Skip over normal division.
   378  			j2 = s.Prog(obj.AJMP)
   379  			j2.To.Type = obj.TYPE_BRANCH
   380  		}
   381  
   382  		// Sign extend dividend and perform division.
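        		// (CQO/CDQ/CWD sign-extend AX into DX, forming the double-width
        		// dividend in DX:AX that IDIV expects.)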
   383  		p := s.Prog(opSXD)
   384  		if j1 != nil {
   385  			j1.To.SetTarget(p)
   386  		}
   387  		p = s.Prog(v.Op.Asm())
   388  		p.From.Type = obj.TYPE_REG
   389  		p.From.Reg = r
   390  
   391  		if j2 != nil {
   392  			j2.To.SetTarget(s.Pc())
   393  		}
   394  
   395  	case ssa.OpAMD64HMULQ, ssa.OpAMD64HMULL, ssa.OpAMD64HMULQU, ssa.OpAMD64HMULLU:
   396  		// The frontend rewrites constant division by 8/16/32-bit integers into
   397  		// HMUL by a constant.
   398  		// SSA rewrites generate the 64-bit versions.
   399  
   400  		// Arg[0] is already in AX as it's the only register we allow
   401  		// and DX is the only output we care about (the high bits)
   402  		p := s.Prog(v.Op.Asm())
   403  		p.From.Type = obj.TYPE_REG
   404  		p.From.Reg = v.Args[1].Reg()
   405  
   406  		// IMULB puts the high portion in AH instead of DL,
   407  		// so move it to DL for consistency
   408  		if v.Type.Size() == 1 {
   409  			m := s.Prog(x86.AMOVB)
   410  			m.From.Type = obj.TYPE_REG
   411  			m.From.Reg = x86.REG_AH
   412  			m.To.Type = obj.TYPE_REG
   413  			m.To.Reg = x86.REG_DX
   414  		}
   415  
   416  	case ssa.OpAMD64MULQU, ssa.OpAMD64MULLU:
   417  		// Arg[0] is already in AX as it's the only register we allow
   418  		// results lo in AX
   419  		p := s.Prog(v.Op.Asm())
   420  		p.From.Type = obj.TYPE_REG
   421  		p.From.Reg = v.Args[1].Reg()
   422  
   423  	case ssa.OpAMD64MULQU2:
   424  		// Arg[0] is already in AX as it's the only register we allow
   425  		// results hi in DX, lo in AX
   426  		p := s.Prog(v.Op.Asm())
   427  		p.From.Type = obj.TYPE_REG
   428  		p.From.Reg = v.Args[1].Reg()
   429  
   430  	case ssa.OpAMD64DIVQU2:
   431  		// Arg[0], Arg[1] are already in DX, AX, as they're the only registers we allow
   432  		// results q in AX, r in DX
   433  		p := s.Prog(v.Op.Asm())
   434  		p.From.Type = obj.TYPE_REG
   435  		p.From.Reg = v.Args[2].Reg()
   436  
   437  	case ssa.OpAMD64AVGQU:
   438  		// compute (x+y)/2 unsigned.
   439  		// Do a 64-bit add, the overflow goes into the carry.
   440  		// Shift right once and pull the carry back into the 63rd bit.
   441  		p := s.Prog(x86.AADDQ)
   442  		p.From.Type = obj.TYPE_REG
   443  		p.To.Type = obj.TYPE_REG
   444  		p.To.Reg = v.Reg()
   445  		p.From.Reg = v.Args[1].Reg()
   446  		p = s.Prog(x86.ARCRQ)
   447  		p.From.Type = obj.TYPE_CONST
   448  		p.From.Offset = 1
   449  		p.To.Type = obj.TYPE_REG
   450  		p.To.Reg = v.Reg()
   451  
   452  	case ssa.OpAMD64ADDQcarry, ssa.OpAMD64ADCQ:
   453  		r := v.Reg0()
   454  		r0 := v.Args[0].Reg()
   455  		r1 := v.Args[1].Reg()
   456  		switch r {
   457  		case r0:
   458  			p := s.Prog(v.Op.Asm())
   459  			p.From.Type = obj.TYPE_REG
   460  			p.From.Reg = r1
   461  			p.To.Type = obj.TYPE_REG
   462  			p.To.Reg = r
   463  		case r1:
   464  			p := s.Prog(v.Op.Asm())
   465  			p.From.Type = obj.TYPE_REG
   466  			p.From.Reg = r0
   467  			p.To.Type = obj.TYPE_REG
   468  			p.To.Reg = r
   469  		default:
   470  			v.Fatalf("output not in same register as an input %s", v.LongString())
   471  		}
   472  
   473  	case ssa.OpAMD64SUBQborrow, ssa.OpAMD64SBBQ:
   474  		p := s.Prog(v.Op.Asm())
   475  		p.From.Type = obj.TYPE_REG
   476  		p.From.Reg = v.Args[1].Reg()
   477  		p.To.Type = obj.TYPE_REG
   478  		p.To.Reg = v.Reg0()
   479  
   480  	case ssa.OpAMD64ADDQconstcarry, ssa.OpAMD64ADCQconst, ssa.OpAMD64SUBQconstborrow, ssa.OpAMD64SBBQconst:
   481  		p := s.Prog(v.Op.Asm())
   482  		p.From.Type = obj.TYPE_CONST
   483  		p.From.Offset = v.AuxInt
   484  		p.To.Type = obj.TYPE_REG
   485  		p.To.Reg = v.Reg0()
   486  
   487  	case ssa.OpAMD64ADDQconst, ssa.OpAMD64ADDLconst:
   488  		r := v.Reg()
   489  		a := v.Args[0].Reg()
   490  		if r == a {
   491  			switch v.AuxInt {
   492  			case 1:
   493  				var asm obj.As
   494  				// Software optimization manual recommends add $1,reg.
   495  				// But inc/dec is 1 byte smaller. ICC always uses inc;
   496  				// Clang/GCC choose depending on flags, but prefer add.
   497  				// Experiments show that inc/dec is a little faster
   498  				// and also makes the binary a little smaller.
   499  				if v.Op == ssa.OpAMD64ADDQconst {
   500  					asm = x86.AINCQ
   501  				} else {
   502  					asm = x86.AINCL
   503  				}
   504  				p := s.Prog(asm)
   505  				p.To.Type = obj.TYPE_REG
   506  				p.To.Reg = r
   507  				return
   508  			case -1:
   509  				var asm obj.As
   510  				if v.Op == ssa.OpAMD64ADDQconst {
   511  					asm = x86.ADECQ
   512  				} else {
   513  					asm = x86.ADECL
   514  				}
   515  				p := s.Prog(asm)
   516  				p.To.Type = obj.TYPE_REG
   517  				p.To.Reg = r
   518  				return
   519  			case 0x80:
   520  				// 'SUBQ $-0x80, r' is shorter to encode than
   521  				// and functionally equivalent to 'ADDQ $0x80, r'.
   522  				asm := x86.ASUBL
   523  				if v.Op == ssa.OpAMD64ADDQconst {
   524  					asm = x86.ASUBQ
   525  				}
   526  				p := s.Prog(asm)
   527  				p.From.Type = obj.TYPE_CONST
   528  				p.From.Offset = -0x80
   529  				p.To.Type = obj.TYPE_REG
   530  				p.To.Reg = r
   531  				return
   532  
   533  			}
   534  			p := s.Prog(v.Op.Asm())
   535  			p.From.Type = obj.TYPE_CONST
   536  			p.From.Offset = v.AuxInt
   537  			p.To.Type = obj.TYPE_REG
   538  			p.To.Reg = r
   539  			return
   540  		}
   541  		var asm obj.As
   542  		if v.Op == ssa.OpAMD64ADDQconst {
   543  			asm = x86.ALEAQ
   544  		} else {
   545  			asm = x86.ALEAL
   546  		}
   547  		p := s.Prog(asm)
   548  		p.From.Type = obj.TYPE_MEM
   549  		p.From.Reg = a
   550  		p.From.Offset = v.AuxInt
   551  		p.To.Type = obj.TYPE_REG
   552  		p.To.Reg = r
   553  
   554  	case ssa.OpAMD64CMOVQEQ, ssa.OpAMD64CMOVLEQ, ssa.OpAMD64CMOVWEQ,
   555  		ssa.OpAMD64CMOVQLT, ssa.OpAMD64CMOVLLT, ssa.OpAMD64CMOVWLT,
   556  		ssa.OpAMD64CMOVQNE, ssa.OpAMD64CMOVLNE, ssa.OpAMD64CMOVWNE,
   557  		ssa.OpAMD64CMOVQGT, ssa.OpAMD64CMOVLGT, ssa.OpAMD64CMOVWGT,
   558  		ssa.OpAMD64CMOVQLE, ssa.OpAMD64CMOVLLE, ssa.OpAMD64CMOVWLE,
   559  		ssa.OpAMD64CMOVQGE, ssa.OpAMD64CMOVLGE, ssa.OpAMD64CMOVWGE,
   560  		ssa.OpAMD64CMOVQHI, ssa.OpAMD64CMOVLHI, ssa.OpAMD64CMOVWHI,
   561  		ssa.OpAMD64CMOVQLS, ssa.OpAMD64CMOVLLS, ssa.OpAMD64CMOVWLS,
   562  		ssa.OpAMD64CMOVQCC, ssa.OpAMD64CMOVLCC, ssa.OpAMD64CMOVWCC,
   563  		ssa.OpAMD64CMOVQCS, ssa.OpAMD64CMOVLCS, ssa.OpAMD64CMOVWCS,
   564  		ssa.OpAMD64CMOVQGTF, ssa.OpAMD64CMOVLGTF, ssa.OpAMD64CMOVWGTF,
   565  		ssa.OpAMD64CMOVQGEF, ssa.OpAMD64CMOVLGEF, ssa.OpAMD64CMOVWGEF:
   566  		p := s.Prog(v.Op.Asm())
   567  		p.From.Type = obj.TYPE_REG
   568  		p.From.Reg = v.Args[1].Reg()
   569  		p.To.Type = obj.TYPE_REG
   570  		p.To.Reg = v.Reg()
   571  
   572  	case ssa.OpAMD64CMOVQNEF, ssa.OpAMD64CMOVLNEF, ssa.OpAMD64CMOVWNEF:
   573  		// Flag condition: ^ZERO || PARITY
   574  		// Generate:
   575  		//   CMOV*NE  SRC,DST
   576  		//   CMOV*PS  SRC,DST
   577  		p := s.Prog(v.Op.Asm())
   578  		p.From.Type = obj.TYPE_REG
   579  		p.From.Reg = v.Args[1].Reg()
   580  		p.To.Type = obj.TYPE_REG
   581  		p.To.Reg = v.Reg()
   582  		var q *obj.Prog
   583  		if v.Op == ssa.OpAMD64CMOVQNEF {
   584  			q = s.Prog(x86.ACMOVQPS)
   585  		} else if v.Op == ssa.OpAMD64CMOVLNEF {
   586  			q = s.Prog(x86.ACMOVLPS)
   587  		} else {
   588  			q = s.Prog(x86.ACMOVWPS)
   589  		}
   590  		q.From.Type = obj.TYPE_REG
   591  		q.From.Reg = v.Args[1].Reg()
   592  		q.To.Type = obj.TYPE_REG
   593  		q.To.Reg = v.Reg()
   594  
   595  	case ssa.OpAMD64CMOVQEQF, ssa.OpAMD64CMOVLEQF, ssa.OpAMD64CMOVWEQF:
   596  		// Flag condition: ZERO && !PARITY
   597  		// Generate:
   598  		//   MOV      SRC,TMP
   599  		//   CMOV*NE  DST,TMP
   600  		//   CMOV*PC  TMP,DST
   601  		//
   602  		// TODO(rasky): we could generate:
   603  		//   CMOV*NE  DST,SRC
   604  		//   CMOV*PC  SRC,DST
   605  		// But this requires a way for regalloc to know that SRC might be
   606  		// clobbered by this instruction.
   607  		t := v.RegTmp()
   608  		opregreg(s, moveByType(v.Type), t, v.Args[1].Reg())
   609  
   610  		p := s.Prog(v.Op.Asm())
   611  		p.From.Type = obj.TYPE_REG
   612  		p.From.Reg = v.Reg()
   613  		p.To.Type = obj.TYPE_REG
   614  		p.To.Reg = t
   615  		var q *obj.Prog
   616  		if v.Op == ssa.OpAMD64CMOVQEQF {
   617  			q = s.Prog(x86.ACMOVQPC)
   618  		} else if v.Op == ssa.OpAMD64CMOVLEQF {
   619  			q = s.Prog(x86.ACMOVLPC)
   620  		} else {
   621  			q = s.Prog(x86.ACMOVWPC)
   622  		}
   623  		q.From.Type = obj.TYPE_REG
   624  		q.From.Reg = t
   625  		q.To.Type = obj.TYPE_REG
   626  		q.To.Reg = v.Reg()
   627  
   628  	case ssa.OpAMD64MULQconst, ssa.OpAMD64MULLconst:
   629  		r := v.Reg()
   630  		p := s.Prog(v.Op.Asm())
   631  		p.From.Type = obj.TYPE_CONST
   632  		p.From.Offset = v.AuxInt
   633  		p.To.Type = obj.TYPE_REG
   634  		p.To.Reg = r
   635  		p.AddRestSourceReg(v.Args[0].Reg())
   636  
   637  	case ssa.OpAMD64ANDQconst:
   638  		asm := v.Op.Asm()
   639  		// If the constant is positive and fits into 32 bits, use ANDL.
   640  		// This saves a few bytes of encoding.
   641  		if 0 <= v.AuxInt && v.AuxInt <= (1<<32-1) {
   642  			asm = x86.AANDL
   643  		}
   644  		p := s.Prog(asm)
   645  		p.From.Type = obj.TYPE_CONST
   646  		p.From.Offset = v.AuxInt
   647  		p.To.Type = obj.TYPE_REG
   648  		p.To.Reg = v.Reg()
   649  
   650  	case ssa.OpAMD64SUBQconst, ssa.OpAMD64SUBLconst,
   651  		ssa.OpAMD64ANDLconst,
   652  		ssa.OpAMD64ORQconst, ssa.OpAMD64ORLconst,
   653  		ssa.OpAMD64XORQconst, ssa.OpAMD64XORLconst,
   654  		ssa.OpAMD64SHLQconst, ssa.OpAMD64SHLLconst,
   655  		ssa.OpAMD64SHRQconst, ssa.OpAMD64SHRLconst, ssa.OpAMD64SHRWconst, ssa.OpAMD64SHRBconst,
   656  		ssa.OpAMD64SARQconst, ssa.OpAMD64SARLconst, ssa.OpAMD64SARWconst, ssa.OpAMD64SARBconst,
   657  		ssa.OpAMD64ROLQconst, ssa.OpAMD64ROLLconst, ssa.OpAMD64ROLWconst, ssa.OpAMD64ROLBconst:
   658  		p := s.Prog(v.Op.Asm())
   659  		p.From.Type = obj.TYPE_CONST
   660  		p.From.Offset = v.AuxInt
   661  		p.To.Type = obj.TYPE_REG
   662  		p.To.Reg = v.Reg()
   663  	case ssa.OpAMD64SBBQcarrymask, ssa.OpAMD64SBBLcarrymask:
   664  		r := v.Reg()
   665  		p := s.Prog(v.Op.Asm())
   666  		p.From.Type = obj.TYPE_REG
   667  		p.From.Reg = r
   668  		p.To.Type = obj.TYPE_REG
   669  		p.To.Reg = r
   670  	case ssa.OpAMD64LEAQ1, ssa.OpAMD64LEAQ2, ssa.OpAMD64LEAQ4, ssa.OpAMD64LEAQ8,
   671  		ssa.OpAMD64LEAL1, ssa.OpAMD64LEAL2, ssa.OpAMD64LEAL4, ssa.OpAMD64LEAL8,
   672  		ssa.OpAMD64LEAW1, ssa.OpAMD64LEAW2, ssa.OpAMD64LEAW4, ssa.OpAMD64LEAW8:
   673  		p := s.Prog(v.Op.Asm())
   674  		memIdx(&p.From, v)
   675  		o := v.Reg()
   676  		p.To.Type = obj.TYPE_REG
   677  		p.To.Reg = o
   678  		if v.AuxInt != 0 && v.Aux == nil {
   679  			// Emit an additional LEA to add the displacement instead of creating a slow 3-operand LEA.
   680  			switch v.Op {
   681  			case ssa.OpAMD64LEAQ1, ssa.OpAMD64LEAQ2, ssa.OpAMD64LEAQ4, ssa.OpAMD64LEAQ8:
   682  				p = s.Prog(x86.ALEAQ)
   683  			case ssa.OpAMD64LEAL1, ssa.OpAMD64LEAL2, ssa.OpAMD64LEAL4, ssa.OpAMD64LEAL8:
   684  				p = s.Prog(x86.ALEAL)
   685  			case ssa.OpAMD64LEAW1, ssa.OpAMD64LEAW2, ssa.OpAMD64LEAW4, ssa.OpAMD64LEAW8:
   686  				p = s.Prog(x86.ALEAW)
   687  			}
   688  			p.From.Type = obj.TYPE_MEM
   689  			p.From.Reg = o
   690  			p.To.Type = obj.TYPE_REG
   691  			p.To.Reg = o
   692  		}
   693  		ssagen.AddAux(&p.From, v)
   694  	case ssa.OpAMD64LEAQ, ssa.OpAMD64LEAL, ssa.OpAMD64LEAW:
   695  		p := s.Prog(v.Op.Asm())
   696  		p.From.Type = obj.TYPE_MEM
   697  		p.From.Reg = v.Args[0].Reg()
   698  		ssagen.AddAux(&p.From, v)
   699  		p.To.Type = obj.TYPE_REG
   700  		p.To.Reg = v.Reg()
   701  	case ssa.OpAMD64CMPQ, ssa.OpAMD64CMPL, ssa.OpAMD64CMPW, ssa.OpAMD64CMPB,
   702  		ssa.OpAMD64TESTQ, ssa.OpAMD64TESTL, ssa.OpAMD64TESTW, ssa.OpAMD64TESTB,
   703  		ssa.OpAMD64BTL, ssa.OpAMD64BTQ:
   704  		opregreg(s, v.Op.Asm(), v.Args[1].Reg(), v.Args[0].Reg())
   705  	case ssa.OpAMD64UCOMISS, ssa.OpAMD64UCOMISD:
   706  		// The Go assembler has swapped operands for UCOMISx relative to CMP,
   707  		// so we must account for that here.
   708  		opregreg(s, v.Op.Asm(), v.Args[0].Reg(), v.Args[1].Reg())
   709  	case ssa.OpAMD64CMPQconst, ssa.OpAMD64CMPLconst, ssa.OpAMD64CMPWconst, ssa.OpAMD64CMPBconst:
   710  		p := s.Prog(v.Op.Asm())
   711  		p.From.Type = obj.TYPE_REG
   712  		p.From.Reg = v.Args[0].Reg()
   713  		p.To.Type = obj.TYPE_CONST
   714  		p.To.Offset = v.AuxInt
   715  	case ssa.OpAMD64BTLconst, ssa.OpAMD64BTQconst,
   716  		ssa.OpAMD64TESTQconst, ssa.OpAMD64TESTLconst, ssa.OpAMD64TESTWconst, ssa.OpAMD64TESTBconst,
   717  		ssa.OpAMD64BTSQconst,
   718  		ssa.OpAMD64BTCQconst,
   719  		ssa.OpAMD64BTRQconst:
   720  		op := v.Op
   721  		if op == ssa.OpAMD64BTQconst && v.AuxInt < 32 {
   722  			// Emit 32-bit version because it's shorter
   723  			op = ssa.OpAMD64BTLconst
   724  		}
   725  		p := s.Prog(op.Asm())
   726  		p.From.Type = obj.TYPE_CONST
   727  		p.From.Offset = v.AuxInt
   728  		p.To.Type = obj.TYPE_REG
   729  		p.To.Reg = v.Args[0].Reg()
   730  	case ssa.OpAMD64CMPQload, ssa.OpAMD64CMPLload, ssa.OpAMD64CMPWload, ssa.OpAMD64CMPBload:
   731  		p := s.Prog(v.Op.Asm())
   732  		p.From.Type = obj.TYPE_MEM
   733  		p.From.Reg = v.Args[0].Reg()
   734  		ssagen.AddAux(&p.From, v)
   735  		p.To.Type = obj.TYPE_REG
   736  		p.To.Reg = v.Args[1].Reg()
   737  	case ssa.OpAMD64CMPQconstload, ssa.OpAMD64CMPLconstload, ssa.OpAMD64CMPWconstload, ssa.OpAMD64CMPBconstload:
   738  		sc := v.AuxValAndOff()
   739  		p := s.Prog(v.Op.Asm())
   740  		p.From.Type = obj.TYPE_MEM
   741  		p.From.Reg = v.Args[0].Reg()
   742  		ssagen.AddAux2(&p.From, v, sc.Off64())
   743  		p.To.Type = obj.TYPE_CONST
   744  		p.To.Offset = sc.Val64()
   745  	case ssa.OpAMD64CMPQloadidx8, ssa.OpAMD64CMPQloadidx1, ssa.OpAMD64CMPLloadidx4, ssa.OpAMD64CMPLloadidx1, ssa.OpAMD64CMPWloadidx2, ssa.OpAMD64CMPWloadidx1, ssa.OpAMD64CMPBloadidx1:
   746  		p := s.Prog(v.Op.Asm())
   747  		memIdx(&p.From, v)
   748  		ssagen.AddAux(&p.From, v)
   749  		p.To.Type = obj.TYPE_REG
   750  		p.To.Reg = v.Args[2].Reg()
   751  	case ssa.OpAMD64CMPQconstloadidx8, ssa.OpAMD64CMPQconstloadidx1, ssa.OpAMD64CMPLconstloadidx4, ssa.OpAMD64CMPLconstloadidx1, ssa.OpAMD64CMPWconstloadidx2, ssa.OpAMD64CMPWconstloadidx1, ssa.OpAMD64CMPBconstloadidx1:
   752  		sc := v.AuxValAndOff()
   753  		p := s.Prog(v.Op.Asm())
   754  		memIdx(&p.From, v)
   755  		ssagen.AddAux2(&p.From, v, sc.Off64())
   756  		p.To.Type = obj.TYPE_CONST
   757  		p.To.Offset = sc.Val64()
   758  	case ssa.OpAMD64MOVLconst, ssa.OpAMD64MOVQconst:
   759  		x := v.Reg()
   760  
   761  		// If flags aren't live (indicated by v.Aux == nil),
   762  		// then we can rewrite MOV $0, AX into XOR AX, AX.
   763  		if v.AuxInt == 0 && v.Aux == nil {
   764  			opregreg(s, x86.AXORL, x, x)
   765  			break
   766  		}
   767  
   768  		asm := v.Op.Asm()
   769  		// Use MOVL to move a small constant into a register
   770  		// when the constant is positive and fits into 32 bits.
   771  		if 0 <= v.AuxInt && v.AuxInt <= (1<<32-1) {
   772  			// The upper 32 bits are zeroed automatically when using MOVL.
   773  			asm = x86.AMOVL
   774  		}
   775  		p := s.Prog(asm)
   776  		p.From.Type = obj.TYPE_CONST
   777  		p.From.Offset = v.AuxInt
   778  		p.To.Type = obj.TYPE_REG
   779  		p.To.Reg = x
   780  	case ssa.OpAMD64MOVSSconst, ssa.OpAMD64MOVSDconst:
   781  		x := v.Reg()
   782  		p := s.Prog(v.Op.Asm())
   783  		p.From.Type = obj.TYPE_FCONST
   784  		p.From.Val = math.Float64frombits(uint64(v.AuxInt))
   785  		p.To.Type = obj.TYPE_REG
   786  		p.To.Reg = x
   787  	case ssa.OpAMD64MOVQload, ssa.OpAMD64MOVLload, ssa.OpAMD64MOVWload, ssa.OpAMD64MOVBload, ssa.OpAMD64MOVOload,
   788  		ssa.OpAMD64MOVSSload, ssa.OpAMD64MOVSDload, ssa.OpAMD64MOVBQSXload, ssa.OpAMD64MOVWQSXload, ssa.OpAMD64MOVLQSXload,
   789  		ssa.OpAMD64MOVBEQload, ssa.OpAMD64MOVBELload:
   790  		p := s.Prog(v.Op.Asm())
   791  		p.From.Type = obj.TYPE_MEM
   792  		p.From.Reg = v.Args[0].Reg()
   793  		ssagen.AddAux(&p.From, v)
   794  		p.To.Type = obj.TYPE_REG
   795  		p.To.Reg = v.Reg()
   796  	case ssa.OpAMD64MOVBloadidx1, ssa.OpAMD64MOVWloadidx1, ssa.OpAMD64MOVLloadidx1, ssa.OpAMD64MOVQloadidx1, ssa.OpAMD64MOVSSloadidx1, ssa.OpAMD64MOVSDloadidx1,
   797  		ssa.OpAMD64MOVQloadidx8, ssa.OpAMD64MOVSDloadidx8, ssa.OpAMD64MOVLloadidx8, ssa.OpAMD64MOVLloadidx4, ssa.OpAMD64MOVSSloadidx4, ssa.OpAMD64MOVWloadidx2,
   798  		ssa.OpAMD64MOVBELloadidx1, ssa.OpAMD64MOVBELloadidx4, ssa.OpAMD64MOVBELloadidx8, ssa.OpAMD64MOVBEQloadidx1, ssa.OpAMD64MOVBEQloadidx8:
   799  		p := s.Prog(v.Op.Asm())
   800  		memIdx(&p.From, v)
   801  		ssagen.AddAux(&p.From, v)
   802  		p.To.Type = obj.TYPE_REG
   803  		p.To.Reg = v.Reg()
   804  	case ssa.OpAMD64MOVQstore, ssa.OpAMD64MOVSSstore, ssa.OpAMD64MOVSDstore, ssa.OpAMD64MOVLstore, ssa.OpAMD64MOVWstore, ssa.OpAMD64MOVBstore, ssa.OpAMD64MOVOstore,
   805  		ssa.OpAMD64ADDQmodify, ssa.OpAMD64SUBQmodify, ssa.OpAMD64ANDQmodify, ssa.OpAMD64ORQmodify, ssa.OpAMD64XORQmodify,
   806  		ssa.OpAMD64ADDLmodify, ssa.OpAMD64SUBLmodify, ssa.OpAMD64ANDLmodify, ssa.OpAMD64ORLmodify, ssa.OpAMD64XORLmodify,
   807  		ssa.OpAMD64MOVBEQstore, ssa.OpAMD64MOVBELstore, ssa.OpAMD64MOVBEWstore:
   808  		p := s.Prog(v.Op.Asm())
   809  		p.From.Type = obj.TYPE_REG
   810  		p.From.Reg = v.Args[1].Reg()
   811  		p.To.Type = obj.TYPE_MEM
   812  		p.To.Reg = v.Args[0].Reg()
   813  		ssagen.AddAux(&p.To, v)
   814  	case ssa.OpAMD64MOVBstoreidx1, ssa.OpAMD64MOVWstoreidx1, ssa.OpAMD64MOVLstoreidx1, ssa.OpAMD64MOVQstoreidx1, ssa.OpAMD64MOVSSstoreidx1, ssa.OpAMD64MOVSDstoreidx1,
   815  		ssa.OpAMD64MOVQstoreidx8, ssa.OpAMD64MOVSDstoreidx8, ssa.OpAMD64MOVLstoreidx8, ssa.OpAMD64MOVSSstoreidx4, ssa.OpAMD64MOVLstoreidx4, ssa.OpAMD64MOVWstoreidx2,
   816  		ssa.OpAMD64ADDLmodifyidx1, ssa.OpAMD64ADDLmodifyidx4, ssa.OpAMD64ADDLmodifyidx8, ssa.OpAMD64ADDQmodifyidx1, ssa.OpAMD64ADDQmodifyidx8,
   817  		ssa.OpAMD64SUBLmodifyidx1, ssa.OpAMD64SUBLmodifyidx4, ssa.OpAMD64SUBLmodifyidx8, ssa.OpAMD64SUBQmodifyidx1, ssa.OpAMD64SUBQmodifyidx8,
   818  		ssa.OpAMD64ANDLmodifyidx1, ssa.OpAMD64ANDLmodifyidx4, ssa.OpAMD64ANDLmodifyidx8, ssa.OpAMD64ANDQmodifyidx1, ssa.OpAMD64ANDQmodifyidx8,
   819  		ssa.OpAMD64ORLmodifyidx1, ssa.OpAMD64ORLmodifyidx4, ssa.OpAMD64ORLmodifyidx8, ssa.OpAMD64ORQmodifyidx1, ssa.OpAMD64ORQmodifyidx8,
   820  		ssa.OpAMD64XORLmodifyidx1, ssa.OpAMD64XORLmodifyidx4, ssa.OpAMD64XORLmodifyidx8, ssa.OpAMD64XORQmodifyidx1, ssa.OpAMD64XORQmodifyidx8,
   821  		ssa.OpAMD64MOVBEWstoreidx1, ssa.OpAMD64MOVBEWstoreidx2, ssa.OpAMD64MOVBELstoreidx1, ssa.OpAMD64MOVBELstoreidx4, ssa.OpAMD64MOVBELstoreidx8, ssa.OpAMD64MOVBEQstoreidx1, ssa.OpAMD64MOVBEQstoreidx8:
   822  		p := s.Prog(v.Op.Asm())
   823  		p.From.Type = obj.TYPE_REG
   824  		p.From.Reg = v.Args[2].Reg()
   825  		memIdx(&p.To, v)
   826  		ssagen.AddAux(&p.To, v)
   827  	case ssa.OpAMD64ADDQconstmodify, ssa.OpAMD64ADDLconstmodify:
   828  		sc := v.AuxValAndOff()
   829  		off := sc.Off64()
   830  		val := sc.Val()
   831  		if val == 1 || val == -1 {
   832  			var asm obj.As
   833  			if v.Op == ssa.OpAMD64ADDQconstmodify {
   834  				if val == 1 {
   835  					asm = x86.AINCQ
   836  				} else {
   837  					asm = x86.ADECQ
   838  				}
   839  			} else {
   840  				if val == 1 {
   841  					asm = x86.AINCL
   842  				} else {
   843  					asm = x86.ADECL
   844  				}
   845  			}
   846  			p := s.Prog(asm)
   847  			p.To.Type = obj.TYPE_MEM
   848  			p.To.Reg = v.Args[0].Reg()
   849  			ssagen.AddAux2(&p.To, v, off)
   850  			break
   851  		}
   852  		fallthrough
   853  	case ssa.OpAMD64ANDQconstmodify, ssa.OpAMD64ANDLconstmodify, ssa.OpAMD64ORQconstmodify, ssa.OpAMD64ORLconstmodify,
   854  		ssa.OpAMD64XORQconstmodify, ssa.OpAMD64XORLconstmodify,
   855  		ssa.OpAMD64BTSQconstmodify, ssa.OpAMD64BTRQconstmodify, ssa.OpAMD64BTCQconstmodify:
   856  		sc := v.AuxValAndOff()
   857  		off := sc.Off64()
   858  		val := sc.Val64()
   859  		p := s.Prog(v.Op.Asm())
   860  		p.From.Type = obj.TYPE_CONST
   861  		p.From.Offset = val
   862  		p.To.Type = obj.TYPE_MEM
   863  		p.To.Reg = v.Args[0].Reg()
   864  		ssagen.AddAux2(&p.To, v, off)
   865  
   866  	case ssa.OpAMD64MOVQstoreconst, ssa.OpAMD64MOVLstoreconst, ssa.OpAMD64MOVWstoreconst, ssa.OpAMD64MOVBstoreconst:
   867  		p := s.Prog(v.Op.Asm())
   868  		p.From.Type = obj.TYPE_CONST
   869  		sc := v.AuxValAndOff()
   870  		p.From.Offset = sc.Val64()
   871  		p.To.Type = obj.TYPE_MEM
   872  		p.To.Reg = v.Args[0].Reg()
   873  		ssagen.AddAux2(&p.To, v, sc.Off64())
   874  	case ssa.OpAMD64MOVOstoreconst:
   875  		sc := v.AuxValAndOff()
   876  		if sc.Val() != 0 {
   877  			v.Fatalf("MOVO for non zero constants not implemented: %s", v.LongString())
   878  		}
   879  
   880  		if s.ABI != obj.ABIInternal {
   881  			// zero X15 manually
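        			// (Under ABIInternal, X15 is the designated zero register and is
        			// assumed to already hold zero; other ABIs must clear it before
        			// using it as the 16-byte zero source.)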
   882  			opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
   883  		}
   884  		p := s.Prog(v.Op.Asm())
   885  		p.From.Type = obj.TYPE_REG
   886  		p.From.Reg = x86.REG_X15
   887  		p.To.Type = obj.TYPE_MEM
   888  		p.To.Reg = v.Args[0].Reg()
   889  		ssagen.AddAux2(&p.To, v, sc.Off64())
   890  
   891  	case ssa.OpAMD64MOVQstoreconstidx1, ssa.OpAMD64MOVQstoreconstidx8, ssa.OpAMD64MOVLstoreconstidx1, ssa.OpAMD64MOVLstoreconstidx4, ssa.OpAMD64MOVWstoreconstidx1, ssa.OpAMD64MOVWstoreconstidx2, ssa.OpAMD64MOVBstoreconstidx1,
   892  		ssa.OpAMD64ADDLconstmodifyidx1, ssa.OpAMD64ADDLconstmodifyidx4, ssa.OpAMD64ADDLconstmodifyidx8, ssa.OpAMD64ADDQconstmodifyidx1, ssa.OpAMD64ADDQconstmodifyidx8,
   893  		ssa.OpAMD64ANDLconstmodifyidx1, ssa.OpAMD64ANDLconstmodifyidx4, ssa.OpAMD64ANDLconstmodifyidx8, ssa.OpAMD64ANDQconstmodifyidx1, ssa.OpAMD64ANDQconstmodifyidx8,
   894  		ssa.OpAMD64ORLconstmodifyidx1, ssa.OpAMD64ORLconstmodifyidx4, ssa.OpAMD64ORLconstmodifyidx8, ssa.OpAMD64ORQconstmodifyidx1, ssa.OpAMD64ORQconstmodifyidx8,
   895  		ssa.OpAMD64XORLconstmodifyidx1, ssa.OpAMD64XORLconstmodifyidx4, ssa.OpAMD64XORLconstmodifyidx8, ssa.OpAMD64XORQconstmodifyidx1, ssa.OpAMD64XORQconstmodifyidx8:
   896  		p := s.Prog(v.Op.Asm())
   897  		p.From.Type = obj.TYPE_CONST
   898  		sc := v.AuxValAndOff()
   899  		p.From.Offset = sc.Val64()
   900  		switch {
   901  		case p.As == x86.AADDQ && p.From.Offset == 1:
   902  			p.As = x86.AINCQ
   903  			p.From.Type = obj.TYPE_NONE
   904  		case p.As == x86.AADDQ && p.From.Offset == -1:
   905  			p.As = x86.ADECQ
   906  			p.From.Type = obj.TYPE_NONE
   907  		case p.As == x86.AADDL && p.From.Offset == 1:
   908  			p.As = x86.AINCL
   909  			p.From.Type = obj.TYPE_NONE
   910  		case p.As == x86.AADDL && p.From.Offset == -1:
   911  			p.As = x86.ADECL
   912  			p.From.Type = obj.TYPE_NONE
   913  		}
   914  		memIdx(&p.To, v)
   915  		ssagen.AddAux2(&p.To, v, sc.Off64())
   916  	case ssa.OpAMD64MOVLQSX, ssa.OpAMD64MOVWQSX, ssa.OpAMD64MOVBQSX, ssa.OpAMD64MOVLQZX, ssa.OpAMD64MOVWQZX, ssa.OpAMD64MOVBQZX,
   917  		ssa.OpAMD64CVTTSS2SL, ssa.OpAMD64CVTTSD2SL, ssa.OpAMD64CVTTSS2SQ, ssa.OpAMD64CVTTSD2SQ,
   918  		ssa.OpAMD64CVTSS2SD, ssa.OpAMD64CVTSD2SS:
   919  		opregreg(s, v.Op.Asm(), v.Reg(), v.Args[0].Reg())
   920  	case ssa.OpAMD64CVTSL2SD, ssa.OpAMD64CVTSQ2SD, ssa.OpAMD64CVTSQ2SS, ssa.OpAMD64CVTSL2SS:
   921  		r := v.Reg()
   922  		// Break false dependency on destination register.
   923  		opregreg(s, x86.AXORPS, r, r)
   924  		opregreg(s, v.Op.Asm(), r, v.Args[0].Reg())
   925  	case ssa.OpAMD64MOVQi2f, ssa.OpAMD64MOVQf2i, ssa.OpAMD64MOVLi2f, ssa.OpAMD64MOVLf2i:
   926  		var p *obj.Prog
   927  		switch v.Op {
   928  		case ssa.OpAMD64MOVQi2f, ssa.OpAMD64MOVQf2i:
   929  			p = s.Prog(x86.AMOVQ)
   930  		case ssa.OpAMD64MOVLi2f, ssa.OpAMD64MOVLf2i:
   931  			p = s.Prog(x86.AMOVL)
   932  		}
   933  		p.From.Type = obj.TYPE_REG
   934  		p.From.Reg = v.Args[0].Reg()
   935  		p.To.Type = obj.TYPE_REG
   936  		p.To.Reg = v.Reg()
   937  	case ssa.OpAMD64ADDQload, ssa.OpAMD64ADDLload, ssa.OpAMD64SUBQload, ssa.OpAMD64SUBLload,
   938  		ssa.OpAMD64ANDQload, ssa.OpAMD64ANDLload, ssa.OpAMD64ORQload, ssa.OpAMD64ORLload,
   939  		ssa.OpAMD64XORQload, ssa.OpAMD64XORLload, ssa.OpAMD64ADDSDload, ssa.OpAMD64ADDSSload,
   940  		ssa.OpAMD64SUBSDload, ssa.OpAMD64SUBSSload, ssa.OpAMD64MULSDload, ssa.OpAMD64MULSSload,
   941  		ssa.OpAMD64DIVSDload, ssa.OpAMD64DIVSSload:
   942  		p := s.Prog(v.Op.Asm())
   943  		p.From.Type = obj.TYPE_MEM
   944  		p.From.Reg = v.Args[1].Reg()
   945  		ssagen.AddAux(&p.From, v)
   946  		p.To.Type = obj.TYPE_REG
   947  		p.To.Reg = v.Reg()
   948  	case ssa.OpAMD64ADDLloadidx1, ssa.OpAMD64ADDLloadidx4, ssa.OpAMD64ADDLloadidx8, ssa.OpAMD64ADDQloadidx1, ssa.OpAMD64ADDQloadidx8,
   949  		ssa.OpAMD64SUBLloadidx1, ssa.OpAMD64SUBLloadidx4, ssa.OpAMD64SUBLloadidx8, ssa.OpAMD64SUBQloadidx1, ssa.OpAMD64SUBQloadidx8,
   950  		ssa.OpAMD64ANDLloadidx1, ssa.OpAMD64ANDLloadidx4, ssa.OpAMD64ANDLloadidx8, ssa.OpAMD64ANDQloadidx1, ssa.OpAMD64ANDQloadidx8,
   951  		ssa.OpAMD64ORLloadidx1, ssa.OpAMD64ORLloadidx4, ssa.OpAMD64ORLloadidx8, ssa.OpAMD64ORQloadidx1, ssa.OpAMD64ORQloadidx8,
   952  		ssa.OpAMD64XORLloadidx1, ssa.OpAMD64XORLloadidx4, ssa.OpAMD64XORLloadidx8, ssa.OpAMD64XORQloadidx1, ssa.OpAMD64XORQloadidx8,
   953  		ssa.OpAMD64ADDSSloadidx1, ssa.OpAMD64ADDSSloadidx4, ssa.OpAMD64ADDSDloadidx1, ssa.OpAMD64ADDSDloadidx8,
   954  		ssa.OpAMD64SUBSSloadidx1, ssa.OpAMD64SUBSSloadidx4, ssa.OpAMD64SUBSDloadidx1, ssa.OpAMD64SUBSDloadidx8,
   955  		ssa.OpAMD64MULSSloadidx1, ssa.OpAMD64MULSSloadidx4, ssa.OpAMD64MULSDloadidx1, ssa.OpAMD64MULSDloadidx8,
   956  		ssa.OpAMD64DIVSSloadidx1, ssa.OpAMD64DIVSSloadidx4, ssa.OpAMD64DIVSDloadidx1, ssa.OpAMD64DIVSDloadidx8:
   957  		p := s.Prog(v.Op.Asm())
   958  
   959  		r, i := v.Args[1].Reg(), v.Args[2].Reg()
   960  		p.From.Type = obj.TYPE_MEM
   961  		p.From.Scale = v.Op.Scale()
   962  		if p.From.Scale == 1 && i == x86.REG_SP {
   963  			r, i = i, r
   964  		}
   965  		p.From.Reg = r
   966  		p.From.Index = i
   967  
   968  		ssagen.AddAux(&p.From, v)
   969  		p.To.Type = obj.TYPE_REG
   970  		p.To.Reg = v.Reg()
   971  	case ssa.OpAMD64DUFFZERO:
   972  		if s.ABI != obj.ABIInternal {
   973  			// zero X15 manually
   974  			opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
   975  		}
   976  		off := duffStart(v.AuxInt)
   977  		adj := duffAdj(v.AuxInt)
   978  		var p *obj.Prog
   979  		if adj != 0 {
   980  			p = s.Prog(x86.ALEAQ)
   981  			p.From.Type = obj.TYPE_MEM
   982  			p.From.Offset = adj
   983  			p.From.Reg = x86.REG_DI
   984  			p.To.Type = obj.TYPE_REG
   985  			p.To.Reg = x86.REG_DI
   986  		}
   987  		p = s.Prog(obj.ADUFFZERO)
   988  		p.To.Type = obj.TYPE_ADDR
   989  		p.To.Sym = ir.Syms.Duffzero
   990  		p.To.Offset = off
   991  	case ssa.OpAMD64DUFFCOPY:
   992  		p := s.Prog(obj.ADUFFCOPY)
   993  		p.To.Type = obj.TYPE_ADDR
   994  		p.To.Sym = ir.Syms.Duffcopy
   995  		if v.AuxInt%16 != 0 {
   996  			v.Fatalf("bad DUFFCOPY AuxInt %v", v.AuxInt)
   997  		}
   998  		p.To.Offset = 14 * (64 - v.AuxInt/16)
   999  		// 14 and 64 are magic constants.  14 is the number of bytes to encode:
  1000  		//	MOVUPS	(SI), X0
  1001  		//	ADDQ	$16, SI
  1002  		//	MOVUPS	X0, (DI)
  1003  		//	ADDQ	$16, DI
  1004  		// and 64 is the number of such blocks. See src/runtime/duff_amd64.s:duffcopy.
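        		// For example, AuxInt == 64 copies 4 such 16-byte blocks: the CALL
        		// enters duffcopy at byte offset 14*(64-4) = 840, so only the last
        		// 4 blocks run.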
  1005  
  1006  	case ssa.OpCopy:	// TODO: use MOVQreg for reg->reg copies instead of OpCopy?
  1007  		if v.Type.IsMemory() {
  1008  			return
  1009  		}
  1010  		x := v.Args[0].Reg()
  1011  		y := v.Reg()
  1012  		if x != y {
  1013  			opregreg(s, moveByType(v.Type), y, x)
  1014  		}
  1015  	case ssa.OpLoadReg:
  1016  		if v.Type.IsFlags() {
  1017  			v.Fatalf("load flags not implemented: %v", v.LongString())
  1018  			return
  1019  		}
  1020  		p := s.Prog(loadByType(v.Type))
  1021  		ssagen.AddrAuto(&p.From, v.Args[0])
  1022  		p.To.Type = obj.TYPE_REG
  1023  		p.To.Reg = v.Reg()
  1024  
  1025  	case ssa.OpStoreReg:
  1026  		if v.Type.IsFlags() {
  1027  			v.Fatalf("store flags not implemented: %v", v.LongString())
  1028  			return
  1029  		}
  1030  		p := s.Prog(storeByType(v.Type))
  1031  		p.From.Type = obj.TYPE_REG
  1032  		p.From.Reg = v.Args[0].Reg()
  1033  		ssagen.AddrAuto(&p.To, v)
  1034  	case ssa.OpAMD64LoweredHasCPUFeature:
  1035  		p := s.Prog(x86.AMOVBLZX)
  1036  		p.From.Type = obj.TYPE_MEM
  1037  		ssagen.AddAux(&p.From, v)
  1038  		p.To.Type = obj.TYPE_REG
  1039  		p.To.Reg = v.Reg()
  1040  	case ssa.OpArgIntReg, ssa.OpArgFloatReg:
  1041  		// The assembler needs to wrap the entry safepoint/stack growth code with spill/unspill.
  1042  		// The loop below runs only once per function: RegArgs is cleared after the first OpArg*Reg value.
  1043  		for _, ap := range v.Block.Func.RegArgs {
  1044  			// Pass the spill/unspill information along to the assembler, offset by size of return PC pushed on stack.
  1045  			addr := ssagen.SpillSlotAddr(ap, x86.REG_SP, v.Block.Func.Config.PtrSize)
  1046  			s.FuncInfo().AddSpill(
  1047  				obj.RegSpill{Reg: ap.Reg, Addr: addr, Unspill: loadByType(ap.Type), Spill: storeByType(ap.Type)})
  1048  		}
  1049  		v.Block.Func.RegArgs = nil
  1050  		ssagen.CheckArgReg(v)
  1051  	case ssa.OpAMD64LoweredGetClosurePtr:
  1052  		// Closure pointer is DX.
  1053  		ssagen.CheckLoweredGetClosurePtr(v)
  1054  	case ssa.OpAMD64LoweredGetG:
  1055  		if s.ABI == obj.ABIInternal {
  1056  			v.Fatalf("LoweredGetG should not appear in ABIInternal")
  1057  		}
  1058  		r := v.Reg()
  1059  		getgFromTLS(s, r)
  1060  	case ssa.OpAMD64CALLstatic, ssa.OpAMD64CALLtail:
  1061  		if s.ABI == obj.ABI0 && v.Aux.(*ssa.AuxCall).Fn.ABI() == obj.ABIInternal {
  1062  			// zeroing X15 when entering ABIInternal from ABI0
  1063  			if buildcfg.GOOS != "plan9" {	// do not use SSE on Plan 9
  1064  				opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
  1065  			}
  1066  			// set G register from TLS
  1067  			getgFromTLS(s, x86.REG_R14)
  1068  		}
  1069  		if v.Op == ssa.OpAMD64CALLtail {
  1070  			s.TailCall(v)
  1071  			break
  1072  		}
  1073  		s.Call(v)
  1074  		if s.ABI == obj.ABIInternal && v.Aux.(*ssa.AuxCall).Fn.ABI() == obj.ABI0 {
  1075  			// re-zero X15 when returning to ABIInternal code from an ABI0 callee
  1076  			if buildcfg.GOOS != "plan9" {	// do not use SSE on Plan 9
  1077  				opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
  1078  			}
  1079  			// set G register from TLS
  1080  			getgFromTLS(s, x86.REG_R14)
  1081  		}
  1082  	case ssa.OpAMD64CALLclosure, ssa.OpAMD64CALLinter:
  1083  		s.Call(v)
  1084  
  1085  	case ssa.OpAMD64LoweredGetCallerPC:
  1086  		p := s.Prog(x86.AMOVQ)
  1087  		p.From.Type = obj.TYPE_MEM
  1088  		p.From.Offset = -8	// PC is stored 8 bytes below first parameter.
  1089  		p.From.Name = obj.NAME_PARAM
  1090  		p.To.Type = obj.TYPE_REG
  1091  		p.To.Reg = v.Reg()
  1092  
  1093  	case ssa.OpAMD64LoweredGetCallerSP:
  1094  		// caller's SP is the address of the first arg
  1095  		mov := x86.AMOVQ
  1096  		if types.PtrSize == 4 {
  1097  			mov = x86.AMOVL
  1098  		}
  1099  		p := s.Prog(mov)
  1100  		p.From.Type = obj.TYPE_ADDR
  1101  		p.From.Offset = -base.Ctxt.Arch.FixedFrameSize	// 0 on amd64, just to be consistent with other architectures
  1102  		p.From.Name = obj.NAME_PARAM
  1103  		p.To.Type = obj.TYPE_REG
  1104  		p.To.Reg = v.Reg()
  1105  
  1106  	case ssa.OpAMD64LoweredWB:
  1107  		p := s.Prog(obj.ACALL)
  1108  		p.To.Type = obj.TYPE_MEM
  1109  		p.To.Name = obj.NAME_EXTERN
  1110  		// AuxInt encodes how many buffer entries we need.
  1111  		p.To.Sym = ir.Syms.GCWriteBarrier[v.AuxInt-1]
  1112  
  1113  	case ssa.OpAMD64LoweredPanicBoundsA, ssa.OpAMD64LoweredPanicBoundsB, ssa.OpAMD64LoweredPanicBoundsC:
  1114  		p := s.Prog(obj.ACALL)
  1115  		p.To.Type = obj.TYPE_MEM
  1116  		p.To.Name = obj.NAME_EXTERN
  1117  		p.To.Sym = ssagen.BoundsCheckFunc[v.AuxInt]
  1118  		s.UseArgs(int64(2 * types.PtrSize))	// space used in callee args area by assembly stubs
  1119  
  1120  	case ssa.OpAMD64NEGQ, ssa.OpAMD64NEGL,
  1121  		ssa.OpAMD64BSWAPQ, ssa.OpAMD64BSWAPL,
  1122  		ssa.OpAMD64NOTQ, ssa.OpAMD64NOTL:
  1123  		p := s.Prog(v.Op.Asm())
  1124  		p.To.Type = obj.TYPE_REG
  1125  		p.To.Reg = v.Reg()
  1126  
  1127  	case ssa.OpAMD64NEGLflags:
  1128  		p := s.Prog(v.Op.Asm())
  1129  		p.To.Type = obj.TYPE_REG
  1130  		p.To.Reg = v.Reg0()
  1131  
  1132  	case ssa.OpAMD64BSFQ, ssa.OpAMD64BSRQ, ssa.OpAMD64BSFL, ssa.OpAMD64BSRL, ssa.OpAMD64SQRTSD, ssa.OpAMD64SQRTSS:
  1133  		p := s.Prog(v.Op.Asm())
  1134  		p.From.Type = obj.TYPE_REG
  1135  		p.From.Reg = v.Args[0].Reg()
  1136  		p.To.Type = obj.TYPE_REG
  1137  		switch v.Op {
  1138  		case ssa.OpAMD64BSFQ, ssa.OpAMD64BSRQ:
  1139  			p.To.Reg = v.Reg0()
  1140  		case ssa.OpAMD64BSFL, ssa.OpAMD64BSRL, ssa.OpAMD64SQRTSD, ssa.OpAMD64SQRTSS:
  1141  			p.To.Reg = v.Reg()
  1142  		}
  1143  	case ssa.OpAMD64ROUNDSD:
  1144  		p := s.Prog(v.Op.Asm())
  1145  		val := v.AuxInt
  1146  		// 0 means math.RoundToEven, 1 Floor, 2 Ceil, 3 Trunc
  1147  		if val < 0 || val > 3 {
  1148  			v.Fatalf("Invalid rounding mode")
  1149  		}
  1150  		p.From.Offset = val
  1151  		p.From.Type = obj.TYPE_CONST
  1152  		p.AddRestSourceReg(v.Args[0].Reg())
  1153  		p.To.Type = obj.TYPE_REG
  1154  		p.To.Reg = v.Reg()
  1155  	case ssa.OpAMD64POPCNTQ, ssa.OpAMD64POPCNTL,
  1156  		ssa.OpAMD64TZCNTQ, ssa.OpAMD64TZCNTL,
  1157  		ssa.OpAMD64LZCNTQ, ssa.OpAMD64LZCNTL:
  1158  		if v.Args[0].Reg() != v.Reg() {
  1159  			// POPCNT/TZCNT/LZCNT have a false dependency on the destination register on Intel CPUs.
  1160  			// The TZCNT/LZCNT problem affects pre-Skylake models. See discussion at https://gcc.gnu.org/bugzilla/show_bug.cgi?id=62011#c7.
  1161  			// Xor register with itself to break the dependency.
  1162  			opregreg(s, x86.AXORL, v.Reg(), v.Reg())
  1163  		}
  1164  		p := s.Prog(v.Op.Asm())
  1165  		p.From.Type = obj.TYPE_REG
  1166  		p.From.Reg = v.Args[0].Reg()
  1167  		p.To.Type = obj.TYPE_REG
  1168  		p.To.Reg = v.Reg()
  1169  
  1170  	case ssa.OpAMD64SETEQ, ssa.OpAMD64SETNE,
  1171  		ssa.OpAMD64SETL, ssa.OpAMD64SETLE,
  1172  		ssa.OpAMD64SETG, ssa.OpAMD64SETGE,
  1173  		ssa.OpAMD64SETGF, ssa.OpAMD64SETGEF,
  1174  		ssa.OpAMD64SETB, ssa.OpAMD64SETBE,
  1175  		ssa.OpAMD64SETORD, ssa.OpAMD64SETNAN,
  1176  		ssa.OpAMD64SETA, ssa.OpAMD64SETAE,
  1177  		ssa.OpAMD64SETO:
  1178  		p := s.Prog(v.Op.Asm())
  1179  		p.To.Type = obj.TYPE_REG
  1180  		p.To.Reg = v.Reg()
  1181  
  1182  	case ssa.OpAMD64SETEQstore, ssa.OpAMD64SETNEstore,
  1183  		ssa.OpAMD64SETLstore, ssa.OpAMD64SETLEstore,
  1184  		ssa.OpAMD64SETGstore, ssa.OpAMD64SETGEstore,
  1185  		ssa.OpAMD64SETBstore, ssa.OpAMD64SETBEstore,
  1186  		ssa.OpAMD64SETAstore, ssa.OpAMD64SETAEstore:
  1187  		p := s.Prog(v.Op.Asm())
  1188  		p.To.Type = obj.TYPE_MEM
  1189  		p.To.Reg = v.Args[0].Reg()
  1190  		ssagen.AddAux(&p.To, v)
  1191  
  1192  	case ssa.OpAMD64SETEQstoreidx1, ssa.OpAMD64SETNEstoreidx1,
  1193  		ssa.OpAMD64SETLstoreidx1, ssa.OpAMD64SETLEstoreidx1,
  1194  		ssa.OpAMD64SETGstoreidx1, ssa.OpAMD64SETGEstoreidx1,
  1195  		ssa.OpAMD64SETBstoreidx1, ssa.OpAMD64SETBEstoreidx1,
  1196  		ssa.OpAMD64SETAstoreidx1, ssa.OpAMD64SETAEstoreidx1:
  1197  		p := s.Prog(v.Op.Asm())
  1198  		memIdx(&p.To, v)
  1199  		ssagen.AddAux(&p.To, v)
  1200  
  1201  	case ssa.OpAMD64SETNEF:
  1202  		t := v.RegTmp()
  1203  		p := s.Prog(v.Op.Asm())
  1204  		p.To.Type = obj.TYPE_REG
  1205  		p.To.Reg = v.Reg()
  1206  		q := s.Prog(x86.ASETPS)
  1207  		q.To.Type = obj.TYPE_REG
  1208  		q.To.Reg = t
  1209  		// ORL avoids partial register write and is smaller than ORQ, used by old compiler
  1210  		opregreg(s, x86.AORL, v.Reg(), t)
  1211  
  1212  	case ssa.OpAMD64SETEQF:
  1213  		t := v.RegTmp()
  1214  		p := s.Prog(v.Op.Asm())
  1215  		p.To.Type = obj.TYPE_REG
  1216  		p.To.Reg = v.Reg()
  1217  		q := s.Prog(x86.ASETPC)
  1218  		q.To.Type = obj.TYPE_REG
  1219  		q.To.Reg = t
  1220  		// ANDL avoids partial register write and is smaller than ANDQ, used by old compiler
  1221  		opregreg(s, x86.AANDL, v.Reg(), t)
  1222  
  1223  	case ssa.OpAMD64InvertFlags:
  1224  		v.Fatalf("InvertFlags should never make it to codegen %v", v.LongString())
  1225  	case ssa.OpAMD64FlagEQ, ssa.OpAMD64FlagLT_ULT, ssa.OpAMD64FlagLT_UGT, ssa.OpAMD64FlagGT_ULT, ssa.OpAMD64FlagGT_UGT:
  1226  		v.Fatalf("Flag* ops should never make it to codegen %v", v.LongString())
  1227  	case ssa.OpAMD64AddTupleFirst32, ssa.OpAMD64AddTupleFirst64:
  1228  		v.Fatalf("AddTupleFirst* should never make it to codegen %v", v.LongString())
  1229  	case ssa.OpAMD64REPSTOSQ:
  1230  		s.Prog(x86.AREP)
  1231  		s.Prog(x86.ASTOSQ)
  1232  	case ssa.OpAMD64REPMOVSQ:
  1233  		s.Prog(x86.AREP)
  1234  		s.Prog(x86.AMOVSQ)
  1235  	case ssa.OpAMD64LoweredNilCheck:
  1236  		// Issue a load which will fault if the input is nil.
  1237  		// TODO: We currently use the 2-byte instruction TESTB AX, (reg).
  1238  		// Should we use the 3-byte TESTB $0, (reg) instead? It is larger
  1239  		// but it doesn't have false dependency on AX.
  1240  		// Or maybe allocate an output register and use MOVL (reg),reg2 ?
  1241  		// That trades clobbering flags for clobbering a register.
  1242  		p := s.Prog(x86.ATESTB)
  1243  		p.From.Type = obj.TYPE_REG
  1244  		p.From.Reg = x86.REG_AX
  1245  		p.To.Type = obj.TYPE_MEM
  1246  		p.To.Reg = v.Args[0].Reg()
  1247  		if logopt.Enabled() {
  1248  			logopt.LogOpt(v.Pos, "nilcheck", "genssa", v.Block.Func.Name)
  1249  		}
  1250  		if base.Debug.Nil != 0 && v.Pos.Line() > 1 {	// v.Pos.Line()==1 in generated wrappers
  1251  			base.WarnfAt(v.Pos, "generated nil check")
  1252  		}
  1253  	case ssa.OpAMD64MOVBatomicload, ssa.OpAMD64MOVLatomicload, ssa.OpAMD64MOVQatomicload:
  1254  		p := s.Prog(v.Op.Asm())
  1255  		p.From.Type = obj.TYPE_MEM
  1256  		p.From.Reg = v.Args[0].Reg()
  1257  		ssagen.AddAux(&p.From, v)
  1258  		p.To.Type = obj.TYPE_REG
  1259  		p.To.Reg = v.Reg0()
  1260  	case ssa.OpAMD64XCHGB, ssa.OpAMD64XCHGL, ssa.OpAMD64XCHGQ:
  1261  		p := s.Prog(v.Op.Asm())
  1262  		p.From.Type = obj.TYPE_REG
  1263  		p.From.Reg = v.Reg0()
  1264  		p.To.Type = obj.TYPE_MEM
  1265  		p.To.Reg = v.Args[1].Reg()
  1266  		ssagen.AddAux(&p.To, v)
  1267  	case ssa.OpAMD64XADDLlock, ssa.OpAMD64XADDQlock:
  1268  		s.Prog(x86.ALOCK)
  1269  		p := s.Prog(v.Op.Asm())
  1270  		p.From.Type = obj.TYPE_REG
  1271  		p.From.Reg = v.Reg0()
  1272  		p.To.Type = obj.TYPE_MEM
  1273  		p.To.Reg = v.Args[1].Reg()
  1274  		ssagen.AddAux(&p.To, v)
  1275  	case ssa.OpAMD64CMPXCHGLlock, ssa.OpAMD64CMPXCHGQlock:
  1276  		if v.Args[1].Reg() != x86.REG_AX {
  1277  			v.Fatalf("input[1] not in AX %s", v.LongString())
  1278  		}
  1279  		s.Prog(x86.ALOCK)
  1280  		p := s.Prog(v.Op.Asm())
  1281  		p.From.Type = obj.TYPE_REG
  1282  		p.From.Reg = v.Args[2].Reg()
  1283  		p.To.Type = obj.TYPE_MEM
  1284  		p.To.Reg = v.Args[0].Reg()
  1285  		ssagen.AddAux(&p.To, v)
  1286  		p = s.Prog(x86.ASETEQ)
  1287  		p.To.Type = obj.TYPE_REG
  1288  		p.To.Reg = v.Reg0()
  1289  	case ssa.OpAMD64ANDBlock, ssa.OpAMD64ANDLlock, ssa.OpAMD64ORBlock, ssa.OpAMD64ORLlock:
  1290  		s.Prog(x86.ALOCK)
  1291  		p := s.Prog(v.Op.Asm())
  1292  		p.From.Type = obj.TYPE_REG
  1293  		p.From.Reg = v.Args[1].Reg()
  1294  		p.To.Type = obj.TYPE_MEM
  1295  		p.To.Reg = v.Args[0].Reg()
  1296  		ssagen.AddAux(&p.To, v)
  1297  	case ssa.OpAMD64PrefetchT0, ssa.OpAMD64PrefetchNTA:
  1298  		p := s.Prog(v.Op.Asm())
  1299  		p.From.Type = obj.TYPE_MEM
  1300  		p.From.Reg = v.Args[0].Reg()
  1301  	case ssa.OpClobber:
  1302  		p := s.Prog(x86.AMOVL)
  1303  		p.From.Type = obj.TYPE_CONST
  1304  		p.From.Offset = 0xdeaddead
  1305  		p.To.Type = obj.TYPE_MEM
  1306  		p.To.Reg = x86.REG_SP
  1307  		ssagen.AddAux(&p.To, v)
  1308  		p = s.Prog(x86.AMOVL)
  1309  		p.From.Type = obj.TYPE_CONST
  1310  		p.From.Offset = 0xdeaddead
  1311  		p.To.Type = obj.TYPE_MEM
  1312  		p.To.Reg = x86.REG_SP
  1313  		ssagen.AddAux(&p.To, v)
  1314  		p.To.Offset += 4
  1315  	case ssa.OpClobberReg:
  1316  		x := uint64(0xdeaddeaddeaddead)
  1317  		p := s.Prog(x86.AMOVQ)
  1318  		p.From.Type = obj.TYPE_CONST
  1319  		p.From.Offset = int64(x)
  1320  		p.To.Type = obj.TYPE_REG
  1321  		p.To.Reg = v.Reg()
  1322  	default:
  1323  		v.Fatalf("genValue not implemented: %s", v.LongString())
  1324  	}
  1325  }
  1326  
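        // blockJump maps each conditional block kind to its branch instruction
        // (asm) and to the branch for the inverted condition (invasm); ssaGenBlock
        // uses whichever form lets the preferred successor fall through.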
  1327  var blockJump = [...]struct {
  1328  	asm, invasm obj.As
  1329  }{
  1330  	ssa.BlockAMD64EQ:	{x86.AJEQ, x86.AJNE},
  1331  	ssa.BlockAMD64NE:	{x86.AJNE, x86.AJEQ},
  1332  	ssa.BlockAMD64LT:	{x86.AJLT, x86.AJGE},
  1333  	ssa.BlockAMD64GE:	{x86.AJGE, x86.AJLT},
  1334  	ssa.BlockAMD64LE:	{x86.AJLE, x86.AJGT},
  1335  	ssa.BlockAMD64GT:	{x86.AJGT, x86.AJLE},
  1336  	ssa.BlockAMD64OS:	{x86.AJOS, x86.AJOC},
  1337  	ssa.BlockAMD64OC:	{x86.AJOC, x86.AJOS},
  1338  	ssa.BlockAMD64ULT:	{x86.AJCS, x86.AJCC},
  1339  	ssa.BlockAMD64UGE:	{x86.AJCC, x86.AJCS},
  1340  	ssa.BlockAMD64UGT:	{x86.AJHI, x86.AJLS},
  1341  	ssa.BlockAMD64ULE:	{x86.AJLS, x86.AJHI},
  1342  	ssa.BlockAMD64ORD:	{x86.AJPC, x86.AJPS},
  1343  	ssa.BlockAMD64NAN:	{x86.AJPS, x86.AJPC},
  1344  }
  1345  
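        // eqfJumps and nefJumps give the two-jump sequences used for the
        // floating-point equality blocks: EQF branches when ZERO is set and
        // PARITY is clear, NEF when ZERO is clear or PARITY is set (compare the
        // CMOV*EQF and CMOV*NEF cases above); the row is chosen by which
        // successor is the fall-through block.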
  1346  var eqfJumps = [2][2]ssagen.IndexJump{
  1347  	{{Jump: x86.AJNE, Index: 1}, {Jump: x86.AJPS, Index: 1}},	// next == b.Succs[0]
  1348  	{{Jump: x86.AJNE, Index: 1}, {Jump: x86.AJPC, Index: 0}},	// next == b.Succs[1]
  1349  }
  1350  var nefJumps = [2][2]ssagen.IndexJump{
  1351  	{{Jump: x86.AJNE, Index: 0}, {Jump: x86.AJPC, Index: 1}},	// next == b.Succs[0]
  1352  	{{Jump: x86.AJNE, Index: 0}, {Jump: x86.AJPS, Index: 0}},	// next == b.Succs[1]
  1353  }
  1354  
  1355  func ssaGenBlock(s *ssagen.State, b, next *ssa.Block) {
  1356  	switch b.Kind {
  1357  	case ssa.BlockPlain:
  1358  		if b.Succs[0].Block() != next {
  1359  			p := s.Prog(obj.AJMP)
  1360  			p.To.Type = obj.TYPE_BRANCH
  1361  			s.Branches = append(s.Branches, ssagen.Branch{P: p, B: b.Succs[0].Block()})
  1362  		}
  1363  	case ssa.BlockDefer:
  1364  		// defer returns in rax:
  1365  		// 0 if we should continue executing
  1366  		// 1 if we should jump to deferreturn call
  1367  		p := s.Prog(x86.ATESTL)
  1368  		p.From.Type = obj.TYPE_REG
  1369  		p.From.Reg = x86.REG_AX
  1370  		p.To.Type = obj.TYPE_REG
  1371  		p.To.Reg = x86.REG_AX
  1372  		p = s.Prog(x86.AJNE)
  1373  		p.To.Type = obj.TYPE_BRANCH
  1374  		s.Branches = append(s.Branches, ssagen.Branch{P: p, B: b.Succs[1].Block()})
  1375  		if b.Succs[0].Block() != next {
  1376  			p := s.Prog(obj.AJMP)
  1377  			p.To.Type = obj.TYPE_BRANCH
  1378  			s.Branches = append(s.Branches, ssagen.Branch{P: p, B: b.Succs[0].Block()})
  1379  		}
  1380  	case ssa.BlockExit, ssa.BlockRetJmp:
  1381  	case ssa.BlockRet:
  1382  		s.Prog(obj.ARET)
  1383  
  1384  	case ssa.BlockAMD64EQF:
  1385  		s.CombJump(b, next, &eqfJumps)
  1386  
  1387  	case ssa.BlockAMD64NEF:
  1388  		s.CombJump(b, next, &nefJumps)
  1389  
  1390  	case ssa.BlockAMD64EQ, ssa.BlockAMD64NE,
  1391  		ssa.BlockAMD64LT, ssa.BlockAMD64GE,
  1392  		ssa.BlockAMD64LE, ssa.BlockAMD64GT,
  1393  		ssa.BlockAMD64OS, ssa.BlockAMD64OC,
  1394  		ssa.BlockAMD64ULT, ssa.BlockAMD64UGT,
  1395  		ssa.BlockAMD64ULE, ssa.BlockAMD64UGE:
  1396  		jmp := blockJump[b.Kind]
  1397  		switch next {
  1398  		case b.Succs[0].Block():
  1399  			s.Br(jmp.invasm, b.Succs[1].Block())
  1400  		case b.Succs[1].Block():
  1401  			s.Br(jmp.asm, b.Succs[0].Block())
  1402  		default:
  1403  			if b.Likely != ssa.BranchUnlikely {
  1404  				s.Br(jmp.asm, b.Succs[0].Block())
  1405  				s.Br(obj.AJMP, b.Succs[1].Block())
  1406  			} else {
  1407  				s.Br(jmp.invasm, b.Succs[1].Block())
  1408  				s.Br(obj.AJMP, b.Succs[0].Block())
  1409  			}
  1410  		}
  1411  
  1412  	case ssa.BlockAMD64JUMPTABLE:
  1413  		// JMP      *(TABLE)(INDEX*8)
  1414  		p := s.Prog(obj.AJMP)
  1415  		p.To.Type = obj.TYPE_MEM
  1416  		p.To.Reg = b.Controls[1].Reg()
  1417  		p.To.Index = b.Controls[0].Reg()
  1418  		p.To.Scale = 8
  1419  		// Save jump tables for later resolution of the target blocks.
  1420  		s.JumpTables = append(s.JumpTables, b)
  1421  
  1422  	default:
  1423  		b.Fatalf("branch not implemented: %s", b.LongString())
  1424  	}
  1425  }
  1426  
  1427  func loadRegResult(s *ssagen.State, f *ssa.Func, t *types.Type, reg int16, n *ir.Name, off int64) *obj.Prog {
  1428  	p := s.Prog(loadByType(t))
  1429  	p.From.Type = obj.TYPE_MEM
  1430  	p.From.Name = obj.NAME_AUTO
  1431  	p.From.Sym = n.Linksym()
  1432  	p.From.Offset = n.FrameOffset() + off
  1433  	p.To.Type = obj.TYPE_REG
  1434  	p.To.Reg = reg
  1435  	return p
  1436  }
  1437  
  1438  func spillArgReg(pp *objw.Progs, p *obj.Prog, f *ssa.Func, t *types.Type, reg int16, n *ir.Name, off int64) *obj.Prog {
  1439  	p = pp.Append(p, storeByType(t), obj.TYPE_REG, reg, 0, obj.TYPE_MEM, 0, n.FrameOffset()+off)
  1440  	p.To.Name = obj.NAME_PARAM
  1441  	p.To.Sym = n.Linksym()
  1442  	p.Pos = p.Pos.WithNotStmt()
  1443  	return p
  1444  }