github.com/euank/go@v0.0.0-20160829210321-495514729181/src/cmd/compile/internal/amd64/ssa.go

     1  // Copyright 2016 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package amd64
     6  
     7  import (
     8  	"fmt"
     9  	"math"
    10  
    11  	"cmd/compile/internal/gc"
    12  	"cmd/compile/internal/ssa"
    13  	"cmd/internal/obj"
    14  	"cmd/internal/obj/x86"
    15  )
    16  
     17  // minZeroPage is the size of the smallest possible faulting page at address zero.
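         // A load or store at a constant offset below minZeroPage off a nil
         // pointer is guaranteed to fault, which lets the nil-check elimination
         // in ssaGenValue (OpAMD64LoweredNilCheck) elide explicit checks.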
    18  const minZeroPage = 4096
    19  
    20  // ssaRegToReg maps ssa register numbers to obj register numbers.
    21  var ssaRegToReg = []int16{
    22  	x86.REG_AX,
    23  	x86.REG_CX,
    24  	x86.REG_DX,
    25  	x86.REG_BX,
    26  	x86.REG_SP,
    27  	x86.REG_BP,
    28  	x86.REG_SI,
    29  	x86.REG_DI,
    30  	x86.REG_R8,
    31  	x86.REG_R9,
    32  	x86.REG_R10,
    33  	x86.REG_R11,
    34  	x86.REG_R12,
    35  	x86.REG_R13,
    36  	x86.REG_R14,
    37  	x86.REG_R15,
    38  	x86.REG_X0,
    39  	x86.REG_X1,
    40  	x86.REG_X2,
    41  	x86.REG_X3,
    42  	x86.REG_X4,
    43  	x86.REG_X5,
    44  	x86.REG_X6,
    45  	x86.REG_X7,
    46  	x86.REG_X8,
    47  	x86.REG_X9,
    48  	x86.REG_X10,
    49  	x86.REG_X11,
    50  	x86.REG_X12,
    51  	x86.REG_X13,
    52  	x86.REG_X14,
    53  	x86.REG_X15,
    54  	0, // SB isn't a real register.  We fill an Addr.Reg field with 0 in this case.
    55  }
    56  
     57  // ssaMarkMoves marks any MOVXconst ops that need to avoid clobbering flags.
    58  func ssaMarkMoves(s *gc.SSAGenState, b *ssa.Block) {
    59  	flive := b.FlagsLiveAtEnd
    60  	if b.Control != nil && b.Control.Type.IsFlags() {
    61  		flive = true
    62  	}
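         	// Walk the values backwards so that flive reflects whether flags
         	// are live after each value.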
    63  	for i := len(b.Values) - 1; i >= 0; i-- {
    64  		v := b.Values[i]
    65  		if flive && (v.Op == ssa.OpAMD64MOVLconst || v.Op == ssa.OpAMD64MOVQconst) {
    66  			// The "mark" is any non-nil Aux value.
    67  			v.Aux = v
    68  		}
    69  		if v.Type.IsFlags() {
    70  			flive = false
    71  		}
    72  		for _, a := range v.Args {
    73  			if a.Type.IsFlags() {
    74  				flive = true
    75  			}
    76  		}
    77  	}
    78  }
    79  
    80  // loadByType returns the load instruction of the given type.
    81  func loadByType(t ssa.Type) obj.As {
    82  	// Avoid partial register write
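         	// 1- and 2-byte loads are zero-extended into a 32-bit register so
         	// later code never merges into a stale partial register.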
    83  	if !t.IsFloat() && t.Size() <= 2 {
    84  		if t.Size() == 1 {
    85  			return x86.AMOVBLZX
    86  		} else {
    87  			return x86.AMOVWLZX
    88  		}
    89  	}
    90  	// Otherwise, there's no difference between load and store opcodes.
    91  	return storeByType(t)
    92  }
    93  
    94  // storeByType returns the store instruction of the given type.
    95  func storeByType(t ssa.Type) obj.As {
    96  	width := t.Size()
    97  	if t.IsFloat() {
    98  		switch width {
    99  		case 4:
   100  			return x86.AMOVSS
   101  		case 8:
   102  			return x86.AMOVSD
   103  		}
   104  	} else {
   105  		switch width {
   106  		case 1:
   107  			return x86.AMOVB
   108  		case 2:
   109  			return x86.AMOVW
   110  		case 4:
   111  			return x86.AMOVL
   112  		case 8:
   113  			return x86.AMOVQ
   114  		}
   115  	}
   116  	panic("bad store type")
   117  }
   118  
   119  // moveByType returns the reg->reg move instruction of the given type.
   120  func moveByType(t ssa.Type) obj.As {
   121  	if t.IsFloat() {
    122  		// Moving the whole SSE2 register is faster
    123  		// than moving just the correct low portion of it.
    124  		// There is no xmm->xmm move with a 1-byte opcode,
    125  		// so use MOVUPS, which has a 2-byte opcode.
   126  		return x86.AMOVUPS
   127  	} else {
   128  		switch t.Size() {
   129  		case 1:
   130  			// Avoids partial register write
   131  			return x86.AMOVL
   132  		case 2:
   133  			return x86.AMOVL
   134  		case 4:
   135  			return x86.AMOVL
   136  		case 8:
   137  			return x86.AMOVQ
   138  		case 16:
   139  			return x86.AMOVUPS // int128s are in SSE registers
   140  		default:
   141  			panic(fmt.Sprintf("bad int register width %d:%s", t.Size(), t))
   142  		}
   143  	}
   144  }
   145  
   146  // opregreg emits instructions for
   147  //     dest := dest(To) op src(From)
   148  // and also returns the created obj.Prog so it
   149  // may be further adjusted (offset, scale, etc).
   150  func opregreg(op obj.As, dest, src int16) *obj.Prog {
   151  	p := gc.Prog(op)
   152  	p.From.Type = obj.TYPE_REG
   153  	p.To.Type = obj.TYPE_REG
   154  	p.To.Reg = dest
   155  	p.From.Reg = src
   156  	return p
   157  }
   158  
    159  // DUFFZERO consists of repeated blocks of 4 MOVUPSs + ADD;
    160  // see runtime/mkduff.go.
   161  func duffStart(size int64) int64 {
   162  	x, _ := duff(size)
   163  	return x
   164  }
   165  func duffAdj(size int64) int64 {
   166  	_, x := duff(size)
   167  	return x
   168  }
   169  
   170  // duff returns the offset (from duffzero, in bytes) and pointer adjust (in bytes)
   171  // required to use the duffzero mechanism for a block of the given size.
   172  func duff(size int64) (int64, int64) {
   173  	if size < 32 || size > 1024 || size%dzClearStep != 0 {
   174  		panic("bad duffzero size")
   175  	}
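         	// steps counts dzClearStep-byte clears and blocks counts complete
         	// dzBlockLen-clear blocks. Entering duffzero dzBlockSize*(dzBlocks-blocks)
         	// bytes in runs exactly blocks full blocks; a leftover partial block is
         	// handled by moving the entry point back over the preceding block's
         	// trailing ADD and last steps MOVs so they run too, and pre-adjusting
         	// the pointer (adj) so those MOVs, whose offsets assume a full block,
         	// land at the start of the region.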
   176  	steps := size / dzClearStep
   177  	blocks := steps / dzBlockLen
   178  	steps %= dzBlockLen
   179  	off := dzBlockSize * (dzBlocks - blocks)
   180  	var adj int64
   181  	if steps != 0 {
   182  		off -= dzAddSize
   183  		off -= dzMovSize * steps
   184  		adj -= dzClearStep * (dzBlockLen - steps)
   185  	}
   186  	return off, adj
   187  }
   188  
   189  func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
   190  	s.SetLineno(v.Line)
   191  	switch v.Op {
   192  	case ssa.OpAMD64ADDQ, ssa.OpAMD64ADDL:
   193  		r := gc.SSARegNum(v)
   194  		r1 := gc.SSARegNum(v.Args[0])
   195  		r2 := gc.SSARegNum(v.Args[1])
   196  		switch {
   197  		case r == r1:
   198  			p := gc.Prog(v.Op.Asm())
   199  			p.From.Type = obj.TYPE_REG
   200  			p.From.Reg = r2
   201  			p.To.Type = obj.TYPE_REG
   202  			p.To.Reg = r
   203  		case r == r2:
   204  			p := gc.Prog(v.Op.Asm())
   205  			p.From.Type = obj.TYPE_REG
   206  			p.From.Reg = r1
   207  			p.To.Type = obj.TYPE_REG
   208  			p.To.Reg = r
   209  		default:
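         			// The output register differs from both inputs, so use LEA
         			// to add r1 and r2 into r without clobbering either of them.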
   210  			var asm obj.As
   211  			if v.Op == ssa.OpAMD64ADDQ {
   212  				asm = x86.ALEAQ
   213  			} else {
   214  				asm = x86.ALEAL
   215  			}
   216  			p := gc.Prog(asm)
   217  			p.From.Type = obj.TYPE_MEM
   218  			p.From.Reg = r1
   219  			p.From.Scale = 1
   220  			p.From.Index = r2
   221  			p.To.Type = obj.TYPE_REG
   222  			p.To.Reg = r
   223  		}
   224  	// 2-address opcode arithmetic
   225  	case ssa.OpAMD64SUBQ, ssa.OpAMD64SUBL,
   226  		ssa.OpAMD64MULQ, ssa.OpAMD64MULL,
   227  		ssa.OpAMD64ANDQ, ssa.OpAMD64ANDL,
   228  		ssa.OpAMD64ORQ, ssa.OpAMD64ORL,
   229  		ssa.OpAMD64XORQ, ssa.OpAMD64XORL,
   230  		ssa.OpAMD64SHLQ, ssa.OpAMD64SHLL,
   231  		ssa.OpAMD64SHRQ, ssa.OpAMD64SHRL, ssa.OpAMD64SHRW, ssa.OpAMD64SHRB,
   232  		ssa.OpAMD64SARQ, ssa.OpAMD64SARL, ssa.OpAMD64SARW, ssa.OpAMD64SARB,
   233  		ssa.OpAMD64ADDSS, ssa.OpAMD64ADDSD, ssa.OpAMD64SUBSS, ssa.OpAMD64SUBSD,
   234  		ssa.OpAMD64MULSS, ssa.OpAMD64MULSD, ssa.OpAMD64DIVSS, ssa.OpAMD64DIVSD,
   235  		ssa.OpAMD64PXOR:
   236  		r := gc.SSARegNum(v)
   237  		if r != gc.SSARegNum(v.Args[0]) {
   238  			v.Fatalf("input[0] and output not in same register %s", v.LongString())
   239  		}
   240  		opregreg(v.Op.Asm(), r, gc.SSARegNum(v.Args[1]))
   241  
   242  	case ssa.OpAMD64DIVQU, ssa.OpAMD64DIVLU, ssa.OpAMD64DIVWU:
   243  		// Arg[0] (the dividend) is in AX.
   244  		// Arg[1] (the divisor) can be in any other register.
   245  		// Result[0] (the quotient) is in AX.
   246  		// Result[1] (the remainder) is in DX.
   247  		r := gc.SSARegNum(v.Args[1])
   248  
   249  		// Zero extend dividend.
   250  		c := gc.Prog(x86.AXORL)
   251  		c.From.Type = obj.TYPE_REG
   252  		c.From.Reg = x86.REG_DX
   253  		c.To.Type = obj.TYPE_REG
   254  		c.To.Reg = x86.REG_DX
   255  
   256  		// Issue divide.
   257  		p := gc.Prog(v.Op.Asm())
   258  		p.From.Type = obj.TYPE_REG
   259  		p.From.Reg = r
   260  
   261  	case ssa.OpAMD64DIVQ, ssa.OpAMD64DIVL, ssa.OpAMD64DIVW:
   262  		// Arg[0] (the dividend) is in AX.
   263  		// Arg[1] (the divisor) can be in any other register.
   264  		// Result[0] (the quotient) is in AX.
   265  		// Result[1] (the remainder) is in DX.
   266  		r := gc.SSARegNum(v.Args[1])
   267  
   268  		// CPU faults upon signed overflow, which occurs when the most
   269  		// negative int is divided by -1. Handle divide by -1 as a special case.
   270  		var c *obj.Prog
   271  		switch v.Op {
   272  		case ssa.OpAMD64DIVQ:
   273  			c = gc.Prog(x86.ACMPQ)
   274  		case ssa.OpAMD64DIVL:
   275  			c = gc.Prog(x86.ACMPL)
   276  		case ssa.OpAMD64DIVW:
   277  			c = gc.Prog(x86.ACMPW)
   278  		}
   279  		c.From.Type = obj.TYPE_REG
   280  		c.From.Reg = r
   281  		c.To.Type = obj.TYPE_CONST
   282  		c.To.Offset = -1
   283  		j1 := gc.Prog(x86.AJEQ)
   284  		j1.To.Type = obj.TYPE_BRANCH
   285  
   286  		// Sign extend dividend.
   287  		switch v.Op {
   288  		case ssa.OpAMD64DIVQ:
   289  			gc.Prog(x86.ACQO)
   290  		case ssa.OpAMD64DIVL:
   291  			gc.Prog(x86.ACDQ)
   292  		case ssa.OpAMD64DIVW:
   293  			gc.Prog(x86.ACWD)
   294  		}
   295  
   296  		// Issue divide.
   297  		p := gc.Prog(v.Op.Asm())
   298  		p.From.Type = obj.TYPE_REG
   299  		p.From.Reg = r
   300  
   301  		// Skip over -1 fixup code.
   302  		j2 := gc.Prog(obj.AJMP)
   303  		j2.To.Type = obj.TYPE_BRANCH
   304  
   305  		// Issue -1 fixup code.
   306  		// n / -1 = -n
   307  		n1 := gc.Prog(x86.ANEGQ)
   308  		n1.To.Type = obj.TYPE_REG
   309  		n1.To.Reg = x86.REG_AX
   310  
   311  		// n % -1 == 0
   312  		n2 := gc.Prog(x86.AXORL)
   313  		n2.From.Type = obj.TYPE_REG
   314  		n2.From.Reg = x86.REG_DX
   315  		n2.To.Type = obj.TYPE_REG
   316  		n2.To.Reg = x86.REG_DX
   317  
   318  		// TODO(khr): issue only the -1 fixup code we need.
   319  		// For instance, if only the quotient is used, no point in zeroing the remainder.
   320  
   321  		j1.To.Val = n1
   322  		j2.To.Val = s.Pc()
   323  
   324  	case ssa.OpAMD64HMULQ, ssa.OpAMD64HMULL, ssa.OpAMD64HMULW, ssa.OpAMD64HMULB,
   325  		ssa.OpAMD64HMULQU, ssa.OpAMD64HMULLU, ssa.OpAMD64HMULWU, ssa.OpAMD64HMULBU:
    326  		// The frontend rewrites constant division by 8/16/32-bit integers
    327  		// into HMUL by a constant;
    328  		// SSA rewrites generate the 64-bit versions.
   329  
    330  		// Arg[0] is already in AX, as it's the only register we allow,
    331  		// and DX is the only output we care about (the high bits).
   332  		p := gc.Prog(v.Op.Asm())
   333  		p.From.Type = obj.TYPE_REG
   334  		p.From.Reg = gc.SSARegNum(v.Args[1])
   335  
   336  		// IMULB puts the high portion in AH instead of DL,
   337  		// so move it to DL for consistency
   338  		if v.Type.Size() == 1 {
   339  			m := gc.Prog(x86.AMOVB)
   340  			m.From.Type = obj.TYPE_REG
   341  			m.From.Reg = x86.REG_AH
   342  			m.To.Type = obj.TYPE_REG
   343  			m.To.Reg = x86.REG_DX
   344  		}
   345  
   346  	case ssa.OpAMD64AVGQU:
    347  		// Compute (x+y)/2 unsigned.
    348  		// Do a 64-bit add; the overflow goes into the carry.
    349  		// Shift right once and pull the carry back into the 63rd bit.
   350  		r := gc.SSARegNum(v)
   351  		if r != gc.SSARegNum(v.Args[0]) {
   352  			v.Fatalf("input[0] and output not in same register %s", v.LongString())
   353  		}
   354  		p := gc.Prog(x86.AADDQ)
   355  		p.From.Type = obj.TYPE_REG
   356  		p.To.Type = obj.TYPE_REG
   357  		p.To.Reg = r
   358  		p.From.Reg = gc.SSARegNum(v.Args[1])
   359  		p = gc.Prog(x86.ARCRQ)
   360  		p.From.Type = obj.TYPE_CONST
   361  		p.From.Offset = 1
   362  		p.To.Type = obj.TYPE_REG
   363  		p.To.Reg = r
   364  
   365  	case ssa.OpAMD64ADDQconst, ssa.OpAMD64ADDLconst:
   366  		r := gc.SSARegNum(v)
   367  		a := gc.SSARegNum(v.Args[0])
   368  		if r == a {
   369  			if v.AuxInt == 1 {
   370  				var asm obj.As
    371  				// The software optimization manual recommends add $1,reg,
    372  				// but inc/dec is 1 byte smaller. ICC always uses inc;
    373  				// Clang/GCC choose depending on flags, but prefer add.
    374  				// Experiments show that inc/dec is both a little faster
    375  				// and makes the binary a little smaller.
   376  				if v.Op == ssa.OpAMD64ADDQconst {
   377  					asm = x86.AINCQ
   378  				} else {
   379  					asm = x86.AINCL
   380  				}
   381  				p := gc.Prog(asm)
   382  				p.To.Type = obj.TYPE_REG
   383  				p.To.Reg = r
   384  				return
   385  			}
   386  			if v.AuxInt == -1 {
   387  				var asm obj.As
   388  				if v.Op == ssa.OpAMD64ADDQconst {
   389  					asm = x86.ADECQ
   390  				} else {
   391  					asm = x86.ADECL
   392  				}
   393  				p := gc.Prog(asm)
   394  				p.To.Type = obj.TYPE_REG
   395  				p.To.Reg = r
   396  				return
   397  			}
   398  			p := gc.Prog(v.Op.Asm())
   399  			p.From.Type = obj.TYPE_CONST
   400  			p.From.Offset = v.AuxInt
   401  			p.To.Type = obj.TYPE_REG
   402  			p.To.Reg = r
   403  			return
   404  		}
   405  		var asm obj.As
   406  		if v.Op == ssa.OpAMD64ADDQconst {
   407  			asm = x86.ALEAQ
   408  		} else {
   409  			asm = x86.ALEAL
   410  		}
   411  		p := gc.Prog(asm)
   412  		p.From.Type = obj.TYPE_MEM
   413  		p.From.Reg = a
   414  		p.From.Offset = v.AuxInt
   415  		p.To.Type = obj.TYPE_REG
   416  		p.To.Reg = r
   417  
   418  	case ssa.OpAMD64CMOVQEQ, ssa.OpAMD64CMOVLEQ:
   419  		r := gc.SSARegNum(v)
   420  		if r != gc.SSARegNum(v.Args[0]) {
   421  			v.Fatalf("input[0] and output not in same register %s", v.LongString())
   422  		}
   423  		p := gc.Prog(v.Op.Asm())
   424  		p.From.Type = obj.TYPE_REG
   425  		p.From.Reg = gc.SSARegNum(v.Args[1])
   426  		p.To.Type = obj.TYPE_REG
   427  		p.To.Reg = r
   428  
   429  	case ssa.OpAMD64MULQconst, ssa.OpAMD64MULLconst:
   430  		r := gc.SSARegNum(v)
   431  		if r != gc.SSARegNum(v.Args[0]) {
   432  			v.Fatalf("input[0] and output not in same register %s", v.LongString())
   433  		}
   434  		p := gc.Prog(v.Op.Asm())
   435  		p.From.Type = obj.TYPE_CONST
   436  		p.From.Offset = v.AuxInt
   437  		p.To.Type = obj.TYPE_REG
   438  		p.To.Reg = r
   439  		// TODO: Teach doasm to compile the three-address multiply imul $c, r1, r2
   440  		// then we don't need to use resultInArg0 for these ops.
   441  		//p.From3 = new(obj.Addr)
   442  		//p.From3.Type = obj.TYPE_REG
   443  		//p.From3.Reg = gc.SSARegNum(v.Args[0])
   444  
   445  	case ssa.OpAMD64SUBQconst, ssa.OpAMD64SUBLconst,
   446  		ssa.OpAMD64ANDQconst, ssa.OpAMD64ANDLconst,
   447  		ssa.OpAMD64ORQconst, ssa.OpAMD64ORLconst,
   448  		ssa.OpAMD64XORQconst, ssa.OpAMD64XORLconst,
   449  		ssa.OpAMD64SHLQconst, ssa.OpAMD64SHLLconst,
   450  		ssa.OpAMD64SHRQconst, ssa.OpAMD64SHRLconst, ssa.OpAMD64SHRWconst, ssa.OpAMD64SHRBconst,
   451  		ssa.OpAMD64SARQconst, ssa.OpAMD64SARLconst, ssa.OpAMD64SARWconst, ssa.OpAMD64SARBconst,
   452  		ssa.OpAMD64ROLQconst, ssa.OpAMD64ROLLconst, ssa.OpAMD64ROLWconst, ssa.OpAMD64ROLBconst:
   453  		r := gc.SSARegNum(v)
   454  		if r != gc.SSARegNum(v.Args[0]) {
   455  			v.Fatalf("input[0] and output not in same register %s", v.LongString())
   456  		}
   457  		p := gc.Prog(v.Op.Asm())
   458  		p.From.Type = obj.TYPE_CONST
   459  		p.From.Offset = v.AuxInt
   460  		p.To.Type = obj.TYPE_REG
   461  		p.To.Reg = r
   462  	case ssa.OpAMD64SBBQcarrymask, ssa.OpAMD64SBBLcarrymask:
   463  		r := gc.SSARegNum(v)
   464  		p := gc.Prog(v.Op.Asm())
   465  		p.From.Type = obj.TYPE_REG
   466  		p.From.Reg = r
   467  		p.To.Type = obj.TYPE_REG
   468  		p.To.Reg = r
   469  	case ssa.OpAMD64LEAQ1, ssa.OpAMD64LEAQ2, ssa.OpAMD64LEAQ4, ssa.OpAMD64LEAQ8:
   470  		r := gc.SSARegNum(v.Args[0])
   471  		i := gc.SSARegNum(v.Args[1])
   472  		p := gc.Prog(x86.ALEAQ)
   473  		switch v.Op {
   474  		case ssa.OpAMD64LEAQ1:
   475  			p.From.Scale = 1
   476  			if i == x86.REG_SP {
   477  				r, i = i, r
   478  			}
   479  		case ssa.OpAMD64LEAQ2:
   480  			p.From.Scale = 2
   481  		case ssa.OpAMD64LEAQ4:
   482  			p.From.Scale = 4
   483  		case ssa.OpAMD64LEAQ8:
   484  			p.From.Scale = 8
   485  		}
   486  		p.From.Type = obj.TYPE_MEM
   487  		p.From.Reg = r
   488  		p.From.Index = i
   489  		gc.AddAux(&p.From, v)
   490  		p.To.Type = obj.TYPE_REG
   491  		p.To.Reg = gc.SSARegNum(v)
   492  	case ssa.OpAMD64LEAQ, ssa.OpAMD64LEAL:
   493  		p := gc.Prog(v.Op.Asm())
   494  		p.From.Type = obj.TYPE_MEM
   495  		p.From.Reg = gc.SSARegNum(v.Args[0])
   496  		gc.AddAux(&p.From, v)
   497  		p.To.Type = obj.TYPE_REG
   498  		p.To.Reg = gc.SSARegNum(v)
   499  	case ssa.OpAMD64CMPQ, ssa.OpAMD64CMPL, ssa.OpAMD64CMPW, ssa.OpAMD64CMPB,
   500  		ssa.OpAMD64TESTQ, ssa.OpAMD64TESTL, ssa.OpAMD64TESTW, ssa.OpAMD64TESTB:
   501  		opregreg(v.Op.Asm(), gc.SSARegNum(v.Args[1]), gc.SSARegNum(v.Args[0]))
   502  	case ssa.OpAMD64UCOMISS, ssa.OpAMD64UCOMISD:
    503  		// The Go assembler has swapped operands for UCOMISx relative to CMP,
    504  		// so account for that right here.
   505  		opregreg(v.Op.Asm(), gc.SSARegNum(v.Args[0]), gc.SSARegNum(v.Args[1]))
   506  	case ssa.OpAMD64CMPQconst, ssa.OpAMD64CMPLconst, ssa.OpAMD64CMPWconst, ssa.OpAMD64CMPBconst:
   507  		p := gc.Prog(v.Op.Asm())
   508  		p.From.Type = obj.TYPE_REG
   509  		p.From.Reg = gc.SSARegNum(v.Args[0])
   510  		p.To.Type = obj.TYPE_CONST
   511  		p.To.Offset = v.AuxInt
   512  	case ssa.OpAMD64TESTQconst, ssa.OpAMD64TESTLconst, ssa.OpAMD64TESTWconst, ssa.OpAMD64TESTBconst:
   513  		p := gc.Prog(v.Op.Asm())
   514  		p.From.Type = obj.TYPE_CONST
   515  		p.From.Offset = v.AuxInt
   516  		p.To.Type = obj.TYPE_REG
   517  		p.To.Reg = gc.SSARegNum(v.Args[0])
   518  	case ssa.OpAMD64MOVLconst, ssa.OpAMD64MOVQconst:
   519  		x := gc.SSARegNum(v)
   520  		p := gc.Prog(v.Op.Asm())
   521  		p.From.Type = obj.TYPE_CONST
   522  		p.From.Offset = v.AuxInt
   523  		p.To.Type = obj.TYPE_REG
   524  		p.To.Reg = x
   525  		// If flags are live at this instruction, suppress the
   526  		// MOV $0,AX -> XOR AX,AX optimization.
   527  		if v.Aux != nil {
   528  			p.Mark |= x86.PRESERVEFLAGS
   529  		}
   530  	case ssa.OpAMD64MOVSSconst, ssa.OpAMD64MOVSDconst:
   531  		x := gc.SSARegNum(v)
   532  		p := gc.Prog(v.Op.Asm())
   533  		p.From.Type = obj.TYPE_FCONST
   534  		p.From.Val = math.Float64frombits(uint64(v.AuxInt))
   535  		p.To.Type = obj.TYPE_REG
   536  		p.To.Reg = x
   537  	case ssa.OpAMD64MOVQload, ssa.OpAMD64MOVSSload, ssa.OpAMD64MOVSDload, ssa.OpAMD64MOVLload, ssa.OpAMD64MOVWload, ssa.OpAMD64MOVBload, ssa.OpAMD64MOVBQSXload, ssa.OpAMD64MOVWQSXload, ssa.OpAMD64MOVLQSXload, ssa.OpAMD64MOVOload:
   538  		p := gc.Prog(v.Op.Asm())
   539  		p.From.Type = obj.TYPE_MEM
   540  		p.From.Reg = gc.SSARegNum(v.Args[0])
   541  		gc.AddAux(&p.From, v)
   542  		p.To.Type = obj.TYPE_REG
   543  		p.To.Reg = gc.SSARegNum(v)
   544  	case ssa.OpAMD64MOVQloadidx8, ssa.OpAMD64MOVSDloadidx8:
   545  		p := gc.Prog(v.Op.Asm())
   546  		p.From.Type = obj.TYPE_MEM
   547  		p.From.Reg = gc.SSARegNum(v.Args[0])
   548  		gc.AddAux(&p.From, v)
   549  		p.From.Scale = 8
   550  		p.From.Index = gc.SSARegNum(v.Args[1])
   551  		p.To.Type = obj.TYPE_REG
   552  		p.To.Reg = gc.SSARegNum(v)
   553  	case ssa.OpAMD64MOVLloadidx4, ssa.OpAMD64MOVSSloadidx4:
   554  		p := gc.Prog(v.Op.Asm())
   555  		p.From.Type = obj.TYPE_MEM
   556  		p.From.Reg = gc.SSARegNum(v.Args[0])
   557  		gc.AddAux(&p.From, v)
   558  		p.From.Scale = 4
   559  		p.From.Index = gc.SSARegNum(v.Args[1])
   560  		p.To.Type = obj.TYPE_REG
   561  		p.To.Reg = gc.SSARegNum(v)
   562  	case ssa.OpAMD64MOVWloadidx2:
   563  		p := gc.Prog(v.Op.Asm())
   564  		p.From.Type = obj.TYPE_MEM
   565  		p.From.Reg = gc.SSARegNum(v.Args[0])
   566  		gc.AddAux(&p.From, v)
   567  		p.From.Scale = 2
   568  		p.From.Index = gc.SSARegNum(v.Args[1])
   569  		p.To.Type = obj.TYPE_REG
   570  		p.To.Reg = gc.SSARegNum(v)
   571  	case ssa.OpAMD64MOVBloadidx1, ssa.OpAMD64MOVWloadidx1, ssa.OpAMD64MOVLloadidx1, ssa.OpAMD64MOVQloadidx1, ssa.OpAMD64MOVSSloadidx1, ssa.OpAMD64MOVSDloadidx1:
   572  		r := gc.SSARegNum(v.Args[0])
   573  		i := gc.SSARegNum(v.Args[1])
   574  		if i == x86.REG_SP {
   575  			r, i = i, r
   576  		}
   577  		p := gc.Prog(v.Op.Asm())
   578  		p.From.Type = obj.TYPE_MEM
   579  		p.From.Reg = r
   580  		p.From.Scale = 1
   581  		p.From.Index = i
   582  		gc.AddAux(&p.From, v)
   583  		p.To.Type = obj.TYPE_REG
   584  		p.To.Reg = gc.SSARegNum(v)
   585  	case ssa.OpAMD64MOVQstore, ssa.OpAMD64MOVSSstore, ssa.OpAMD64MOVSDstore, ssa.OpAMD64MOVLstore, ssa.OpAMD64MOVWstore, ssa.OpAMD64MOVBstore, ssa.OpAMD64MOVOstore:
   586  		p := gc.Prog(v.Op.Asm())
   587  		p.From.Type = obj.TYPE_REG
   588  		p.From.Reg = gc.SSARegNum(v.Args[1])
   589  		p.To.Type = obj.TYPE_MEM
   590  		p.To.Reg = gc.SSARegNum(v.Args[0])
   591  		gc.AddAux(&p.To, v)
   592  	case ssa.OpAMD64MOVQstoreidx8, ssa.OpAMD64MOVSDstoreidx8:
   593  		p := gc.Prog(v.Op.Asm())
   594  		p.From.Type = obj.TYPE_REG
   595  		p.From.Reg = gc.SSARegNum(v.Args[2])
   596  		p.To.Type = obj.TYPE_MEM
   597  		p.To.Reg = gc.SSARegNum(v.Args[0])
   598  		p.To.Scale = 8
   599  		p.To.Index = gc.SSARegNum(v.Args[1])
   600  		gc.AddAux(&p.To, v)
   601  	case ssa.OpAMD64MOVSSstoreidx4, ssa.OpAMD64MOVLstoreidx4:
   602  		p := gc.Prog(v.Op.Asm())
   603  		p.From.Type = obj.TYPE_REG
   604  		p.From.Reg = gc.SSARegNum(v.Args[2])
   605  		p.To.Type = obj.TYPE_MEM
   606  		p.To.Reg = gc.SSARegNum(v.Args[0])
   607  		p.To.Scale = 4
   608  		p.To.Index = gc.SSARegNum(v.Args[1])
   609  		gc.AddAux(&p.To, v)
   610  	case ssa.OpAMD64MOVWstoreidx2:
   611  		p := gc.Prog(v.Op.Asm())
   612  		p.From.Type = obj.TYPE_REG
   613  		p.From.Reg = gc.SSARegNum(v.Args[2])
   614  		p.To.Type = obj.TYPE_MEM
   615  		p.To.Reg = gc.SSARegNum(v.Args[0])
   616  		p.To.Scale = 2
   617  		p.To.Index = gc.SSARegNum(v.Args[1])
   618  		gc.AddAux(&p.To, v)
   619  	case ssa.OpAMD64MOVBstoreidx1, ssa.OpAMD64MOVWstoreidx1, ssa.OpAMD64MOVLstoreidx1, ssa.OpAMD64MOVQstoreidx1, ssa.OpAMD64MOVSSstoreidx1, ssa.OpAMD64MOVSDstoreidx1:
   620  		r := gc.SSARegNum(v.Args[0])
   621  		i := gc.SSARegNum(v.Args[1])
   622  		if i == x86.REG_SP {
   623  			r, i = i, r
   624  		}
   625  		p := gc.Prog(v.Op.Asm())
   626  		p.From.Type = obj.TYPE_REG
   627  		p.From.Reg = gc.SSARegNum(v.Args[2])
   628  		p.To.Type = obj.TYPE_MEM
   629  		p.To.Reg = r
   630  		p.To.Scale = 1
   631  		p.To.Index = i
   632  		gc.AddAux(&p.To, v)
   633  	case ssa.OpAMD64MOVQstoreconst, ssa.OpAMD64MOVLstoreconst, ssa.OpAMD64MOVWstoreconst, ssa.OpAMD64MOVBstoreconst:
   634  		p := gc.Prog(v.Op.Asm())
   635  		p.From.Type = obj.TYPE_CONST
   636  		sc := v.AuxValAndOff()
   637  		p.From.Offset = sc.Val()
   638  		p.To.Type = obj.TYPE_MEM
   639  		p.To.Reg = gc.SSARegNum(v.Args[0])
   640  		gc.AddAux2(&p.To, v, sc.Off())
   641  	case ssa.OpAMD64MOVQstoreconstidx1, ssa.OpAMD64MOVQstoreconstidx8, ssa.OpAMD64MOVLstoreconstidx1, ssa.OpAMD64MOVLstoreconstidx4, ssa.OpAMD64MOVWstoreconstidx1, ssa.OpAMD64MOVWstoreconstidx2, ssa.OpAMD64MOVBstoreconstidx1:
   642  		p := gc.Prog(v.Op.Asm())
   643  		p.From.Type = obj.TYPE_CONST
   644  		sc := v.AuxValAndOff()
   645  		p.From.Offset = sc.Val()
   646  		r := gc.SSARegNum(v.Args[0])
   647  		i := gc.SSARegNum(v.Args[1])
   648  		switch v.Op {
   649  		case ssa.OpAMD64MOVBstoreconstidx1, ssa.OpAMD64MOVWstoreconstidx1, ssa.OpAMD64MOVLstoreconstidx1, ssa.OpAMD64MOVQstoreconstidx1:
   650  			p.To.Scale = 1
   651  			if i == x86.REG_SP {
   652  				r, i = i, r
   653  			}
   654  		case ssa.OpAMD64MOVWstoreconstidx2:
   655  			p.To.Scale = 2
   656  		case ssa.OpAMD64MOVLstoreconstidx4:
   657  			p.To.Scale = 4
   658  		case ssa.OpAMD64MOVQstoreconstidx8:
   659  			p.To.Scale = 8
   660  		}
   661  		p.To.Type = obj.TYPE_MEM
   662  		p.To.Reg = r
   663  		p.To.Index = i
   664  		gc.AddAux2(&p.To, v, sc.Off())
   665  	case ssa.OpAMD64MOVLQSX, ssa.OpAMD64MOVWQSX, ssa.OpAMD64MOVBQSX, ssa.OpAMD64MOVLQZX, ssa.OpAMD64MOVWQZX, ssa.OpAMD64MOVBQZX,
   666  		ssa.OpAMD64CVTSL2SS, ssa.OpAMD64CVTSL2SD, ssa.OpAMD64CVTSQ2SS, ssa.OpAMD64CVTSQ2SD,
   667  		ssa.OpAMD64CVTTSS2SL, ssa.OpAMD64CVTTSD2SL, ssa.OpAMD64CVTTSS2SQ, ssa.OpAMD64CVTTSD2SQ,
   668  		ssa.OpAMD64CVTSS2SD, ssa.OpAMD64CVTSD2SS:
   669  		opregreg(v.Op.Asm(), gc.SSARegNum(v), gc.SSARegNum(v.Args[0]))
   670  	case ssa.OpAMD64DUFFZERO:
   671  		off := duffStart(v.AuxInt)
   672  		adj := duffAdj(v.AuxInt)
   673  		var p *obj.Prog
   674  		if adj != 0 {
   675  			p = gc.Prog(x86.AADDQ)
   676  			p.From.Type = obj.TYPE_CONST
   677  			p.From.Offset = adj
   678  			p.To.Type = obj.TYPE_REG
   679  			p.To.Reg = x86.REG_DI
   680  		}
   681  		p = gc.Prog(obj.ADUFFZERO)
   682  		p.To.Type = obj.TYPE_ADDR
   683  		p.To.Sym = gc.Linksym(gc.Pkglookup("duffzero", gc.Runtimepkg))
   684  		p.To.Offset = off
   685  	case ssa.OpAMD64MOVOconst:
   686  		if v.AuxInt != 0 {
   687  			v.Unimplementedf("MOVOconst can only do constant=0")
   688  		}
   689  		r := gc.SSARegNum(v)
   690  		opregreg(x86.AXORPS, r, r)
   691  	case ssa.OpAMD64DUFFCOPY:
   692  		p := gc.Prog(obj.ADUFFCOPY)
   693  		p.To.Type = obj.TYPE_ADDR
   694  		p.To.Sym = gc.Linksym(gc.Pkglookup("duffcopy", gc.Runtimepkg))
   695  		p.To.Offset = v.AuxInt
   696  
   697  	case ssa.OpCopy, ssa.OpAMD64MOVQconvert, ssa.OpAMD64MOVLconvert: // TODO: use MOVQreg for reg->reg copies instead of OpCopy?
   698  		if v.Type.IsMemory() {
   699  			return
   700  		}
   701  		x := gc.SSARegNum(v.Args[0])
   702  		y := gc.SSARegNum(v)
   703  		if x != y {
   704  			opregreg(moveByType(v.Type), y, x)
   705  		}
   706  	case ssa.OpLoadReg:
   707  		if v.Type.IsFlags() {
   708  			v.Unimplementedf("load flags not implemented: %v", v.LongString())
   709  			return
   710  		}
   711  		p := gc.Prog(loadByType(v.Type))
   712  		n, off := gc.AutoVar(v.Args[0])
   713  		p.From.Type = obj.TYPE_MEM
   714  		p.From.Node = n
   715  		p.From.Sym = gc.Linksym(n.Sym)
   716  		p.From.Offset = off
   717  		if n.Class == gc.PPARAM || n.Class == gc.PPARAMOUT {
   718  			p.From.Name = obj.NAME_PARAM
   719  			p.From.Offset += n.Xoffset
   720  		} else {
   721  			p.From.Name = obj.NAME_AUTO
   722  		}
   723  		p.To.Type = obj.TYPE_REG
   724  		p.To.Reg = gc.SSARegNum(v)
   725  
   726  	case ssa.OpStoreReg:
   727  		if v.Type.IsFlags() {
   728  			v.Unimplementedf("store flags not implemented: %v", v.LongString())
   729  			return
   730  		}
   731  		p := gc.Prog(storeByType(v.Type))
   732  		p.From.Type = obj.TYPE_REG
   733  		p.From.Reg = gc.SSARegNum(v.Args[0])
   734  		n, off := gc.AutoVar(v)
   735  		p.To.Type = obj.TYPE_MEM
   736  		p.To.Node = n
   737  		p.To.Sym = gc.Linksym(n.Sym)
   738  		p.To.Offset = off
   739  		if n.Class == gc.PPARAM || n.Class == gc.PPARAMOUT {
   740  			p.To.Name = obj.NAME_PARAM
   741  			p.To.Offset += n.Xoffset
   742  		} else {
   743  			p.To.Name = obj.NAME_AUTO
   744  		}
   745  	case ssa.OpPhi:
   746  		gc.CheckLoweredPhi(v)
   747  	case ssa.OpInitMem:
   748  		// memory arg needs no code
   749  	case ssa.OpArg:
   750  		// input args need no code
   751  	case ssa.OpAMD64LoweredGetClosurePtr:
   752  		// Closure pointer is DX.
   753  		gc.CheckLoweredGetClosurePtr(v)
   754  	case ssa.OpAMD64LoweredGetG:
   755  		r := gc.SSARegNum(v)
   756  		// See the comments in cmd/internal/obj/x86/obj6.go
   757  		// near CanUse1InsnTLS for a detailed explanation of these instructions.
   758  		if x86.CanUse1InsnTLS(gc.Ctxt) {
   759  			// MOVQ (TLS), r
   760  			p := gc.Prog(x86.AMOVQ)
   761  			p.From.Type = obj.TYPE_MEM
   762  			p.From.Reg = x86.REG_TLS
   763  			p.To.Type = obj.TYPE_REG
   764  			p.To.Reg = r
   765  		} else {
   766  			// MOVQ TLS, r
   767  			// MOVQ (r)(TLS*1), r
   768  			p := gc.Prog(x86.AMOVQ)
   769  			p.From.Type = obj.TYPE_REG
   770  			p.From.Reg = x86.REG_TLS
   771  			p.To.Type = obj.TYPE_REG
   772  			p.To.Reg = r
   773  			q := gc.Prog(x86.AMOVQ)
   774  			q.From.Type = obj.TYPE_MEM
   775  			q.From.Reg = r
   776  			q.From.Index = x86.REG_TLS
   777  			q.From.Scale = 1
   778  			q.To.Type = obj.TYPE_REG
   779  			q.To.Reg = r
   780  		}
   781  	case ssa.OpAMD64CALLstatic:
   782  		if v.Aux.(*gc.Sym) == gc.Deferreturn.Sym {
   783  			// Deferred calls will appear to be returning to
   784  			// the CALL deferreturn(SB) that we are about to emit.
   785  			// However, the stack trace code will show the line
   786  			// of the instruction byte before the return PC.
   787  			// To avoid that being an unrelated instruction,
   788  			// insert an actual hardware NOP that will have the right line number.
   789  			// This is different from obj.ANOP, which is a virtual no-op
   790  			// that doesn't make it into the instruction stream.
   791  			ginsnop()
   792  		}
   793  		p := gc.Prog(obj.ACALL)
   794  		p.To.Type = obj.TYPE_MEM
   795  		p.To.Name = obj.NAME_EXTERN
   796  		p.To.Sym = gc.Linksym(v.Aux.(*gc.Sym))
   797  		if gc.Maxarg < v.AuxInt {
   798  			gc.Maxarg = v.AuxInt
   799  		}
   800  	case ssa.OpAMD64CALLclosure:
   801  		p := gc.Prog(obj.ACALL)
   802  		p.To.Type = obj.TYPE_REG
   803  		p.To.Reg = gc.SSARegNum(v.Args[0])
   804  		if gc.Maxarg < v.AuxInt {
   805  			gc.Maxarg = v.AuxInt
   806  		}
   807  	case ssa.OpAMD64CALLdefer:
   808  		p := gc.Prog(obj.ACALL)
   809  		p.To.Type = obj.TYPE_MEM
   810  		p.To.Name = obj.NAME_EXTERN
   811  		p.To.Sym = gc.Linksym(gc.Deferproc.Sym)
   812  		if gc.Maxarg < v.AuxInt {
   813  			gc.Maxarg = v.AuxInt
   814  		}
   815  	case ssa.OpAMD64CALLgo:
   816  		p := gc.Prog(obj.ACALL)
   817  		p.To.Type = obj.TYPE_MEM
   818  		p.To.Name = obj.NAME_EXTERN
   819  		p.To.Sym = gc.Linksym(gc.Newproc.Sym)
   820  		if gc.Maxarg < v.AuxInt {
   821  			gc.Maxarg = v.AuxInt
   822  		}
   823  	case ssa.OpAMD64CALLinter:
   824  		p := gc.Prog(obj.ACALL)
   825  		p.To.Type = obj.TYPE_REG
   826  		p.To.Reg = gc.SSARegNum(v.Args[0])
   827  		if gc.Maxarg < v.AuxInt {
   828  			gc.Maxarg = v.AuxInt
   829  		}
   830  	case ssa.OpAMD64NEGQ, ssa.OpAMD64NEGL,
   831  		ssa.OpAMD64BSWAPQ, ssa.OpAMD64BSWAPL,
   832  		ssa.OpAMD64NOTQ, ssa.OpAMD64NOTL:
   833  		r := gc.SSARegNum(v)
   834  		if r != gc.SSARegNum(v.Args[0]) {
   835  			v.Fatalf("input[0] and output not in same register %s", v.LongString())
   836  		}
   837  		p := gc.Prog(v.Op.Asm())
   838  		p.To.Type = obj.TYPE_REG
   839  		p.To.Reg = r
   840  	case ssa.OpAMD64BSFQ, ssa.OpAMD64BSFL:
   841  		p := gc.Prog(v.Op.Asm())
   842  		p.From.Type = obj.TYPE_REG
   843  		p.From.Reg = gc.SSARegNum(v.Args[0])
   844  		p.To.Type = obj.TYPE_REG
   845  		p.To.Reg = gc.SSARegNum0(v)
   846  	case ssa.OpAMD64SQRTSD:
   847  		p := gc.Prog(v.Op.Asm())
   848  		p.From.Type = obj.TYPE_REG
   849  		p.From.Reg = gc.SSARegNum(v.Args[0])
   850  		p.To.Type = obj.TYPE_REG
   851  		p.To.Reg = gc.SSARegNum(v)
   852  	case ssa.OpSP, ssa.OpSB:
   853  		// nothing to do
   854  	case ssa.OpSelect0, ssa.OpSelect1:
   855  		// nothing to do
   856  	case ssa.OpAMD64SETEQ, ssa.OpAMD64SETNE,
   857  		ssa.OpAMD64SETL, ssa.OpAMD64SETLE,
   858  		ssa.OpAMD64SETG, ssa.OpAMD64SETGE,
   859  		ssa.OpAMD64SETGF, ssa.OpAMD64SETGEF,
   860  		ssa.OpAMD64SETB, ssa.OpAMD64SETBE,
   861  		ssa.OpAMD64SETORD, ssa.OpAMD64SETNAN,
   862  		ssa.OpAMD64SETA, ssa.OpAMD64SETAE:
   863  		p := gc.Prog(v.Op.Asm())
   864  		p.To.Type = obj.TYPE_REG
   865  		p.To.Reg = gc.SSARegNum(v)
   866  
   867  	case ssa.OpAMD64SETNEF:
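         		// x != y is also true when the comparison is unordered (NaN),
         		// so OR the SETNE result with SETPS (parity is set on unordered).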
   868  		p := gc.Prog(v.Op.Asm())
   869  		p.To.Type = obj.TYPE_REG
   870  		p.To.Reg = gc.SSARegNum(v)
   871  		q := gc.Prog(x86.ASETPS)
   872  		q.To.Type = obj.TYPE_REG
   873  		q.To.Reg = x86.REG_AX
    874  		// ORL avoids a partial register write and is smaller than ORQ, which the old compiler used.
   875  		opregreg(x86.AORL, gc.SSARegNum(v), x86.REG_AX)
   876  
   877  	case ssa.OpAMD64SETEQF:
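         		// x == y requires an ordered comparison, so AND the SETEQ result
         		// with SETPC (parity is clear when the comparison is ordered).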
   878  		p := gc.Prog(v.Op.Asm())
   879  		p.To.Type = obj.TYPE_REG
   880  		p.To.Reg = gc.SSARegNum(v)
   881  		q := gc.Prog(x86.ASETPC)
   882  		q.To.Type = obj.TYPE_REG
   883  		q.To.Reg = x86.REG_AX
    884  		// ANDL avoids a partial register write and is smaller than ANDQ, which the old compiler used.
   885  		opregreg(x86.AANDL, gc.SSARegNum(v), x86.REG_AX)
   886  
   887  	case ssa.OpAMD64InvertFlags:
   888  		v.Fatalf("InvertFlags should never make it to codegen %v", v.LongString())
   889  	case ssa.OpAMD64FlagEQ, ssa.OpAMD64FlagLT_ULT, ssa.OpAMD64FlagLT_UGT, ssa.OpAMD64FlagGT_ULT, ssa.OpAMD64FlagGT_UGT:
   890  		v.Fatalf("Flag* ops should never make it to codegen %v", v.LongString())
   891  	case ssa.OpAMD64AddTupleFirst32, ssa.OpAMD64AddTupleFirst64:
   892  		v.Fatalf("AddTupleFirst* should never make it to codegen %v", v.LongString())
   893  	case ssa.OpAMD64REPSTOSQ:
   894  		gc.Prog(x86.AREP)
   895  		gc.Prog(x86.ASTOSQ)
   896  	case ssa.OpAMD64REPMOVSQ:
   897  		gc.Prog(x86.AREP)
   898  		gc.Prog(x86.AMOVSQ)
   899  	case ssa.OpVarDef:
   900  		gc.Gvardef(v.Aux.(*gc.Node))
   901  	case ssa.OpVarKill:
   902  		gc.Gvarkill(v.Aux.(*gc.Node))
   903  	case ssa.OpVarLive:
   904  		gc.Gvarlive(v.Aux.(*gc.Node))
   905  	case ssa.OpKeepAlive:
   906  		if !v.Args[0].Type.IsPtrShaped() {
   907  			v.Fatalf("keeping non-pointer alive %v", v.Args[0])
   908  		}
   909  		n, off := gc.AutoVar(v.Args[0])
   910  		if n == nil {
    911  			v.Fatalf("KeepAlive with non-spilled value %s %s", v, v.Args[0])
   912  		}
   913  		if off != 0 {
    914  			v.Fatalf("KeepAlive with non-zero offset spill location %s:%d", n, off)
   915  		}
   916  		gc.Gvarlive(n)
   917  	case ssa.OpAMD64LoweredNilCheck:
   918  		// Optimization - if the subsequent block has a load or store
   919  		// at the same address, we don't need to issue this instruction.
   920  		mem := v.Args[1]
   921  		for _, w := range v.Block.Succs[0].Block().Values {
   922  			if w.Op == ssa.OpPhi {
   923  				if w.Type.IsMemory() {
   924  					mem = w
   925  				}
   926  				continue
   927  			}
   928  			if len(w.Args) == 0 || !w.Args[len(w.Args)-1].Type.IsMemory() {
   929  				// w doesn't use a store - can't be a memory op.
   930  				continue
   931  			}
   932  			if w.Args[len(w.Args)-1] != mem {
   933  				v.Fatalf("wrong store after nilcheck v=%s w=%s", v, w)
   934  			}
   935  			switch w.Op {
   936  			case ssa.OpAMD64MOVQload, ssa.OpAMD64MOVLload, ssa.OpAMD64MOVWload, ssa.OpAMD64MOVBload,
   937  				ssa.OpAMD64MOVQstore, ssa.OpAMD64MOVLstore, ssa.OpAMD64MOVWstore, ssa.OpAMD64MOVBstore,
   938  				ssa.OpAMD64MOVBQSXload, ssa.OpAMD64MOVWQSXload, ssa.OpAMD64MOVLQSXload,
   939  				ssa.OpAMD64MOVSSload, ssa.OpAMD64MOVSDload, ssa.OpAMD64MOVOload,
   940  				ssa.OpAMD64MOVSSstore, ssa.OpAMD64MOVSDstore, ssa.OpAMD64MOVOstore,
   941  				ssa.OpAMD64MOVQatomicload, ssa.OpAMD64MOVLatomicload,
   942  				ssa.OpAMD64CMPXCHGQlock, ssa.OpAMD64CMPXCHGLlock,
   943  				ssa.OpAMD64ANDBlock, ssa.OpAMD64ORBlock:
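         				// These ops access memory at Args[0]+AuxInt. If that is the
         				// pointer being checked and the offset is within the first
         				// (unmapped) page, the op itself will fault on nil and the
         				// explicit check is redundant.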
   944  				if w.Args[0] == v.Args[0] && w.Aux == nil && w.AuxInt >= 0 && w.AuxInt < minZeroPage {
   945  					if gc.Debug_checknil != 0 && int(v.Line) > 1 {
   946  						gc.Warnl(v.Line, "removed nil check")
   947  					}
   948  					return
   949  				}
   950  			case ssa.OpAMD64XCHGL, ssa.OpAMD64XCHGQ, ssa.OpAMD64XADDLlock, ssa.OpAMD64XADDQlock:
   951  				if w.Args[1] == v.Args[0] && w.Aux == nil && w.AuxInt >= 0 && w.AuxInt < minZeroPage {
   952  					if gc.Debug_checknil != 0 && int(v.Line) > 1 {
   953  						gc.Warnl(v.Line, "removed nil check")
   954  					}
   955  					return
   956  				}
   957  			case ssa.OpAMD64MOVQstoreconst, ssa.OpAMD64MOVLstoreconst, ssa.OpAMD64MOVWstoreconst, ssa.OpAMD64MOVBstoreconst:
   958  				off := ssa.ValAndOff(v.AuxInt).Off()
   959  				if w.Args[0] == v.Args[0] && w.Aux == nil && off >= 0 && off < minZeroPage {
   960  					if gc.Debug_checknil != 0 && int(v.Line) > 1 {
   961  						gc.Warnl(v.Line, "removed nil check")
   962  					}
   963  					return
   964  				}
   965  			}
   966  			if w.Type.IsMemory() || w.Type.IsTuple() && w.Type.FieldType(1).IsMemory() {
   967  				if w.Op == ssa.OpVarDef || w.Op == ssa.OpVarKill || w.Op == ssa.OpVarLive {
   968  					// these ops are OK
   969  					mem = w
   970  					continue
   971  				}
   972  				// We can't delay the nil check past the next store.
   973  				break
   974  			}
   975  		}
   976  		// Issue a load which will fault if the input is nil.
   977  		// TODO: We currently use the 2-byte instruction TESTB AX, (reg).
   978  		// Should we use the 3-byte TESTB $0, (reg) instead?  It is larger
   979  		// but it doesn't have false dependency on AX.
   980  		// Or maybe allocate an output register and use MOVL (reg),reg2 ?
   981  		// That trades clobbering flags for clobbering a register.
   982  		p := gc.Prog(x86.ATESTB)
   983  		p.From.Type = obj.TYPE_REG
   984  		p.From.Reg = x86.REG_AX
   985  		p.To.Type = obj.TYPE_MEM
   986  		p.To.Reg = gc.SSARegNum(v.Args[0])
   987  		gc.AddAux(&p.To, v)
   988  		if gc.Debug_checknil != 0 && v.Line > 1 { // v.Line==1 in generated wrappers
   989  			gc.Warnl(v.Line, "generated nil check")
   990  		}
   991  	case ssa.OpAMD64MOVLatomicload, ssa.OpAMD64MOVQatomicload:
   992  		p := gc.Prog(v.Op.Asm())
   993  		p.From.Type = obj.TYPE_MEM
   994  		p.From.Reg = gc.SSARegNum(v.Args[0])
   995  		gc.AddAux(&p.From, v)
   996  		p.To.Type = obj.TYPE_REG
   997  		p.To.Reg = gc.SSARegNum0(v)
   998  	case ssa.OpAMD64XCHGL, ssa.OpAMD64XCHGQ:
   999  		r := gc.SSARegNum0(v)
  1000  		if r != gc.SSARegNum(v.Args[0]) {
  1001  			v.Fatalf("input[0] and output[0] not in same register %s", v.LongString())
  1002  		}
  1003  		p := gc.Prog(v.Op.Asm())
  1004  		p.From.Type = obj.TYPE_REG
  1005  		p.From.Reg = r
  1006  		p.To.Type = obj.TYPE_MEM
  1007  		p.To.Reg = gc.SSARegNum(v.Args[1])
  1008  		gc.AddAux(&p.To, v)
  1009  	case ssa.OpAMD64XADDLlock, ssa.OpAMD64XADDQlock:
  1010  		r := gc.SSARegNum0(v)
  1011  		if r != gc.SSARegNum(v.Args[0]) {
  1012  			v.Fatalf("input[0] and output[0] not in same register %s", v.LongString())
  1013  		}
  1014  		gc.Prog(x86.ALOCK)
  1015  		p := gc.Prog(v.Op.Asm())
  1016  		p.From.Type = obj.TYPE_REG
  1017  		p.From.Reg = r
  1018  		p.To.Type = obj.TYPE_MEM
  1019  		p.To.Reg = gc.SSARegNum(v.Args[1])
  1020  		gc.AddAux(&p.To, v)
  1021  	case ssa.OpAMD64CMPXCHGLlock, ssa.OpAMD64CMPXCHGQlock:
  1022  		if gc.SSARegNum(v.Args[1]) != x86.REG_AX {
  1023  			v.Fatalf("input[1] not in AX %s", v.LongString())
  1024  		}
  1025  		gc.Prog(x86.ALOCK)
  1026  		p := gc.Prog(v.Op.Asm())
  1027  		p.From.Type = obj.TYPE_REG
  1028  		p.From.Reg = gc.SSARegNum(v.Args[2])
  1029  		p.To.Type = obj.TYPE_MEM
  1030  		p.To.Reg = gc.SSARegNum(v.Args[0])
  1031  		gc.AddAux(&p.To, v)
  1032  		p = gc.Prog(x86.ASETEQ)
  1033  		p.To.Type = obj.TYPE_REG
  1034  		p.To.Reg = gc.SSARegNum0(v)
  1035  	case ssa.OpAMD64ANDBlock, ssa.OpAMD64ORBlock:
  1036  		gc.Prog(x86.ALOCK)
  1037  		p := gc.Prog(v.Op.Asm())
  1038  		p.From.Type = obj.TYPE_REG
  1039  		p.From.Reg = gc.SSARegNum(v.Args[1])
  1040  		p.To.Type = obj.TYPE_MEM
  1041  		p.To.Reg = gc.SSARegNum(v.Args[0])
  1042  		gc.AddAux(&p.To, v)
  1043  	default:
  1044  		v.Unimplementedf("genValue not implemented: %s", v.LongString())
  1045  	}
  1046  }
  1047  
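         // blockJump maps a conditional block kind to its branch instructions:
         // asm jumps to Succs[0] when the condition holds, and invasm is the
         // inverted jump, used when Succs[0] is the fall-through block.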
  1048  var blockJump = [...]struct {
  1049  	asm, invasm obj.As
  1050  }{
  1051  	ssa.BlockAMD64EQ:  {x86.AJEQ, x86.AJNE},
  1052  	ssa.BlockAMD64NE:  {x86.AJNE, x86.AJEQ},
  1053  	ssa.BlockAMD64LT:  {x86.AJLT, x86.AJGE},
  1054  	ssa.BlockAMD64GE:  {x86.AJGE, x86.AJLT},
  1055  	ssa.BlockAMD64LE:  {x86.AJLE, x86.AJGT},
  1056  	ssa.BlockAMD64GT:  {x86.AJGT, x86.AJLE},
  1057  	ssa.BlockAMD64ULT: {x86.AJCS, x86.AJCC},
  1058  	ssa.BlockAMD64UGE: {x86.AJCC, x86.AJCS},
  1059  	ssa.BlockAMD64UGT: {x86.AJHI, x86.AJLS},
  1060  	ssa.BlockAMD64ULE: {x86.AJLS, x86.AJHI},
  1061  	ssa.BlockAMD64ORD: {x86.AJPC, x86.AJPS},
  1062  	ssa.BlockAMD64NAN: {x86.AJPS, x86.AJPC},
  1063  }
  1064  
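         // UCOMISx reports equality in ZF and signals an unordered (NaN)
         // comparison via PF, so the EQF and NEF blocks each need a pair of
         // jumps. The tables are indexed by which successor is the fall-through.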
  1065  var eqfJumps = [2][2]gc.FloatingEQNEJump{
  1066  	{{Jump: x86.AJNE, Index: 1}, {Jump: x86.AJPS, Index: 1}}, // next == b.Succs[0]
  1067  	{{Jump: x86.AJNE, Index: 1}, {Jump: x86.AJPC, Index: 0}}, // next == b.Succs[1]
  1068  }
  1069  var nefJumps = [2][2]gc.FloatingEQNEJump{
  1070  	{{Jump: x86.AJNE, Index: 0}, {Jump: x86.AJPC, Index: 1}}, // next == b.Succs[0]
  1071  	{{Jump: x86.AJNE, Index: 0}, {Jump: x86.AJPS, Index: 0}}, // next == b.Succs[1]
  1072  }
  1073  
  1074  func ssaGenBlock(s *gc.SSAGenState, b, next *ssa.Block) {
  1075  	s.SetLineno(b.Line)
  1076  
  1077  	switch b.Kind {
  1078  	case ssa.BlockPlain, ssa.BlockCall, ssa.BlockCheck:
  1079  		if b.Succs[0].Block() != next {
  1080  			p := gc.Prog(obj.AJMP)
  1081  			p.To.Type = obj.TYPE_BRANCH
  1082  			s.Branches = append(s.Branches, gc.Branch{P: p, B: b.Succs[0].Block()})
  1083  		}
  1084  	case ssa.BlockDefer:
   1085  		// defer returns in AX:
   1086  		// 0 if we should continue executing,
   1087  		// 1 if we should jump to the deferreturn call.
  1088  		p := gc.Prog(x86.ATESTL)
  1089  		p.From.Type = obj.TYPE_REG
  1090  		p.From.Reg = x86.REG_AX
  1091  		p.To.Type = obj.TYPE_REG
  1092  		p.To.Reg = x86.REG_AX
  1093  		p = gc.Prog(x86.AJNE)
  1094  		p.To.Type = obj.TYPE_BRANCH
  1095  		s.Branches = append(s.Branches, gc.Branch{P: p, B: b.Succs[1].Block()})
  1096  		if b.Succs[0].Block() != next {
  1097  			p := gc.Prog(obj.AJMP)
  1098  			p.To.Type = obj.TYPE_BRANCH
  1099  			s.Branches = append(s.Branches, gc.Branch{P: p, B: b.Succs[0].Block()})
  1100  		}
  1101  	case ssa.BlockExit:
  1102  		gc.Prog(obj.AUNDEF) // tell plive.go that we never reach here
  1103  	case ssa.BlockRet:
  1104  		gc.Prog(obj.ARET)
  1105  	case ssa.BlockRetJmp:
  1106  		p := gc.Prog(obj.AJMP)
  1107  		p.To.Type = obj.TYPE_MEM
  1108  		p.To.Name = obj.NAME_EXTERN
  1109  		p.To.Sym = gc.Linksym(b.Aux.(*gc.Sym))
  1110  
  1111  	case ssa.BlockAMD64EQF:
  1112  		gc.SSAGenFPJump(s, b, next, &eqfJumps)
  1113  
  1114  	case ssa.BlockAMD64NEF:
  1115  		gc.SSAGenFPJump(s, b, next, &nefJumps)
  1116  
  1117  	case ssa.BlockAMD64EQ, ssa.BlockAMD64NE,
  1118  		ssa.BlockAMD64LT, ssa.BlockAMD64GE,
  1119  		ssa.BlockAMD64LE, ssa.BlockAMD64GT,
  1120  		ssa.BlockAMD64ULT, ssa.BlockAMD64UGT,
  1121  		ssa.BlockAMD64ULE, ssa.BlockAMD64UGE:
  1122  		jmp := blockJump[b.Kind]
  1123  		likely := b.Likely
  1124  		var p *obj.Prog
  1125  		switch next {
  1126  		case b.Succs[0].Block():
  1127  			p = gc.Prog(jmp.invasm)
  1128  			likely *= -1
  1129  			p.To.Type = obj.TYPE_BRANCH
  1130  			s.Branches = append(s.Branches, gc.Branch{P: p, B: b.Succs[1].Block()})
  1131  		case b.Succs[1].Block():
  1132  			p = gc.Prog(jmp.asm)
  1133  			p.To.Type = obj.TYPE_BRANCH
  1134  			s.Branches = append(s.Branches, gc.Branch{P: p, B: b.Succs[0].Block()})
  1135  		default:
  1136  			p = gc.Prog(jmp.asm)
  1137  			p.To.Type = obj.TYPE_BRANCH
  1138  			s.Branches = append(s.Branches, gc.Branch{P: p, B: b.Succs[0].Block()})
  1139  			q := gc.Prog(obj.AJMP)
  1140  			q.To.Type = obj.TYPE_BRANCH
  1141  			s.Branches = append(s.Branches, gc.Branch{P: q, B: b.Succs[1].Block()})
  1142  		}
  1143  
  1144  		// liblink reorders the instruction stream as it sees fit.
  1145  		// Pass along what we know so liblink can make use of it.
  1146  		// TODO: Once we've fully switched to SSA,
  1147  		// make liblink leave our output alone.
  1148  		switch likely {
  1149  		case ssa.BranchUnlikely:
  1150  			p.From.Type = obj.TYPE_CONST
  1151  			p.From.Offset = 0
  1152  		case ssa.BranchLikely:
  1153  			p.From.Type = obj.TYPE_CONST
  1154  			p.From.Offset = 1
  1155  		}
  1156  
  1157  	default:
  1158  		b.Unimplementedf("branch not implemented: %s. Control: %s", b.LongString(), b.Control.LongString())
  1159  	}
  1160  }