github.com/megatontech/mynoteforgo@v0.0.0-20200507084910-5d0c6ea6e890/源码/cmd/compile/internal/amd64/ssa.go

     1  // Copyright 2016 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package amd64
     6  
     7  import (
     8  	"fmt"
     9  	"math"
    10  
    11  	"cmd/compile/internal/gc"
    12  	"cmd/compile/internal/ssa"
    13  	"cmd/compile/internal/types"
    14  	"cmd/internal/obj"
    15  	"cmd/internal/obj/x86"
    16  )
    17  
    18  // ssaMarkMoves marks any MOVXconst ops that need to avoid clobbering flags.
    19  func ssaMarkMoves(s *gc.SSAGenState, b *ssa.Block) {
    20  	flive := b.FlagsLiveAtEnd
    21  	if b.Control != nil && b.Control.Type.IsFlags() {
    22  		flive = true
    23  	}
    24  	for i := len(b.Values) - 1; i >= 0; i-- {
    25  		v := b.Values[i]
    26  		if flive && (v.Op == ssa.OpAMD64MOVLconst || v.Op == ssa.OpAMD64MOVQconst) {
    27  			// The "mark" is any non-nil Aux value.
    28  			v.Aux = v
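        			// ssaGenValue (the OpAMD64MOVLconst/MOVQconst case below) only rewrites
        			// MOV $0 into the flag-clobbering XOR when v.Aux is nil, so this mark
        			// keeps the constant load as a real MOV while flags are live.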
    29  		}
    30  		if v.Type.IsFlags() {
    31  			flive = false
    32  		}
    33  		for _, a := range v.Args {
    34  			if a.Type.IsFlags() {
    35  				flive = true
    36  			}
    37  		}
    38  	}
    39  }
    40  
    41  // loadByType returns the load instruction of the given type.
    42  func loadByType(t *types.Type) obj.As {
    43  	// Avoid partial register write
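        	// Loads of 1- and 2-byte values use the zero-extending MOVBLZX/MOVWLZX
        	// forms so the whole destination register is written; a plain MOVB/MOVW
        	// would merge into the old register contents and can cause a
        	// partial-register stall on some CPUs.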
    44  	if !t.IsFloat() && t.Size() <= 2 {
    45  		if t.Size() == 1 {
    46  			return x86.AMOVBLZX
    47  		} else {
    48  			return x86.AMOVWLZX
    49  		}
    50  	}
    51  	// Otherwise, there's no difference between load and store opcodes.
    52  	return storeByType(t)
    53  }
    54  
    55  // storeByType returns the store instruction of the given type.
    56  func storeByType(t *types.Type) obj.As {
    57  	width := t.Size()
    58  	if t.IsFloat() {
    59  		switch width {
    60  		case 4:
    61  			return x86.AMOVSS
    62  		case 8:
    63  			return x86.AMOVSD
    64  		}
    65  	} else {
    66  		switch width {
    67  		case 1:
    68  			return x86.AMOVB
    69  		case 2:
    70  			return x86.AMOVW
    71  		case 4:
    72  			return x86.AMOVL
    73  		case 8:
    74  			return x86.AMOVQ
    75  		}
    76  	}
    77  	panic("bad store type")
    78  }
    79  
    80  // moveByType returns the reg->reg move instruction of the given type.
    81  func moveByType(t *types.Type) obj.As {
    82  	if t.IsFloat() {
    83  		// Moving the whole sse2 register is faster
    84  		// than moving just the correct low portion of it.
    85  		// There is no xmm->xmm move with 1 byte opcode,
    86  		// so use movups, which has 2 byte opcode.
    87  		return x86.AMOVUPS
    88  	} else {
    89  		switch t.Size() {
    90  		case 1:
    91  			// Avoids partial register write
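        			// MOVL writes the full 32-bit register and zero-extends into the
        			// upper half, so it is safe (and shorter than MOVQ) for 1-, 2- and
        			// 4-byte values.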
    92  			return x86.AMOVL
    93  		case 2:
    94  			return x86.AMOVL
    95  		case 4:
    96  			return x86.AMOVL
    97  		case 8:
    98  			return x86.AMOVQ
    99  		case 16:
   100  			return x86.AMOVUPS // int128s are in SSE registers
   101  		default:
   102  			panic(fmt.Sprintf("bad int register width %d:%s", t.Size(), t))
   103  		}
   104  	}
   105  }
   106  
   107  // opregreg emits instructions for
   108  //     dest := dest(To) op src(From)
   109  // and also returns the created obj.Prog so it
   110  // may be further adjusted (offset, scale, etc).
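        //
        // For example, opregreg(s, x86.AADDQ, dst, src) emits "ADDQ src, dst",
        // i.e. dst += src.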
   111  func opregreg(s *gc.SSAGenState, op obj.As, dest, src int16) *obj.Prog {
   112  	p := s.Prog(op)
   113  	p.From.Type = obj.TYPE_REG
   114  	p.To.Type = obj.TYPE_REG
   115  	p.To.Reg = dest
   116  	p.From.Reg = src
   117  	return p
   118  }
   119  
   120  // DUFFZERO consists of repeated blocks of 4 MOVUPSs + LEAQ.
   121  // See runtime/mkduff.go.
   122  func duffStart(size int64) int64 {
   123  	x, _ := duff(size)
   124  	return x
   125  }
   126  func duffAdj(size int64) int64 {
   127  	_, x := duff(size)
   128  	return x
   129  }
   130  
   131  // duff returns the offset (from duffzero, in bytes) and pointer adjust (in bytes)
   132  // required to use the duffzero mechanism for a block of the given size.
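        //
        // For example (illustrative, with each block laid out as described above:
        // dzBlockLen MOVUPSs of dzClearStep (16) bytes each, then one LEAQ), a
        // 48-byte zero gives steps=3, blocks=0: the branch lands at the second
        // MOVUPS of the last block and the pointer is pre-adjusted by -16 so the
        // three remaining MOVUPSs clear exactly 48 bytes.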
   133  func duff(size int64) (int64, int64) {
   134  	if size < 32 || size > 1024 || size%dzClearStep != 0 {
   135  		panic("bad duffzero size")
   136  	}
   137  	steps := size / dzClearStep
   138  	blocks := steps / dzBlockLen
   139  	steps %= dzBlockLen
   140  	off := dzBlockSize * (dzBlocks - blocks)
   141  	var adj int64
   142  	if steps != 0 {
   143  		off -= dzLeaqSize
   144  		off -= dzMovSize * steps
   145  		adj -= dzClearStep * (dzBlockLen - steps)
   146  	}
   147  	return off, adj
   148  }
   149  
   150  func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
   151  	switch v.Op {
   152  	case ssa.OpAMD64ADDQ, ssa.OpAMD64ADDL:
   153  		r := v.Reg()
   154  		r1 := v.Args[0].Reg()
   155  		r2 := v.Args[1].Reg()
   156  		switch {
   157  		case r == r1:
   158  			p := s.Prog(v.Op.Asm())
   159  			p.From.Type = obj.TYPE_REG
   160  			p.From.Reg = r2
   161  			p.To.Type = obj.TYPE_REG
   162  			p.To.Reg = r
   163  		case r == r2:
   164  			p := s.Prog(v.Op.Asm())
   165  			p.From.Type = obj.TYPE_REG
   166  			p.From.Reg = r1
   167  			p.To.Type = obj.TYPE_REG
   168  			p.To.Reg = r
   169  		default:
   170  			var asm obj.As
   171  			if v.Op == ssa.OpAMD64ADDQ {
   172  				asm = x86.ALEAQ
   173  			} else {
   174  				asm = x86.ALEAL
   175  			}
   176  			p := s.Prog(asm)
   177  			p.From.Type = obj.TYPE_MEM
   178  			p.From.Reg = r1
   179  			p.From.Scale = 1
   180  			p.From.Index = r2
   181  			p.To.Type = obj.TYPE_REG
   182  			p.To.Reg = r
   183  		}
   184  	// 2-address opcode arithmetic
   185  	case ssa.OpAMD64SUBQ, ssa.OpAMD64SUBL,
   186  		ssa.OpAMD64MULQ, ssa.OpAMD64MULL,
   187  		ssa.OpAMD64ANDQ, ssa.OpAMD64ANDL,
   188  		ssa.OpAMD64ORQ, ssa.OpAMD64ORL,
   189  		ssa.OpAMD64XORQ, ssa.OpAMD64XORL,
   190  		ssa.OpAMD64SHLQ, ssa.OpAMD64SHLL,
   191  		ssa.OpAMD64SHRQ, ssa.OpAMD64SHRL, ssa.OpAMD64SHRW, ssa.OpAMD64SHRB,
   192  		ssa.OpAMD64SARQ, ssa.OpAMD64SARL, ssa.OpAMD64SARW, ssa.OpAMD64SARB,
   193  		ssa.OpAMD64ROLQ, ssa.OpAMD64ROLL, ssa.OpAMD64ROLW, ssa.OpAMD64ROLB,
   194  		ssa.OpAMD64RORQ, ssa.OpAMD64RORL, ssa.OpAMD64RORW, ssa.OpAMD64RORB,
   195  		ssa.OpAMD64ADDSS, ssa.OpAMD64ADDSD, ssa.OpAMD64SUBSS, ssa.OpAMD64SUBSD,
   196  		ssa.OpAMD64MULSS, ssa.OpAMD64MULSD, ssa.OpAMD64DIVSS, ssa.OpAMD64DIVSD,
   197  		ssa.OpAMD64PXOR,
   198  		ssa.OpAMD64BTSL, ssa.OpAMD64BTSQ,
   199  		ssa.OpAMD64BTCL, ssa.OpAMD64BTCQ,
   200  		ssa.OpAMD64BTRL, ssa.OpAMD64BTRQ:
   201  		r := v.Reg()
   202  		if r != v.Args[0].Reg() {
   203  			v.Fatalf("input[0] and output not in same register %s", v.LongString())
   204  		}
   205  		opregreg(s, v.Op.Asm(), r, v.Args[1].Reg())
   206  
   207  	case ssa.OpAMD64DIVQU, ssa.OpAMD64DIVLU, ssa.OpAMD64DIVWU:
   208  		// Arg[0] (the dividend) is in AX.
   209  		// Arg[1] (the divisor) can be in any other register.
   210  		// Result[0] (the quotient) is in AX.
   211  		// Result[1] (the remainder) is in DX.
   212  		r := v.Args[1].Reg()
   213  
   214  		// Zero extend dividend.
   215  		c := s.Prog(x86.AXORL)
   216  		c.From.Type = obj.TYPE_REG
   217  		c.From.Reg = x86.REG_DX
   218  		c.To.Type = obj.TYPE_REG
   219  		c.To.Reg = x86.REG_DX
   220  
   221  		// Issue divide.
   222  		p := s.Prog(v.Op.Asm())
   223  		p.From.Type = obj.TYPE_REG
   224  		p.From.Reg = r
   225  
   226  	case ssa.OpAMD64DIVQ, ssa.OpAMD64DIVL, ssa.OpAMD64DIVW:
   227  		// Arg[0] (the dividend) is in AX.
   228  		// Arg[1] (the divisor) can be in any other register.
   229  		// Result[0] (the quotient) is in AX.
   230  		// Result[1] (the remainder) is in DX.
   231  		r := v.Args[1].Reg()
   232  		var j1 *obj.Prog
   233  
   234  		// CPU faults upon signed overflow, which occurs when the most
   235  		// negative int is divided by -1. Handle divide by -1 as a special case.
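        		// Note that n / -1 == -n and n % -1 == 0 for every n, and the Go spec
        		// defines MinInt / -1 == MinInt (overflow wraps), so the NEGx/XORL
        		// fixup below is correct for all dividends.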
   236  		if ssa.NeedsFixUp(v) {
   237  			var c *obj.Prog
   238  			switch v.Op {
   239  			case ssa.OpAMD64DIVQ:
   240  				c = s.Prog(x86.ACMPQ)
   241  			case ssa.OpAMD64DIVL:
   242  				c = s.Prog(x86.ACMPL)
   243  			case ssa.OpAMD64DIVW:
   244  				c = s.Prog(x86.ACMPW)
   245  			}
   246  			c.From.Type = obj.TYPE_REG
   247  			c.From.Reg = r
   248  			c.To.Type = obj.TYPE_CONST
   249  			c.To.Offset = -1
   250  			j1 = s.Prog(x86.AJEQ)
   251  			j1.To.Type = obj.TYPE_BRANCH
   252  		}
   253  
   254  		// Sign extend dividend.
   255  		switch v.Op {
   256  		case ssa.OpAMD64DIVQ:
   257  			s.Prog(x86.ACQO)
   258  		case ssa.OpAMD64DIVL:
   259  			s.Prog(x86.ACDQ)
   260  		case ssa.OpAMD64DIVW:
   261  			s.Prog(x86.ACWD)
   262  		}
   263  
   264  		// Issue divide.
   265  		p := s.Prog(v.Op.Asm())
   266  		p.From.Type = obj.TYPE_REG
   267  		p.From.Reg = r
   268  
   269  		if j1 != nil {
   270  			// Skip over -1 fixup code.
   271  			j2 := s.Prog(obj.AJMP)
   272  			j2.To.Type = obj.TYPE_BRANCH
   273  
   274  			// Issue -1 fixup code.
   275  			// n / -1 = -n
   276  			var n1 *obj.Prog
   277  			switch v.Op {
   278  			case ssa.OpAMD64DIVQ:
   279  				n1 = s.Prog(x86.ANEGQ)
   280  			case ssa.OpAMD64DIVL:
   281  				n1 = s.Prog(x86.ANEGL)
   282  			case ssa.OpAMD64DIVW:
   283  				n1 = s.Prog(x86.ANEGW)
   284  			}
   285  			n1.To.Type = obj.TYPE_REG
   286  			n1.To.Reg = x86.REG_AX
   287  
   288  			// n % -1 == 0
   289  			n2 := s.Prog(x86.AXORL)
   290  			n2.From.Type = obj.TYPE_REG
   291  			n2.From.Reg = x86.REG_DX
   292  			n2.To.Type = obj.TYPE_REG
   293  			n2.To.Reg = x86.REG_DX
   294  
   295  			// TODO(khr): issue only the -1 fixup code we need.
   296  			// For instance, if only the quotient is used, no point in zeroing the remainder.
   297  
   298  			j1.To.Val = n1
   299  			j2.To.Val = s.Pc()
   300  		}
   301  
   302  	case ssa.OpAMD64HMULQ, ssa.OpAMD64HMULL, ssa.OpAMD64HMULQU, ssa.OpAMD64HMULLU:
   303  		// The frontend rewrites constant division by 8/16/32-bit integers into
   304  		// HMUL by a constant.
   305  		// SSA rewrites generate the 64-bit versions.
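        		// (The rewrite uses the usual reciprocal "magic number" technique;
        		// see cmd/compile/internal/ssa/magic.go.)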
   306  
   307  		// Arg[0] is already in AX as it's the only register we allow
   308  		// and DX is the only output we care about (the high bits)
   309  		p := s.Prog(v.Op.Asm())
   310  		p.From.Type = obj.TYPE_REG
   311  		p.From.Reg = v.Args[1].Reg()
   312  
   313  		// IMULB puts the high portion in AH instead of DL,
   314  		// so move it to DL for consistency
   315  		if v.Type.Size() == 1 {
   316  			m := s.Prog(x86.AMOVB)
   317  			m.From.Type = obj.TYPE_REG
   318  			m.From.Reg = x86.REG_AH
   319  			m.To.Type = obj.TYPE_REG
   320  			m.To.Reg = x86.REG_DX
   321  		}
   322  
   323  	case ssa.OpAMD64MULQU, ssa.OpAMD64MULLU:
   324  		// Arg[0] is already in AX as it's the only register we allow
   325  		// results lo in AX
   326  		p := s.Prog(v.Op.Asm())
   327  		p.From.Type = obj.TYPE_REG
   328  		p.From.Reg = v.Args[1].Reg()
   329  
   330  	case ssa.OpAMD64MULQU2:
   331  		// Arg[0] is already in AX as it's the only register we allow
   332  		// results hi in DX, lo in AX
   333  		p := s.Prog(v.Op.Asm())
   334  		p.From.Type = obj.TYPE_REG
   335  		p.From.Reg = v.Args[1].Reg()
   336  
   337  	case ssa.OpAMD64DIVQU2:
   338  		// Arg[0], Arg[1] are already in DX, AX, as they're the only registers we allow
   339  		// results q in AX, r in DX
   340  		p := s.Prog(v.Op.Asm())
   341  		p.From.Type = obj.TYPE_REG
   342  		p.From.Reg = v.Args[2].Reg()
   343  
   344  	case ssa.OpAMD64AVGQU:
   345  		// compute (x+y)/2 unsigned.
   346  		// Do a 64-bit add, the overflow goes into the carry.
   347  		// Shift right once and pull the carry back into the 63rd bit.
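        		// For example, with x = y = 1<<63: ADDQ leaves 0 with the carry set,
        		// and RCRQ $1 rotates the carry back in, giving the correct 1<<63.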
   348  		r := v.Reg()
   349  		if r != v.Args[0].Reg() {
   350  			v.Fatalf("input[0] and output not in same register %s", v.LongString())
   351  		}
   352  		p := s.Prog(x86.AADDQ)
   353  		p.From.Type = obj.TYPE_REG
   354  		p.To.Type = obj.TYPE_REG
   355  		p.To.Reg = r
   356  		p.From.Reg = v.Args[1].Reg()
   357  		p = s.Prog(x86.ARCRQ)
   358  		p.From.Type = obj.TYPE_CONST
   359  		p.From.Offset = 1
   360  		p.To.Type = obj.TYPE_REG
   361  		p.To.Reg = r
   362  
   363  	case ssa.OpAMD64ADDQcarry, ssa.OpAMD64ADCQ:
   364  		r := v.Reg0()
   365  		r0 := v.Args[0].Reg()
   366  		r1 := v.Args[1].Reg()
   367  		switch r {
   368  		case r0:
   369  			p := s.Prog(v.Op.Asm())
   370  			p.From.Type = obj.TYPE_REG
   371  			p.From.Reg = r1
   372  			p.To.Type = obj.TYPE_REG
   373  			p.To.Reg = r
   374  		case r1:
   375  			p := s.Prog(v.Op.Asm())
   376  			p.From.Type = obj.TYPE_REG
   377  			p.From.Reg = r0
   378  			p.To.Type = obj.TYPE_REG
   379  			p.To.Reg = r
   380  		default:
   381  			v.Fatalf("output not in same register as an input %s", v.LongString())
   382  		}
   383  
   384  	case ssa.OpAMD64SUBQborrow, ssa.OpAMD64SBBQ:
   385  		p := s.Prog(v.Op.Asm())
   386  		p.From.Type = obj.TYPE_REG
   387  		p.From.Reg = v.Args[1].Reg()
   388  		p.To.Type = obj.TYPE_REG
   389  		p.To.Reg = v.Reg0()
   390  
   391  	case ssa.OpAMD64ADDQconstcarry, ssa.OpAMD64ADCQconst, ssa.OpAMD64SUBQconstborrow, ssa.OpAMD64SBBQconst:
   392  		p := s.Prog(v.Op.Asm())
   393  		p.From.Type = obj.TYPE_CONST
   394  		p.From.Offset = v.AuxInt
   395  		p.To.Type = obj.TYPE_REG
   396  		p.To.Reg = v.Reg0()
   397  
   398  	case ssa.OpAMD64ADDQconst, ssa.OpAMD64ADDLconst:
   399  		r := v.Reg()
   400  		a := v.Args[0].Reg()
   401  		if r == a {
   402  			if v.AuxInt == 1 {
   403  				var asm obj.As
   404  				// Software optimization manual recommends add $1,reg.
   405  				// But inc/dec is 1 byte smaller. ICC always uses inc;
   406  				// Clang/GCC choose depending on flags, but prefer add.
   407  				// Experiments show that inc/dec is a little faster
   408  				// and also makes binaries a little smaller.
   409  				if v.Op == ssa.OpAMD64ADDQconst {
   410  					asm = x86.AINCQ
   411  				} else {
   412  					asm = x86.AINCL
   413  				}
   414  				p := s.Prog(asm)
   415  				p.To.Type = obj.TYPE_REG
   416  				p.To.Reg = r
   417  				return
   418  			}
   419  			if v.AuxInt == -1 {
   420  				var asm obj.As
   421  				if v.Op == ssa.OpAMD64ADDQconst {
   422  					asm = x86.ADECQ
   423  				} else {
   424  					asm = x86.ADECL
   425  				}
   426  				p := s.Prog(asm)
   427  				p.To.Type = obj.TYPE_REG
   428  				p.To.Reg = r
   429  				return
   430  			}
   431  			p := s.Prog(v.Op.Asm())
   432  			p.From.Type = obj.TYPE_CONST
   433  			p.From.Offset = v.AuxInt
   434  			p.To.Type = obj.TYPE_REG
   435  			p.To.Reg = r
   436  			return
   437  		}
   438  		var asm obj.As
   439  		if v.Op == ssa.OpAMD64ADDQconst {
   440  			asm = x86.ALEAQ
   441  		} else {
   442  			asm = x86.ALEAL
   443  		}
   444  		p := s.Prog(asm)
   445  		p.From.Type = obj.TYPE_MEM
   446  		p.From.Reg = a
   447  		p.From.Offset = v.AuxInt
   448  		p.To.Type = obj.TYPE_REG
   449  		p.To.Reg = r
   450  
   451  	case ssa.OpAMD64CMOVQEQ, ssa.OpAMD64CMOVLEQ, ssa.OpAMD64CMOVWEQ,
   452  		ssa.OpAMD64CMOVQLT, ssa.OpAMD64CMOVLLT, ssa.OpAMD64CMOVWLT,
   453  		ssa.OpAMD64CMOVQNE, ssa.OpAMD64CMOVLNE, ssa.OpAMD64CMOVWNE,
   454  		ssa.OpAMD64CMOVQGT, ssa.OpAMD64CMOVLGT, ssa.OpAMD64CMOVWGT,
   455  		ssa.OpAMD64CMOVQLE, ssa.OpAMD64CMOVLLE, ssa.OpAMD64CMOVWLE,
   456  		ssa.OpAMD64CMOVQGE, ssa.OpAMD64CMOVLGE, ssa.OpAMD64CMOVWGE,
   457  		ssa.OpAMD64CMOVQHI, ssa.OpAMD64CMOVLHI, ssa.OpAMD64CMOVWHI,
   458  		ssa.OpAMD64CMOVQLS, ssa.OpAMD64CMOVLLS, ssa.OpAMD64CMOVWLS,
   459  		ssa.OpAMD64CMOVQCC, ssa.OpAMD64CMOVLCC, ssa.OpAMD64CMOVWCC,
   460  		ssa.OpAMD64CMOVQCS, ssa.OpAMD64CMOVLCS, ssa.OpAMD64CMOVWCS,
   461  		ssa.OpAMD64CMOVQGTF, ssa.OpAMD64CMOVLGTF, ssa.OpAMD64CMOVWGTF,
   462  		ssa.OpAMD64CMOVQGEF, ssa.OpAMD64CMOVLGEF, ssa.OpAMD64CMOVWGEF:
   463  		r := v.Reg()
   464  		if r != v.Args[0].Reg() {
   465  			v.Fatalf("input[0] and output not in same register %s", v.LongString())
   466  		}
   467  		p := s.Prog(v.Op.Asm())
   468  		p.From.Type = obj.TYPE_REG
   469  		p.From.Reg = v.Args[1].Reg()
   470  		p.To.Type = obj.TYPE_REG
   471  		p.To.Reg = r
   472  
   473  	case ssa.OpAMD64CMOVQNEF, ssa.OpAMD64CMOVLNEF, ssa.OpAMD64CMOVWNEF:
   474  		r := v.Reg()
   475  		if r != v.Args[0].Reg() {
   476  			v.Fatalf("input[0] and output not in same register %s", v.LongString())
   477  		}
   478  		// Flag condition: ^ZERO || PARITY
   479  		// Generate:
   480  		//   CMOV*NE  SRC,DST
   481  		//   CMOV*PS  SRC,DST
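        		// UCOMISx reports unordered (NaN) operands as ZF=PF=CF=1, and there is
        		// no single condition code for "not equal or unordered", so both a
        		// CMOV*NE and a CMOV*PS are needed to make x != y true for NaNs.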
   482  		p := s.Prog(v.Op.Asm())
   483  		p.From.Type = obj.TYPE_REG
   484  		p.From.Reg = v.Args[1].Reg()
   485  		p.To.Type = obj.TYPE_REG
   486  		p.To.Reg = r
   487  		var q *obj.Prog
   488  		if v.Op == ssa.OpAMD64CMOVQNEF {
   489  			q = s.Prog(x86.ACMOVQPS)
   490  		} else if v.Op == ssa.OpAMD64CMOVLNEF {
   491  			q = s.Prog(x86.ACMOVLPS)
   492  		} else {
   493  			q = s.Prog(x86.ACMOVWPS)
   494  		}
   495  		q.From.Type = obj.TYPE_REG
   496  		q.From.Reg = v.Args[1].Reg()
   497  		q.To.Type = obj.TYPE_REG
   498  		q.To.Reg = r
   499  
   500  	case ssa.OpAMD64CMOVQEQF, ssa.OpAMD64CMOVLEQF, ssa.OpAMD64CMOVWEQF:
   501  		r := v.Reg()
   502  		if r != v.Args[0].Reg() {
   503  			v.Fatalf("input[0] and output not in same register %s", v.LongString())
   504  		}
   505  
   506  		// Flag condition: ZERO && !PARITY
   507  		// Generate:
   508  		//   MOV      SRC,AX
   509  		//   CMOV*NE  DST,AX
   510  		//   CMOV*PC  AX,DST
   511  		//
   512  		// TODO(rasky): we could generate:
   513  		//   CMOV*NE  DST,SRC
   514  		//   CMOV*PC  SRC,DST
   515  		// But this requires a way for regalloc to know that SRC might be
   516  		// clobbered by this instruction.
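        		// A lone CMOV*EQ would also move on unordered (NaN) operands, since
        		// unordered comparisons set ZF; routing through AX with CMOV*NE and
        		// CMOV*PC selects SRC only when ZF==1 && PF==0, i.e. truly equal.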
   517  		if v.Args[1].Reg() != x86.REG_AX {
   518  			opregreg(s, moveByType(v.Type), x86.REG_AX, v.Args[1].Reg())
   519  		}
   520  		p := s.Prog(v.Op.Asm())
   521  		p.From.Type = obj.TYPE_REG
   522  		p.From.Reg = r
   523  		p.To.Type = obj.TYPE_REG
   524  		p.To.Reg = x86.REG_AX
   525  		var q *obj.Prog
   526  		if v.Op == ssa.OpAMD64CMOVQEQF {
   527  			q = s.Prog(x86.ACMOVQPC)
   528  		} else if v.Op == ssa.OpAMD64CMOVLEQF {
   529  			q = s.Prog(x86.ACMOVLPC)
   530  		} else {
   531  			q = s.Prog(x86.ACMOVWPC)
   532  		}
   533  		q.From.Type = obj.TYPE_REG
   534  		q.From.Reg = x86.REG_AX
   535  		q.To.Type = obj.TYPE_REG
   536  		q.To.Reg = r
   537  
   538  	case ssa.OpAMD64MULQconst, ssa.OpAMD64MULLconst:
   539  		r := v.Reg()
   540  		p := s.Prog(v.Op.Asm())
   541  		p.From.Type = obj.TYPE_CONST
   542  		p.From.Offset = v.AuxInt
   543  		p.To.Type = obj.TYPE_REG
   544  		p.To.Reg = r
   545  		p.SetFrom3(obj.Addr{Type: obj.TYPE_REG, Reg: v.Args[0].Reg()})
   546  
   547  	case ssa.OpAMD64SUBQconst, ssa.OpAMD64SUBLconst,
   548  		ssa.OpAMD64ANDQconst, ssa.OpAMD64ANDLconst,
   549  		ssa.OpAMD64ORQconst, ssa.OpAMD64ORLconst,
   550  		ssa.OpAMD64XORQconst, ssa.OpAMD64XORLconst,
   551  		ssa.OpAMD64SHLQconst, ssa.OpAMD64SHLLconst,
   552  		ssa.OpAMD64SHRQconst, ssa.OpAMD64SHRLconst, ssa.OpAMD64SHRWconst, ssa.OpAMD64SHRBconst,
   553  		ssa.OpAMD64SARQconst, ssa.OpAMD64SARLconst, ssa.OpAMD64SARWconst, ssa.OpAMD64SARBconst,
   554  		ssa.OpAMD64ROLQconst, ssa.OpAMD64ROLLconst, ssa.OpAMD64ROLWconst, ssa.OpAMD64ROLBconst:
   555  		r := v.Reg()
   556  		if r != v.Args[0].Reg() {
   557  			v.Fatalf("input[0] and output not in same register %s", v.LongString())
   558  		}
   559  		p := s.Prog(v.Op.Asm())
   560  		p.From.Type = obj.TYPE_CONST
   561  		p.From.Offset = v.AuxInt
   562  		p.To.Type = obj.TYPE_REG
   563  		p.To.Reg = r
   564  	case ssa.OpAMD64SBBQcarrymask, ssa.OpAMD64SBBLcarrymask:
   565  		r := v.Reg()
   566  		p := s.Prog(v.Op.Asm())
   567  		p.From.Type = obj.TYPE_REG
   568  		p.From.Reg = r
   569  		p.To.Type = obj.TYPE_REG
   570  		p.To.Reg = r
   571  	case ssa.OpAMD64LEAQ1, ssa.OpAMD64LEAQ2, ssa.OpAMD64LEAQ4, ssa.OpAMD64LEAQ8,
   572  		ssa.OpAMD64LEAL1, ssa.OpAMD64LEAL2, ssa.OpAMD64LEAL4, ssa.OpAMD64LEAL8,
   573  		ssa.OpAMD64LEAW1, ssa.OpAMD64LEAW2, ssa.OpAMD64LEAW4, ssa.OpAMD64LEAW8:
   574  		o := v.Reg()
   575  		r := v.Args[0].Reg()
   576  		i := v.Args[1].Reg()
   577  		p := s.Prog(v.Op.Asm())
   578  		switch v.Op {
   579  		case ssa.OpAMD64LEAQ1, ssa.OpAMD64LEAL1, ssa.OpAMD64LEAW1:
   580  			p.From.Scale = 1
   581  			if i == x86.REG_SP {
   582  				r, i = i, r
   583  			}
   584  		case ssa.OpAMD64LEAQ2, ssa.OpAMD64LEAL2, ssa.OpAMD64LEAW2:
   585  			p.From.Scale = 2
   586  		case ssa.OpAMD64LEAQ4, ssa.OpAMD64LEAL4, ssa.OpAMD64LEAW4:
   587  			p.From.Scale = 4
   588  		case ssa.OpAMD64LEAQ8, ssa.OpAMD64LEAL8, ssa.OpAMD64LEAW8:
   589  			p.From.Scale = 8
   590  		}
   591  		p.From.Type = obj.TYPE_MEM
   592  		p.From.Reg = r
   593  		p.From.Index = i
   594  		p.To.Type = obj.TYPE_REG
   595  		p.To.Reg = o
   596  		if v.AuxInt != 0 && v.Aux == nil {
   597  			// Emit an additional LEA to add the displacement instead of creating a slow 3-operand LEA.
   598  			switch v.Op {
   599  			case ssa.OpAMD64LEAQ1, ssa.OpAMD64LEAQ2, ssa.OpAMD64LEAQ4, ssa.OpAMD64LEAQ8:
   600  				p = s.Prog(x86.ALEAQ)
   601  			case ssa.OpAMD64LEAL1, ssa.OpAMD64LEAL2, ssa.OpAMD64LEAL4, ssa.OpAMD64LEAL8:
   602  				p = s.Prog(x86.ALEAL)
   603  			case ssa.OpAMD64LEAW1, ssa.OpAMD64LEAW2, ssa.OpAMD64LEAW4, ssa.OpAMD64LEAW8:
   604  				p = s.Prog(x86.ALEAW)
   605  			}
   606  			p.From.Type = obj.TYPE_MEM
   607  			p.From.Reg = o
   608  			p.To.Type = obj.TYPE_REG
   609  			p.To.Reg = o
   610  		}
   611  		gc.AddAux(&p.From, v)
   612  	case ssa.OpAMD64LEAQ, ssa.OpAMD64LEAL, ssa.OpAMD64LEAW:
   613  		p := s.Prog(v.Op.Asm())
   614  		p.From.Type = obj.TYPE_MEM
   615  		p.From.Reg = v.Args[0].Reg()
   616  		gc.AddAux(&p.From, v)
   617  		p.To.Type = obj.TYPE_REG
   618  		p.To.Reg = v.Reg()
   619  	case ssa.OpAMD64CMPQ, ssa.OpAMD64CMPL, ssa.OpAMD64CMPW, ssa.OpAMD64CMPB,
   620  		ssa.OpAMD64TESTQ, ssa.OpAMD64TESTL, ssa.OpAMD64TESTW, ssa.OpAMD64TESTB,
   621  		ssa.OpAMD64BTL, ssa.OpAMD64BTQ:
   622  		opregreg(s, v.Op.Asm(), v.Args[1].Reg(), v.Args[0].Reg())
   623  	case ssa.OpAMD64UCOMISS, ssa.OpAMD64UCOMISD:
   624  		// The Go assembler has swapped operands for UCOMISx relative to CMP,
   625  		// so account for that right here.
   626  		opregreg(s, v.Op.Asm(), v.Args[0].Reg(), v.Args[1].Reg())
   627  	case ssa.OpAMD64CMPQconst, ssa.OpAMD64CMPLconst, ssa.OpAMD64CMPWconst, ssa.OpAMD64CMPBconst:
   628  		p := s.Prog(v.Op.Asm())
   629  		p.From.Type = obj.TYPE_REG
   630  		p.From.Reg = v.Args[0].Reg()
   631  		p.To.Type = obj.TYPE_CONST
   632  		p.To.Offset = v.AuxInt
   633  	case ssa.OpAMD64BTLconst, ssa.OpAMD64BTQconst,
   634  		ssa.OpAMD64TESTQconst, ssa.OpAMD64TESTLconst, ssa.OpAMD64TESTWconst, ssa.OpAMD64TESTBconst,
   635  		ssa.OpAMD64BTSLconst, ssa.OpAMD64BTSQconst,
   636  		ssa.OpAMD64BTCLconst, ssa.OpAMD64BTCQconst,
   637  		ssa.OpAMD64BTRLconst, ssa.OpAMD64BTRQconst:
   638  		op := v.Op
   639  		if op == ssa.OpAMD64BTQconst && v.AuxInt < 32 {
   640  			// Emit 32-bit version because it's shorter
   641  			op = ssa.OpAMD64BTLconst
   642  		}
   643  		p := s.Prog(op.Asm())
   644  		p.From.Type = obj.TYPE_CONST
   645  		p.From.Offset = v.AuxInt
   646  		p.To.Type = obj.TYPE_REG
   647  		p.To.Reg = v.Args[0].Reg()
   648  	case ssa.OpAMD64CMPQload, ssa.OpAMD64CMPLload, ssa.OpAMD64CMPWload, ssa.OpAMD64CMPBload:
   649  		p := s.Prog(v.Op.Asm())
   650  		p.From.Type = obj.TYPE_MEM
   651  		p.From.Reg = v.Args[0].Reg()
   652  		gc.AddAux(&p.From, v)
   653  		p.To.Type = obj.TYPE_REG
   654  		p.To.Reg = v.Args[1].Reg()
   655  	case ssa.OpAMD64CMPQconstload, ssa.OpAMD64CMPLconstload, ssa.OpAMD64CMPWconstload, ssa.OpAMD64CMPBconstload:
   656  		sc := v.AuxValAndOff()
   657  		p := s.Prog(v.Op.Asm())
   658  		p.From.Type = obj.TYPE_MEM
   659  		p.From.Reg = v.Args[0].Reg()
   660  		gc.AddAux2(&p.From, v, sc.Off())
   661  		p.To.Type = obj.TYPE_CONST
   662  		p.To.Offset = sc.Val()
   663  	case ssa.OpAMD64MOVLconst, ssa.OpAMD64MOVQconst:
   664  		x := v.Reg()
   665  
   666  		// If flags aren't live (indicated by v.Aux == nil),
   667  		// then we can rewrite MOV $0, AX into XOR AX, AX.
   668  		if v.AuxInt == 0 && v.Aux == nil {
   669  			p := s.Prog(x86.AXORL)
   670  			p.From.Type = obj.TYPE_REG
   671  			p.From.Reg = x
   672  			p.To.Type = obj.TYPE_REG
   673  			p.To.Reg = x
   674  			break
   675  		}
   676  
   677  		asm := v.Op.Asm()
   678  		// Use MOVL to move a small constant into a register
   679  		// when the constant is positive and fits into 32 bits.
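        		// (MOVL with a 32-bit immediate encodes in 5 bytes, versus 10 bytes
        		// for a MOVQ carrying a full 64-bit immediate.)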
   680  		if 0 <= v.AuxInt && v.AuxInt <= (1<<32-1) {
   681  			// The upper 32 bits are zeroed automatically when using MOVL.
   682  			asm = x86.AMOVL
   683  		}
   684  		p := s.Prog(asm)
   685  		p.From.Type = obj.TYPE_CONST
   686  		p.From.Offset = v.AuxInt
   687  		p.To.Type = obj.TYPE_REG
   688  		p.To.Reg = x
   689  	case ssa.OpAMD64MOVSSconst, ssa.OpAMD64MOVSDconst:
   690  		x := v.Reg()
   691  		p := s.Prog(v.Op.Asm())
   692  		p.From.Type = obj.TYPE_FCONST
   693  		p.From.Val = math.Float64frombits(uint64(v.AuxInt))
   694  		p.To.Type = obj.TYPE_REG
   695  		p.To.Reg = x
   696  	case ssa.OpAMD64MOVQload, ssa.OpAMD64MOVSSload, ssa.OpAMD64MOVSDload, ssa.OpAMD64MOVLload, ssa.OpAMD64MOVWload, ssa.OpAMD64MOVBload, ssa.OpAMD64MOVBQSXload, ssa.OpAMD64MOVWQSXload, ssa.OpAMD64MOVLQSXload, ssa.OpAMD64MOVOload:
   697  		p := s.Prog(v.Op.Asm())
   698  		p.From.Type = obj.TYPE_MEM
   699  		p.From.Reg = v.Args[0].Reg()
   700  		gc.AddAux(&p.From, v)
   701  		p.To.Type = obj.TYPE_REG
   702  		p.To.Reg = v.Reg()
   703  	case ssa.OpAMD64MOVBloadidx1, ssa.OpAMD64MOVWloadidx1, ssa.OpAMD64MOVLloadidx1, ssa.OpAMD64MOVQloadidx1, ssa.OpAMD64MOVSSloadidx1, ssa.OpAMD64MOVSDloadidx1,
   704  		ssa.OpAMD64MOVQloadidx8, ssa.OpAMD64MOVSDloadidx8, ssa.OpAMD64MOVLloadidx8, ssa.OpAMD64MOVLloadidx4, ssa.OpAMD64MOVSSloadidx4, ssa.OpAMD64MOVWloadidx2:
   705  		r := v.Args[0].Reg()
   706  		i := v.Args[1].Reg()
   707  		p := s.Prog(v.Op.Asm())
   708  		p.From.Type = obj.TYPE_MEM
   709  		switch v.Op {
   710  		case ssa.OpAMD64MOVBloadidx1, ssa.OpAMD64MOVWloadidx1, ssa.OpAMD64MOVLloadidx1, ssa.OpAMD64MOVQloadidx1, ssa.OpAMD64MOVSSloadidx1, ssa.OpAMD64MOVSDloadidx1:
   711  			if i == x86.REG_SP {
   712  				r, i = i, r
   713  			}
   714  			p.From.Scale = 1
   715  		case ssa.OpAMD64MOVQloadidx8, ssa.OpAMD64MOVSDloadidx8, ssa.OpAMD64MOVLloadidx8:
   716  			p.From.Scale = 8
   717  		case ssa.OpAMD64MOVLloadidx4, ssa.OpAMD64MOVSSloadidx4:
   718  			p.From.Scale = 4
   719  		case ssa.OpAMD64MOVWloadidx2:
   720  			p.From.Scale = 2
   721  		}
   722  		p.From.Reg = r
   723  		p.From.Index = i
   724  		gc.AddAux(&p.From, v)
   725  		p.To.Type = obj.TYPE_REG
   726  		p.To.Reg = v.Reg()
   727  	case ssa.OpAMD64MOVQstore, ssa.OpAMD64MOVSSstore, ssa.OpAMD64MOVSDstore, ssa.OpAMD64MOVLstore, ssa.OpAMD64MOVWstore, ssa.OpAMD64MOVBstore, ssa.OpAMD64MOVOstore,
   728  		ssa.OpAMD64BTCQmodify, ssa.OpAMD64BTCLmodify, ssa.OpAMD64BTRQmodify, ssa.OpAMD64BTRLmodify, ssa.OpAMD64BTSQmodify, ssa.OpAMD64BTSLmodify,
   729  		ssa.OpAMD64ADDQmodify, ssa.OpAMD64SUBQmodify, ssa.OpAMD64ANDQmodify, ssa.OpAMD64ORQmodify, ssa.OpAMD64XORQmodify,
   730  		ssa.OpAMD64ADDLmodify, ssa.OpAMD64SUBLmodify, ssa.OpAMD64ANDLmodify, ssa.OpAMD64ORLmodify, ssa.OpAMD64XORLmodify:
   731  		p := s.Prog(v.Op.Asm())
   732  		p.From.Type = obj.TYPE_REG
   733  		p.From.Reg = v.Args[1].Reg()
   734  		p.To.Type = obj.TYPE_MEM
   735  		p.To.Reg = v.Args[0].Reg()
   736  		gc.AddAux(&p.To, v)
   737  	case ssa.OpAMD64MOVBstoreidx1, ssa.OpAMD64MOVWstoreidx1, ssa.OpAMD64MOVLstoreidx1, ssa.OpAMD64MOVQstoreidx1, ssa.OpAMD64MOVSSstoreidx1, ssa.OpAMD64MOVSDstoreidx1,
   738  		ssa.OpAMD64MOVQstoreidx8, ssa.OpAMD64MOVSDstoreidx8, ssa.OpAMD64MOVLstoreidx8, ssa.OpAMD64MOVSSstoreidx4, ssa.OpAMD64MOVLstoreidx4, ssa.OpAMD64MOVWstoreidx2:
   739  		r := v.Args[0].Reg()
   740  		i := v.Args[1].Reg()
   741  		p := s.Prog(v.Op.Asm())
   742  		p.From.Type = obj.TYPE_REG
   743  		p.From.Reg = v.Args[2].Reg()
   744  		p.To.Type = obj.TYPE_MEM
   745  		switch v.Op {
   746  		case ssa.OpAMD64MOVBstoreidx1, ssa.OpAMD64MOVWstoreidx1, ssa.OpAMD64MOVLstoreidx1, ssa.OpAMD64MOVQstoreidx1, ssa.OpAMD64MOVSSstoreidx1, ssa.OpAMD64MOVSDstoreidx1:
   747  			if i == x86.REG_SP {
   748  				r, i = i, r
   749  			}
   750  			p.To.Scale = 1
   751  		case ssa.OpAMD64MOVQstoreidx8, ssa.OpAMD64MOVSDstoreidx8, ssa.OpAMD64MOVLstoreidx8:
   752  			p.To.Scale = 8
   753  		case ssa.OpAMD64MOVSSstoreidx4, ssa.OpAMD64MOVLstoreidx4:
   754  			p.To.Scale = 4
   755  		case ssa.OpAMD64MOVWstoreidx2:
   756  			p.To.Scale = 2
   757  		}
   758  		p.To.Reg = r
   759  		p.To.Index = i
   760  		gc.AddAux(&p.To, v)
   761  	case ssa.OpAMD64ADDQconstmodify, ssa.OpAMD64ADDLconstmodify:
   762  		sc := v.AuxValAndOff()
   763  		off := sc.Off()
   764  		val := sc.Val()
   765  		if val == 1 || val == -1 {
   766  			var asm obj.As
   767  			if v.Op == ssa.OpAMD64ADDQconstmodify {
   768  				if val == 1 {
   769  					asm = x86.AINCQ
   770  				} else {
   771  					asm = x86.ADECQ
   772  				}
   773  			} else {
   774  				if val == 1 {
   775  					asm = x86.AINCL
   776  				} else {
   777  					asm = x86.ADECL
   778  				}
   779  			}
   780  			p := s.Prog(asm)
   781  			p.To.Type = obj.TYPE_MEM
   782  			p.To.Reg = v.Args[0].Reg()
   783  			gc.AddAux2(&p.To, v, off)
   784  			break
   785  		}
   786  		fallthrough
   787  	case ssa.OpAMD64ANDQconstmodify, ssa.OpAMD64ANDLconstmodify, ssa.OpAMD64ORQconstmodify, ssa.OpAMD64ORLconstmodify,
   788  		ssa.OpAMD64BTCQconstmodify, ssa.OpAMD64BTCLconstmodify, ssa.OpAMD64BTSQconstmodify, ssa.OpAMD64BTSLconstmodify,
   789  		ssa.OpAMD64BTRQconstmodify, ssa.OpAMD64BTRLconstmodify, ssa.OpAMD64XORQconstmodify, ssa.OpAMD64XORLconstmodify:
   790  		sc := v.AuxValAndOff()
   791  		off := sc.Off()
   792  		val := sc.Val()
   793  		p := s.Prog(v.Op.Asm())
   794  		p.From.Type = obj.TYPE_CONST
   795  		p.From.Offset = val
   796  		p.To.Type = obj.TYPE_MEM
   797  		p.To.Reg = v.Args[0].Reg()
   798  		gc.AddAux2(&p.To, v, off)
   799  	case ssa.OpAMD64MOVQstoreconst, ssa.OpAMD64MOVLstoreconst, ssa.OpAMD64MOVWstoreconst, ssa.OpAMD64MOVBstoreconst:
   800  		p := s.Prog(v.Op.Asm())
   801  		p.From.Type = obj.TYPE_CONST
   802  		sc := v.AuxValAndOff()
   803  		p.From.Offset = sc.Val()
   804  		p.To.Type = obj.TYPE_MEM
   805  		p.To.Reg = v.Args[0].Reg()
   806  		gc.AddAux2(&p.To, v, sc.Off())
   807  	case ssa.OpAMD64MOVQstoreconstidx1, ssa.OpAMD64MOVQstoreconstidx8, ssa.OpAMD64MOVLstoreconstidx1, ssa.OpAMD64MOVLstoreconstidx4, ssa.OpAMD64MOVWstoreconstidx1, ssa.OpAMD64MOVWstoreconstidx2, ssa.OpAMD64MOVBstoreconstidx1:
   808  		p := s.Prog(v.Op.Asm())
   809  		p.From.Type = obj.TYPE_CONST
   810  		sc := v.AuxValAndOff()
   811  		p.From.Offset = sc.Val()
   812  		r := v.Args[0].Reg()
   813  		i := v.Args[1].Reg()
   814  		switch v.Op {
   815  		case ssa.OpAMD64MOVBstoreconstidx1, ssa.OpAMD64MOVWstoreconstidx1, ssa.OpAMD64MOVLstoreconstidx1, ssa.OpAMD64MOVQstoreconstidx1:
   816  			p.To.Scale = 1
   817  			if i == x86.REG_SP {
   818  				r, i = i, r
   819  			}
   820  		case ssa.OpAMD64MOVWstoreconstidx2:
   821  			p.To.Scale = 2
   822  		case ssa.OpAMD64MOVLstoreconstidx4:
   823  			p.To.Scale = 4
   824  		case ssa.OpAMD64MOVQstoreconstidx8:
   825  			p.To.Scale = 8
   826  		}
   827  		p.To.Type = obj.TYPE_MEM
   828  		p.To.Reg = r
   829  		p.To.Index = i
   830  		gc.AddAux2(&p.To, v, sc.Off())
   831  	case ssa.OpAMD64MOVLQSX, ssa.OpAMD64MOVWQSX, ssa.OpAMD64MOVBQSX, ssa.OpAMD64MOVLQZX, ssa.OpAMD64MOVWQZX, ssa.OpAMD64MOVBQZX,
   832  		ssa.OpAMD64CVTTSS2SL, ssa.OpAMD64CVTTSD2SL, ssa.OpAMD64CVTTSS2SQ, ssa.OpAMD64CVTTSD2SQ,
   833  		ssa.OpAMD64CVTSS2SD, ssa.OpAMD64CVTSD2SS:
   834  		opregreg(s, v.Op.Asm(), v.Reg(), v.Args[0].Reg())
   835  	case ssa.OpAMD64CVTSL2SD, ssa.OpAMD64CVTSQ2SD, ssa.OpAMD64CVTSQ2SS, ssa.OpAMD64CVTSL2SS:
   836  		r := v.Reg()
   837  		// Break false dependency on destination register.
   838  		opregreg(s, x86.AXORPS, r, r)
   839  		opregreg(s, v.Op.Asm(), r, v.Args[0].Reg())
   840  	case ssa.OpAMD64MOVQi2f, ssa.OpAMD64MOVQf2i, ssa.OpAMD64MOVLi2f, ssa.OpAMD64MOVLf2i:
   841  		var p *obj.Prog
   842  		switch v.Op {
   843  		case ssa.OpAMD64MOVQi2f, ssa.OpAMD64MOVQf2i:
   844  			p = s.Prog(x86.AMOVQ)
   845  		case ssa.OpAMD64MOVLi2f, ssa.OpAMD64MOVLf2i:
   846  			p = s.Prog(x86.AMOVL)
   847  		}
   848  		p.From.Type = obj.TYPE_REG
   849  		p.From.Reg = v.Args[0].Reg()
   850  		p.To.Type = obj.TYPE_REG
   851  		p.To.Reg = v.Reg()
   852  	case ssa.OpAMD64ADDQload, ssa.OpAMD64ADDLload, ssa.OpAMD64SUBQload, ssa.OpAMD64SUBLload,
   853  		ssa.OpAMD64ANDQload, ssa.OpAMD64ANDLload, ssa.OpAMD64ORQload, ssa.OpAMD64ORLload,
   854  		ssa.OpAMD64XORQload, ssa.OpAMD64XORLload, ssa.OpAMD64ADDSDload, ssa.OpAMD64ADDSSload,
   855  		ssa.OpAMD64SUBSDload, ssa.OpAMD64SUBSSload, ssa.OpAMD64MULSDload, ssa.OpAMD64MULSSload,
   856  		ssa.OpAMD64DIVSDload, ssa.OpAMD64DIVSSload:
   857  		p := s.Prog(v.Op.Asm())
   858  		p.From.Type = obj.TYPE_MEM
   859  		p.From.Reg = v.Args[1].Reg()
   860  		gc.AddAux(&p.From, v)
   861  		p.To.Type = obj.TYPE_REG
   862  		p.To.Reg = v.Reg()
   863  		if v.Reg() != v.Args[0].Reg() {
   864  			v.Fatalf("input[0] and output not in same register %s", v.LongString())
   865  		}
   866  	case ssa.OpAMD64DUFFZERO:
   867  		off := duffStart(v.AuxInt)
   868  		adj := duffAdj(v.AuxInt)
   869  		var p *obj.Prog
   870  		if adj != 0 {
   871  			p = s.Prog(x86.ALEAQ)
   872  			p.From.Type = obj.TYPE_MEM
   873  			p.From.Offset = adj
   874  			p.From.Reg = x86.REG_DI
   875  			p.To.Type = obj.TYPE_REG
   876  			p.To.Reg = x86.REG_DI
   877  		}
   878  		p = s.Prog(obj.ADUFFZERO)
   879  		p.To.Type = obj.TYPE_ADDR
   880  		p.To.Sym = gc.Duffzero
   881  		p.To.Offset = off
   882  	case ssa.OpAMD64MOVOconst:
   883  		if v.AuxInt != 0 {
   884  			v.Fatalf("MOVOconst can only do constant=0")
   885  		}
   886  		r := v.Reg()
   887  		opregreg(s, x86.AXORPS, r, r)
   888  	case ssa.OpAMD64DUFFCOPY:
   889  		p := s.Prog(obj.ADUFFCOPY)
   890  		p.To.Type = obj.TYPE_ADDR
   891  		p.To.Sym = gc.Duffcopy
   892  		p.To.Offset = v.AuxInt
   893  
   894  	case ssa.OpCopy: // TODO: use MOVQreg for reg->reg copies instead of OpCopy?
   895  		if v.Type.IsMemory() {
   896  			return
   897  		}
   898  		x := v.Args[0].Reg()
   899  		y := v.Reg()
   900  		if x != y {
   901  			opregreg(s, moveByType(v.Type), y, x)
   902  		}
   903  	case ssa.OpLoadReg:
   904  		if v.Type.IsFlags() {
   905  			v.Fatalf("load flags not implemented: %v", v.LongString())
   906  			return
   907  		}
   908  		p := s.Prog(loadByType(v.Type))
   909  		gc.AddrAuto(&p.From, v.Args[0])
   910  		p.To.Type = obj.TYPE_REG
   911  		p.To.Reg = v.Reg()
   912  
   913  	case ssa.OpStoreReg:
   914  		if v.Type.IsFlags() {
   915  			v.Fatalf("store flags not implemented: %v", v.LongString())
   916  			return
   917  		}
   918  		p := s.Prog(storeByType(v.Type))
   919  		p.From.Type = obj.TYPE_REG
   920  		p.From.Reg = v.Args[0].Reg()
   921  		gc.AddrAuto(&p.To, v)
   922  	case ssa.OpAMD64LoweredGetClosurePtr:
   923  		// Closure pointer is DX.
   924  		gc.CheckLoweredGetClosurePtr(v)
   925  	case ssa.OpAMD64LoweredGetG:
   926  		r := v.Reg()
   927  		// See the comments in cmd/internal/obj/x86/obj6.go
   928  		// near CanUse1InsnTLS for a detailed explanation of these instructions.
   929  		if x86.CanUse1InsnTLS(gc.Ctxt) {
   930  			// MOVQ (TLS), r
   931  			p := s.Prog(x86.AMOVQ)
   932  			p.From.Type = obj.TYPE_MEM
   933  			p.From.Reg = x86.REG_TLS
   934  			p.To.Type = obj.TYPE_REG
   935  			p.To.Reg = r
   936  		} else {
   937  			// MOVQ TLS, r
   938  			// MOVQ (r)(TLS*1), r
   939  			p := s.Prog(x86.AMOVQ)
   940  			p.From.Type = obj.TYPE_REG
   941  			p.From.Reg = x86.REG_TLS
   942  			p.To.Type = obj.TYPE_REG
   943  			p.To.Reg = r
   944  			q := s.Prog(x86.AMOVQ)
   945  			q.From.Type = obj.TYPE_MEM
   946  			q.From.Reg = r
   947  			q.From.Index = x86.REG_TLS
   948  			q.From.Scale = 1
   949  			q.To.Type = obj.TYPE_REG
   950  			q.To.Reg = r
   951  		}
   952  	case ssa.OpAMD64CALLstatic, ssa.OpAMD64CALLclosure, ssa.OpAMD64CALLinter:
   953  		s.Call(v)
   954  
   955  	case ssa.OpAMD64LoweredGetCallerPC:
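        		// The return address pushed by CALL sits immediately below the
        		// argument area (FixedFrameSize is 0 on amd64), so it can be read
        		// as a pseudo-parameter at offset -8.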
   956  		p := s.Prog(x86.AMOVQ)
   957  		p.From.Type = obj.TYPE_MEM
   958  		p.From.Offset = -8 // PC is stored 8 bytes below first parameter.
   959  		p.From.Name = obj.NAME_PARAM
   960  		p.To.Type = obj.TYPE_REG
   961  		p.To.Reg = v.Reg()
   962  
   963  	case ssa.OpAMD64LoweredGetCallerSP:
   964  		// caller's SP is the address of the first arg
   965  		mov := x86.AMOVQ
   966  		if gc.Widthptr == 4 {
   967  			mov = x86.AMOVL
   968  		}
   969  		p := s.Prog(mov)
   970  		p.From.Type = obj.TYPE_ADDR
   971  		p.From.Offset = -gc.Ctxt.FixedFrameSize() // 0 on amd64, just to be consistent with other architectures
   972  		p.From.Name = obj.NAME_PARAM
   973  		p.To.Type = obj.TYPE_REG
   974  		p.To.Reg = v.Reg()
   975  
   976  	case ssa.OpAMD64LoweredWB:
   977  		p := s.Prog(obj.ACALL)
   978  		p.To.Type = obj.TYPE_MEM
   979  		p.To.Name = obj.NAME_EXTERN
   980  		p.To.Sym = v.Aux.(*obj.LSym)
   981  
   982  	case ssa.OpAMD64NEGQ, ssa.OpAMD64NEGL,
   983  		ssa.OpAMD64BSWAPQ, ssa.OpAMD64BSWAPL,
   984  		ssa.OpAMD64NOTQ, ssa.OpAMD64NOTL:
   985  		r := v.Reg()
   986  		if r != v.Args[0].Reg() {
   987  			v.Fatalf("input[0] and output not in same register %s", v.LongString())
   988  		}
   989  		p := s.Prog(v.Op.Asm())
   990  		p.To.Type = obj.TYPE_REG
   991  		p.To.Reg = r
   992  
   993  	case ssa.OpAMD64NEGLflags:
   994  		r := v.Reg0()
   995  		if r != v.Args[0].Reg() {
   996  			v.Fatalf("input[0] and output not in same register %s", v.LongString())
   997  		}
   998  		p := s.Prog(v.Op.Asm())
   999  		p.To.Type = obj.TYPE_REG
  1000  		p.To.Reg = r
  1001  
  1002  	case ssa.OpAMD64BSFQ, ssa.OpAMD64BSRQ, ssa.OpAMD64BSFL, ssa.OpAMD64BSRL, ssa.OpAMD64SQRTSD:
  1003  		p := s.Prog(v.Op.Asm())
  1004  		p.From.Type = obj.TYPE_REG
  1005  		p.From.Reg = v.Args[0].Reg()
  1006  		p.To.Type = obj.TYPE_REG
  1007  		switch v.Op {
  1008  		case ssa.OpAMD64BSFQ, ssa.OpAMD64BSRQ:
  1009  			p.To.Reg = v.Reg0()
  1010  		case ssa.OpAMD64BSFL, ssa.OpAMD64BSRL, ssa.OpAMD64SQRTSD:
  1011  			p.To.Reg = v.Reg()
  1012  		}
  1013  	case ssa.OpAMD64ROUNDSD:
  1014  		p := s.Prog(v.Op.Asm())
  1015  		val := v.AuxInt
  1016  		// 0 means math.RoundToEven, 1 Floor, 2 Ceil, 3 Trunc
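        		// These values match the low two bits of the SSE4.1 ROUNDSD immediate:
        		// 0 = nearest even, 1 = toward -Inf, 2 = toward +Inf, 3 = toward zero.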
  1017  		if val != 0 && val != 1 && val != 2 && val != 3 {
  1018  			v.Fatalf("Invalid rounding mode")
  1019  		}
  1020  		p.From.Offset = val
  1021  		p.From.Type = obj.TYPE_CONST
  1022  		p.SetFrom3(obj.Addr{Type: obj.TYPE_REG, Reg: v.Args[0].Reg()})
  1023  		p.To.Type = obj.TYPE_REG
  1024  		p.To.Reg = v.Reg()
  1025  	case ssa.OpAMD64POPCNTQ, ssa.OpAMD64POPCNTL:
  1026  		if v.Args[0].Reg() != v.Reg() {
  1027  			// POPCNT on Intel has a false dependency on the destination register.
  1028  			// Xor register with itself to break the dependency.
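        			// XOR of a register with itself is a recognized zeroing idiom that
        			// starts a fresh dependency chain, so it costs almost nothing here.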
  1029  			p := s.Prog(x86.AXORQ)
  1030  			p.From.Type = obj.TYPE_REG
  1031  			p.From.Reg = v.Reg()
  1032  			p.To.Type = obj.TYPE_REG
  1033  			p.To.Reg = v.Reg()
  1034  		}
  1035  		p := s.Prog(v.Op.Asm())
  1036  		p.From.Type = obj.TYPE_REG
  1037  		p.From.Reg = v.Args[0].Reg()
  1038  		p.To.Type = obj.TYPE_REG
  1039  		p.To.Reg = v.Reg()
  1040  
  1041  	case ssa.OpAMD64SETEQ, ssa.OpAMD64SETNE,
  1042  		ssa.OpAMD64SETL, ssa.OpAMD64SETLE,
  1043  		ssa.OpAMD64SETG, ssa.OpAMD64SETGE,
  1044  		ssa.OpAMD64SETGF, ssa.OpAMD64SETGEF,
  1045  		ssa.OpAMD64SETB, ssa.OpAMD64SETBE,
  1046  		ssa.OpAMD64SETORD, ssa.OpAMD64SETNAN,
  1047  		ssa.OpAMD64SETA, ssa.OpAMD64SETAE,
  1048  		ssa.OpAMD64SETO:
  1049  		p := s.Prog(v.Op.Asm())
  1050  		p.To.Type = obj.TYPE_REG
  1051  		p.To.Reg = v.Reg()
  1052  
  1053  	case ssa.OpAMD64SETEQstore, ssa.OpAMD64SETNEstore,
  1054  		ssa.OpAMD64SETLstore, ssa.OpAMD64SETLEstore,
  1055  		ssa.OpAMD64SETGstore, ssa.OpAMD64SETGEstore,
  1056  		ssa.OpAMD64SETBstore, ssa.OpAMD64SETBEstore,
  1057  		ssa.OpAMD64SETAstore, ssa.OpAMD64SETAEstore:
  1058  		p := s.Prog(v.Op.Asm())
  1059  		p.To.Type = obj.TYPE_MEM
  1060  		p.To.Reg = v.Args[0].Reg()
  1061  		gc.AddAux(&p.To, v)
  1062  
  1063  	case ssa.OpAMD64SETNEF:
  1064  		p := s.Prog(v.Op.Asm())
  1065  		p.To.Type = obj.TYPE_REG
  1066  		p.To.Reg = v.Reg()
  1067  		q := s.Prog(x86.ASETPS)
  1068  		q.To.Type = obj.TYPE_REG
  1069  		q.To.Reg = x86.REG_AX
  1070  		// ORL avoids a partial register write and is smaller than the ORQ used by the old compiler.
  1071  		opregreg(s, x86.AORL, v.Reg(), x86.REG_AX)
  1072  
  1073  	case ssa.OpAMD64SETEQF:
  1074  		p := s.Prog(v.Op.Asm())
  1075  		p.To.Type = obj.TYPE_REG
  1076  		p.To.Reg = v.Reg()
  1077  		q := s.Prog(x86.ASETPC)
  1078  		q.To.Type = obj.TYPE_REG
  1079  		q.To.Reg = x86.REG_AX
  1080  		// ANDL avoids a partial register write and is smaller than the ANDQ used by the old compiler.
  1081  		opregreg(s, x86.AANDL, v.Reg(), x86.REG_AX)
  1082  
  1083  	case ssa.OpAMD64InvertFlags:
  1084  		v.Fatalf("InvertFlags should never make it to codegen %v", v.LongString())
  1085  	case ssa.OpAMD64FlagEQ, ssa.OpAMD64FlagLT_ULT, ssa.OpAMD64FlagLT_UGT, ssa.OpAMD64FlagGT_ULT, ssa.OpAMD64FlagGT_UGT:
  1086  		v.Fatalf("Flag* ops should never make it to codegen %v", v.LongString())
  1087  	case ssa.OpAMD64AddTupleFirst32, ssa.OpAMD64AddTupleFirst64:
  1088  		v.Fatalf("AddTupleFirst* should never make it to codegen %v", v.LongString())
  1089  	case ssa.OpAMD64REPSTOSQ:
  1090  		s.Prog(x86.AREP)
  1091  		s.Prog(x86.ASTOSQ)
  1092  	case ssa.OpAMD64REPMOVSQ:
  1093  		s.Prog(x86.AREP)
  1094  		s.Prog(x86.AMOVSQ)
  1095  	case ssa.OpAMD64LoweredNilCheck:
  1096  		// Issue a load which will fault if the input is nil.
  1097  		// TODO: We currently use the 2-byte instruction TESTB AX, (reg).
  1098  		// Should we use the 3-byte TESTB $0, (reg) instead? It is larger
  1099  		// but it doesn't have false dependency on AX.
  1100  		// Or maybe allocate an output register and use MOVL (reg),reg2 ?
  1101  		// That trades clobbering flags for clobbering a register.
  1102  		p := s.Prog(x86.ATESTB)
  1103  		p.From.Type = obj.TYPE_REG
  1104  		p.From.Reg = x86.REG_AX
  1105  		p.To.Type = obj.TYPE_MEM
  1106  		p.To.Reg = v.Args[0].Reg()
  1107  		gc.AddAux(&p.To, v)
  1108  		if gc.Debug_checknil != 0 && v.Pos.Line() > 1 { // v.Pos.Line()==1 in generated wrappers
  1109  			gc.Warnl(v.Pos, "generated nil check")
  1110  		}
  1111  	case ssa.OpAMD64MOVLatomicload, ssa.OpAMD64MOVQatomicload:
  1112  		p := s.Prog(v.Op.Asm())
  1113  		p.From.Type = obj.TYPE_MEM
  1114  		p.From.Reg = v.Args[0].Reg()
  1115  		gc.AddAux(&p.From, v)
  1116  		p.To.Type = obj.TYPE_REG
  1117  		p.To.Reg = v.Reg0()
  1118  	case ssa.OpAMD64XCHGL, ssa.OpAMD64XCHGQ:
  1119  		r := v.Reg0()
  1120  		if r != v.Args[0].Reg() {
  1121  			v.Fatalf("input[0] and output[0] not in same register %s", v.LongString())
  1122  		}
  1123  		p := s.Prog(v.Op.Asm())
  1124  		p.From.Type = obj.TYPE_REG
  1125  		p.From.Reg = r
  1126  		p.To.Type = obj.TYPE_MEM
  1127  		p.To.Reg = v.Args[1].Reg()
  1128  		gc.AddAux(&p.To, v)
  1129  	case ssa.OpAMD64XADDLlock, ssa.OpAMD64XADDQlock:
  1130  		r := v.Reg0()
  1131  		if r != v.Args[0].Reg() {
  1132  			v.Fatalf("input[0] and output[0] not in same register %s", v.LongString())
  1133  		}
  1134  		s.Prog(x86.ALOCK)
  1135  		p := s.Prog(v.Op.Asm())
  1136  		p.From.Type = obj.TYPE_REG
  1137  		p.From.Reg = r
  1138  		p.To.Type = obj.TYPE_MEM
  1139  		p.To.Reg = v.Args[1].Reg()
  1140  		gc.AddAux(&p.To, v)
  1141  	case ssa.OpAMD64CMPXCHGLlock, ssa.OpAMD64CMPXCHGQlock:
  1142  		if v.Args[1].Reg() != x86.REG_AX {
  1143  			v.Fatalf("input[1] not in AX %s", v.LongString())
  1144  		}
  1145  		s.Prog(x86.ALOCK)
  1146  		p := s.Prog(v.Op.Asm())
  1147  		p.From.Type = obj.TYPE_REG
  1148  		p.From.Reg = v.Args[2].Reg()
  1149  		p.To.Type = obj.TYPE_MEM
  1150  		p.To.Reg = v.Args[0].Reg()
  1151  		gc.AddAux(&p.To, v)
  1152  		p = s.Prog(x86.ASETEQ)
  1153  		p.To.Type = obj.TYPE_REG
  1154  		p.To.Reg = v.Reg0()
  1155  	case ssa.OpAMD64ANDBlock, ssa.OpAMD64ORBlock:
  1156  		s.Prog(x86.ALOCK)
  1157  		p := s.Prog(v.Op.Asm())
  1158  		p.From.Type = obj.TYPE_REG
  1159  		p.From.Reg = v.Args[1].Reg()
  1160  		p.To.Type = obj.TYPE_MEM
  1161  		p.To.Reg = v.Args[0].Reg()
  1162  		gc.AddAux(&p.To, v)
  1163  	case ssa.OpClobber:
  1164  		p := s.Prog(x86.AMOVL)
  1165  		p.From.Type = obj.TYPE_CONST
  1166  		p.From.Offset = 0xdeaddead
  1167  		p.To.Type = obj.TYPE_MEM
  1168  		p.To.Reg = x86.REG_SP
  1169  		gc.AddAux(&p.To, v)
  1170  		p = s.Prog(x86.AMOVL)
  1171  		p.From.Type = obj.TYPE_CONST
  1172  		p.From.Offset = 0xdeaddead
  1173  		p.To.Type = obj.TYPE_MEM
  1174  		p.To.Reg = x86.REG_SP
  1175  		gc.AddAux(&p.To, v)
  1176  		p.To.Offset += 4
  1177  	default:
  1178  		v.Fatalf("genValue not implemented: %s", v.LongString())
  1179  	}
  1180  }
  1181  
  1182  var blockJump = [...]struct {
  1183  	asm, invasm obj.As
  1184  }{
  1185  	ssa.BlockAMD64EQ:  {x86.AJEQ, x86.AJNE},
  1186  	ssa.BlockAMD64NE:  {x86.AJNE, x86.AJEQ},
  1187  	ssa.BlockAMD64LT:  {x86.AJLT, x86.AJGE},
  1188  	ssa.BlockAMD64GE:  {x86.AJGE, x86.AJLT},
  1189  	ssa.BlockAMD64LE:  {x86.AJLE, x86.AJGT},
  1190  	ssa.BlockAMD64GT:  {x86.AJGT, x86.AJLE},
  1191  	ssa.BlockAMD64OS:  {x86.AJOS, x86.AJOC},
  1192  	ssa.BlockAMD64OC:  {x86.AJOC, x86.AJOS},
  1193  	ssa.BlockAMD64ULT: {x86.AJCS, x86.AJCC},
  1194  	ssa.BlockAMD64UGE: {x86.AJCC, x86.AJCS},
  1195  	ssa.BlockAMD64UGT: {x86.AJHI, x86.AJLS},
  1196  	ssa.BlockAMD64ULE: {x86.AJLS, x86.AJHI},
  1197  	ssa.BlockAMD64ORD: {x86.AJPC, x86.AJPS},
  1198  	ssa.BlockAMD64NAN: {x86.AJPS, x86.AJPC},
  1199  }
  1200  
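        // eqfJumps and nefJumps describe the jump pairs FPJump emits for
        // floating-point equality blocks: after UCOMISx, "equal" is ZF==1 && PF==0
        // and "not equal" is ZF==0 || PF==1, so each outcome needs an extra parity
        // jump. The row is selected by whichever successor is the fallthrough.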
  1201  var eqfJumps = [2][2]gc.FloatingEQNEJump{
  1202  	{{Jump: x86.AJNE, Index: 1}, {Jump: x86.AJPS, Index: 1}}, // next == b.Succs[0]
  1203  	{{Jump: x86.AJNE, Index: 1}, {Jump: x86.AJPC, Index: 0}}, // next == b.Succs[1]
  1204  }
  1205  var nefJumps = [2][2]gc.FloatingEQNEJump{
  1206  	{{Jump: x86.AJNE, Index: 0}, {Jump: x86.AJPC, Index: 1}}, // next == b.Succs[0]
  1207  	{{Jump: x86.AJNE, Index: 0}, {Jump: x86.AJPS, Index: 0}}, // next == b.Succs[1]
  1208  }
  1209  
  1210  func ssaGenBlock(s *gc.SSAGenState, b, next *ssa.Block) {
  1211  	switch b.Kind {
  1212  	case ssa.BlockPlain:
  1213  		if b.Succs[0].Block() != next {
  1214  			p := s.Prog(obj.AJMP)
  1215  			p.To.Type = obj.TYPE_BRANCH
  1216  			s.Branches = append(s.Branches, gc.Branch{P: p, B: b.Succs[0].Block()})
  1217  		}
  1218  	case ssa.BlockDefer:
  1219  		// defer returns in rax:
  1220  		// 0 if we should continue executing
  1221  		// 1 if we should jump to deferreturn call
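        		// TESTL AX, AX sets ZF only when AX == 0, so the JNE below branches
        		// to the deferreturn successor exactly when AX is nonzero.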
  1222  		p := s.Prog(x86.ATESTL)
  1223  		p.From.Type = obj.TYPE_REG
  1224  		p.From.Reg = x86.REG_AX
  1225  		p.To.Type = obj.TYPE_REG
  1226  		p.To.Reg = x86.REG_AX
  1227  		p = s.Prog(x86.AJNE)
  1228  		p.To.Type = obj.TYPE_BRANCH
  1229  		s.Branches = append(s.Branches, gc.Branch{P: p, B: b.Succs[1].Block()})
  1230  		if b.Succs[0].Block() != next {
  1231  			p := s.Prog(obj.AJMP)
  1232  			p.To.Type = obj.TYPE_BRANCH
  1233  			s.Branches = append(s.Branches, gc.Branch{P: p, B: b.Succs[0].Block()})
  1234  		}
  1235  	case ssa.BlockExit:
  1236  		s.Prog(obj.AUNDEF) // tell plive.go that we never reach here
  1237  	case ssa.BlockRet:
  1238  		s.Prog(obj.ARET)
  1239  	case ssa.BlockRetJmp:
  1240  		p := s.Prog(obj.ARET)
  1241  		p.To.Type = obj.TYPE_MEM
  1242  		p.To.Name = obj.NAME_EXTERN
  1243  		p.To.Sym = b.Aux.(*obj.LSym)
  1244  
  1245  	case ssa.BlockAMD64EQF:
  1246  		s.FPJump(b, next, &eqfJumps)
  1247  
  1248  	case ssa.BlockAMD64NEF:
  1249  		s.FPJump(b, next, &nefJumps)
  1250  
  1251  	case ssa.BlockAMD64EQ, ssa.BlockAMD64NE,
  1252  		ssa.BlockAMD64LT, ssa.BlockAMD64GE,
  1253  		ssa.BlockAMD64LE, ssa.BlockAMD64GT,
  1254  		ssa.BlockAMD64OS, ssa.BlockAMD64OC,
  1255  		ssa.BlockAMD64ULT, ssa.BlockAMD64UGT,
  1256  		ssa.BlockAMD64ULE, ssa.BlockAMD64UGE:
  1257  		jmp := blockJump[b.Kind]
  1258  		switch next {
  1259  		case b.Succs[0].Block():
  1260  			s.Br(jmp.invasm, b.Succs[1].Block())
  1261  		case b.Succs[1].Block():
  1262  			s.Br(jmp.asm, b.Succs[0].Block())
  1263  		default:
  1264  			if b.Likely != ssa.BranchUnlikely {
  1265  				s.Br(jmp.asm, b.Succs[0].Block())
  1266  				s.Br(obj.AJMP, b.Succs[1].Block())
  1267  			} else {
  1268  				s.Br(jmp.invasm, b.Succs[1].Block())
  1269  				s.Br(obj.AJMP, b.Succs[0].Block())
  1270  			}
  1271  		}
  1272  
  1273  	default:
  1274  		b.Fatalf("branch not implemented: %s. Control: %s", b.LongString(), b.Control.LongString())
  1275  	}
  1276  }