github.com/mh-cbon/go@v0.0.0-20160603070303-9e112a3fe4c0/src/cmd/compile/internal/amd64/ggen.go

// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package amd64

import (
	"cmd/compile/internal/gc"
	"cmd/internal/obj"
	"cmd/internal/obj/x86"
)

// no floating point in note handlers on Plan 9
var isPlan9 = obj.Getgoos() == "plan9"

func defframe(ptxt *obj.Prog) {
	// fill in argument size, stack size
	ptxt.To.Type = obj.TYPE_TEXTSIZE

	ptxt.To.Val = int32(gc.Rnd(gc.Curfn.Type.ArgWidth(), int64(gc.Widthptr)))
	frame := uint32(gc.Rnd(gc.Stksize+gc.Maxarg, int64(gc.Widthreg)))
	ptxt.To.Offset = int64(frame)

	// insert code to zero ambiguously live variables
	// so that the garbage collector only sees initialized values
	// when it looks for pointers.
	p := ptxt

	hi := int64(0)
	lo := hi
	ax := uint32(0)
	x0 := uint32(0)

	// iterate through declarations - they are sorted in decreasing xoffset order.
	for _, n := range gc.Curfn.Func.Dcl {
		if !n.Name.Needzero {
			continue
		}
		if n.Class != gc.PAUTO {
			gc.Fatalf("needzero class %d", n.Class)
		}
		if n.Type.Width%int64(gc.Widthptr) != 0 || n.Xoffset%int64(gc.Widthptr) != 0 || n.Type.Width == 0 {
			gc.Fatalf("var %v has size %d offset %d", gc.Nconv(n, gc.FmtLong), int(n.Type.Width), int(n.Xoffset))
		}

		if lo != hi && n.Xoffset+n.Type.Width >= lo-int64(2*gc.Widthreg) {
			// merge with range we already have
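			// (this variable ends no more than 2*gc.Widthreg bytes below
			// the pending range, so zeroing across the small gap is
			// cheaper than flushing the range and starting a new one)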
			lo = n.Xoffset

			continue
		}

		// zero old range
		p = zerorange(p, int64(frame), lo, hi, &ax, &x0)

		// set new range
		hi = n.Xoffset + n.Type.Width

		lo = n.Xoffset
	}

	// zero final range
	zerorange(p, int64(frame), lo, hi, &ax, &x0)
}

// DUFFZERO consists of repeated blocks of 4 MOVUPSs + ADD,
// See runtime/mkduff.go.
const (
	dzBlocks    = 16 // number of MOV/ADD blocks
	dzBlockLen  = 4  // number of clears per block
	dzBlockSize = 19 // size of instructions in a single block
	dzMovSize   = 4  // size of single MOV instruction w/ offset
	dzAddSize   = 4  // size of single ADD instruction
	dzClearStep = 16 // number of bytes cleared by each MOV instruction

	dzClearLen = dzClearStep * dzBlockLen // bytes cleared by one block
	dzSize     = dzBlocks * dzBlockSize
)

// dzOff returns the offset for a jump into DUFFZERO.
// b is the number of bytes to zero.
func dzOff(b int64) int64 {
	off := int64(dzSize)
	off -= b / dzClearLen * dzBlockSize
	tailLen := b % dzClearLen
	if tailLen >= dzClearStep {
		off -= dzAddSize + dzMovSize*(tailLen/dzClearStep)
	}
	return off
}

// dzDI returns the pre-adjustment to DI for a call to DUFFZERO.
// b is the number of bytes to zero.
func dzDI(b int64) int64 {
	tailLen := b % dzClearLen
	if tailLen < dzClearStep {
		return 0
	}
	tailSteps := tailLen / dzClearStep
	return -dzClearStep * (dzBlockLen - tailSteps)
}
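// For example, to zero b = 80 bytes (one full 64-byte block plus a 16-byte
// tail), dzOff(80) = 304 - 19 - (4+4) = 277 enters DUFFZERO at the last
// MOVUPS of a block, and dzDI(80) = -16*(4-1) = -48 backs DI up so that this
// MOVUPS, which addresses 48(DI), writes the first 16 bytes of the range.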

func zerorange(p *obj.Prog, frame int64, lo int64, hi int64, ax *uint32, x0 *uint32) *obj.Prog {
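	// Zero the cnt = hi-lo bytes at frame+lo on the stack. The strategy is
	// picked by size: a single MOVQ for exactly 8 bytes, MOVUPS stores from
	// X0 for up to 8*Widthreg bytes, a DUFFZERO call for up to 128*Widthreg
	// bytes, and REP STOSQ beyond that. The SSE paths are skipped on Plan 9,
	// and DUFFZERO also on NaCl. *ax and *x0 record whether AX and X0 have
	// already been zeroed so that their setup is emitted at most once.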
	cnt := hi - lo
	if cnt == 0 {
		return p
	}

	if cnt%int64(gc.Widthreg) != 0 {
		// should only happen with nacl
		if cnt%int64(gc.Widthptr) != 0 {
			gc.Fatalf("zerorange count not a multiple of widthptr %d", cnt)
		}
		if *ax == 0 {
			p = appendpp(p, x86.AMOVQ, obj.TYPE_CONST, 0, 0, obj.TYPE_REG, x86.REG_AX, 0)
			*ax = 1
		}
		p = appendpp(p, x86.AMOVL, obj.TYPE_REG, x86.REG_AX, 0, obj.TYPE_MEM, x86.REG_SP, frame+lo)
		lo += int64(gc.Widthptr)
		cnt -= int64(gc.Widthptr)
	}

	if cnt == 8 {
		if *ax == 0 {
			p = appendpp(p, x86.AMOVQ, obj.TYPE_CONST, 0, 0, obj.TYPE_REG, x86.REG_AX, 0)
			*ax = 1
		}
		p = appendpp(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_AX, 0, obj.TYPE_MEM, x86.REG_SP, frame+lo)
	} else if !isPlan9 && cnt <= int64(8*gc.Widthreg) {
		if *x0 == 0 {
			p = appendpp(p, x86.AXORPS, obj.TYPE_REG, x86.REG_X0, 0, obj.TYPE_REG, x86.REG_X0, 0)
			*x0 = 1
		}

		for i := int64(0); i < cnt/16; i++ {
			p = appendpp(p, x86.AMOVUPS, obj.TYPE_REG, x86.REG_X0, 0, obj.TYPE_MEM, x86.REG_SP, frame+lo+i*16)
		}

		if cnt%16 != 0 {
			p = appendpp(p, x86.AMOVUPS, obj.TYPE_REG, x86.REG_X0, 0, obj.TYPE_MEM, x86.REG_SP, frame+lo+cnt-int64(16))
		}
	} else if !gc.Nacl && !isPlan9 && (cnt <= int64(128*gc.Widthreg)) {
		if *x0 == 0 {
			p = appendpp(p, x86.AXORPS, obj.TYPE_REG, x86.REG_X0, 0, obj.TYPE_REG, x86.REG_X0, 0)
			*x0 = 1
		}
		p = appendpp(p, leaptr, obj.TYPE_MEM, x86.REG_SP, frame+lo+dzDI(cnt), obj.TYPE_REG, x86.REG_DI, 0)
		p = appendpp(p, obj.ADUFFZERO, obj.TYPE_NONE, 0, 0, obj.TYPE_ADDR, 0, dzOff(cnt))
		p.To.Sym = gc.Linksym(gc.Pkglookup("duffzero", gc.Runtimepkg))

		if cnt%16 != 0 {
			p = appendpp(p, x86.AMOVUPS, obj.TYPE_REG, x86.REG_X0, 0, obj.TYPE_MEM, x86.REG_DI, -int64(8))
		}
	} else {
		if *ax == 0 {
			p = appendpp(p, x86.AMOVQ, obj.TYPE_CONST, 0, 0, obj.TYPE_REG, x86.REG_AX, 0)
			*ax = 1
		}

		p = appendpp(p, x86.AMOVQ, obj.TYPE_CONST, 0, cnt/int64(gc.Widthreg), obj.TYPE_REG, x86.REG_CX, 0)
		p = appendpp(p, leaptr, obj.TYPE_MEM, x86.REG_SP, frame+lo, obj.TYPE_REG, x86.REG_DI, 0)
		p = appendpp(p, x86.AREP, obj.TYPE_NONE, 0, 0, obj.TYPE_NONE, 0, 0)
		p = appendpp(p, x86.ASTOSQ, obj.TYPE_NONE, 0, 0, obj.TYPE_NONE, 0, 0)
	}

	return p
}

func appendpp(p *obj.Prog, as obj.As, ftype obj.AddrType, freg int, foffset int64, ttype obj.AddrType, treg int, toffset int64) *obj.Prog {
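	// Allocate a fresh Prog with the given opcode and operands, splice it
	// into the instruction list immediately after p, and return it.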
	q := gc.Ctxt.NewProg()
	gc.Clearp(q)
	q.As = as
	q.Lineno = p.Lineno
	q.From.Type = ftype
	q.From.Reg = int16(freg)
	q.From.Offset = foffset
	q.To.Type = ttype
	q.To.Reg = int16(treg)
	q.To.Offset = toffset
	q.Link = p.Link
	p.Link = q
	return q
}

var panicdiv *gc.Node

/*
 * generate division.
 * generates one of:
 *	res = nl / nr
 *	res = nl % nr
 * according to op.
 */
func dodiv(op gc.Op, nl *gc.Node, nr *gc.Node, res *gc.Node) {
	// Have to be careful about handling
	// most negative int divided by -1 correctly.
	// The hardware will trap.
	// Also the byte divide instruction needs AH,
	// which we otherwise don't have to deal with.
	// Easiest way to avoid for int8, int16: use int32.
	// For int32 and int64, use explicit test.
	// Could use int64 hw for int32.
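	// For example, int32(-1<<31) / -1 overflows the quotient and IDIVL
	// traps, even though the Go spec defines the result as -1<<31 with
	// remainder 0, so when the divisor may be -1 we branch around the
	// divide and special-case it below.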
	t := nl.Type

	t0 := t
	check := false
	if t.IsSigned() {
		check = true
		if gc.Isconst(nl, gc.CTINT) && nl.Int64() != -(1<<uint64(t.Width*8-1)) {
			check = false
		} else if gc.Isconst(nr, gc.CTINT) && nr.Int64() != -1 {
			check = false
		}
	}

	if t.Width < 4 {
		if t.IsSigned() {
			t = gc.Types[gc.TINT32]
		} else {
			t = gc.Types[gc.TUINT32]
		}
		check = false
	}

	a := optoas(op, t)

	var n3 gc.Node
	gc.Regalloc(&n3, t0, nil)
	var ax gc.Node
	var oldax gc.Node
	if nl.Ullman >= nr.Ullman {
		savex(x86.REG_AX, &ax, &oldax, res, t0)
		gc.Cgen(nl, &ax)
		gc.Regalloc(&ax, t0, &ax) // mark ax live during cgen
		gc.Cgen(nr, &n3)
		gc.Regfree(&ax)
	} else {
		gc.Cgen(nr, &n3)
		savex(x86.REG_AX, &ax, &oldax, res, t0)
		gc.Cgen(nl, &ax)
	}

	if t != t0 {
		// Convert
		ax1 := ax

		n31 := n3
		ax.Type = t
		n3.Type = t
		gmove(&ax1, &ax)
		gmove(&n31, &n3)
	}

	var n4 gc.Node
	if gc.Nacl {
		// Native Client does not relay the divide-by-zero trap
		// to the executing program, so we must insert a check
		// for ourselves.
		gc.Nodconst(&n4, t, 0)

		gins(optoas(gc.OCMP, t), &n3, &n4)
		p1 := gc.Gbranch(optoas(gc.ONE, t), nil, +1)
		if panicdiv == nil {
			panicdiv = gc.Sysfunc("panicdivide")
		}
		gc.Ginscall(panicdiv, -1)
		gc.Patch(p1, gc.Pc)
	}

	var p2 *obj.Prog
	if check {
		gc.Nodconst(&n4, t, -1)
		gins(optoas(gc.OCMP, t), &n3, &n4)
		p1 := gc.Gbranch(optoas(gc.ONE, t), nil, +1)
		if op == gc.ODIV {
			// a / (-1) is -a.
			gins(optoas(gc.OMINUS, t), nil, &ax)

			gmove(&ax, res)
		} else {
			// a % (-1) is 0.
			gc.Nodconst(&n4, t, 0)

			gmove(&n4, res)
		}

		p2 = gc.Gbranch(obj.AJMP, nil, 0)
		gc.Patch(p1, gc.Pc)
	}

	var olddx gc.Node
	var dx gc.Node
	savex(x86.REG_DX, &dx, &olddx, res, t)
	if !t.IsSigned() {
		gc.Nodconst(&n4, t, 0)
		gmove(&n4, &dx)
	} else {
		gins(optoas(gc.OEXTEND, t), nil, nil)
	}
	gins(a, &n3, nil)
	gc.Regfree(&n3)
	if op == gc.ODIV {
		gmove(&ax, res)
	} else {
		gmove(&dx, res)
	}
	restx(&dx, &olddx)
	if check {
		gc.Patch(p2, gc.Pc)
	}
	restx(&ax, &oldax)
}

/*
 * register dr is one of the special ones (AX, CX, DI, SI, etc.).
 * we need to use it.  if it is already allocated as a temporary
 * (r > 1; can only happen if a routine like sgen passed a
 * special as cgen's res and then cgen used regalloc to reuse
 * it as its own temporary), then move it for now to another
 * register.  caller must call restx to move it back.
 * the move is not necessary if dr == res, because res is
 * known to be dead.
 */
func savex(dr int, x *gc.Node, oldx *gc.Node, res *gc.Node, t *gc.Type) {
	r := uint8(gc.GetReg(dr))

	// save current ax and dx if they are live
	// and not the destination
	*oldx = gc.Node{}

	gc.Nodreg(x, t, dr)
	if r > 1 && !gc.Samereg(x, res) {
		gc.Regalloc(oldx, gc.Types[gc.TINT64], nil)
		x.Type = gc.Types[gc.TINT64]
		gmove(x, oldx)
		x.Type = t
		// TODO(marvin): Fix Node.EType type union.
		oldx.Etype = gc.EType(r) // squirrel away old r value
		gc.SetReg(dr, 1)
	}
}

func restx(x *gc.Node, oldx *gc.Node) {
	if oldx.Op != 0 {
		x.Type = gc.Types[gc.TINT64]
		gc.SetReg(int(x.Reg), int(oldx.Etype))
		gmove(oldx, x)
		gc.Regfree(oldx)
	}
}

/*
 * generate high multiply:
 *   res = (nl*nr) >> width
 */
func cgen_hmul(nl *gc.Node, nr *gc.Node, res *gc.Node) {
	t := nl.Type
	a := optoas(gc.OHMUL, t)
	if nl.Ullman < nr.Ullman {
		nl, nr = nr, nl
	}

	var n1 gc.Node
	gc.Cgenr(nl, &n1, res)
	var n2 gc.Node
	gc.Cgenr(nr, &n2, nil)
	var ax, oldax, dx, olddx gc.Node
	savex(x86.REG_AX, &ax, &oldax, res, gc.Types[gc.TUINT64])
	savex(x86.REG_DX, &dx, &olddx, res, gc.Types[gc.TUINT64])
	gmove(&n1, &ax)
	gins(a, &n2, nil)
	gc.Regfree(&n2)
	gc.Regfree(&n1)

	if t.Width == 1 {
		// byte multiply behaves differently.
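		// The high half of an 8-bit multiply lands in AH rather than DX,
		// so copy AH into DX before the common move to res below.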
		var byteAH, byteDX gc.Node
		gc.Nodreg(&byteAH, t, x86.REG_AH)
		gc.Nodreg(&byteDX, t, x86.REG_DX)
		gmove(&byteAH, &byteDX)
	}
	gmove(&dx, res)

	restx(&ax, &oldax)
	restx(&dx, &olddx)
}

/*
 * generate shift according to op, one of:
 *	res = nl << nr
 *	res = nl >> nr
 */
func cgen_shift(op gc.Op, bounded bool, nl *gc.Node, nr *gc.Node, res *gc.Node) {
	a := optoas(op, nl.Type)

	if nr.Op == gc.OLITERAL {
		var n1 gc.Node
		gc.Regalloc(&n1, nl.Type, res)
		gc.Cgen(nl, &n1)
		sc := uint64(nr.Int64())
		if sc >= uint64(nl.Type.Width*8) {
			// large shift gets 2 shifts by width-1
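			// (e.g. a 64-bit operand shifted by a constant >= 64 becomes
			// two shifts by 63: arithmetic right shifts produce 0 or -1,
			// logical right shifts and left shifts produce 0)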
			var n3 gc.Node
			gc.Nodconst(&n3, gc.Types[gc.TUINT32], nl.Type.Width*8-1)

			gins(a, &n3, &n1)
			gins(a, &n3, &n1)
		} else {
			gins(a, nr, &n1)
		}
		gmove(&n1, res)
		gc.Regfree(&n1)
		return
	}

	if nl.Ullman >= gc.UINF {
		var n4 gc.Node
		gc.Tempname(&n4, nl.Type)
		gc.Cgen(nl, &n4)
		nl = &n4
	}

	if nr.Ullman >= gc.UINF {
		var n5 gc.Node
		gc.Tempname(&n5, nr.Type)
		gc.Cgen(nr, &n5)
		nr = &n5
	}

	rcx := gc.GetReg(x86.REG_CX)
	var n1 gc.Node
	gc.Nodreg(&n1, gc.Types[gc.TUINT32], x86.REG_CX)

	// Allow either uint32 or uint64 as shift type,
	// to avoid unnecessary conversion from uint32 to uint64
	// just to do the comparison.
	tcount := gc.Types[gc.Simtype[nr.Type.Etype]]

	if tcount.Etype < gc.TUINT32 {
		tcount = gc.Types[gc.TUINT32]
	}

	gc.Regalloc(&n1, nr.Type, &n1) // to hold the shift type in CX
	var n3 gc.Node
	gc.Regalloc(&n3, tcount, &n1) // to clear high bits of CX

	var cx gc.Node
	gc.Nodreg(&cx, gc.Types[gc.TUINT64], x86.REG_CX)

	var oldcx gc.Node
	if rcx > 0 && !gc.Samereg(&cx, res) {
		gc.Regalloc(&oldcx, gc.Types[gc.TUINT64], nil)
		gmove(&cx, &oldcx)
	}

	cx.Type = tcount

	var n2 gc.Node
	if gc.Samereg(&cx, res) {
		gc.Regalloc(&n2, nl.Type, nil)
	} else {
		gc.Regalloc(&n2, nl.Type, res)
	}
	if nl.Ullman >= nr.Ullman {
		gc.Cgen(nl, &n2)
		gc.Cgen(nr, &n1)
		gmove(&n1, &n3)
	} else {
		gc.Cgen(nr, &n1)
		gmove(&n1, &n3)
		gc.Cgen(nl, &n2)
	}

	gc.Regfree(&n3)

	// test and fix up large shifts
	if !bounded {
		gc.Nodconst(&n3, tcount, nl.Type.Width*8)
		gins(optoas(gc.OCMP, tcount), &n1, &n3)
		p1 := gc.Gbranch(optoas(gc.OLT, tcount), nil, +1)
		if op == gc.ORSH && nl.Type.IsSigned() {
			gc.Nodconst(&n3, gc.Types[gc.TUINT32], nl.Type.Width*8-1)
			gins(a, &n3, &n2)
		} else {
			gc.Nodconst(&n3, nl.Type, 0)
			gmove(&n3, &n2)
		}

		gc.Patch(p1, gc.Pc)
	}

	gins(a, &n1, &n2)

	if oldcx.Op != 0 {
		cx.Type = gc.Types[gc.TUINT64]
		gmove(&oldcx, &cx)
		gc.Regfree(&oldcx)
	}

	gmove(&n2, res)

	gc.Regfree(&n1)
	gc.Regfree(&n2)
}

/*
 * generate byte multiply:
 *	res = nl * nr
 * there is no 2-operand byte multiply instruction so
 * we do a full-width multiplication and truncate afterwards.
 */
func cgen_bmul(op gc.Op, nl *gc.Node, nr *gc.Node, res *gc.Node) bool {
	if optoas(op, nl.Type) != x86.AIMULB {
		return false
	}

	// largest ullman on left.
	if nl.Ullman < nr.Ullman {
		nl, nr = nr, nl
	}

	// generate operands in "8-bit" registers.
	var n1b gc.Node
	gc.Regalloc(&n1b, nl.Type, res)

	gc.Cgen(nl, &n1b)
	var n2b gc.Node
	gc.Regalloc(&n2b, nr.Type, nil)
	gc.Cgen(nr, &n2b)

	// perform full-width multiplication.
	t := gc.Types[gc.TUINT64]

	if nl.Type.IsSigned() {
		t = gc.Types[gc.TINT64]
	}
	var n1 gc.Node
	gc.Nodreg(&n1, t, int(n1b.Reg))
	var n2 gc.Node
	gc.Nodreg(&n2, t, int(n2b.Reg))
	a := optoas(op, t)
	gins(a, &n2, &n1)

	// truncate.
	gmove(&n1, res)

	gc.Regfree(&n1b)
	gc.Regfree(&n2b)
	return true
}

func clearfat(nl *gc.Node) {
	/* clear a fat object */
	if gc.Debug['g'] != 0 {
		gc.Dump("\nclearfat", nl)
	}

	// Avoid taking the address for simple enough types.
	if gc.Componentgen(nil, nl) {
		return
	}

	w := nl.Type.Width
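	// Strategy by size: objects larger than 1024 bytes, or of 64 bytes and
	// up on NaCl and Plan 9 where DUFFZERO is avoided, are cleared with
	// REP STOSQ; objects of 64 to 1024 bytes go through DUFFZERO; anything
	// smaller gets an unrolled sequence of stores in clearfat_tail.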

	if w > 1024 || (w >= 64 && (gc.Nacl || isPlan9)) {
		var oldn1 gc.Node
		var n1 gc.Node
		savex(x86.REG_DI, &n1, &oldn1, nil, gc.Types[gc.Tptr])
		gc.Agen(nl, &n1)

		var ax gc.Node
		var oldax gc.Node
		savex(x86.REG_AX, &ax, &oldax, nil, gc.Types[gc.Tptr])
		gconreg(x86.AMOVL, 0, x86.REG_AX)
		gconreg(movptr, w/8, x86.REG_CX)

		gins(x86.AREP, nil, nil)   // repeat
		gins(x86.ASTOSQ, nil, nil) // STOSQ AX,*(DI)+

		if w%8 != 0 {
			n1.Op = gc.OINDREG
			clearfat_tail(&n1, w%8)
		}

		restx(&n1, &oldn1)
		restx(&ax, &oldax)
		return
	}

	if w >= 64 {
		var oldn1 gc.Node
		var n1 gc.Node
		savex(x86.REG_DI, &n1, &oldn1, nil, gc.Types[gc.Tptr])
		gc.Agen(nl, &n1)

		var vec_zero gc.Node
		var old_x0 gc.Node
		savex(x86.REG_X0, &vec_zero, &old_x0, nil, gc.Types[gc.TFLOAT64])
		gins(x86.AXORPS, &vec_zero, &vec_zero)

		if di := dzDI(w); di != 0 {
			gconreg(addptr, di, x86.REG_DI)
		}
		p := gins(obj.ADUFFZERO, nil, nil)
		p.To.Type = obj.TYPE_ADDR
		p.To.Sym = gc.Linksym(gc.Pkglookup("duffzero", gc.Runtimepkg))
		p.To.Offset = dzOff(w)

		if w%16 != 0 {
			n1.Op = gc.OINDREG
			n1.Xoffset -= 16 - w%16
			gins(x86.AMOVUPS, &vec_zero, &n1)
		}

		restx(&vec_zero, &old_x0)
		restx(&n1, &oldn1)
		return
	}

	// NOTE: Must use agen, not igen, so that optimizer sees address
	// being taken. We are not writing on field boundaries.
	var n1 gc.Node
	gc.Agenr(nl, &n1, nil)
	n1.Op = gc.OINDREG

	clearfat_tail(&n1, w)

	gc.Regfree(&n1)
}

func clearfat_tail(n1 *gc.Node, b int64) {
	if b >= 16 && isPlan9 {
		var z gc.Node
		gc.Nodconst(&z, gc.Types[gc.TUINT64], 0)
		q := b / 8
		for ; q > 0; q-- {
			n1.Type = z.Type
			gins(x86.AMOVQ, &z, n1)
			n1.Xoffset += 8
			b -= 8
		}
		if b != 0 {
			n1.Xoffset -= 8 - b
			gins(x86.AMOVQ, &z, n1)
		}
		return
	}
	if b >= 16 {
		var vec_zero gc.Node
		gc.Regalloc(&vec_zero, gc.Types[gc.TFLOAT64], nil)
		gins(x86.AXORPS, &vec_zero, &vec_zero)

		for b >= 16 {
			gins(x86.AMOVUPS, &vec_zero, n1)
			n1.Xoffset += 16
			b -= 16
		}

		// MOVUPS X0, off(base) is a few bytes shorter than MOV 0, off(base)
		if b != 0 {
			n1.Xoffset -= 16 - b
			gins(x86.AMOVUPS, &vec_zero, n1)
		}

		gc.Regfree(&vec_zero)
		return
	}

	// Write sequence of MOV 0, off(base) instead of using STOSQ.
	// The hope is that although the code will be slightly longer,
	// the MOVs will have no dependencies and pipeline better
	// than the unrolled STOSQ loop.
	var z gc.Node
	gc.Nodconst(&z, gc.Types[gc.TUINT64], 0)
	if b >= 8 {
		n1.Type = z.Type
		gins(x86.AMOVQ, &z, n1)
		n1.Xoffset += 8
		b -= 8

		if b != 0 {
			n1.Xoffset -= 8 - b
			gins(x86.AMOVQ, &z, n1)
		}
		return
	}

	if b >= 4 {
		gc.Nodconst(&z, gc.Types[gc.TUINT32], 0)
		n1.Type = z.Type
		gins(x86.AMOVL, &z, n1)
		n1.Xoffset += 4
		b -= 4

		if b != 0 {
			n1.Xoffset -= 4 - b
			gins(x86.AMOVL, &z, n1)
		}
		return
	}

	if b >= 2 {
		gc.Nodconst(&z, gc.Types[gc.TUINT16], 0)
		n1.Type = z.Type
		gins(x86.AMOVW, &z, n1)
		n1.Xoffset += 2
		b -= 2
	}

	gc.Nodconst(&z, gc.Types[gc.TUINT8], 0)
	for b > 0 {
		n1.Type = z.Type
		gins(x86.AMOVB, &z, n1)
		n1.Xoffset++
		b--
	}

}

// Called after regopt and peep have run.
// Expand CHECKNIL pseudo-op into actual nil pointer check.
func expandchecks(firstp *obj.Prog) {
	var p1 *obj.Prog
	var p2 *obj.Prog

	for p := firstp; p != nil; p = p.Link {
		if p.As != obj.ACHECKNIL {
			continue
		}
		if gc.Debug_checknil != 0 && p.Lineno > 1 { // p->lineno==1 in generated wrappers
			gc.Warnl(p.Lineno, "generated nil check")
		}

		// check is
		//	CMP arg, $0
		//	JNE 2(PC) (likely)
		//	MOV AX, 0
		p1 = gc.Ctxt.NewProg()

		p2 = gc.Ctxt.NewProg()
		gc.Clearp(p1)
		gc.Clearp(p2)
		p1.Link = p2
		p2.Link = p.Link
		p.Link = p1
		p1.Lineno = p.Lineno
		p2.Lineno = p.Lineno
		p1.Pc = 9999
		p2.Pc = 9999
		p.As = cmpptr
		p.To.Type = obj.TYPE_CONST
		p.To.Offset = 0
		p1.As = x86.AJNE
		p1.From.Type = obj.TYPE_CONST
		p1.From.Offset = 1 // likely
		p1.To.Type = obj.TYPE_BRANCH
		p1.To.Val = p2.Link

		// crash by write to memory address 0.
		// if possible, since we know arg is 0, use 0(arg),
		// which will be shorter to encode than plain 0.
		p2.As = x86.AMOVL

		p2.From.Type = obj.TYPE_REG
		p2.From.Reg = x86.REG_AX
		if regtyp(&p.From) {
			p2.To.Type = obj.TYPE_MEM
			p2.To.Reg = p.From.Reg
		} else {
			p2.To.Type = obj.TYPE_MEM
			p2.To.Reg = x86.REG_NONE
		}

		p2.To.Offset = 0
	}
}

// addr += index*width if possible.
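// The update is emitted as a single LEAQ (addr)(index*width), addr, so only
// the hardware scale factors 1, 2, 4, and 8 are handled.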
func addindex(index *gc.Node, width int64, addr *gc.Node) bool {
	switch width {
	case 1, 2, 4, 8:
		p1 := gins(x86.ALEAQ, index, addr)
		p1.From.Type = obj.TYPE_MEM
		p1.From.Scale = int16(width)
		p1.From.Index = p1.From.Reg
		p1.From.Reg = p1.To.Reg
		return true
	}
	return false
}

// res = runtime.getg()
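// The g pointer is loaded with the usual two-instruction amd64 idiom:
// MOVQ TLS, r followed by MOVQ (r)(TLS*1), r.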
func getg(res *gc.Node) {
	var n1 gc.Node
	gc.Regalloc(&n1, res.Type, res)
	mov := optoas(gc.OAS, gc.Types[gc.Tptr])
	p := gins(mov, nil, &n1)
	p.From.Type = obj.TYPE_REG
	p.From.Reg = x86.REG_TLS
	p = gins(mov, nil, &n1)
	p.From = p.To
	p.From.Type = obj.TYPE_MEM
	p.From.Index = x86.REG_TLS
	p.From.Scale = 1
	gmove(&n1, res)
	gc.Regfree(&n1)
}